xfu314 Claude Opus 4.5 committed on
Commit
96da58e
·
1 Parent(s): 77e2834

Add phantom project with submodules and dependencies


Binary files tracked with Git LFS

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +13 -0
  2. app.py +398 -80
  3. phantom +0 -1
  4. phantom/.gitignore +11 -0
  5. phantom/.gitmodules +15 -0
  6. phantom/LICENSE +21 -0
  7. phantom/README.md +168 -0
  8. phantom/configs/default.yaml +30 -0
  9. phantom/configs/epic.yaml +31 -0
  10. phantom/configs/sam2_hiera_l.yaml +117 -0
  11. phantom/data/__init__.py +0 -0
  12. phantom/docs/teaser_masquerade.png +3 -0
  13. phantom/docs/teaser_phantom.png +3 -0
  14. phantom/install.sh +67 -0
  15. phantom/phantom/__init__.py +0 -0
  16. phantom/phantom/camera/__init__.py +0 -0
  17. phantom/phantom/camera/camera_extrinsics.json +42 -0
  18. phantom/phantom/camera/camera_extrinsics_ego_bimanual_shoulders.json +52 -0
  19. phantom/phantom/camera/camera_intrinsics_HD1080.json +48 -0
  20. phantom/phantom/camera/camera_intrinsics_epic.json +48 -0
  21. phantom/phantom/detectors/detector_detectron2.py +121 -0
  22. phantom/phantom/detectors/detector_dino.py +108 -0
  23. phantom/phantom/detectors/detector_hamer.py +447 -0
  24. phantom/phantom/detectors/detector_sam2.py +240 -0
  25. phantom/phantom/hand.py +805 -0
  26. phantom/phantom/process_data.py +243 -0
  27. phantom/phantom/processors/__init__.py +0 -0
  28. phantom/phantom/processors/action_processor.py +478 -0
  29. phantom/phantom/processors/base_processor.py +209 -0
  30. phantom/phantom/processors/bbox_processor.py +851 -0
  31. phantom/phantom/processors/hand_processor.py +675 -0
  32. phantom/phantom/processors/handinpaint_processor.py +485 -0
  33. phantom/phantom/processors/paths.py +219 -0
  34. phantom/phantom/processors/phantom_data.py +340 -0
  35. phantom/phantom/processors/robotinpaint_processor.py +785 -0
  36. phantom/phantom/processors/segmentation_processor.py +1056 -0
  37. phantom/phantom/processors/smoothing_processor.py +303 -0
  38. phantom/phantom/twin_bimanual_robot.py +597 -0
  39. phantom/phantom/twin_robot.py +490 -0
  40. phantom/phantom/utils/__init__.py +0 -0
  41. phantom/phantom/utils/bbox_utils.py +38 -0
  42. phantom/phantom/utils/data_utils.py +38 -0
  43. phantom/phantom/utils/image_utils.py +103 -0
  44. phantom/phantom/utils/pcd_utils.py +210 -0
  45. phantom/phantom/utils/transform_utils.py +43 -0
  46. phantom/setup.py +7 -0
  47. phantom/submodules/phantom-E2FGVI/.gitignore +136 -0
  48. phantom/submodules/phantom-E2FGVI/E2FGVI/__init__.py +0 -0
  49. phantom/submodules/phantom-E2FGVI/E2FGVI/configs/train_e2fgvi.json +41 -0
  50. phantom/submodules/phantom-E2FGVI/E2FGVI/configs/train_e2fgvi_hq.json +41 -0
.gitattributes CHANGED
@@ -33,3 +33,16 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ phantom/submodules/phantom-robosuite/robosuite/models/assets/textures/plywood-4k.jpg filter=lfs diff=lfs merge=lfs -text
37
+ *.png filter=lfs diff=lfs merge=lfs -text
38
+ *.hdf5 filter=lfs diff=lfs merge=lfs -text
39
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
40
+ *.obj filter=lfs diff=lfs merge=lfs -text
41
+ *.mtl filter=lfs diff=lfs merge=lfs -text
42
+ *.stl filter=lfs diff=lfs merge=lfs -text
43
+ *.jpg filter=lfs diff=lfs merge=lfs -text
44
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
45
+ *.gif filter=lfs diff=lfs merge=lfs -text
46
+ *.dae filter=lfs diff=lfs merge=lfs -text
47
+ *.hdr filter=lfs diff=lfs merge=lfs -text
48
+ *.msh filter=lfs diff=lfs merge=lfs -text
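
These new rules route all large binary assets (textures, meshes, media, checkpoints) through Git LFS. As a quick sanity check, a minimal Python sketch, assuming it is run from the repository root and that simple `*.ext` globs are all that need matching, can tell you whether a given path would be picked up by one of the rules above:

```python
from fnmatch import fnmatch
from pathlib import Path


def lfs_patterns(gitattributes: str = ".gitattributes") -> list[str]:
    """Collect the glob patterns whose attributes include the LFS filter."""
    patterns = []
    for line in Path(gitattributes).read_text().splitlines():
        parts = line.split()
        if parts and "filter=lfs" in parts[1:]:
            patterns.append(parts[0])
    return patterns


def is_lfs_tracked(path: str) -> bool:
    # fnmatch only approximates gitattributes matching, which is fine for *.ext rules
    name = Path(path).name
    return any(fnmatch(name, pat) or fnmatch(path, pat) for pat in lfs_patterns())


print(is_lfs_tracked("phantom/docs/teaser_phantom.png"))  # True, via the *.png rule
```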
app.py CHANGED
@@ -1,109 +1,427 @@
1
  """
2
- Phantom Video Processor - Hugging Face Space
 
3
  """
4
 
5
  import gradio as gr
6
  import spaces
7
  import subprocess
8
  import sys
9
  from pathlib import Path
10
 
11
- # ========== Environment setup ==========
12
-
13
  PHANTOM_DIR = Path("/home/user/app/phantom")
14
 
15
- def setup_environment():
16
- """配置Phantom环境(仅首次运行)"""
17
-
18
- # Check whether it is already configured
19
  if Path("/tmp/.phantom_ready").exists():
20
- print("✅ Phantom环境已配置")
21
- return True
22
-
23
- print("🔧 首次运行,配置环境(约5-10分钟)...")
24
-
25
- # Run setup.sh
26
  setup_script = Path("/home/user/app/setup.sh")
27
- if setup_script.exists():
28
- try:
29
- result = subprocess.run(
30
- ["bash", str(setup_script)],
31
- check=True,
32
- capture_output=True,
33
- text=True
34
- )
35
- print(result.stdout)
36
- print("✅ 环境配置完成")
37
- return True
38
- except subprocess.CalledProcessError as e:
39
- print(f"❌ 配置失败: {e.stderr}")
40
- return False
41
- else:
42
- print("⚠️ setup.sh不存在")
43
- return False
44
 
45
- # Add Phantom to the Python path
46
- if PHANTOM_DIR.exists():
47
- sys.path.insert(0, str(PHANTOM_DIR))
48
 
49
- # Configure the environment at startup
50
- phantom_ready = setup_environment()
51
 
52
- # ========== The rest of the code is unchanged ==========
 
53
 
54
- @spaces.GPU(duration=120)
55
- def process_video(video_file, robot_type, target_hand):
56
- """处理视频"""
57
  import torch
58
 
59
- if video_file is None:
60
- return None, None, "请先上传视频"
61
 
62
- # Check the GPU
63
  if torch.cuda.is_available():
64
  gpu = torch.cuda.get_device_name(0)
65
- status = f"✅ GPU: {gpu}\n"
 
66
  else:
67
- status = "⚠️ 未检测到GPU\n"
68
-
69
- status += f"视频: {video_file}\n"
70
- status += f"机器人: {robot_type}\n"
71
- status += f"手部: {target_hand}\n"
72
-
73
- if not phantom_ready:
74
- status += "\n⚠️ Phantom环境未就绪"
75
-
76
- return None, None, status
77
-
78
- # Gradio UI
79
- with gr.Blocks(title="Phantom") as demo:
80
- gr.Markdown("# 🤖 Phantom - 机器人视频生成器")
81
-
82
- with gr.Row():
83
- with gr.Column():
84
- video_input = gr.Video(label="上传视频")
85
- robot_type = gr.Dropdown(
86
- choices=["Panda", "Kinova3", "UR5e"],
87
- value="Panda",
88
- label="机器人类型"
89
- )
90
- target_hand = gr.Radio(
91
- choices=["left", "right"],
92
- value="left",
93
- label="目标手部"
94
  )
95
- btn = gr.Button("开始处理", variant="primary")
96
 
97
- with gr.Column():
98
- video_out = gr.Video(label="结果视频")
99
- data_out = gr.File(label="训练数据")
100
- status_out = gr.Textbox(label="状态", lines=10)
101
 
102
- btn.click(
103
- fn=process_video,
104
- inputs=[video_input, robot_type, target_hand],
105
- outputs=[video_out, data_out, status_out]
106
- )
107
 
 
108
  if __name__ == "__main__":
109
  demo.queue().launch()
 
1
  """
2
+ Phantom Video Processor - Hugging Face Space Demo
3
+ 将人类手部视频转换为机器人演示数据
4
  """
5
 
6
  import gradio as gr
7
  import spaces
8
  import subprocess
9
  import sys
10
+ import os
11
+ import shutil
12
+ import tempfile
13
  from pathlib import Path
14
 
15
+ # ========== Path configuration ==========
 
16
  PHANTOM_DIR = Path("/home/user/app/phantom")
17
+ DATA_RAW_DIR = PHANTOM_DIR / "data" / "raw"
18
+ DATA_PROCESSED_DIR = PHANTOM_DIR / "data" / "processed"
19
+ MANO_DIR = PHANTOM_DIR / "submodules" / "phantom-hamer" / "_DATA" / "data" / "mano"
20
+
21
+ # Add Phantom to the Python path
22
+ if PHANTOM_DIR.exists():
23
+ sys.path.insert(0, str(PHANTOM_DIR))
24
+ sys.path.insert(0, str(PHANTOM_DIR / "phantom"))
25
+
26
+ # ========== Environment checks ==========
27
+ def check_environment():
28
+ """检查环境状态"""
29
+ status = {
30
+ "phantom_installed": Path("/tmp/.phantom_ready").exists(),
31
+ "mano_ready": (MANO_DIR / "MANO_LEFT.pkl").exists() and (MANO_DIR / "MANO_RIGHT.pkl").exists(),
32
+ "sample_data": (DATA_RAW_DIR / "pick_and_place").exists(),
33
+ "cuda_available": False,
34
+ "gpu_name": None
35
+ }
36
+
37
+ try:
38
+ import torch
39
+ status["cuda_available"] = torch.cuda.is_available()
40
+ if status["cuda_available"]:
41
+ status["gpu_name"] = torch.cuda.get_device_name(0)
42
+ except Exception:
43
+ pass
44
+
45
+ return status
46
+
47
+ def get_status_text():
48
+ """获取状态文本"""
49
+ status = check_environment()
50
+ lines = []
51
+ lines.append("=" * 40)
52
+ lines.append("环境状态")
53
+ lines.append("=" * 40)
54
+ lines.append(f"Phantom 安装: {'✅' if status['phantom_installed'] else '❌ 首次运行需初始化'}")
55
+ lines.append(f"MANO 模型: {'✅' if status['mano_ready'] else '❌ 请上传 MANO 模型文件'}")
56
+ lines.append(f"示例数据: {'✅' if status['sample_data'] else '⏳ 将自动下载'}")
57
+ lines.append(f"CUDA: {'✅ ' + (status['gpu_name'] or '') if status['cuda_available'] else '⏳ GPU 将在处理时分配'}")
58
+ lines.append("=" * 40)
59
+ return "\n".join(lines)
60
+
61
+ # ========== MANO model upload ==========
62
+ def upload_mano_files(left_file, right_file):
63
+ """上传 MANO 模型文件"""
64
+ MANO_DIR.mkdir(parents=True, exist_ok=True)
65
+
66
+ messages = []
67
 
68
+ if left_file is not None:
69
+ dest = MANO_DIR / "MANO_LEFT.pkl"
70
+ shutil.copy(left_file.name, dest)
71
+ messages.append(f"✅ MANO_LEFT.pkl 已保存")
72
+
73
+ if right_file is not None:
74
+ dest = MANO_DIR / "MANO_RIGHT.pkl"
75
+ shutil.copy(right_file.name, dest)
76
+ messages.append(f"✅ MANO_RIGHT.pkl 已保存")
77
+
78
+ if not messages:
79
+ return "⚠️ 请选择文件上传"
80
+
81
+ return "\n".join(messages) + "\n\n" + get_status_text()
82
+
83
+ # ========== Environment initialization ==========
84
+ def initialize_environment(progress=gr.Progress()):
85
+ """初始化 Phantom 环境"""
86
  if Path("/tmp/.phantom_ready").exists():
87
+ return "✅ 环境已就绪\n\n" + get_status_text()
88
+
89
+ progress(0, desc="开始初始化...")
90
+
 
 
91
  setup_script = Path("/home/user/app/setup.sh")
92
+ if not setup_script.exists():
93
+ return "❌ setup.sh 不存在"
94
 
95
+ try:
96
+ # Run setup.sh
97
+ progress(0.1, desc="运行安装脚本...")
98
+ process = subprocess.Popen(
99
+ ["bash", str(setup_script)],
100
+ stdout=subprocess.PIPE,
101
+ stderr=subprocess.STDOUT,
102
+ text=True,
103
+ bufsize=1
104
+ )
105
+
106
+ output_lines = []
107
+ for line in iter(process.stdout.readline, ''):
108
+ output_lines.append(line.strip())
109
+ if len(output_lines) > 50:
110
+ output_lines = output_lines[-50:] # keep only the last 50 lines
111
+
112
+ process.wait()
113
 
114
+ if process.returncode == 0:
115
+ progress(1.0, desc="完成!")
116
+ return "✅ 初始化完成!\n\n" + "\n".join(output_lines[-20:]) + "\n\n" + get_status_text()
117
+ else:
118
+ return f"❌ 初始化失败 (返回码: {process.returncode})\n\n" + "\n".join(output_lines[-30:])
119
 
120
+ except Exception as e:
121
+ return f"❌ 初始化错误: {str(e)}"
122
 
123
+ # ========== Video processing ==========
124
+ @spaces.GPU(duration=300)
125
+ def process_video(
126
+ video_file,
127
+ robot_type,
128
+ target_hand,
129
+ processing_mode,
130
+ use_sample_data,
131
+ progress=gr.Progress()
132
+ ):
133
+ """
134
+ Process the video: convert the human hand into a robot
135
+ """
136
  import torch
137
 
138
+ # Status messages
139
+ status_lines = []
140
 
141
+ # GPU check
142
  if torch.cuda.is_available():
143
  gpu = torch.cuda.get_device_name(0)
144
+ status_lines.append(f"✅ GPU: {gpu}")
145
+ status_lines.append(f" VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
146
  else:
147
+ status_lines.append(" GPU 不可用")
148
+ return None, None, "\n".join(status_lines)
149
+
150
+ # Check the environment
151
+ if not Path("/tmp/.phantom_ready").exists():
152
+ status_lines.append("❌ 请先点击「初始化环境」按钮")
153
+ return None, None, "\n".join(status_lines)
154
+
155
+ # Check the MANO models
156
+ if not (MANO_DIR / "MANO_LEFT.pkl").exists():
157
+ status_lines.append("❌ 请先上传 MANO 模型文件")
158
+ return None, None, "\n".join(status_lines)
159
+
160
+ progress(0.1, desc="准备处理...")
161
+
162
+ # Determine the input data
163
+ if use_sample_data:
164
+ demo_name = "pick_and_place"
165
+ data_root = str(DATA_RAW_DIR)
166
+ status_lines.append(f"📂 使用示例数据: {demo_name}")
167
+ else:
168
+ if video_file is None:
169
+ status_lines.append("❌ 请上传视频或选择使用示例数据")
170
+ return None, None, "\n".join(status_lines)
171
+
172
+ # Create a temporary directory for the uploaded video
173
+ demo_name = "user_upload"
174
+ user_data_dir = DATA_RAW_DIR / demo_name / "0"
175
+ user_data_dir.mkdir(parents=True, exist_ok=True)
176
+
177
+ # Copy the video to the expected location
178
+ video_dest = user_data_dir / "video.mkv"
179
+ shutil.copy(video_file, video_dest)
180
+ data_root = str(DATA_RAW_DIR)
181
+ status_lines.append(f"📂 处理上传视频: {video_file}")
182
+
183
+ status_lines.append(f"🤖 机器人类型: {robot_type}")
184
+ status_lines.append(f"✋ 目标手部: {target_hand}")
185
+ status_lines.append(f"⚙️ 处理模式: {processing_mode}")
186
+ status_lines.append("-" * 40)
187
+
188
+ progress(0.2, desc="开始处理...")
189
+
190
+ # Build the processing command
191
+ cmd = [
192
+ sys.executable,
193
+ str(PHANTOM_DIR / "phantom" / "process_data.py"),
194
+ f"demo_name={demo_name}",
195
+ f"data_root_dir={data_root}",
196
+ f"processed_data_root_dir={str(DATA_PROCESSED_DIR)}",
197
+ f"mode={processing_mode}",
198
+ f"robot={robot_type}",
199
+ f"target_hand={target_hand}",
200
+ "bimanual_setup=single_arm",
201
+ "demo_num=0", # 只处理第一个 demo
202
+ ]
203
+
204
+ status_lines.append(f"命令: {' '.join(cmd)}")
205
+
206
+ try:
207
+ # 运行处理
208
+ progress(0.3, desc="处理中...")
209
+
210
+ process = subprocess.Popen(
211
+ cmd,
212
+ stdout=subprocess.PIPE,
213
+ stderr=subprocess.STDOUT,
214
+ text=True,
215
+ cwd=str(PHANTOM_DIR / "phantom"),
216
+ env={**os.environ, "PYTHONPATH": str(PHANTOM_DIR)}
217
+ )
218
+
219
+ output_lines = []
220
+ for line in iter(process.stdout.readline, ''):
221
+ line = line.strip()
222
+ if line:
223
+ output_lines.append(line)
224
+ # Update progress
225
+ if "BBOX" in line:
226
+ progress(0.4, desc="检测边界框...")
227
+ elif "HAND2D" in line:
228
+ progress(0.5, desc="提取2D手部姿态...")
229
+ elif "SEGMENTATION" in line:
230
+ progress(0.6, desc="分割手臂...")
231
+ elif "ACTION" in line:
232
+ progress(0.7, desc="提取动作...")
233
+ elif "INPAINT" in line:
234
+ progress(0.8, desc="视频修复...")
235
+ elif "ROBOT" in line:
236
+ progress(0.9, desc="叠加机器人...")
237
+
238
+ process.wait()
239
+
240
+ progress(1.0, desc="完成!")
241
+
242
+ # Append the processing output
243
+ status_lines.append("-" * 40)
244
+ status_lines.append("处理日志 (最后 20 行):")
245
+ status_lines.extend(output_lines[-20:])
246
+
247
+ # Locate the output files
248
+ output_video = None
249
+ output_data = None
250
+
251
+ processed_dir = DATA_PROCESSED_DIR / demo_name / "0"
252
+
253
+ # Find the generated video
254
+ video_pattern = f"video_overlay_{robot_type}_single_arm.mkv"
255
+ for f in processed_dir.glob("**/*.mkv"):
256
+ if robot_type.lower() in f.name.lower():
257
+ output_video = str(f)
258
+ break
259
+
260
+ # Find the training data
261
+ for f in processed_dir.glob("**/training_data*.npz"):
262
+ output_data = str(f)
263
+ break
264
+
265
+ if output_video:
266
+ status_lines.append(f"\n✅ 输出视频: {output_video}")
267
+ if output_data:
268
+ status_lines.append(f"✅ 训练数据: {output_data}")
269
+
270
+ if process.returncode == 0:
271
+ status_lines.insert(0, "✅ 处理完成!")
272
+ else:
273
+ status_lines.insert(0, f"⚠️ 处理完成但有警告 (返回码: {process.returncode})")
274
+
275
+ return output_video, output_data, "\n".join(status_lines)
276
+
277
+ except Exception as e:
278
+ import traceback
279
+ status_lines.append(f"\n❌ 处理错误: {str(e)}")
280
+ status_lines.append(traceback.format_exc())
281
+ return None, None, "\n".join(status_lines)
282
+
283
+ # ========== Gradio UI ==========
284
+ with gr.Blocks(
285
+ title="Phantom - 机器人视频生成器",
286
+ theme=gr.themes.Soft()
287
+ ) as demo:
288
+
289
+ gr.Markdown("""
290
+ # 🤖 Phantom - 将人类视频转换为机器人演示
291
+
292
+ **论文**: [Phantom: Training Robots Without Robots Using Only Human Videos](https://phantom-human-videos.github.io/)
293
+
294
+ 将人类手部操作视频自动转换为机器人演示数据,用于训练机器人策略。
295
+ """)
296
+
297
+ with gr.Tabs():
298
+ # ========== Environment setup tab ==========
299
+ with gr.TabItem("1️⃣ 环境设置"):
300
+ gr.Markdown("""
301
+ ### 首次使用需要完成以下步骤:
302
+
303
+ 1. **初始化环境** - 安装依赖和下载模型 (首次约 5-10 分钟)
304
+ 2. **上传 MANO 模型** - 需要从官网注册下载
305
+ """)
306
+
307
+ with gr.Row():
308
+ with gr.Column():
309
+ init_btn = gr.Button("🔧 初始化环境", variant="primary", size="lg")
310
+ init_output = gr.Textbox(
311
+ label="初始化状态",
312
+ lines=15,
313
+ value=get_status_text()
314
+ )
315
+
316
+ with gr.Column():
317
+ gr.Markdown("""
318
+ ### MANO 模型下载
319
+
320
+ 1. 访问 [MANO 官网](https://mano.is.tue.mpg.de/)
321
+ 2. 注册账号并下载模型
322
+ 3. 上传 `MANO_LEFT.pkl` 和 `MANO_RIGHT.pkl`
323
+ """)
324
+
325
+ mano_left = gr.File(label="MANO_LEFT.pkl", file_types=[".pkl"])
326
+ mano_right = gr.File(label="MANO_RIGHT.pkl", file_types=[".pkl"])
327
+ upload_btn = gr.Button("📤 上传 MANO 模型")
328
+ upload_output = gr.Textbox(label="上传状态", lines=5)
329
+
330
+ init_btn.click(fn=initialize_environment, outputs=init_output)
331
+ upload_btn.click(fn=upload_mano_files, inputs=[mano_left, mano_right], outputs=upload_output)
332
+
333
+ # ========== Video processing tab ==========
334
+ with gr.TabItem("2️⃣ 视频处理"):
335
+ with gr.Row():
336
+ with gr.Column():
337
+ gr.Markdown("### 输入设置")
338
+
339
+ use_sample = gr.Checkbox(
340
+ label="使用示例数据 (pick_and_place)",
341
+ value=True,
342
+ info="推荐首次使用时勾选,使用预置的示例视频"
343
+ )
344
+
345
+ video_input = gr.Video(
346
+ label="或上传自己的视频",
347
+ interactive=True
348
+ )
349
+
350
+ robot_type = gr.Dropdown(
351
+ choices=["Panda", "Kinova3", "UR5e", "IIWA", "Jaco"],
352
+ value="Panda",
353
+ label="机器人类型"
354
+ )
355
+
356
+ target_hand = gr.Radio(
357
+ choices=["left", "right"],
358
+ value="left",
359
+ label="目标手部"
360
+ )
361
+
362
+ processing_mode = gr.Dropdown(
363
+ choices=[
364
+ "bbox",
365
+ "hand2d",
366
+ "arm_segmentation",
367
+ "hand_inpaint",
368
+ "robot_inpaint",
369
+ "all"
370
+ ],
371
+ value="bbox",
372
+ label="处理模式",
373
+ info="建议逐步运行: bbox -> hand2d -> arm_segmentation -> hand_inpaint -> robot_inpaint"
374
+ )
375
+
376
+ process_btn = gr.Button("🚀 开始处理", variant="primary", size="lg")
377
+
378
+ with gr.Column():
379
+ gr.Markdown("### 输出结果")
380
+
381
+ video_output = gr.Video(label="生成的机器人视频")
382
+ data_output = gr.File(label="训练数据 (NPZ)")
383
+ status_output = gr.Textbox(label="处理状态", lines=20)
384
+
385
+ process_btn.click(
386
+ fn=process_video,
387
+ inputs=[video_input, robot_type, target_hand, processing_mode, use_sample],
388
+ outputs=[video_output, data_output, status_output]
389
  )
 
390
 
391
+ # ========== Instructions tab ==========
392
+ with gr.TabItem("📖 说明"):
393
+ gr.Markdown("""
394
+ ## 处理流程
395
+
396
+ Phantom 将人类手部视频转换为机器人演示数据,处理步骤:
397
+
398
+ | 步骤 | 模式 | 描述 |
399
+ |------|------|------|
400
+ | 1 | `bbox` | 检测手部边界框 |
401
+ | 2 | `hand2d` | 提取 2D 手部姿态 |
402
+ | 3 | `arm_segmentation` | 分割人类手臂 |
403
+ | 4 | `hand_inpaint` | 移除手臂并修复背景 |
404
+ | 5 | `robot_inpaint` | 叠加虚拟机器人 |
405
+
406
+ ## 输入要求
407
+
408
+ - **视频格式**: MKV, MP4 等常见格式
409
+ - **分辨率**: 推荐 1080p
410
+ - **内容**: 单手操作视频,手部需清晰可见
411
+
412
+ ## GPU Zero 限制
413
+
414
+ - 单次处理时间限制: 300 秒
415
+ - 建议逐步运行各处理模式
416
+ - 复杂视频可能需要多次处理
417
+
418
+ ## 参考资料
419
 
420
+ - [Phantom 论文](https://arxiv.org/abs/2503.00779)
421
+ - [GitHub 仓库](https://github.com/MarionLepert/phantom)
422
+ - [MANO 手部模型](https://mano.is.tue.mpg.de/)
423
+ """)
 
424
 
425
+ # Launch
426
  if __name__ == "__main__":
427
  demo.queue().launch()
phantom DELETED
@@ -1 +0,0 @@
1
- Subproject commit a8bb81c1bbe6ade129a1f6f0906482f510354a5e
 
 
phantom/.gitignore ADDED
@@ -0,0 +1,11 @@
1
+ *.egg-info
2
+ **/_DATA/*
3
+ data/raw/*
4
+ !data/raw/.gitkeep
5
+ data/processed/*
6
+ !data/processed/.gitkeep
7
+ **/__pycache__/*
8
+ *.pyc
9
+ *.pth
10
+ outputs/*
11
+ phantom/outputs/*
phantom/.gitmodules ADDED
@@ -0,0 +1,15 @@
1
+ [submodule "submodules/phantom-E2FGVI"]
2
+ path = submodules/phantom-E2FGVI
3
+ url = git@github.com:MarionLepert/phantom-E2FGVI.git
4
+ [submodule "submodules/sam2"]
5
+ path = submodules/sam2
6
+ url = git@github.com:facebookresearch/sam2.git
7
+ [submodule "submodules/phantom-robosuite"]
8
+ path = submodules/phantom-robosuite
9
+ url = git@github.com:MarionLepert/phantom-robosuite.git
10
+ [submodule "submodules/phantom-robomimic"]
11
+ path = submodules/phantom-robomimic
12
+ url = git@github.com:MarionLepert/phantom-robomimic.git
13
+ [submodule "submodules/phantom-hamer"]
14
+ path = submodules/phantom-hamer
15
+ url = git@github.com:MarionLepert/phantom-hamer.git
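
The submodule URLs above are SSH remotes (`git@github.com:`), which fail for anonymous or CI clones. A common workaround, sketched below as plain `git` commands driven from Python, is to map SSH GitHub URLs to HTTPS before initializing the submodules; the `insteadOf` rewrite is standard Git behavior rather than anything this repository configures.

```python
import subprocess


def git(*args: str) -> None:
    """Run a git command in the current repository and fail loudly on errors."""
    subprocess.run(("git", *args), check=True)


# Rewrite SSH GitHub URLs to HTTPS for this clone only, then fetch every
# submodule listed in .gitmodules (including nested ones).
git("config", "url.https://github.com/.insteadOf", "git@github.com:")
git("submodule", "update", "--init", "--recursive")
```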
phantom/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Stanford Interactive Perception and Robot Learning Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
phantom/README.md ADDED
@@ -0,0 +1,168 @@
1
+ # Code for Phantom and Masquerade
2
+ [![Python](https://img.shields.io/badge/python-3.10-blue)](https://www.python.org)
3
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
4
+ <hr style="border: 2px solid gray;"></hr>
5
+
6
+ This repository contains the code used to process human videos in [Phantom: Training Robots Without Robots Using Only Human Videos](https://phantom-human-videos.github.io/) and [Masquerade: Learning from In-the-wild Human Videos using Data-Editing](https://masquerade-robot.github.io/).
7
+
8
+ <table>
9
+ <tr>
10
+ <td align="center" width="50%">
11
+ <h3><a href="https://phantom-human-videos.github.io/">Phantom: Training Robots Without Robots Using Only Human Videos</a></h3>
12
+ <p><em><a href=https://marionlepert.github.io/>Marion Lepert</a></em>, <em><a href=https://jiayingfang.github.io/>Jiaying Fang</a></em>, <em><a href=https://web.stanford.edu/~bohg/>Jeannette Bohg</a></em></p>
13
+ <a href="https://phantom-human-videos.github.io/">
14
+ <img src="docs/teaser_phantom.png" alt="Phantom Teaser" width="90%">
15
+ </a>
16
+ </td>
17
+ <td align="center" width="50%">
18
+ <h3><a href="https://masquerade-robot.github.io/">Masquerade: Learning from In-the-wild Human Videos using Data-Editing</a></h3>
19
+ <p><em><a href=https://marionlepert.github.io/>Marion Lepert*</a></em>, <em><a href=https://jiayingfang.github.io/>Jiaying Fang*</a></em>, <em><a href=https://web.stanford.edu/~bohg/>Jeannette Bohg</a></em></p>
20
+ <img src="docs/teaser_masquerade.png" alt="Masquerade Teaser" width="90%">
21
+ </td>
22
+ </tr>
23
+ </table>
24
+
25
+ Both projects use data editing to convert human videos into “robotized” demonstrations. They share much of the same codebase, with some differences in the processing pipeline:
26
+
27
+ **Phantom**
28
+ * Input: RGBD videos with a single left hand visible in every frame.
29
+ * Data editing: inpaint the single human arm, overlay a rendered robot arm in the same pose.
30
+ * Action labels: extract full 3D end-effector pose (position, orientation, gripper)
31
+
32
+ **Masquerade**
33
+ * Input: RGB videos from [Epic Kitchens](https://epic-kitchens.github.io/2025); one or both hands may be visible, sometimes occluded.
34
+ * Data editing: segment and inpaint both arms, overlay a bimanual robot whose effectors follow the estimated poses (with a 3-4cm error along the depth direction due to lack of depth data)
35
+ * Action labels: use 2D projected waypoints as auxiliary supervision only (not full 3D actions)
36
+
37
+
38
+
39
+ ## Installation
40
+ 1. Clone this repo recursively
41
+
42
+ ```bash
43
+ git clone --recursive git@github.com:MarionLepert/phantom.git
44
+ ```
45
+
46
+ 2. Run the following script from the root directory to install the required conda environment.
47
+ ```bash
48
+ ./install.sh
49
+ ```
50
+
51
+ 3. Download the MANO hand models: register on the [MANO website](https://mano.is.tue.mpg.de/), download the left and right hand models, and place MANO_LEFT.pkl and MANO_RIGHT.pkl in the `$ROOT_DIR/submodules/phantom-hamer/_DATA/data/mano/` folder.
52
+
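
A missing MANO file only shows up later as a runtime error inside HaMeR, so it is worth checking the placement right after step 3. A minimal sketch, assuming the repository root is the current working directory:

```python
from pathlib import Path

MANO_DIR = Path("submodules/phantom-hamer/_DATA/data/mano")

for name in ("MANO_LEFT.pkl", "MANO_RIGHT.pkl"):
    f = MANO_DIR / name
    # The MANO pickles are a few megabytes each; a missing or empty file means
    # the registration/download step was not completed.
    ok = f.is_file() and f.stat().st_size > 0
    print(f"{f}: {'ok' if ok else 'MISSING'}")
```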
53
+ ## Getting Started
54
+ Process **Phantom** sample data (manually collected in-lab videos)
55
+ ```bash
56
+ conda activate phantom
57
+
58
+ python process_data.py demo_name=pick_and_place data_root_dir=../data/raw processed_data_root_dir=../data/processed mode=all
59
+ ```
60
+
61
+ Process **Masquerade** sample data ([Epic Kitchens](https://epic-kitchens.github.io/2025) video)
62
+ ```bash
63
+ conda activate phantom
64
+
65
+ python process_data.py demo_name=epic data_root_dir=../data/raw processed_data_root_dir=../data/processed mode=all --config-name=epic
66
+ ```
67
+
68
+
69
+ ## Codebase Overview
70
+
71
+ ### Process data
72
+ Each video is processed using the following steps (a command sketch follows the list):
73
+
74
+ 1. **Extract human hand bounding boxes**: `bbox_processor.py`
75
+ * `mode=bbox`
76
+
77
+ 2. **Extract 2d human hand poses**: `hand_processor.py`
78
+ * `mode=hand2d`: extract the 2d hand pose
79
+
80
+ 3. **Extract human and arm segmentation masks**: `segmentation_processor.py`
81
+ * `mode=hand_segmentation`: used for depth alignment in hand pose refinement (only works for hand3d)
82
+ * `mode=arm_segmentation`: needed in all cases to inpaint the human
83
+
84
+ 4. **Extract 3d human hand poses**: `hand_processor.py`
85
+ * `mode=hand3d`: extract the 3d hand pose (note: requires depth, and was only tested on the left hand)
86
+
87
+ 5. **Retarget human actions to robot actions**: `action_processor.py`
88
+ * `mode=action`
89
+
90
+ 6. **Smooth human poses**: `smoothing_processor.py`
91
+ * `mode=smoothing`
92
+
93
+ 7. **Remove hand from videos using inpainting**: `handinpaint_processor.py`
94
+ * `mode=hand_inpaint`
95
+ * Inpainting method [E2FGVI](https://arxiv.org/pdf/2204.02663) is used.
96
+
97
+ 8. **Overlay virtual robot on video**: `robotinpaint_processor.py`
98
+ * `mode=robot_inpaint`: overlay a single robot (default) or bimanual (epic mode) robot on the image
99
+
100
+
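
For long videos it is often easier to run these stages one at a time instead of `mode=all`. The sketch below drives `process_data.py` with Hydra overrides from Python, mirroring how the Space's `app.py` builds its command; the demo name and directories are the sample-data defaults, and the exact stage list is an illustration rather than a requirement.

```python
import subprocess
import sys

STAGES = ["bbox", "hand2d", "arm_segmentation", "action", "smoothing",
          "hand_inpaint", "robot_inpaint"]

for mode in STAGES:
    cmd = [
        sys.executable, "process_data.py",
        "demo_name=pick_and_place",
        "data_root_dir=../data/raw",
        "processed_data_root_dir=../data/processed",
        f"mode=[{mode}]",  # stages can also be grouped, e.g. 'mode=[bbox,hand2d]'
    ]
    print("running:", " ".join(cmd))
    subprocess.run(cmd, check=True)  # stop at the first failing stage
```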
101
+ ### Config reference (see configuration files in `configs/`)
102
+
103
+ | Flag | Type | Required | Choices | Description |
104
+ |------|------|----------|---------|-------------|
105
+ | `--demo_name` | `str` | ✅ | - | Name of the demonstration/dataset to process |
106
+ | `--mode` | `str` (multiple) | ✅ | `bbox`, `hand2d`, `hand3d`, `hand_segmentation`, `arm_segmentation`, `action`, `smoothing`, `hand_inpaint`, `robot_inpaint`, `all` | Processing modes to run (can specify multiple with e.g. `'mode=[bbox,hand2d]'`) |
107
+ | `--robot_name` | `str` | ✅ | `Panda`, `Kinova3`, `UR5e`, `IIWA`, `Jaco` | Type of robot to use for overlays |
108
+ | `--gripper_name` | `str` | ❌ | `Robotiq85` | Type of gripper to use |
109
+ | `--data_root_dir` | `str` | ❌ | - | Root directory containing raw video data |
110
+ | `--processed_data_root_dir` | `str` | ❌ | - | Root directory to save processed data |
111
+ | `--epic` | `bool` | ❌ | - | Use Epic-Kitchens dataset processing mode |
112
+ | `--bimanual_setup` | `str` | ❌ | `single_arm`, `shoulders` | Bimanual setup configuration to use (shoulders corresponds to the bimanual hardware configuration used in Masquerade) |
113
+ | `--target_hand` | `str` | ❌ | `left`, `right`, `both` | Which hand(s) to target for processing |
114
+ | `--camera_intrinsics` | `str` | ❌ | - | Path to camera intrinsics file |
115
+ | `--camera_extrinsics` | `str` | ❌ | - | Path to camera extrinsics file |
116
+ | `--input_resolution` | `int` | ❌ | - | Resolution of input videos |
117
+ | `--output_resolution` | `int` | ❌ | - | Resolution of output videos |
118
+ | `--depth_for_overlay` | `bool` | ❌ | - | Use depth information for overlays |
119
+ | `--demo_num` | `str` | ❌ | - | Process a single demo number instead of all demos |
120
+ | `--debug_cameras` | `str` (multiple) | ❌ | - | Additional camera names to include for debugging |
121
+ | `--constrained_hand` | `bool` | ❌ | - | Use constrained hand processing |
122
+ | `--render` | `bool` | ❌ | - | Render the robot overlay on the video |
123
+
124
+ **Note**: For a single-arm setup, specify `--bimanual_setup single_arm` along with `--target_hand left` or `--target_hand right`. For bimanual setups, use `--bimanual_setup shoulders`.
125
+
126
+ ### Camera details
127
+ * **Phantom**: a Zed2 camera was used to capture the sample data at HD1080 resolution.
128
+ * **Masquerade**: We used Epic-Kitchens videos with the camera intrinsics provided in the dataset. To use videos captured with a different camera or resolution, update the camera intrinsics and extrinsics files in `$ROOT_DIR/phantom/camera/`.
129
+
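
If you swap in your own camera, the intrinsics file only needs the per-eye pinhole parameters shown in `camera_intrinsics_HD1080.json` later in this commit. A minimal loading sketch; the `"left"` key and field names follow that file:

```python
import json
from pathlib import Path

intrinsics = json.loads(Path("phantom/camera/camera_intrinsics_HD1080.json").read_text())
left = intrinsics["left"]

# Pinhole parameters: focal lengths and principal point, in pixels.
fx, fy, cx, cy = left["fx"], left["fy"], left["cx"], left["cy"]
K = [[fx, 0.0, cx],
     [0.0, fy, cy],
     [0.0, 0.0, 1.0]]
print("intrinsic matrix K:", K)
print("horizontal FOV (deg):", left["h_fov"])
```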
130
+ ### Train policy
131
+ After processing the video data, the edited data can be used to train a policy. The following files should be used:
132
+
133
+ * Observations
134
+ * Phantom Samples: extract RGB images from `data/processed/pick_and_place/*/video_overlay_Panda_single_arm.mkv`
135
+ * Epic (In-the-wild Data) Samples: extract RGB images from `data/processed/epic/*/video_overlay_Kinova3_shoulders.mkv`
136
+
137
+ * Actions
138
+ * Phantom Samples: All data stored in `data/processed/pick_and_place/*/inpaint_processor/training_data_single_arm.npz`
139
+ * Epic (In-the-wild Data) Samples: All data stored in `data/processed/epic/*/inpaint_processor/training_data_shoulders.npz`
140
+
141
+
142
+ In Phantom, [Diffusion Policy](https://github.com/real-stanford/diffusion_policy) was used for policy training.
143
+
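
A minimal sketch of how the two outputs above could be paired for policy training. The demo index `0` and the array names stored in the NPZ are assumptions here (inspect `data.files` to see what the processors actually wrote); `mediapy` is installed by `install.sh`.

```python
import numpy as np
import mediapy as media

demo_dir = "data/processed/pick_and_place/0"  # one demo folder; the index is an assumption

# Observations: RGB frames with the rendered Panda overlaid on the scene.
frames = media.read_video(f"{demo_dir}/video_overlay_Panda_single_arm.mkv")

# Actions: retargeted end-effector data saved by the inpaint processor.
data = np.load(f"{demo_dir}/inpaint_processor/training_data_single_arm.npz")

print("frames:", frames.shape)        # (T, H, W, 3)
print("arrays in npz:", data.files)   # actual array names depend on the processors
```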
144
+
145
+ ## Citation
146
+ ```bibtex
147
+ @article{lepert2025phantomtrainingrobotsrobots,
148
+ title={Phantom: Training Robots Without Robots Using Only Human Videos},
149
+ author={Marion Lepert and Jiaying Fang and Jeannette Bohg},
150
+ year={2025},
151
+ eprint={2503.00779},
152
+ archivePrefix={arXiv},
153
+ primaryClass={cs.RO},
154
+ url={https://arxiv.org/abs/2503.00779},
155
+ }
156
+ ```
157
+
158
+ ```bibtex
159
+ @misc{lepert2025masqueradelearninginthewildhuman,
160
+ title={Masquerade: Learning from In-the-wild Human Videos using Data-Editing},
161
+ author={Marion Lepert and Jiaying Fang and Jeannette Bohg},
162
+ year={2025},
163
+ eprint={2508.09976},
164
+ archivePrefix={arXiv},
165
+ primaryClass={cs.RO},
166
+ url={https://arxiv.org/abs/2508.09976},
167
+ }
168
+ ```
phantom/configs/default.yaml ADDED
@@ -0,0 +1,30 @@
1
+ # Default configuration (PHANTOM paper settings)
2
+ debug: false
3
+ verbose: false
4
+ skip_existing: false
5
+ n_processes: 1
6
+ data_root_dir: "../data/raw_data/"
7
+ processed_data_root_dir: "../data/processed_data/"
8
+ demo_name: ""
9
+
10
+ # Processing settings
11
+ mode: ["bbox"] # Default processing mode - must be one of: bbox, hand2d, hand3d, hand_segmentation, arm_segmentation, action, smoothing, hand_inpaint, robot_inpaint, all
12
+ demo_num: null # Process specific demo number (null = process all)
13
+
14
+ # Additional settings
15
+ debug_cameras: []
16
+
17
+ # PHANTOM paper configuration (default)
18
+ input_resolution: 1080
19
+ output_resolution: 240
20
+ robot: "Panda"
21
+ gripper: "Robotiq85"
22
+ square: true
23
+ epic: false
24
+ bimanual_setup: "single_arm"
25
+ target_hand: "left"
26
+ constrained_hand: true
27
+ depth_for_overlay: true
28
+ render: false
29
+ camera_intrinsics: "camera/camera_intrinsics_HD1080.json"
30
+ camera_extrinsics: "camera/camera_extrinsics.json"
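
Every key in this file can be overridden on the command line in the usual Hydra way (e.g. `robot=Kinova3 'mode=[bbox,hand2d]'`). The sketch below resolves the same configuration in Python through Hydra's compose API, which helps when debugging which values a run will actually see; it assumes the hydra-core 1.3 pinned by `install.sh`, and the relative `config_path` is an assumption about where the script is run from.

```python
from hydra import compose, initialize
from omegaconf import OmegaConf

# config_path should point at the directory containing default.yaml and epic.yaml.
with initialize(config_path="../configs", version_base=None):
    cfg = compose(
        config_name="default",
        overrides=["robot=Kinova3", "mode=[bbox,hand2d]", "demo_name=pick_and_place"],
    )

print(OmegaConf.to_yaml(cfg))  # the merged settings a run would use
```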
phantom/configs/epic.yaml ADDED
@@ -0,0 +1,31 @@
1
+ # EPIC-KITCHENS configuration (Masquerade settings)
2
+ debug: false
3
+ verbose: false
4
+ skip_existing: false
5
+ n_processes: 1
6
+ data_root_dir: "../data/raw_data/"
7
+ processed_data_root_dir: "../data/processed_data/"
8
+ demo_name: ""
9
+
10
+ # Processing settings
11
+ mode: ["bbox"] # Default processing mode
12
+ demo_num: null # Process specific demo number (null = process all videos in the root folder)
13
+
14
+ # Additional settings
15
+ debug_cameras: [] # Add other robomimic cameras like sideview, etc. Warning: this significantly slows down the processing time
16
+
17
+
18
+ # EPIC-KITCHENS configuration override
19
+ input_resolution: 256
20
+ output_resolution: 256
21
+ robot: "Kinova3"
22
+ gripper: "Robotiq85"
23
+ square: false
24
+ epic: true
25
+ bimanual_setup: "shoulders"
26
+ target_hand: "both"
27
+ constrained_hand: false
28
+ depth_for_overlay: false
29
+ render: false
30
+ camera_intrinsics: "camera/camera_intrinsics_epic.json"
31
+ camera_extrinsics: "camera/camera_extrinsics_ego_bimanual_shoulders.json"
phantom/configs/sam2_hiera_l.yaml ADDED
@@ -0,0 +1,117 @@
1
+ # @package _global_
2
+
3
+ # Model
4
+ model:
5
+ _target_: sam2.modeling.sam2_base.SAM2Base
6
+ image_encoder:
7
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
8
+ scalp: 1
9
+ trunk:
10
+ _target_: sam2.modeling.backbones.hieradet.Hiera
11
+ embed_dim: 144
12
+ num_heads: 2
13
+ stages: [2, 6, 36, 4]
14
+ global_att_blocks: [23, 33, 43]
15
+ window_pos_embed_bkg_spatial_size: [7, 7]
16
+ window_spec: [8, 4, 16, 8]
17
+ neck:
18
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
19
+ position_encoding:
20
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
21
+ num_pos_feats: 256
22
+ normalize: true
23
+ scale: null
24
+ temperature: 10000
25
+ d_model: 256
26
+ backbone_channel_list: [1152, 576, 288, 144]
27
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
28
+ fpn_interp_model: nearest
29
+
30
+ memory_attention:
31
+ _target_: sam2.modeling.memory_attention.MemoryAttention
32
+ d_model: 256
33
+ pos_enc_at_input: true
34
+ layer:
35
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
36
+ activation: relu
37
+ dim_feedforward: 2048
38
+ dropout: 0.1
39
+ pos_enc_at_attn: false
40
+ self_attention:
41
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
42
+ rope_theta: 10000.0
43
+ feat_sizes: [64, 64]
44
+ embedding_dim: 256
45
+ num_heads: 1
46
+ downsample_rate: 1
47
+ dropout: 0.1
48
+ d_model: 256
49
+ pos_enc_at_cross_attn_keys: true
50
+ pos_enc_at_cross_attn_queries: false
51
+ cross_attention:
52
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
53
+ rope_theta: 10000.0
54
+ feat_sizes: [64, 64]
55
+ rope_k_repeat: True
56
+ embedding_dim: 256
57
+ num_heads: 1
58
+ downsample_rate: 1
59
+ dropout: 0.1
60
+ kv_in_dim: 64
61
+ num_layers: 4
62
+
63
+ memory_encoder:
64
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
65
+ out_dim: 64
66
+ position_encoding:
67
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
68
+ num_pos_feats: 64
69
+ normalize: true
70
+ scale: null
71
+ temperature: 10000
72
+ mask_downsampler:
73
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
74
+ kernel_size: 3
75
+ stride: 2
76
+ padding: 1
77
+ fuser:
78
+ _target_: sam2.modeling.memory_encoder.Fuser
79
+ layer:
80
+ _target_: sam2.modeling.memory_encoder.CXBlock
81
+ dim: 256
82
+ kernel_size: 7
83
+ padding: 3
84
+ layer_scale_init_value: 1e-6
85
+ use_dwconv: True # depth-wise convs
86
+ num_layers: 2
87
+
88
+ num_maskmem: 7
89
+ image_size: 1024
90
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
91
+ sigmoid_scale_for_mem_enc: 20.0
92
+ sigmoid_bias_for_mem_enc: -10.0
93
+ use_mask_input_as_output_without_sam: true
94
+ # Memory
95
+ directly_add_no_mem_embed: true
96
+ # use high-resolution feature map in the SAM mask decoder
97
+ use_high_res_features_in_sam: true
98
+ # output 3 masks on the first click on initial conditioning frames
99
+ multimask_output_in_sam: true
100
+ # SAM heads
101
+ iou_prediction_use_sigmoid: True
102
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
103
+ use_obj_ptrs_in_encoder: true
104
+ add_tpos_enc_to_obj_ptrs: false
105
+ only_obj_ptrs_in_the_past_for_eval: true
106
+ # object occlusion prediction
107
+ pred_obj_scores: true
108
+ pred_obj_scores_mlp: true
109
+ fixed_no_obj_ptr: true
110
+ # multimask tracking settings
111
+ multimask_output_for_tracking: true
112
+ use_multimask_token_for_obj_ptr: true
113
+ multimask_min_pt_num: 0
114
+ multimask_max_pt_num: 1
115
+ use_mlp_for_obj_ptr_proj: true
116
+ # Compilation flag
117
+ compile_image_encoder: False
phantom/data/__init__.py ADDED
File without changes
phantom/docs/teaser_masquerade.png ADDED

Git LFS Details

  • SHA256: 3f0f5355b51b44f98b8aced3b5c41255d3e9a04b0810a4d9b616c67e1ba05b9c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.28 MB
phantom/docs/teaser_phantom.png ADDED

Git LFS Details

  • SHA256: a79506ef23efac9c85af0805ca5e23ec59a6a90e0de7bc475cfde94bd793f9c0
  • Pointer size: 132 Bytes
  • Size of remote file: 3.09 MB
phantom/install.sh ADDED
@@ -0,0 +1,67 @@
1
+ eval "$(conda shell.bash hook)"
2
+ # ######################## Phantom Env ###############################
3
+ conda create -n phantom python=3.10 -y
4
+ conda activate phantom
5
+ conda install nvidia/label/cuda-12.1.0::cuda-toolkit -c nvidia/label/cuda-12.1.0 -y
6
+
7
+ # Install SAM2
8
+ cd submodules/sam2
9
+ pip install -v -e ".[notebooks]"
10
+ cd ../..
11
+
12
+ # Install Hamer
13
+ cd submodules/phantom-hamer
14
+ pip install -e .\[all\]
15
+ pip install -v -e third-party/ViTPose
16
+ wget https://www.cs.utexas.edu/~pavlakos/hamer/data/hamer_demo_data.tar.gz
17
+ tar --warning=no-unknown-keyword --exclude=".*" -xvf hamer_demo_data.tar.gz
18
+ cd ../..
19
+
20
+ # Install mmcv
21
+ pip install --index-url https://download.pytorch.org/whl/cu121 torch==2.1.0 torchvision==0.16.0
22
+ pip install mmcv==1.3.9
23
+ pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu121/torch2.1/index.html
24
+ pip install numpy==1.26.4
25
+
26
+ # Install phantom-robosuite
27
+ cd submodules/phantom-robosuite
28
+ pip install -e .
29
+ cd ../..
30
+
31
+ # Install phantom-robomimic
32
+ cd submodules/phantom-robomimic
33
+ pip install -e .
34
+ cd ../..
35
+
36
+ # Install additional packages
37
+ pip install joblib mediapy open3d pandas
38
+ pip install transformers==4.42.4
39
+ pip install PyOpenGL==3.1.4
40
+ pip install Rtree
41
+ pip install git+https://github.com/epic-kitchens/epic-kitchens-100-hand-object-bboxes.git
42
+ pip install protobuf==3.20.0
43
+ pip install hydra-core==1.3.2
44
+ pip install omegaconf==2.3.0
45
+
46
+ # Download E2FGVI weights
47
+ cd submodules/phantom-E2FGVI/E2FGVI/release_model/
48
+ pip install gdown
49
+ gdown --fuzzy https://drive.google.com/file/d/10wGdKSUOie0XmCr8SQ2A2FeDe-mfn5w3/view?usp=sharing
50
+ cd ../..
51
+
52
+ # Install phantom-E2FGVI
53
+ pip install -e .
54
+ cd ../..
55
+
56
+ # Install phantom
57
+ pip install -e .
58
+
59
+ # Download sample data
60
+ cd data/raw
61
+ wget https://download.cs.stanford.edu/juno/phantom/pick_and_place.zip
62
+ unzip pick_and_place.zip
63
+ rm pick_and_place.zip
64
+ wget https://download.cs.stanford.edu/juno/phantom/epic.zip
65
+ unzip epic.zip
66
+ rm epic.zip
67
+ cd ../..
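
After `install.sh` finishes, a quick check from inside the `phantom` conda environment can confirm that the pinned CUDA wheels and the sample data landed where the pipeline expects them. A minimal sketch; the package names follow the editable installs above, and the paths assume the repository root as the working directory.

```python
from pathlib import Path

import torch

print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())

# Sample datasets downloaded at the end of install.sh.
for demo in ("pick_and_place", "epic"):
    path = Path("data/raw") / demo
    print(f"{path}: {'ok' if path.is_dir() else 'missing'}")

# Editable installs from the submodules; an ImportError points at the failed step.
import hamer      # noqa: F401,E402
import robosuite  # noqa: F401,E402
print("hamer and robosuite import cleanly")
```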
phantom/phantom/__init__.py ADDED
File without changes
phantom/phantom/camera/__init__.py ADDED
File without changes
phantom/phantom/camera/camera_extrinsics.json ADDED
@@ -0,0 +1,42 @@
1
+ [
2
+ {
3
+ "camera_base_ori": [
4
+ [
5
+ 0.9842690634302423,
6
+ -0.053375086066005106,
7
+ 0.1684206369825258
8
+ ],
9
+ [
10
+ -0.1763762231197722,
11
+ -0.35235905397979306,
12
+ 0.9190944048336218
13
+ ],
14
+ [
15
+ 0.010287793357058851,
16
+ -0.934341584895969,
17
+ -0.3562302121408726
18
+ ]
19
+ ],
20
+ "camera_base_ori_rotvec": [
21
+ -1.930138005212092,
22
+ 0.16467696378244215,
23
+ -0.12809137765065973
24
+ ],
25
+ "camera_base_pos": [
26
+ 0.3407932803063093,
27
+ -0.40868423448040403,
28
+ 0.39911982578151795
29
+ ],
30
+ "camera_base_quat": [
31
+ 0.8204965462375373,
32
+ -0.07000374049084156,
33
+ 0.054451304871138306,
34
+ -0.564729979129313
35
+ ],
36
+ "p_marker_ee": [
37
+ -0.01874144739551215,
38
+ 0.029611448317719172,
39
+ -0.013687685723932594
40
+ ]
41
+ }
42
+ ]
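
The three orientation fields above are redundant encodings of the same camera-to-base rotation. The sketch below uses SciPy to confirm they agree; SciPy's availability and the `[x, y, z, w]` quaternion ordering are assumptions (the ordering does appear consistent with the rotation-vector field, but verify against the code that writes this file).

```python
import json

import numpy as np
from scipy.spatial.transform import Rotation as R

extr = json.load(open("phantom/camera/camera_extrinsics.json"))[0]

R_mat = np.array(extr["camera_base_ori"])
R_vec = R.from_rotvec(extr["camera_base_ori_rotvec"]).as_matrix()
R_quat = R.from_quat(extr["camera_base_quat"]).as_matrix()  # assumes [x, y, z, w]

# All three encodings should describe the same camera_base orientation.
print(np.allclose(R_mat, R_vec, atol=1e-5), np.allclose(R_mat, R_quat, atol=1e-5))
print("camera position in the base frame:", extr["camera_base_pos"])
```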
phantom/phantom/camera/camera_extrinsics_ego_bimanual_shoulders.json ADDED
@@ -0,0 +1,52 @@
1
+ [
2
+ {
3
+ "num_marker_seen": 114,
4
+ "stage2_retry": 11,
5
+ "pixel_error": 2.1157278874907863,
6
+ "proj_func": "hand_marker_proj_world_camera",
7
+ "intrinsics": {
8
+ "fx": 731.4708862304688,
9
+ "fy": 731.4708862304688,
10
+ "ppx": 646.266357421875,
11
+ "ppy": 355.9967956542969
12
+ },
13
+ "camera_base_ori": [
14
+ [
15
+ -0.7220417114840215,
16
+ 0.37764981440725887,
17
+ 0.579686453658689
18
+ ],
19
+ [
20
+ 0.020370475586732495,
21
+ 0.8491206965938227,
22
+ -0.527805917303316
23
+ ],
24
+ [
25
+ -0.6915495720493177,
26
+ -0.3692893991088662,
27
+ -0.6207934673498243
28
+ ]
29
+ ],
30
+ "camera_base_ori_rotvec": [
31
+ 0.2877344548443808,
32
+ 2.3075097094104504,
33
+ -0.6485227972051454
34
+ ],
35
+ "camera_base_pos": [
36
+ -0.5123627783256401,
37
+ -0.11387480700266536,
38
+ 0.3151264229148423
39
+ ],
40
+ "p_marker_ee": [
41
+ -0.041990731174163416,
42
+ -0.02636865486252487,
43
+ -0.01442948433864288
44
+ ],
45
+ "camera_base_quat": [
46
+ 0.11139014686225811,
47
+ 0.8933022830245745,
48
+ -0.25106152012025673,
49
+ 0.35576871621882866
50
+ ]
51
+ }
52
+ ]
phantom/phantom/camera/camera_intrinsics_HD1080.json ADDED
@@ -0,0 +1,48 @@
1
+ {
2
+ "left": {
3
+ "fx": 1057.7322998046875,
4
+ "fy": 1057.7322998046875,
5
+ "cx": 972.5150756835938,
6
+ "cy": 552.568359375,
7
+ "disto": [
8
+ 0.0,
9
+ 0.0,
10
+ 0.0,
11
+ 0.0,
12
+ 0.0,
13
+ 0.0,
14
+ 0.0,
15
+ 0.0,
16
+ 0.0,
17
+ 0.0,
18
+ 0.0,
19
+ 0.0
20
+ ],
21
+ "v_fov": 54.09259796142578,
22
+ "h_fov": 84.45639038085938,
23
+ "d_fov": 92.32276916503906
24
+ },
25
+ "right": {
26
+ "fx": 1057.7322998046875,
27
+ "fy": 1057.7322998046875,
28
+ "cx": 972.5150756835938,
29
+ "cy": 552.568359375,
30
+ "disto": [
31
+ 0.0,
32
+ 0.0,
33
+ 0.0,
34
+ 0.0,
35
+ 0.0,
36
+ 0.0,
37
+ 0.0,
38
+ 0.0,
39
+ 0.0,
40
+ 0.0,
41
+ 0.0,
42
+ 0.0
43
+ ],
44
+ "v_fov": 54.09259796142578,
45
+ "h_fov": 84.45639038085938,
46
+ "d_fov": 92.32276916503906
47
+ }
48
+ }
phantom/phantom/camera/camera_intrinsics_epic.json ADDED
@@ -0,0 +1,48 @@
1
+ {
2
+ "left": {
3
+ "fx": 248.7892127911359,
4
+ "fy": 248.7892127911359,
5
+ "cx": 228,
6
+ "cy": 128,
7
+ "disto": [
8
+ 0.0,
9
+ 0.0,
10
+ 0.0,
11
+ 0.0,
12
+ 0.0,
13
+ 0.0,
14
+ 0.0,
15
+ 0.0,
16
+ 0.0,
17
+ 0.0,
18
+ 0.0,
19
+ 0.0
20
+ ],
21
+ "v_fov": 54.6,
22
+ "h_fov": 83.21271514892578,
23
+ "d_fov": 91.07240295410156
24
+ },
25
+ "right": {
26
+ "fx": 248.7892127911359,
27
+ "fy": 248.7892127911359,
28
+ "cx": 228,
29
+ "cy": 128,
30
+ "disto": [
31
+ 0.0,
32
+ 0.0,
33
+ 0.0,
34
+ 0.0,
35
+ 0.0,
36
+ 0.0,
37
+ 0.0,
38
+ 0.0,
39
+ 0.0,
40
+ 0.0,
41
+ 0.0,
42
+ 0.0
43
+ ],
44
+ "v_fov": 54.6,
45
+ "h_fov": 83.21271514892578,
46
+ "d_fov": 91.07240295410156
47
+ }
48
+ }
phantom/phantom/detectors/detector_detectron2.py ADDED
@@ -0,0 +1,121 @@
1
+ """
2
+ Wrapper around detectron2 for object detection
3
+ """
4
+ import os
5
+ import numpy as np
6
+ from pathlib import Path
7
+ from typing import Tuple
8
+ import cv2
9
+ import logging
10
+ import mediapy as media
11
+ import requests
12
+ import hamer # type: ignore
13
+ from hamer.utils.utils_detectron2 import DefaultPredictor_Lazy # type: ignore
14
+ from detectron2.config import LazyConfig # type: ignore
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ def download_detectron_ckpt(root_dir: str, ckpt_path: str) -> None:
19
+ url = "https://dl.fbaipublicfiles.com/detectron2/ViTDet/COCO/cascade_mask_rcnn_vitdet_h/f328730692/model_final_f05665.pkl"
20
+ save_path = Path(root_dir, ckpt_path)
21
+ save_path.parent.mkdir(exist_ok=True, parents=True)
22
+ response = requests.get(url, stream=True)
23
+ if response.status_code == 200:
24
+ with open(save_path, "wb") as file:
25
+ for chunk in response.iter_content(chunk_size=8192):
26
+ file.write(chunk)
27
+ logger.info(f"File downloaded successfully and saved to {save_path}")
28
+ else:
29
+ logger.info(f"Failed to download the file. Status code: {response.status_code}")
30
+
31
+
32
+ class DetectorDetectron2:
33
+ def __init__(self, root_dir: str):
34
+ cfg_path = (Path(hamer.__file__).parent / "configs" / "cascade_mask_rcnn_vitdet_h_75ep.py")
35
+ detectron2_cfg = LazyConfig.load(str(cfg_path))
36
+
37
+ detectron2_cfg.train.init_checkpoint = os.path.join(
38
+ root_dir, "_DATA/detectron_ckpts/model_final_f05665.pkl"
39
+ )
40
+ if not os.path.exists(detectron2_cfg.train.init_checkpoint):
41
+ download_detectron_ckpt(
42
+ root_dir, "_DATA/detectron_ckpts/model_final_f05665.pkl"
43
+ )
44
+ for predictor in detectron2_cfg.model.roi_heads.box_predictors:
45
+ predictor.test_score_thresh = 0.25
46
+ self.detectron2 = DefaultPredictor_Lazy(detectron2_cfg)
47
+
48
+ def get_bboxes(self, img: np.ndarray, visualize: bool=False,
49
+ visualize_wait: bool=True) -> Tuple[np.ndarray, np.ndarray]:
50
+ """ Get bounding boxes and scores for the detected hand in the image """
51
+ det_out = self.detectron2(img)
52
+
53
+ det_instances = det_out["instances"]
54
+ valid_idx = (det_instances.pred_classes == 0) & (det_instances.scores > 0.5)
55
+ pred_bboxes = det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
56
+ pred_scores = det_instances.scores[valid_idx].cpu().numpy()
57
+
58
+ if visualize:
59
+ img_rgb = img.copy()
60
+ img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
61
+ for bbox, score in zip(pred_bboxes, pred_scores):
62
+ cv2.rectangle(
63
+ img_bgr,
64
+ (int(bbox[0]), int(bbox[1])),
65
+ (int(bbox[2]), int(bbox[3])),
66
+ (0, 255, 0),
67
+ 2,
68
+ )
69
+ cv2.putText(img_bgr,
70
+ f"{score:.4f}",
71
+ (int(bbox[0]), int(bbox[1])),
72
+ cv2.FONT_HERSHEY_SIMPLEX,
73
+ 1,
74
+ (0, 255, 0),
75
+ 2,
76
+ cv2.LINE_AA)
77
+
78
+ cv2.imshow(f"Detected bounding boxes", img_bgr)
79
+ if visualize_wait:
80
+ cv2.waitKey(0)
81
+ else:
82
+ cv2.waitKey(1)
83
+
84
+ return pred_bboxes, pred_scores
85
+
86
+ def get_best_bbox(self, img: np.ndarray, visualize: bool=False,
87
+ visualize_wait: bool=True) -> Tuple[np.ndarray, float]:
88
+ """ Get the best bounding box and score for the detected hand in the image """
89
+ bboxes, scores = self.get_bboxes(img)
90
+ if len(bboxes) == 0:
91
+ logger.info("No bbox found with Detectron")
92
+ return np.array([]), 0
93
+ best_idx = scores.argmax()
94
+ best_bbox, best_score = bboxes[best_idx], scores[best_idx]
95
+
96
+ if visualize:
97
+ img_rgb = img.copy()
98
+ img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
99
+ cv2.rectangle(
100
+ img_bgr,
101
+ (int(best_bbox[0]), int(best_bbox[1])),
102
+ (int(best_bbox[2]), int(best_bbox[3])),
103
+ (0, 255, 0),
104
+ 2,
105
+ )
106
+ cv2.putText(img_bgr,
107
+ f"{best_score:.4f}",
108
+ (int(best_bbox[0]), int(best_bbox[1])),
109
+ cv2.FONT_HERSHEY_SIMPLEX,
110
+ 1,
111
+ (0, 255, 0),
112
+ 2,
113
+ cv2.LINE_AA)
114
+
115
+ cv2.imshow(f"Best detected bounding box", img_bgr)
116
+ if visualize_wait:
117
+ cv2.waitKey(0)
118
+ else:
119
+ cv2.waitKey(1)
120
+
121
+ return best_bbox, best_score
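
A minimal usage sketch for the detector above. The `root_dir` (where the `_DATA/detectron_ckpts/` cache lives; the checkpoint is downloaded on first use) and the sample video path are assumptions for illustration only.

```python
import mediapy as media

from phantom.detectors.detector_detectron2 import DetectorDetectron2

# Assumed cache location; any directory the checkpoint can be downloaded into works.
detector = DetectorDetectron2(root_dir="../submodules/phantom-hamer")

# Grab one frame of a raw demo video (path is illustrative).
frame = media.read_video("../data/raw/pick_and_place/0/video.mkv")[0]

bbox, score = detector.get_best_bbox(frame)
if bbox.size:
    print(f"best bbox (xyxy): {bbox}, score: {score:.3f}")
else:
    print("no detection above threshold")
```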
phantom/phantom/detectors/detector_dino.py ADDED
@@ -0,0 +1,108 @@
1
+ """
2
+ Wrapper around DINO-V2 for object detection
3
+ """
4
+ from typing import Sequence, Tuple, Optional
5
+ import numpy as np
6
+ from transformers import pipeline # type: ignore
7
+ from PIL import Image
8
+ import cv2
9
+ import logging
10
+
11
+ from phantom.utils.image_utils import DetectionResult
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ class DetectorDino:
16
+ def __init__(self, detector_id: str):
17
+ self.detector = pipeline(
18
+ model=detector_id,
19
+ task="zero-shot-object-detection",
20
+ device="cuda",
21
+ batch_size=4,
22
+ )
23
+
24
+ def get_bboxes(self, frame: np.ndarray, object_name: str, threshold: float = 0.4,
25
+ visualize: bool = False, pause_visualization: bool = True) -> Tuple[np.ndarray, np.ndarray]:
26
+ """
27
+ Detect objects in a frame and return their bounding boxes and confidence scores.
28
+
29
+ Args:
30
+ frame: Input image as numpy array in RGB format
31
+ object_name: Target object category to detect
32
+ threshold: Confidence threshold for detection (0.0-1.0)
33
+ visualize: If True, displays detection results visually
34
+ pause_visualization: If True, waits for key press when visualizing
35
+
36
+ Returns:
37
+ Tuple of (bounding_boxes, confidence_scores) as numpy arrays
38
+ Empty arrays if no objects detected
39
+ """
40
+ img_pil = Image.fromarray(frame)
41
+ labels = [f"{object_name}."]
42
+ results = self.detector(img_pil, candidate_labels=labels, threshold=threshold)
43
+ results = [DetectionResult.from_dict(result) for result in results]
44
+ if not results:
45
+ return np.array([]), np.array([])
46
+ bboxes = np.array([np.array(result.box.xyxy) for result in results])
47
+ scores = np.array([result.score for result in results])
48
+
49
+ if visualize:
50
+ img_rgb = frame.copy()
51
+ img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
52
+ for bbox, score in zip(bboxes, scores):
53
+ cv2.rectangle(
54
+ img_bgr,
55
+ (int(bbox[0]), int(bbox[1])),
56
+ (int(bbox[2]), int(bbox[3])),
57
+ (0, 255, 0),
58
+ 2,
59
+ )
60
+ cv2.putText(img_bgr,
61
+ f"{score:.4f}",
62
+ (int(bbox[0]), int(bbox[1])),
63
+ cv2.FONT_HERSHEY_SIMPLEX,
64
+ 1,
65
+ (0, 255, 0),
66
+ 2,
67
+ cv2.LINE_AA)
68
+ cv2.imshow("Detection", img_bgr)
69
+ if pause_visualization:
70
+ cv2.waitKey(0)
71
+ else:
72
+ cv2.waitKey(1)
73
+ return bboxes, scores
74
+
75
+
76
+ def get_best_bbox(self, frame: np.ndarray, object_name: str, threshold: float = 0.4,
77
+ visualize: bool = False, pause_visualization: bool = True) -> Optional[np.ndarray]:
78
+ bboxes, scores = self.get_bboxes(frame, object_name, threshold)
79
+ if len(bboxes) == 0:
80
+ return None
81
+ best_idx = np.array(scores).argmax()
82
+ best_bbox, best_score = bboxes[best_idx], scores[best_idx]
83
+
84
+ if visualize:
85
+ img_rgb = frame.copy()
86
+ img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)
87
+ cv2.rectangle(
88
+ img_bgr,
89
+ (best_bbox[0], best_bbox[1]),
90
+ (best_bbox[2], best_bbox[3]),
91
+ (0, 255, 0),
92
+ 2,
93
+ )
94
+ cv2.putText(img_bgr,
95
+ f"{best_score:.4f}",
96
+ (int(best_bbox[0]), int(best_bbox[1])),
97
+ cv2.FONT_HERSHEY_SIMPLEX,
98
+ 1,
99
+ (0, 255, 0),
100
+ 2,
101
+ cv2.LINE_AA)
102
+ cv2.imshow("Detection", img_bgr)
103
+ if pause_visualization:
104
+ cv2.waitKey(0)
105
+ else:
106
+ cv2.waitKey(1)
107
+ return best_bbox
108
+
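
A usage sketch for the zero-shot detector above. The `detector_id` is an assumption (any Hugging Face checkpoint that supports the `zero-shot-object-detection` pipeline, such as a Grounding-DINO model, should work), and the frame path is illustrative.

```python
import mediapy as media

from phantom.detectors.detector_dino import DetectorDino

# Model id is an assumption; substitute whichever grounding model you use.
detector = DetectorDino(detector_id="IDEA-Research/grounding-dino-tiny")

frame = media.read_video("../data/raw/pick_and_place/0/video.mkv")[0]  # illustrative path

bboxes, scores = detector.get_bboxes(frame, object_name="hand", threshold=0.4)
print(f"{len(bboxes)} hand candidates")

best = detector.get_best_bbox(frame, object_name="hand")
if best is not None:
    print("best bbox (xyxy):", best)
```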
phantom/phantom/detectors/detector_hamer.py ADDED
@@ -0,0 +1,447 @@
1
+ """
2
+ Wrapper around HaMeR for hand pose estimation
3
+ """
4
+ import os
5
+ import logging
6
+ import numpy as np
7
+ from pathlib import Path
8
+ from typing import Optional, Tuple
9
+
10
+ import cv2
11
+ import torch
12
+ from hamer.utils import recursive_to # type: ignore
13
+ import matplotlib.pyplot as plt
14
+
15
+ from hamer.models import HAMER, DEFAULT_CHECKPOINT # type: ignore
16
+ import sys
17
+ import os
18
+ # Add the phantom-hamer directory to Python path for vitpose_model import
19
+ hamer_path = os.path.join(os.path.dirname(__file__), '..', '..', 'submodules', 'phantom-hamer')
20
+ if hamer_path not in sys.path:
21
+ sys.path.insert(0, hamer_path)
22
+ from vitpose_model import ViTPoseModel # type: ignore
23
+ from hamer.datasets.vitdet_dataset import ViTDetDataset # type: ignore
24
+ from hamer.utils.renderer import cam_crop_to_full # type: ignore
25
+ from hamer.utils.geometry import perspective_projection # type: ignore
26
+ from hamer.configs import get_config # type: ignore
27
+ from yacs.config import CfgNode as CN # type: ignore
28
+
29
+ from phantom.utils.data_utils import get_parent_folder_of_package
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+ THUMB_VERTEX = 756
34
+ INDEX_FINGER_VERTEX = 350
35
+
36
+ class DetectorHamer:
37
+ """
38
+ Detector using the HaMeR model for 3D hand pose estimation.
39
+
40
+ The detection pipeline consists of:
41
+ - Initial hand detection using general object detectors
42
+ - Hand type classification (left/right) using ViTPose
43
+ - 3D pose estimation using HaMeR
44
+ - MANO parameters estimation for mesh reconstruction
45
+
46
+ Dependencies:
47
+ - HaMeR model for 3D pose estimation
48
+ - ViTPose for keypoint detection
49
+ - DINO and Detectron2 for initial hand detection
50
+ """
51
+ def __init__(self):
52
+ root_dir = get_parent_folder_of_package("hamer")
53
+ checkpoint_path = Path(root_dir, DEFAULT_CHECKPOINT)
54
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
55
+
56
+ self.rescale_factor = 2.0 # Factor for padding the box
57
+ self.batch_size = 1 # Batch size for inference
58
+
59
+ self.model, self.model_cfg = self.load_hamer_model(checkpoint_path, root_dir)
60
+ self.model.to(self.device)
61
+ self.model.eval()
62
+
63
+ root_dir = "../submodules/phantom-hamer/"
64
+ vit_dir = os.path.join(root_dir, "third-party/ViTPose/")
65
+ self.cpm = ViTPoseModel(device=self.device, root_dir=root_dir, vit_dir=vit_dir)
66
+
67
+ self.faces_right = self.model.mano.faces
68
+ self.faces_left = self.faces_right[:,[0,2,1]]
69
+
70
+ def detect_hand_keypoints(self,
71
+ img: np.ndarray,
72
+ hand_side: str,
73
+ visualize: bool=False,
74
+ visualize_3d: bool=False,
75
+ pause_visualization: bool=True,
76
+ bboxes: Optional[np.ndarray]=None,
77
+ is_right: Optional[np.ndarray]=None,
78
+ kpts_2d_only: Optional[bool]=False,
79
+ camera_params: Optional[dict]=None) -> Optional[dict]:
80
+ """
81
+ Detect hand keypoints in the input image.
82
+
83
+ The method performs the following steps:
84
+ 1. Detect hand bounding boxes using object detectors
85
+ 2. Optionally refine boxes using ViTPose to determine hand type (left/right)
86
+ 3. Run HaMeR model to estimate 3D hand pose
87
+ 4. Project 3D keypoints back to 2D for visualization
88
+
89
+ Args:
90
+ img: Input RGB image as numpy array
91
+ hand_side: Target hand side to detect (left or right)
92
+ visualize: If True, displays detection results in a window
93
+ visualize_3d: If True, shows 3D visualization of keypoints and mesh
94
+ pause_visualization: If True, waits for key press when visualizing
95
+ bboxes: Bounding boxes of the hands
96
+ is_right: Per-box flags indicating whether each detected hand is a right hand
97
+ kpts_2d_only: If True, only the 2D keypoints are needed, so HaMeR's default
98
+ focal length is used instead of the real camera intrinsics
99
+ camera_params: Optional camera intrinsics (fx, fy, cx, cy)
100
+
101
+ Returns:
102
+ Dictionary containing:
103
+ - annotated_img: Image with keypoints drawn
104
+ - success: Whether detection was successful (21 keypoints found)
105
+ - kpts_3d: 3D keypoints in camera space
106
+ - kpts_2d: 2D keypoints projected onto image
107
+ - verts: 3D mesh vertices
108
+ - T_cam_pred: Camera transformation matrix
109
+ - Various camera parameters and MANO pose parameters
110
+ """
111
+ if not kpts_2d_only:
112
+ scaled_focal_length, camera_center = self.get_image_params(img, camera_params)
113
+ else:
114
+ scaled_focal_length, camera_center = self.get_image_params(img, camera_params=None)
115
+
116
+
117
+ dataset = ViTDetDataset(self.model_cfg, img, bboxes, is_right, rescale_factor=self.rescale_factor)
118
+ dataloader = torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=False, num_workers=0)
119
+
120
+ list_2d_kpts, list_3d_kpts, list_verts = [], [], []
121
+ T_cam_pred_all: list[torch.Tensor] = []
122
+ list_global_orient = []
123
+ kpts_2d_hamer = None
124
+ for batch in dataloader:
125
+ batch = recursive_to(batch, "cuda")
126
+ with torch.no_grad():
127
+ out = self.model(batch)
128
+
129
+ batch_T_cam_pred_all = DetectorHamer.get_all_T_cam_pred(batch, out, scaled_focal_length)
130
+
131
+ for idx in range(len(batch_T_cam_pred_all)):
132
+ kpts_3d = out["pred_keypoints_3d"][idx].detach().cpu().numpy() # [21, 3]
133
+ verts = out["pred_vertices"][idx].detach().cpu().numpy() # [778, 3]
134
+ is_right = batch["right"][idx].cpu().numpy()
135
+ global_orient = out["pred_mano_params"]["global_orient"][idx].detach().cpu().numpy()
136
+ hand_pose = out["pred_mano_params"]["hand_pose"][idx].detach().cpu().numpy()
137
+ list_global_orient.append(global_orient)
138
+
139
+ if hand_side == "left":
140
+ kpts_3d, verts = DetectorHamer.convert_right_hand_keypoints_to_left_hand(kpts_3d, verts)
141
+
142
+ T_cam_pred = batch_T_cam_pred_all[idx]
143
+
144
+ img_w, img_h = batch["img_size"][idx].float()
145
+
146
+ kpts_2d_hamer = DetectorHamer.project_3d_kpt_to_2d(kpts_3d, img_w, img_h, scaled_focal_length,
147
+ camera_center, T_cam_pred)
148
+
149
+ # Keep T_cam_pred as tensor
150
+ list_2d_kpts.append(kpts_2d_hamer)
151
+ list_3d_kpts.append(kpts_3d + T_cam_pred.cpu().numpy())
152
+ list_verts.append(verts + T_cam_pred.cpu().numpy())
153
+
154
+ T_cam_pred_all += batch_T_cam_pred_all
155
+
156
+ annotated_img = DetectorHamer.visualize_2d_kpt_on_img(
157
+ kpts_2d=list_2d_kpts[0],
158
+ img=img,
159
+ )
160
+
161
+ if visualize:
162
+ if bboxes is not None:
163
+ cv2.rectangle(annotated_img, (int(bboxes[0][0]), int(bboxes[0][1])), (int(bboxes[0][2]), int(bboxes[0][3])), (0, 255, 0), 2)
164
+ cv2.imshow("Annotated Image", annotated_img)
165
+ cv2.waitKey(0 if pause_visualization else 1)
166
+
167
+ if visualize_3d:
168
+ DetectorHamer.visualize_keypoints_3d(annotated_img, list_3d_kpts[0], list_verts[0])
169
+
170
+
171
+ return {
172
+ "annotated_img": annotated_img,
173
+ "success": len(list_2d_kpts[0]) == 21,
174
+ "kpts_3d": list_3d_kpts[0],
175
+ "kpts_2d": np.rint(list_2d_kpts[0]).astype(np.int32),
176
+ "verts": list_verts[0],
177
+ "T_cam_pred": T_cam_pred_all[0],
178
+ "scaled_focal_length": scaled_focal_length,
179
+ "camera_center": camera_center,
180
+ "img_w": img_w,
181
+ "img_h": img_h,
182
+ "global_orient": list_global_orient[0],
183
+ "hand_pose": hand_pose,
184
+ }
185
+
186
+ def get_image_params(self, img: np.ndarray, camera_params: Optional[dict]) -> Tuple[float, torch.Tensor]:
187
+ """
188
+ Get the scaled focal length and camera center.
189
+ """
190
+ img_w = img.shape[1]
191
+ img_h = img.shape[0]
192
+ if camera_params is not None:
193
+ scaled_focal_length = camera_params["fx"]
194
+ cx = camera_params["cx"]
195
+ cy = camera_params["cy"]
196
+ camera_center = torch.tensor([img_w-cx, img_h-cy])
197
+ else:
198
+ scaled_focal_length = (self.model_cfg.EXTRA.FOCAL_LENGTH / self.model_cfg.MODEL.IMAGE_SIZE
199
+ * max(img_w, img_h))
200
+ camera_center = torch.tensor([img_w, img_h], dtype=torch.float).reshape(1, 2) / 2.0
201
+ return scaled_focal_length, camera_center
202
+
203
+ @staticmethod
204
+ def convert_right_hand_keypoints_to_left_hand(kpts, verts):
205
+ """
206
+ Convert right hand keypoints/vertices to left hand by mirroring across the Y-Z plane.
207
+
208
+ This is done by flipping the X coordinates of both keypoints and vertices.
209
+ The MANO model internally uses right hand, so this conversion is needed
210
+ when processing left hands.
211
+
212
+ Args:
213
+ kpts: 3D keypoints [21, 3]
214
+ verts: 3D mesh vertices [778, 3]
215
+
216
+ Returns:
217
+ Transformed keypoints and vertices
218
+ """
219
+ kpts[:,0] = -kpts[:,0]
220
+ verts[:,0] = -verts[:,0]
221
+ return kpts, verts
222
+
223
+ @staticmethod
224
+ def visualize_keypoints_3d(annotated_img: np.ndarray, kpts_3d: np.ndarray, verts: np.ndarray) -> None:
225
+ nfingers = len(kpts_3d) - 1
226
+ npts_per_finger = 4
227
+ list_fingers = [np.vstack([kpts_3d[0], kpts_3d[i:i + npts_per_finger]]) for i in range(1, nfingers, npts_per_finger)]
228
+ finger_colors_bgr = [(0, 255, 0), (0, 0, 255), (255, 0, 0), (255, 0, 255), (0, 255, 255)]
229
+ finger_colors_rgb = [(color[2], color[1], color[0]) for color in finger_colors_bgr]
230
+ fig, axs = plt.subplots(1,2, figsize=(20, 10))
231
+ axs[0] = fig.add_subplot(111, projection='3d')
232
+ for finger_idx, finger_pts in enumerate(list_fingers):
233
+ for i in range(len(finger_pts) - 1):
234
+ color = finger_colors_rgb[finger_idx]
235
+ axs[0].plot(
236
+ [finger_pts[i][0], finger_pts[i + 1][0]],
237
+ [finger_pts[i][1], finger_pts[i + 1][1]],
238
+ [finger_pts[i][2], finger_pts[i + 1][2]],
239
+ color=np.array(color)/255.0,
240
+ )
241
+ axs[0].scatter(kpts_3d[:, 0], kpts_3d[:, 1], kpts_3d[:, 2])
242
+ axs[0].scatter(verts[:, 0], verts[:, 1], verts[:, 2])
243
+ annotated_img_rgb = cv2.cvtColor(annotated_img, cv2.COLOR_BGR2RGB)
244
+ axs[1].imshow(annotated_img_rgb)
245
+
246
+ fig = plt.figure()
247
+ ax = fig.add_subplot(111)
248
+ ax.imshow(annotated_img_rgb)
249
+
250
+ plt.show()
251
+
252
+ @staticmethod
253
+ def get_all_T_cam_pred(batch: dict, out: dict, scaled_focal_length: float) -> torch.Tensor:
254
+ """
255
+ Get the camera transformation matrix
256
+ """
257
+ multiplier = 2 * batch["right"] - 1
258
+ pred_cam = out["pred_cam"]
259
+ pred_cam[:, 1] = multiplier * pred_cam[:, 1]
260
+ box_center = batch["box_center"].float()
261
+ box_size = batch["box_size"].float()
262
+ # NOTE: FOR HaMeR, they are using the img_size as (W, H)
263
+ W_H_shapes = batch["img_size"].float()
264
+
265
+ multiplier = 2 * batch["right"] - 1
266
+ T_cam_pred_all = cam_crop_to_full(
267
+ pred_cam, box_center, box_size, W_H_shapes, scaled_focal_length
268
+ )
269
+
270
+ return T_cam_pred_all
271
+
272
+ @staticmethod
273
+ def visualize_2d_kpt_on_img(kpts_2d: np.ndarray, img: np.ndarray) -> np.ndarray:
274
+ """
275
+ Plot 2D hand keypoints on the image with finger connections.
276
+
277
+ Each finger is drawn with a different color:
278
+ - Thumb: Green
279
+ - Index: Blue
280
+ - Middle: Red
281
+ - Ring: Magenta
282
+ - Pinky: Cyan
283
+
284
+ Args:
285
+ kpts_2d: 2D keypoints as integers [21, 2]
286
+ img: Input RGB image
287
+
288
+ Returns:
289
+ Image with keypoints and connections drawn (BGR format)
290
+ """
291
+ img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
292
+ pts = kpts_2d.astype(np.int32)
293
+ nfingers = len(pts) - 1
294
+ npts_per_finger = 4
295
+ list_fingers = [np.vstack([pts[0], pts[i:i + npts_per_finger]]) for i in range(1, nfingers, npts_per_finger)]
296
+ finger_colors = [(0, 255, 0), (0, 0, 255), (255, 0, 0), (255, 0, 255), (0, 255, 255)]
297
+ thickness = 5 if img_bgr.shape[0] > 1000 else 2
298
+ for finger_idx, finger_pts in enumerate(list_fingers):
299
+ for i in range(len(finger_pts) - 1):
300
+ color = finger_colors[finger_idx]
301
+ cv2.line(
302
+ img_bgr,
303
+ tuple(finger_pts[i]),
304
+ tuple(finger_pts[i + 1]),
305
+ color,
306
+ thickness=thickness,
307
+ )
308
+
309
+ # cv2.line(img_bgr, [1787, 1522], [1656, 1400], (255, 0, 0), thickness=thickness)  # debug-only overlay for one specific image; not valid for arbitrary inputs
310
+
311
+ for pt in pts:
312
+ cv2.circle(img_bgr, (pt[0], pt[1]), radius=thickness, color=(0,0,0), thickness=thickness-1)
313
+
314
+ return img_bgr
315
+
316
+
317
+ @staticmethod
318
+ def project_3d_kpt_to_2d(kpts_3d: torch.Tensor, img_w: int, img_h: int, scaled_focal_length: float,
319
+ camera_center: torch.Tensor, T_cam: Optional[torch.Tensor] = None,) -> np.ndarray:
320
+ """
321
+ Project 3D keypoints to 2D image coordinates using perspective projection.
322
+ """
323
+ batch_size = 1
324
+
325
+ rotation = torch.eye(3).unsqueeze(0)
326
+ assert T_cam is not None
327
+
328
+ T_cam = T_cam.cpu()
329
+ kpts_3d = torch.tensor(kpts_3d).cpu()
330
+
331
+ T_cam = T_cam.clone().cuda()
332
+ kpts_3d = kpts_3d.clone().cuda()
333
+ rotation = rotation.cuda()
334
+
335
+ scaled_focal_length_full = torch.tensor([scaled_focal_length, scaled_focal_length]).reshape(1, 2)
336
+
337
+ # IMPORTANT: The perspective_projection function assumes T_cam has not been added to kpts_3d already!
338
+ kpts_2d = perspective_projection(
339
+ kpts_3d.reshape(batch_size, -1, 3),
340
+ rotation=rotation.repeat(batch_size, 1, 1),
341
+ translation=T_cam.reshape(batch_size, -1),
342
+ focal_length=scaled_focal_length_full.repeat(batch_size, 1),
343
+ camera_center=camera_center.repeat(batch_size, 1),
344
+ ).reshape(batch_size, -1, 2)
345
+ kpts_2d = kpts_2d[0].cpu().numpy()
346
+
347
+ return np.rint(kpts_2d).astype(np.int32)
348
+
349
+ @staticmethod
350
+ def annotate_bboxes_on_img(img: np.ndarray, debug_bboxes: dict) -> np.ndarray:
351
+ """
352
+ Annotate bounding boxes on the image.
353
+
354
+ :param img: Input image (numpy array)
355
+ :param debug_bboxes: Dictionary containing different sets of bounding boxes and optional scores
356
+ :return: Annotated image
357
+ """
358
+ color_dict = {
359
+ "dino_bboxes": (0, 255, 0),
360
+ "det_bboxes": (0, 0, 255),
361
+ "refined_bboxes": (255, 0, 0),
362
+ "filtered_bboxes": (255, 255, 0),
363
+ }
364
+ corner_dict = {
365
+ "dino_bboxes": "top_left",
366
+ "det_bboxes": "top_right",
367
+ "refined_bboxes": "bottom_left",
368
+ "filtered_bboxes": "bottom_right",
369
+ }
370
+
371
+ def draw_bbox_and_label(bbox, label, color, label_pos, include_label=True):
372
+ """ Helper function to draw the bounding box and add label """
373
+ cv2.rectangle(
374
+ img,
375
+ (int(bbox[0]), int(bbox[1])),
376
+ (int(bbox[2]), int(bbox[3])),
377
+ color,
378
+ 2,
379
+ )
380
+ if include_label:
381
+ cv2.putText(
382
+ img, label, label_pos,
383
+ cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2, cv2.LINE_AA
384
+ )
385
+
386
+ label_pos_dict = {
387
+ "top_left": lambda bbox: (int(bbox[0]), int(bbox[1]) - 10),
388
+ "bottom_right": lambda bbox: (int(bbox[2]) - 150, int(bbox[3]) - 10),
389
+ "top_right": lambda bbox: (int(bbox[2]) - 150, int(bbox[1]) - 10),
390
+ "bottom_left": lambda bbox: (int(bbox[0]), int(bbox[3]) - 10),
391
+ }
392
+
393
+ for key, value in debug_bboxes.items():
394
+ # Unpack bboxes and scores
395
+ if key in ["dino_bboxes", "det_bboxes"]:
396
+ bboxes, scores = value
397
+ else:
398
+ bboxes = value
399
+ scores = [None] * len(bboxes)
400
+
401
+ color = color_dict.get(key, (0, 0, 0))
402
+ label_pos_fn = label_pos_dict[corner_dict.get(key, "top_left")]
403
+
404
+ # Draw each bounding box and its label
405
+ for idx, bbox in enumerate(bboxes):
406
+ score_text = f" {scores[idx]:.3f}" if scores[idx] is not None else ""
407
+ label = key.split("_")[0] + score_text
408
+
409
+ # Draw bounding box and label on the image
410
+ label_pos = label_pos_fn(bbox)
411
+ if key in ["dino_bboxes", "det_bboxes"] or idx == 0:
412
+ draw_bbox_and_label(bbox, label, color, label_pos)
413
+ return img
414
+
415
+
416
+ @staticmethod
417
+ def load_hamer_model(checkpoint_path: str, root_dir: Optional[str] = None) -> Tuple[HAMER, CN]:
418
+ """
419
+ Load the HaMeR model from the checkpoint path.
420
+ """
421
+ model_cfg_path = str(Path(checkpoint_path).parent.parent / "model_config.yaml")
422
+ model_cfg = get_config(model_cfg_path, update_cachedir=True)
423
+ # update model and params path
424
+ if root_dir:
425
+ model_cfg.defrost()
426
+ model_cfg.MANO.DATA_DIR = os.path.join(root_dir, model_cfg.MANO.DATA_DIR)
427
+ model_cfg.MANO.MODEL_PATH = os.path.join(root_dir, model_cfg.MANO.MODEL_PATH.replace("./", ""))
428
+ model_cfg.MANO.MEAN_PARAMS = os.path.join(root_dir, model_cfg.MANO.MEAN_PARAMS.replace("./", ""))
429
+ model_cfg.freeze()
430
+
431
+ # Override some config values, to crop bbox correctly
432
+ if (model_cfg.MODEL.BACKBONE.TYPE == "vit") and ("BBOX_SHAPE" not in model_cfg.MODEL):
433
+ model_cfg.defrost()
434
+ assert (
435
+ model_cfg.MODEL.IMAGE_SIZE == 256
436
+ ), f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 256 for ViT backbone"
437
+ model_cfg.MODEL.BBOX_SHAPE = [192, 256]
438
+ model_cfg.freeze()
439
+
440
+ # Update config to be compatible with demo
441
+ if "PRETRAINED_WEIGHTS" in model_cfg.MODEL.BACKBONE:
442
+ model_cfg.defrost()
443
+ model_cfg.MODEL.BACKBONE.pop("PRETRAINED_WEIGHTS")
444
+ model_cfg.freeze()
445
+
446
+ model = HAMER.load_from_checkpoint(checkpoint_path, strict=False, cfg=model_cfg)
447
+ return model, model_cfg
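
The file above only defines the detector, so here is a minimal, hypothetical usage sketch (not part of the commit). The import path assumes the package layout installed by `phantom/setup.py`; the image path, bounding box, handedness flag, and intrinsics are placeholder values, and a CUDA-capable machine is assumed because the model and batches are moved to the GPU.

```python
# Hypothetical driver for DetectorHamer.detect_hand_keypoints; every input below is a
# made-up placeholder, only the interface comes from detector_hamer.py above.
import cv2
import numpy as np

from phantom.detectors.detector_hamer import DetectorHamer  # assumed package layout

detector = DetectorHamer()  # loads the HaMeR checkpoint and ViTPose weights

img_bgr = cv2.imread("frame_000000.jpg")            # placeholder input frame
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)  # the detector expects RGB

# One hand box [x0, y0, x1, y1] and its handedness flag (1 = right hand), e.g. coming
# from the DINO/Detectron2 detectors mentioned in the class docstring.
bboxes = np.array([[400.0, 300.0, 900.0, 800.0]])
is_right = np.array([1])

result = detector.detect_hand_keypoints(
    img=img_rgb,
    hand_side="right",
    bboxes=bboxes,
    is_right=is_right,
    camera_params={"fx": 900.0, "cx": 960.0, "cy": 540.0},  # placeholder intrinsics
)

if result is not None and result["success"]:
    kpts_3d = result["kpts_3d"]   # (21, 3) keypoints in camera space
    kpts_2d = result["kpts_2d"]   # (21, 2) integer pixel coordinates
    cv2.imwrite("annotated.jpg", result["annotated_img"])  # keypoints drawn (BGR)
```

Note that the returned `kpts_3d` and `verts` already have the predicted camera translation `T_cam_pred` added, so they can be used directly in camera space.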
phantom/phantom/detectors/detector_sam2.py ADDED
@@ -0,0 +1,240 @@
1
+ """
2
+ Wrapper around SAM2 for object segmentation
3
+ """
4
+ import numpy as np
5
+ import pdb
6
+ import os
7
+ import logging
8
+ import requests
9
+ from typing import Tuple, Optional
10
+ from pathlib import Path
11
+ import matplotlib.pyplot as plt
12
+ from matplotlib.axes import Axes
13
+ import cv2
14
+ from PIL import Image
15
+ import torch
16
+ from sam2.build_sam import build_sam2 # type: ignore
17
+ from sam2.sam2_image_predictor import SAM2ImagePredictor # type: ignore
18
+ from sam2.build_sam import build_sam2_video_predictor # type: ignore
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ def download_sam2_ckpt(ckpt_path: str) -> None:
23
+ url = "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt"
24
+ save_path = Path(ckpt_path)
25
+ save_path.parent.mkdir(exist_ok=True, parents=True)
26
+ response = requests.get(url, stream=True)
27
+ if response.status_code == 200:
28
+ with open(save_path, "wb") as file:
29
+ for chunk in response.iter_content(chunk_size=8192):
30
+ file.write(chunk)
31
+ logger.info(f"File downloaded successfully and saved to {save_path}")
32
+ else:
33
+ logger.info(f"Failed to download the file. Status code: {response.status_code}")
34
+
35
+ class DetectorSam2:
36
+ """
37
+ A detector that uses the SAM2 model for object segmentation in images and videos.
38
+ """
39
+ def __init__(self):
40
+ checkpoint = "../submodules/sam2/checkpoints/sam2_hiera_large.pt"
41
+ model_cfg = "sam2_hiera_l.yaml"
42
+
43
+ if not os.path.exists(checkpoint):
44
+ download_sam2_ckpt(checkpoint)
45
+ self.device = "cuda"
46
+
47
+ self.video_predictor = build_sam2_video_predictor(model_cfg, checkpoint, device=self.device)
48
+
49
+ def segment_video(self, video_dir: Path, bbox: np.ndarray, points: np.ndarray,
50
+ indices: np.ndarray, reverse: bool=False, output_bboxes: Optional[np.ndarray]=None):
51
+ """
52
+ Segment an object across video frames using SAM2's video tracking capabilities.
53
+
54
+ Parameters:
55
+ video_dir: Directory containing video frames as image files
56
+ bbox: Bounding box coordinates [x0, y0, x1, y1] for the object to track
57
+ points: Point prompt(s) on the object to track, one array of (x, y) points per prompt frame
58
+ indices: Frame indices at which the point/box prompts are added before propagation
59
+
60
+ Returns:
61
+ video_segments: Dictionary mapping frame indices to segmentation masks
62
+ list_annotated_imgs: Dictionary mapping frame indices to frames with the segmented object blacked out
63
+ """
64
+ frame_names = os.listdir(video_dir)
65
+ frame_names = sorted(frame_names)
66
+ with torch.inference_mode(), torch.autocast(self.device, dtype=torch.bfloat16):
67
+ state = self.video_predictor.init_state(video_path=str(video_dir))
68
+ self.video_predictor.reset_state(state)
69
+
70
+ for point, idx in zip(points, indices):
71
+ try:
72
+ if bbox is None or np.all(bbox == 0):
73
+ self.video_predictor.add_new_points_or_box(
74
+ state,
75
+ frame_idx=int(idx),
76
+ obj_id=0,
77
+ points=np.array(point),
78
+ labels=np.ones(len(point)),
79
+ )
80
+ else:
81
+ self.video_predictor.add_new_points_or_box(
82
+ state,
83
+ frame_idx=int(idx),
84
+ obj_id=0,
85
+ box=np.array(bbox),
86
+ points=np.array(point),
87
+ labels=np.ones(len(point)),
88
+ )
89
+ except Exception as e:
90
+ logger.error("Error in adding new points or box: %s", e)
91
+ pdb.set_trace()
92
+
93
+ video_segments = {}
94
+ for (
95
+ out_frame_idx,
96
+ out_obj_ids,
97
+ out_mask_logits,
98
+ ) in self.video_predictor.propagate_in_video(state, reverse=reverse):
99
+ video_segments[out_frame_idx] = {
100
+ out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
101
+ for i, out_obj_id in enumerate(out_obj_ids)
102
+ }
103
+
104
+ frame_indices = list(video_segments.keys())
105
+ frame_indices.sort()
106
+ list_annotated_imgs = {}
107
+ for out_frame_idx in frame_indices:
108
+ img = Image.open(os.path.join(video_dir, frame_names[out_frame_idx]))
109
+ img_arr = np.array(img)
110
+ mask = video_segments[out_frame_idx][0]
111
+ if output_bboxes is not None:
112
+ # Crop the mask to the bounding box
113
+ output_bbox = output_bboxes[out_frame_idx].astype(np.int32)
114
+ if output_bbox.sum() > 0:
115
+ bbox_mask = np.zeros_like(mask)
116
+ bbox_mask = self._crop_mask_to_bbox(mask, output_bbox)
117
+ mask = mask * bbox_mask
118
+ img_arr[mask[0]] = (0, 0, 0)
119
+ list_annotated_imgs[out_frame_idx] = img_arr
120
+
121
+ if output_bboxes is not None:
122
+ for out_frame_idx in frame_indices:
123
+ output_bbox = output_bboxes[out_frame_idx].astype(np.int32)
124
+ mask = video_segments[out_frame_idx][0]
125
+ mask_ori = mask.copy()
126
+ if output_bbox.sum() > 0:
127
+ bbox_mask = np.zeros_like(mask)
128
+ bbox_mask = self._crop_mask_to_bbox(mask, output_bbox)
129
+ mask = mask * bbox_mask
130
+ video_segments[out_frame_idx] = {
131
+ 0: mask
132
+ }
133
+
134
+ # Fix gpu memory leak
135
+ torch.cuda.empty_cache()
136
+
137
+ return video_segments, list_annotated_imgs
138
+
139
+ def _crop_mask_to_bbox(self, mask: np.ndarray, bbox: np.ndarray) -> np.ndarray:
140
+ """
141
+ Crop a mask to a bounding box.
142
+ """
143
+ margin = 20
144
+ bbox = bbox.astype(np.int32)
145
+ x0, y0, x1, y1 = bbox
146
+ x0 = max(0, x0 - margin)
147
+ x1 = min(mask.shape[2], x1 + margin)
148
+ y0 = max(0, y0 - margin)
149
+ y1 = min(mask.shape[1], y1 + margin)
150
+ bbox_mask = np.zeros_like(mask)
151
+ bbox_mask[:, y0:y1, x0:x1] = 1
152
+ return bbox_mask
153
+
154
+ def segment_video_from_mask(self, video_dir: str, mask: np.ndarray, frame_idx: int, reverse=False):
155
+ """
156
+ Propagate a segmentation mask through video frames (forward or backward).
157
+
158
+ Parameters:
159
+ video_dir: Directory containing video frames
160
+ mask: Initial segmentation mask to propagate
161
+ frame_idx: Frame index where the mask is defined
162
+ reverse: If True, propagate backward in time; if False, propagate forward
163
+
164
+ Returns:
165
+ frame_indices: List of frame indices where masks were generated
166
+ video_segments: Dictionary mapping frame indices to segmentation masks
167
+ """
168
+ with torch.inference_mode(), torch.autocast(self.device, dtype=torch.bfloat16):
169
+ state = self.video_predictor.init_state(video_path=video_dir)
170
+ self.video_predictor.reset_state(state)
171
+
172
+ self.video_predictor.add_new_mask(state, frame_idx, 0, mask)
173
+
174
+ video_segments = {}
175
+ mask_prob = {}
176
+ for (
177
+ out_frame_idx,
178
+ out_obj_ids,
179
+ out_mask_logits,
180
+ ) in self.video_predictor.propagate_in_video(state, reverse=reverse):
181
+ mask_prob[out_frame_idx] = torch.mean(torch.sigmoid(out_mask_logits))
182
+ video_segments[out_frame_idx] = {
183
+ out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
184
+ for i, out_obj_id in enumerate(out_obj_ids)
185
+ }
186
+
187
+ frame_indices = list(video_segments.keys())
188
+ frame_indices.sort()
189
+ return frame_indices, video_segments
190
+
191
+ @staticmethod
192
+ def show_mask(mask: np.ndarray, ax: Axes, random_color: bool=False, borders: bool = True) -> None:
193
+ if random_color:
194
+ color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
195
+ else:
196
+ color = np.array([30/255, 144/255, 255/255, 0.6])
197
+ h, w = mask.shape[-2:]
198
+ mask = mask.astype(np.uint8)
199
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
200
+ if borders:
201
+ contours, _ = cv2.findContours(mask,cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
202
+ # Try to smooth contours
203
+ contours = [cv2.approxPolyDP(contour, epsilon=0.01, closed=True) for contour in contours]
204
+ mask_image = cv2.drawContours(mask_image, contours, -1, (1, 1, 1, 0.5), thickness=2)
205
+ ax.imshow(mask_image)
206
+
207
+
208
+ @staticmethod
209
+ def show_masks(image: np.ndarray, masks: np.ndarray, scores: np.ndarray, point_coords: Optional[np.ndarray]=None,
210
+ box_coords: Optional[np.ndarray]=None, input_labels: Optional[np.ndarray]=None, borders: bool=True) -> None:
211
+ n_masks = len(masks)
212
+ fig, axs = plt.subplots(1, n_masks, figsize=(10*n_masks, 10))
213
+ for i, (mask, score) in enumerate(zip(masks, scores)):
214
+ axs[i].imshow(image)
215
+ DetectorSam2.show_mask(mask, axs[i], borders=borders)
216
+ if point_coords is not None:
217
+ assert input_labels is not None
218
+ DetectorSam2.show_points(point_coords, input_labels, axs[i])
219
+ if box_coords is not None:
220
+ DetectorSam2.show_box(box_coords, axs[i])
221
+ if len(scores) > 1:
222
+ axs[i].set_title(f"Mask {i+1}, Score: {score:.3f}", fontsize=18)
223
+ axs[i].axis('off')
224
+ plt.show()
225
+
226
+ @staticmethod
227
+ def show_box(box: np.ndarray, ax: Axes) -> None:
228
+ x0, y0 = box[0], box[1]
229
+ w, h = box[2] - box[0], box[3] - box[1]
230
+ ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0, 0, 0, 0), lw=2))
231
+
232
+
233
+ @staticmethod
234
+ def show_points(coords: np.ndarray, labels: np.ndarray, ax: Axes, marker_size: int=375) -> None:
235
+ pos_points = coords[labels==1]
236
+ neg_points = coords[labels==0]
237
+ ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*',
238
+ s=marker_size, edgecolor='white', linewidth=1.25)
239
+ ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*',
240
+ s=marker_size, edgecolor='white', linewidth=1.25)
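
As with the HaMeR wrapper, here is a short hedged sketch of driving `DetectorSam2.segment_video` (not from the commit). The frame directory, box, and click point are placeholders, and a GPU is assumed since the predictor is built with `device="cuda"`.

```python
# Hypothetical driver for DetectorSam2.segment_video; paths and prompts are placeholders.
from pathlib import Path
import numpy as np

from phantom.detectors.detector_sam2 import DetectorSam2  # assumed package layout

detector = DetectorSam2()  # downloads sam2_hiera_large.pt on first use if it is missing

video_dir = Path("data/demo_frames")     # directory of extracted video frames
bbox = np.array([320, 180, 960, 720])    # [x0, y0, x1, y1] around the target object
points = [np.array([[640.0, 450.0]])]    # one array of positive (x, y) clicks per prompt frame
indices = np.array([0])                  # the prompt above is attached to frame 0

video_segments, annotated = detector.segment_video(video_dir, bbox, points, indices)

# video_segments maps frame index -> {object id: boolean mask}; object id 0 is used above.
mask_frame0 = video_segments[0][0]
print(mask_frame0.shape, annotated[0].shape)
```

Passing `output_bboxes` (one box per frame) additionally crops each propagated mask to that frame's box plus a 20-pixel margin, as implemented in `_crop_mask_to_bbox`.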
phantom/phantom/hand.py ADDED
@@ -0,0 +1,805 @@
1
+ """
2
+ Hand Model Module
3
+
4
+ This module provides hand modeling for action processors. It converts detected hand
5
+ keypoints into kinematic models that can be used for robot control.
6
+
7
+ Key Components:
8
+ - HandModel: Base class for unconstrained hand kinematic modeling
9
+ - PhysicallyConstrainedHandModel: Extended class with anatomical joint limits and velocity constraints
10
+ - Grasp point and orientation calculation for robot end-effector control
11
+
12
+ The hand model follows the MediaPipe hand landmark convention with 21 keypoints:
13
+ - Wrist (1 point)
14
+ - Thumb (4 points: MCP, PIP, DIP, TIP)
15
+ - Index finger (4 points: MCP, PIP, DIP, TIP)
16
+ - Middle finger (4 points: MCP, PIP, DIP, TIP)
17
+ - Ring finger (4 points: MCP, PIP, DIP, TIP)
18
+ - Pinky finger (4 points: MCP, PIP, DIP, TIP)
19
+
20
+ Coordinate System:
21
+ - All calculations performed in robot coordinate frame
22
+ - Grasp orientations aligned with robot end-effector conventions
23
+ - Joint rotations represented as rotation matrices and Euler angles
24
+ """
25
+
26
+ from typing import Optional, List, Dict, Tuple, Union, Any
27
+ import numpy as np
28
+ import pdb
29
+ import torch
30
+ from scipy.spatial.transform import Rotation
31
+ import logging
32
+
33
+ from phantom.utils.transform_utils import *
34
+ logger = logging.getLogger(__name__)
35
+
36
+ class HandModel:
37
+ """
38
+ Base class for hand kinematic modeling and trajectory analysis.
39
+
40
+ This class provides a kinematic representation of a human hand using 21 keypoints
41
+ from hand pose estimation. It calculates joint rotations, tracks hand motion over
42
+ time, and computes grasp points and orientations suitable for robot control.
43
+
44
+ The kinematic structure follows a tree topology with the wrist as the root,
45
+ and each finger as a separate chain. Joint rotations are calculated relative
46
+ to parent joints using vector alignment methods.
47
+
48
+ Key Features:
49
+ - 21-point hand keypoint processing
50
+ - Joint rotation calculation using vector alignment
51
+ - Grasp point computation from thumb-index / thumb-middle finger positioning
52
+ - End-effector orientation calculation for robot control
53
+
54
+ Attributes:
55
+ robot_name (str): Name of the target robot for coordinate frame alignment
56
+ kinematic_tree (List[Tuple[int, int]]): Parent-child relationships for hand joints
57
+ joint_to_neighbors_mapping (Dict[int, Tuple[int, int, int]]): Mapping of joints to their neighbors
58
+ vertex_positions (List[np.ndarray]): Time series of hand keypoint positions
59
+ joint_rotations (List[List[np.ndarray]]): Time series of joint rotation matrices
60
+ grasp_points (List[np.ndarray]): Time series of computed grasp points
61
+ grasp_oris (List[np.ndarray]): Time series of grasp orientation matrices
62
+ timestamps (List[float]): Time stamps for each frame
63
+ num_joints (int): Total number of joints in the hand model
64
+ joint_rotations_xyz (List[List[np.ndarray]]): Time series of Euler angle representations
65
+ """
66
+ def __init__(self, robot_name: str) -> None:
67
+ """
68
+ Initialize the hand model with kinematic structure.
69
+
70
+ Args:
71
+ robot_name: Name of the target robot for coordinate alignment
72
+ """
73
+ self.robot_name: str = robot_name
74
+
75
+ # Define the kinematic tree structure for hand joints
76
+ # Format: (joint_index, parent_index) where -1 indicates root (wrist)
77
+ self.kinematic_tree: List[Tuple[int, int]] = [
78
+ (0, -1), # wrist base (root of the kinematic tree)
79
+
80
+ # Thumb chain (4 joints)
81
+ (1, 0), # thumb mcp
82
+ (2, 1), # thumb pip
83
+ (3, 2), # thumb dip
84
+ (4, 3), # thumb tip
85
+
86
+ # Index finger chain (4 joints)
87
+ (5, 0), # index mcp
88
+ (6, 5), # index pip
89
+ (7, 6), # index dip
90
+ (8, 7), # index tip
91
+
92
+ # Middle finger chain (4 joints)
93
+ (9, 0), # middle mcp
94
+ (10, 9), # middle pip
95
+ (11, 10), # middle dip
96
+ (12, 11), # middle tip
97
+
98
+ # Ring finger chain (4 joints)
99
+ (13, 0), # ring mcp
100
+ (14, 13), # ring pip
101
+ (15, 14), # ring dip
102
+ (16, 15), # ring tip
103
+
104
+ # Pinky finger chain (4 joints)
105
+ (17, 0), # pinky mcp
106
+ (18, 17), # pinky pip
107
+ (19, 18), # pinky dip
108
+ (20, 19), # pinky tip
109
+ ]
110
+
111
+ # Mapping from joint index to (current_vertex, child_vertex, parent_vertex)
112
+ # This defines the local coordinate system for each joint rotation calculation
113
+ self.joint_to_neighbors_mapping: Dict[int, Tuple[int, int, int]] = {
114
+ # Thumb joint mappings
115
+ 0: (0, 1, -1), # wrist to thumb mcp (no parent)
116
+ 1: (1, 2, 0), # thumb mcp to pip (parent: wrist)
117
+ 2: (2, 3, 1), # thumb pip to dip (parent: thumb mcp)
118
+ 3: (3, 4, 2), # thumb dip to tip (parent: thumb pip)
119
+
120
+ # Index finger joint mappings
121
+ 4: (0, 5, -1), # wrist to index mcp (no parent)
122
+ 5: (5, 6, 0), # index mcp to pip (parent: wrist)
123
+ 6: (6, 7, 5), # index pip to dip (parent: index mcp)
124
+ 7: (7, 8, 6), # index dip to tip (parent: index pip)
125
+
126
+ # Middle finger joint mappings
127
+ 8: (0, 9, -1), # wrist to middle mcp (no parent)
128
+ 9: (9, 10, 0), # middle mcp to pip (parent: wrist)
129
+ 10: (10, 11, 9), # middle pip to dip (parent: middle mcp)
130
+ 11: (11, 12, 10),# middle dip to tip (parent: middle pip)
131
+
132
+ # Ring finger joint mappings
133
+ 12: (0, 13, -1), # wrist to ring mcp (no parent)
134
+ 13: (13, 14, 0),# ring mcp to pip (parent: wrist)
135
+ 14: (14, 15, 13),# ring pip to dip (parent: ring mcp)
136
+ 15: (15, 16, 14),# ring dip to tip (parent: ring pip)
137
+
138
+ # Pinky finger joint mappings
139
+ 16: (0, 17, -1), # wrist to pinky mcp (no parent)
140
+ 17: (17, 18, 0),# pinky mcp to pip (parent: wrist)
141
+ 18: (18, 19, 17),# pinky pip to dip (parent: pinky mcp)
142
+ 19: (19, 20, 18),# pinky dip to tip (parent: pinky pip)
143
+ }
144
+
145
+ self.num_joints: int = len(self.joint_to_neighbors_mapping)
146
+
147
+ # Time series data storage
148
+ self.vertex_positions: List[np.ndarray] = [] # List of (21, 3) arrays for each timestep
149
+ self.joint_rotations: List[List[np.ndarray]] = [] # List of rotation matrices for each joint
150
+ self.joint_rotations_xyz: List[List[np.ndarray]] = [] # List of Euler angle representations
151
+ self.grasp_points: List[np.ndarray] = [] # List of computed grasp points (3D positions)
152
+ self.grasp_oris: List[np.ndarray] = [] # List of grasp orientation matrices (3x3)
153
+ self.timestamps: List[float] = [] # List of timestamps for temporal analysis
154
+
155
+ def calculate_joint_rotation(self, current_pos: np.ndarray, child_pos: np.ndarray, parent_pos: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
156
+ """
157
+ Calculate the rotation matrix for a single joint using vector alignment.
158
+
159
+ This method computes the rotation that aligns the previous direction vector
160
+ with the current direction vector. For root joints (no parent), it uses
161
+ a default upward direction as the reference.
162
+
163
+ Args:
164
+ current_pos: 3D position of the current joint
165
+ child_pos: 3D position of the child joint
166
+ parent_pos: 3D position of the parent joint
167
+
168
+ Returns:
169
+ Tuple containing:
170
+ - rotation_matrix: 3x3 rotation matrix
171
+ - euler_angles: Rotation as XYZ Euler angles
172
+ """
173
+ # Calculate current direction vector (current -> child)
174
+ current_dir = child_pos - current_pos
175
+ current_norm = np.linalg.norm(current_dir)
176
+ if current_norm < 1e-10:
177
+ return np.eye(3), np.array([0,0,0])
178
+ current_dir /= current_norm
179
+
180
+ # Calculate previous direction vector (parent -> current, or default up)
181
+ prev_dir = np.array([0.0, 0.0, 1.0]) if parent_pos is None else current_pos - parent_pos
182
+ prev_norm = np.linalg.norm(prev_dir)
183
+ if prev_norm < 1e-10:
184
+ return np.eye(3), np.array([0,0,0])
185
+ prev_dir /= prev_norm
186
+
187
+ # Check if vectors are already aligned (no rotation needed)
188
+ if np.abs((np.abs(np.dot(current_dir, prev_dir)) - 1)) < 1e-8:
189
+ return np.eye(3), np.array([0,0,0])
190
+
191
+ # Calculate rotation that aligns prev_dir with current_dir
192
+ rotation, _ = Rotation.align_vectors([current_dir], [prev_dir])
193
+ return rotation.as_matrix(), rotation.as_euler('xyz')
194
+
195
+ def calculate_frame_rotations(self, vertices: np.ndarray) -> Tuple[List[np.ndarray], List[np.ndarray]]:
196
+ """
197
+ Calculate rotation matrices for all joints in a single frame.
198
+
199
+ This method processes all joints in the hand and computes their rotations
200
+ based on the kinematic structure and current vertex positions.
201
+
202
+ Args:
203
+ vertices: Hand keypoints, shape (21, 3)
204
+
205
+ Returns:
206
+ Tuple containing:
207
+ - rotation_matrices: List of 3x3 rotation matrices
208
+ - euler_angles: List of XYZ Euler angle arrays
209
+ """
210
+ rotations, rotations_xyz = zip(*[
211
+ self.calculate_joint_rotation(vertices[m[0]], vertices[m[1]],
212
+ None if m[2] == -1 else vertices[m[2]])
213
+ for m in self.joint_to_neighbors_mapping.values()
214
+ ])
215
+ return list(rotations), list(rotations_xyz)
216
+
217
+ def calculate_angular_velocity(self, joint_idx: int, t1: int, t2: int) -> np.ndarray:
218
+ """
219
+ Calculate angular velocity for a specific joint between two time frames.
220
+
221
+ Angular velocity is computed as the rotation vector difference divided
222
+ by the time difference between frames.
223
+
224
+ Args:
225
+ joint_idx: Index of the joint
226
+ t1: Index of the first time frame
227
+ t2: Index of the second time frame
228
+
229
+ Returns:
230
+ Angular velocity vector (3,) in rad/s
231
+ """
232
+ dt = self.timestamps[t2] - self.timestamps[t1]
233
+ if dt == 0:
234
+ return np.zeros(3)
235
+
236
+ # Get rotation matrices for the two time frames
237
+ R1, R2 = self.joint_rotations[t1][joint_idx], self.joint_rotations[t2][joint_idx]
238
+
239
+ # Calculate relative rotation and convert to angular velocity
240
+ R_relative = Rotation.from_matrix(R2) * Rotation.from_matrix(R1).inv()
241
+ return R_relative.as_rotvec() / dt
242
+
243
+ def calculate_frame_angular_velocities(self, current_frame_idx: int) -> np.ndarray:
244
+ """
245
+ Calculate angular velocities for all joints at the current frame.
246
+
247
+ This method computes the angular velocity vectors for all joints by
248
+ comparing rotations with the previous frame. Returns zeros for the
249
+ first frame since no previous frame exists.
250
+
251
+ Args:
252
+ current_frame_idx: Index of the current frame (zeros are returned for frame 0)
253
+
254
+ Returns:
255
+ Array of angular velocity vectors (shape: num_joints x 3)
256
+ Each row contains [wx, wy, wz] for one joint.
257
+ Returns zeros if current_frame_idx == 0.
258
+ """
259
+ if current_frame_idx == 0:
260
+ return np.zeros((self.num_joints, 3))
261
+
262
+ prev_frame_idx = current_frame_idx - 1
263
+
264
+ # Compute each joint's angular velocity relative to the previous frame
265
+ velocities = np.array([
266
+ self.calculate_angular_velocity(joint_idx, prev_frame_idx, current_frame_idx)
267
+ for joint_idx in range(self.num_joints)
268
+ ])
269
+
270
+ return velocities
271
+
272
+ def calculate_grasp_plane(self, vertices: np.ndarray) -> np.ndarray:
273
+ """
274
+ Calculate the plane that best fits through a set of hand vertices.
275
+
276
+ This method uses Singular Value Decomposition (SVD) to find the plane.
277
+ The plane is typically fitted through thumb and index finger points.
278
+
279
+ Args:
280
+ vertices: Set of 3D points to fit plane through, shape (N, 3)
281
+
282
+ Returns:
283
+ Plane coefficients [a, b, c, d] for ax + by + cz + d = 0
284
+ """
285
+ # Create augmented matrix with homogeneous coordinates for plane fitting
286
+ A = np.c_[vertices[:, 0], vertices[:, 1], vertices[:, 2], np.ones(vertices.shape[0])]
287
+
288
+ # Right-hand side is zeros for the plane equation ax + by + cz + d = 0
289
+ b = np.zeros(vertices.shape[0])
290
+
291
+ # Use SVD to solve the least squares problem
292
+ U, S, Vt = np.linalg.svd(A)
293
+
294
+ # Plane coefficients are in the last row of Vt (smallest singular value)
295
+ plane_coeffs = Vt[-1, :]
296
+
297
+ # Normalize coefficients for easier interpretation (unit normal vector)
298
+ plane_coeffs = plane_coeffs / np.linalg.norm(plane_coeffs[:3])
299
+
300
+ return plane_coeffs # [a, b, c, d]
301
+
302
+ def calculate_grasp_point(self, grasp_plane: np.ndarray, vertices: np.ndarray) -> np.ndarray:
303
+ """
304
+ Calculate the optimal grasp point for robot end-effector positioning.
305
+
306
+ The grasp point is computed as the midpoint between projected thumb tip
307
+ and index finger tip on the grasp plane. This provides a stable reference
308
+ point for robot grasping operations.
309
+
310
+ Args:
311
+ grasp_plane: Plane coefficients [a, b, c, d]
312
+ vertices: Hand keypoints, shape (21, 3)
313
+
314
+ Returns:
315
+ 3D grasp point coordinates
316
+ """
317
+ # Project fingertips onto the grasp plane
318
+ thumb_pt = project_point_to_plane(vertices[4], grasp_plane)
319
+ index_pt = project_point_to_plane(vertices[8], grasp_plane)
320
+
321
+ # Compute midpoint as the grasp reference
322
+ hand_ee_pt = np.mean([thumb_pt, index_pt], axis=0)
323
+ return hand_ee_pt
324
+
325
+ def add_frame(self, vertices: np.ndarray, timestamp: float, hand_detected: bool = True) -> None:
326
+ """
327
+ Add a new frame of vertex positions and calculate corresponding data.
328
+
329
+ This is the main method for processing hand data over time. It computes
330
+ grasp points, orientations, and stores all relevant information for
331
+ the current timestep.
332
+
333
+ Args:
334
+ vertices: Array of 21 3D vertex positions
335
+ timestamp: Time of the frame in seconds
336
+ hand_detected: Whether hand was successfully detected
337
+ """
338
+ if len(vertices) != 21:
339
+ raise ValueError(f"Expected 21 vertices, got {len(vertices)}")
340
+
341
+ # Handle frames without hand detection
342
+ if not hand_detected:
343
+ self.vertex_positions.append(np.zeros((21, 3)))
344
+ self.grasp_points.append(np.zeros(3))
345
+ self.grasp_oris.append(np.eye(3))
346
+ self.timestamps.append(timestamp)
347
+ return
348
+
349
+ # Extract key finger tip positions
350
+ thumb_tip = vertices[4]
351
+ index_tip = vertices[8]
352
+ middle_tip = vertices[12]
353
+
354
+ # Calculate grasp point as midpoint between thumb and middle finger tips
355
+ control_point = (thumb_tip + middle_tip) / 2
356
+ grasp_pt = control_point
357
+
358
+ # Calculate gripper orientation from thumb-index finger configuration
359
+ gripper_ori, _ = HandModel.get_gripper_orientation(thumb_tip, index_tip, vertices)
360
+
361
+ # Apply 90-degree rotation to align with robot gripper convention
362
+ rot_90_deg = Rotation.from_euler('Z', 90, degrees=True).as_matrix()
363
+ grasp_ori = gripper_ori @ rot_90_deg
364
+
365
+ # Store all frame data
366
+ self.vertex_positions.append(vertices)
367
+ self.grasp_points.append(grasp_pt)
368
+ self.grasp_oris.append(grasp_ori)
369
+ self.timestamps.append(timestamp)
370
+
371
+
372
+ def get_joint_data(self, joint_idx: int) -> Dict[str, Union[List[float], List[np.ndarray]]]:
373
+ """
374
+ Get all trajectory data for a specific joint across all frames.
375
+
376
+ Args:
377
+ joint_idx: Index of the joint
378
+
379
+ Returns:
380
+ Dictionary containing joint trajectory data with keys:
381
+ - 'timestamps': List of time stamps
382
+ - 'rotations': List of rotation matrices for this joint
383
+ """
384
+ return {
385
+ 'timestamps': self.timestamps,
386
+ 'rotations': [frame[joint_idx] for frame in self.joint_rotations],
387
+ }
388
+
389
+ @staticmethod
390
+ def get_parallel_plane(a: float, b: float, c: float, d: float, dist: float) -> Tuple[float, float, float, float]:
391
+ """
392
+ Calculate coefficients of a plane parallel to the given plane at specified distance.
393
+
394
+ This utility method is useful for creating offset grasp planes that account
395
+ for gripper thickness or provide clearance during grasping operations.
396
+
397
+ Parameters:
398
+ a, b, c, d: Coefficients of the original plane ax + by + cz + d = 0
399
+ dist: Distance between planes (positive moves in normal direction)
400
+
401
+ Returns:
402
+ (a, b, c, d_new) coefficients of the parallel plane
403
+ """
404
+ # Calculate the magnitude of the normal vector
405
+ normal_magnitude = np.sqrt(a**2 + b**2 + c**2)
406
+
407
+ # Parallel plane has same normal direction, only d changes
408
+ d_new = d - dist * normal_magnitude
409
+
410
+ return (a, b, c, d_new)
411
+
412
+ @staticmethod
413
+ def get_gripper_orientation(thumb_tip: np.ndarray, index_tip: np.ndarray, vertices: np.ndarray, grasp_plane: Optional[np.ndarray] = None) -> Tuple[np.ndarray, np.ndarray]:
414
+ """
415
+ Compute robot gripper orientation matrix from hand keypoints and fingertip positions.
416
+
417
+ This method calculates a coordinate frame suitable for robot gripper control
418
+ based on the relative positions of thumb, index finger, and wrist. The resulting
419
+ orientation matrix can be directly used for robot end-effector control.
420
+
421
+ Args:
422
+ thumb_tip: 3D position of thumb tip
423
+ index_tip: 3D position of index finger tip
424
+ vertices: All hand keypoints, shape (21, 3)
425
+ grasp_plane: Plane coefficients [a,b,c,d]
426
+
427
+ Returns:
428
+ Tuple containing:
429
+ - gripper_orientation: 3x3 rotation matrix
430
+ - z_axis: Z-axis direction vector of the gripper frame
431
+ """
432
+ # Calculate gripper opening direction (thumb to index finger)
433
+ gripper_direction = thumb_tip - index_tip
434
+
435
+ # Calculate gripper reference point (midpoint of fingertips)
436
+ midpoint = (thumb_tip + index_tip) / 2
437
+
438
+ if grasp_plane is None:
439
+ # Use palm geometry when no plane is provided
440
+ palm_axis = vertices[5] - midpoint # index MCP to midpoint
441
+ x_axis = gripper_direction / max(np.linalg.norm(gripper_direction), 1e-10)
442
+ z_axis = -palm_axis / max(np.linalg.norm(palm_axis), 1e-10)
443
+ else:
444
+ # Use grasp plane for orientation calculation
445
+ palm_axis = project_point_to_plane(vertices[0], grasp_plane) - project_point_to_plane(vertices[1], grasp_plane)
446
+ z_axis = -palm_axis / max(np.linalg.norm(palm_axis), 1e-10)
447
+ x_axis = np.cross(grasp_plane[:3], z_axis)
448
+ x_axis /= max(np.linalg.norm(x_axis), 1e-10)
449
+
450
+ # Compute y-axis
451
+ y_axis = np.cross(z_axis, x_axis)
452
+ y_axis /= max(np.linalg.norm(y_axis), 1e-10)
453
+
454
+ # Ensure orthogonality by recalculating z_axis
455
+ z_axis = np.cross(x_axis, y_axis)
456
+ z_axis /= max(np.linalg.norm(z_axis), 1e-10)
457
+
458
+ # Check orientation consistency with palm direction
459
+ if isinstance(palm_axis, torch.Tensor):
460
+ palm_axis = palm_axis.cpu().numpy()
461
+ if z_axis @ palm_axis > 0:
462
+ x_axis, y_axis, z_axis = -x_axis, -y_axis, -z_axis
463
+
464
+ # Construct orientation matrix
465
+ gripper_ori = np.column_stack([x_axis, y_axis, z_axis])
466
+
467
+ # Ensure proper handedness (right-handed coordinate system)
468
+ if np.linalg.det(gripper_ori) < 0:
469
+ x_axis = -x_axis # Flip one axis to fix handedness
470
+ gripper_ori = np.column_stack([x_axis, y_axis, z_axis])
471
+
472
+ # Verify determinant for debugging
473
+ det = np.linalg.det(gripper_ori)
474
+ if det < 0.9:
475
+ pdb.set_trace()
476
+
477
+ return gripper_ori, z_axis
478
+
479
+
480
+ class PhysicallyConstrainedHandModel(HandModel):
481
+ """
482
+ Extended hand model with physical constraints and realistic joint limits.
483
+
484
+ This class builds upon the base HandModel by adding realistic constraints
485
+ that enforce physically plausible hand poses and motion. It includes:
486
+ - Joint angle limits based on human hand anatomy
487
+ - Angular velocity constraints for smooth motion
488
+ - Pose reconstruction with constraint enforcement
489
+ - Enhanced grasp point calculation with plane-based refinement
490
+
491
+ The constrained hand model is the variant used in Phantom.
492
+
493
+ Key Constraints:
494
+ - Anatomically correct joint limits for each finger joint
495
+ - Velocity limiting to prevent jerky motions
496
+ - Iterative pose refinement with constraint satisfaction
497
+ - More robust grasp plane calculation and orientation alignment
498
+
499
+ Attributes:
500
+ joint_limits (Dict[int, Tuple[float, ...]]): Joint angle limits for each joint in radians
501
+ max_angular_velocity (float): Maximum allowed angular velocity in rad/s
502
+ """
503
+ def __init__(self, robot_name: str) -> None:
504
+ """
505
+ Initialize the physically constrained hand model.
506
+
507
+ Args:
508
+ robot_name: Name of the target robot for coordinate alignment
509
+ """
510
+ super().__init__(robot_name)
511
+
512
+ # Define joint rotation limits (in radians) for each joint
513
+ # Format: (min_x, max_x, min_y, max_y, min_z, max_z) for XYZ Euler angles
514
+ small_angle = np.pi/40 # Small constraint for fine motor control
515
+
516
+ self.joint_limits: Dict[int, Tuple[float, float, float, float, float, float]] = {
517
+ # Thumb joints - more flexible due to opposable nature
518
+ 0: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # wrist to thumb mcp
519
+ 1: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # thumb mcp to pip
520
+ 2: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # thumb pip to dip
521
+ 3: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # thumb dip to tip
522
+
523
+ # Index finger joints - moderate constraints
524
+ 4: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # wrist to index mcp
525
+ 5: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # index mcp to pip
526
+ 6: (-small_angle, small_angle, -np.pi/8, np.pi/8, -small_angle, small_angle), # index pip to dip
527
+ 7: (-small_angle, small_angle, -np.pi/8, np.pi/8, -small_angle, small_angle), # index dip to tip
528
+
529
+ # Middle finger joints - tighter constraints for stability
530
+ 8: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # wrist to middle mcp
531
+ 9: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # middle mcp to pip
532
+ 10: (-np.pi, np.pi, -np.pi, np.pi, -np.pi/4, np.pi/4), # middle pip to dip
533
+ 11: (-np.pi, np.pi, -np.pi, np.pi, -np.pi/4, np.pi/4), # middle dip to tip
534
+
535
+ # Ring finger joints - similar to middle finger
536
+ 12: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # wrist to ring mcp
537
+ 13: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # ring mcp to pip
538
+ 14: (-np.pi, np.pi, -np.pi, np.pi, -np.pi/4, np.pi/4), # ring pip to dip
539
+ 15: (-np.pi, np.pi, -np.pi, np.pi, -np.pi/4, np.pi/4), # ring dip to tip
540
+
541
+ # Pinky finger joints - most constrained due to size
542
+ 16: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # wrist to pinky mcp
543
+ 17: (-np.pi, np.pi, -np.pi, np.pi, -np.pi, np.pi), # pinky mcp to pip
544
+ 18: (-np.pi, np.pi, -np.pi, np.pi, -np.pi/4, np.pi/4), # pinky pip to dip
545
+ 19: (-np.pi, np.pi, -np.pi, np.pi, -np.pi/4, np.pi/4), # pinky dip to tip
546
+ }
547
+
548
+ # Maximum angular velocity constraint (2π rad/s = 360°/s)
549
+ self.max_angular_velocity: float = np.pi * 2
550
+
551
+ def reconstruct_vertices(self, input_vertices: np.ndarray, rotations: List[np.ndarray]) -> np.ndarray:
552
+ """
553
+ Reconstruct vertex positions from base vertex and constrained rotations.
554
+
555
+ This method applies the kinematic chain to reconstruct hand vertex positions
556
+ while respecting the calculated bone lengths from the input vertices.
557
+ This ensures consistent hand proportions while applying constraints.
558
+
559
+ Args:
560
+ input_vertices: Original vertex positions, shape (21, 3)
561
+ rotations: List of constrained rotation matrices
562
+
563
+ Returns:
564
+ Reconstructed vertex positions, shape (21, 3)
565
+ """
566
+ vertices = np.zeros((21, 3))
567
+ vertices[0] = input_vertices[0] # Wrist position remains fixed
568
+
569
+ # Calculate bone lengths from original vertices to maintain proportions
570
+ bone_lengths: Dict[Tuple[int, int], float] = {}
571
+ min_bone_length = 1e-6 # Minimum length to avoid numerical issues
572
+
573
+ # Extract bone lengths from the kinematic chain
574
+ for current in range(self.num_joints):
575
+ mapping = self.joint_to_neighbors_mapping[current]
576
+ current_vertex = mapping[0]
577
+ child_vertex = mapping[1]
578
+ parent_vertex = mapping[2]
579
+
580
+ # Calculate bone length for current->child connection
581
+ if child_vertex != -2:
582
+ length = np.linalg.norm(input_vertices[child_vertex] - input_vertices[current_vertex])
583
+ bone_lengths[(current_vertex, child_vertex)] = max(length, min_bone_length)
584
+
585
+ # Reconstruct positions following the kinematic chain
586
+ for current in range(self.num_joints):
587
+ mapping = self.joint_to_neighbors_mapping[current]
588
+ current_vertex = mapping[0]
589
+ child_vertex = mapping[1]
590
+ parent_vertex = mapping[2]
591
+
592
+ if child_vertex == -2:
593
+ continue
594
+
595
+ # Get positions and rotation for this joint
596
+ parent_pos = vertices[parent_vertex]
597
+ current_pos = vertices[current_vertex]
598
+ rotation = rotations[current]
599
+
600
+ # Determine reference direction for rotation application
601
+ if parent_vertex == -1:
602
+ # Root joints use upward direction as reference
603
+ prev_dir = np.array([0, 0, 1])
604
+ else:
605
+ # Use direction from parent to current vertex
606
+ prev_dir = vertices[current_vertex] - vertices[parent_vertex]
607
+ prev_dir = prev_dir / np.linalg.norm(prev_dir)
608
+
609
+ # Apply rotation to get new direction
610
+ current_dir = rotation @ prev_dir
611
+
612
+ # Position child vertex using calculated bone length
613
+ bone_length = bone_lengths[(current_vertex, child_vertex)]
614
+ vertices[child_vertex] = current_pos + current_dir * bone_length
615
+
616
+ return vertices
617
+
618
+ def constrain_rotation(self, rotation_matrix: np.ndarray, joint_idx: int) -> np.ndarray:
619
+ """
620
+ Apply joint angle constraints to a rotation matrix.
621
+
622
+ This method converts the rotation to Euler angles, clips them to the
623
+ joint limits, and converts back to a rotation matrix. This ensures
624
+ all joint angles remain within anatomically realistic ranges.
625
+
626
+ Args:
627
+ rotation_matrix: 3x3 rotation matrix to constrain
628
+ joint_idx: Index of the joint for limit lookup
629
+
630
+ Returns:
631
+ Constrained 3x3 rotation matrix
632
+ """
633
+ try:
634
+ # Convert rotation matrix to Euler angles
635
+ rot = Rotation.from_matrix(rotation_matrix)
636
+ euler = rot.as_euler('xyz')
637
+
638
+ # Get joint limits for this joint
639
+ limits = self.joint_limits[joint_idx]
640
+
641
+ # Clip Euler angles to the specified limits
642
+ constrained_euler = np.clip(euler,
643
+ [limits[0], limits[2], limits[4]], # min limits
644
+ [limits[1], limits[3], limits[5]]) # max limits
645
+
646
+ # Convert back to rotation matrix if any clipping occurred
647
+ if not np.allclose(euler, constrained_euler):
648
+ return Rotation.from_euler('xyz', constrained_euler).as_matrix()
649
+ return rotation_matrix
650
+
651
+ except ValueError:
652
+ logger.error("Error constraining rotation")
653
+ # Return identity matrix if rotation is invalid
654
+ return np.eye(3)
655
+
656
+ def constrain_velocity(self, velocity: np.ndarray) -> np.ndarray:
657
+ """
658
+ Apply angular velocity constraints to limit motion speed.
659
+
660
+ This method ensures that joint angular velocities don't exceed the
661
+ maximum allowed velocity, preventing jerky or unrealistic motions.
662
+
663
+ Args:
664
+ velocity: Angular velocity vector to constrain
665
+
666
+ Returns:
667
+ Constrained angular velocity vector
668
+ """
669
+ velocity_magnitude = np.linalg.norm(velocity)
670
+ if velocity_magnitude > self.max_angular_velocity:
671
+ # Scale velocity to maximum while preserving direction
672
+ return velocity * (self.max_angular_velocity / velocity_magnitude)
673
+ return velocity
674
+
675
+ def add_frame(self, vertices: np.ndarray, timestamp: float, finger_pts: Any) -> None:
676
+ """
677
+ Add a new frame with physical constraints applied.
678
+
679
+ This method extends the base add_frame functionality by applying
680
+ joint limits, velocity constraints, and enhanced grasp calculations.
681
+ The result is a more realistic and stable hand model suitable for
682
+ robot control applications.
683
+
684
+ Args:
685
+ vertices: Hand keypoints, shape (21, 3)
686
+ timestamp: Time of the frame in seconds
687
+ finger_pts: Additional finger point data (currently unused)
688
+ """
689
+ # Calculate initial rotations from raw vertex positions
690
+ rotations, rotations_xyz = self.calculate_frame_rotations(vertices)
691
+
692
+ # Apply joint angle constraints to all rotations
693
+ constrained_rotations: List[np.ndarray] = []
694
+ for joint_idx, rotation in enumerate(rotations):
695
+ constrained_rot = self.constrain_rotation(rotation, joint_idx)
696
+ constrained_rotations.append(constrained_rot)
697
+
698
+ # Apply velocity constraints if this is not the first frame
699
+ if len(self.timestamps) > 0:
700
+ dt = timestamp - self.timestamps[-1]
701
+ for joint_idx in range(self.num_joints):
702
+ # Calculate angular velocity for this joint
703
+ prev_rot = Rotation.from_matrix(self.joint_rotations[-1][joint_idx])
704
+ curr_rot = Rotation.from_matrix(constrained_rotations[joint_idx])
705
+ rel_rot = curr_rot * prev_rot.inv()
706
+ velocity = rel_rot.as_rotvec() / dt
707
+
708
+ # Apply velocity constraint if needed
709
+ if np.linalg.norm(velocity) > self.max_angular_velocity:
710
+ # Constrain velocity and reconstruct rotation
711
+ constrained_velocity = self.constrain_velocity(velocity)
712
+ delta_rot = Rotation.from_rotvec(constrained_velocity * dt)
713
+ new_rot = delta_rot * prev_rot
714
+ constrained_rotations[joint_idx] = new_rot.as_matrix()
715
+
716
+ # Reconstruct vertices with constrained rotations
717
+ constrained_vertices = self.reconstruct_vertices(vertices, constrained_rotations)
718
+
719
+ # Extract key points for grasp calculation
720
+ thumb_tip = constrained_vertices[4]
721
+ index_tip = constrained_vertices[8]
722
+
723
+ # Calculate grasp plane using thumb and index finger regions
724
+ grasp_plane = self.calculate_grasp_plane(constrained_vertices[3:9])
725
+
726
+ # Organize fingers for direction analysis
727
+ n_fingers = len(constrained_vertices) - 1
728
+ npts_per_finger = 4
729
+ list_fingers = [np.vstack([constrained_vertices[0], constrained_vertices[i:i + npts_per_finger]])
730
+ for i in range(1, n_fingers, npts_per_finger)]
731
+
732
+ # Calculate finger direction vector for plane orientation
733
+ dir_vec = list_fingers[1][1] - list_fingers[-1][1] # index to pinky MCP
734
+ dir_vec = dir_vec / np.linalg.norm(dir_vec)
735
+
736
+ # Ensure consistent plane orientation (normal pointing away from palm)
737
+ if np.dot(dir_vec, grasp_plane[:3]) > 0:
738
+ grasp_plane = -grasp_plane
739
+
740
+ # Create slightly offset plane for grasp point calculation
741
+ shifted_grasp_plane = self.get_parallel_plane(*grasp_plane, 0.01)
742
+ grasp_pt = self.calculate_grasp_point(shifted_grasp_plane, constrained_vertices)
743
+
744
+ # Calculate gripper orientation using the grasp plane
745
+ gripper_ori, _ = HandModel.get_gripper_orientation(thumb_tip, index_tip, constrained_vertices, grasp_plane)
746
+
747
+ # Apply coordinate frame transformations for robot compatibility
748
+ rot_90_deg = Rotation.from_euler('Z', 90, degrees=True).as_matrix()
749
+ grasp_ori = gripper_ori @ rot_90_deg
750
+
751
+ # Apply pitch adjustment
752
+ angle = -np.pi/18 * 1.0 # -10 degrees
753
+ grasp_ori = Rotation.from_rotvec(angle * np.array([1, 0, 0])).apply(grasp_ori)
754
+
755
+ # Offset grasp point along gripper Z-axis for clearance
756
+ unit_vectors = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
757
+ transformed_vectors = unit_vectors @ grasp_ori.T
758
+ grasp_pt = grasp_pt - transformed_vectors[2] * 0.015 # 1.5cm offset
759
+
760
+ # Store all frame data
761
+ self.joint_rotations.append(constrained_rotations)
762
+ self.joint_rotations_xyz.append(rotations_xyz)
763
+ self.vertex_positions.append(constrained_vertices)
764
+ self.grasp_points.append(grasp_pt)
765
+ self.grasp_oris.append(grasp_ori)
766
+ self.timestamps.append(timestamp)
767
+
768
+
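The velocity-constraint step above can be exercised in isolation: the relative rotation between two frames is converted to a rotation vector, its rate is capped while preserving direction, and the clamped change is re-applied to the previous rotation. A minimal sketch of that idea, assuming a hypothetical limit of 4 rad/s (the actual `max_angular_velocity` value is not shown in this diff):

```python
import numpy as np
from scipy.spatial.transform import Rotation

def clamp_joint_rotation(prev_rot_mat, curr_rot_mat, dt, max_angular_velocity=4.0):
    """Clamp the rotation change between two frames to a maximum angular rate."""
    prev_rot = Rotation.from_matrix(prev_rot_mat)
    curr_rot = Rotation.from_matrix(curr_rot_mat)
    velocity = (curr_rot * prev_rot.inv()).as_rotvec() / dt  # rad/s
    speed = np.linalg.norm(velocity)
    if speed <= max_angular_velocity:
        return curr_rot_mat
    # Preserve direction, cap magnitude, then reintegrate over dt
    velocity = velocity * (max_angular_velocity / speed)
    return (Rotation.from_rotvec(velocity * dt) * prev_rot).as_matrix()

# 90 degrees in 1/15 s is ~23.6 rad/s, well above the 4 rad/s cap
prev = np.eye(3)
curr = Rotation.from_euler("z", 90, degrees=True).as_matrix()
clamped = clamp_joint_rotation(prev, curr, dt=1 / 15)
print(np.degrees(Rotation.from_matrix(clamped).magnitude()))  # ~15.3 degrees
```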
769
+ def get_list_finger_pts_from_skeleton(skeleton_pts: np.ndarray) -> Dict[str, np.ndarray]:
770
+ """
771
+ Organize hand skeleton points into finger-specific groups.
772
+
773
+ This utility function takes the 21-point hand skeleton and organizes
774
+ it into a dictionary with separate arrays for each finger. This makes
775
+ it easier to perform finger-specific calculations and analysis.
776
+
777
+ Args:
778
+ skeleton_pts: Hand skeleton points, shape (21, 3)
779
+ Points are ordered as: wrist, thumb(4), index(4), middle(4), ring(4), pinky(4)
780
+
781
+ Returns:
782
+ Dictionary with finger names as keys and point arrays as values:
783
+ - "thumb": Wrist + 4 thumb points, shape (5, 3)
784
+ - "index": Wrist + 4 index points, shape (5, 3)
785
+ - "middle": Wrist + 4 middle points, shape (5, 3)
786
+ - "ring": Wrist + 4 ring points, shape (5, 3)
787
+ - "pinky": Wrist + 4 pinky points, shape (5, 3)
788
+ """
789
+ n_fingers = len(skeleton_pts) - 1 # Exclude wrist point
790
+ npts_per_finger = 4 # MCP, PIP, DIP, TIP for each finger
791
+
792
+ # Create finger arrays by combining wrist with each finger's points
793
+ list_fingers = [
794
+ np.vstack([skeleton_pts[0], skeleton_pts[i : i + npts_per_finger]])
795
+ for i in range(1, n_fingers, npts_per_finger)
796
+ ]
797
+
798
+ # Return organized finger dictionary
799
+ return {
800
+ "thumb": list_fingers[0],
801
+ "index": list_fingers[1],
802
+ "middle": list_fingers[2],
803
+ "ring": list_fingers[3],
804
+ "pinky": list_fingers[4]
805
+ }
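Since `get_list_finger_pts_from_skeleton` depends only on the documented 21-point ordering (wrist followed by four points per finger), its behaviour can be checked with synthetic data. A small sketch, assuming nothing beyond that ordering:

```python
import numpy as np
from phantom.hand import get_list_finger_pts_from_skeleton

# Synthetic skeleton: wrist at the origin, finger i occupies rows 1+4i .. 4+4i
skeleton = np.zeros((21, 3))
for finger_idx in range(5):
    skeleton[1 + 4 * finger_idx : 5 + 4 * finger_idx, 0] = finger_idx + 1

fingers = get_list_finger_pts_from_skeleton(skeleton)
assert fingers["thumb"].shape == (5, 3)       # wrist + 4 thumb joints
assert np.all(fingers["pinky"][1:, 0] == 5)   # pinky rows carry the marker value 5

# Thumb-tip to index-tip distance, the same quantity used later as a gripper-width proxy
print(np.linalg.norm(fingers["thumb"][-1] - fingers["index"][-1]))  # 1.0
```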
phantom/phantom/process_data.py ADDED
@@ -0,0 +1,243 @@
1
+ import logging
2
+ from enum import Enum
3
+ from tqdm import tqdm
4
+ from joblib import Parallel, delayed # type: ignore
5
+ import hydra
6
+ from omegaconf import DictConfig
7
+
8
+ from phantom.processors.base_processor import BaseProcessor
9
+
10
+ logging.basicConfig(level=logging.WARNING, format="%(name)s - %(levelname)s - %(message)s")
11
+
12
+ class ProcessingMode(Enum):
13
+ """Enumeration of valid processing modes."""
14
+ BBOX = "bbox"
15
+ HAND2D = "hand2d"
16
+ HAND3D = "hand3d"
17
+ HAND_SEGMENTATION = "hand_segmentation"
18
+ ARM_SEGMENTATION = "arm_segmentation"
19
+ ACTION = "action"
20
+ SMOOTHING = "smoothing"
21
+ HAND_INPAINT = "hand_inpaint"
22
+ ROBOT_INPAINT = "robot_inpaint"
23
+ ALL = "all"
24
+
25
+ PROCESSING_ORDER = [
26
+ "bbox",
27
+ "hand2d",
28
+ "arm_segmentation",
29
+ "hand_segmentation",
30
+ "hand3d",
31
+ "action",
32
+ "smoothing",
33
+ "hand_inpaint",
34
+ "robot_inpaint",
35
+ ]
36
+
37
+ PROCESSING_ORDER_EPIC = [
38
+ "bbox",
39
+ "hand2d",
40
+ "arm_segmentation",
41
+ "action",
42
+ "smoothing",
43
+ "hand_inpaint",
44
+ "robot_inpaint",
45
+ ]
46
+
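The mode-selection logic below is repeated in each of the three entry points; its intended semantics can be summarised in one standalone helper. This is a sketch of the behaviour only, not a function that exists in the repository (`resolve_modes` is a hypothetical name):

```python
def resolve_modes(mode, processing_order):
    """Expand a mode spec ('all', 'bbox,hand3d', or a list) into a list of steps."""
    if isinstance(mode, str):
        modes = [m.strip() for m in mode.split(",")] if "," in mode else [mode]
    else:
        modes = list(mode)
    selected = []
    for m in modes:
        if m == "all":
            selected.extend(processing_order)  # full pipeline in canonical order
        elif m in processing_order:
            selected.append(m)                 # explicit modes keep the user's order
    return selected

assert resolve_modes("bbox,hand3d", ["bbox", "hand2d", "hand3d"]) == ["bbox", "hand3d"]
assert resolve_modes("all", ["bbox", "hand2d"]) == ["bbox", "hand2d"]
assert resolve_modes(["hand3d", "bbox"], ["bbox", "hand2d", "hand3d"]) == ["hand3d", "bbox"]
```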
47
+ def process_one_demo(data_sub_folder: str, cfg: DictConfig, processor_classes: dict) -> None:
48
+ # Choose processing order based on epic flag
49
+ processing_order = PROCESSING_ORDER_EPIC if cfg.epic else PROCESSING_ORDER
50
+
51
+ # Handle both string and list modes
52
+ if isinstance(cfg.mode, str):
53
+ # Handle comma-separated string format
54
+ if ',' in cfg.mode:
55
+ selected_modes = []
56
+ for mode in cfg.mode.split(','):
57
+ mode = mode.strip()
58
+ if mode == "all":
59
+ selected_modes.extend(processing_order)
60
+ elif mode in processing_order:
61
+ selected_modes.append(mode)
62
+ else:
63
+ selected_modes = [m for m in processing_order if m in cfg.mode or "all" in cfg.mode]
64
+ else:
65
+ # For list of modes, use the order provided by user
66
+ selected_modes = []
67
+ for mode in cfg.mode:
68
+ if mode == "all":
69
+ selected_modes.extend(processing_order)
70
+ elif mode in processing_order:
71
+ selected_modes.append(mode)
72
+
73
+ for mode in selected_modes:
74
+ print(f"----------------- {mode.upper()} PROCESSOR -----------------")
75
+ processor_cls = processor_classes[mode]
76
+ processor = processor_cls(cfg)
77
+ try:
78
+ processor.process_one_demo(data_sub_folder)
79
+ except Exception as e:
80
+ print(f"Error in {mode} processing: {e}")
81
+ if cfg.debug:
82
+ raise
83
+
84
+ def process_all_demos(cfg: DictConfig, processor_classes: dict) -> None:
85
+ # Choose processing order based on epic flag
86
+ processing_order = PROCESSING_ORDER_EPIC if cfg.epic else PROCESSING_ORDER
87
+
88
+ # Handle both string and list modes
89
+ if isinstance(cfg.mode, str):
90
+ # Handle comma-separated string format
91
+ if ',' in cfg.mode:
92
+ selected_modes = []
93
+ for mode in cfg.mode.split(','):
94
+ mode = mode.strip()
95
+ if mode == "all":
96
+ selected_modes.extend(processing_order)
97
+ elif mode in processing_order:
98
+ selected_modes.append(mode)
99
+ else:
100
+ selected_modes = [m for m in processing_order if m in cfg.mode or "all" in cfg.mode]
101
+ else:
102
+ # For list of modes, use the order provided by user
103
+ selected_modes = []
104
+ for mode in cfg.mode:
105
+ if mode == "all":
106
+ selected_modes.extend(processing_order)
107
+ elif mode in processing_order:
108
+ selected_modes.append(mode)
109
+
110
+ base_processor = BaseProcessor(cfg)
111
+ all_data_folders = base_processor.all_data_folders.copy()
112
+ for mode in selected_modes:
113
+ print(f"----------------- {mode.upper()} PROCESSOR -----------------")
114
+ processor_cls = processor_classes[mode]
115
+ processor = processor_cls(cfg)
116
+ for data_sub_folder in tqdm(all_data_folders):
117
+ try:
118
+ processor.process_one_demo(data_sub_folder)
119
+ except Exception as e:
120
+ print(f"Error in {mode} processing: {e}")
121
+ if cfg.debug:
122
+ raise
123
+
124
+ def process_all_demos_parallel(cfg: DictConfig, processor_classes: dict) -> None:
125
+ # Choose processing order based on epic flag
126
+ processing_order = PROCESSING_ORDER_EPIC if cfg.epic else PROCESSING_ORDER
127
+
128
+ # Handle both string and list modes
129
+ if isinstance(cfg.mode, str):
130
+ # Handle comma-separated string format
131
+ if ',' in cfg.mode:
132
+ selected_modes = []
133
+ for mode in cfg.mode.split(','):
134
+ mode = mode.strip()
135
+ if mode == "all":
136
+ selected_modes.extend(processing_order)
137
+ elif mode in processing_order:
138
+ selected_modes.append(mode)
139
+ else:
140
+ selected_modes = [m for m in processing_order if m in cfg.mode or "all" in cfg.mode]
141
+ else:
142
+ # For list of modes, use the order provided by user
143
+ selected_modes = []
144
+ for mode in cfg.mode:
145
+ if mode == "all":
146
+ selected_modes.extend(processing_order)
147
+ elif mode in processing_order:
148
+ selected_modes.append(mode)
149
+
150
+ base_processor = BaseProcessor(cfg)
151
+ all_data_folders = base_processor.all_data_folders.copy()
152
+ for mode in selected_modes:
153
+ print(f"----------------- {mode.upper()} PROCESSOR -----------------")
154
+ processor_cls = processor_classes[mode]
155
+ processor = processor_cls(cfg)
156
+ Parallel(n_jobs=cfg.n_processes)(
157
+ delayed(processor.process_one_demo)(data_sub_folder) for data_sub_folder in all_data_folders
158
+ )
159
+
160
+ def get_processor_classes(cfg: DictConfig) -> dict:
161
+ """Lazily import and return a mapping from processing-mode name to processor class."""
162
+ from phantom.processors.bbox_processor import BBoxProcessor
163
+ from phantom.processors.segmentation_processor import HandSegmentationProcessor, ArmSegmentationProcessor
164
+ from phantom.processors.hand_processor import Hand2DProcessor, Hand3DProcessor
165
+ from phantom.processors.action_processor import ActionProcessor
166
+ from phantom.processors.smoothing_processor import SmoothingProcessor
167
+ from phantom.processors.robotinpaint_processor import RobotInpaintProcessor
168
+ from phantom.processors.handinpaint_processor import HandInpaintProcessor
169
+
170
+ return {
171
+ "bbox": BBoxProcessor,
172
+ "hand2d": Hand2DProcessor,
173
+ "hand3d": Hand3DProcessor,
174
+ "hand_segmentation": HandSegmentationProcessor,
175
+ "arm_segmentation": ArmSegmentationProcessor,
176
+ "action": ActionProcessor,
177
+ "smoothing": SmoothingProcessor,
178
+ "robot_inpaint": RobotInpaintProcessor,
179
+ "hand_inpaint": HandInpaintProcessor,
180
+ }
181
+
182
+ def validate_mode(cfg: DictConfig) -> None:
183
+ """
184
+ Validate that the mode parameter contains only valid processing modes.
185
+
186
+ Args:
187
+ cfg: Configuration object containing mode parameter
188
+
189
+ Raises:
190
+ ValueError: If mode contains invalid options
191
+ """
192
+ if isinstance(cfg.mode, str):
193
+ # Handle comma-separated string format
194
+ if ',' in cfg.mode:
195
+ modes = [mode.strip() for mode in cfg.mode.split(',')]
196
+ else:
197
+ modes = [cfg.mode]
198
+ else:
199
+ modes = cfg.mode
200
+
201
+ # Get valid modes from enum
202
+ valid_modes = {mode.value for mode in ProcessingMode}
203
+ invalid_modes = [mode for mode in modes if mode not in valid_modes]
204
+
205
+ if invalid_modes:
206
+ valid_mode_list = [mode.value for mode in ProcessingMode]
207
+ raise ValueError(
208
+ f"Invalid mode(s): {invalid_modes}. "
209
+ f"Valid modes are: {valid_mode_list}"
210
+ )
211
+
212
+ def main(cfg: DictConfig):
213
+ # Validate mode parameter
214
+ validate_mode(cfg)
215
+
216
+ # Get processor classes
217
+ processor_classes = get_processor_classes(cfg)
218
+
219
+ if cfg.n_processes > 1:
220
+ process_all_demos_parallel(cfg, processor_classes)
221
+ elif cfg.demo_num is not None:
222
+ process_one_demo(cfg.demo_num, cfg, processor_classes)
223
+ else:
224
+ process_all_demos(cfg, processor_classes)
225
+
226
+ @hydra.main(version_base=None, config_path="../configs", config_name="default")
227
+ def hydra_main(cfg: DictConfig):
228
+ """
229
+ Main entry point using Hydra configuration.
230
+
231
+ Example usage:
232
+ - Process all demos with bbox: python process_data.py mode=bbox
233
+ - Process single demo: python process_data.py mode=bbox demo_num=0
234
+ - Use EPIC dataset: python process_data.py dataset=epic mode=bbox
235
+ - Parallel processing: python process_data.py mode=bbox n_processes=4
236
+ - Process multiple modes sequentially: python process_data.py mode=bbox,hand3d
237
+ - Process with custom order: python process_data.py mode=hand3d,bbox,action
238
+ - Process with bracket notation (use quotes): python process_data.py "mode=[bbox,hand3d]"
239
+ """
240
+ main(cfg)
241
+
242
+ if __name__ == "__main__":
243
+ hydra_main()
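For a quick check of the mode validation outside of Hydra, a plain `OmegaConf` config is enough; this sketch assumes only the `mode` field that `validate_mode` reads:

```python
from omegaconf import OmegaConf
from phantom.process_data import ProcessingMode, validate_mode

validate_mode(OmegaConf.create({"mode": "bbox,hand3d"}))  # passes silently

try:
    validate_mode(OmegaConf.create({"mode": "not_a_mode"}))
except ValueError as e:
    print(e)  # message lists the valid ProcessingMode values

print([m.value for m in ProcessingMode])
```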
phantom/phantom/processors/__init__.py ADDED
File without changes
phantom/phantom/processors/action_processor.py ADDED
@@ -0,0 +1,478 @@
1
+ """
2
+ Action Processor Module
3
+
4
+ This module processes hand motion capture data and converts it into robot-executable actions.
5
+ It handles both single-arm and bimanual robotic setups, converting detected hand keypoints
6
+ into end-effector positions, orientations, and gripper widths that can be used for robot control.
7
+
8
+ Key Features:
9
+ - Converts hand keypoints from camera frame to robot frame
10
+ - Supports both unconstrained and physically constrained hand models
11
+ - Handles missing hand detections with interpolation
12
+ - Processes bimanual data with union-based frame selection
13
+ - Generates neutral poses when no hand data is available
14
+
15
+ The processor follows this pipeline:
16
+ 1. Load hand sequence data (keypoints, detection flags)
17
+ 2. Convert keypoints to robot coordinate frame
18
+ 3. Apply hand model constraints (optional)
19
+ 4. Extract end-effector poses and gripper states
20
+ 5. Refine actions to handle missing detections
21
+ 6. Save processed actions for robot execution
22
+ """
23
+
24
+ import os
25
+ import numpy as np
26
+ from typing import Tuple, Optional
27
+ from dataclasses import dataclass
28
+ import logging
29
+ from scipy.spatial.transform import Rotation
30
+
31
+ from phantom.processors.base_processor import BaseProcessor
32
+ from phantom.processors.phantom_data import HandSequence
33
+ from phantom.processors.paths import Paths
34
+ from phantom.hand import HandModel, PhysicallyConstrainedHandModel, get_list_finger_pts_from_skeleton
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+ @dataclass
39
+ class EEActions:
40
+ """
41
+ Container for bimanual end-effector action data.
42
+
43
+ This dataclass holds the processed robot actions for a sequence of timesteps,
44
+ including 3D positions, 3D orientations, and gripper opening widths.
45
+
46
+ Attributes:
47
+ ee_pts (np.ndarray): End-effector positions, shape (N, 3) in robot frame coordinates
48
+ ee_oris (np.ndarray): End-effector orientations as rotation matrices, shape (N, 3, 3)
49
+ ee_widths (np.ndarray): Gripper opening widths in meters, shape (N,)
50
+ """
51
+ ee_pts: np.ndarray # End-effector positions (N, 3)
52
+ ee_oris: np.ndarray # End-effector orientations (N, 3, 3) as rotation matrices
53
+ ee_widths: np.ndarray # Gripper widths (N,)
54
+
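As a concrete illustration of the expected shapes, a three-frame `EEActions` container might look like this (the values are placeholders, not real robot poses):

```python
import numpy as np
from phantom.processors.action_processor import EEActions

n_frames = 3
actions = EEActions(
    ee_pts=np.zeros((n_frames, 3)),                # XYZ positions in the robot frame
    ee_oris=np.tile(np.eye(3), (n_frames, 1, 1)),  # identity rotation matrices
    ee_widths=np.full(n_frames, 0.085),            # fully open gripper (8.5 cm)
)
assert actions.ee_oris.shape == (3, 3, 3)
```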
55
+ class ActionProcessor(BaseProcessor):
56
+ """
57
+ Processor for converting hand motion capture data into robot-executable actions.
58
+
59
+ This class handles the complete pipeline from raw hand keypoints to refined robot actions.
60
+ It supports both single-arm and bimanual robotic setups, with intelligent handling of
61
+ missing hand detections and physically realistic constraints.
62
+
63
+ The processor can operate in different modes:
64
+ - Single arm: Processes only left or right hand data
65
+ - Bimanual: Processes both hands with union-based frame selection
66
+
67
+ Key processing steps:
68
+ 1. Load hand sequences with 3D keypoints and detection flags
69
+ 2. Transform keypoints from camera frame to robot frame
70
+ 3. Fit hand model (optionally with physical constraints)
71
+ 4. Extract end-effector poses and gripper states
72
+ 5. Refine actions using last-valid-value interpolation
73
+ 6. Generate neutral poses for undetected periods
74
+
75
+ Attributes:
76
+ dt (float): Time delta between frames (1/15 seconds for 15Hz processing)
77
+ bimanual_setup (str): Setup type ("single_arm", "shoulders", etc.)
78
+ target_hand (str): Which hand to process in single-arm mode ("left"/"right")
79
+ constrained_hand (bool): Whether to use physically constrained hand model
80
+ T_cam2robot (np.ndarray): 4x4 transformation matrix from camera to robot frame
81
+ """
82
+ def __init__(self, args):
83
+ # Set processing frequency to 15Hz
84
+ self.dt = 1/15
85
+ super().__init__(args)
86
+
87
+ def process_one_demo(self, data_sub_folder: str) -> None:
88
+ """
89
+ Process a single demonstration recording into robot actions.
90
+
91
+ This is the main entry point for processing one demo. It handles both
92
+ single-arm and bimanual processing modes, loading the raw hand data,
93
+ converting it to robot actions, and saving the results.
94
+
95
+ Args:
96
+ data_sub_folder (str): Path to the folder containing this demo's data
97
+ """
98
+ save_folder = self.get_save_folder(data_sub_folder)
99
+ paths = self.get_paths(save_folder)
100
+
101
+ # Load hand sequence data for both hands
102
+ left_sequence, right_sequence = self._load_sequences(paths)
103
+
104
+ # Handle single-arm processing mode
105
+ if self.bimanual_setup == "single_arm":
106
+ self._process_single_arm(left_sequence, right_sequence, paths)
107
+ else:
108
+ self._process_bimanual(left_sequence, right_sequence, paths)
109
+
110
+ def _process_single_arm(self, left_sequence: HandSequence, right_sequence: HandSequence, paths) -> None:
111
+ """Process single-arm setup with one target hand."""
112
+ # Select target hand based on configuration
113
+ target_sequence = left_sequence if self.target_hand == "left" else right_sequence
114
+
115
+ # Process the selected hand sequence
116
+ target_actions = self._process_hand_sequence(target_sequence, self.T_cam2robot)
117
+
118
+ # Get indices where hand was detected for this sequence
119
+ union_indices = np.where(target_sequence.hand_detected)[0]
120
+
121
+ # Refine actions to handle missing detections
122
+ target_actions_refined = self._refine_actions(target_sequence, target_actions, union_indices, self.target_hand)
123
+
124
+ # Save results for the selected hand only
125
+ if self.target_hand == "left":
126
+ self._save_results(paths, union_indices=union_indices, left_actions=target_actions_refined)
127
+ else:
128
+ self._save_results(paths, union_indices=union_indices, right_actions=target_actions_refined)
129
+
130
+ def _process_bimanual(self, left_sequence: HandSequence, right_sequence: HandSequence, paths) -> None:
131
+ """Process bimanual setup with both hands."""
132
+ # Process both hand sequences
133
+ left_actions = self._process_hand_sequence(left_sequence, self.T_cam2robot)
134
+ right_actions = self._process_hand_sequence(right_sequence, self.T_cam2robot)
135
+
136
+ # Combine detection results using OR logic - frame is valid if either hand detected
137
+ union_indices = np.where(left_sequence.hand_detected | right_sequence.hand_detected)[0]
138
+
139
+ # Refine actions for both hands using the union indices
140
+ left_actions_refined = self._refine_actions(left_sequence, left_actions, union_indices, "left")
141
+ right_actions_refined = self._refine_actions(right_sequence, right_actions, union_indices, "right")
142
+
143
+ # Save results for both hands
144
+ self._save_results(paths, union_indices, left_actions_refined, right_actions_refined)
145
+
146
+
147
+ def _load_sequences(self, paths) -> Tuple[HandSequence, HandSequence]:
148
+ """
149
+ Load hand sequences from disk for both left and right hands.
150
+
151
+ HandSequence objects contain the processed keypoint data, detection flags,
152
+ and other metadata needed for action processing.
153
+
154
+ Args:
155
+ paths: Paths object containing file locations for hand data
156
+
157
+ Returns:
158
+ Tuple[HandSequence, HandSequence]: Left and right hand sequences
159
+ """
160
+ return (
161
+ HandSequence.load(paths.hand_data_left),
162
+ HandSequence.load(paths.hand_data_right)
163
+ )
164
+
165
+ def _process_hand_sequence(
166
+ self,
167
+ sequence: HandSequence,
168
+ T_cam2robot: np.ndarray,
169
+ ) -> EEActions:
170
+ """
171
+ Process a single hand sequence into end-effector actions.
172
+
173
+ This method performs the following processing pipeline for one hand:
174
+ 1. Transform keypoints from camera frame to robot frame
175
+ 2. Fit a hand model to the keypoint sequence
176
+ 3. Extract end-effector poses and gripper states
177
+
178
+ Args:
179
+ sequence (HandSequence): Hand keypoint sequence with detection flags
180
+ T_cam2robot (np.ndarray): 4x4 transformation matrix from camera to robot frame
181
+
182
+ Returns:
183
+ EEActions: Processed end-effector positions, orientations, and gripper widths
184
+ """
185
+ # Convert keypoints from camera frame to robot frame coordinates
186
+ kpts_3d_cf = sequence.kpts_3d # Camera frame keypoints
187
+ kpts_3d_rf = ActionProcessor._convert_pts_to_robot_frame(
188
+ kpts_3d_cf,
189
+ T_cam2robot
190
+ )
191
+
192
+ # Create and fit hand model to the keypoint sequence
193
+ hand_model = self._get_hand_model(kpts_3d_rf, sequence.hand_detected)
194
+
195
+ # Extract end-effector poses and gripper states from fitted model
196
+ kpts_3d, ee_pts, ee_oris = self._get_model_keypoints(hand_model)
197
+
198
+ # Compute gripper opening distances from fingertip positions
199
+ ee_widths = self._compute_gripper_distances(
200
+ kpts_3d,
201
+ sequence.hand_detected
202
+ )
203
+
204
+ return EEActions(
205
+ ee_pts=ee_pts,
206
+ ee_oris=ee_oris,
207
+ ee_widths=ee_widths,
208
+ )
209
+
210
+ def _get_hand_model(self, kpts_3d_rf: np.ndarray, hand_detected: np.ndarray) -> HandModel | PhysicallyConstrainedHandModel:
211
+ """
212
+ Create and fit a hand model to the keypoint sequence.
213
+
214
+ The hand model can be either unconstrained (simple fitting) or physically
215
+ constrained (enforces realistic hand poses and robot constraints).
216
+
217
+ Args:
218
+ kpts_3d_rf (np.ndarray): Hand keypoints in robot frame, shape (N, 21, 3)
219
+ hand_detected (np.ndarray): Boolean array indicating valid detections, shape (N,)
220
+
221
+ Returns:
222
+ HandModel | PhysicallyConstrainedHandModel: Fitted hand model with trajectory data
223
+ """
224
+ # Choose hand model type based on configuration
225
+ if self.constrained_hand:
226
+ hand_model = PhysicallyConstrainedHandModel(self.robot)
227
+ else:
228
+ hand_model = HandModel(self.robot)
229
+
230
+ # Add each frame to the model for trajectory fitting
231
+ for t_idx in range(len(kpts_3d_rf)):
232
+ hand_model.add_frame(
233
+ kpts_3d_rf[t_idx],
234
+ t_idx * self.dt, # Convert frame index to time
235
+ hand_detected[t_idx]
236
+ )
237
+ return hand_model
238
+
239
+ def _get_model_keypoints(self, model: HandModel | PhysicallyConstrainedHandModel) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
240
+ """
241
+ Extract keypoints and end-effector data from fitted hand model.
242
+
243
+ Args:
244
+ model (HandModel | PhysicallyConstrainedHandModel): Fitted hand model
245
+
246
+ Returns:
247
+ Tuple containing:
248
+ - kpts_3d (np.ndarray): Model keypoint positions, shape (N, 21, 3)
249
+ - ee_pts (np.ndarray): End-effector positions, shape (N, 3)
250
+ - ee_oris (np.ndarray): End-effector orientations, shape (N, 3, 3)
251
+ """
252
+ kpts_3d = np.array(model.vertex_positions) # All hand keypoints
253
+ ee_pts = np.array(model.grasp_points) # End-effector positions (palm center)
254
+ ee_oris = np.array(model.grasp_oris) # End-effector orientations (rotation matrices)
255
+ return kpts_3d, ee_pts, ee_oris
256
+
257
+ def _compute_gripper_distances(
258
+ self,
259
+ kpts_3d_rf: np.ndarray,
260
+ hand_detected: np.ndarray
261
+ ) -> np.ndarray:
262
+ """
263
+ Compute gripper opening distances for all frames in the sequence.
264
+
265
+ The gripper distance is calculated as the Euclidean distance between
266
+ the thumb tip and index finger tip, providing a proxy for gripper state.
267
+
268
+ Args:
269
+ kpts_3d_rf (np.ndarray): Hand keypoints in robot frame, shape (N, 21, 3)
270
+ hand_detected (np.ndarray): Boolean flags for valid detections, shape (N,)
271
+
272
+ Returns:
273
+ np.ndarray: Gripper distances for each frame, shape (N,)
274
+ """
275
+ gripper_dists = np.zeros(len(kpts_3d_rf))
276
+
277
+ for idx in range(len(kpts_3d_rf)):
278
+ if hand_detected[idx]:
279
+ # Only compute distance for frames with valid hand detection
280
+ gripper_dists[idx] = ActionProcessor._compute_gripper_opening(
281
+ kpts_3d_rf[idx]
282
+ )
283
+ # Note: Invalid frames remain at 0.0, will be refined later
284
+ return gripper_dists
285
+
286
+ def _refine_actions(
287
+ self,
288
+ sequence: HandSequence,
289
+ actions: EEActions,
290
+ union_indices: np.ndarray,
291
+ hand_side: str
292
+ ) -> EEActions:
293
+ """
294
+ Refine actions to handle missing hand detections using last-valid-value interpolation.
295
+
296
+ When hand detection fails, this method fills in missing values by carrying forward
297
+ the last valid pose and gripper state. This creates smooth, executable trajectories
298
+ even when the vision system temporarily loses tracking.
299
+
300
+ Args:
301
+ sequence (HandSequence): Original hand sequence with detection flags
302
+ actions (EEActions): Raw actions from hand model
303
+ union_indices (np.ndarray): Frame indices to include in final trajectory
304
+ hand_side (str): "left" or "right" for neutral pose generation
305
+
306
+ Returns:
307
+ EEActions: Refined actions with interpolated values for missing detections
308
+ """
309
+ # Find frames where this hand was actually detected
310
+ hand_detected_indices = np.where(sequence.hand_detected)[0]
311
+
312
+ # If no valid detections, return neutral pose for entire sequence
313
+ if len(hand_detected_indices) == 0:
314
+ return self._get_neutral_actions(hand_side, len(union_indices))
315
+
316
+ # Apply carry-forward interpolation
317
+ return self._apply_carry_forward_interpolation(sequence, actions, union_indices, hand_detected_indices)
318
+
319
+ def _apply_carry_forward_interpolation(
320
+ self,
321
+ sequence: HandSequence,
322
+ actions: EEActions,
323
+ union_indices: np.ndarray,
324
+ hand_detected_indices: np.ndarray
325
+ ) -> EEActions:
326
+ """Apply last-valid-value interpolation to fill missing detections."""
327
+ # Initialize with first valid detection values
328
+ first_valid_idx = hand_detected_indices[0]
329
+ last_valid_pt = actions.ee_pts[first_valid_idx]
330
+ last_valid_ori = actions.ee_oris[first_valid_idx]
331
+ last_valid_width = actions.ee_widths[first_valid_idx]
332
+
333
+ # Process each frame in the union sequence
334
+ ee_pts_refined = []
335
+ ee_oris_refined = []
336
+ ee_widths_refined = []
337
+
338
+ for idx in union_indices:
339
+ if sequence.hand_detected[idx]:
340
+ # Update with new valid values when available
341
+ last_valid_pt = actions.ee_pts[idx]
342
+ last_valid_ori = actions.ee_oris[idx]
343
+ last_valid_width = actions.ee_widths[idx]
344
+
345
+ # Always append the last valid values (carry-forward for missing frames)
346
+ ee_pts_refined.append(last_valid_pt)
347
+ ee_oris_refined.append(last_valid_ori)
348
+ ee_widths_refined.append(last_valid_width)
349
+
350
+ return EEActions(
351
+ ee_pts=np.array(ee_pts_refined),
352
+ ee_oris=np.array(ee_oris_refined),
353
+ ee_widths=np.array(ee_widths_refined),
354
+ )
355
+
356
+ def _get_neutral_actions(self, hand_side: str, n_frames: int) -> EEActions:
357
+ """
358
+ Generate neutral pose actions when no hand detection is available.
359
+
360
+ Neutral poses place the robot arms in out-of-frame positions.
361
+
362
+ Args:
363
+ hand_side (str): "left" or "right" to determine which neutral pose to use
364
+ n_frames (int): Number of frames to generate
365
+
366
+ Returns:
367
+ EEActions: Neutral pose actions for the specified number of frames
368
+ """
369
+ # Define neutral pose configurations
370
+ neutral_configs = {
371
+ "single_arm": {
372
+ "right": {"pos": [0.2, -0.8, 0.3], "quat": [1, 0.0, 0.0, 0.0]},
373
+ "left": {"pos": [0.2, 0.8, 0.3], "quat": [1, 0.0, 0.0, 0.0]}
374
+ },
375
+ "shoulders": {
376
+ "right": {"pos": [0.4, -0.5, 0.3], "quat": [-0.7071, 0.0, 0.0, 0.7071]},
377
+ "left": {"pos": [0.4, 0.5, 0.3], "quat": [0.7071, 0.0, 0.0, 0.7071]}
378
+ }
379
+ }
380
+
381
+ # Get configuration for current setup and hand
382
+ config = neutral_configs[self.bimanual_setup][hand_side]
383
+
384
+ # Convert to numpy arrays and create rotation matrix
385
+ neutral_pos = np.array(config["pos"])
386
+ neutral_ori = Rotation.from_quat(config["quat"], scalar_first=False).as_matrix()
387
+ neutral_width = 0.085 # Standard gripper opening (8.5cm)
388
+
389
+ # Create arrays replicated for all frames
390
+ return EEActions(
391
+ ee_pts=np.repeat(neutral_pos.reshape(1, 3), n_frames, axis=0),
392
+ ee_oris=np.repeat(neutral_ori.reshape(1, 3, 3), n_frames, axis=0),
393
+ ee_widths=np.full(n_frames, neutral_width)
394
+ )
395
+
396
+ def _save_results(
397
+ self,
398
+ paths: Paths,
399
+ union_indices: np.ndarray,
400
+ left_actions: Optional[EEActions] = None,
401
+ right_actions: Optional[EEActions] = None,
402
+ ) -> None:
403
+ """
404
+ Save processed action results to disk in NPZ format.
405
+
406
+ The saved files contain all necessary data for robot execution:
407
+ - union_indices: Valid frame indices in the original sequence
408
+ - ee_pts: End-effector positions
409
+ - ee_oris: End-effector orientations (rotation matrices)
410
+ - ee_widths: Gripper opening widths
411
+
412
+ Args:
413
+ paths (Paths): File path configuration object
414
+ union_indices (np.ndarray): Valid frame indices
415
+ left_actions (Optional[EEActions]): Left hand actions to save
416
+ right_actions (Optional[EEActions]): Right hand actions to save
417
+ """
418
+ # Create output directory if it doesn't exist
419
+ os.makedirs(paths.action_processor, exist_ok=True)
420
+
421
+ # Save actions for each hand if provided
422
+ if left_actions is not None:
423
+ self._save_hand_actions(paths.actions_left, union_indices, left_actions)
424
+ if right_actions is not None:
425
+ self._save_hand_actions(paths.actions_right, union_indices, right_actions)
426
+
427
+ def _save_hand_actions(self, base_path: str, union_indices: np.ndarray, actions: EEActions) -> None:
428
+ """Save actions for a single hand to NPZ file."""
429
+ file_path = str(base_path).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
430
+ np.savez(
431
+ file_path,
432
+ union_indices=union_indices,
433
+ ee_pts=actions.ee_pts,
434
+ ee_oris=actions.ee_oris,
435
+ ee_widths=actions.ee_widths
436
+ )
437
+
438
+ @staticmethod
439
+ def _compute_gripper_opening(skeleton_pts: np.ndarray) -> float:
440
+ """
441
+ Compute gripper opening distance from hand keypoints for a single frame.
442
+
443
+ The gripper distance is calculated as the Euclidean distance between
444
+ the thumb tip and index finger tip.
445
+
446
+ Args:
447
+ skeleton_pts (np.ndarray): Hand keypoints for one frame, shape (21, 3)
448
+
449
+ Returns:
450
+ float: Distance between thumb tip and index finger tip in meters
451
+ """
452
+ # Extract finger tip positions from the hand skeleton
453
+ finger_dict = get_list_finger_pts_from_skeleton(skeleton_pts)
454
+
455
+ # Compute distance between thumb tip and index finger tip
456
+ return np.linalg.norm(finger_dict["thumb"][-1] - finger_dict["index"][-1])
457
+
458
+ @staticmethod
459
+ def _convert_pts_to_robot_frame(skeleton_poses_cf: np.ndarray, T_cam2robot: np.ndarray) -> np.ndarray:
460
+ """
461
+ Convert hand keypoints from camera frame to robot frame coordinates.
462
+
463
+ Args:
464
+ skeleton_poses_cf (np.ndarray): Hand poses in camera frame, shape (N, 21, 3)
465
+ T_cam2robot (np.ndarray): 4x4 transformation matrix from camera to robot frame
466
+
467
+ Returns:
468
+ np.ndarray: Hand poses in robot frame, shape (N, 21, 3)
469
+ """
470
+ # Convert to homogeneous coordinates by adding ones
471
+ pts_h = np.ones((skeleton_poses_cf.shape[0], skeleton_poses_cf.shape[1], 1))
472
+ skeleton_poses_cf_h = np.concatenate([skeleton_poses_cf, pts_h], axis=-1)
473
+
474
+ # Apply transformation matrix to convert coordinate frames
475
+ skeleton_poses_rf_h0 = np.einsum('ij,bpj->bpi', T_cam2robot, skeleton_poses_cf_h)
476
+
477
+ # Remove homogeneous coordinate and return 3D points
478
+ return skeleton_poses_rf_h0[..., :3]
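The einsum in `_convert_pts_to_robot_frame` is just a batched rigid-body transform, so it can be sanity-checked against an explicit per-point formulation. A small sketch with a made-up camera-to-robot transform:

```python
import numpy as np
from scipy.spatial.transform import Rotation

rng = np.random.default_rng(0)
pts_cf = rng.normal(size=(5, 21, 3))  # 5 frames of 21 keypoints in the camera frame

T_cam2robot = np.eye(4)
T_cam2robot[:3, :3] = Rotation.from_euler("xyz", [10, -5, 30], degrees=True).as_matrix()
T_cam2robot[:3, 3] = [0.5, -0.1, 0.8]

# Batched version (same contraction as the einsum in the processor)
pts_h = np.concatenate([pts_cf, np.ones((*pts_cf.shape[:2], 1))], axis=-1)
pts_rf = np.einsum("ij,bpj->bpi", T_cam2robot, pts_h)[..., :3]

# Explicit version for one point: R @ p + t
p = pts_cf[0, 0]
expected = T_cam2robot[:3, :3] @ p + T_cam2robot[:3, 3]
assert np.allclose(pts_rf[0, 0], expected)
```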
phantom/phantom/processors/base_processor.py ADDED
@@ -0,0 +1,209 @@
1
+ import os
2
+ import json
3
+ import logging
4
+ import numpy as np
5
+ import shutil
6
+ import errno
7
+ from typing import Tuple
8
+ from pathlib import Path
9
+ from omegaconf import DictConfig
10
+
11
+ from phantom.utils.data_utils import get_parent_folder_of_package
12
+ from phantom.utils.image_utils import get_intrinsics_from_json, get_transformation_matrix_from_extrinsics
13
+ from phantom.processors.paths import Paths, PathsConfig
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ class BaseProcessor:
18
+ def __init__(self, cfg: DictConfig):
19
+ # Store configuration for potential future use
20
+ self.cfg = cfg
21
+
22
+ # Apply configuration to instance attributes
23
+ self._apply_config(cfg)
24
+
25
+ # Validate configuration
26
+ self._validate_config(cfg)
27
+
28
+ # Set up paths and data folders
29
+ self._setup_paths_and_folders(cfg)
30
+
31
+ # Initialize camera parameters
32
+ self._init_camera_parameters()
33
+
34
+ def _apply_config(self, cfg: DictConfig) -> None:
35
+ """Apply configuration to instance attributes."""
36
+ # Basic attributes
37
+ self.input_resolution = cfg.input_resolution
38
+ self.output_resolution = cfg.output_resolution
39
+ self.project_folder = get_parent_folder_of_package("phantom")
40
+ self.debug = cfg.debug
41
+ self.n_processes = cfg.n_processes
42
+ self.verbose = cfg.verbose
43
+ self.skip_existing = cfg.skip_existing
44
+ self.robot = cfg.robot
45
+ self.gripper = cfg.gripper
46
+ self.square = cfg.square
47
+ self.epic = cfg.epic
48
+ self.bimanual_setup = cfg.bimanual_setup
49
+ self.target_hand = cfg.target_hand
50
+ self.constrained_hand = cfg.constrained_hand
51
+ self.depth_for_overlay = cfg.depth_for_overlay
52
+ self.render = cfg.render
53
+ self.debug_cameras = getattr(cfg, 'debug_cameras', [])
54
+
55
+ # Apply bimanual setup logic
56
+ if self.bimanual_setup != "single_arm":
57
+ self.target_hand = "both"
58
+
59
+ def _validate_config(self, cfg: DictConfig) -> None:
60
+ """Validate critical configuration parameters."""
61
+ if cfg.input_resolution <= 0 or cfg.output_resolution <= 0:
62
+ raise ValueError(f"Resolutions must be positive: input={cfg.input_resolution}, output={cfg.output_resolution}")
63
+
64
+ if not os.path.exists(cfg.data_root_dir):
65
+ raise FileNotFoundError(f"Data root directory not found: {cfg.data_root_dir}")
66
+
67
+ if not os.path.exists(cfg.camera_intrinsics):
68
+ raise FileNotFoundError(f"Camera intrinsics file not found: {cfg.camera_intrinsics}")
69
+
70
+ def _setup_paths_and_folders(self, cfg: DictConfig) -> None:
71
+ """Set up paths configuration and create necessary directories."""
72
+ # Set up paths configuration
73
+ self.paths_config = PathsConfig()
74
+ self.paths_config.config['data_root'] = cfg.data_root_dir
75
+ self.paths_config.config['processed_root'] = cfg.processed_data_root_dir
76
+
77
+ self.data_folder = os.path.join(cfg.data_root_dir, cfg.demo_name)
78
+ self.processed_data_folder = os.path.join(cfg.processed_data_root_dir, cfg.demo_name)
79
+
80
+ # Validate that data folder exists
81
+ if not os.path.exists(self.data_folder):
82
+ raise FileNotFoundError(f"Data folder not found: {self.data_folder}")
83
+
84
+ os.makedirs(self.processed_data_folder, exist_ok=True)
85
+
86
+ # Get all folders in data_folder
87
+ try:
88
+ all_data_folders = [d1 for d1 in os.listdir(self.data_folder) if os.path.isdir(os.path.join(self.data_folder, d1))]
89
+ self.all_data_folders = sorted(all_data_folders, key=lambda x: int(x))
90
+ self.all_data_folders_idx = {x: idx for idx, x in enumerate(self.all_data_folders)}
91
+ except OSError as e:
92
+ if e.errno == errno.EACCES:
93
+ raise PermissionError(f"Permission denied accessing data folder: {self.data_folder}")
94
+ elif e.errno == errno.ENOENT:
95
+ raise FileNotFoundError(f"Data folder not found: {self.data_folder}")
96
+ else:
97
+ raise RuntimeError(f"OS error accessing data folder {self.data_folder}: {e}")
98
+ except ValueError as e:
99
+ raise ValueError(f"Invalid folder name format in {self.data_folder}. Folders should be numbered: {e}")
100
+
101
+ def _init_camera_parameters(self) -> None:
102
+ """Initialize camera intrinsics and extrinsics."""
103
+ # Get camera intrinsics and extrinsics
104
+ self.intrinsics_dict, self.intrinsics_matrix = self.get_intrinsics(self.cfg.camera_intrinsics)
105
+
106
+ # Use camera_extrinsics from config if available, otherwise determine from bimanual_setup
107
+ if hasattr(self.cfg, 'camera_extrinsics') and self.cfg.camera_extrinsics:
108
+ camera_extrinsics_path = self.cfg.camera_extrinsics
109
+ else:
110
+ camera_extrinsics_path = self._get_camera_extrinsics_path()
111
+
112
+ self.T_cam2robot, self.extrinsics = self.get_extrinsics(camera_extrinsics_path)
113
+
114
+ def _get_camera_extrinsics_path(self) -> str:
115
+ """Get the appropriate camera extrinsics path based on bimanual setup."""
116
+ if self.bimanual_setup == "shoulders":
117
+ return "camera/camera_extrinsics_ego_bimanual_shoulders.json"
118
+ elif self.bimanual_setup == "single_arm":
119
+ return "camera/camera_extrinsics.json"
120
+ else:
121
+ raise ValueError(f"Invalid bimanual setup: {self.bimanual_setup}. Must be 'single_arm' or 'shoulders'.")
122
+
123
+ def get_paths(self, data_path: str) -> Paths:
124
+ """
125
+ Get all file paths for a demo.
126
+
127
+ Args:
128
+ data_path: Path to the demo data
129
+
130
+ Returns:
131
+ Paths object containing all file paths
132
+ """
133
+ paths = Paths(
134
+ data_path=Path(data_path),
135
+ robot_name=self.robot
136
+ )
137
+ paths.ensure_directories_exist()
138
+ return paths
139
+
140
+ def get_save_folder(self, data_sub_folder: str) -> str:
141
+ data_sub_folder_fullpath = os.path.join(self.data_folder, str(data_sub_folder))
142
+ save_folder = os.path.join(self.processed_data_folder, str(data_sub_folder))
143
+ # Check existing dirs using os.scandir
144
+ with os.scandir(self.processed_data_folder) as it:
145
+ existing_dirs = {entry.name for entry in it if entry.is_dir()}
146
+ if str(data_sub_folder) not in existing_dirs:
147
+ shutil.copytree(data_sub_folder_fullpath, save_folder)
148
+ return save_folder
149
+
150
+ def process_one_demo(self, data_sub_folder: str):
151
+ raise NotImplementedError
152
+
153
+ def get_intrinsics(self, intrinsics_path: str) -> Tuple[dict, np.ndarray]:
154
+ intrinsics_matrix, intrinsics_dict = get_intrinsics_from_json(intrinsics_path)
155
+ if self.square:
156
+ intrinsics_dict, intrinsics_matrix = self.update_intrinsics_for_square_image(self.input_resolution,
157
+ intrinsics_dict,
158
+ intrinsics_matrix)
159
+ return intrinsics_dict, intrinsics_matrix
160
+
161
+ def get_extrinsics(self, extrinsics_path: str) -> Tuple[np.ndarray, dict]:
162
+ """Load and process camera extrinsics from JSON file.
163
+
164
+ Args:
165
+ extrinsics_path: Path to the extrinsics JSON file
166
+
167
+ Returns:
168
+ Tuple of (transformation_matrix, extrinsics_dict)
169
+
170
+ Raises:
171
+ FileNotFoundError: If extrinsics file doesn't exist
172
+ json.JSONDecodeError: If extrinsics file is invalid JSON
173
+ ValueError: If extrinsics data is invalid
174
+ """
175
+ if not os.path.exists(extrinsics_path):
176
+ raise FileNotFoundError(f"Camera extrinsics file not found: {extrinsics_path}")
177
+
178
+ try:
179
+ with open(extrinsics_path, "r") as f:
180
+ camera_extrinsics = json.load(f)
181
+ except json.JSONDecodeError as e:
182
+ raise ValueError(f"Invalid JSON in extrinsics file {extrinsics_path}: {str(e)}")
183
+
184
+ try:
185
+ T_cam2robot = get_transformation_matrix_from_extrinsics(camera_extrinsics)
186
+ except Exception as e:
187
+ raise ValueError(f"Failed to process extrinsics data from {extrinsics_path}: {str(e)}")
188
+
189
+ return T_cam2robot, camera_extrinsics
190
+
191
+ @staticmethod
192
+ def update_intrinsics_for_square_image(img_h: int, intrinsics_dict: dict,
193
+ intrinsics_matrix: np.ndarray) -> Tuple[dict, np.ndarray]:
194
+ """
195
+ Adjusts camera intrinsic parameters for a square image by modifying the principal point offset.
196
+
197
+ Args:
198
+ img_h (int): Height of the image (assumed to be square).
199
+ intrinsics_dict (dict): Dictionary of intrinsic parameters.
200
+ intrinsics_matrix (np.ndarray): Intrinsic matrix.
201
+
202
+ Returns:
203
+ Tuple[dict, np.ndarray]: Updated intrinsic parameters and matrix.
204
+ """
205
+ img_w = img_h * 16 // 9
206
+ offset = (img_w - img_h) // 2
207
+ intrinsics_dict["cx"] -= offset
208
+ intrinsics_matrix[0, 2] -= offset
209
+ return intrinsics_dict, intrinsics_matrix
phantom/phantom/processors/bbox_processor.py ADDED
@@ -0,0 +1,851 @@
1
+ """
2
+ Bounding Box Processor Module
3
+
4
+ This module provides video processing capabilities for detecting and tracking hand bounding boxes
5
+ in demonstration videos. It serves as the first stage in the hand processing pipeline, providing
6
+ spatial localization data for downstream pose estimation and segmentation tasks.
7
+
8
+ Key Features:
9
+ - Multiple hand detection methods (DINO, EPIC-KITCHENS integration)
10
+ - Bimanual hand tracking with left/right classification
11
+ - Temporal consistency through outlier filtering and interpolation
12
+ - Spatial constraint validation (edge detection, center positioning)
13
+ - Visualization and annotation generation
14
+
15
+ Processing Pipeline:
16
+ 1. Video loading and validation
17
+ 2. Frame-by-frame hand detection using configured detectors
18
+ 3. Bounding box classification (left/right) based on spatial positioning
19
+ 4. Temporal filtering to remove outliers and large jumps
20
+ 5. Gap interpolation for smooth trajectories
21
+ 6. Edge distance calculation for quality assessment
22
+ 7. Result visualization and storage
23
+
24
+ The processor supports multiple detection backends:
25
+ - DINO-based detection for general hand detection
26
+ - EPIC-KITCHENS pre-computed detections
27
+ - Configurable confidence thresholds and spatial constraints
28
+
29
+ Output Data:
30
+ - Hand detection flags per frame (boolean arrays)
31
+ - Bounding box coordinates [x1, y1, x2, y2] per frame
32
+ - Bounding box centers [x, y] per frame
33
+ - Distance metrics to image edges
34
+ - Annotated visualization videos
35
+ """
36
+
37
+ import os
38
+ import pickle
39
+ import logging
40
+ import numpy as np
41
+ import mediapy as media
42
+ import cv2
43
+ import itertools
44
+ import time
45
+ import matplotlib.pyplot as plt
46
+ from typing import List, Tuple, Optional, Any, Dict
47
+ from typing_extensions import Literal
48
+ import numpy.typing as npt
49
+ from omegaconf import DictConfig
50
+
51
+ from phantom.processors.base_processor import BaseProcessor
52
+ from phantom.processors.paths import Paths
53
+ from phantom.processors.phantom_data import hand_side_dict
54
+
55
+ from phantom.utils.bbox_utils import get_bbox_center, get_bbox_center_min_dist_to_edge
56
+
57
+ logger = logging.getLogger(__name__)
58
+
59
+ # Type aliases for better readability
60
+ DetectionResults = Dict[str, npt.NDArray]
61
+ BBoxArray = npt.NDArray[np.float32] # [x1, y1, x2, y2]
62
+ CenterArray = npt.NDArray[np.float32] # [x, y]
63
+ DetectionFlagArray = npt.NDArray[np.bool_]
64
+ HandSide = Literal["left", "right"]
65
+
66
+ class BBoxProcessor(BaseProcessor):
67
+ # Detection configuration constants
68
+ HAND_SIDE_MARGIN = 50 # Pixel margin for hand side classification tolerance
69
+ OVERLAP_THRESHOLD = 0.3 # Threshold for considering bboxes as overlapping
70
+ MAX_INTERPOLATION_GAP = 10 # Maximum frames to interpolate over
71
+ MAX_SPATIAL_JUMP = 200.0 # Maximum allowed pixel jump between detections
72
+ MAX_JUMP_LOOKAHEAD = 10 # Maximum consecutive distant points to filter
73
+ DINO_CONFIDENCE_THRESH = 0.2 # Default confidence threshold
74
+
75
+ # Visualization constants
76
+ LEFT_HAND_COLOR = (0, 0, 255) # BGR format - Red for left hand
77
+ RIGHT_HAND_COLOR = (0, 255, 0) # BGR format - Green for right hand
78
+ BBOX_THICKNESS = 2 # Thickness of bounding box lines
79
+
80
+ """
81
+ Bounding box detection and tracking processor for hand localization in videos.
82
+
83
+ This processor serves as the foundation of the hand processing pipeline by detecting
84
+ and tracking hand bounding boxes across video frames. It handles both single-arm
85
+ and bimanual setups.
86
+
87
+ The processor employs multiple strategies for reliable detection:
88
+ - Primary detection using DINO or pre-computed EPIC data
89
+ - Spatial reasoning for left/right hand classification
90
+ - Temporal filtering to maintain trajectory consistency
91
+ - Gap interpolation for handling missing detections
92
+ - Quality assessment through edge distance metrics
93
+
94
+ Attributes:
95
+ H (int): Video frame height (set during processing)
96
+ W (int): Video frame width (set during processing)
97
+ center (int): Horizontal center of the frame for left/right classification
98
+ margin (int): Pixel margin for hand side classification tolerance
99
+ confidence_threshold (float): Minimum confidence for valid detections
100
+ dino_detector: DINO-based hand detector (if not using EPIC data)
101
+ filtered_hand_detection_data (dict): Processed EPIC detection data
102
+ sorted_keys (list): Sorted frame indices for EPIC data processing
103
+ """
104
+ def __init__(self, cfg: DictConfig) -> None:
105
+ """
106
+ Initialize the bounding box processor with configuration parameters.
107
+
108
+ Args:
109
+ cfg: Hydra configuration object containing processing configuration
110
+ including confidence thresholds, target hands, and dataset type
111
+ """
112
+ super().__init__(cfg)
113
+ # Image dimensions (set when processing video)
114
+ self.H: int = 0
115
+ self.W: int = 0
116
+
117
+ # Initialize detection backend based on dataset type
118
+ if not self.epic:
119
+ from phantom.detectors.detector_dino import DetectorDino
120
+ self.dino_detector: DetectorDino = DetectorDino("IDEA-Research/grounding-dino-base")
121
+ else:
122
+ self.dino_detector: Optional["DetectorDino"] = None  # string annotation: DetectorDino is only imported in the non-EPIC branch
123
+
124
+ # EPIC-specific attributes
125
+ self.filtered_hand_detection_data: Dict[str, List[Any]] = {}
126
+ self.sorted_keys: List[str] = []
127
+
128
+ # ============================================================================
129
+ # COMMON/SHARED METHODS (Used by both Phantom and EPIC modes)
130
+ # ============================================================================
131
+
132
+ def process_one_demo(self, data_sub_folder: str) -> None:
133
+ """
134
+ Process a single demonstration video to extract hand bounding boxes.
135
+
136
+ Args:
137
+ data_sub_folder: Path to the demonstration data folder containing the video
138
+ and any pre-computed hand detection data.
139
+
140
+ The method performs the following steps:
141
+ 1. Loads and validates input video and detection data
142
+ 2. Processes each frame to detect and classify hand positions
143
+ 3. Applies post-processing filters for temporal consistency
144
+ 4. Generates quality metrics and visualizations
145
+ 5. Saves all results in standardized format
146
+
147
+ Raises:
148
+ FileNotFoundError: If required input files (video, detection data) are not found
149
+ ValueError: If video frames or hand detection data are invalid
150
+ """
151
+ # Setup and validation
152
+ save_folder = self.get_save_folder(data_sub_folder)
153
+
154
+ paths = self.get_paths(save_folder)
155
+
156
+ # Load and validate input data
157
+ imgs_rgb = self._load_video(paths)
158
+
159
+ # Process frames based on dataset type
160
+ if self.epic:
161
+ self._load_epic_hand_data(paths)
162
+ detection_results = self._process_epic_frames(imgs_rgb)
163
+ else:
164
+ detection_results = self._process_frames(imgs_rgb)
165
+
166
+ # Post-process results for temporal consistency
167
+ processed_results = self._post_process_detections(detection_results)
168
+
169
+ # Generate visualization for quality assessment
170
+ visualization_results = self._generate_visualization(imgs_rgb, processed_results)
171
+
172
+ # Save all results to disk
173
+ self._save_results(paths, processed_results, visualization_results)
174
+
175
+
176
+ def _load_video(self, paths: Paths) -> np.ndarray:
177
+ """
178
+ Load and validate video data from the specified path.
179
+
180
+ Args:
181
+ paths: Paths object containing video file locations
182
+
183
+ Returns:
184
+ RGB video frames as array
185
+
186
+ Raises:
187
+ FileNotFoundError: If video file doesn't exist
188
+ ValueError: If video is empty or corrupted
189
+ """
190
+ if not os.path.exists(paths.video_left):
191
+ raise FileNotFoundError(f"Video file not found: {paths.video_left}")
192
+
193
+ imgs_rgb = media.read_video(paths.video_left)
194
+ if len(imgs_rgb) == 0:
195
+ raise ValueError("Empty video file")
196
+
197
+ # Store video dimensions for coordinate calculations
198
+ self.H, self.W, _ = imgs_rgb[0].shape
199
+ self.center: int = self.W // 2 # Center line for left/right classification
200
+ return imgs_rgb
201
+
202
+ # ============================================================================
203
+ # PHANTOM-SPECIFIC METHODS (DINO Detection)
204
+ # ============================================================================
205
+ def _process_frames(self, imgs_rgb: np.ndarray) -> Dict[str, np.ndarray]:
206
+ """
207
+ Process RGB frames using DINO detector for hand detection and classification.
208
+
209
+ This method handles the core detection pipeline for non-EPIC datasets,
210
+ using DINO for hand detection and implementing spatial reasoning for
211
+ left/right classification.
212
+
213
+ Args:
214
+ imgs_rgb: Array of RGB images with shape (num_frames, height, width, 3)
215
+
216
+ Returns:
217
+ Dictionary containing:
218
+ - left/right_hand_detected: Boolean arrays indicating hand detection per frame
219
+ - left/right_bboxes: Bounding box coordinates [x1,y1,x2,y2] per frame
220
+ - left/right_bboxes_ctr: Bounding box centers [x,y] per frame
221
+ """
222
+ num_frames = len(imgs_rgb)
223
+
224
+ detection_arrays = self._initialize_detection_arrays(num_frames)
225
+
226
+ for idx in range(num_frames):
227
+ try:
228
+ # Run DINO detection on current frame
229
+ bboxes, scores = self.dino_detector.get_bboxes(imgs_rgb[idx], "a hand", threshold=self.DINO_CONFIDENCE_THRESH, visualize=False)
230
+ if len(bboxes) == 0:
231
+ continue
232
+
233
+ bboxes = np.array(bboxes)
234
+ scores = np.array(scores)
235
+
236
+ # Process detections for current frame
237
+ self._process_frame_detections(idx, bboxes, scores, detection_arrays)
238
+ except Exception as e:
239
+ logger.warning(f"Frame {idx} processing failed: {str(e)}")
240
+ continue
241
+
242
+ return {
243
+ 'left_hand_detected': detection_arrays['left_hand_detected'],
244
+ 'right_hand_detected': detection_arrays['right_hand_detected'],
245
+ 'left_bboxes': detection_arrays['left_bboxes'],
246
+ 'right_bboxes': detection_arrays['right_bboxes'],
247
+ 'left_bboxes_ctr': detection_arrays['left_bboxes_ctr'],
248
+ 'right_bboxes_ctr': detection_arrays['right_bboxes_ctr'],
249
+ }
250
+
251
+ def _initialize_detection_arrays(self, num_frames: int) -> Dict[str, npt.NDArray]:
252
+ """
253
+ Initialize arrays for storing detection results.
254
+
255
+ Args:
256
+ num_frames: Number of frames in the video
257
+
258
+ Returns:
259
+ Dictionary containing pre-allocated arrays for left/right hand detections,
260
+ bounding boxes, centers, and detection flags
261
+ """
262
+ return {
263
+ 'left_bboxes': np.zeros((num_frames, 4)),
264
+ 'right_bboxes': np.zeros((num_frames, 4)),
265
+ 'left_bboxes_ctr': np.zeros((num_frames, 2)),
266
+ 'right_bboxes_ctr': np.zeros((num_frames, 2)),
267
+ 'left_hand_detected': np.zeros(num_frames, dtype=bool),
268
+ 'right_hand_detected': np.zeros(num_frames, dtype=bool)
269
+ }
270
+
271
+ def _process_frame_detections(self, idx: int, bboxes: npt.NDArray, scores: npt.NDArray,
272
+ detection_arrays: Dict[str, npt.NDArray]) -> None:
273
+ """
274
+ Process detections for a single frame.
275
+
276
+ Args:
277
+ idx: Frame index
278
+ bboxes: Array of detected bounding boxes
279
+ scores: Array of detection confidence scores
280
+ detection_arrays: Dictionary to store detection results
281
+ """
282
+ if len(bboxes) == 0:
283
+ return
284
+
285
+ # Always select the bounding box with the highest score
286
+ best_idx = np.argmax(scores)
287
+ best_bbox = bboxes[best_idx]
288
+ best_bbox_ctr = get_bbox_center(best_bbox)
289
+
290
+ # Assign hand type directly based on self.target_hand
291
+ if self.target_hand == "left":
292
+ detection_arrays['left_bboxes'][idx] = best_bbox
293
+ detection_arrays['left_bboxes_ctr'][idx] = best_bbox_ctr
294
+ detection_arrays['left_hand_detected'][idx] = True
295
+ elif self.target_hand == "right":
296
+ detection_arrays['right_bboxes'][idx] = best_bbox
297
+ detection_arrays['right_bboxes_ctr'][idx] = best_bbox_ctr
298
+ detection_arrays['right_hand_detected'][idx] = True
299
+
300
+
301
+ # ============================================================================
302
+ # EPIC-SPECIFIC METHODS (EPIC Dataset Processing)
303
+ # ============================================================================
304
+
305
+ def _validate_epic_data_structure(self, epic_data: List[Any]) -> bool:
306
+ """Validate EPIC data structure before processing."""
307
+ if not epic_data:
308
+ return False
309
+
310
+ # Check if first item has required attributes
311
+ try:
312
+ first_item = epic_data[0]
313
+ if not hasattr(first_item, 'side') or not hasattr(first_item, 'bbox'):
314
+ logging.warning("EPIC data missing required attributes: 'side' or 'bbox'")
315
+ return False
316
+
317
+ # Check if bbox has required attributes
318
+ bbox = first_item.bbox
319
+ required_attrs = ['left', 'right', 'top', 'bottom']
320
+ if not all(hasattr(bbox, attr) for attr in required_attrs):
321
+ logging.warning("EPIC bbox missing required attributes: left, right, top, bottom")
322
+ return False
323
+
324
+ return True
325
+ except Exception as e:
326
+ logging.warning(f"Error validating EPIC data structure: {str(e)}")
327
+ return False
328
+
329
+ def _load_epic_hand_data(self, paths: Paths) -> Dict[str, Any]:
330
+ """
331
+ Load and validate pre-computed hand detection data from EPIC-KITCHENS dataset.
332
+
333
+ EPIC-KITCHENS provides pre-computed hand detection annotations that we can
334
+ use directly instead of running our own detection. This method filters and
335
+ sorts the data for efficient frame-by-frame processing.
336
+
337
+ Args:
338
+ paths: Paths object containing detection data file location
339
+
340
+ Returns:
341
+ Dictionary of filtered and sorted hand detection data
342
+
343
+ Raises:
344
+ FileNotFoundError: If detection data file doesn't exist
345
+ """
346
+ if not os.path.exists(paths.hand_detection_data):
347
+ raise FileNotFoundError(f"Hand detection data not found: {paths.hand_detection_data}")
348
+
349
+ with open(paths.hand_detection_data, 'rb') as f:
350
+ hand_detection_data = dict(pickle.load(f))
351
+
352
+ # Filter out detection objects without valid side information
353
+ filtered_data = {
354
+ key: [obj for obj in obj_list if hasattr(obj, 'side')]
355
+ for key, obj_list in hand_detection_data.items()
356
+ }
357
+
358
+ # Sort by frame index for sequential processing
359
+ self.filtered_hand_detection_data = dict(sorted(filtered_data.items(), key=lambda x: int(x[0])))
360
+ self.sorted_keys = sorted(self.filtered_hand_detection_data.keys(), key=lambda k: int(k))
361
+
362
+ return self.filtered_hand_detection_data
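The numeric sort above matters because the pickle keys are frame indices stored as strings, and a plain lexicographic sort would shuffle their order. A tiny illustration (the keys here are made up):

frame_keys = ["2", "10", "1"]
print(sorted(frame_keys))                        # ['1', '10', '2']  lexicographic, wrong order
print(sorted(frame_keys, key=lambda k: int(k)))  # ['1', '2', '10']  numeric, as used above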
363
+
364
+ def _process_epic_frames(self, imgs_rgb: npt.NDArray[np.uint8]) -> DetectionResults:
365
+ """
366
+ Process frames using pre-computed EPIC-KITCHENS hand detection data.
367
+
368
+ This method processes EPIC-KITCHENS dataset videos using their provided
369
+ hand detection annotations, converting them to our standard format while
370
+ applying spatial validation constraints.
371
+
372
+ Args:
373
+ imgs_rgb: Array of RGB images for dimension reference
374
+
375
+ Returns:
376
+ Dictionary containing detection results in the same format as _process_frames
377
+ """
378
+ num_frames = len(imgs_rgb)
379
+
380
+ detection_arrays = self._initialize_detection_arrays(num_frames)
381
+
382
+ # Process each frame using EPIC detection data
383
+ for idx in range(num_frames):
384
+ try:
385
+ epic_data = self.filtered_hand_detection_data[self.sorted_keys[idx]]
386
+
387
+ if len(epic_data) == 0:
388
+ continue
389
+
390
+ # Process frame detections
391
+ self._process_epic_frame_detections(idx, epic_data, detection_arrays)
392
+ except KeyError:
393
+ logger.warning(f"Missing EPIC data for frame {idx}")
394
+ continue
395
+ except Exception as e:
396
+ logger.warning(f"EPIC frame {idx} processing failed: {str(e)}")
397
+ continue
398
+
399
+ return {
400
+ 'left_hand_detected': detection_arrays['left_hand_detected'],
401
+ 'right_hand_detected': detection_arrays['right_hand_detected'],
402
+ 'left_bboxes': detection_arrays['left_bboxes'],
403
+ 'right_bboxes': detection_arrays['right_bboxes'],
404
+ 'left_bboxes_ctr': detection_arrays['left_bboxes_ctr'],
405
+ 'right_bboxes_ctr': detection_arrays['right_bboxes_ctr']
406
+ }
407
+
408
+ def _process_epic_frame_detections(self, idx: int, epic_data: List[Any],
409
+ detection_arrays: Dict[str, npt.NDArray]) -> None:
410
+ """Process EPIC detections for a single frame."""
411
+ # Process left and right hands separately
412
+ left_detected, left_bbox, left_bbox_ctr = self._process_epic_hand_detection(epic_data, "left")
413
+ right_detected, right_bbox, right_bbox_ctr = self._process_epic_hand_detection(epic_data, "right")
414
+
415
+ # Store results in pre-allocated arrays
416
+ detection_arrays['left_hand_detected'][idx] = left_detected
417
+ detection_arrays['right_hand_detected'][idx] = right_detected
418
+ if left_detected:
419
+ detection_arrays['left_bboxes'][idx] = left_bbox
420
+ detection_arrays['left_bboxes_ctr'][idx] = left_bbox_ctr
421
+ if right_detected:
422
+ detection_arrays['right_bboxes'][idx] = right_bbox
423
+ detection_arrays['right_bboxes_ctr'][idx] = right_bbox_ctr
424
+
425
+ # Quality check: If hands appear crossed (left hand on right side),
426
+ # mark both as invalid to avoid confusion
427
+ if left_detected and right_detected:
428
+ self._validate_hand_positions(idx, left_bbox_ctr, right_bbox_ctr, detection_arrays)
429
+
430
+ def _validate_hand_positions(self, idx: int, left_bbox_ctr: npt.NDArray, right_bbox_ctr: npt.NDArray,
431
+ detection_arrays: Dict[str, npt.NDArray]) -> None:
432
+ """Validate that hands are on correct sides of the image."""
433
+ if left_bbox_ctr[0] > right_bbox_ctr[0]:
434
+ # Left hand appears to be on the right side - mark both as invalid
435
+ detection_arrays['left_hand_detected'][idx] = False
436
+ detection_arrays['right_hand_detected'][idx] = False
437
+
438
+ def _process_epic_hand_detection(self,
439
+ epic_data: List[Any],
440
+ hand_side: HandSide) -> Tuple[bool, BBoxArray, CenterArray]:
441
+ """
442
+ Process EPIC hand detection data for a single frame and hand side.
443
+
444
+ This method extracts and validates hand detection data from EPIC annotations,
445
+ converting normalized coordinates to pixel coordinates and applying spatial
446
+ validation constraints.
447
+
448
+ Args:
449
+ epic_data: List of detection objects for the current frame
450
+ hand_side: Either "left" or "right" specifying which hand to process
451
+
452
+ Returns:
453
+ Tuple of (is_detected: bool, bbox: ndarray, bbox_center: ndarray)
454
+ """
455
+ if hand_side not in hand_side_dict:
456
+ raise ValueError(f"Invalid hand side: {hand_side}")
457
+
458
+ # Default empty result for failed detections
459
+ empty_result = (False, np.array([0, 0, 0, 0]), np.array([0, 0]))
460
+
461
+ try:
462
+ # Filter and validate detection data
463
+ hand_data = self._filter_epic_hand_data(epic_data, hand_side)
464
+ if not hand_data:
465
+ return empty_result
466
+
467
+ # Validate data structure
468
+ if not self._validate_epic_data_structure(hand_data):
469
+ return empty_result
470
+
471
+ # Extract and process bounding box
472
+ bbox, bbox_center = self._extract_epic_bbox(hand_data[0])
473
+
474
+ # Validate bounding box coordinates
475
+ if not self._validate_bbox_coordinates(hand_data[0].bbox, hand_side):
476
+ return empty_result
477
+
478
+ # Apply spatial validation
479
+ is_valid = self._validate_spatial_position(bbox_center, hand_side)
480
+ return (is_valid, bbox, bbox_center) if is_valid else empty_result
481
+
482
+ except Exception as e:
483
+ logging.warning(f"Unexpected error processing {hand_side} hand detection: {str(e)}")
484
+ return empty_result
485
+
486
+ def _filter_epic_hand_data(self, epic_data: List[Any], hand_side: HandSide) -> List[Any]:
487
+ """Filter EPIC detection data for the specified hand side."""
488
+ return [data for data in epic_data if data.side.value == hand_side_dict[hand_side]]
489
+
490
+ def _extract_epic_bbox(self, hand_data: Any) -> Tuple[BBoxArray, CenterArray]:
491
+ """Extract bounding box and center from EPIC hand detection data."""
492
+ bbox_cls = hand_data.bbox
493
+
494
+ # Convert normalized coordinates to pixel coordinates
495
+ bbox = np.array([
496
+ bbox_cls.left * self.W,
497
+ bbox_cls.top * self.H,
498
+ bbox_cls.right * self.W,
499
+ bbox_cls.bottom * self.H
500
+ ])
501
+
502
+ # Calculate center point for spatial validation
503
+ bbox_center = np.array([
504
+ (bbox[0] + bbox[2]) / 2,
505
+ (bbox[1] + bbox[3]) / 2
506
+ ]).astype(np.int32)
507
+
508
+ return bbox, bbox_center
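A short worked example of the conversion above, assuming a hypothetical 456x256 frame (EPIC-style detections store coordinates normalized to [0, 1]):

import numpy as np

W, H = 456, 256                                      # hypothetical frame size
left, top, right, bottom = 0.25, 0.50, 0.50, 0.75    # normalized bbox
bbox = np.array([left * W, top * H, right * W, bottom * H])
center = np.array([(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]).astype(np.int32)
print(bbox)    # [114. 128. 228. 192.]
print(center)  # [171 160]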
509
+
510
+ def _validate_spatial_position(self, bbox_center: CenterArray, hand_side: HandSide) -> bool:
511
+ """Validate that hand center is on correct side of image."""
512
+ if hand_side == "left":
513
+ return bbox_center[0] <= (self.center + self.HAND_SIDE_MARGIN)
514
+ else: # right
515
+ return bbox_center[0] >= (self.center - self.HAND_SIDE_MARGIN)
516
+
517
+ def _validate_bbox_coordinates(self, bbox_cls: Any, hand_side: HandSide) -> bool:
518
+ """Validate bounding box coordinates are within valid range [0,1]."""
519
+ if not (0 <= bbox_cls.left <= 1 and 0 <= bbox_cls.right <= 1 and
520
+ 0 <= bbox_cls.top <= 1 and 0 <= bbox_cls.bottom <= 1):
521
+ logging.warning(f"Invalid bbox coordinates detected for {hand_side} hand: "
522
+ f"left={bbox_cls.left:.3f}, right={bbox_cls.right:.3f}, "
523
+ f"top={bbox_cls.top:.3f}, bottom={bbox_cls.bottom:.3f}")
524
+ return False
525
+ return True
526
+
527
+
528
+ # ============================================================================
529
+ # UTILITY/HELPER METHODS (General utilities and post-processing)
530
+ # ============================================================================
531
+
532
+
533
+ def _post_process_detections(self, detection_results: DetectionResults) -> DetectionResults:
534
+ """
535
+ Apply post-processing to improve detection temporal consistency.
536
+
537
+ This method applies several filters and enhancements to the raw detection
538
+ results to improve their quality and temporal coherence:
539
+ 1. Filter out large spatial jumps that indicate tracking errors
540
+ 2. Interpolate short gaps in detection sequences
541
+ 3. Calculate quality metrics (distance to image edges)
542
+
543
+ Args:
544
+ detection_results: Raw detection results from frame processing
545
+
546
+ Returns:
547
+ Enhanced detection results with improved temporal consistency
548
+ """
549
+ # Filter out large jumps for both hands
550
+ left_results = self._filter_large_jumps(
551
+ detection_results['left_hand_detected'],
552
+ detection_results['left_bboxes'],
553
+ detection_results['left_bboxes_ctr'],
554
+ max_jump=self.MAX_SPATIAL_JUMP,
555
+ lookahead=self.MAX_JUMP_LOOKAHEAD
556
+ )
557
+ right_results = self._filter_large_jumps(
558
+ detection_results['right_hand_detected'],
559
+ detection_results['right_bboxes'],
560
+ detection_results['right_bboxes_ctr'],
561
+ max_jump=self.MAX_SPATIAL_JUMP,
562
+ lookahead=self.MAX_JUMP_LOOKAHEAD
563
+ )
564
+
565
+ # Interpolate missing detections for smooth trajectories
566
+ left_results = self._interpolate_detections(*left_results, max_gap=self.MAX_INTERPOLATION_GAP)
567
+ right_results = self._interpolate_detections(*right_results, max_gap=self.MAX_INTERPOLATION_GAP)
568
+
569
+ # Calculate quality metrics: minimum distance from bbox center to image edges
570
+ left_bbox_min_dist = get_bbox_center_min_dist_to_edge(left_results[1], self.W, self.H)
571
+ right_bbox_min_dist = get_bbox_center_min_dist_to_edge(right_results[1], self.W, self.H)
572
+
573
+ return {
574
+ 'left_hand_detected': left_results[0],
575
+ 'right_hand_detected': right_results[0],
576
+ 'left_bboxes': left_results[1],
577
+ 'right_bboxes': right_results[1],
578
+ 'left_bboxes_ctr': left_results[2],
579
+ 'right_bboxes_ctr': right_results[2],
580
+ 'left_bbox_min_dist_to_edge': left_bbox_min_dist,
581
+ 'right_bbox_min_dist_to_edge': right_bbox_min_dist
582
+ }
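For reference, a sketch of the dictionary this method consumes, with synthetic shapes only (nothing here is a real detection); the returned dictionary carries the same six keys plus the two '*_bbox_min_dist_to_edge' quality arrays of shape (num_frames,):

import numpy as np

num_frames = 5
raw_results = {
    'left_hand_detected':  np.zeros(num_frames, dtype=bool),
    'right_hand_detected': np.ones(num_frames, dtype=bool),
    'left_bboxes':   np.zeros((num_frames, 4)),
    'right_bboxes':  np.tile([100., 100., 200., 220.], (num_frames, 1)),
    'left_bboxes_ctr':  np.zeros((num_frames, 2)),
    'right_bboxes_ctr': np.tile([150., 160.], (num_frames, 1)),
}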
583
+
584
+ def _generate_visualization(self, imgs_rgb: np.ndarray, results: Dict[str, np.ndarray]) -> List[np.ndarray]:
585
+ """
586
+ Generate visualization of detection results for quality assessment.
587
+
588
+ Creates annotated frames showing detected bounding boxes for visual
589
+ inspection of detection quality and temporal consistency.
590
+
591
+ Args:
592
+ imgs_rgb: Original RGB video frames
593
+ results: Processed detection results
594
+
595
+ Returns:
596
+ List of annotated images with bounding boxes drawn
597
+ """
598
+ list_img_annot = []
599
+ for idx in range(len(imgs_rgb)):
600
+ left_bbox = None
601
+ right_bbox = None
602
+
603
+ # Prepare bounding boxes for visualization
604
+ if results['left_hand_detected'][idx] or results['right_hand_detected'][idx]:
605
+ left_bbox = results['left_bboxes'][idx] if results['left_hand_detected'][idx] else None
606
+ right_bbox = results['right_bboxes'][idx] if results['right_hand_detected'][idx] else None
607
+
608
+ # Generate annotated image
609
+ img_annot = self.visualize_detections(imgs_rgb[idx], left_bbox, right_bbox, show_image=False)
610
+ list_img_annot.append(img_annot)
611
+ return list_img_annot
612
+
613
+ def _save_results(self, paths: Paths, results: DetectionResults, visualization_results: List[npt.NDArray[np.uint8]]) -> None:
614
+ """
615
+ Save all processed results to disk in standardized format.
616
+
617
+ Args:
618
+ paths: Paths object containing output file locations
619
+ results: Processed detection results
620
+ visualization_results: Generated visualization frames
621
+ """
622
+ # Create output directory if it doesn't exist
623
+ if not os.path.exists(paths.bbox_processor):
624
+ os.makedirs(paths.bbox_processor)
625
+
626
+ # Save detection data in compressed NumPy format
627
+ np.savez(paths.bbox_data, **results)
628
+
629
+ # Save visualization video with lossless compression
630
+ media.write_video(paths.video_bboxes, visualization_results, fps=15, codec="ffv1")
631
+
632
+ def _interpolate_detections(self, detected: DetectionFlagArray,
633
+ bboxes: BBoxArray,
634
+ centers: CenterArray,
635
+ max_gap: int = 10) -> Tuple[DetectionFlagArray, BBoxArray, CenterArray]:
636
+ """
637
+ Interpolate bounding boxes and detection status for short gaps in tracking.
638
+
639
+ This method fills in missing detections using linear interpolation when the
640
+ gap is small enough to reasonably assume continuous hand motion. This helps
641
+ create smoother trajectories for downstream processing.
642
+
643
+ Args:
644
+ detected: Boolean array of detection status per frame
645
+ bboxes: Array of bounding boxes [N, 4] format [x1, y1, x2, y2]
646
+ centers: Array of bbox centers [N, 2] format [x, y]
647
+ max_gap: Maximum gap size (in frames) to interpolate over
648
+
649
+ Returns:
650
+ Tuple of (interpolated detection status, interpolated bboxes, interpolated centers)
651
+ """
652
+ detected = detected.copy()
653
+ bboxes = bboxes.copy()
654
+ centers = centers.copy()
655
+
656
+ # Handle single-frame gaps first (most common case)
657
+ for i in range(1, len(detected) - 1):
658
+ if not detected[i] and detected[i-1] and detected[i+1]:
659
+ # Get valid bboxes/centers before and after gap
660
+ start_bbox = bboxes[i-1]
661
+ end_bbox = bboxes[i+1]
662
+ start_center = centers[i-1]
663
+ end_center = centers[i+1]
664
+
665
+ # Linear interpolation with t = 0.5 for single frame
666
+ interpolated_bbox = 0.5 * (start_bbox + end_bbox)
667
+ interpolated_center = 0.5 * (start_center + end_center)
668
+
669
+ # Validate interpolated values are reasonable
670
+ if self._is_valid_bbox(interpolated_bbox) and self._is_valid_center(interpolated_center):
671
+ bboxes[i] = interpolated_bbox
672
+ centers[i] = interpolated_center
673
+ detected[i] = True
674
+
675
+ # Handle multi-frame gaps
676
+ non_detect_start = None
677
+ for i in range(1, len(detected) - 1):
678
+ # Start of non-detection sequence
679
+ if detected[i-1] and not detected[i]:
680
+ non_detect_start = i
681
+ # End of non-detection sequence
682
+ elif non_detect_start is not None and not detected[i] and detected[i+1]:
683
+ non_detect_end = i
684
+ gap_size = non_detect_end - non_detect_start + 1
685
+
686
+ # Only interpolate if gap is small enough and has valid detections on both sides
687
+ if gap_size <= max_gap:
688
+ # Get valid bboxes/centers before and after gap
689
+ start_bbox = bboxes[non_detect_start - 1]
690
+ end_bbox = bboxes[non_detect_end + 1]
691
+ start_center = centers[non_detect_start - 1]
692
+ end_center = centers[non_detect_end + 1]
693
+
694
+ # Generate interpolation steps
695
+ steps = gap_size + 1
696
+ for j in range(gap_size):
697
+ t = (j + 1) / steps # Interpolation factor
698
+
699
+ # Linear interpolation of bbox coordinates
700
+ bboxes[non_detect_start + j] = (1 - t) * start_bbox + t * end_bbox
701
+
702
+ # Linear interpolation of center coordinates
703
+ centers[non_detect_start + j] = (1 - t) * start_center + t * end_center
704
+
705
+ # Mark as detected
706
+ detected[non_detect_start + j] = True
707
+
708
+ non_detect_start = None
709
+
710
+ return detected, bboxes, centers
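A compact numeric check of the gap interpolation, written as standalone NumPy rather than a call into the class (same linear formula as above):

import numpy as np

# Detected at frames 0 and 3; frames 1-2 form a gap of size 2.
start_center, end_center = np.array([100.0, 50.0]), np.array([130.0, 80.0])
gap_size = 2
steps = gap_size + 1
for j in range(gap_size):
    t = (j + 1) / steps
    print((1 - t) * start_center + t * end_center)
# [110.  60.]
# [120.  70.]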
711
+
712
+ def _is_valid_bbox(self, bbox: BBoxArray) -> bool:
713
+ """Validate that bbox coordinates are reasonable."""
714
+ if bbox is None or len(bbox) != 4:
715
+ return False
716
+ # Check for reasonable bounds (not negative, not too large)
717
+ return (bbox >= 0).all() and (bbox[:2] < bbox[2:]).all() and bbox.max() < max(self.W, self.H) * 2
718
+
719
+ def _is_valid_center(self, center: CenterArray) -> bool:
720
+ """Validate that center coordinates are reasonable."""
721
+ if center is None or len(center) != 2:
722
+ return False
723
+ # Check for reasonable bounds
724
+ return (center >= 0).all() and center[0] < self.W * 2 and center[1] < self.H * 2
725
+
726
+ def visualize_detections(self, img: npt.NDArray[np.uint8],
727
+ left_bbox: Optional[npt.NDArray[np.float32]] = None,
728
+ right_bbox: Optional[npt.NDArray[np.float32]] = None,
729
+ show_image: bool = True) -> npt.NDArray[np.uint8]:
730
+ """
731
+ Visualize hand detections by drawing bounding boxes on the image.
732
+
733
+ This method creates annotated images showing detected hand locations with
734
+ color-coded bounding boxes (red for left hand, green for right hand).
735
+
736
+ Args:
737
+ img: Input RGB image to annotate
738
+ left_bbox: Left hand bounding box [x1, y1, x2, y2] or None if not detected
739
+ right_bbox: Right hand bounding box [x1, y1, x2, y2] or None if not detected
740
+ show_image: Whether to display the image using cv2.imshow
741
+
742
+ Returns:
743
+ The annotated image
744
+ """
745
+ # Work directly with the input image (assumed to be in BGR format)
746
+ img_bgr = img
747
+
748
+ # Draw left hand bounding box in red
749
+ if left_bbox is not None and not np.array_equal(left_bbox, np.array([0, 0, 0, 0])):
750
+ cv2.rectangle(
751
+ img_bgr,
752
+ (int(left_bbox[0]), int(left_bbox[1])),
753
+ (int(left_bbox[2]), int(left_bbox[3])),
754
+ self.LEFT_HAND_COLOR,
755
+ self.BBOX_THICKNESS
756
+ )
757
+
758
+ # Draw right hand bounding box in green
759
+ if right_bbox is not None and not np.array_equal(right_bbox, np.array([0, 0, 0, 0])):
760
+ cv2.rectangle(
761
+ img_bgr,
762
+ (int(right_bbox[0]), int(right_bbox[1])),
763
+ (int(right_bbox[2]), int(right_bbox[3])),
764
+ self.RIGHT_HAND_COLOR,
765
+ self.BBOX_THICKNESS
766
+ )
767
+
768
+ # Optionally display the image for debugging
769
+ if show_image:
770
+ cv2.imshow("Hand Detections", img_bgr)
771
+ cv2.waitKey(0)
772
+ cv2.destroyAllWindows()
773
+
774
+ return img_bgr
775
+
776
+ @staticmethod
777
+ def _filter_large_jumps(detected: DetectionFlagArray,
778
+ bboxes: BBoxArray,
779
+ centers: CenterArray,
780
+ max_jump: float = 200.0,
781
+ lookahead: int = 10) -> Tuple[DetectionFlagArray, BBoxArray, CenterArray]:
782
+ """
783
+ Filter out small groups of detections that are spatially inconsistent with the trajectory.
784
+
785
+ This method identifies and removes isolated detections that are far from the
786
+ expected trajectory, which usually indicate false positives or tracking errors.
787
+ It helps maintain temporal consistency in hand tracking.
788
+
789
+ Args:
790
+ detected: Boolean array of detection status per frame
791
+ bboxes: Array of bounding boxes [N, 4] format [x1, y1, x2, y2]
792
+ centers: Array of bbox centers [N, 2] format [x, y]
793
+ max_jump: Maximum allowed distance (in pixels) between consecutive detections
794
+ lookahead: Maximum number of consecutive distant points to filter as a group
795
+
796
+ Returns:
797
+ Tuple of (filtered detection status, filtered bboxes, filtered centers)
798
+ """
799
+ detected = detected.copy()
800
+ bboxes = bboxes.copy()
801
+ centers = centers.copy()
802
+
803
+ # Templates for clearing invalid detections
804
+ empty_bbox = np.array([0, 0, 0, 0])
805
+ empty_center = np.array([0, 0])
806
+
807
+ i = 0
808
+ while i < len(detected):
809
+ # Compare the current center against the immediately following frame's center
810
+ next_valid = i + 1
811
+
812
+ if next_valid >= len(detected):
813
+ break
814
+
815
+ # Calculate spatial distance to next detection
816
+ dist = np.linalg.norm(centers[next_valid] - centers[i])
817
+
818
+ if dist > max_jump:
819
+ # Large jump detected - check if it's part of a small group of outliers
820
+ distant_points = []
821
+ ref_center = centers[i] # Use current point as reference
822
+
823
+ # Look ahead to find consecutive distant points
824
+ for j in range(next_valid, len(detected)):
825
+ curr_dist = np.linalg.norm(centers[j] - ref_center)
826
+ if curr_dist > max_jump:
827
+ distant_points.append(j)
828
+ else:
829
+ break
830
+
831
+ # If we found a small group of distant points, filter them out
832
+ if len(distant_points) > 0 and len(distant_points) <= lookahead:
833
+ for idx in distant_points:
834
+ detected[idx] = False
835
+ bboxes[idx] = empty_bbox
836
+ centers[idx] = empty_center
837
+ logging.warning(f"Filtered out frame {idx} as part of small distant group")
838
+
839
+ i = next_valid
840
+
841
+ return detected, bboxes, centers
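A small synthetic trace showing what the jump filter targets: a short burst of far-away centers is discarded while the stable trajectory is kept (illustrative values only):

import numpy as np

centers = np.array([[100, 100], [105, 100], [505, 100], [510, 100], [110, 100]], dtype=float)
jumps = np.linalg.norm(np.diff(centers, axis=0), axis=1)
print(jumps)   # [  5. 400.   5. 400.]
# With max_jump=200, frames 2-3 sit ~400 px away from frame 1, form a distant group of
# size 2 (<= lookahead), and would be cleared; frame 4 then resumes the original track.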
842
+
843
+
844
+
845
+
846
+
847
+
848
+
849
+
850
+
851
+
phantom/phantom/processors/hand_processor.py ADDED
@@ -0,0 +1,675 @@
1
+ """
2
+ Hand Processor Module
3
+
4
+ This module converts detected hand bounding boxes into detailed 3D hand poses using
5
+ state-of-the-art pose estimation models, with optional depth-based refinement for improved accuracy.
6
+
7
+ Processing Pipeline:
8
+ 1. Load video frames and bounding box data from previous stage
9
+ 2. Apply HaMeR pose estimation within detected bounding boxes
10
+ 3. Filter poses based on edge proximity and quality metrics
11
+ 4. Optionally refine 3D poses using depth data and segmentation
12
+ 5. Generate hand mesh models and extract keypoint trajectories
13
+ 6. Save processed hand sequences for downstream tasks
14
+
15
+ The module supports multiple processing modes:
16
+ - Hand2DProcessor: 2D pose estimation only (faster, camera-based)
17
+ - Hand3DProcessor: Full 3D processing with depth alignment (more accurate, if depth is available)
18
+
19
+ Output Data:
20
+ - HandSequence objects containing pose trajectories
21
+ - 2D keypoint positions in image coordinates
22
+ - 3D keypoint positions in camera coordinates
23
+ - Hand detection flags per frame
24
+ - Annotated visualization videos
25
+ """
26
+
27
+ import glob
28
+ import os
29
+ import logging
30
+ from tqdm import tqdm
31
+ import numpy as np
32
+ import mediapy as media
33
+ import open3d as o3d # type: ignore
34
+ from typing import Tuple, Optional, Dict, Any
35
+ import trimesh
36
+ from collections import defaultdict
37
+ import argparse
38
+
39
+ from phantom.utils.pcd_utils import get_visible_points, get_pcd_from_points, icp_registration, get_point_cloud_of_segmask, get_3D_points_from_pixels, remove_outliers, get_bbox_of_3d_points, trim_pcd_to_bbox, visualize_pcds
40
+ from phantom.utils.transform_utils import transform_pts
41
+ from phantom.processors.base_processor import BaseProcessor
42
+ from phantom.detectors.detector_hamer import DetectorHamer
43
+ from phantom.processors.phantom_data import HandSequence, HandFrame, hand_side_dict
44
+ from phantom.processors.paths import Paths
45
+ from phantom.processors.segmentation_processor import HandSegmentationProcessor
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+ class HandBaseProcessor(BaseProcessor):
50
+ """
51
+ Base class for hand pose processing using HaMeR detection and optional depth refinement.
52
+
53
+ The processor operates on the output of BBoxProcessor, using detected hand bounding boxes
54
+ to guide pose estimation. It supports both 2D and 3D processing modes, with the 3D mode
55
+ providing enhanced accuracy through depth sensor integration.
56
+
57
+ Processing Workflow:
58
+ 1. Load video frames and bounding box detection results
59
+ 2. For each frame with detected hands:
60
+ - Apply HaMeR pose estimation within bounding box
61
+ - Validate pose quality (edge proximity, confidence)
62
+ - Optionally generate hand segmentation masks for depth refinement
63
+ - Optionally apply depth-based pose refinement
64
+ 3. Generate temporal hand sequences with smooth trajectories
65
+ 4. Save processed results and visualization videos
66
+
67
+ Attributes:
68
+ process_hand_masks (bool): Whether to generate hand segmentation masks
69
+ apply_depth_alignment (bool): Whether to use depth-based pose refinement
70
+ detector_hamer (DetectorHamer): HaMeR pose estimation model
71
+ hand_mask_processor: Segmentation processor for hand mask generation
72
+ H (int): Video frame height
73
+ W (int): Video frame width
74
+ imgs_depth (np.ndarray): Depth images for 3D refinement
75
+ left_masks (np.ndarray): Left hand segmentation masks
76
+ right_masks (np.ndarray): Right hand segmentation masks
77
+ """
78
+ def __init__(self, args: argparse.Namespace) -> None:
79
+ """
80
+ Initialize the hand processor with configuration parameters.
81
+
82
+ Args:
83
+ args: Command line arguments containing processing configuration
84
+ including depth processing flags and model parameters
85
+ """
86
+ super().__init__(args)
87
+ self.process_hand_masks: bool = False
88
+ self._initialize_detectors()
89
+ self.hand_mask_processor: Optional[HandSegmentationProcessor] = None
90
+ self.apply_depth_alignment: bool = False
91
+
92
+ def _initialize_detectors(self) -> None:
93
+ """
94
+ Initialize all required detection models.
95
+
96
+ Sets up the HaMeR detector for hand pose estimation.
97
+ """
98
+ self.detector_hamer = DetectorHamer()
99
+
100
+ def process_one_demo(self, data_sub_folder: str) -> None:
101
+ """
102
+ Process a single demonstration video to extract hand poses and segmentation.
103
+
104
+ Args:
105
+ data_sub_folder: Path to the demonstration data folder containing
106
+ video files, bounding box data, and optional depth data
107
+ """
108
+ save_folder = self.get_save_folder(data_sub_folder)
109
+
110
+ paths = self.get_paths(save_folder)
111
+
112
+ # Load RGB video frames
113
+ imgs_rgb = media.read_video(paths.video_left)
114
+ self.H, self.W, _ = imgs_rgb[0].shape
115
+
116
+ # Load depth data if available (for 3D processing)
117
+ if os.path.exists(paths.depth):
118
+ self.imgs_depth = np.load(paths.depth)
119
+ else:
120
+ self.imgs_depth = np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1]))
121
+
122
+ # Load hand segmentation masks if available
123
+ if os.path.exists(paths.masks_hand_left) and os.path.exists(paths.masks_hand_right):
124
+ self.left_masks = np.load(paths.masks_hand_left)
125
+ self.right_masks = np.load(paths.masks_hand_right)
126
+ else:
127
+ self.left_masks = np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1]))
128
+ self.right_masks = np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1]))
129
+
130
+ # Load bounding box detection results from previous stage
131
+ bbox_data = np.load(paths.bbox_data)
132
+ left_hand_detected = bbox_data["left_hand_detected"]
133
+ right_hand_detected = bbox_data["right_hand_detected"]
134
+ left_bboxes = bbox_data["left_bboxes"]
135
+ right_bboxes = bbox_data["right_bboxes"]
136
+
137
+ # Validate data consistency
138
+ assert len(left_hand_detected) == len(right_hand_detected)
139
+ assert len(left_hand_detected) == len(imgs_rgb)
140
+
141
+ # Process left and right hand sequences
142
+ left_sequence = self._process_all_frames(imgs_rgb, left_bboxes, left_hand_detected, "left")
143
+ right_sequence = self._process_all_frames(imgs_rgb, right_bboxes, right_hand_detected, "right")
144
+
145
+ # Generate hand segmentation masks if enabled
146
+ if self.process_hand_masks:
147
+ self._get_hand_masks(data_sub_folder, left_sequence, right_sequence)
148
+ self.left_masks = np.load(paths.masks_hand_left)
149
+ self.right_masks = np.load(paths.masks_hand_right)
150
+
151
+ # Apply depth-based pose refinement if enabled
152
+ if self.apply_depth_alignment:
153
+ left_sequence = self._process_all_frames_depth_alignment(imgs_rgb, left_hand_detected, "left", left_sequence)
154
+ right_sequence = self._process_all_frames_depth_alignment(imgs_rgb, right_hand_detected, "right", right_sequence)
155
+
156
+ # Save processed sequences and generate visualizations
157
+ self._save_results(paths, left_sequence, right_sequence)
158
+
159
+ def _process_all_frames(self, imgs_rgb: np.ndarray, bboxes: np.ndarray,
160
+ hand_detections: np.ndarray, hand_side: str) -> HandSequence:
161
+ """
162
+ Process all frames in a video sequence to extract hand poses.
163
+
164
+ This method iterates through all video frames, applying pose estimation
165
+ where hands are detected and creating empty frames where they are not.
166
+ It maintains temporal consistency and provides quality filtering.
167
+
168
+ Args:
169
+ imgs_rgb: RGB video frames, shape (num_frames, height, width, 3)
170
+ bboxes: Hand bounding boxes per frame, shape (num_frames, 4)
171
+ hand_detections: Boolean flags indicating valid detections per frame
172
+ hand_side: "left" or "right" to specify which hand is being processed
173
+
174
+ Returns:
175
+ HandSequence object containing processed pose data for all frames
176
+ """
177
+ sequence = HandSequence()
178
+
179
+ for img_idx in tqdm(range(len(imgs_rgb)), disable=False, leave=False):
180
+ if not hand_detections[img_idx]:
181
+ # Create empty frame for missing detections
182
+ sequence.add_frame(HandFrame.create_empty_frame(
183
+ frame_idx=img_idx,
184
+ img_rgb=imgs_rgb[img_idx],
185
+ ))
186
+ continue
187
+
188
+ # Process frame with detected hand
189
+ frame_data = self._process_frame(img_idx, imgs_rgb[img_idx], bboxes[img_idx],
190
+ hand_side)
191
+ sequence.add_frame(frame_data)
192
+
193
+ return sequence
194
+
195
+ def _process_frame(self, img_idx: int, img_rgb: np.ndarray, bbox: np.ndarray,
196
+ hand_side: str, view: bool = False) -> HandFrame:
197
+ """
198
+ Process a single frame to extract hand pose and validate quality.
199
+
200
+ This method applies HaMeR pose estimation within the detected bounding box
201
+ and performs quality checks to ensure the pose is suitable for downstream
202
+ processing. Poor quality poses (e.g., hands too close to image edges) are
203
+ rejected to maintain data quality.
204
+
205
+ Args:
206
+ img_idx: Index of the current frame
207
+ img_rgb: RGB image data for this frame
208
+ bbox: Hand bounding box coordinates [x1, y1, x2, y2]
209
+ hand_side: "left" or "right" specifying which hand is being processed
210
+ view: Whether to display debug visualizations
211
+
212
+ Returns:
213
+ HandFrame object containing pose data or empty frame if quality is poor
214
+ """
215
+ try:
216
+ # Apply HaMeR pose estimation within bounding box
217
+ processed_data = self._process_image_with_hamer(img_rgb, bbox[None,...], hand_side, img_idx, view=view)
218
+
219
+ # Quality check: reject poses where keypoints are too close to image edges
220
+ if self.are_kpts_too_close_to_margin(processed_data["kpts_2d"], self.W, self.H, margin=5, threshold=0.1):
221
+ logger.error(f"Error processing frame {img_idx}: hand keypoints too close to image edge")
222
+ return HandFrame.create_empty_frame(
223
+ frame_idx=img_idx,
224
+ img_rgb=img_rgb,
225
+ )
226
+
227
+ # Create frame with validated pose data
228
+ frame_data = HandFrame(
229
+ frame_idx=img_idx,
230
+ hand_detected=True,
231
+ img_rgb=img_rgb,
232
+ img_hamer=processed_data["img_hamer"],
233
+ kpts_2d=processed_data["kpts_2d"],
234
+ kpts_3d=processed_data["kpts_3d"],
235
+ )
236
+
237
+ return frame_data
238
+
239
+ except Exception as e:
240
+ logger.error(f"Error processing frame {img_idx}: {str(e)}")
241
+ return HandFrame.create_empty_frame(
242
+ frame_idx=img_idx,
243
+ img_rgb=img_rgb,
244
+ )
245
+
246
+ def are_kpts_too_close_to_margin(self, kpts_2d: np.ndarray, img_width: int, img_height: int,
247
+ margin: int = 20, threshold: float = 0.5) -> bool:
248
+ """
249
+ Filter hand keypoints based on proximity to image edges.
250
+
251
+ This quality check rejects hand poses where too many keypoints are near
252
+ the image boundaries, which typically indicates partial occlusion or
253
+ tracking errors that would lead to poor pose estimates.
254
+
255
+ Args:
256
+ kpts_2d: 2D keypoint positions, shape (N, 2) where N is number of keypoints
257
+ img_width: Image width in pixels
258
+ img_height: Image height in pixels
259
+ margin: Distance from edge (in pixels) to consider "too close"
260
+ threshold: Fraction of keypoints that triggers rejection (e.g., 0.5 = 50%)
261
+
262
+ Returns:
263
+ True if hand should be rejected due to edge proximity, False otherwise
264
+ """
265
+ x = kpts_2d[:, 0]
266
+ y = kpts_2d[:, 1]
267
+
268
+ # Create boolean mask for keypoints near any image edge
269
+ near_edge = (
270
+ (x < margin) |
271
+ (y < margin) |
272
+ (x > img_width - margin) |
273
+ (y > img_height - margin)
274
+ )
275
+
276
+ frac_near_edge = np.mean(near_edge) # Fraction of keypoints near edge
277
+ return frac_near_edge > threshold
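A quick numeric illustration of the rejection rule, as standalone NumPy with the same logic as the method above:

import numpy as np

W, H, margin, threshold = 640, 480, 20, 0.5
kpts_2d = np.array([[5, 240], [10, 300], [320, 240], [630, 15]], dtype=float)
near_edge = ((kpts_2d[:, 0] < margin) | (kpts_2d[:, 1] < margin) |
             (kpts_2d[:, 0] > W - margin) | (kpts_2d[:, 1] > H - margin))
print(near_edge.mean())              # 0.75 -> 3 of 4 keypoints hug the border
print(near_edge.mean() > threshold)  # True -> this hand would be rejected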
278
+
279
+ def _save_results(self, paths: Paths, left_sequence: HandSequence, right_sequence: HandSequence) -> None:
280
+ """
281
+ Save processed hand sequences and generate visualization videos.
282
+
283
+ Args:
284
+ paths: Paths object containing output file locations
285
+ left_sequence: Processed left hand pose sequence
286
+ right_sequence: Processed right hand pose sequence
287
+ """
288
+ # Create output directory
289
+ if not os.path.exists(paths.hand_processor):
290
+ os.makedirs(paths.hand_processor)
291
+
292
+ # Save hand sequence data in compressed format
293
+ left_sequence.save(paths.hand_data_left)
294
+ right_sequence.save(paths.hand_data_right)
295
+
296
+ # Save RGB frames for reference
297
+ media.write_video(paths.video_rgb_imgs, left_sequence.imgs_rgb, fps=10, codec="ffv1")
298
+
299
+ # Load additional visualization components
300
+ imgs_bbox = media.read_video(paths.video_bboxes)
301
+
302
+ # Load segmentation visualization if available
303
+ if os.path.exists(paths.video_sam_arm):
304
+ imgs_sam = media.read_video(paths.video_sam_arm)
305
+ else:
306
+ imgs_sam = np.zeros((len(left_sequence.imgs_rgb), left_sequence.imgs_rgb[0].shape[0], left_sequence.imgs_rgb[0].shape[1], 3))
307
+
308
+ # Create comprehensive annotation video showing all processing stages
309
+ annot_imgs = []
310
+ for idx in range(len(left_sequence.imgs_rgb)):
311
+ img_hamer_left = left_sequence.imgs_hamer[idx]
312
+ img_hamer_right = right_sequence.imgs_hamer[idx]
313
+ img_bbox = imgs_bbox[idx]
314
+ img_sam = imgs_sam[idx]
315
+
316
+ # Combine visualizations in 2x2 grid: [bbox, sam] on top, [left_hand, right_hand] on bottom
317
+ annot_img = np.vstack((np.hstack((img_bbox, img_sam)), np.hstack((img_hamer_left, img_hamer_right)))).astype(np.uint8)
318
+ annot_imgs.append(annot_img)
319
+
320
+ # Save comprehensive visualization video
321
+ media.write_video(paths.video_annot, np.array(annot_imgs), fps=10, codec="h264") # mp4
322
+
323
+ def _create_hand_mesh(self, hamer_out: Dict[str, Any]) -> trimesh.Trimesh:
324
+ """
325
+ Create a 3D triangle mesh from HaMeR pose estimation output.
326
+
327
+ Args:
328
+ hamer_out: HaMeR output dictionary containing vertex positions
329
+
330
+ Returns:
331
+ Trimesh object representing the hand mesh
332
+ """
333
+ return trimesh.Trimesh(hamer_out["verts"].copy(), self.detector_hamer.faces_left.copy(), process=False)
334
+
335
+ def _get_hand_masks(self, data_sub_folder: str, hamer_data_left: HandSequence, hamer_data_right: HandSequence) -> None:
336
+ """
337
+ Generate hand segmentation masks using processed pose data.
338
+
339
+ This method integrates with the segmentation processor to generate
340
+ detailed hand masks that can be used for depth-based pose refinement.
341
+
342
+ Args:
343
+ data_sub_folder: Path to demonstration data folder
344
+ hamer_data_left: Processed left hand sequence for guidance
345
+ hamer_data_right: Processed right hand sequence for guidance
346
+ """
347
+ hamer_data = {
348
+ "left": hamer_data_left,
349
+ "right": hamer_data_right
350
+ }
351
+ self.hand_mask_processor.process_one_demo(data_sub_folder, hamer_data)
352
+
353
+ @staticmethod
354
+ def _get_visible_pts_from_hamer(detector_hamer: DetectorHamer, hamer_out: Dict[str, Any], mesh: trimesh.Trimesh,
355
+ img_depth: np.ndarray, cam_intrinsics: Dict[str, Any]) -> Tuple[np.ndarray, np.ndarray]:
356
+ """
357
+ Identify visible hand vertices and their corresponding depth points.
358
+
359
+ Args:
360
+ detector_hamer: HaMeR detector instance for coordinate projections
361
+ hamer_out: HaMeR output containing pose estimates and camera parameters
362
+ mesh: 3D hand mesh generated from HaMeR output
363
+ img_depth: Depth image corresponding to the RGB frame
364
+ cam_intrinsics: Camera intrinsic parameters for 3D projection
365
+
366
+ Returns:
367
+ Tuple of (visible_points_3d, visible_hamer_vertices):
368
+ - visible_points_3d: 3D points from depth image at visible mesh locations
369
+ - visible_hamer_vertices: Corresponding vertices from the HaMeR mesh
370
+ """
371
+ # Perform ray-casting to identify visible mesh vertices
372
+ visible_hamer_vertices, _ = get_visible_points(mesh, origin=np.array([0,0,0]))
373
+
374
+ # Project 3D vertices to 2D image coordinates
375
+ visible_points_2d = detector_hamer.project_3d_kpt_to_2d(
376
+ (visible_hamer_vertices-hamer_out["T_cam_pred"].cpu().numpy()).astype(np.float32),
377
+ hamer_out["img_w"], hamer_out["img_h"], hamer_out["scaled_focal_length"],
378
+ hamer_out["camera_center"], hamer_out["T_cam_pred"])
379
+
380
+ # Filter out points that fall outside the depth image boundaries
381
+ original_visible_points_2d = visible_points_2d.copy()
382
+
383
+ # Create valid region mask (note: depth indexing is [y, x])
384
+ valid_mask = ((original_visible_points_2d[:, 0] < img_depth.shape[1]) &
385
+ (original_visible_points_2d[:, 1] < img_depth.shape[0]))
386
+
387
+ visible_points_2d = visible_points_2d[valid_mask]
388
+ visible_hamer_vertices = visible_hamer_vertices[valid_mask]
389
+
390
+ # Convert 2D depth pixels to 3D points using camera intrinsics
391
+ visible_points_3d = get_3D_points_from_pixels(visible_points_2d, img_depth, cam_intrinsics)
392
+
393
+ return visible_points_3d, visible_hamer_vertices
394
+
395
+ @staticmethod
396
+ def _get_transformation_estimate(visible_points_3d: np.ndarray,
397
+ visible_hamer_vertices: np.ndarray,
398
+ pcd: o3d.geometry.PointCloud) -> Tuple[np.ndarray, o3d.geometry.PointCloud]:
399
+ """
400
+ Estimate transformation to align HaMeR mesh with observed point cloud.
401
+
402
+ This method uses Iterative Closest Point (ICP) registration to find the
403
+ optimal transformation that aligns the visible parts of the predicted
404
+ hand mesh with the point cloud extracted from depth and segmentation data.
405
+
406
+ Args:
407
+ visible_points_3d: 3D points from depth image at mesh locations
408
+ visible_hamer_vertices: Corresponding vertices from HaMeR mesh
409
+ pcd: Point cloud from segmentation and depth data
410
+
411
+ Returns:
412
+ Tuple of (transformation_matrix, aligned_mesh_pointcloud):
413
+ - transformation_matrix: 4x4 transformation to align mesh with depth
414
+ - aligned_mesh_pointcloud: Transformed mesh point cloud after alignment
415
+ """
416
+ # Get initial transformation estimate using median translation
417
+ T_0 = HandBaseProcessor._get_initial_transformation_estimate(visible_points_3d, visible_hamer_vertices)
418
+
419
+ # Create point cloud from visible mesh vertices
420
+ visible_hamer_pcd = get_pcd_from_points(visible_hamer_vertices, colors=np.ones_like(visible_hamer_vertices) * [0, 1, 0])
421
+
422
+ try:
423
+ # Apply ICP registration for fine alignment
424
+ aligned_hamer_pcd, T = icp_registration(visible_hamer_pcd, pcd, voxel_size=0.005, init_transform=T_0)
425
+ except Exception as e:
426
+ logger.error(f"ICP registration failed: {e}")
427
+ return T_0, visible_hamer_pcd
428
+
429
+ return T, aligned_hamer_pcd
430
+
431
+ @staticmethod
432
+ def _get_initial_transformation_estimate(visible_points_3d: np.ndarray,
433
+ visible_hamer_vertices: np.ndarray) -> np.ndarray:
434
+ """
435
+ Compute initial transformation estimate for mesh-to-depth alignment.
436
+
437
+ This method provides a coarse alignment between the HaMeR prediction and
438
+ the depth-based point cloud using median translation. It assumes that
439
+ orientation is approximately correct and only translation correction is needed.
440
+
441
+ Args:
442
+ visible_points_3d: 3D points from depth image
443
+ visible_hamer_vertices: Corresponding HaMeR mesh vertices
444
+
445
+ Returns:
446
+ 4x4 transformation matrix with estimated translation
447
+ """
448
+ # Calculate median translation between corresponding point sets
449
+ translation = np.nanmedian(visible_points_3d - visible_hamer_vertices, axis=0)
450
+
451
+ # Create transformation matrix (identity rotation + translation)
452
+ T_0 = np.eye(4)
453
+ if not np.isnan(translation).any():
454
+ T_0[:3, 3] = translation
455
+
456
+ return T_0
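A standalone check of the median-translation initialization (same formula as above, synthetic points; the median keeps the estimate robust to a stray depth reading):

import numpy as np

visible_hamer_vertices = np.array([[0.00, 0.10, 0.50],
                                   [0.02, 0.12, 0.52],
                                   [0.04, 0.14, 0.54]])
# Depth observations shifted by roughly (+0.03, -0.01, +0.02), with one noisy row.
offsets = np.array([[0.03, -0.01, 0.02],
                    [0.03, -0.01, 0.02],
                    [0.30,  0.40, 0.50]])
visible_points_3d = visible_hamer_vertices + offsets
T_0 = np.eye(4)
T_0[:3, 3] = np.nanmedian(visible_points_3d - visible_hamer_vertices, axis=0)
print(T_0[:3, 3])   # [ 0.03 -0.01  0.02] -> the outlier row does not move the estimate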
457
+
458
+
459
+ class Hand2DProcessor(HandBaseProcessor):
460
+ """
461
+ 2D hand pose processor optimized for speed and RGB-only operation.
462
+
463
+ This processor focuses on extracting 2D hand poses and basic 3D estimates
464
+ without depth sensor integration. It's designed for applications where
465
+ depth sensors are not available.
466
+ """
467
+ def __init__(self, args: argparse.Namespace) -> None:
468
+ """
469
+ Initialize 2D hand processor with RGB-only configuration.
470
+
471
+ Args:
472
+ args: Command line arguments for processor configuration
473
+ """
474
+ super().__init__(args)
475
+
476
+ def _process_image_with_hamer(self, img_rgb: np.ndarray, bboxes: np.ndarray, hand_side: str,
477
+ img_idx: int, view: bool = False) -> Dict[str, Any]:
478
+ """
479
+ Process RGB image with HaMeR for 2D pose estimation.
480
+
481
+ Args:
482
+ img_rgb: RGB image to process
483
+ bboxes: Hand bounding boxes for pose estimation guidance
484
+ hand_side: "left" or "right" specifying which hand to process
485
+ img_idx: Frame index for debugging and logging
486
+ view: Whether to display debug visualizations
487
+
488
+ Returns:
489
+ Dictionary containing:
490
+ - img_hamer: Annotated image with pose visualization
491
+ - kpts_3d: Estimated 3D keypoints
492
+ - kpts_2d: 2D keypoint projections in image coordinates
493
+
494
+ Raises:
495
+ ValueError: If no valid hand pose is detected in the image
496
+ """
497
+ # Configure HaMeR for target hand side
498
+ is_right = np.array([hand_side_dict[hand_side]] * len(bboxes))
499
+
500
+ # Apply HaMeR pose estimation
501
+ hamer_out = self.detector_hamer.detect_hand_keypoints(
502
+ img_rgb,
503
+ hand_side=hand_side,
504
+ bboxes=bboxes,
505
+ is_right=is_right,
506
+ camera_params=self.intrinsics_dict,
507
+ visualize=False
508
+ )
509
+
510
+ if hamer_out is None or not hamer_out.get("success", False):
511
+ raise ValueError("No hand detected in image")
512
+
513
+ return {
514
+ "img_hamer": hamer_out["annotated_img"][:,:,::-1], # Convert BGR to RGB
515
+ "kpts_3d": hamer_out["kpts_3d"],
516
+ "kpts_2d": hamer_out['kpts_2d']
517
+ }
518
+
519
+ class Hand3DProcessor(HandBaseProcessor):
520
+ """
521
+ 3D hand pose processor with depth-based refinement capabilities.
522
+
523
+ This processor provides more accurate 3D hand poses by combining HaMeR
524
+ estimation with depth sensor data and hand segmentation. It uses point cloud
525
+ registration techniques to refine the initial pose estimates, resulting in
526
+ poses that are better aligned with the physical environment.
527
+
528
+ Processing Enhancements:
529
+ - Mesh generation from HaMeR output for visibility analysis
530
+ - Hand segmentation using SAM2 for accurate depth extraction
531
+ - ICP-based alignment between predicted mesh and observed point cloud
532
+ """
533
+ def __init__(self, args: argparse.Namespace) -> None:
534
+ """
535
+ Initialize 3D hand processor with depth refinement capabilities.
536
+
537
+ Args:
538
+ args: Command line arguments containing depth processing configuration
539
+ """
540
+ super().__init__(args)
541
+ self.args = args
542
+
543
+ # Storage for HaMeR outputs needed for depth alignment
544
+ self.hamer_out_dict: Dict[str, Dict[int, Dict[str, Any]]] = {
545
+ "left": defaultdict(dict),
546
+ "right": defaultdict(dict)
547
+ }
548
+
549
+ # Enable advanced processing features
550
+ self.process_hand_masks = True
551
+ self.apply_depth_alignment = True
552
+ self.hand_mask_processor = HandSegmentationProcessor(self.args)
553
+
554
+ def _process_image_with_hamer(self, img_rgb: np.ndarray, bboxes: np.ndarray, hand_side: str,
555
+ img_idx: int, view: bool = False) -> Dict[str, Any]:
556
+ """
557
+ Process RGB image with HaMeR optimized for subsequent depth refinement.
558
+
559
+ This method applies HaMeR pose estimation configured for 3D processing,
560
+ storing intermediate results needed for later depth-based refinement.
561
+
562
+ Args:
563
+ img_rgb: RGB image to process
564
+ bboxes: Hand bounding boxes for pose estimation guidance
565
+ hand_side: "left" or "right" specifying which hand to process
566
+ img_idx: Frame index for result storage and debugging
567
+ view: Whether to display debug visualizations
568
+
569
+ Returns:
570
+ Dictionary containing pose estimation results
571
+
572
+ Raises:
573
+ ValueError: If no valid hand pose is detected in the image
574
+ """
575
+ # Configure HaMeR for target hand side
576
+ is_right = np.array([hand_side_dict[hand_side]] * len(bboxes))
577
+
578
+ # Apply HaMeR with 2D keypoint focus (3D refinement happens later)
579
+ hamer_out = self.detector_hamer.detect_hand_keypoints(
580
+ img_rgb,
581
+ hand_side=hand_side,
582
+ bboxes=bboxes,
583
+ is_right=is_right,
584
+ kpts_2d_only=True, # Initial processing focuses on 2D
585
+ camera_params=self.intrinsics_dict
586
+ )
587
+
588
+ if hamer_out is None or not hamer_out.get("success", False):
589
+ raise ValueError("No hand detected in image")
590
+
591
+ # Store HaMeR output for later depth alignment processing
592
+ self.hamer_out_dict[hand_side][img_idx] = hamer_out
593
+
594
+ return {
595
+ "img_hamer": hamer_out["annotated_img"][:,:,::-1], # Convert BGR to RGB
596
+ "kpts_3d": hamer_out["kpts_3d"],
597
+ "kpts_2d": hamer_out['kpts_2d']
598
+ }
599
+
600
+ def _process_all_frames_depth_alignment(self, imgs_rgb: np.ndarray, hand_detections: np.ndarray,
601
+ hand_side: str, sequence: Optional[HandSequence] = None) -> HandSequence:
602
+ """
603
+ Apply depth-based refinement to all frames in the sequence.
604
+
605
+ This method performs the depth alignment stage of processing, using
606
+ segmentation masks and depth data to refine the initial HaMeR pose
607
+ estimates for improved 3D accuracy.
608
+
609
+ Args:
610
+ imgs_rgb: RGB video frames for reference
611
+ hand_detections: Boolean flags indicating frames with valid detections
612
+ hand_side: "left" or "right" specifying which hand to process
613
+ sequence: HandSequence containing initial pose estimates to refine
614
+
615
+ Returns:
616
+ HandSequence with refined 3D poses aligned to depth data
617
+ """
618
+ for img_idx in tqdm(range(len(imgs_rgb)), disable=False, leave=False):
619
+ if not hand_detections[img_idx]:
620
+ continue
621
+
622
+ # Apply depth-based refinement to this frame
623
+ frame_data = sequence.get_frame(img_idx)
624
+ frame_data.kpts_3d = self._depth_alignment(img_idx, hand_side, imgs_rgb[img_idx])
625
+ sequence.modify_frame(img_idx, frame_data)
626
+
627
+ return sequence
628
+
629
+ def _depth_alignment(self, img_idx: int, hand_side: str, img_rgb: np.ndarray) -> np.ndarray:
630
+ """
631
+ Perform depth-based pose refinement for a single frame.
632
+
633
+ Algorithm Steps:
634
+ 1. Extract depth image and segmentation mask for the frame
635
+ 2. Obtain 3D hand mesh from HaMeR output
636
+ 3. Create point cloud from segmented depth region
637
+ 4. Identify visible mesh vertices through ray casting
638
+ 5. Apply ICP registration between mesh and point cloud
639
+ 6. Transform original keypoints using computed alignment
640
+
641
+ Args:
642
+ img_idx: Index of the frame to process
643
+ hand_side: "left" or "right" specifying which hand to process
644
+ img_rgb: RGB image for reference (used in point cloud generation)
645
+
646
+ Returns:
647
+ Refined 3D keypoint positions aligned with depth data
648
+ """
649
+ # Load frame-specific data
650
+ img_depth = self.imgs_depth[img_idx]
651
+ mask = self.left_masks[img_idx] if hand_side == "left" else self.right_masks[img_idx]
652
+ hamer_out = self.hamer_out_dict[hand_side][img_idx]
653
+
654
+ # Create 3D hand mesh from HaMeR pose estimate
655
+ mesh = self._create_hand_mesh(hamer_out)
656
+
657
+ # Generate point cloud from depth image within segmented hand region
658
+ pcd = get_point_cloud_of_segmask(mask, img_depth, img_rgb, self.intrinsics_dict, visualize=False)
659
+
660
+ # Identify visible mesh vertices and corresponding depth points
661
+ visible_points_3d, visible_hamer_vertices = self._get_visible_pts_from_hamer(
662
+ self.detector_hamer,
663
+ hamer_out,
664
+ mesh,
665
+ img_depth,
666
+ self.intrinsics_dict
667
+ )
668
+
669
+ # Compute optimal transformation using ICP registration
670
+ T, _ = self._get_transformation_estimate(visible_points_3d, visible_hamer_vertices, pcd)
671
+
672
+ # Apply transformation to refine original keypoint positions
673
+ kpts_3d = transform_pts(hamer_out["kpts_3d"], T)
674
+
675
+ return kpts_3d
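For orientation, this is the usual way a 4x4 rigid transform is applied to an (N, 3) keypoint array; a minimal sketch of what transform_pts is assumed to do here (the actual helper lives in phantom.utils.transform_utils and may differ in detail):

import numpy as np

def apply_T(kpts_3d: np.ndarray, T: np.ndarray) -> np.ndarray:
    # Homogenize, transform, and drop the homogeneous coordinate.
    kpts_h = np.hstack([kpts_3d, np.ones((len(kpts_3d), 1))])
    return (T @ kpts_h.T).T[:, :3]

T = np.eye(4)
T[:3, 3] = [0.03, -0.01, 0.02]     # e.g. a translation recovered by the ICP step above
kpts = np.array([[0.00, 0.10, 0.50], [0.02, 0.12, 0.52]])
print(apply_T(kpts, T))
# [[0.03 0.09 0.52]
#  [0.05 0.11 0.54]]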
phantom/phantom/processors/handinpaint_processor.py ADDED
@@ -0,0 +1,485 @@
1
+ """
2
+ Hand Inpainting Processor Module
3
+
4
+ This module removes human hands from demonstration videos using the E2FGVI model.
5
+
6
+ Paper:
7
+ Towards An End-to-End Framework for Flow-Guided Video Inpainting
8
+ https://github.com/MCG-NKU/E2FGVI.git
9
+
10
+ Processing Pipeline:
11
+ 1. Load pre-trained E2FGVI model and initialize GPU processing
12
+ 2. Read input video frames and corresponding hand segmentation masks
13
+ 3. Process frames in batches with neighboring temporal context
14
+ 4. Apply mask-guided inpainting to remove hand regions
15
+ 5. Verify complete processing and handle any missed frames
16
+ 6. Save final hand-free video for robot learning applications
17
+ """
18
+
19
+ import cv2
20
+ from PIL import Image
21
+ import numpy as np
22
+ import os
23
+ from pathlib import Path
24
+ from tqdm import tqdm
25
+ import torch
26
+ import mediapy as media
27
+ import logging
28
+ import gc
29
+ from typing import List, Tuple, Optional, Any, Union
30
+
31
+ from phantom.processors.base_processor import BaseProcessor
32
+ from phantom.utils.data_utils import get_parent_folder_of_package
33
+ from E2FGVI.model.e2fgvi_hq import InpaintGenerator # type: ignore
34
+ from E2FGVI.core.utils import to_tensors # type: ignore
35
+
36
+ DEFAULT_CHECKPOINT = 'E2FGVI/release_model/E2FGVI-HQ-CVPR22.pth'
37
+
38
+ logger = logging.getLogger(__name__)
39
+
40
+ class HandInpaintProcessor(BaseProcessor):
41
+ """
42
+ Hand inpainting processor for removing human hands from demonstration videos.
43
+
44
+ Attributes:
45
+ model: E2FGVI neural network model for video inpainting
46
+ device: GPU/CPU device for model execution
47
+ ref_length (int): Spacing between reference frames for temporal consistency
48
+ num_ref (int): Number of reference frames to use (-1 for automatic)
49
+ neighbor_stride (int): Spacing between neighboring frames in temporal context
50
+ batch_size (int): Number of frame groups to process simultaneously
51
+ scale_factor (int): Resolution scaling factor for processing optimization
52
+ """
53
+
54
+ def __init__(self, args: Any) -> None:
55
+ """
56
+ Initialize the hand inpainting processor with E2FGVI model and parameters.
57
+
58
+ Args:
59
+ args: Command line arguments containing processing configuration
60
+ including scale factor and other inpainting parameters
61
+ """
62
+ super().__init__(args)
63
+
64
+ # Load pre-trained E2FGVI model
65
+ root_dir = get_parent_folder_of_package("E2FGVI")
66
+ checkpoint_path = Path(root_dir, DEFAULT_CHECKPOINT)
67
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
68
+
69
+ # Initialize and load the inpainting model
70
+ self.model = InpaintGenerator().to(self.device)
71
+ data = torch.load(checkpoint_path, map_location=self.device)
72
+ self.model.load_state_dict(data)
73
+ self.model.eval()
74
+
75
+ # Configure temporal processing parameters
76
+ self.ref_length: int = 20 # Spacing between reference frames
77
+ self.num_ref: int = -1 # Number of reference frames (-1 = automatic)
78
+ self.neighbor_stride: int = 5 # Stride for neighboring frame selection
79
+
80
+ # Configure batch processing parameters for memory optimization
81
+ self.batch_size: int = 10 # Number of frame groups per batch
82
+ self.scale_factor: int = getattr(args, 'scale_factor', 2) # Resolution scaling
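A hedged sketch of how these temporal parameters are commonly used in E2FGVI-style inpainting: long-range reference frames are sampled every ref_length frames, skipping whatever is already in the local neighborhood. This mirrors the reference implementation's ref-index selection in spirit, but is written here as an illustration only, not as this class's internal logic:

def pick_reference_frames(neighbor_ids, video_length, ref_length=20, num_ref=-1):
    # Sample frames spaced ref_length apart as long-range references,
    # excluding frames already covered by the local neighborhood.
    refs = [i for i in range(0, video_length, ref_length) if i not in neighbor_ids]
    return refs if num_ref == -1 else refs[:num_ref]

neighbors = list(range(40, 61, 5))                          # e.g. frames 40..60 around frame 50
print(pick_reference_frames(neighbors, video_length=120))   # [0, 20, 80, 100]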
83
+
84
+ def _clear_gpu_memory(self) -> None:
85
+ """Clear GPU memory cache and trigger garbage collection."""
86
+ torch.cuda.empty_cache()
87
+ gc.collect()
88
+
89
+ def process_one_demo(self, data_sub_folder: str) -> None:
90
+ """
91
+ Process a single demonstration video to remove hand regions.
92
+
93
+ Args:
94
+ data_sub_folder: Path to demonstration data folder containing
95
+ input video and hand segmentation masks
96
+ """
97
+ save_folder = self.get_save_folder(data_sub_folder)
98
+ paths = self.get_paths(save_folder)
99
+ if not os.path.exists(paths.inpaint_processor):
100
+ os.makedirs(paths.inpaint_processor)
101
+
102
+ self._process_frames(paths)
103
+
104
+ def _process_frames(self, paths: Any) -> None:
105
+ """
106
+ Process all video frames to remove hand regions using E2FGVI inpainting.
107
+
108
+ Args:
109
+ paths: Paths object containing input video and mask file locations
110
+ """
111
+ # Load and prepare video frames
112
+ frames = self._load_and_prepare_frames(paths)
113
+ video_length = len(frames)
114
+ logger.info(f"Processing {video_length} frames")
115
+
116
+ # Initialize tracking arrays for processed frames
117
+ comp_frames: List[Optional[np.ndarray]] = [None] * video_length
118
+ processed_frame_mask: List[bool] = [False] * video_length
119
+
120
+ # Process frames in batches with temporal overlap for consistency
121
+ self._process_frames_in_batches(frames, paths, comp_frames, processed_frame_mask)
122
+
123
+ # Handle any missed frames
124
+ self._process_missed_frames(frames, paths, comp_frames, processed_frame_mask)
125
+
126
+ # Final verification and save
127
+ self._verify_and_save_results(comp_frames, paths)
128
+
129
+ def _load_and_prepare_frames(self, paths: Any) -> List[Image.Image]:
130
+ """Load video frames and prepare them for processing."""
131
+ frames = self.read_frame_from_videos(paths.video_rgb_imgs)
132
+
133
+ # Calculate output dimensions based on configuration
134
+ h, w = frames[0].height, frames[0].width
135
+
136
+ if self.epic:
137
+ size = (w, h)
138
+ else:
139
+ if self.square:
140
+ output_resolution = np.array([self.output_resolution, self.output_resolution])
141
+ else:
142
+ output_resolution = np.array([int(w/h*self.output_resolution), self.output_resolution])
143
+ output_resolution = output_resolution.astype(np.int32)
144
+ size = output_resolution
145
+ frames, size = self.resize_frames(frames, size)
146
+
147
+ return frames
148
+
149
+ def _process_frames_in_batches(self, frames: List[Image.Image], paths: Any,
150
+ comp_frames: List[Optional[np.ndarray]],
151
+ processed_frame_mask: List[bool]) -> None:
152
+ """Process frames in batches with temporal overlap."""
153
+ video_length = len(frames)
154
+ h, w = frames[0].height, frames[0].width
155
+
156
+ for batch_start in tqdm(range(0, video_length, self.batch_size * self.neighbor_stride),
157
+ desc="Processing batches"):
158
+ batch_end = min(batch_start + self.batch_size * self.neighbor_stride + self.neighbor_stride, video_length)
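+ # Illustrative arithmetic with the defaults set in __init__ (batch_size=10,
+ # neighbor_stride=5): batch starts advance by 50 frames while each window spans
+ # 55, so consecutive batches overlap by neighbor_stride frames,
+ # e.g. [0, 55), [50, 105), [100, 155), ...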
159
+
160
+ # Prepare batch data
161
+ batch_data = self._prepare_batch_data(frames, paths, batch_start, batch_end, h, w)
162
+
163
+ # Process frames within batch
164
+ self._process_batch_frames(frames, batch_data, batch_start, batch_end,
165
+ comp_frames, processed_frame_mask, h, w)
166
+
167
+ # Clean up batch memory
168
+ del batch_data['batch_imgs'], batch_data['batch_masks']
169
+ self._clear_gpu_memory()
170
+
171
+ def _prepare_batch_data(self, frames: List[Image.Image], paths: Any,
172
+ batch_start: int, batch_end: int, h: int, w: int) -> dict:
173
+ """Prepare batch data including frames, masks, and binary masks."""
174
+ batch_frames = frames[batch_start:batch_end]
175
+ batch_imgs = to_tensors()(batch_frames).unsqueeze(0).to(self.device) * 2 - 1
176
+
177
+ batch_masks = self.read_mask(paths.masks_arm, (w, h))[batch_start:batch_end]
178
+ batch_masks = to_tensors()(batch_masks).unsqueeze(0).to(self.device)
179
+
180
+ binary_masks = self._create_binary_masks(paths.masks_arm, batch_start, batch_end, w, h)
181
+
182
+ return {
183
+ 'batch_imgs': batch_imgs,
184
+ 'batch_masks': batch_masks,
185
+ 'binary_masks': binary_masks
186
+ }
187
+
188
+ def _create_binary_masks(self, mask_path: str, batch_start: int, batch_end: int,
189
+ w: int, h: int) -> List[np.ndarray]:
190
+ """Create binary masks for the batch."""
191
+ masks = self.read_mask(mask_path, (w, h))[batch_start:batch_end]
192
+ binary_masks = []
193
+
194
+ for mask in masks:
195
+ mask_array = np.array(mask)
196
+ binary_mask = np.expand_dims((mask_array != 0).astype(np.uint8), 2)
197
+ binary_mask = cv2.resize(binary_mask, (w, h), interpolation=cv2.INTER_NEAREST)
198
+ binary_mask = np.expand_dims(binary_mask, 2)
199
+ binary_masks.append(binary_mask)
200
+
201
+ return binary_masks
202
+
203
+ def _process_batch_frames(self, frames: List[Image.Image], batch_data: dict,
204
+ batch_start: int, batch_end: int,
205
+ comp_frames: List[Optional[np.ndarray]],
206
+ processed_frame_mask: List[bool], h: int, w: int) -> None:
207
+ """Process individual frames within a batch."""
208
+ stride = max(1, self.neighbor_stride if batch_start + self.batch_size * self.neighbor_stride < len(frames) else 1)
209
+
210
+ for frame_idx in range(batch_start, batch_end, stride):
211
+ neighbor_ids = self._get_neighbor_ids(frame_idx, batch_start, batch_end)
212
+ ref_ids = self.get_ref_index(frame_idx, neighbor_ids, batch_end)
213
+
214
+ if not neighbor_ids:
215
+ continue
216
+
217
+ # Convert to batch-relative indices
218
+ batch_neighbor_ids = [i - batch_start for i in neighbor_ids]
219
+ batch_ref_ids = [i - batch_start for i in ref_ids if batch_start <= i < batch_end]
220
+
221
+ # Process frame with temporal context
222
+ self._process_single_frame(frames, batch_data, neighbor_ids, batch_neighbor_ids,
223
+ batch_ref_ids, comp_frames, processed_frame_mask, h, w)
224
+
225
+ self._clear_gpu_memory()
226
+
227
+ def _get_neighbor_ids(self, frame_idx: int, batch_start: int, batch_end: int) -> List[int]:
228
+ """Get neighboring frame indices for temporal context."""
229
+ return list(range(
230
+ max(batch_start, frame_idx - self.neighbor_stride),
231
+ min(batch_end, frame_idx + self.neighbor_stride + 1)
232
+ ))
233
+
234
+ def _process_single_frame(self, frames: List[Image.Image], batch_data: dict,
235
+ neighbor_ids: List[int], batch_neighbor_ids: List[int],
236
+ batch_ref_ids: List[int], comp_frames: List[Optional[np.ndarray]],
237
+ processed_frame_mask: List[bool], h: int, w: int) -> None:
238
+ """Process a single frame with its temporal context."""
239
+ batch_start = neighbor_ids[0] - batch_neighbor_ids[0]
240
+
241
+ # Select relevant frames and masks
242
+ selected_imgs = batch_data['batch_imgs'][:, batch_neighbor_ids + batch_ref_ids, :, :, :]
243
+ selected_masks = batch_data['batch_masks'][:, batch_neighbor_ids + batch_ref_ids, :, :]
244
+
245
+ with torch.no_grad():
246
+ # Apply masks and generate inpainted frames
247
+ masked_imgs = selected_imgs * (1 - selected_masks)
248
+ masked_imgs = self._pad_images(masked_imgs, h, w)
249
+
250
+ pred_imgs, _ = self.model(masked_imgs, len(batch_neighbor_ids))
251
+ pred_imgs = (pred_imgs[:, :, :h, :w] + 1) / 2
252
+ pred_imgs = (pred_imgs.cpu().permute(0, 2, 3, 1).numpy() * 255).astype(np.uint8)
253
+
254
+ # Composite with original background
255
+ for i, idx in enumerate(neighbor_ids):
256
+ binary_mask = batch_data['binary_masks'][idx - batch_start]
257
+ original_frame = np.array(frames[idx])
258
+
259
+ inpainted_frame = (pred_imgs[i] * binary_mask +
260
+ original_frame * (1 - binary_mask))
261
+
262
+ # Average with previous results if frame was already processed
263
+ if comp_frames[idx] is None:
264
+ comp_frames[idx] = inpainted_frame
265
+ else:
266
+ comp_frames[idx] = ((comp_frames[idx].astype(np.float32) +
267
+ inpainted_frame.astype(np.float32)) / 2).astype(np.uint8)
268
+ processed_frame_mask[idx] = True
269
+
270
+ def _process_missed_frames(self, frames: List[Image.Image], paths: Any,
271
+ comp_frames: List[Optional[np.ndarray]],
272
+ processed_frame_mask: List[bool]) -> None:
273
+ """Process any frames that were missed during batch processing."""
274
+ unprocessed_frames = [i for i, processed in enumerate(processed_frame_mask) if not processed]
275
+
276
+ if not unprocessed_frames:
277
+ return
278
+
279
+ logger.warning(f"Found {len(unprocessed_frames)} unprocessed frames at indices: {unprocessed_frames}")
280
+
281
+ # Determine processing context for missed frames
282
+ start_idx, end_idx = self._get_missed_frame_context(unprocessed_frames, processed_frame_mask, len(frames))
283
+
284
+ logger.info(f"Processing missed frames from {start_idx} to {end_idx}")
285
+ self._process_missed_frame_sequence(frames, paths, unprocessed_frames,
286
+ start_idx, end_idx, comp_frames, processed_frame_mask)
287
+
288
+ def _get_missed_frame_context(self, unprocessed_frames: List[int],
289
+ processed_frame_mask: List[bool], video_length: int) -> Tuple[int, int]:
290
+ """Get the context range for processing missed frames."""
291
+ last_processed_idx = max([i for i, processed in enumerate(processed_frame_mask[:unprocessed_frames[0]])
292
+ if processed], default=-1)
293
+ if last_processed_idx == -1:
294
+ last_processed_idx = 0
295
+
296
+ next_processed_idx = min([i for i, processed in enumerate(processed_frame_mask[unprocessed_frames[-1]:],
297
+ start=unprocessed_frames[-1]) if processed], default=video_length)
298
+
299
+ start_idx = max(0, last_processed_idx - self.neighbor_stride)
300
+ end_idx = min(video_length, next_processed_idx + self.neighbor_stride)
301
+
302
+ return start_idx, end_idx
303
+
304
+ def _process_missed_frame_sequence(self, frames: List[Image.Image], paths: Any,
305
+ unprocessed_frames: List[int], start_idx: int, end_idx: int,
306
+ comp_frames: List[Optional[np.ndarray]],
307
+ processed_frame_mask: List[bool]) -> None:
308
+ """Process the sequence containing missed frames."""
309
+ h, w = frames[0].height, frames[0].width
310
+
311
+ # Prepare sequence data
312
+ batch_frames = frames[start_idx:end_idx]
313
+ batch_imgs = to_tensors()(batch_frames).unsqueeze(0).to(self.device) * 2 - 1
314
+
315
+ batch_masks = self.read_mask(paths.masks_arm, (w, h))[start_idx:end_idx]
316
+ batch_masks = to_tensors()(batch_masks).unsqueeze(0).to(self.device)
317
+
318
+ binary_masks = self._create_binary_masks(paths.masks_arm, start_idx, end_idx, w, h)
319
+
320
+ # Process each missed frame
321
+ for idx in tqdm(unprocessed_frames, desc="Processing missed frames"):
322
+ self._process_missed_single_frame(frames, batch_imgs, batch_masks, binary_masks,
323
+ idx, start_idx, end_idx, comp_frames, processed_frame_mask, h, w)
324
+
325
+ del batch_imgs, batch_masks
326
+ self._clear_gpu_memory()
327
+
328
+ def _process_missed_single_frame(self, frames: List[Image.Image], batch_imgs: torch.Tensor,
329
+ batch_masks: torch.Tensor, binary_masks: List[np.ndarray],
330
+ frame_idx: int, start_idx: int, end_idx: int,
331
+ comp_frames: List[Optional[np.ndarray]],
332
+ processed_frame_mask: List[bool], h: int, w: int) -> None:
333
+ """Process a single missed frame."""
334
+ relative_start = frame_idx - start_idx
335
+ neighbor_ids = list(range(
336
+ max(0, relative_start - self.neighbor_stride),
337
+ min(end_idx - start_idx, relative_start + self.neighbor_stride + 1)
338
+ ))
339
+ ref_ids = self.get_ref_index(relative_start, neighbor_ids, end_idx - start_idx)
340
+
341
+ with torch.no_grad():
342
+ selected_imgs = batch_imgs[:, neighbor_ids + ref_ids, :, :, :]
343
+ selected_masks = batch_masks[:, neighbor_ids + ref_ids, :, :]
344
+
345
+ masked_imgs = selected_imgs * (1 - selected_masks)
346
+ masked_imgs = self._pad_images(masked_imgs, h, w)
347
+
348
+ pred_imgs, _ = self.model(masked_imgs, len(neighbor_ids))
349
+ pred_imgs = (pred_imgs[:, :, :h, :w] + 1) / 2
350
+ pred_imgs = (pred_imgs.cpu().permute(0, 2, 3, 1).numpy() * 255).astype(np.uint8)
351
+
352
+ relative_idx = frame_idx - start_idx - neighbor_ids[0]
353
+ binary_mask = binary_masks[frame_idx - start_idx]
354
+ original_frame = np.array(frames[frame_idx])
355
+
356
+ inpainted_frame = (pred_imgs[relative_idx] * binary_mask +
357
+ original_frame * (1 - binary_mask))
358
+ comp_frames[frame_idx] = inpainted_frame
359
+ processed_frame_mask[frame_idx] = True
360
+
361
+ def _verify_and_save_results(self, comp_frames: List[Optional[np.ndarray]], paths: Any) -> None:
362
+ """Verify all frames were processed and save the final video."""
363
+ missing_frames = [i for i, frame in enumerate(comp_frames)
364
+ if frame is None or (isinstance(frame, np.ndarray) and frame.size == 0)]
365
+
366
+ if missing_frames:
367
+ raise RuntimeError(f"Still found unprocessed frames after cleanup: {missing_frames}")
368
+
369
+ logger.info("Successfully processed all frames")
370
+
371
+ # Save final inpainted video
372
+ media.write_video(paths.video_human_inpaint, comp_frames, fps=15, codec="ffv1")
373
+
374
+ def get_ref_index(self, f: int, neighbor_ids: List[int], length: int) -> List[int]:
375
+ """
376
+ Select reference frame indices for temporal consistency.
377
+
378
+ Args:
379
+ f: Current frame index
380
+ neighbor_ids: List of neighboring frame indices
381
+ length: Total length of the sequence
382
+
383
+ Returns:
384
+ List of reference frame indices for temporal consistency
385
+ """
386
+ if self.num_ref == -1:
387
+ # Automatic reference selection: every ref_length frames not in neighbors
388
+ ref_index = [
389
+ i for i in range(0, length, self.ref_length)
390
+ if i not in neighbor_ids
391
+ ]
392
+ else:
393
+ # Limited reference selection: specific number around current frame
394
+ ref_index = []
395
+ for i in range(max(0, f - self.ref_length * (self.num_ref // 2)),
396
+ min(length, f + self.ref_length * (self.num_ref // 2)) + 1,
397
+ self.ref_length):
398
+ if i not in neighbor_ids and len(ref_index) < self.num_ref:
399
+ ref_index.append(i)
400
+ return ref_index
401
+
402
+ @staticmethod
403
+ def read_mask(mask_path: str, size: Tuple[int, int]) -> List[Image.Image]:
404
+ """
405
+ Load and process hand segmentation masks for inpainting guidance.
406
+
407
+ Args:
408
+ mask_path: Path to mask file containing hand segmentation data
409
+ size: Target size (width, height) for mask resizing
410
+
411
+ Returns:
412
+ List of processed PIL Images containing binary hand masks
413
+ """
414
+ masks = []
415
+ frames_media = np.load(mask_path, allow_pickle=True)
416
+ frames = [frame for frame in frames_media]
417
+
418
+ for mask_frame in frames:
419
+ # Convert to PIL Image and resize
420
+ mask_img = Image.fromarray(mask_frame)
421
+ mask_img = mask_img.resize(size, Image.NEAREST)
422
+ mask_array = np.array(mask_img.convert('L'))
423
+
424
+ # Create binary mask
425
+ binary_mask = np.array(mask_array > 0).astype(np.uint8)
426
+
427
+ # Apply morphological dilation to expand mask boundaries
428
+ # This helps ensure complete coverage of hand regions
429
+ dilated_mask = cv2.dilate(binary_mask,
430
+ cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3)),
431
+ iterations=4)
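+ # With a 3x3 cross-shaped kernel, each dilation iteration grows the mask by one
+ # pixel along the axes, so 4 iterations add roughly a 4-pixel safety margin
+ # around the segmented hand/arm region.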
432
+ masks.append(Image.fromarray(dilated_mask * 255))
433
+ return masks
434
+
435
+ @staticmethod
436
+ def read_frame_from_videos(video_path: str) -> List[Image.Image]:
437
+ """
438
+ Load video frames and convert to PIL Images.
439
+
440
+ Args:
441
+ video_path: Path to video file
442
+
443
+ Returns:
444
+ List of PIL Images containing video frames
445
+ """
446
+ return [Image.fromarray(frame) for frame in media.read_video(video_path)]
447
+
448
+ @staticmethod
449
+ def resize_frames(frames: List[Image.Image], size: Optional[Tuple[int, int]] = None) -> Tuple[List[Image.Image], Tuple[int, int]]:
450
+ """
451
+ Resize video frames to target resolution.
452
+
453
+ Args:
454
+ frames: List of PIL Images to resize
455
+ size: Target size (width, height), or None to keep original
456
+
457
+ Returns:
458
+ Tuple containing resized frames and final size
459
+ """
460
+ if size is None:
+ size = (frames[0].width, frames[0].height)
+ return ([f.resize(size) for f in frames], size)
461
+
462
+ @staticmethod
463
+ def _pad_images(img_tensor: torch.Tensor, h: int, w: int) -> torch.Tensor:
464
+ """
465
+ Pad image tensor to meet model input requirements.
466
+
467
+ Args:
468
+ img_tensor: Input image tensor to pad
469
+ h: Original height
470
+ w: Original width
471
+
472
+ Returns:
473
+ Padded image tensor suitable for model input
474
+ """
475
+ # Model requires specific dimension multiples
476
+ mod_size_h, mod_size_w = 60, 108
477
+
478
+ # Calculate required padding
479
+ h_pad = (mod_size_h - h % mod_size_h) % mod_size_h
480
+ w_pad = (mod_size_w - w % mod_size_w) % mod_size_w
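+ # Worked example (illustrative): for a 480x854 frame,
+ # h_pad = (60 - 480 % 60) % 60 = 0 and w_pad = (108 - 854 % 108) % 108 = 10,
+ # so the tensor is padded to 480x864 before being passed to the model.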
481
+
482
+ # Apply reflection padding to avoid boundary artifacts
483
+ img_tensor = torch.cat([img_tensor, torch.flip(img_tensor, [3])], 3)[:, :, :, :h + h_pad, :]
484
+ return torch.cat([img_tensor, torch.flip(img_tensor, [4])], 4)[:, :, :, :, :w + w_pad]
485
+
phantom/phantom/processors/paths.py ADDED
@@ -0,0 +1,219 @@
1
+ """
2
+ Path management for Phantom.
3
+ """
4
+ import os
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import List, Dict, Optional
8
+ import yaml
9
+
10
+ from phantom.utils.image_utils import convert_video_to_images
11
+
12
+ @dataclass
13
+ class Paths:
14
+ """Data class containing all file paths used by processors."""
15
+ data_path: Path
16
+ robot_name: str = "franka"
17
+
18
+ def __post_init__(self):
19
+ """Compute derived paths based on base paths."""
20
+ # Convert string paths to Path objects if needed
21
+ if isinstance(self.data_path, str):
22
+ self.data_path = Path(self.data_path)
23
+
24
+ # Validate data path
25
+ if not self.data_path.exists():
26
+ raise FileNotFoundError(f"Data path does not exist: {self.data_path}")
27
+
28
+ # Videos
29
+ self.video_left = self.data_path / "video_L.mp4"
30
+ self.video_right = self.data_path / "video_R.mp4"
31
+ self.video_rgb_imgs = self.data_path / "video_rgb_imgs.mkv"
32
+
33
+ # Image folders
34
+ self.original_images_folder = self.data_path / "original_images"
35
+ # self._setup_original_images()
36
+ self.original_images_folder_reverse = self.data_path / "original_images_reverse"
37
+ # self._setup_original_images_reverse()
38
+
39
+ # Epic annotations
40
+ self.hand_detection_data = self.data_path / "hand_det.pkl"
41
+ self.cam_extrinsics_data = self.data_path / "extrinsics.npy"
42
+
43
+ # Depth
44
+ self.depth = self.data_path / "depth.npy"
45
+
46
+ # Bbox processor
47
+ self.bbox_processor = self.data_path / "bbox_processor"
48
+ self.bbox_data = self.bbox_processor / "bbox_data.npz"
49
+ self.video_bboxes = self.bbox_processor / "video_bboxes.mkv"
50
+
51
+ # Segmentation processor
52
+ self.segmentation_processor = self.data_path / "segmentation_processor"
53
+ self.masks_arm = self.segmentation_processor / "masks_arm.npy"
54
+ self.video_masks_arm = self.segmentation_processor / "video_masks_arm.mkv"
55
+ self.video_sam_arm = self.segmentation_processor / "video_sam_arm.mkv"
56
+ for side in ["left", "right"]:
57
+ setattr(self, f"masks_hand_{side}", self.segmentation_processor / f"masks_hand_{side}.npy")
58
+ setattr(self, f"video_masks_hand_{side}", self.segmentation_processor / f"video_masks_hand_{side}.mkv")
59
+ setattr(self, f"video_sam_hand_{side}", self.segmentation_processor / f"video_sam_hand_{side}.mkv")
60
+
61
+ # Hand Processor
62
+ self.hand_processor = self.data_path / "hand_processor"
63
+ for side in ["left", "right"]:
64
+ setattr(self, f"hand_data_{side}", self.hand_processor / f"hand_data_{side}.npz")
65
+ setattr(self, f"hand_data_3d_{side}", self.hand_processor / f"hand_data_3d_{side}.npz")
66
+ self.video_annot = self.data_path / "video_annot.mp4"
67
+
68
+ # Action processor
69
+ self.action_processor = self.data_path / "action_processor"
70
+ for side in ["left", "right"]:
71
+ setattr(self, f"actions_{side}", self.action_processor / f"actions_{side}.npz")
72
+
73
+ # Smoothing processor
74
+ self.smoothing_processor = self.data_path / "smoothing_processor"
75
+ for side in ["left", "right"]:
76
+ setattr(self, f"smoothed_actions_{side}", self.smoothing_processor / f"smoothed_actions_{side}.npz")
77
+
78
+ # Inpaint processor
79
+ self.inpaint_processor = self.data_path / "inpaint_processor"
80
+ self.video_overlay = self.data_path / "video_overlay.mkv"
81
+ self.video_human_inpaint = self.inpaint_processor / "video_human_inpaint.mkv"
82
+ self.video_inpaint_overlay = self.inpaint_processor / "video_inpaint_overlay.mkv"
83
+ self.video_birdview = self.inpaint_processor / "video_birdview.mkv"
84
+ self.training_data = self.inpaint_processor / "training_data.npz"
85
+
86
+ def _setup_original_images(self):
87
+ """Set up original images paths."""
88
+ convert_video_to_images(self.video_left, self.original_images_folder, square=False)
89
+ image_paths = sorted(
90
+ list(self.original_images_folder.glob("*.jpg")),
91
+ key=lambda x: int(x.stem)
92
+ )
93
+ self.original_images = image_paths
94
+
95
+ def _setup_original_images_reverse(self):
96
+ """Set up original images paths."""
97
+ convert_video_to_images(self.video_left, self.original_images_folder_reverse, square=False, reverse=True)
98
+ image_paths = sorted(
99
+ list(self.original_images_folder_reverse.glob("*.jpg")),
100
+ key=lambda x: int(x.stem)
101
+ )
102
+ self.original_images_reverse = image_paths
103
+
104
+ def ensure_directories_exist(self):
105
+ """
106
+ Create necessary directories if they don't exist.
107
+ """
108
+ # Base data directory; processor-specific sub-folders are created by the individual processors
109
+ directories = [
110
+ self.data_path,
111
+ ]
112
+
113
+ for directory in directories:
114
+ if isinstance(directory, Path) and not directory.exists():
115
+ directory.mkdir(parents=True, exist_ok=True)
116
+
117
+
118
+
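+ # Illustrative usage (the demo folder name below is hypothetical):
+ #   paths = Paths(data_path=Path("./data/2024_01_01/0"))
+ #   paths.ensure_directories_exist()
+ #   print(paths.masks_arm)             # .../segmentation_processor/masks_arm.npy
+ #   print(paths.video_human_inpaint)   # .../inpaint_processor/video_human_inpaint.mkv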
119
+ class PathsConfig:
120
+ """
121
+ Configuration for paths used in the project.
122
+
123
+ This class handles loading and saving path configurations from files,
124
+ and provides methods for creating Paths objects.
125
+ """
126
+
127
+ def __init__(self, config_file: Optional[str] = None) -> None:
128
+ """
129
+ Initialize paths configuration.
130
+
131
+ Args:
132
+ config_file: Path to configuration file. If None, use default config.
133
+ """
134
+ self.config: dict[str, str] = {}
135
+ if config_file:
136
+ self.load_config(config_file)
137
+ else:
138
+ self.set_default_config()
139
+
140
+ def load_config(self, config_file: str) -> None:
141
+ """
142
+ Load configuration from a YAML file.
143
+
144
+ Args:
145
+ config_file: Path to configuration file
146
+
147
+ Raises:
148
+ FileNotFoundError: If config file doesn't exist
149
+ yaml.YAMLError: If config file is invalid YAML
150
+ """
151
+ try:
152
+ with open(config_file, 'r') as f:
153
+ self.config = yaml.safe_load(f)
154
+ except FileNotFoundError:
155
+ raise FileNotFoundError(f"Configuration file not found: {config_file}")
156
+ except yaml.YAMLError as e:
157
+ raise yaml.YAMLError(f"Invalid YAML in configuration file {config_file}: {e}")
158
+
159
+ def save_config(self, config_file: str) -> None:
160
+ """
161
+ Save configuration to a YAML file.
162
+
163
+ Args:
164
+ config_file: Path to save configuration file
165
+
166
+ Raises:
167
+ OSError: If unable to write to the file
168
+ """
169
+ with open(config_file, 'w') as f:
170
+ yaml.dump(self.config, f, default_flow_style=False)
171
+
172
+ def set_default_config(self) -> None:
173
+ """Set default configuration values."""
174
+ self.config = {
175
+ 'data_root': './data',
176
+ 'processed_root': './processed_data',
177
+ 'project_name': 'phantom',
178
+ }
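+ # The defaults above correspond to a YAML file such as (illustrative):
+ #   data_root: ./data
+ #   processed_root: ./processed_data
+ #   project_name: phantom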
179
+
180
+ def get_paths(self, demo_name: str, robot_name: str = "franka") -> Paths:
181
+ """
182
+ Get Paths object for a specific demo.
183
+
184
+ Args:
185
+ demo_name: Name of the demo
186
+ robot_name: Name of the robot
187
+
188
+ Returns:
189
+ Paths object for the demo
190
+ """
191
+ data_path = os.path.join(self.config['data_root'], demo_name)
192
+
193
+ return Paths(
194
+ data_path=Path(data_path),
195
+ robot_name=robot_name
196
+ )
197
+
198
+ def get_all_demo_paths(self) -> List[str]:
199
+ """
200
+ Get list of all demo paths in data root.
201
+
202
+ Returns:
203
+ List of demo paths
204
+ """
205
+ data_root = self.config['data_root']
206
+ all_data_collection_folders = [
207
+ f for f in os.listdir(data_root)
208
+ if os.path.isdir(os.path.join(data_root, f))
209
+ ]
210
+
211
+ all_data_folders = [
212
+ os.path.join(d1, d2)
213
+ for d1 in os.listdir(data_root)
214
+ if os.path.isdir(os.path.join(data_root, d1))
215
+ for d2 in os.listdir(os.path.join(data_root, d1))
216
+ if os.path.isdir(os.path.join(data_root, d1, d2))
217
+ ]
218
+
219
+ return sorted(all_data_folders, key=lambda x: tuple(map(int, x.rsplit('/', 2)[-2:])))
phantom/phantom/processors/phantom_data.py ADDED
@@ -0,0 +1,340 @@
1
+ from dataclasses import dataclass
2
+ from typing import Dict, List, Optional, Callable, Any
3
+ import numpy as np
4
+
5
+ hand_side_dict = {
6
+ 'left': 0,
7
+ 'right': 1,
8
+ }
9
+
10
+ class LazyLoadingMixin:
11
+ """Mixin to provide lazy loading functionality for cached properties."""
12
+
13
+ def _invalidate_cache(self) -> None:
14
+ """Invalidate all cached properties. Override in subclasses."""
15
+ pass
16
+
17
+ def _get_cached_property(self, cache_attr: str, compute_func: Callable[[], Any]) -> Any:
18
+ """Generic lazy loading for cached properties."""
19
+ if getattr(self, cache_attr) is None:
20
+ setattr(self, cache_attr, compute_func())
21
+ return getattr(self, cache_attr)
22
+
23
+ @dataclass
24
+ class TrainingData:
25
+ """Container for processing results"""
26
+ frame_idx: int
27
+ valid: bool
28
+ action_pos_left: np.ndarray
29
+ action_orixyzw_left: np.ndarray
30
+ action_pos_right: np.ndarray
31
+ action_orixyzw_right: np.ndarray
32
+ action_gripper_left: np.ndarray
33
+ action_gripper_right: np.ndarray
34
+ gripper_width_left: np.ndarray
35
+ gripper_width_right: np.ndarray
36
+
37
+ @classmethod
38
+ def create_empty_frame(cls, frame_idx: int) -> 'TrainingData':
39
+ """Create a frame with no hand detection"""
40
+ return cls(
41
+ frame_idx=frame_idx,
42
+ valid=False,
43
+ action_pos_left=np.zeros((3,)),
44
+ action_orixyzw_left=np.zeros((4,)),
45
+ action_pos_right=np.zeros((3,)),
46
+ action_orixyzw_right=np.zeros((4,)),
47
+ action_gripper_left=0,
48
+ action_gripper_right=0,
49
+ gripper_width_left=0,
50
+ gripper_width_right=0,
51
+ )
52
+
53
+ class TrainingDataSequence(LazyLoadingMixin):
54
+ """Container for a sequence of training data"""
55
+ def __init__(self):
56
+ self.frames: List[TrainingData] = []
57
+ self.metadata: Dict = {}
58
+
59
+ self._frame_indices: Optional[np.ndarray] = None
60
+ self._valid: Optional[np.ndarray] = None
61
+ self._action_pos_left: Optional[np.ndarray] = None
62
+ self._action_orixyzw_left: Optional[np.ndarray] = None
63
+ self._action_pos_right: Optional[np.ndarray] = None
64
+ self._action_orixyzw_right: Optional[np.ndarray] = None
65
+ self._action_gripper_left: Optional[np.ndarray] = None
66
+ self._action_gripper_right: Optional[np.ndarray] = None
67
+ self._gripper_width_left: Optional[np.ndarray] = None
68
+ self._gripper_width_right: Optional[np.ndarray] = None
69
+
70
+ def add_frame(self, frame: TrainingData) -> None:
71
+ """Add a frame to the sequence and invalidate cached properties."""
72
+ self.frames.append(frame)
73
+ self._invalidate_cache()
74
+
75
+ def save(self, path: str) -> None:
76
+ """Save the sequence to disk in both frame-wise and sequence-wise formats"""
77
+
78
+ sequence_data = {
79
+ 'frame_indices': self.frame_indices,
80
+ 'valid': self.valid,
81
+ 'action_pos_left': self.action_pos_left,
82
+ 'action_orixyzw_left': self.action_orixyzw_left,
83
+ 'action_pos_right': self.action_pos_right,
84
+ 'action_orixyzw_right': self.action_orixyzw_right,
85
+ 'action_gripper_left': self.action_gripper_left,
86
+ 'action_gripper_right': self.action_gripper_right,
87
+ 'gripper_width_left': self.gripper_width_left,
88
+ 'gripper_width_right': self.gripper_width_right,
89
+ }
90
+
91
+ np.savez_compressed(
92
+ path,
93
+ **sequence_data
94
+ )
95
+
96
+ @property
97
+ def frame_indices(self) -> np.ndarray:
98
+ """Lazy loading of all frame indices"""
99
+ return self._get_cached_property(
100
+ '_frame_indices',
101
+ lambda: np.arange(len(self.frames))
102
+ )
103
+
104
+ @property
105
+ def valid(self) -> np.ndarray:
106
+ """Lazy loading of all valid flags"""
107
+ return self._get_cached_property(
108
+ '_valid',
109
+ lambda: np.stack([f.valid for f in self.frames])
110
+ )
111
+
112
+ @property
113
+ def action_pos_left(self) -> np.ndarray:
114
+ """Lazy loading of all action positions"""
115
+ return self._get_cached_property(
116
+ '_action_pos_left',
117
+ lambda: np.stack([f.action_pos_left for f in self.frames])
118
+ )
119
+
120
+ @property
121
+ def action_orixyzw_left(self) -> np.ndarray:
122
+ """Lazy loading of all action orientations"""
123
+ return self._get_cached_property(
124
+ '_action_orixyzw_left',
125
+ lambda: np.stack([f.action_orixyzw_left for f in self.frames])
126
+ )
127
+
128
+ @property
129
+ def action_pos_right(self) -> np.ndarray:
130
+ """Lazy loading of all action positions"""
131
+ return self._get_cached_property(
132
+ '_action_pos_right',
133
+ lambda: np.stack([f.action_pos_right for f in self.frames])
134
+ )
135
+
136
+ @property
137
+ def action_orixyzw_right(self) -> np.ndarray:
138
+ """Lazy loading of all action orientations"""
139
+ return self._get_cached_property(
140
+ '_action_orixyzw_right',
141
+ lambda: np.stack([f.action_orixyzw_right for f in self.frames])
142
+ )
143
+
144
+ @property
145
+ def action_gripper_left(self) -> np.ndarray:
146
+ """Lazy loading of all action gripper distances"""
147
+ return self._get_cached_property(
148
+ '_action_gripper_left',
149
+ lambda: np.stack([f.action_gripper_left for f in self.frames])
150
+ )
151
+
152
+ @property
153
+ def action_gripper_right(self) -> np.ndarray:
154
+ """Lazy loading of all action gripper distances"""
155
+ return self._get_cached_property(
156
+ '_action_gripper_right',
157
+ lambda: np.stack([f.action_gripper_right for f in self.frames])
158
+ )
159
+
160
+ @property
161
+ def gripper_width_left(self) -> np.ndarray:
162
+ """Lazy loading of all gripper widths"""
163
+ return self._get_cached_property(
164
+ '_gripper_width_left',
165
+ lambda: np.stack([f.gripper_width_left for f in self.frames])
166
+ )
167
+
168
+ @property
169
+ def gripper_width_right(self) -> np.ndarray:
170
+ """Lazy loading of all gripper widths"""
171
+ return self._get_cached_property(
172
+ '_gripper_width_right',
173
+ lambda: np.stack([f.gripper_width_right for f in self.frames])
174
+ )
175
+
176
+ def _invalidate_cache(self):
177
+ """Invalidate all cached properties."""
178
+ self._frame_indices = None
179
+ self._valid = None
180
+ self._action_pos_left = None
181
+ self._action_orixyzw_left = None
182
+ self._action_pos_right = None
183
+ self._action_orixyzw_right = None
184
+ self._action_gripper_left = None
185
+ self._action_gripper_right = None
186
+ self._gripper_width_left = None
187
+ self._gripper_width_right = None
188
+
189
+ @classmethod
190
+ def load(cls, path: str) -> 'TrainingDataSequence':
191
+ """Load a sequence from disk"""
192
+ data = np.load(path, allow_pickle=True)
193
+ sequence = cls()
194
+
195
+ sequence._frame_indices = data['frame_indices']
196
+ sequence._valid = data['valid']
197
+ sequence._action_pos_left = data['action_pos_left']
198
+ sequence._action_orixyzw_left = data['action_orixyzw_left']
199
+ sequence._action_pos_right = data['action_pos_right']
200
+ sequence._action_orixyzw_right = data['action_orixyzw_right']
201
+ sequence._action_gripper_left = data['action_gripper_left']
202
+ sequence._action_gripper_right = data['action_gripper_right']
203
+ sequence._gripper_width_left = data['gripper_width_left']
204
+ sequence._gripper_width_right = data['gripper_width_right']
205
+
206
+ return sequence
207
+
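+ # Illustrative round trip (the file name is hypothetical):
+ #   seq = TrainingDataSequence()
+ #   seq.add_frame(TrainingData.create_empty_frame(0))
+ #   seq.save("training_data.npz")
+ #   loaded = TrainingDataSequence.load("training_data.npz")
+ #   loaded.action_pos_left.shape  # -> (1, 3)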
208
+ @dataclass
209
+ class HandFrame:
210
+ """Data structure for a single frame of hand data"""
211
+ frame_idx: int
212
+ hand_detected: bool
213
+ img_rgb: np.ndarray
214
+ img_hamer: np.ndarray
215
+ kpts_2d: np.ndarray # shape: (N, 2)
216
+ kpts_3d: np.ndarray # shape: (N, 3)
217
+
218
+ @classmethod
219
+ def create_empty_frame(cls, frame_idx: int, img_rgb: np.ndarray) -> 'HandFrame':
220
+ """Create a frame with no hand detection"""
221
+ return cls(
222
+ frame_idx=frame_idx,
223
+ hand_detected=False,
224
+ img_rgb=img_rgb,
225
+ img_hamer=np.zeros_like(img_rgb),
226
+ kpts_2d=np.zeros((21, 2)),
227
+ kpts_3d=np.zeros((21, 3)),
228
+ )
229
+
230
+ class HandSequence(LazyLoadingMixin):
231
+ """Container for a sequence of hand data"""
232
+ def __init__(self):
233
+ self.frames: List[HandFrame] = []
234
+ self.metadata: Dict = {}
235
+
236
+ self._frame_indices: Optional[np.ndarray] = None
237
+ self._hand_detected: Optional[np.ndarray] = None
238
+ self._img_rgb: Optional[np.ndarray] = None
239
+ self._img_hamer: Optional[np.ndarray] = None
240
+ self._kpts_2d: Optional[np.ndarray] = None
241
+ self._kpts_3d: Optional[np.ndarray] = None
242
+
243
+ def add_frame(self, frame: HandFrame) -> None:
244
+ """Add a frame to the sequence and invalidate cached properties."""
245
+ self.frames.append(frame)
246
+ self._invalidate_cache()
247
+
248
+ def get_frame(self, frame_idx: int) -> HandFrame:
249
+ """Get a frame by index."""
250
+ return self.frames[frame_idx]
251
+
252
+ def modify_frame(self, frame_idx: int, frame: HandFrame) -> None:
253
+ """Modify a frame at the given index and invalidate cached properties."""
254
+ self.frames[frame_idx] = frame
255
+ self._invalidate_cache()
256
+
257
+ def save(self, path: str) -> None:
258
+ """Save the sequence to disk in both frame-wise and sequence-wise formats"""
259
+ sequence_data = {
260
+ 'hand_detected': self.hand_detected,
261
+ 'kpts_2d': self.kpts_2d,
262
+ 'kpts_3d': self.kpts_3d,
263
+ 'frame_indices': self.frame_indices,
264
+ }
265
+
266
+ np.savez_compressed(
267
+ path,
268
+ **sequence_data
269
+ )
270
+
271
+ @property
272
+ def frame_indices(self) -> np.ndarray:
273
+ """Lazy loading of all frame indices"""
274
+ return self._get_cached_property(
275
+ '_frame_indices',
276
+ lambda: np.arange(len(self.frames))
277
+ )
278
+
279
+ @property
280
+ def hand_detected(self) -> np.ndarray:
281
+ """Lazy loading of all hand detection flags"""
282
+ return self._get_cached_property(
283
+ '_hand_detected',
284
+ lambda: np.stack([f.hand_detected for f in self.frames])
285
+ )
286
+
287
+ @property
288
+ def imgs_rgb(self) -> np.ndarray:
289
+ """Lazy loading of all RGB images"""
290
+ return self._get_cached_property(
291
+ '_img_rgb',
292
+ lambda: np.stack([f.img_rgb for f in self.frames])
293
+ )
294
+
295
+ @property
296
+ def imgs_hamer(self) -> np.ndarray:
297
+ """Lazy loading of all HAMER images"""
298
+ return self._get_cached_property(
299
+ '_img_hamer',
300
+ lambda: np.stack([f.img_hamer for f in self.frames])
301
+ )
302
+
303
+ @property
304
+ def kpts_2d(self) -> np.ndarray:
305
+ """Lazy loading of all 2D keypoints"""
306
+ return self._get_cached_property(
307
+ '_kpts_2d',
308
+ lambda: np.stack([f.kpts_2d for f in self.frames])
309
+ )
310
+
311
+ @property
312
+ def kpts_3d(self) -> np.ndarray:
313
+ """Lazy loading of all 3D keypoints"""
314
+ return self._get_cached_property(
315
+ '_kpts_3d',
316
+ lambda: np.stack([f.kpts_3d for f in self.frames])
317
+ )
318
+
319
+ @classmethod
320
+ def load(cls, path: str) -> 'HandSequence':
321
+ """Load a sequence from disk"""
322
+ data = np.load(path, allow_pickle=True)
323
+ sequence = cls()
324
+
325
+ # Load pre-computed sequence-wise data
326
+ sequence._frame_indices = data['frame_indices']
327
+ sequence._hand_detected = data['hand_detected']
328
+ sequence._kpts_2d = data['kpts_2d']
329
+ sequence._kpts_3d = data['kpts_3d']
330
+
331
+ return sequence
332
+
333
+ def _invalidate_cache(self):
334
+ """Invalidate all cached properties."""
335
+ self._frame_indices = None
336
+ self._hand_detected = None
337
+ self._img_rgb = None
338
+ self._img_hamer = None
339
+ self._kpts_2d = None
340
+ self._kpts_3d = None
phantom/phantom/processors/robotinpaint_processor.py ADDED
@@ -0,0 +1,785 @@
1
+ """
2
+ Robot Inpainting Processor Module
3
+
4
+ This module uses MuJoCo to render robot models and overlay them onto human demonstration videos.
5
+
6
+ Processing Pipeline:
7
+ 1. Load smoothed robot trajectories from previous processing stages
8
+ 2. Initialize MuJoCo robot simulation with calibrated camera parameters
9
+ 3. For each frame:
10
+ - Move simulated robot to target pose from human demonstration
11
+ - Render robot from calibrated camera viewpoint
12
+ - Apply depth-based occlusion handling (Optional)
13
+ - Create robot overlay on human demonstration video
14
+ 4. Generate training data with robot state annotations
15
+ 5. Save robot-inpainted videos and training data
16
+ """
17
+
18
+ import os
19
+ import pdb
20
+ import numpy as np
21
+ import cv2
22
+ from tqdm import tqdm
23
+ import mediapy as media
24
+ from scipy.spatial.transform import Rotation
25
+ from typing import Tuple, Dict, List, Optional, Any, Union
26
+ import logging
27
+ from dataclasses import dataclass
28
+
29
+ from phantom.processors.phantom_data import TrainingData, TrainingDataSequence, HandSequence
30
+ from phantom.processors.base_processor import BaseProcessor
31
+ from phantom.twin_bimanual_robot import TwinBimanualRobot, MujocoCameraParams
32
+ from phantom.twin_robot import TwinRobot
33
+ from phantom.processors.paths import Paths
34
+
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+ @dataclass
39
+ class RobotState:
40
+ """
41
+ Container for robot state data including pose and gripper configuration.
42
+
43
+ Attributes:
44
+ pos: 3D position coordinates in world frame
45
+ ori_xyzw: Quaternion orientation in XYZW format (scalar-last)
46
+ gripper_pos: Gripper opening distance or action value
47
+ """
48
+ pos: np.ndarray
49
+ ori_xyzw: np.ndarray
50
+ gripper_pos: float
51
+
52
+ class RobotInpaintProcessor(BaseProcessor):
53
+ """
54
+ Uses MuJoCo to overlay a simulated robot onto the human-inpainted images.
55
+ """
56
+ # Processing constants for quality control and output formatting
57
+ TRACKING_ERROR_THRESHOLD = 0.05 # Maximum tracking error in meters
58
+ DEFAULT_FPS = 15 # Standard frame rate for output videos
59
+ DEFAULT_CODEC = "ffv1" # Lossless codec for high-quality output
60
+
61
+ def __init__(self, args: Any) -> None:
62
+ """
63
+ Initialize the robot inpainting processor with simulation parameters.
64
+
65
+ Args:
66
+ args: Command line arguments containing robot configuration,
67
+ camera parameters, and processing options
68
+ """
69
+ super().__init__(args)
70
+ self.use_depth = self.depth_for_overlay
71
+ self._initialize_robot()
72
+
73
+ def _initialize_robot(self) -> None:
74
+ """
75
+ Initialize the twin robot simulation with calibrated camera parameters.
76
+ """
77
+ # Generate MuJoCo camera parameters from real-world calibration
78
+ camera_params = self._get_mujoco_camera_params()
79
+ img_w, img_h = self._get_image_dimensions()
80
+
81
+ # Initialize appropriate robot configuration
82
+ if self.bimanual_setup == "single_arm":
83
+ self.twin_robot = TwinRobot(
84
+ self.robot,
85
+ self.gripper,
86
+ camera_params,
87
+ camera_height=img_h,
88
+ camera_width=img_w,
89
+ render=self.render,
90
+ n_steps_short=3,
91
+ n_steps_long=75,
92
+ debug_cameras=self.debug_cameras,
93
+ square=self.square,
94
+ )
95
+ else:
96
+ self.twin_robot = TwinBimanualRobot(
97
+ self.robot,
98
+ self.gripper,
99
+ self.bimanual_setup,
100
+ camera_params,
101
+ camera_height=img_h,
102
+ camera_width=img_w,
103
+ render=self.render,
104
+ n_steps_short=10,
105
+ n_steps_long=75,
106
+ debug_cameras=self.debug_cameras,
107
+ epic=self.epic,
108
+ joint_controller=False, # Use operational-space control
109
+ )
110
+
111
+ def __del__(self):
112
+ """Clean up robot simulation resources."""
113
+ if hasattr(self, 'twin_robot'):
114
+ self.twin_robot.close()
115
+
116
+ def process_one_demo(self, data_sub_folder: str) -> None:
117
+ """
118
+ Process a single demonstration to create robot-inpainted visualization.
119
+
120
+ Args:
121
+ data_sub_folder: Path to demonstration data folder containing
122
+ smoothed trajectories and original video data
123
+ """
124
+ save_folder = self.get_save_folder(data_sub_folder)
125
+ if self._should_skip_processing(save_folder):
126
+ return
127
+ paths = self.get_paths(save_folder)
128
+
129
+ # Reinitialize robot simulation for each demo to ensure clean state
130
+ self.twin_robot.close()
131
+ self._initialize_robot()
132
+
133
+ # Load and prepare demonstration data
134
+ data = self._load_data(paths)
135
+ images = self._load_images(paths, data["union_indices"])
136
+ gripper_actions, gripper_widths = self._process_gripper_widths(paths, data)
137
+
138
+ # Process all frames to generate robot overlays and training data
139
+ sequence, img_overlay, img_birdview = self._process_frames(images, data, gripper_actions, gripper_widths)
140
+
141
+ # Save comprehensive results
142
+ self._save_results(paths, sequence, img_overlay, img_birdview)
143
+
144
+ def _process_frames(self, images: Dict[str, np.ndarray], data: Dict[str, np.ndarray],
145
+ gripper_actions: Dict[str, np.ndarray], gripper_widths: Dict[str, np.ndarray]) -> Tuple[TrainingDataSequence, List[np.ndarray], Optional[List[np.ndarray]]]:
146
+ """
147
+ Process each frame to generate robot overlays and training data.
148
+
149
+ Args:
150
+ images: Dictionary containing human demonstration images and masks
151
+ data: Robot trajectory data (positions and orientations)
152
+ gripper_actions: Processed gripper action commands
153
+ gripper_widths: Gripper opening distances
154
+
155
+ Returns:
156
+ Tuple containing:
157
+ - TrainingDataSequence with robot state annotations
158
+ - List of robot overlay images
159
+ - Optional list of bird's eye view images (if debug cameras enabled)
160
+ """
161
+ sequence = TrainingDataSequence()
162
+ img_overlay = []
163
+ img_birdview = None
164
+ if "birdview" in self.debug_cameras:
165
+ img_birdview = []
166
+
167
+ for idx in tqdm(range(len(images['human_imgs'])), desc="Processing frames"):
168
+ # Extract robot states for current frame
169
+ left_state = self._get_robot_state(
170
+ data['ee_pts_left'][idx],
171
+ data['ee_oris_left'][idx],
172
+ gripper_widths['left'][idx]
173
+ )
174
+ right_state = self._get_robot_state(
175
+ data['ee_pts_right'][idx],
176
+ data['ee_oris_right'][idx],
177
+ gripper_widths['right'][idx]
178
+ )
179
+
180
+ # Process individual frame with robot simulation
181
+ frame_results = self._process_single_frame(
182
+ images, left_state, right_state, idx
183
+ )
184
+
185
+ # Handle failed processing (tracking errors, simulation issues)
186
+ if frame_results is None:
187
+ print(f"sdfsdfsTracking error too large at frame {idx}, skipping")
188
+ sequence.add_frame(TrainingData.create_empty_frame(
189
+ frame_idx=idx,
190
+ ))
191
+ img_overlay.append(np.zeros_like(images['human_imgs'][idx]))
192
+ if "birdview" in self.debug_cameras:
193
+ img_birdview.append(np.zeros_like(images['human_imgs'][idx]))
194
+ else:
195
+ # Create comprehensive training data annotation
196
+ sequence.add_frame(TrainingData(
197
+ frame_idx=idx,
198
+ valid=True,
199
+ action_pos_left=left_state.pos,
200
+ action_orixyzw_left=left_state.ori_xyzw,
201
+ action_pos_right=right_state.pos,
202
+ action_orixyzw_right=right_state.ori_xyzw,
203
+ action_gripper_left=gripper_actions['left'][idx],
204
+ action_gripper_right=gripper_actions['right'][idx],
205
+ gripper_width_left=gripper_widths['left'][idx],
206
+ gripper_width_right=gripper_widths['right'][idx],
207
+ ))
208
+ img_overlay.append(frame_results['rgb_robot_overlay'])
209
+ if "birdview" in self.debug_cameras:
210
+ img_birdview.append(frame_results['birdview_img'])
211
+ return sequence, img_overlay, img_birdview
212
+
213
+
214
+ def _process_single_frame(self, images: Dict[str, np.ndarray],
215
+ left_state: RobotState,
216
+ right_state: RobotState,
217
+ idx: int) -> Optional[Dict[str, np.ndarray]]:
218
+ """
219
+ Process a single frame to generate robot overlay and validate tracking.
220
+
221
+ Args:
222
+ images: Dictionary containing human images and segmentation data
223
+ left_state: Target state for left robot arm
224
+ right_state: Target state for right robot arm
225
+ idx: Frame index for initialization and logging
226
+
227
+ Returns:
228
+ Dictionary containing rendered robot overlay and debug camera views,
229
+ or None if tracking error exceeds threshold
230
+ """
231
+ # Prepare robot target state based on configuration
232
+ if self.bimanual_setup == "single_arm":
233
+ if self.target_hand == "left":
234
+ target_state = {
235
+ "pos": left_state.pos,
236
+ "ori_xyzw": left_state.ori_xyzw,
237
+ "gripper_pos": left_state.gripper_pos,
238
+ }
239
+ else:
240
+ target_state = {
241
+ "pos": right_state.pos,
242
+ "ori_xyzw": right_state.ori_xyzw,
243
+ "gripper_pos": right_state.gripper_pos,
244
+ }
245
+ else:
246
+ # Bimanual configuration requires coordinated control
247
+ target_state = {
248
+ "pos": [right_state.pos, left_state.pos],
249
+ "ori_xyzw": [right_state.ori_xyzw, left_state.ori_xyzw],
250
+ "gripper_pos": [right_state.gripper_pos, left_state.gripper_pos],
251
+ }
252
+
253
+ # Move robot to target state and get simulation results
254
+ robot_results = self.twin_robot.move_to_target_state(
255
+ target_state, init=(idx == 0) # Initialize on first frame
256
+ )
257
+
258
+ # Validate tracking accuracy to ensure quality
259
+ if self.bimanual_setup == "single_arm":
260
+ if robot_results['pos_err'] > self.TRACKING_ERROR_THRESHOLD:
261
+ print(f"Tracking error too large at frame {idx}, skipping", robot_results['pos_err'])
262
+ logger.warning(f"Tracking error too large at frame {idx}, skipping")
263
+ return None
264
+ else:
265
+ if robot_results['left_pos_err'] > self.TRACKING_ERROR_THRESHOLD or robot_results['right_pos_err'] > self.TRACKING_ERROR_THRESHOLD:
266
+ logger.warning(f"Tracking error too large at frame {idx}, skipping")
267
+ return None
268
+
269
+ # Generate robot overlay using appropriate method
270
+ if self.use_depth:
271
+ rgb_robot_overlay = self._process_robot_overlay_with_depth(
272
+ images['human_imgs'][idx],
273
+ images['human_masks'][idx],
274
+ images['imgs_depth'][idx],
275
+ robot_results
276
+ )
277
+ else:
278
+ rgb_robot_overlay = self._process_robot_overlay(
279
+ images['human_imgs'][idx], robot_results
280
+ )
281
+
282
+ # Prepare output with main overlay and debug camera views
283
+ output = {
284
+ 'rgb_robot_overlay': rgb_robot_overlay,
285
+ }
286
+
287
+ # Add debug camera views if requested
288
+ for cam in self.debug_cameras:
289
+ output[f"{cam}_img"] = (robot_results[f"{cam}_img"] * 255).astype(np.uint8)
290
+
291
+ return output
292
+
293
+ def _should_skip_processing(self, save_folder: str) -> bool:
294
+ """
295
+ Check if processing should be skipped due to existing output files.
296
+
297
+ Args:
298
+ save_folder: Directory where output files would be saved
299
+
300
+ Returns:
301
+ True if processing should be skipped, False otherwise
302
+ """
303
+ if self.skip_existing:
304
+ try:
305
+ with os.scandir(save_folder) as it:
306
+ existing_files = {entry.name for entry in it if entry.is_file()}
307
+ if str("video_overlay"+f"_{self.robot}_{self.bimanual_setup}.mkv") in existing_files:
308
+ print(f"Skipping existing demo {save_folder}")
309
+ return True
310
+ except FileNotFoundError:
311
+ return False
312
+ return False
313
+
314
+ def _load_data(self, paths: Paths) -> Dict[str, np.ndarray]:
315
+ """
316
+ Load robot trajectory data from smoothed action files.
317
+
318
+ Args:
319
+ paths: Paths object containing file locations
320
+
321
+ Returns:
322
+ Dictionary containing robot trajectory data and frame indices
323
+ """
324
+ if self.bimanual_setup == "single_arm":
325
+ # Get paths based on target hand for single-arm operation
326
+ smoothed_base = getattr(paths, f"smoothed_actions_{self.target_hand}")
327
+ actions_base = getattr(paths, f"actions_{self.target_hand}")
328
+ smoothed_actions_path = str(smoothed_base).replace(".npz", f"_{self.bimanual_setup}.npz")
329
+ actions_path = str(actions_base).replace(".npz", f"_{self.bimanual_setup}.npz")
330
+
331
+ # Load actual trajectory data for target hand
332
+ ee_pts = np.load(smoothed_actions_path)["ee_pts"]
333
+ ee_oris = np.load(smoothed_actions_path)["ee_oris"]
334
+
335
+ # Create dummy data for non-target hand
336
+ dummy_pts = np.zeros((len(ee_pts), 3))
337
+ dummy_oris = np.eye(3)[None, :, :].repeat(len(ee_oris), axis=0)
338
+
339
+ # Create data dictionary with target hand data and dummy data for other hand
340
+ other_hand = "right" if self.target_hand == "left" else "left"
341
+ return {
342
+ f'ee_pts_{self.target_hand}': ee_pts,
343
+ f'ee_oris_{self.target_hand}': ee_oris,
344
+ f'ee_pts_{other_hand}': dummy_pts,
345
+ f'ee_oris_{other_hand}': dummy_oris,
346
+ 'union_indices': np.load(actions_path, allow_pickle=True)["union_indices"]
347
+ }
348
+
349
+ # Load bimanual trajectory data
350
+ smoothed_actions_left_path = str(paths.smoothed_actions_left).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
351
+ smoothed_actions_right_path = str(paths.smoothed_actions_right).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
352
+ actions_left_path = str(paths.actions_left).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
353
+ return {
354
+ 'ee_pts_left': np.load(smoothed_actions_left_path)["ee_pts"],
355
+ 'ee_oris_left': np.load(smoothed_actions_left_path)["ee_oris"],
356
+ 'ee_pts_right': np.load(smoothed_actions_right_path)["ee_pts"],
357
+ 'ee_oris_right': np.load(smoothed_actions_right_path)["ee_oris"],
358
+ 'union_indices': np.load(actions_left_path, allow_pickle=True)["union_indices"]
359
+ }
360
+
361
+ def _load_images(self, paths: Paths, union_indices: np.ndarray) -> Dict[str, np.ndarray]:
362
+ """
363
+ Load and index human demonstration images and associated data.
364
+
365
+ Args:
366
+ paths: Paths object containing image file locations
367
+ union_indices: Frame indices to extract from full video sequences
368
+
369
+ Returns:
370
+ Dictionary containing indexed human images, masks, and depth data
371
+ """
372
+ return {
373
+ 'human_masks': np.load(paths.masks_arm)[union_indices],
374
+ 'human_imgs': np.array(media.read_video(paths.video_human_inpaint))[union_indices],
375
+ 'imgs_depth': np.load(paths.depth)[union_indices] if self.use_depth else None
376
+ }
377
+
378
+ def _process_gripper_widths(self, paths: Paths, data: Dict[str, np.ndarray]) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
379
+ """
380
+ Process gripper distance data into robot action commands.
381
+
382
+ Args:
383
+ paths: Paths object containing smoothed action file locations
384
+ data: Dictionary containing trajectory data and frame indices
385
+
386
+ Returns:
387
+ Tuple containing:
388
+ - Dictionary of gripper action commands for each hand
389
+ - Dictionary of gripper width values for each hand
390
+ """
391
+ if self.bimanual_setup == "single_arm":
392
+ # Get the appropriate smoothed actions path based on target hand
393
+ base_path = getattr(paths, f"smoothed_actions_{self.target_hand}")
394
+ smoothed_actions_path = str(base_path).replace(".npz", f"_{self.bimanual_setup}.npz")
395
+
396
+ # Compute gripper actions and widths from smoothed data
397
+ actions, widths = self._compute_gripper_actions(
398
+ np.load(smoothed_actions_path)["ee_widths"]
399
+ )
400
+
401
+ # Create return dictionaries with actions for target hand, zeros for the other
402
+ num_indices = len(data['union_indices'])
403
+ other_hand = "right" if self.target_hand == "left" else "left"
404
+
405
+ return (
406
+ {self.target_hand: actions, other_hand: np.zeros(num_indices)},
407
+ {self.target_hand: widths, other_hand: np.zeros(num_indices)}
408
+ )
409
+
410
+ # Process bimanual gripper data
411
+ smoothed_actions_left_path = str(paths.smoothed_actions_left).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
412
+ smoothed_actions_right_path = str(paths.smoothed_actions_right).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
413
+ left_actions, left_widths = self._compute_gripper_actions(
414
+ np.load(smoothed_actions_left_path)["ee_widths"]
415
+ )
416
+ right_actions, right_widths = self._compute_gripper_actions(
417
+ np.load(smoothed_actions_right_path)["ee_widths"]
418
+ )
419
+ return {'left': left_actions, 'right': right_actions}, {'left': left_widths, 'right': right_widths}
420
+
421
+
422
+ def _compute_gripper_actions(self, list_gripper_dist: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
423
+ """
424
+ Convert continuous gripper distances to discrete robot gripper actions.
425
+ Args:
426
+ list_gripper_dist: Array of gripper distances throughout trajectory
427
+
428
+ Returns:
429
+ Tuple containing:
430
+ - Gripper action commands (0 for grasp, distance for open)
431
+ - Processed gripper width values
432
+ """
433
+ try:
434
+ # Analyze gripper distance range and determine grasp threshold
435
+ min_val, max_val = np.min(list_gripper_dist), np.max(list_gripper_dist)
436
+ thresh = min_val + 0.2 * (max_val - min_val) # 20% above minimum
437
+
438
+ # Classify gripper states: 0 = closed/grasping, 1 = open
439
+ gripper_state = np.array([0 if dist < thresh else 1 for dist in list_gripper_dist])
440
+
441
+ # Find range of grasping action
442
+ min_idx_pos = np.where(gripper_state == 0)[0][0]
443
+ max_idx_pos = np.where(gripper_state == 0)[0][-1]
444
+
445
+ # Generate gripper action commands
446
+ list_gripper_actions = []
447
+ for idx in range(len(list_gripper_dist)):
448
+ if min_idx_pos <= idx <= max_idx_pos:
449
+ # During grasping phase: use grasp command (0) and limit distance
450
+ list_gripper_actions.append(0)
451
+ list_gripper_dist[idx] = np.min([list_gripper_dist[idx], thresh])
452
+ else:
453
+ # Outside grasping phase: use distance as action command
454
+ list_gripper_actions.append(list_gripper_dist[idx])
455
+ except (IndexError, ValueError):  # no grasping phase detected in the trajectory
456
+ # Fallback: use distances directly if processing fails
457
+ list_gripper_actions = list_gripper_dist.tolist()
458
+
459
+ return np.array(list_gripper_actions), list_gripper_dist
460
+
461
+ def _get_robot_state(self, ee_pt: np.ndarray, ori_matrix: np.ndarray, gripper_dist: float) -> RobotState:
462
+ """
463
+ Convert trajectory data to robot state representation.
464
+
465
+ Args:
466
+ ee_pt: End-effector position in 3D space
467
+ ori_matrix: 3x3 rotation matrix for end-effector orientation
468
+ gripper_dist: Gripper opening distance
469
+
470
+ Returns:
471
+ RobotState object containing pose and gripper information
472
+ """
473
+ # Convert rotation matrix to quaternion (XYZW format for robot control)
474
+ ori_xyzw = Rotation.from_matrix(ori_matrix).as_quat(scalar_first=False)
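+ # e.g. an identity rotation matrix maps to [0., 0., 0., 1.] in (x, y, z, w) order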
475
+ robot_state = RobotState(pos=ee_pt, ori_xyzw=ori_xyzw, gripper_pos=gripper_dist)
476
+ return robot_state
477
+
478
+ def _process_robot_overlay(self, img: np.ndarray, robot_results: Dict[str, Any]) -> np.ndarray:
479
+ """
480
+ Create robot overlay on human image using segmentation masks.
481
+
482
+ Args:
483
+ img: Original human demonstration image
484
+ robot_results: Dictionary containing robot rendering results
485
+
486
+ Returns:
487
+ Image with robot overlay applied
488
+ """
489
+ # Extract robot rendering and segmentation data
490
+ rgb_img_sim = (robot_results['rgb_img'] * 255).astype(np.uint8)
491
+ H, W = rgb_img_sim.shape[:2]
492
+
493
+ # Resize robot rendering and masks to match output resolution
494
+ if self.square:
+ resize_shape = (self.output_resolution, self.output_resolution)
+ else:
+ resize_shape = (int(W/H*self.output_resolution), self.output_resolution)
+ rgb_img_sim = cv2.resize(rgb_img_sim, resize_shape)
+ robot_mask = cv2.resize(robot_results['robot_mask'], resize_shape)
+ robot_mask[robot_mask > 0] = 1
+ gripper_mask = cv2.resize(robot_results['gripper_mask'], resize_shape)
+ gripper_mask[gripper_mask > 0] = 1
506
+
507
+ # Create overlay by compositing robot over human image
508
+ img_robot_overlay = img.copy()
509
+ overlay_mask = (robot_mask == 1) | (gripper_mask == 1)
510
+ img_robot_overlay[overlay_mask] = rgb_img_sim[overlay_mask]
511
+
512
+ return img_robot_overlay
513
+
514
+ def _process_robot_overlay_with_depth(self, img: np.ndarray, hand_mask: np.ndarray,
515
+ img_depth: np.ndarray, robot_results: Dict[str, Any]) -> np.ndarray:
516
+ """
517
+ Create depth-aware robot overlay with realistic occlusion handling.
518
+
519
+ Args:
520
+ img: Original human demonstration image
521
+ hand_mask: Segmentation mask of human hand regions
522
+ img_depth: Depth image corresponding to the demonstration
523
+ robot_results: Dictionary containing robot rendering and depth results
524
+
525
+ Returns:
526
+ Image with depth-aware robot overlay applied
527
+ """
528
+ # Extract robot rendering and depth data
529
+ robot_mask = robot_results['robot_mask']
530
+ gripper_mask = robot_results['gripper_mask']
531
+ rgb_img_sim = robot_results['rgb_img']
532
+ depth_img_sim = np.squeeze(robot_results['depth_img'])
533
+ H, W = rgb_img_sim.shape[:2]
534
+
535
+ # Create masked depth images for occlusion analysis
536
+ depth_sim_masked = self._create_masked_depth(depth_img_sim, robot_mask, gripper_mask)
537
+ depth_masked = self._create_masked_depth(img_depth, robot_mask, gripper_mask)
538
+
539
+ # Process hand mask for improved occlusion handling
540
+ hand_mask = self._dilate_mask(hand_mask.astype(np.uint8))
541
+
542
+ # Create overlay mask using depth-based occlusion
543
+ img_robot_overlay = img.copy()
544
+ overlay_mask = self._create_overlay_mask(
545
+ robot_mask, gripper_mask, depth_masked, depth_sim_masked, hand_mask
546
+ )
547
+
548
+ # Convert and resize robot rendering
549
+ rgb_img_sim = (rgb_img_sim * 255).astype(np.uint8)
550
+
551
+ if self.square:
552
+ resize_shape = (self.output_resolution, self.output_resolution)
553
+ else:
554
+ resize_shape = (int(W/H*self.output_resolution), self.output_resolution)
555
+
556
+ # Apply final overlay with depth-aware occlusion
557
+ rgb_img_sim = cv2.resize(rgb_img_sim, resize_shape)
558
+ overlay_mask = cv2.resize(overlay_mask.astype(np.uint8), resize_shape)
559
+ overlay_mask[overlay_mask > 0] = 1
560
+ overlay_mask = overlay_mask.astype(bool)
561
+
562
+ img_robot_overlay[overlay_mask] = rgb_img_sim[overlay_mask]
563
+
564
+ return img_robot_overlay
565
+
566
+ def _create_masked_depth(self, depth_img: np.ndarray, robot_mask: np.ndarray,
567
+ gripper_mask: np.ndarray) -> np.ndarray:
568
+ """
569
+ Create depth image masked to robot regions for occlusion analysis.
570
+
571
+ Args:
572
+ depth_img: Input depth image
573
+ robot_mask: Binary mask indicating robot regions
574
+ gripper_mask: Binary mask indicating gripper regions
575
+
576
+ Returns:
577
+ Depth image with values only in robot/gripper regions
578
+ """
579
+ masked_img = np.zeros_like(depth_img)
580
+ mask = (robot_mask == 1) | (gripper_mask == 1)
581
+ masked_img[mask] = depth_img[mask]
582
+ return masked_img
583
+
584
+ def _dilate_mask(self, mask: np.ndarray) -> np.ndarray:
585
+ """
586
+ Apply morphological dilation to expand mask boundaries.
587
+
588
+ Args:
589
+ mask: Binary mask to dilate
590
+
591
+ Returns:
592
+ Dilated binary mask
593
+ """
594
+ kernel = np.ones((5, 5), np.uint8)
595
+ return cv2.dilate(mask, kernel, iterations=1)
596
+
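A quick illustration of what the 5x5 dilation above does to a mask (standalone sketch; the kernel matches _dilate_mask):

import numpy as np
import cv2

# A single foreground pixel grows into a 5x5 block after one dilation pass
mask = np.zeros((7, 7), np.uint8)
mask[3, 3] = 1
dilated = cv2.dilate(mask, np.ones((5, 5), np.uint8), iterations=1)
print(int(mask.sum()), int(dilated.sum()))  # 1 25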
597
+ def _create_overlay_mask(self, robot_mask: np.ndarray, gripper_mask: np.ndarray,
598
+ depth_masked: np.ndarray, depth_sim_masked: np.ndarray,
599
+ hand_mask: np.ndarray) -> np.ndarray:
600
+ """
601
+ Create the overlay mask using depth-based occlusion reasoning.
602
+
603
+ Args:
604
+ robot_mask: Binary mask for robot body regions
605
+ gripper_mask: Binary mask for robot gripper regions
606
+ depth_masked: Real depth image masked to robot regions
607
+ depth_sim_masked: Simulated robot depth masked to robot regions
608
+ hand_mask: Binary mask for human hand regions
609
+
610
+ Returns:
611
+ Binary mask indicating where robot overlay should be applied
612
+ """
613
+ # Start with basic robot visibility mask
614
+ overlay_mask = (robot_mask == 1) | (gripper_mask == 1)
615
+
616
+ # Apply depth-based occlusion: hide robot when it's behind real objects
617
+ # and not in hand regions (where occlusion handling is more complex)
618
+ overlay_mask[(depth_masked < depth_sim_masked) & (hand_mask == 0)] = 0
619
+
620
+ return overlay_mask
621
+
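As a toy illustration of the occlusion rule above (not part of the pipeline): a robot pixel survives only where the rendered robot is the closest surface, or where it falls inside the dilated hand mask.

import numpy as np

robot_mask   = np.array([[1, 1, 1, 0]])
gripper_mask = np.array([[0, 0, 0, 0]])
hand_mask    = np.array([[0, 1, 0, 0]])
depth_real   = np.array([[0.5, 0.5, 0.9, 0.5]])  # real scene depth (smaller = closer)
depth_robot  = np.array([[0.7, 0.7, 0.7, 0.7]])  # rendered robot depth

overlay = (robot_mask == 1) | (gripper_mask == 1)
# Hide the robot where a real object is closer to the camera and we are not on the hand
overlay[(depth_real < depth_robot) & (hand_mask == 0)] = 0
print(overlay)  # [[False  True  True False]]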
622
+ def _save_results(self, paths: Paths, sequence: TrainingDataSequence, img_overlay: List[np.ndarray],
623
+ img_birdview: Optional[List[np.ndarray]] = None) -> None:
624
+ """
625
+ Save comprehensive robot inpainting results to disk.
626
+
627
+ Args:
628
+ paths: Paths object containing output file locations
629
+ sequence: Training data sequence with robot state annotations
630
+ img_overlay: List of robot overlay images
631
+ img_birdview: Optional list of bird's eye view images for analysis
632
+ """
633
+ # Create output directory
634
+ os.makedirs(paths.inpaint_processor, exist_ok=True)
635
+
636
+ if len(img_overlay) == 0:
637
+ print("No robot inpainted images, skipping")
638
+ return
639
+
640
+ # Save main robot-inpainted video
641
+ video_path = str(paths.video_overlay).split(".mkv")[0] + f"_{self.robot}_{self.bimanual_setup}.mkv"
642
+ self._save_video(video_path, img_overlay)
643
+
644
+ # Save bird's eye view video for analysis and debugging
645
+ if img_birdview is not None:
646
+ birdview_path = str(paths.video_birdview).split(".mkv")[0] + f"_{self.robot}_{self.bimanual_setup}.mkv"
647
+ self._save_video(birdview_path, np.array(img_birdview))
648
+
649
+ # Save comprehensive training data with robot state annotations
650
+ training_data_path = str(paths.training_data).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
651
+ sequence.save(training_data_path)
652
+
653
+ def _save_video(self, path: str, frames: List[np.ndarray]) -> None:
654
+ """
655
+ Save video with consistent encoding parameters.
656
+
657
+ Args:
658
+ path: Output video file path
659
+ frames: List of video frames to save
660
+ """
661
+ media.write_video(
662
+ path,
663
+ frames,
664
+ fps=self.DEFAULT_FPS,
665
+ codec=self.DEFAULT_CODEC
666
+ )
667
+
668
+ def _get_mujoco_camera_params(self) -> MujocoCameraParams:
669
+ """
670
+ Generate MuJoCo camera parameters from real-world camera calibration.
671
+
672
+ Returns:
673
+ MujocoCameraParams object with calibrated camera settings
674
+ """
675
+ # Extract real-world camera extrinsics and convert to MuJoCo format
676
+ extrinsics = self.extrinsics[0]
677
+ camera_ori_wxyz = self._convert_real_camera_ori_to_mujoco(
678
+ np.array(extrinsics["camera_base_ori"])
679
+ )
680
+
681
+ # Calculate image dimensions and camera intrinsics
682
+ img_w, img_h = self._get_image_dimensions()
683
+ offset = self._calculate_image_offset(img_w, img_h)
684
+ fx, fy, cx, cy = self._get_camera_intrinsics(offset)
685
+ sensor_width, sensor_height = self._calculate_sensor_size(img_w, img_h, fx, fy)
686
+
687
+ # Select appropriate camera name based on dataset
688
+ if self.epic:
689
+ camera_name = "zed"
690
+ else:
691
+ camera_name = "frontview"
692
+
693
+ return MujocoCameraParams(
694
+ name=camera_name,
695
+ pos=extrinsics["camera_base_pos"],
696
+ ori_wxyz=camera_ori_wxyz,
697
+ fov=self.intrinsics_dict["v_fov"],
698
+ resolution=(img_h, img_w),
699
+ sensorsize=np.array([sensor_width, sensor_height]),
700
+ principalpixel=np.array([img_w/2-cx, cy-img_h/2]),
701
+ focalpixel=np.array([fx, fy])
702
+ )
703
+
704
+ def _get_image_dimensions(self) -> Tuple[int, int]:
705
+ """
706
+ Calculate image dimensions based on input resolution configuration.
707
+
708
+ Returns:
709
+ Tuple of (width, height) in pixels
710
+ """
711
+ # Epic dataset (256p input)
+ if self.input_resolution == 256:
+ img_w = 456
+ # Phantom paper setting (1080p)
+ elif self.input_resolution == 1080:
+ img_w = self.input_resolution * 16 // 9
+ else:
+ raise ValueError(f"Unsupported input resolution: {self.input_resolution}")
+ img_h = self.input_resolution
+ return img_w, img_h
719
+
720
+ def _calculate_image_offset(self, img_w: int, img_h: int) -> int:
721
+ """
722
+ Calculate horizontal image offset for square aspect ratio processing.
723
+
724
+ Args:
725
+ img_w: Image width in pixels
726
+ img_h: Image height in pixels
727
+
728
+ Returns:
729
+ Horizontal offset in pixels
730
+ """
731
+ if self.square:
732
+ offset = (img_w - img_h) // 2
733
+ else:
734
+ offset = 0
735
+ return offset
736
+
737
+ def _get_camera_intrinsics(self, offset: int) -> Tuple[float, float, float, float]:
738
+ """
739
+ Extract camera intrinsic parameters with offset correction.
740
+
741
+ Args:
742
+ offset: Horizontal offset for principal point adjustment
743
+
744
+ Returns:
745
+ Tuple of (fx, fy, cx, cy) camera intrinsic parameters
746
+ """
747
+ return self.intrinsics_dict["fx"], self.intrinsics_dict["fy"], self.intrinsics_dict["cx"]+offset, self.intrinsics_dict["cy"]
748
+
749
+ def _calculate_sensor_size(self, img_w: int, img_h: int, fx: float, fy: float) -> Tuple[float, float]:
750
+ """
751
+ Calculate physical sensor dimensions from image resolution and focal length.
752
+
753
+ Args:
754
+ img_w: Image width in pixels
755
+ img_h: Image height in pixels
756
+ fx: Focal length in x direction (pixels)
757
+ fy: Focal length in y direction (pixels)
758
+
759
+ Returns:
760
+ Tuple of (sensor_width, sensor_height) in meters
761
+ """
762
+ sensor_width = img_w / fy / 1000
763
+ sensor_height = img_h / fx / 1000
764
+ return sensor_width, sensor_height
765
+
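The sensor-size and FOV bookkeeping above follows the usual pinhole relations; a small sanity-check sketch with illustrative numbers (not the actual calibration used here):

import numpy as np

# With a focal length fy in pixels, the vertical field of view is
#   v_fov = 2 * atan(img_h / (2 * fy))
img_h, fy = 1080, 1050.0
v_fov_deg = np.degrees(2 * np.arctan(img_h / (2 * fy)))
print(round(v_fov_deg, 1))  # ~54.4 degrees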
766
+ @staticmethod
767
+ def _convert_real_camera_ori_to_mujoco(camera_ori_matrix: np.ndarray) -> np.ndarray:
768
+ """
769
+ Convert real-world camera orientation to MuJoCo coordinate system.
770
+
771
+ Args:
772
+ camera_ori_matrix: 3x3 rotation matrix in real-world coordinates
773
+
774
+ Returns:
775
+ Quaternion in WXYZ format for MuJoCo
776
+ """
777
+ # Apply coordinate system transformation (flip Y and Z axes)
778
+ camera_ori_matrix[:, [1, 2]] = -camera_ori_matrix[:, [1, 2]]
779
+
780
+ # Convert to quaternion in MuJoCo's WXYZ format
781
+ r = Rotation.from_matrix(camera_ori_matrix)
782
+ camera_ori_wxyz = r.as_quat(scalar_first=True)
783
+ return camera_ori_wxyz
784
+
785
+
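A short note on the quaternion conventions used above: SciPy's default as_quat() ordering is XYZW, while MuJoCo expects WXYZ; the scalar_first keyword assumed here requires a recent SciPy release, and reordering by hand is equivalent (sketch below):

import numpy as np
from scipy.spatial.transform import Rotation

r = Rotation.from_euler("z", 90, degrees=True)
xyzw = r.as_quat()           # default ordering: [x, y, z, w]
wxyz = np.roll(xyzw, 1)      # MuJoCo-style ordering: [w, x, y, z]
print(np.round(xyzw, 4).tolist())  # [0.0, 0.0, 0.7071, 0.7071]
print(np.round(wxyz, 4).tolist())  # [0.7071, 0.0, 0.0, 0.7071]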
phantom/phantom/processors/segmentation_processor.py ADDED
@@ -0,0 +1,1056 @@
1
+ """
2
+ Segmentation Processor Module
3
+
4
+ This module uses SAM2 to create masks of hands and arms in video sequences.
5
+
6
+ Processing Pipeline:
7
+ 1. Load video frames and detection/pose data from previous stages
8
+ 2. Initialize segmentation with highest-quality detection frame
9
+ 3. Propagate segmentation bidirectionally (forward and reverse)
10
+ 4. Combine temporal results for complete sequence coverage
11
+ 5. Generate visualization videos and save segmentation masks
12
+
13
+ The module supports different segmentation modes:
14
+ - HandSegmentationProcessor: Precise hand-only segmentation
15
+ - ArmSegmentationProcessor: Combined hand + arm segmentation
16
+ """
17
+
18
+ import os
19
+ import logging
20
+ import shutil
21
+ from tqdm import tqdm
22
+ import numpy as np
23
+ import mediapy as media
24
+ import argparse
25
+ from typing import Dict, Tuple, Optional, List
26
+
27
+ from phantom.processors.paths import Paths
28
+ from phantom.processors.base_processor import BaseProcessor
29
+ from phantom.detectors.detector_sam2 import DetectorSam2
30
+ from phantom.detectors.detector_detectron2 import DetectorDetectron2
31
+ from phantom.utils.bbox_utils import get_overlap_score
32
+ from phantom.processors.phantom_data import HandSequence
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+ # Configuration constants for segmentation processing
37
+ DEFAULT_FPS = 10
38
+ DEFAULT_OVERLAP_THRESHOLD = 0.5
39
+ DEFAULT_CODEC = "ffv1"
40
+ ANNOTATION_CODEC = "h264"
41
+
42
+ class BaseSegmentationProcessor(BaseProcessor):
43
+ """
44
+ Base class for video segmentation processing using SAM2.
45
+
46
+ The base processor establishes the framework for temporal segmentation processing,
47
+ where segmentation masks are propagated both forward and backward through time
48
+ to ensure temporal consistency and complete coverage of the video sequence.
49
+
50
+ Attributes:
51
+ detector_sam (DetectorSam2): SAM2 segmentation model instance
52
+ """
53
+ def __init__(self, args: argparse.Namespace) -> None:
54
+ """
55
+ Initialize the base segmentation processor.
56
+
57
+ Args:
58
+ args: Command line arguments containing segmentation configuration
59
+ """
60
+ super().__init__(args)
61
+ self.detector_sam = DetectorSam2()
62
+
63
+ def process_one_demo(self, data_sub_folder: str) -> None:
64
+ """
65
+ Process a single demonstration - to be implemented by subclasses.
66
+
67
+ Args:
68
+ data_sub_folder: Path to demonstration data folder
69
+
70
+ Raises:
71
+ NotImplementedError: Must be implemented by concrete subclasses
72
+ """
73
+ raise NotImplementedError("Subclasses must implement this method")
74
+
75
+ def _load_hamer_data(self, paths: Paths) -> Dict[str, HandSequence]:
76
+ """
77
+ Load hand pose estimation data from previous processing stage.
78
+
79
+ Args:
80
+ paths: Paths object containing file locations
81
+
82
+ Returns:
83
+ Dictionary containing left and right hand sequences
84
+ """
85
+ if self.bimanual_setup == "single_arm":
86
+ if self.target_hand == "left":
87
+ return {"left": HandSequence.load(paths.hand_data_left)}
88
+ elif self.target_hand == "right":
89
+ return {"right": HandSequence.load(paths.hand_data_right)}
90
+ else:
91
+ raise ValueError(f"Invalid target hand: {self.target_hand}")
92
+ elif self.bimanual_setup == "shoulders":
93
+ return {
94
+ "left": HandSequence.load(paths.hand_data_left),
95
+ "right": HandSequence.load(paths.hand_data_right)
96
+ }
97
+ else:
98
+ raise ValueError(f"Invalid bimanual setup: {self.bimanual_setup}")
99
+
100
+ @staticmethod
101
+ def _load_video(video_path: str) -> np.ndarray:
102
+ """
103
+ Load and validate video frames from disk.
104
+
105
+ Args:
106
+ video_path: Path to video file
107
+
108
+ Returns:
109
+ Array of RGB video frames
110
+
111
+ Raises:
112
+ FileNotFoundError: If video file doesn't exist
113
+ ValueError: If video file is empty or corrupted
114
+ """
115
+ if not os.path.exists(video_path):
116
+ raise FileNotFoundError(f"Video file not found: {video_path}")
117
+
118
+ imgs_rgb = media.read_video(video_path)
119
+ if len(imgs_rgb) == 0:
120
+ raise ValueError("Empty video file")
121
+
122
+ return imgs_rgb
123
+
124
+ @staticmethod
125
+ def _load_bbox_data(bbox_path: str) -> Dict[str, np.ndarray]:
126
+ """
127
+ Load and validate bounding box detection data.
128
+
129
+ Args:
130
+ bbox_path: Path to bounding box data file
131
+
132
+ Returns:
133
+ Dictionary containing detection results from bounding box processor
134
+
135
+ Raises:
136
+ FileNotFoundError: If bounding box data file doesn't exist
137
+ """
138
+ if not os.path.exists(bbox_path):
139
+ raise FileNotFoundError(f"Bbox data not found: {bbox_path}")
140
+
141
+ return np.load(bbox_path)
142
+
143
+ @staticmethod
144
+ def _combine_sam_images(
145
+ imgs_rgb: np.ndarray,
146
+ imgs_forward: Dict[int, np.ndarray],
147
+ imgs_reverse: Dict[int, np.ndarray]
148
+ ) -> np.ndarray:
149
+ """
150
+ Combine forward and reverse SAM visualization images.
151
+
152
+ This method merges the visualization results from bidirectional
153
+ processing to create a complete visualization sequence.
154
+
155
+ Args:
156
+ imgs_rgb: Original RGB frames for shape reference
157
+ imgs_forward: Forward propagation visualization results
158
+ imgs_reverse: Reverse propagation visualization results
159
+
160
+ Returns:
161
+ Combined visualization array
162
+ """
163
+ result = np.zeros_like(imgs_rgb)
164
+ # Fill in forward propagation results
165
+ for idx in imgs_forward:
166
+ result[idx] = imgs_forward[idx]
167
+ # Fill in reverse propagation results (may overwrite forward results)
168
+ for idx in imgs_reverse:
169
+ result[idx] = imgs_reverse[idx]
170
+ return result
171
+
172
+ @staticmethod
173
+ def _combine_masks(
174
+ imgs_rgb: np.ndarray,
175
+ masks_forward: Dict[int, np.ndarray],
176
+ masks_reverse: Dict[int, np.ndarray]
177
+ ) -> np.ndarray:
178
+ """
179
+ Combine forward and reverse segmentation masks.
180
+
181
+ This method merges segmentation masks from bidirectional processing
182
+ to ensure complete temporal coverage of the video sequence.
183
+
184
+ Args:
185
+ imgs_rgb: Original RGB frames for shape reference
186
+ masks_forward: Forward propagation mask results
187
+ masks_reverse: Reverse propagation mask results
188
+
189
+ Returns:
190
+ Combined mask array with shape (num_frames, height, width)
191
+ """
192
+ result = np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1]))
193
+ for idx in masks_forward:
194
+ result[idx] = masks_forward[idx][0]
195
+ for idx in masks_reverse:
196
+ result[idx] = masks_reverse[idx][0]
197
+ return result
198
+
199
+ class ArmSegmentationProcessor(BaseSegmentationProcessor):
200
+ """
201
+ Processor for segmenting combined hand and arm regions in video sequences.
202
+
203
+ Attributes:
204
+ detectron_detector (DetectorDetectron2): Detectron2 model for initial detection
205
+ """
206
+ def __init__(self, args: argparse.Namespace) -> None:
207
+ """
208
+ Initialize the arm segmentation processor with detection models.
209
+
210
+ Args:
211
+ args: Command line arguments containing model configuration
212
+ """
213
+ super().__init__(args)
214
+
215
+ # Initialize Detectron2 for initial hand/arm detection
216
+ root_dir = "../submodules/phantom-hamer/"
217
+ self.detectron_detector = DetectorDetectron2(root_dir)
218
+
219
+
220
+ def process_one_demo(self, data_sub_folder: str, hamer_data: Optional[Dict[str, HandSequence]] = None) -> None:
221
+ """
222
+ Process a single video demonstration to generate combined hand + arm segmentation masks.
223
+
224
+ Args:
225
+ data_sub_folder: Path to the subfolder containing the demo data
226
+ hamer_data: Optional pre-loaded hand pose data for segmentation guidance
227
+
228
+ Raises:
229
+ FileNotFoundError: If required input files are not found
230
+ ValueError: If video frames or bounding boxes are invalid
231
+ """
232
+ # Setup and load all required data
233
+ save_folder, paths, imgs_rgb, bbox_data, det_bbox_data, hamer_data = self._setup_processing(
234
+ data_sub_folder, hamer_data
235
+ )
236
+
237
+ # Process based on setup type
238
+ if self.bimanual_setup == "single_arm":
239
+ masks = self._process_single_arm(imgs_rgb, bbox_data, det_bbox_data, hamer_data, paths)
240
+ elif self.bimanual_setup == "shoulders":
241
+ masks = self._process_bimanual(imgs_rgb, bbox_data, det_bbox_data, hamer_data, paths)
242
+ else:
243
+ raise ValueError(f"Invalid bimanual setup: {self.bimanual_setup}")
244
+
245
+ # Create visualization and save results
246
+ sam_imgs = self._create_visualization(imgs_rgb, masks)
247
+ self._validate_output_consistency(imgs_rgb, masks, sam_imgs)
248
+ self._save_results(paths, masks, sam_imgs)
249
+
250
+ def _setup_processing(
251
+ self,
252
+ data_sub_folder: str,
253
+ hamer_data: Optional[Dict[str, HandSequence]]
254
+ ) -> Tuple[str, Paths, np.ndarray, Dict[str, np.ndarray], Dict[str, np.ndarray], Dict[str, HandSequence]]:
255
+ """
256
+ Setup processing environment and load all required data.
257
+
258
+ Args:
259
+ data_sub_folder: Path to the subfolder containing the demo data
260
+ hamer_data: Optional pre-loaded hand pose data
261
+
262
+ Returns:
263
+ Tuple containing: (save_folder, paths, imgs_rgb, bbox_data, det_bbox_data, hamer_data)
264
+ """
265
+ save_folder = self.get_save_folder(data_sub_folder)
266
+ paths = self.get_paths(save_folder)
267
+ paths._setup_original_images()
268
+ paths._setup_original_images_reverse()
269
+
270
+ # Load and validate all input data
271
+ imgs_rgb = self._load_video(paths.video_left)
272
+ bbox_data = self._load_bbox_data(paths.bbox_data)
273
+ det_bbox_data = self.get_detectron_bboxes(imgs_rgb, bbox_data)
274
+ if hamer_data is None:
275
+ hamer_data = self._load_hamer_data(paths)
276
+
277
+ return save_folder, paths, imgs_rgb, bbox_data, det_bbox_data, hamer_data
278
+
279
+ def _process_single_arm(
280
+ self,
281
+ imgs_rgb: np.ndarray,
282
+ bbox_data: Dict[str, np.ndarray],
283
+ det_bbox_data: Dict[str, np.ndarray],
284
+ hamer_data: Dict[str, HandSequence],
285
+ paths: Paths
286
+ ) -> np.ndarray:
287
+ """
288
+ Process single arm setup (left or right hand only).
289
+
290
+ Args:
291
+ imgs_rgb: RGB video frames
292
+ bbox_data: Bounding box detection data
293
+ det_bbox_data: Detectron2 refined bounding boxes
294
+ hamer_data: Hand pose estimation data
295
+ paths: Paths object for file management
296
+
297
+ Returns:
298
+ Boolean segmentation masks
299
+ """
300
+ if self.target_hand == "left":
301
+ hand_data = self._process_hand_data(
302
+ imgs_rgb,
303
+ bbox_data["left_bboxes"],
304
+ bbox_data["left_bbox_min_dist_to_edge"],
305
+ bbox_data["left_hand_detected"],
306
+ det_bbox_data["left_det_bboxes"],
307
+ hamer_data["left"],
308
+ paths,
309
+ "left"
310
+ )
311
+ masks = hand_data["left_masks"].astype(np.bool_)
312
+ elif self.target_hand == "right":
313
+ hand_data = self._process_hand_data(
314
+ imgs_rgb,
315
+ bbox_data["right_bboxes"],
316
+ bbox_data["right_bbox_min_dist_to_edge"],
317
+ bbox_data["right_hand_detected"],
318
+ det_bbox_data["right_det_bboxes"],
319
+ hamer_data["right"],
320
+ paths,
321
+ "right"
322
+ )
323
+ masks = hand_data["right_masks"].astype(np.bool_)
324
+ else:
325
+ raise ValueError(f"Invalid target hand: {self.target_hand}")
326
+
327
+ return masks.astype(np.bool_)
328
+
329
+ def _process_bimanual(
330
+ self,
331
+ imgs_rgb: np.ndarray,
332
+ bbox_data: Dict[str, np.ndarray],
333
+ det_bbox_data: Dict[str, np.ndarray],
334
+ hamer_data: Dict[str, HandSequence],
335
+ paths: Paths
336
+ ) -> np.ndarray:
337
+ """
338
+ Process bimanual setup (both hands combined).
339
+
340
+ Args:
341
+ imgs_rgb: RGB video frames
342
+ bbox_data: Bounding box detection data
343
+ det_bbox_data: Detectron2 refined bounding boxes
344
+ hamer_data: Hand pose estimation data
345
+ paths: Paths object for file management
346
+
347
+ Returns:
348
+ Combined boolean segmentation masks
349
+ """
350
+ # Process left hand with arm segmentation
351
+ left_data = self._process_hand_data(
352
+ imgs_rgb,
353
+ bbox_data["left_bboxes"],
354
+ bbox_data["left_bbox_min_dist_to_edge"],
355
+ bbox_data["left_hand_detected"],
356
+ det_bbox_data["left_det_bboxes"],
357
+ hamer_data["left"],
358
+ paths,
359
+ "left"
360
+ )
361
+
362
+ # Process right hand with arm segmentation
363
+ right_data = self._process_hand_data(
364
+ imgs_rgb,
365
+ bbox_data["right_bboxes"],
366
+ bbox_data["right_bbox_min_dist_to_edge"],
367
+ bbox_data["right_hand_detected"],
368
+ det_bbox_data["right_det_bboxes"],
369
+ hamer_data["right"],
370
+ paths,
371
+ "right"
372
+ )
373
+
374
+ # Convert to boolean masks and combine
375
+ left_masks = left_data["left_masks"].astype(np.bool_)
376
+ right_masks = right_data["right_masks"].astype(np.bool_)
377
+
378
+ # Generate combined video masks by taking the union of left and right masks
379
+ masks = np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1]))
380
+ for idx in range(len(imgs_rgb)):
381
+ masks[idx] = left_masks[idx] | right_masks[idx]
382
+
383
+ return masks.astype(np.bool_)
384
+
385
+ def _create_visualization(self, imgs_rgb: np.ndarray, masks: np.ndarray) -> np.ndarray:
386
+ """
387
+ Create visualization by masking out segmented regions.
388
+
389
+ Args:
390
+ imgs_rgb: Original RGB video frames
391
+ masks: Boolean segmentation masks
392
+
393
+ Returns:
394
+ Visualization images with masked regions set to black
395
+ """
396
+ sam_imgs = []
397
+ for idx in range(len(imgs_rgb)):
398
+ img = imgs_rgb[idx].copy() # Create copy to avoid modifying original
399
+ mask = masks[idx]
400
+ img[mask] = 0 # Set masked regions to black
401
+ sam_imgs.append(img)
402
+ return np.array(sam_imgs)
403
+
404
+ def _validate_output_consistency(
405
+ self,
406
+ imgs_rgb: np.ndarray,
407
+ masks: np.ndarray,
408
+ sam_imgs: np.ndarray
409
+ ) -> None:
410
+ """
411
+ Validate that output arrays have consistent dimensions.
412
+
413
+ Args:
414
+ imgs_rgb: Original RGB video frames
415
+ masks: Segmentation masks
416
+ sam_imgs: Visualization images
417
+
418
+ Raises:
419
+ AssertionError: If dimensions don't match
420
+ """
421
+ assert len(sam_imgs) == len(imgs_rgb), "Visualization length doesn't match input"
422
+ assert len(masks) == len(imgs_rgb), "Masks length doesn't match input"
423
+
424
+
425
+ def _process_hand_data(
426
+ self,
427
+ imgs_rgb: np.ndarray,
428
+ bboxes: np.ndarray,
429
+ bbox_min_dist: np.ndarray,
430
+ hand_detected: np.ndarray,
431
+ det_bboxes: np.ndarray,
432
+ hamer_data: HandSequence,
433
+ paths: Paths,
434
+ hand_side: str
435
+ ) -> Dict[str, np.ndarray]:
436
+ """
437
+ Process segmentation data for a single hand (left or right) with arm inclusion.
438
+
439
+ Args:
440
+ imgs_rgb: RGB video frames
441
+ bboxes: Hand bounding boxes from detection stage
442
+ bbox_min_dist: Minimum distances to image edges (quality metric)
443
+ hand_detected: Boolean flags indicating valid hand detections
444
+ det_bboxes: Refined bounding boxes from Detectron2
445
+ hamer_data: Hand pose data for segmentation guidance
446
+ paths: Paths object for file management
447
+ hand_side: "left" or "right" specifying which hand to process
448
+
449
+ Returns:
450
+ Dictionary containing segmentation masks and visualization images
451
+ """
452
+ # Handle cases with no valid detections
453
+ if not hand_detected.any() or max(bbox_min_dist) == 0:
454
+ return {
455
+ f"{hand_side}_masks": np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1])),
456
+ f"{hand_side}_sam_imgs": np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1], 3))
457
+ }
458
+
459
+ # Extract hand pose keypoints for segmentation guidance
460
+ kpts_2d = hamer_data.kpts_2d
461
+
462
+ # Find the frame with highest quality (furthest from edges)
463
+ max_dist_idx = np.argmax(bbox_min_dist)
464
+ points = np.expand_dims(kpts_2d[max_dist_idx], axis=1)
465
+ bbox_dets = det_bboxes[max_dist_idx]
466
+
467
+ # Use original bounding box if Detectron2 detection failed
468
+ if bbox_dets.sum() == 0:
469
+ bbox_dets = bboxes[max_dist_idx]
470
+
471
+ # Process segmentation in both temporal directions
472
+ masks_forward, sam_imgs_forward = self._run_sam_segmentation(
473
+ paths, bbox_dets, points, max_dist_idx, reverse=False
474
+ )
475
+ masks_reverse, sam_imgs_reverse = self._run_sam_segmentation(
476
+ paths, bbox_dets, points, max_dist_idx, reverse=True
477
+ )
478
+
479
+ # Combine bidirectional results
480
+ sam_imgs = self._combine_sam_images(imgs_rgb, sam_imgs_forward, sam_imgs_reverse)
481
+ masks = self._combine_masks(imgs_rgb, masks_forward, masks_reverse)
482
+
483
+ return {
484
+ f"{hand_side}_masks": masks,
485
+ f"{hand_side}_sam_imgs": sam_imgs
486
+ }
487
+
488
+ def _run_sam_segmentation(
489
+ self,
490
+ paths: Paths,
491
+ bbox_dets: np.ndarray,
492
+ points: np.ndarray,
493
+ max_dist_idx: int,
494
+ reverse: bool
495
+ ) -> Tuple[Dict[int, np.ndarray], Dict[int, np.ndarray]]:
496
+ """
497
+ Process video segmentation in either forward or reverse temporal direction.
498
+
499
+ Args:
500
+ paths: Paths object for file management
501
+ bbox_dets: Detectron2 bounding box for initialization
502
+ points: Hand keypoints for segmentation guidance
503
+ max_dist_idx: Index of highest-quality frame for initialization
504
+ reverse: Whether to process in reverse temporal order
505
+
506
+ Returns:
507
+ Tuple of (segmentation_masks, visualization_images)
508
+ """
509
+ return self.detector_sam.segment_video(
510
+ paths.original_images_folder,
511
+ bbox_dets,
512
+ points,
513
+ [max_dist_idx],
514
+ reverse=reverse
515
+ )
516
+
517
+ def get_detectron_bboxes(self, imgs_rgb: np.ndarray, bbox_data: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
518
+ """
519
+ Generate enhanced bounding boxes using Detectron2 for improved segmentation.
520
+
521
+ Args:
522
+ imgs_rgb: Array of RGB frames with shape (N, H, W, 3)
523
+ bbox_data: Initial bounding box data from hand detection stage containing:
524
+ - left_bboxes: Left hand bounding boxes
525
+ - right_bboxes: Right hand bounding boxes
526
+ - left_hand_detected: Boolean flags for left hand detection
527
+ - right_hand_detected: Boolean flags for right hand detection
528
+ - left_bbox_min_dist_to_edge: Quality metrics for left hand
529
+ - right_bbox_min_dist_to_edge: Quality metrics for right hand
530
+
531
+ Returns:
532
+ Dictionary containing refined bounding boxes:
533
+ - left_det_bboxes: Enhanced left hand bounding boxes
534
+ - right_det_bboxes: Enhanced right hand bounding boxes
535
+
536
+ Raises:
537
+ ValueError: If input array is empty or has incorrect shape
538
+ """
539
+ self._validate_detectron_input(imgs_rgb)
540
+
541
+ # Extract detection data and initialize output arrays
542
+ detection_data = self._extract_detection_data(bbox_data)
543
+ left_det_bboxes, right_det_bboxes = self._initialize_bbox_arrays(imgs_rgb)
544
+
545
+ # Process only highest-quality frames for efficiency
546
+ idx_list = self._get_quality_frame_indices(bbox_data)
547
+
548
+ for idx in tqdm(idx_list, desc="Processing frames"):
549
+ try:
550
+ self._process_detectron_frame(
551
+ idx, imgs_rgb, detection_data, left_det_bboxes, right_det_bboxes
552
+ )
553
+ except Exception as e:
554
+ logging.error(f"Error processing frame {idx}: {str(e)}")
555
+
556
+ return {"left_det_bboxes": left_det_bboxes, "right_det_bboxes": right_det_bboxes}
557
+
558
+ def _validate_detectron_input(self, imgs_rgb: np.ndarray) -> None:
559
+ """
560
+ Validate input array for Detectron2 processing.
561
+
562
+ Args:
563
+ imgs_rgb: Array of RGB frames
564
+
565
+ Raises:
566
+ ValueError: If input array is empty or has incorrect shape
567
+ """
568
+ if len(imgs_rgb) == 0:
569
+ raise ValueError("Empty input array - no video frames provided")
570
+
571
+ if len(imgs_rgb.shape) != 4 or imgs_rgb.shape[-1] != 3:
572
+ raise ValueError(f"Expected input shape (N, H, W, 3), got {imgs_rgb.shape}. "
573
+ f"Input should be RGB video frames.")
574
+
575
+ def _extract_detection_data(self, bbox_data: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
576
+ """
577
+ Extract detection data from bounding box data.
578
+
579
+ Args:
580
+ bbox_data: Bounding box detection data
581
+
582
+ Returns:
583
+ Dictionary containing extracted detection data
584
+ """
585
+ return {
586
+ "left_bboxes": bbox_data["left_bboxes"],
587
+ "right_bboxes": bbox_data["right_bboxes"],
588
+ "left_hand_detected": bbox_data["left_hand_detected"],
589
+ "right_hand_detected": bbox_data["right_hand_detected"]
590
+ }
591
+
592
+ def _initialize_bbox_arrays(self, imgs_rgb: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
593
+ """
594
+ Initialize output bounding box arrays.
595
+
596
+ Args:
597
+ imgs_rgb: RGB video frames for shape reference
598
+
599
+ Returns:
600
+ Tuple of (left_det_bboxes, right_det_bboxes) initialized arrays
601
+ """
602
+ left_det_bboxes = np.zeros((len(imgs_rgb), 4))
603
+ right_det_bboxes = np.zeros((len(imgs_rgb), 4))
604
+ return left_det_bboxes, right_det_bboxes
605
+
606
+ def _get_quality_frame_indices(self, bbox_data: Dict[str, np.ndarray]) -> List[int]:
607
+ """
608
+ Get indices of highest-quality frames for processing.
609
+
610
+ Args:
611
+ bbox_data: Bounding box detection data
612
+
613
+ Returns:
614
+ List of frame indices to process
615
+ """
616
+ idx_left = np.argmax(bbox_data["left_bbox_min_dist_to_edge"])
617
+ idx_right = np.argmax(bbox_data["right_bbox_min_dist_to_edge"])
618
+ return [idx_left, idx_right]
619
+
620
+ def _process_detectron_frame(
621
+ self,
622
+ idx: int,
623
+ imgs_rgb: np.ndarray,
624
+ detection_data: Dict[str, np.ndarray],
625
+ left_det_bboxes: np.ndarray,
626
+ right_det_bboxes: np.ndarray
627
+ ) -> None:
628
+ """
629
+ Process a single frame with Detectron2 detection.
630
+
631
+ Args:
632
+ idx: Frame index to process
633
+ imgs_rgb: RGB video frames
634
+ detection_data: Extracted detection data
635
+ left_det_bboxes: Left hand bounding box output array
636
+ right_det_bboxes: Right hand bounding box output array
637
+ """
638
+ left_hand_detected = detection_data["left_hand_detected"]
639
+ right_hand_detected = detection_data["right_hand_detected"]
640
+
641
+ # Skip frames without any hand detections
642
+ if not left_hand_detected[idx] and not right_hand_detected[idx]:
643
+ left_det_bboxes[idx] = np.array([0, 0, 0, 0])
644
+ right_det_bboxes[idx] = np.array([0, 0, 0, 0])
645
+ return
646
+
647
+ # Apply Detectron2 detection
648
+ img = imgs_rgb[idx]
649
+ det_bboxes, det_scores = self.detectron_detector.get_bboxes(img, visualize=False)
650
+
651
+ if len(det_bboxes) == 0:
652
+ return
653
+
654
+ # Match left hand detection with Detectron2 results
655
+ if left_hand_detected[idx]:
656
+ self._match_hand_detection(
657
+ idx, "left", detection_data, det_bboxes, left_det_bboxes
658
+ )
659
+
660
+ # Match right hand detection with Detectron2 results
661
+ if right_hand_detected[idx]:
662
+ self._match_hand_detection(
663
+ idx, "right", detection_data, det_bboxes, right_det_bboxes
664
+ )
665
+
666
+ def _match_hand_detection(
667
+ self,
668
+ idx: int,
669
+ hand_side: str,
670
+ detection_data: Dict[str, np.ndarray],
671
+ det_bboxes: np.ndarray,
672
+ output_bboxes: np.ndarray
673
+ ) -> None:
674
+ """
675
+ Match hand detection with Detectron2 results using overlap scores.
676
+
677
+ Args:
678
+ idx: Frame index
679
+ hand_side: "left" or "right" hand
680
+ detection_data: Extracted detection data
681
+ det_bboxes: Detectron2 detection results
682
+ output_bboxes: Output bounding box array to update
683
+ """
684
+ bbox = detection_data[f"{hand_side}_bboxes"][idx]
685
+ overlap_scores = []
686
+
687
+ for det_bbox in det_bboxes:
688
+ overlap_score = get_overlap_score(bbox, det_bbox)
689
+ overlap_scores.append(overlap_score)
690
+
691
+ if np.max(overlap_scores) > DEFAULT_OVERLAP_THRESHOLD:
692
+ best_idx = np.argmax(overlap_scores)
693
+ output_bboxes[idx] = det_bboxes[best_idx].astype(np.int32)
694
+
695
+ @staticmethod
696
+ def _save_results(
697
+ paths: Paths,
698
+ masks: np.ndarray,
699
+ sam_imgs: np.ndarray,
700
+ fps: int = DEFAULT_FPS
701
+ ) -> None:
702
+ """
703
+ Save arm segmentation results to disk.
704
+
705
+ Args:
706
+ paths: Paths object containing output file locations
707
+ masks: Combined arm segmentation masks
708
+ sam_imgs: SAM visualization images
709
+ fps: Frames per second for output videos (default: 10)
710
+ """
711
+ ArmSegmentationProcessor._create_output_directory(paths)
712
+
713
+ try:
714
+ ArmSegmentationProcessor._save_mask_data(paths, masks)
715
+ ArmSegmentationProcessor._create_videos(paths, masks, sam_imgs, fps)
716
+ except Exception as e:
717
+ logging.error(f"Error saving results: {str(e)}")
718
+ raise
719
+
720
+ ArmSegmentationProcessor._cleanup_temp_files(paths)
721
+ ArmSegmentationProcessor._update_annotation_video(paths, masks, sam_imgs, fps)
722
+
723
+ @staticmethod
724
+ def _create_output_directory(paths: Paths) -> None:
725
+ """
726
+ Create output directory for segmentation results.
727
+
728
+ Args:
729
+ paths: Paths object containing output directory location
730
+ """
731
+ if not os.path.exists(paths.segmentation_processor):
732
+ os.makedirs(paths.segmentation_processor)
733
+
734
+ @staticmethod
735
+ def _save_mask_data(paths: Paths, masks: np.ndarray) -> None:
736
+ """
737
+ Save mask data to disk.
738
+
739
+ Args:
740
+ paths: Paths object containing output file locations
741
+ masks: Segmentation masks to save
742
+ """
743
+ np.save(paths.masks_arm, masks)
744
+
745
+ @staticmethod
746
+ def _create_videos(paths: Paths, masks: np.ndarray, sam_imgs: np.ndarray, fps: int) -> None:
747
+ """
748
+ Create visualization videos from masks and SAM images.
749
+
750
+ Args:
751
+ paths: Paths object containing output file locations
752
+ masks: Segmentation masks
753
+ sam_imgs: SAM visualization images
754
+ fps: Frames per second for output videos
755
+ """
756
+ for name, data in [
757
+ ("video_masks_arm", masks),
758
+ ("video_sam_arm", sam_imgs),
759
+ ]:
760
+ output_path = getattr(paths, name)
761
+ media.write_video(output_path, data, fps=fps, codec=DEFAULT_CODEC)
762
+
763
+ @staticmethod
764
+ def _cleanup_temp_files(paths: Paths) -> None:
765
+ """
766
+ Clean up temporary directories created during processing.
767
+
768
+ Args:
769
+ paths: Paths object containing temporary directory locations
770
+ """
771
+ if os.path.exists(paths.original_images_folder):
772
+ shutil.rmtree(paths.original_images_folder)
773
+ if os.path.exists(paths.original_images_folder_reverse):
774
+ shutil.rmtree(paths.original_images_folder_reverse)
775
+
776
+ @staticmethod
777
+ def _update_annotation_video(paths: Paths, masks: np.ndarray, sam_imgs: np.ndarray, fps: int) -> None:
778
+ """
779
+ Update existing annotation video with segmentation results.
780
+
781
+ Args:
782
+ paths: Paths object containing annotation video location
783
+ masks: Segmentation masks
784
+ sam_imgs: SAM visualization images
785
+ fps: Frames per second for output video
786
+ """
787
+ if os.path.exists(paths.video_annot):
788
+ annot_imgs = media.read_video(paths.video_annot)
789
+ for idx in range(len(annot_imgs)):
790
+ annot_img = annot_imgs[idx]
791
+ h = masks[idx].shape[0]
792
+ w = masks[idx].shape[1]
793
+ # Insert segmentation visualization in the top-right quadrant
794
+ annot_img[:h, w:, :] = sam_imgs[idx]
795
+ media.write_video(paths.video_annot, annot_imgs, fps=fps, codec=ANNOTATION_CODEC)
796
+
797
+
798
+
799
+ class HandSegmentationProcessor(BaseSegmentationProcessor):
800
+ """
801
+ Processor for precise hand-only segmentation in video sequences.
802
+
803
+ Attributes:
804
+ Inherits detector_sam from BaseSegmentationProcessor
805
+ """
806
+ def __init__(self, args: argparse.Namespace) -> None:
807
+ """
808
+ Initialize the hand segmentation processor.
809
+
810
+ Args:
811
+ args: Command line arguments containing segmentation configuration
812
+ """
813
+ super().__init__(args)
814
+
815
+ def process_one_demo(self, data_sub_folder: str, hamer_data: Optional[Dict[str, HandSequence]] = None) -> None:
816
+ """
817
+ Process a single video demonstration to generate precise hand segmentation masks.
818
+
819
+ Args:
820
+ data_sub_folder: Path to the subfolder containing the demo data
821
+ hamer_data: Optional pre-loaded hand pose data for segmentation guidance
822
+
823
+ Raises:
824
+ FileNotFoundError: If required input files are not found
825
+ ValueError: If video frames or bounding boxes are invalid
826
+ """
827
+ save_folder = self.get_save_folder(data_sub_folder)
828
+
829
+ paths = self.get_paths(save_folder)
830
+ paths._setup_original_images()
831
+ paths._setup_original_images_reverse()
832
+
833
+ # Load and validate input data
834
+ imgs_rgb = self._load_video(paths.video_left)
835
+ bbox_data = self._load_bbox_data(paths.bbox_data)
836
+ if hamer_data is None:
837
+ hamer_data = self._load_hamer_data(paths)
838
+
839
+ # Process left and right hands separately for precise segmentation
840
+ left_data = self._process_hand_data(
841
+ imgs_rgb,
842
+ bbox_data["left_bboxes"],
843
+ bbox_data["left_bbox_min_dist_to_edge"],
844
+ bbox_data["left_hand_detected"],
845
+ hamer_data["left"],
846
+ paths,
847
+ "left"
848
+ )
849
+
850
+ right_data = self._process_hand_data(
851
+ imgs_rgb,
852
+ bbox_data["right_bboxes"],
853
+ bbox_data["right_bbox_min_dist_to_edge"],
854
+ bbox_data["right_hand_detected"],
855
+ hamer_data["right"],
856
+ paths,
857
+ "right"
858
+ )
859
+
860
+ # Convert to boolean masks
861
+ left_masks = left_data["left_masks"].astype(np.bool_)
862
+ left_sam_imgs = left_data["left_sam_imgs"]
863
+ right_masks = right_data["right_masks"].astype(np.bool_)
864
+ right_sam_imgs = right_data["right_sam_imgs"]
865
+
866
+ # Save results with separate left/right hand data
867
+ self._save_results(paths, left_masks, left_sam_imgs, right_masks, right_sam_imgs)
868
+
869
+
870
+ def _process_hand_data(
871
+ self,
872
+ imgs_rgb: np.ndarray,
873
+ bboxes: np.ndarray,
874
+ bbox_min_dist: np.ndarray,
875
+ hand_detected: np.ndarray,
876
+ hamer_data: HandSequence,
877
+ paths: Paths,
878
+ hand_side: str
879
+ ) -> Dict[str, np.ndarray]:
880
+ """
881
+ Process hand segmentation data for a single hand (left or right).
882
+
883
+ Args:
884
+ imgs_rgb: RGB video frames
885
+ bboxes: Hand bounding boxes from detection stage
886
+ bbox_min_dist: Minimum distances to image edges (quality metric)
887
+ hand_detected: Boolean flags indicating valid hand detections
888
+ hamer_data: Hand pose data for segmentation guidance
889
+ paths: Paths object for file management
890
+ hand_side: "left" or "right" specifying which hand to process
891
+
892
+ Returns:
893
+ Dictionary containing segmentation masks and visualization images
894
+ """
895
+ # Handle cases with no valid detections
896
+ if not hand_detected.any() or max(bbox_min_dist) == 0:
897
+ return {
898
+ f"{hand_side}_masks": np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1])),
899
+ f"{hand_side}_sam_imgs": np.zeros((len(imgs_rgb), imgs_rgb[0].shape[0], imgs_rgb[0].shape[1], 3))
900
+ }
901
+
902
+ # Extract hand pose keypoints for segmentation guidance
903
+ kpts_2d = hamer_data.kpts_2d
904
+
905
+ # Find the frame with highest quality (furthest from edges)
906
+ max_dist_idx = np.argmax(bbox_min_dist)
907
+ bbox = bboxes[max_dist_idx]
908
+ points = np.expand_dims(kpts_2d[max_dist_idx], axis=1)
909
+
910
+ # Process segmentation in both temporal directions
911
+ masks_forward, sam_imgs_forward = self._run_sam_segmentation(
912
+ paths, bbox, points, max_dist_idx, reverse=False, output_bboxes=bboxes
913
+ )
914
+ masks_reverse, sam_imgs_reverse = self._run_sam_segmentation(
915
+ paths, bbox, points, max_dist_idx, reverse=True, output_bboxes=bboxes
916
+ )
917
+
918
+ # Combine bidirectional results
919
+ sam_imgs = self._combine_sam_images(imgs_rgb, sam_imgs_forward, sam_imgs_reverse)
920
+ masks = self._combine_masks(imgs_rgb, masks_forward, masks_reverse)
921
+
922
+ return {
923
+ f"{hand_side}_masks": masks,
924
+ f"{hand_side}_sam_imgs": sam_imgs
925
+ }
926
+
927
+
928
+ def _run_sam_segmentation(
929
+ self,
930
+ paths: Paths,
931
+ bbox: np.ndarray,
932
+ points: np.ndarray,
933
+ max_dist_idx: int,
934
+ reverse: bool,
935
+ output_bboxes: np.ndarray
936
+ ) -> Tuple[Dict[int, np.ndarray], Dict[int, np.ndarray]]:
937
+ """
938
+ Process video segmentation in either forward or reverse temporal direction.
939
+
940
+ Args:
941
+ paths: Paths object for file management
942
+ bbox: Initial bounding box for segmentation
943
+ points: Hand keypoints for segmentation guidance
944
+ max_dist_idx: Index of highest-quality frame for initialization
945
+ reverse: Whether to process in reverse temporal order
946
+ output_bboxes: All bounding boxes for the sequence
947
+
948
+ Returns:
949
+ Tuple of (segmentation_masks, visualization_images)
950
+ """
951
+ return self.detector_sam.segment_video(
952
+ paths.original_images_folder,
953
+ bbox,
954
+ points,
955
+ [max_dist_idx],
956
+ reverse=reverse,
957
+ output_bboxes=output_bboxes
958
+ )
959
+
960
+ @staticmethod
961
+ def _save_results(
962
+ paths: Paths,
963
+ left_masks: np.ndarray,
964
+ left_sam_imgs: np.ndarray,
965
+ right_masks: np.ndarray,
966
+ right_sam_imgs: np.ndarray,
967
+ fps: int = DEFAULT_FPS
968
+ ) -> None:
969
+ """
970
+ Save hand segmentation results to disk.
971
+
972
+ Args:
973
+ paths: Paths object containing output file locations
974
+ left_masks: Left hand segmentation masks
975
+ left_sam_imgs: Left hand SAM visualization images
976
+ right_masks: Right hand segmentation masks
977
+ right_sam_imgs: Right hand SAM visualization images
978
+ fps: Frames per second for output videos (default: 10)
979
+ """
980
+ HandSegmentationProcessor._create_output_directory(paths)
981
+
982
+ try:
983
+ HandSegmentationProcessor._save_hand_mask_data(paths, left_masks, right_masks)
984
+ HandSegmentationProcessor._create_hand_videos(paths, left_masks, left_sam_imgs, right_masks, right_sam_imgs, fps)
985
+ except Exception as e:
986
+ logging.error(f"Error saving results: {str(e)}")
987
+ raise
988
+
989
+ HandSegmentationProcessor._cleanup_temp_files(paths)
990
+
991
+ @staticmethod
992
+ def _create_output_directory(paths: Paths) -> None:
993
+ """
994
+ Create output directory for segmentation results.
995
+
996
+ Args:
997
+ paths: Paths object containing output directory location
998
+ """
999
+ if not os.path.exists(paths.segmentation_processor):
1000
+ os.makedirs(paths.segmentation_processor)
1001
+
1002
+ @staticmethod
1003
+ def _save_hand_mask_data(paths: Paths, left_masks: np.ndarray, right_masks: np.ndarray) -> None:
1004
+ """
1005
+ Save hand mask data to disk.
1006
+
1007
+ Args:
1008
+ paths: Paths object containing output file locations
1009
+ left_masks: Left hand segmentation masks
1010
+ right_masks: Right hand segmentation masks
1011
+ """
1012
+ np.save(paths.masks_hand_left, left_masks)
1013
+ np.save(paths.masks_hand_right, right_masks)
1014
+
1015
+ @staticmethod
1016
+ def _create_hand_videos(
1017
+ paths: Paths,
1018
+ left_masks: np.ndarray,
1019
+ left_sam_imgs: np.ndarray,
1020
+ right_masks: np.ndarray,
1021
+ right_sam_imgs: np.ndarray,
1022
+ fps: int
1023
+ ) -> None:
1024
+ """
1025
+ Create visualization videos for hand segmentation.
1026
+
1027
+ Args:
1028
+ paths: Paths object containing output file locations
1029
+ left_masks: Left hand segmentation masks
1030
+ left_sam_imgs: Left hand SAM visualization images
1031
+ right_masks: Right hand segmentation masks
1032
+ right_sam_imgs: Right hand SAM visualization images
1033
+ fps: Frames per second for output videos
1034
+ """
1035
+ for name, data in [
1036
+ ("video_masks_hand_left", left_masks),
1037
+ ("video_masks_hand_right", right_masks),
1038
+ ("video_sam_hand_left", left_sam_imgs),
1039
+ ("video_sam_hand_right", right_sam_imgs),
1040
+ ]:
1041
+ output_path = getattr(paths, name)
1042
+ media.write_video(output_path, data, fps=fps, codec=DEFAULT_CODEC)
1043
+
1044
+ @staticmethod
1045
+ def _cleanup_temp_files(paths: Paths) -> None:
1046
+ """
1047
+ Clean up temporary directories created during processing.
1048
+
1049
+ Args:
1050
+ paths: Paths object containing temporary directory locations
1051
+ """
1052
+ if os.path.exists(paths.original_images_folder):
1053
+ shutil.rmtree(paths.original_images_folder)
1054
+ if os.path.exists(paths.original_images_folder_reverse):
1055
+ shutil.rmtree(paths.original_images_folder_reverse)
1056
+
phantom/phantom/processors/smoothing_processor.py ADDED
@@ -0,0 +1,303 @@
1
+ """
2
+ Trajectory Smoothing Processor Module
3
+
4
+ This module applies trajectory smoothing to end-effector positions, orientations, and gripper states
5
+ extracted from human demonstrations.
6
+
7
+ Processing Pipeline:
8
+ 1. Load processed action data from previous pipeline stages
9
+ 2. Apply Gaussian Process smoothing to 3D position trajectories
10
+ 3. Apply SLERP-based smoothing to rotation matrix trajectories
11
+ 4. Apply Gaussian Process smoothing to gripper distance trajectories
12
+ 5. Save smoothed trajectories for robot execution
13
+ """
14
+
15
+ import os
16
+ from typing import Optional
17
+ import argparse
18
+ import numpy as np
19
+ import logging
20
+ from sklearn.gaussian_process import GaussianProcessRegressor # type: ignore
21
+ from sklearn.gaussian_process.kernels import RBF, WhiteKernel # type: ignore
22
+ from scipy.spatial.transform import Rotation, Slerp
23
+
24
+ from phantom.processors.base_processor import BaseProcessor
25
+ from phantom.processors.paths import Paths
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+ def gaussian_kernel(size: int, sigma: float) -> np.ndarray:
30
+ """
31
+ Generate a centered Gaussian kernel for local smoothing operations.
32
+
33
+ Args:
34
+ size: Size of the kernel (should be odd for proper centering)
35
+ sigma: Standard deviation of the Gaussian distribution
36
+
37
+ Returns:
38
+ Normalized Gaussian kernel array
39
+
40
+ Raises:
41
+ ValueError: If size is not positive
42
+ """
43
+ if size <= 0:
44
+ raise ValueError("Kernel size must be positive")
45
+
46
+ x = np.arange(size) - size // 2
47
+ kernel = np.exp(-0.5 * (x / sigma) ** 2)
48
+ return kernel / kernel.sum()
49
+
50
+ class SmoothingProcessor(BaseProcessor):
51
+ """
52
+ This processor takes raw trajectory data extracted from human demonstrations
53
+ and applies smoothing techniques to create executable robot trajectories.
54
+
55
+ Attributes:
56
+ bimanual_setup (str): Configuration mode ("single_arm" or bimanual type)
57
+ target_hand (str): Target hand for single-arm processing ("left" or "right")
58
+ """
59
+ def __init__(self, args: argparse.Namespace) -> None:
60
+ """
61
+ Initialize the smoothing processor with configuration parameters.
62
+
63
+ Args:
64
+ args: Command line arguments containing smoothing configuration
65
+ including bimanual setup and target hand specification
66
+ """
67
+ super().__init__(args)
68
+
69
+ def process_one_demo(self, data_sub_folder: str) -> None:
70
+ """
71
+ Process and smooth trajectories for a single demonstration.
72
+
73
+ Args:
74
+ data_sub_folder: Path to demonstration data folder containing
75
+ processed action trajectories from previous stages
76
+ """
77
+ save_folder = self.get_save_folder(data_sub_folder)
78
+ paths = self.get_paths(save_folder)
79
+
80
+ # Handle single-arm processing mode
81
+ if self.bimanual_setup == "single_arm":
82
+ self._process_single_arm_demo(paths)
83
+ else:
84
+ self._process_bimanual_demo(paths)
85
+
86
+ def _process_single_arm_demo(self, paths: Paths) -> None:
87
+ """
88
+ Process single-arm demonstration data.
89
+
90
+ Args:
91
+ paths: Paths object containing file locations
92
+ """
93
+ # Load action data for target hand
94
+ actions_path = self._get_actions_path(paths)
95
+ actions = np.load(actions_path, allow_pickle=True)
96
+
97
+ # Apply smoothing to each trajectory component
98
+ smoothed_ee_pts = self.gaussian_process_smoothing(actions["ee_pts"])
99
+
100
+ # Apply rotation smoothing with configuration-specific parameters
101
+ if self.constrained_hand:
102
+ smoothed_ee_oris = self.gaussian_slerp_smoothing(
103
+ actions["ee_oris"], sigma=10.0, kernel_size=41
104
+ )
105
+ else:
106
+ smoothed_ee_oris = self.gaussian_slerp_smoothing(
107
+ actions["ee_oris"], sigma=10.0
108
+ )
109
+
110
+ smoothed_ee_widths = self.gaussian_process_smoothing(actions["ee_widths"])
111
+
112
+ # Save results based on target hand
113
+ if self.target_hand == "left":
114
+ self._save_results(paths, smoothed_ee_pts_left=smoothed_ee_pts,
115
+ smoothed_ee_oris_left=smoothed_ee_oris,
116
+ smoothed_ee_widths_left=smoothed_ee_widths)
117
+ else:
118
+ self._save_results(paths, smoothed_ee_pts_right=smoothed_ee_pts,
119
+ smoothed_ee_oris_right=smoothed_ee_oris,
120
+ smoothed_ee_widths_right=smoothed_ee_widths)
121
+
122
+ def _process_bimanual_demo(self, paths: Paths) -> None:
123
+ """
124
+ Process bimanual demonstration data.
125
+
126
+ Args:
127
+ paths: Paths object containing file locations
128
+ """
129
+ # Load data for both hands
130
+ actions_left_path = str(paths.actions_left).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
131
+ actions_right_path = str(paths.actions_right).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
132
+ actions_left = np.load(actions_left_path, allow_pickle=True)
133
+ actions_right = np.load(actions_right_path, allow_pickle=True)
134
+
135
+ # Apply position smoothing using Gaussian Process regression
136
+ smoothed_ee_pts_left = self.gaussian_process_smoothing(actions_left["ee_pts"])
137
+ smoothed_ee_pts_right = self.gaussian_process_smoothing(actions_right["ee_pts"])
138
+
139
+ # Apply rotation smoothing using SLERP with optimized parameters for bimanual coordination
140
+ smoothed_ee_oris_left = self.gaussian_slerp_smoothing(
141
+ actions_left["ee_oris"], sigma=10.0, kernel_size=21
142
+ )
143
+ smoothed_ee_oris_right = self.gaussian_slerp_smoothing(
144
+ actions_right["ee_oris"], sigma=10.0, kernel_size=21
145
+ )
146
+
147
+ # Apply gripper distance smoothing
148
+ smoothed_ee_widths_left = self.gaussian_process_smoothing(actions_left["ee_widths"])
149
+ smoothed_ee_widths_right = self.gaussian_process_smoothing(actions_right["ee_widths"])
150
+
151
+ # Save all smoothed trajectories
152
+ self._save_results(paths, smoothed_ee_pts_left, smoothed_ee_oris_left, smoothed_ee_widths_left,
153
+ smoothed_ee_pts_right, smoothed_ee_oris_right, smoothed_ee_widths_right)
154
+
155
+ def _get_actions_path(self, paths: Paths) -> str:
156
+ """
157
+ Get the appropriate actions file path based on target hand.
158
+
159
+ Args:
160
+ paths: Paths object containing file locations
161
+
162
+ Returns:
163
+ Path to the actions file for the target hand
164
+ """
165
+ if self.target_hand == "left":
166
+ base_path = str(paths.actions_left)
167
+ else:
168
+ base_path = str(paths.actions_right)
169
+ return base_path.split(".npz")[0] + f"_{self.bimanual_setup}.npz"
170
+
171
+ def _save_results(self, paths: Paths, smoothed_ee_pts_left: Optional[np.ndarray] = None,
172
+ smoothed_ee_oris_left: Optional[np.ndarray] = None,
173
+ smoothed_ee_widths_left: Optional[np.ndarray] = None,
174
+ smoothed_ee_pts_right: Optional[np.ndarray] = None,
175
+ smoothed_ee_oris_right: Optional[np.ndarray] = None,
176
+ smoothed_ee_widths_right: Optional[np.ndarray] = None) -> None:
177
+ """
178
+ Save smoothed trajectory results to disk.
179
+
180
+ Args:
181
+ paths: Paths object containing output file locations
182
+ smoothed_ee_pts_left: Smoothed left hand position trajectory
183
+ smoothed_ee_oris_left: Smoothed left hand orientation trajectory
184
+ smoothed_ee_widths_left: Smoothed left hand gripper trajectory
185
+ smoothed_ee_pts_right: Smoothed right hand position trajectory
186
+ smoothed_ee_oris_right: Smoothed right hand orientation trajectory
187
+ smoothed_ee_widths_right: Smoothed right hand gripper trajectory
188
+ """
189
+ # Create output directory
190
+ os.makedirs(paths.smoothing_processor, exist_ok=True)
191
+
192
+ # Save left hand trajectories if provided
193
+ if smoothed_ee_pts_left is not None:
194
+ smoothed_actions_left_path = str(paths.smoothed_actions_left).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
195
+ np.savez(smoothed_actions_left_path,
196
+ ee_pts=smoothed_ee_pts_left,
197
+ ee_oris=smoothed_ee_oris_left,
198
+ ee_widths=smoothed_ee_widths_left)
199
+
200
+ # Save right hand trajectories if provided
201
+ if smoothed_ee_pts_right is not None:
202
+ smoothed_actions_right_path = str(paths.smoothed_actions_right).split(".npz")[0] + f"_{self.bimanual_setup}.npz"
203
+ np.savez(smoothed_actions_right_path,
204
+ ee_pts=smoothed_ee_pts_right,
205
+ ee_oris=smoothed_ee_oris_right,
206
+ ee_widths=smoothed_ee_widths_right)
207
+
208
+ @staticmethod
209
+ def gaussian_slerp_smoothing(rot_mats: np.ndarray, sigma: float = 2, kernel_size: int = 9) -> np.ndarray:
210
+ """
211
+ Apply Gaussian-weighted SLERP smoothing to rotation matrices.
212
+
213
+ Args:
214
+ rot_mats: Array of rotation matrices to smooth, shape (N, 3, 3)
215
+ sigma: Standard deviation for Gaussian kernel
216
+ kernel_size: Size of the smoothing kernel (should be odd)
217
+
218
+ Returns:
219
+ Array of smoothed rotation matrices, shape (N, 3, 3)
220
+
221
+ Raises:
222
+ ValueError: If kernel_size is not odd
223
+ """
224
+ if kernel_size % 2 != 1:
225
+ raise ValueError("Kernel size must be odd for proper centering")
226
+
227
+ half_k = kernel_size // 2
228
+ N = len(rot_mats)
229
+
230
+ # Step 1: Convert rotation matrices to quaternions for interpolation
231
+ quats = Rotation.from_matrix(rot_mats).as_quat()
232
+
233
+ # Step 2: Apply hemisphere correction to ensure quaternion continuity
234
+ quats_fixed = [quats[0]]
235
+ for i in range(1, N):
236
+ q = quats[i]
237
+ # Choose quaternion hemisphere that minimizes distance to previous quaternion
238
+ if np.dot(q, quats_fixed[-1]) < 0:
239
+ q = -q
240
+ quats_fixed.append(q)
241
+ quats_fixed = np.array(quats_fixed)
242
+
243
+ # Step 3: Prepare normalized Gaussian weights for local smoothing
244
+ weights = gaussian_kernel(kernel_size, sigma)
245
+
246
+ # Step 4: Apply weighted SLERP averaging for each time point
247
+ smoothed_rots = []
248
+ for i in range(N):
249
+ # Define local neighborhood around current time point
250
+ start = max(0, i - half_k)
251
+ end = min(N, i + half_k + 1)
252
+
253
+ # Extract local quaternions and corresponding weights
254
+ local_quats = quats_fixed[start:end]
255
+ local_weights = weights[half_k - (i - start): half_k + (end - i)].copy()  # copy the slice so the in-place normalization below cannot corrupt the shared kernel
256
+
257
+ # Normalize weights for current neighborhood
258
+ local_weights /= local_weights.sum()
259
+
260
+ # Initialize weighted average with first quaternion
261
+ q_avg = local_quats[0]
262
+ r_avg = Rotation.from_quat(q_avg)
263
+
264
+ # Iteratively apply weighted SLERP interpolation
265
+ for j in range(1, len(local_quats)):
266
+ r_next = Rotation.from_quat(local_quats[j])
267
+ # Use SLERP with weight proportional to current quaternion's contribution
268
+ r_avg = Slerp([0, 1], Rotation.concatenate([r_avg, r_next]))([local_weights[j] / (local_weights[:j+1].sum())])[0]
269
+
270
+ smoothed_rots.append(r_avg.as_matrix())
271
+
272
+ return np.stack(smoothed_rots)
273
+
274
+ @staticmethod
275
+ def gaussian_process_smoothing(pts: np.ndarray) -> np.ndarray:
276
+ """
277
+ Apply Gaussian process smoothing to trajectory points.
278
+
279
+ Args:
280
+ pts: Trajectory points to smooth, shape (N,) for 1D or (N, D) for multi-dimensional
281
+
282
+ Returns:
283
+ Smoothed trajectory points with same shape as input
284
+
285
+ Raises:
286
+ ValueError: If pts is empty
287
+ """
288
+ if len(pts) == 0:
289
+ raise ValueError("Cannot smooth empty trajectory")
290
+
291
+ # Create time indices as features for GP regression
292
+ time = np.arange(len(pts))[:, None] # Time as a single feature
293
+
294
+ # Configure GP kernel: RBF for smoothness + White noise for robustness
295
+ kernel = RBF(length_scale=1) + WhiteKernel(noise_level=1)
296
+ gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
297
+
298
+ # Handle 1D trajectory case
299
+ if pts.ndim == 1:
300
+ return gpr.fit(time, pts).predict(time)
301
+
302
+ # Handle multi-dimensional trajectory case by processing each dimension independently
303
+ return np.column_stack([gpr.fit(time, pts[:, i]).predict(time) for i in range(pts.shape[1])])
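For reference, the Gaussian-process smoothing pattern applied per trajectory dimension above can be reproduced in isolation. The sketch below is a minimal standalone illustration on synthetic data (it is not part of the diff); it only assumes numpy and scikit-learn, and the variable names are made up for the example.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

rng = np.random.default_rng(0)
t = np.arange(100)[:, None]                                        # time index as the single GP feature
noisy = np.sin(t[:, 0] / 15.0) + 0.05 * rng.standard_normal(100)   # noisy 1-D trajectory

kernel = RBF(length_scale=1) + WhiteKernel(noise_level=1)          # smooth trend plus a noise term
gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)
smoothed = gpr.fit(t, noisy).predict(t)                            # same fit/predict pattern as the processor
print(noisy[:3], smoothed[:3])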
phantom/phantom/twin_bimanual_robot.py ADDED
@@ -0,0 +1,597 @@
1
+ """
2
+ Virtual twin bimanual robot implementation for MuJoCo simulation.
3
+
4
+ This module provides a TwinBimanualRobot class that creates a virtual representation
5
+ of a bimanual (two-arm) robot system in MuJoCo using the robosuite framework.
6
+ The twin robot can be controlled via end-effector poses or joint positions and
7
+ provides observation data including RGB images, depth maps, and robot masks.
8
+ """
9
+
10
+ from collections import deque
11
+ import re
12
+ import cv2
13
14
+ import matplotlib.pyplot as plt
15
+ import numpy as np
16
+ from scipy.spatial.transform import Rotation
17
+ from dataclasses import dataclass
18
+ from typing import Tuple, Union, Any
19
+
20
+ from robosuite.controllers import load_controller_config # type: ignore
21
+ from robosuite.utils.camera_utils import get_real_depth_map # type: ignore
22
+ from robomimic.envs.env_robosuite import EnvRobosuite # type: ignore
23
+ import robomimic.utils.obs_utils as ObsUtils # type: ignore
24
+
25
+
26
+ @dataclass
27
+ class MujocoCameraParams:
28
+ """
29
+ Camera parameters for MuJoCo simulation.
30
+
31
+ Attributes:
32
+ name: Camera name identifier
33
+ pos: 3D position of camera in world coordinates
34
+ ori_wxyz: Camera orientation as quaternion (w, x, y, z)
35
+ fov: Field of view in degrees
36
+ resolution: Image resolution as (width, height)
37
+ sensorsize: Physical sensor size in mm
38
+ principalpixel: Principal point coordinates in pixels
39
+ focalpixel: Focal length in pixels
40
+ """
41
+ name: str
42
+ pos: np.ndarray
43
+ ori_wxyz: np.ndarray
44
+ fov: float
45
+ resolution: Tuple[int, int]
46
+ sensorsize: np.ndarray
47
+ principalpixel: np.ndarray
48
+ focalpixel: np.ndarray
49
+
50
+ # Color constants for visualization (RGBA format)
51
+ THUMB_COLOR = [0, 1, 0, 1] # Green for thumb
52
+ INDEX_COLOR = [1, 0, 0, 1] # Red for index finger
53
+ HAND_EE_COLOR = [0, 0, 1, 1] # Blue for hand end-effector
54
+
55
+ # Transformation matrix for Epic Kitchen setup - converts from base frame to robot frame
56
+ BASE_T_1 = np.array([[0.0, -1.0, 0.0, 0.0],
57
+ [ 0.5, 0.0, 0.866, 0.2],
58
+ [-0.866, 0.0, 0.5, 1.50],
59
+ [ 0.0, 0.0, 0.0, 1.0]])
60
+
61
+ def convert_real_camera_ori_to_mujoco(camera_ori_matrix: np.ndarray) -> np.ndarray:
62
+ """
63
+ Convert camera orientation from real world to MuJoCo XML format.
64
+
65
+ MuJoCo uses a different coordinate system convention, so we need to
66
+ flip the Y and Z axes of the rotation matrix before converting to quaternion.
67
+
68
+ Args:
69
+ camera_ori_matrix: 3x3 rotation matrix in real-world coordinates
70
+
71
+ Returns:
72
+ Camera orientation as quaternion in MuJoCo format (w, x, y, z)
73
+ """
74
+ camera_ori_matrix[:, [1, 2]] = -camera_ori_matrix[:, [1, 2]]
75
+ r = Rotation.from_matrix(camera_ori_matrix)
76
+ camera_ori_wxyz = r.as_quat(scalar_first=True)
77
+ return camera_ori_wxyz
78
+
79
+ class TwinBimanualRobot:
80
+ """
81
+ Virtual twin of a bimanual robot system in MuJoCo simulation.
82
+
83
+ This class creates a simulated bimanual robot that can be controlled via
84
+ end-effector poses or joint positions. It provides functionality for:
85
+ - Robot pose control (OSC or joint-level)
86
+ - Camera observation collection (RGB, depth, segmentation)
87
+ - Robot and gripper mask generation
88
+ - Observation history management
89
+ """
90
+
91
+ def __init__(self, robot_name: str, gripper_name: str, bimanual_setup: str,
92
+ camera_params: MujocoCameraParams, camera_height: int, camera_width: int,
93
+ render: bool, n_steps_short: int, n_steps_long: int, square: bool = False,
94
+ debug_cameras: list[str] = [], epic: bool = False, joint_controller: bool = False):
95
+ """
96
+ Initialize the bimanual robot twin.
97
+
98
+ Args:
99
+ robot_name: Type of robot (e.g., "Kinova3")
100
+ gripper_name: Type of gripper (e.g., "Robotiq85")
101
+ bimanual_setup: Configuration for bimanual setup
102
+ camera_params: Camera configuration parameters
103
+ camera_height: Height of camera images in pixels
104
+ camera_width: Width of camera images in pixels
105
+ render: Whether to render the simulation visually
106
+ n_steps_short: Number of simulation steps for quick movements
107
+ n_steps_long: Number of simulation steps for initial/slow movements
108
+ square: Whether to crop images to square aspect ratio
109
+ debug_cameras: Additional camera names for debugging views
110
+ epic: Whether to use Epic Kitchen coordinate system
111
+ joint_controller: Whether to use joint-level control instead of OSC
112
+ """
113
+ # Store configuration parameters
114
+ self.robot_name = robot_name
115
+ self.gripper_name = gripper_name
116
+ self.bimanual_setup = bimanual_setup
117
+ self.camera_params = camera_params
118
+ self.render = render
119
+ self.n_steps_long = n_steps_long
120
+ self.n_steps_short = n_steps_short
121
+ self.num_frames = 2 # Number of frames to keep in observation history
122
+ self.camera_height = camera_height
123
+ self.camera_width = camera_width
124
+ self.camera_name = "zed" # Main camera name
125
+ self.square = square
126
+ self.debug_cameras = list(debug_cameras) if debug_cameras else []
127
+ self.epic = epic # Epic Kitchen mode flag
128
+ self.joint_controller = joint_controller # Control mode flag
129
+
130
+ # Configure observation specifications for robomimic
131
+ obs_spec = dict(
132
+ obs=dict(
133
+ low_dim=["robot0_eef_pos"], # End-effector position observations
134
+ rgb=[f"{self.camera_params.name}_image"] + [f"{cam}_image" for cam in self.debug_cameras],
135
+ ),
136
+ )
137
+ ObsUtils.initialize_obs_utils_with_obs_specs(
138
+ obs_modality_specs=obs_spec)
139
+
140
+ # Configure robosuite environment options
141
+ options: dict[str, Union[str, list[str], dict[str, Any], bool, int, np.ndarray]] = {}
142
+ options["env_name"] = "PhantomBimanual"
143
+ options["bimanual_setup"] = bimanual_setup
144
+ options["robots"] = [self.robot_name, self.robot_name] # Two identical robots
145
+ if self.robot_name == "Kinova3":
146
+ options["gripper_types"] = [f"{self.gripper_name}GripperRealKinova", f"{self.gripper_name}GripperRealKinova"]
147
+ else:
148
+ options["gripper_types"] = [f"{self.gripper_name}Gripper", f"{self.gripper_name}Gripper"]
149
+
150
+ # Configure controller (OSC pose control by default)
151
+ controller_config = load_controller_config(default_controller="OSC_POSE")
152
+ controller_config["control_delta"] = False # Use absolute positioning
153
+ controller_config["uncouple_pos_ori"] = False # Couple position and orientation
154
+ options["controller_configs"] = controller_config
155
+
156
+ # Override with joint controller if specified
157
+ if self.joint_controller:
158
+ controller_config = load_controller_config(default_controller="JOINT_POSITION")
159
+ controller_config["input_type"] = "absolute"
160
+ controller_config["input_max"] = 10
161
+ controller_config["input_min"] = -10
162
+ controller_config["output_max"] = 10
163
+ controller_config["output_min"] = -10
164
+ controller_config["kd"] = 200 # Derivative gain
165
+ controller_config["kv"] = 200 # Velocity gain
166
+ controller_config["kp"] = 1000 # Proportional gain
167
+ controller_config["kp_limits"] = [0, 1000] # Proportional gain limits
168
+ options["controller_configs"] = controller_config
169
+
170
+ # Camera and observation settings
171
+ options["camera_heights"] = self.camera_height
172
+ options["camera_widths"] = self.camera_width
173
+ options["camera_segmentations"] = "instance" # Instance segmentation masks
174
+ options["direct_gripper_control"] = True
175
+ options["use_depth_obs"] = True
176
+
177
+ # Apply Epic Kitchen coordinate transformation if enabled
178
+ if self.epic:
179
+ self.base_T_1 = BASE_T_1
180
+ # Transform camera position and orientation to Epic Kitchen frame
181
+ self.camera_params.pos = self.base_T_1[:3, :3] @ self.camera_params.pos + self.base_T_1[:3, 3]
182
+ camera_ori_matrix = self.base_T_1[:3, :3] @ Rotation.from_quat(self.camera_params.ori_wxyz, scalar_first=True).as_matrix()
183
+ self.camera_params.ori_wxyz = Rotation.from_matrix(camera_ori_matrix).as_quat(scalar_first=True)
184
+
185
+ # Set camera parameters
186
+ options["camera_pos"] = self.camera_params.pos
187
+ options["camera_quat_wxyz"] = self.camera_params.ori_wxyz
188
+ options["camera_sensorsize"] = self.camera_params.sensorsize
189
+ options["camera_principalpixel"] = self.camera_params.principalpixel
190
+ options["camera_focalpixel"] = self.camera_params.focalpixel
191
+
192
+ # Create the robosuite environment
193
+ self.env = EnvRobosuite(
194
+ **options,
195
+ render=render,
196
+ render_offscreen=True, # Enable offscreen rendering for image capture
197
+ use_image_obs=True,
198
+ camera_names=[self.camera_params.name] + self.debug_cameras,
199
+ control_freq=20, # 20 Hz control frequency
200
+ )
201
+
202
+ # Initialize environment and compute robot base position
203
+ self.reset()
204
+ self.robot_base_pos = np.array([0, 0, self.env.env.robot_base_height+self.env.env.robot_base_offset])
205
+
206
+
207
+ def reset(self):
208
+ """Reset environment and clear observation history."""
209
+ self.env.reset()
210
+ self.obs_history = deque()
211
+
212
+ def close(self):
213
+ """Close the simulation environment."""
214
+ self.env.env.close()
215
+
216
+ def get_action_from_ee_pose(self, ee_pos: np.ndarray, ee_quat_xyzw: np.ndarray, gripper_action: float,
217
+ use_base_offset: bool = False) -> np.ndarray:
218
+ """
219
+ Convert end-effector pose to robot action vector.
220
+
221
+ This method transforms the desired end-effector position and orientation
222
+ into the action format expected by the robot controller.
223
+
224
+ Args:
225
+ ee_pos: End-effector position as 3D array
226
+ ee_quat_xyzw: End-effector orientation as quaternion (x, y, z, w)
227
+ gripper_action: Gripper action value
228
+ use_base_offset: Whether to add robot base offset to position
229
+
230
+ Returns:
231
+ Action vector [position(3), rotation(3), gripper(1)]
232
+ """
233
+ # Handle batch inputs by taking the last element
234
+ if ee_pos.ndim > 1:
235
+ ee_pos = ee_pos[-1]
236
+ ee_quat_xyzw = ee_quat_xyzw[-1]
237
+
238
+ # Add base offset if requested and not in Epic mode
239
+ if use_base_offset and not self.epic:
240
+ ee_pos = ee_pos + self.robot_base_pos
241
+
242
+ # Apply coordinate transformations based on mode
243
+ if self.epic:
244
+ # Transform position and orientation to Epic Kitchen coordinate frame
245
+ ee_pos = self.base_T_1[:3, 3] + self.base_T_1[:3, :3] @ ee_pos
246
+ axis_angle = Rotation.from_matrix(self.base_T_1[:3, :3] @ Rotation.from_quat(ee_quat_xyzw).as_matrix()).as_rotvec()
247
+ else:
248
+ # Apply 135-degree Z rotation for standard setup
249
+ rot = Rotation.from_quat(ee_quat_xyzw)
250
+ rot_135deg = Rotation.from_euler('z', 135, degrees=True)
251
+ new_rot = rot * rot_135deg
252
+ axis_angle = new_rot.as_rotvec()
253
+
254
+ # Combine into action vector
255
+ action = np.concatenate([ee_pos, axis_angle, [gripper_action]])
256
+
257
+ return action
258
+
259
+ def _get_initial_obs_history(self, state: dict) -> deque:
260
+ """
261
+ Initialize observation history by repeating the first observation.
262
+
263
+ This creates a history buffer filled with the initial robot state,
264
+ which is useful for algorithms that require temporal context.
265
+
266
+ Args:
267
+ state: Initial robot state dictionary
268
+
269
+ Returns:
270
+ Deque containing repeated initial observations
271
+ """
272
+ obs_history = deque(
273
+ [self.move_to_target_state(state, init=True)],
274
+ maxlen=self.num_frames,
275
+ )
276
+ # Fill remaining slots with copies of the initial observation
277
+ for _ in range(self.num_frames-1):
278
+ obs_history.append(self.move_to_target_state(state))
279
+ return obs_history
280
+
281
+ def get_obs_history(self, state: dict) -> list:
282
+ """
283
+ Get observation history with specified length.
284
+
285
+ Maintains a rolling buffer of recent observations for temporal context.
286
+
287
+ Args:
288
+ state: Current robot state dictionary
289
+
290
+ Returns:
291
+ List of recent observations (length = self.num_frames)
292
+ """
293
+ if len(self.obs_history) == 0:
294
+ # Initialize history if empty
295
+ self.obs_history = self._get_initial_obs_history(state)
296
+ else:
297
+ # Add new observation to history
298
+ self.obs_history.append(self.move_to_target_state(state))
299
+ return list(self.obs_history)
300
+
301
+ def move_to_target_state(self, state: dict, init=False) -> dict:
302
+ """
303
+ Move robot to target state and collect observation data.
304
+
305
+ This is the main method for controlling the robot and collecting observations.
306
+ It handles both pose and joint control modes, and collects RGB, depth,
307
+ and segmentation data along with tracking errors.
308
+
309
+ Args:
310
+ state: Target state containing positions, orientations, and gripper states
311
+ init: Whether this is an initialization step (uses longer movement time)
312
+
313
+ Returns:
314
+ Dictionary containing observation data:
315
+ - robot_mask: Binary mask showing robot pixels
316
+ - gripper_mask: Binary mask showing gripper pixels
317
+ - rgb_img: RGB camera image
318
+ - depth_img: Depth camera image
319
+ - robot_pos: Robot end-effector position
320
+ - left_pos_err: Left arm position tracking error
321
+ - right_pos_err: Right arm position tracking error
322
+ - {cam}_img: Additional camera images if debug_cameras specified
323
+ """
324
+ # Convert gripper positions to actions based on controller type
325
+ if not self.joint_controller:
326
+ # Use pose controller with gripper position mapping
327
+ gripper_action_0 = self._convert_handgripper_pos_to_action(state["gripper_pos"][0])
328
+ gripper_action_1 = self._convert_handgripper_pos_to_action(state["gripper_pos"][1])
329
+ gripper_action = [gripper_action_0, gripper_action_1]
330
+ else:
331
+ # Use joint controller with direct gripper control
332
+ gripper_action = [state["gripper_pos"][0]*255, state["gripper_pos"][1]*255]
333
+
334
+ # Choose movement duration based on whether this is initialization
335
+ n_steps = self.n_steps_long if init else self.n_steps_short
336
+
337
+ # Execute movement based on controller type
338
+ if not self.joint_controller:
339
+ # Move using pose control
340
+ obs = self.move_to_pose(state["pos"], state["ori_xyzw"], gripper_action, n_steps)
341
+ else:
342
+ # Move using joint control
343
+ obs = self.move_to_pose(state["pos"], state["ori_xyzw"], gripper_action, n_steps, state["q0"], state["q1"])
344
+
345
+ # Extract observation data from simulation
346
+ robot_mask = np.squeeze(self.get_robot_mask(obs))
347
+ gripper_mask = np.squeeze(self.get_gripper_mask(obs))
348
+ rgb_img = self.get_image(obs)
349
+ depth_img = self.get_depth_image(obs)
350
+ robot_pos = obs["robot0_eef_pos"] - self.robot_base_pos
351
+
352
+ # Calculate end-effector tracking errors for both arms
353
+ if not self.epic:
354
+ # Standard coordinate frame
355
+ right_pos_error = np.linalg.norm(obs['robot0_eef_pos']-self.robot_base_pos - state["pos"][0])
356
+ left_pos_error = np.linalg.norm(obs['robot1_eef_pos']-self.robot_base_pos - state["pos"][1])
357
+ else:
358
+ # Epic Kitchen coordinate frame
359
+ right_pos_error = np.linalg.norm(obs['robot0_eef_pos']-self.base_T_1[:3, 3] - self.base_T_1[:3, :3] @ state["pos"][0])
360
+ left_pos_error = np.linalg.norm(obs['robot1_eef_pos']-self.base_T_1[:3, 3] - self.base_T_1[:3, :3] @ state["pos"][1])
361
+
362
+ # Compile output dictionary
363
+ output = {
364
+ "robot_mask": robot_mask,
365
+ "gripper_mask": gripper_mask,
366
+ "rgb_img": rgb_img,
367
+ "depth_img": depth_img,
368
+ "robot_pos": robot_pos,
369
+ "left_pos_err": left_pos_error,
370
+ "right_pos_err": right_pos_error,
371
+ }
372
+
373
+ # Add debug camera images if specified
374
+ for cam in self.debug_cameras:
375
+ cam_img = self.get_camera_image(obs, cam)
376
+ output[f"{cam}_img"] = cam_img
377
+
378
+ return output
379
+
380
+ def _convert_handgripper_pos_to_action(self, gripper_pos: float) -> np.ndarray:
381
+ """
382
+ Convert hand gripper position to robot gripper action.
383
+
384
+ Maps from physical gripper opening distance to robot action values.
385
+ Different gripper types may have different mappings.
386
+
387
+ Args:
388
+ gripper_pos: Gripper opening distance in meters
389
+
390
+ Returns:
391
+ Robot gripper action value (0-255 for Robotiq85)
392
+
393
+ Raises:
394
+ ValueError: If gripper type is not supported
395
+ """
396
+ if self.gripper_name == "Robotiq85":
397
+ # Robotiq85 gripper specifications
398
+ min_gripper_pos, max_gripper_pos = 0.0, 0.085 # 0 to 8.5cm opening
399
+ gripper_pos = np.clip(gripper_pos, min_gripper_pos, max_gripper_pos)
400
+ open_gripper_action, closed_gripper_action = 0, 255 # 0=open, 255=closed
401
+ # Linear interpolation between open and closed states
402
+ return np.interp(gripper_pos, [min_gripper_pos, max_gripper_pos], [closed_gripper_action, open_gripper_action])
403
+ else:
404
+ raise ValueError(f"Gripper name {self.gripper_name} not supported")
405
+
406
+ def move_to_pose(self, ee_pos: dict, ee_ori: dict, gripper_action: dict, n_steps: int, q0=None, q1=None) -> dict:
407
+ """
408
+ Execute robot movement to target pose.
409
+
410
+ Sends action commands to the simulation for the specified number of steps.
411
+ Handles both pose control (OSC) and joint control modes.
412
+
413
+ Args:
414
+ ee_pos: End-effector positions for both arms {0: pos0, 1: pos1}
415
+ ee_ori: End-effector orientations for both arms {0: ori0, 1: ori1}
416
+ gripper_action: Gripper actions for both arms {0: grip0, 1: grip1}
417
+ n_steps: Number of simulation steps to execute
418
+ q0: Joint positions for arm 0 (only for joint controller)
419
+ q1: Joint positions for arm 1 (only for joint controller)
420
+
421
+ Returns:
422
+ Final observation dictionary from simulation
423
+ """
424
+ if not self.joint_controller:
425
+ # Pose control mode: convert poses to actions
426
+ action_0 = self.get_action_from_ee_pose(ee_pos[0], ee_ori[0], gripper_action[0], use_base_offset=True)
427
+ action_1 = self.get_action_from_ee_pose(ee_pos[1], ee_ori[1], gripper_action[1], use_base_offset=True)
428
+ action = np.concatenate([action_0, action_1])
429
+ else:
430
+ # Joint control mode: convert joint angles from degrees to radians
431
+ q0_new = []
432
+ for rot_q in q0:
433
+ if rot_q >= 180:
434
+ q0_new.append((rot_q/180*np.pi-2*np.pi)) # Handle angle wrapping
435
+ else:
436
+ q0_new.append(rot_q/180*np.pi)
437
+ q1_new = []
438
+ for rot_q in q1:
439
+ if rot_q >= 180:
440
+ q1_new.append((rot_q/180*np.pi-2*np.pi)) # Handle angle wrapping
441
+ else:
442
+ q1_new.append(rot_q/180*np.pi)
443
+
444
+ # Combine joint positions and gripper actions
445
+ action_0 = q0_new
446
+ action_1 = q1_new
447
+ action = np.concatenate([action_0, np.array(gripper_action[0]).reshape(1,), action_1, np.array(gripper_action[1]).reshape(1,)])
448
+
449
+ # Execute action for specified number of steps
450
+ for _ in range(n_steps):
451
+ obs, _, _, _ = self.env.step(action)
452
+ if self.render:
453
+ self.env.render()
454
+ return obs
455
+
456
+ def get_proprioception(self, obs: dict) -> np.ndarray:
457
+ """
458
+ Get proprioceptive information (robot's internal state).
459
+
460
+ Args:
461
+ obs: Observation dictionary from simulation
462
+
463
+ Returns:
464
+ End-effector position of first robot
465
+ """
466
+ pos = obs["robot0_eef_pos"]
467
+ return pos
468
+
469
+ def get_image(self, obs: dict) -> np.ndarray:
470
+ """
471
+ Extract RGB image from observation.
472
+
473
+ Handles image format conversion and optional square cropping.
474
+
475
+ Args:
476
+ obs: Observation dictionary containing image data
477
+
478
+ Returns:
479
+ RGB image as numpy array (H, W, 3)
480
+ """
481
+ img = obs[f"{self.camera_name}_image"]
482
+ img = img.transpose(1, 2, 0) # Convert from CHW to HWC format
483
+ height = img.shape[0]
484
+ width = img.shape[1]
485
+
486
+ # Crop to square if requested
487
+ if self.square:
488
+ n_remove = int((width - height)/2)
489
+ img = img[:,n_remove:-n_remove,:]
490
+ return img
491
+
492
+ def get_camera_image(self, obs: dict, camera_name: str) -> np.ndarray:
493
+ """
494
+ Extract RGB image from specific camera.
495
+
496
+ Args:
497
+ obs: Observation dictionary containing image data
498
+ camera_name: Name of the camera to extract image from
499
+
500
+ Returns:
501
+ RGB image as numpy array (H, W, 3)
502
+ """
503
+ img = obs[f"{camera_name}_image"]
504
+ img = img.transpose(1, 2, 0) # Convert from CHW to HWC format
505
+ height = img.shape[0]
506
+ width = img.shape[1]
507
+
508
+ # Crop to square if requested
509
+ if self.square:
510
+ n_remove = int((width - height)/2)
511
+ img = img[:,n_remove:-n_remove,:]
512
+ return img
513
+
514
+ def get_seg_image(self, obs: dict) -> np.ndarray:
515
+ """
516
+ Extract instance segmentation image.
517
+
518
+ Args:
519
+ obs: Observation dictionary containing segmentation data
520
+
521
+ Returns:
522
+ Segmentation image as uint8 array where each pixel value
523
+ represents a different object instance ID
524
+ """
525
+ img = obs[f"{self.camera_name}_segmentation_instance"]
526
+ height = img.shape[0]
527
+ width = img.shape[1]
528
+
529
+ # Crop to square if requested
530
+ if self.square:
531
+ n_remove = int((width - height)/2)
532
+ img = img[:,n_remove:-n_remove,:]
533
+ img = img.astype(np.uint8)
534
+ return img
535
+
536
+ def get_depth_image(self, obs: dict) -> np.ndarray:
537
+ """
538
+ Extract and process depth image.
539
+
540
+ Converts raw depth buffer to real-world depth values using
541
+ robosuite's depth processing utilities.
542
+
543
+ Args:
544
+ obs: Observation dictionary containing depth data
545
+
546
+ Returns:
547
+ Depth image as numpy array where values represent
548
+ distance in meters
549
+ """
550
+ img = obs[f"{self.camera_name}_depth"]
551
+ img = get_real_depth_map(sim=self.env.env.sim, depth_map=img)
552
+ height = img.shape[0]
553
+ width = img.shape[1]
554
+
555
+ # Crop to square if requested
556
+ if self.square:
557
+ n_remove = int((width - height)/2)
558
+ img = img[:,n_remove:-n_remove,:]
559
+ return img
560
+
561
+ def get_robot_mask(self, obs: dict) -> np.ndarray:
562
+ """
563
+ Generate binary mask for robot pixels.
564
+
565
+ Uses instance segmentation to identify which pixels belong to
566
+ the robot arms (instance IDs 1 and 4).
567
+
568
+ Args:
569
+ obs: Observation dictionary containing segmentation data
570
+
571
+ Returns:
572
+ Binary mask where 1 indicates robot pixels, 0 otherwise
573
+ """
574
+ seg_img = self.get_seg_image(obs)
575
+ mask = np.zeros_like(seg_img)
576
+ mask[seg_img == 1] = 1 # First robot arm
577
+ mask[seg_img == 4] = 1 # Second robot arm
578
+ return mask
579
+
580
+ def get_gripper_mask(self, obs: dict) -> np.ndarray:
581
+ """
582
+ Generate binary mask for gripper pixels.
583
+
584
+ Uses instance segmentation to identify which pixels belong to
585
+ the robot grippers (instance IDs 3 and 6).
586
+
587
+ Args:
588
+ obs: Observation dictionary containing segmentation data
589
+
590
+ Returns:
591
+ Binary mask where 1 indicates gripper pixels, 0 otherwise
592
+ """
593
+ seg_img = self.get_seg_image(obs)
594
+ mask = np.zeros_like(seg_img)
595
+ mask[seg_img == 3] = 1 # First gripper
596
+ mask[seg_img == 6] = 1 # Second gripper
597
+ return mask
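As a side note, the two small numeric conversions used above (Robotiq85 gripper width to action, and joint angles in degrees to wrapped radians) are easy to sanity-check in isolation. The sketch below is illustrative only and does not import the repo; the constants simply mirror the ones in the class.

import numpy as np

def gripper_width_to_action(width_m: float) -> float:
    # 0.0 m (closed) maps to action 255, 0.085 m (fully open) maps to action 0
    width_m = float(np.clip(width_m, 0.0, 0.085))
    return float(np.interp(width_m, [0.0, 0.085], [255, 0]))

def wrap_deg_to_rad(q_deg):
    # angles of 180 deg or more are wrapped into (-pi, pi], as in move_to_pose
    return [q / 180 * np.pi - 2 * np.pi if q >= 180 else q / 180 * np.pi for q in q_deg]

print(gripper_width_to_action(0.0425))   # ~127.5, i.e. roughly half closed
print(wrap_deg_to_rad([90, 270]))        # [pi/2, -pi/2]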
phantom/phantom/twin_robot.py ADDED
@@ -0,0 +1,490 @@
1
+ """
2
+ Virtual twin single-arm robot implementation for MuJoCo simulation.
3
+
4
+ This module provides a TwinRobot class that creates a virtual representation
5
+ of a single-arm robot system in MuJoCo using the robosuite framework.
6
+ The twin robot can be controlled via end-effector poses and provides
7
+ observation data including RGB images, depth maps, and robot masks.
8
+ """
9
+
10
+ from collections import deque
11
+ import cv2
12
+ import numpy as np
13
+ from scipy.spatial.transform import Rotation
14
+ from dataclasses import dataclass
15
+ from typing import Tuple, Union, Any
16
+
17
+ from robosuite.controllers import load_controller_config # type: ignore
18
+ from robosuite.utils.camera_utils import get_real_depth_map # type: ignore
19
+ from robomimic.envs.env_robosuite import EnvRobosuite # type: ignore
20
+ import robomimic.utils.obs_utils as ObsUtils # type: ignore
21
+
22
+
23
+ @dataclass
24
+ class MujocoCameraParams:
25
+ """
26
+ Camera parameters for MuJoCo simulation.
27
+
28
+ Attributes:
29
+ name: Camera name identifier
30
+ pos: 3D position of camera in world coordinates
31
+ ori_wxyz: Camera orientation as quaternion (w, x, y, z)
32
+ fov: Field of view in degrees
33
+ resolution: Image resolution as (width, height)
34
+ sensorsize: Physical sensor size in mm
35
+ principalpixel: Principal point coordinates in pixels
36
+ focalpixel: Focal length in pixels
37
+ """
38
+ name: str
39
+ pos: np.ndarray
40
+ ori_wxyz: np.ndarray
41
+ fov: float
42
+ resolution: Tuple[int, int]
43
+ sensorsize: np.ndarray
44
+ principalpixel: np.ndarray
45
+ focalpixel: np.ndarray
46
+
47
+ # Color constants for visualization (RGBA format)
48
+ THUMB_COLOR = [0, 1, 0, 1] # Green for thumb
49
+ INDEX_COLOR = [1, 0, 0, 1] # Red for index finger
50
+ HAND_EE_COLOR = [0, 0, 1, 1] # Blue for hand end-effector
51
+
52
+ def convert_real_camera_ori_to_mujoco(camera_ori_matrix: np.ndarray) -> np.ndarray:
53
+ """
54
+ Convert camera orientation from real world to MuJoCo XML format.
55
+
56
+ MuJoCo uses a different coordinate system convention, so we need to
57
+ flip the Y and Z axes of the rotation matrix before converting to quaternion.
58
+
59
+ Args:
60
+ camera_ori_matrix: 3x3 rotation matrix in real-world coordinates
61
+
62
+ Returns:
63
+ Camera orientation as quaternion in MuJoCo format (w, x, y, z)
64
+ """
65
+ camera_ori_matrix[:, [1, 2]] = -camera_ori_matrix[:, [1, 2]]
66
+ r = Rotation.from_matrix(camera_ori_matrix)
67
+ camera_ori_wxyz = r.as_quat(scalar_first=True)
68
+ return camera_ori_wxyz
69
+
70
+
71
+ class TwinRobot:
72
+ """
73
+ Virtual twin of a single-arm robot system in MuJoCo simulation.
74
+
75
+ This class creates a simulated single-arm robot that can be controlled via
76
+ end-effector poses. It provides functionality for:
77
+ - Robot pose control using OSC (Operational Space Control)
78
+ - Camera observation collection (RGB, depth, segmentation)
79
+ - Robot and gripper mask generation
80
+ - Observation history management
81
+ """
82
+
83
+ # Robot configuration constants
84
+ DEFAULT_ROBOT_BASE_POS = np.array([-0.56, 0, 0.912])
85
+
86
+ def __init__(self, robot_name: str, gripper_name: str, camera_params: MujocoCameraParams, camera_height: int, camera_width: int,
87
+ render: bool, n_steps_short: int, n_steps_long: int, debug_cameras: list[str] = [],
88
+ square: bool = False):
89
+ """
90
+ Initialize the single-arm robot twin.
91
+
92
+ Args:
93
+ robot_name: Type of robot (e.g., "Kinova3")
94
+ gripper_name: Type of gripper (e.g., "Robotiq85")
95
+ camera_params: Camera configuration parameters
96
+ camera_height: Height of camera images in pixels
97
+ camera_width: Width of camera images in pixels
98
+ render: Whether to render the simulation visually
99
+ n_steps_short: Number of simulation steps for quick movements
100
+ n_steps_long: Number of simulation steps for initial/slow movements
101
+ debug_cameras: Additional camera names for debugging views
102
+ square: Whether to crop images to square aspect ratio
103
+ """
104
+ # Store configuration parameters
105
+ self.robot_name = robot_name
106
+ self.gripper_name = gripper_name
107
+ self.camera_params = camera_params
108
+ self.render = render
109
+ self.n_steps_long = n_steps_long
110
+ self.n_steps_short = n_steps_short
111
+ self.num_frames = 2 # Number of frames to keep in observation history
112
+ self.camera_height = camera_height
113
+ self.camera_width = camera_width
114
+ self.camera_name = "frontview" # Main camera name for single-arm setup
115
+ self.square = square
116
+ self.debug_cameras = list(debug_cameras) if debug_cameras else []
117
+
118
+ # Configure observation specifications for robomimic
119
+ obs_spec = dict(
120
+ obs=dict(
121
+ low_dim=["robot0_eef_pos"], # End-effector position observations
122
+ rgb=[f"{self.camera_params.name}_image"] + [f"{cam}_image" for cam in self.debug_cameras],
123
+ ),
124
+ )
125
+ ObsUtils.initialize_obs_utils_with_obs_specs(
126
+ obs_modality_specs=obs_spec)
127
+
128
+ # Configure robosuite environment options
129
+ options: dict[str, Union[str, list[str], dict[str, Any], bool, int, np.ndarray]] = {}
130
+ options["env_name"] = "Phantom" # Single-arm environment
131
+ options["robots"] = [self.robot_name] # Single robot
132
+ options["gripper_types"] = [f"{self.gripper_name}Gripper"] # Single gripper
133
+
134
+ # Configure OSC pose controller
135
+ controller_config = load_controller_config(default_controller="OSC_POSE")
136
+ controller_config["control_delta"] = False # Use absolute positioning
137
+ controller_config["uncouple_pos_ori"] = False # Couple position and orientation
138
+ options["controller_configs"] = controller_config
139
+
140
+ # Camera and observation settings
141
+ options["camera_heights"] = self.camera_height
142
+ options["camera_widths"] = self.camera_width
143
+ options["camera_segmentations"] = "instance" # Instance segmentation masks
144
+ options["direct_gripper_control"] = True
145
+ options["use_depth_obs"] = True
146
+
147
+ # Set camera parameters
148
+ options["camera_pos"] = self.camera_params.pos
149
+ options["camera_quat_wxyz"] = self.camera_params.ori_wxyz
150
+ options["camera_sensorsize"] = self.camera_params.sensorsize
151
+ options["camera_principalpixel"] = self.camera_params.principalpixel
152
+ options["camera_focalpixel"] = self.camera_params.focalpixel
153
+
154
+ # Create the robosuite environment
155
+ self.env = EnvRobosuite(
156
+ **options,
157
+ render=render,
158
+ render_offscreen=True, # Enable offscreen rendering for image capture
159
+ use_image_obs=True,
160
+ camera_names=[self.camera_params.name] + self.debug_cameras,
161
+ control_freq=20, # 20 Hz control frequency
162
+ )
163
+
164
+ # Initialize environment and set robot base position
165
+ self.reset()
166
+ self.robot_base_pos = self.DEFAULT_ROBOT_BASE_POS # Fixed base position for single-arm setup
167
+
168
+ def reset(self):
169
+ """Reset environment and clear observation history."""
170
+ self.env.reset()
171
+ self.obs_history = deque()
172
+
173
+ def close(self):
174
+ """Close the simulation environment."""
175
+ self.env.env.close()
176
+
177
+ def get_action_from_ee_pose(self, ee_pos: np.ndarray, ee_quat_xyzw: np.ndarray, gripper_action: float,
178
+ use_base_offset: bool = False) -> np.ndarray:
179
+ """
180
+ Convert end-effector pose to robot action vector.
181
+
182
+ This method transforms the desired end-effector position and orientation
183
+ into the action format expected by the robot controller.
184
+
185
+ Args:
186
+ ee_pos: End-effector position as 3D array
187
+ ee_quat_xyzw: End-effector orientation as quaternion (x, y, z, w)
188
+ gripper_action: Gripper action value
189
+ use_base_offset: Whether to add robot base offset to position
190
+
191
+ Returns:
192
+ Action vector [position(3), rotation(3), gripper(1)]
193
+ """
194
+ # Handle batch inputs by taking the last element
195
+ if ee_pos.ndim > 1:
196
+ ee_pos = ee_pos[-1]
197
+ ee_quat_xyzw = ee_quat_xyzw[-1]
198
+
199
+ # Add base offset if requested
200
+ if use_base_offset:
201
+ ee_pos = ee_pos + self.robot_base_pos
202
+
203
+ # Apply -135 degree Z rotation for single-arm setup coordinate conversion
204
+ rot = Rotation.from_quat(ee_quat_xyzw)
205
+ rot_135deg = Rotation.from_euler('z', -135, degrees=True)
206
+ new_rot = rot * rot_135deg
207
+
208
+ # Convert rotation to axis-angle representation
209
+ # Note: commented lines show alternative approach using quaternion directly
210
+ # quat_rotated = rot_rotated135.as_quat()
211
+ # axis_angle = Rotation.from_quat(quat_rotated).as_rotvec()
212
+ axis_angle = new_rot.as_rotvec()
213
+
214
+ # Combine position, rotation, and gripper action into action vector
215
+ action = np.concatenate([ee_pos, axis_angle, [gripper_action]])
216
+
217
+ return action
218
+
219
+ def _get_initial_obs_history(self, state: dict) -> deque:
220
+ """
221
+ Initialize observation history by repeating the first observation.
222
+
223
+ This creates a history buffer filled with the initial robot state,
224
+ which is useful for algorithms that require temporal context.
225
+
226
+ Args:
227
+ state: Initial robot state dictionary
228
+
229
+ Returns:
230
+ Deque containing repeated initial observations
231
+ """
232
+ obs_history = deque(
233
+ [self.move_to_target_state(state, init=True)],
234
+ maxlen=self.num_frames,
235
+ )
236
+ # Fill remaining slots with copies of the initial observation
237
+ for _ in range(self.num_frames-1):
238
+ obs_history.append(self.move_to_target_state(state))
239
+ return obs_history
240
+
241
+ def get_obs_history(self, state: dict) -> list:
242
+ """
243
+ Get observation history with specified length.
244
+
245
+ Maintains a rolling buffer of recent observations for temporal context.
246
+
247
+ Args:
248
+ state: Current robot state dictionary
249
+
250
+ Returns:
251
+ List of recent observations (length = self.num_frames)
252
+ """
253
+ if len(self.obs_history) == 0:
254
+ # Initialize history if empty
255
+ self.obs_history = self._get_initial_obs_history(state)
256
+ else:
257
+ # Add new observation to history
258
+ self.obs_history.append(self.move_to_target_state(state))
259
+ return list(self.obs_history)
260
+
261
+ def move_to_target_state(self, state: dict, init=False) -> dict:
262
+ """
263
+ Move robot to target state and collect observation data.
264
+
265
+ Args:
266
+ state: Target state containing position, orientation, and gripper state
267
+ init: Whether this is an initialization step (uses longer movement time)
268
+
269
+ Returns:
270
+ Dictionary containing observation data:
271
+ - robot_mask: Binary mask showing robot pixels
272
+ - gripper_mask: Binary mask showing gripper pixels
273
+ - rgb_img: RGB camera image
274
+ - depth_img: Depth camera image
275
+ - robot_pos: Robot end-effector position relative to base
276
+ - pos_err: Position tracking error magnitude
277
+ - {cam}_img: Additional camera images if debug_cameras specified
278
+ """
279
+ # Convert gripper position to robot action
280
+ gripper_action = self._convert_handgripper_pos_to_action(state["gripper_pos"])
281
+
282
+ # Choose movement duration based on whether this is initialization
283
+ n_steps = self.n_steps_long if init else self.n_steps_short
284
+
285
+ # Execute movement to target pose
286
+ obs = self.move_to_pose(state["pos"], state["ori_xyzw"], float(gripper_action), n_steps)
287
+
288
+ # Extract observation data from simulation
289
+ robot_mask = np.squeeze(self.get_robot_mask(obs))
290
+ gripper_mask = np.squeeze(self.get_gripper_mask(obs))
291
+ rgb_img = self.get_image(obs)
292
+ depth_img = self.get_depth_image(obs)
293
+ robot_pos = obs["robot0_eef_pos"] - self.robot_base_pos
294
+ pos_error = np.linalg.norm(robot_pos - state["pos"])
295
+
296
+ # Compile output dictionary
297
+ output = {
298
+ "robot_mask": robot_mask,
299
+ "gripper_mask": gripper_mask,
300
+ "rgb_img": rgb_img,
301
+ "depth_img": depth_img,
302
+ "robot_pos": robot_pos,
303
+ "pos_err": pos_error,
304
+ }
305
+
306
+ # Add debug camera images if specified
307
+ for cam in self.debug_cameras:
308
+ cam_img = self.get_cam_image(obs, cam)
309
+ output[f"{cam}_img"] = cam_img
310
+
311
+ return output
312
+
313
+ def _convert_handgripper_pos_to_action(self, gripper_pos: float) -> np.ndarray:
314
+ """
315
+ Convert hand gripper position to robot gripper action.
316
+
317
+ Maps from physical gripper opening distance to robot action values.
318
+ Different gripper types may have different mappings.
319
+
320
+ Args:
321
+ gripper_pos: Gripper opening distance in meters
322
+
323
+ Returns:
324
+ Robot gripper action value (0-255 for Robotiq85)
325
+
326
+ Raises:
327
+ ValueError: If gripper type is not supported
328
+ """
329
+ if self.gripper_name == "Robotiq85":
330
+ # Robotiq85 gripper specifications
331
+ min_gripper_pos, max_gripper_pos = 0.0, 0.085 # 0 to 8.5cm opening
332
+ gripper_pos = np.clip(gripper_pos, min_gripper_pos, max_gripper_pos)
333
+ open_gripper_action, closed_gripper_action = 0, 255 # 0=open, 255=closed
334
+ # Linear interpolation between open and closed states
335
+ return np.interp(gripper_pos, [min_gripper_pos, max_gripper_pos], [closed_gripper_action, open_gripper_action])
336
+ else:
337
+ raise ValueError(f"Gripper name {self.gripper_name} not supported")
338
+
339
+ def move_to_pose(self, ee_pos: np.ndarray, ee_ori: np.ndarray, gripper_action: float, n_steps: int) -> dict:
340
+ """
341
+ Execute robot movement to target pose.
342
+
343
+ Sends action commands to the simulation for the specified number of steps.
344
+
345
+ Args:
346
+ ee_pos: End-effector position as 3D array
347
+ ee_ori: End-effector orientation as quaternion (x, y, z, w)
348
+ gripper_action: Gripper action value
349
+ n_steps: Number of simulation steps to execute
350
+
351
+ Returns:
352
+ Final observation dictionary from simulation
353
+ """
354
+ # Convert pose to action vector
355
+ action = self.get_action_from_ee_pose(ee_pos, ee_ori, gripper_action, use_base_offset=True)
356
+
357
+ # Execute action for specified number of steps
358
+ for _ in range(n_steps):
359
+ obs, _, _, _ = self.env.step(action)
360
+ if self.render:
361
+ self.env.render()
362
+ return obs
363
+
364
+ def get_image(self, obs: dict) -> np.ndarray:
365
+ """
366
+ Extract RGB image from observation.
367
+
368
+ Handles image format conversion and optional square cropping.
369
+
370
+ Args:
371
+ obs: Observation dictionary containing image data
372
+
373
+ Returns:
374
+ RGB image as numpy array (H, W, 3)
375
+ """
376
+ img = obs[f"{self.camera_name}_image"]
377
+ img = img.transpose(1, 2, 0) # Convert from CHW to HWC format
378
+ height = img.shape[0]
379
+ width = img.shape[1]
380
+
381
+ # Crop to square if requested
382
+ if self.square:
383
+ n_remove = int((width - height)/2)
384
+ img = img[:,n_remove:-n_remove,:]
385
+ return img
386
+
387
+ def get_cam_image(self, obs: dict, camera_name: str) -> np.ndarray:
388
+ """
389
+ Extract RGB image from specific camera.
390
+
391
+ Args:
392
+ obs: Observation dictionary containing image data
393
+ camera_name: Name of the camera to extract image from
394
+
395
+ Returns:
396
+ RGB image as numpy array (H, W, 3)
397
+ """
398
+ img = obs[f"{camera_name}_image"]
399
+ img = img.transpose(1, 2, 0) # Convert from CHW to HWC format
400
+ height = img.shape[0]
401
+ width = img.shape[1]
402
+
403
+ # Crop to square if requested
404
+ if self.square:
405
+ n_remove = int((width - height)/2)
406
+ img = img[:,n_remove:-n_remove,:]
407
+ return img
408
+
409
+ def get_seg_image(self, obs: dict) -> np.ndarray:
410
+ """
411
+ Extract instance segmentation image.
412
+
413
+ Args:
414
+ obs: Observation dictionary containing segmentation data
415
+
416
+ Returns:
417
+ Segmentation image as uint8 array where each pixel value
418
+ represents a different object instance ID
419
+ """
420
+ img = obs["frontview_segmentation_instance"] # Fixed camera name for single-arm
421
+ height = img.shape[0]
422
+ width = img.shape[1]
423
+
424
+ # Crop to square if requested
425
+ if self.square:
426
+ n_remove = int((width - height)/2)
427
+ img = img[:,n_remove:-n_remove,:]
428
+ img = img.astype(np.uint8)
429
+ return img
430
+
431
+ def get_depth_image(self, obs: dict) -> np.ndarray:
432
+ """
433
+ Extract and process depth image.
434
+
435
+ Converts raw depth buffer to real-world depth values using
436
+ robosuite's depth processing utilities.
437
+
438
+ Args:
439
+ obs: Observation dictionary containing depth data
440
+
441
+ Returns:
442
+ Depth image as numpy array where values represent
443
+ distance in meters
444
+ """
445
+ img = obs["frontview_depth"] # Fixed camera name for single-arm
446
+ img = get_real_depth_map(sim=self.env.env.sim, depth_map=img)
447
+ height = img.shape[0]
448
+ width = img.shape[1]
449
+
450
+ # Crop to square if requested
451
+ if self.square:
452
+ n_remove = int((width - height)/2)
453
+ img = img[:,n_remove:-n_remove,:]
454
+ return img
455
+
456
+ def get_robot_mask(self, obs: dict) -> np.ndarray:
457
+ """
458
+ Generate binary mask for robot pixels.
459
+
460
+ Uses instance segmentation to identify which pixels belong to
461
+ the robot arm (instance ID 1).
462
+
463
+ Args:
464
+ obs: Observation dictionary containing segmentation data
465
+
466
+ Returns:
467
+ Binary mask where 1 indicates robot pixels, 0 otherwise
468
+ """
469
+ seg_img = self.get_seg_image(obs)
470
+ mask = np.zeros_like(seg_img)
471
+ mask[seg_img == 1] = 1 # Robot arm
472
+ return mask
473
+
474
+ def get_gripper_mask(self, obs: dict) -> np.ndarray:
475
+ """
476
+ Generate binary mask for gripper pixels.
477
+
478
+ Uses instance segmentation to identify which pixels belong to
479
+ the robot gripper (instance ID 3).
480
+
481
+ Args:
482
+ obs: Observation dictionary containing segmentation data
483
+
484
+ Returns:
485
+ Binary mask where 1 indicates gripper pixels, 0 otherwise
486
+ """
487
+ seg_img = self.get_seg_image(obs)
488
+ mask = np.zeros_like(seg_img)
489
+ mask[seg_img == 3] = 1 # Gripper
490
+ return mask
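The Y/Z axis flip performed by convert_real_camera_ori_to_mujoco can be checked on the identity orientation. The standalone sketch below is not part of the repo; like the module itself, it assumes a SciPy version whose Rotation.as_quat accepts scalar_first.

import numpy as np
from scipy.spatial.transform import Rotation

cam_ori = np.eye(3)                        # camera axes in the real-world convention
cam_ori[:, [1, 2]] = -cam_ori[:, [1, 2]]   # MuJoCo's convention flips the Y and Z axes
quat_wxyz = Rotation.from_matrix(cam_ori).as_quat(scalar_first=True)
print(quat_wxyz)                           # [0, 1, 0, 0]: a 180-degree rotation about X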
phantom/phantom/utils/__init__.py ADDED
File without changes
phantom/phantom/utils/bbox_utils.py ADDED
@@ -0,0 +1,38 @@
1
+ import numpy as np
2
+ import numpy.typing as npt
3
+
4
+ def get_bbox_center(bbox: np.ndarray) -> np.ndarray:
5
+ """Calculate center point of bounding box."""
6
+ return np.array([(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2])
7
+
8
+
9
+ def get_bbox_area(bbox: np.ndarray) -> float:
10
+ """Get the area of a bounding box."""
11
+ return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
12
+
13
+
14
+ def get_overlap_score(bbox1: np.ndarray, bbox2: np.ndarray) -> float:
15
+ """ Get the overlap area between two boxes divided by the area of the smaller box """
16
+ area1 = get_bbox_area(bbox1)
17
+ area2 = get_bbox_area(bbox2)
18
+ overlap_area = get_overlap_area(bbox1, bbox2)
19
+ return overlap_area / min(area1, area2)
20
+
21
+ def get_overlap_area(bbox1: np.ndarray, bbox2: np.ndarray) -> float:
22
+ """ Get the overlap area between two boxes """
23
+ return max(0, min(bbox1[2], bbox2[2]) - max(bbox1[0], bbox2[0])) * max(0, min(bbox1[3], bbox2[3]) - max(bbox1[1], bbox2[1]))
24
+
25
+ def get_bbox_center_min_dist_to_edge(bboxes: npt.NDArray[np.float32], W: int, H: int) -> npt.NDArray[np.float32]:
26
+ """
27
+ Get the minimum distance of the bbox center to the edge of the image.
28
+ """
29
+ center_min_dist_to_edge_list = []
30
+ for bbox in bboxes:
31
+ x1, y1, x2, y2 = bbox
32
+ center = np.array([(x1 + x2) / 2, (y1 + y2) / 2])
33
+ min_dist_to_edge = min(center[0], center[1], W - center[0], H - center[1])
34
+ center_min_dist_to_edge_list.append(min_dist_to_edge)
35
+ return np.array(center_min_dist_to_edge_list)
36
+
37
+
38
+
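A quick worked example of the overlap score defined above, on two made-up boxes in xyxy format (standalone, not taken from the repo's data):

import numpy as np

b1 = np.array([0, 0, 10, 10])      # 10 x 10 box, area 100
b2 = np.array([5, 5, 20, 20])      # 15 x 15 box, area 225, overlapping the first

overlap = max(0, min(b1[2], b2[2]) - max(b1[0], b2[0])) * max(0, min(b1[3], b2[3]) - max(b1[1], b2[1]))
smaller = min((b1[2] - b1[0]) * (b1[3] - b1[1]), (b2[2] - b2[0]) * (b2[3] - b2[1]))
print(overlap / smaller)           # 25 / 100 = 0.25: overlap area over the smaller box's area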
phantom/phantom/utils/data_utils.py ADDED
@@ -0,0 +1,38 @@
1
+ import re
2
+ import os
3
+ import numpy as np
4
+ import pandas as pd
5
+ from pathlib import Path
6
+
7
+ def get_finger_poses_from_pkl(path: Path) -> dict:
8
+ """Get human finger poses from pkl file."""
9
+ finger_poses = pd.read_pickle(path)
10
+ thumb_poses = np.vstack(finger_poses["thumb"])
11
+ index_poses = np.vstack(finger_poses["index"])
12
+ hand_ee_poses = np.vstack(finger_poses["hand_ee"])
13
+ skeleton_poses = np.stack(finger_poses["skeleton"], axis=0)
14
+ hand_poses = np.stack(finger_poses["hand_pose"], axis=0)
15
+ all_global_orient = np.vstack(finger_poses["global_orient"])
16
+ data = {
17
+ "thumb": thumb_poses,
18
+ "index": index_poses,
19
+ "hand_ee": hand_ee_poses,
20
+ "skeleton": skeleton_poses,
21
+ "hand_pose": hand_poses,
22
+ "global_orient": all_global_orient
23
+ }
24
+ return data
25
+
26
+ def get_parent_folder_of_package(package_name: str) -> str:
27
+ # Import the package
28
+ package = __import__(package_name)
29
+
30
+ # Get the absolute path of the imported package
31
+ package_path = package.__file__
32
+ if package_path is None:
33
+ raise ValueError(f"Package {package_name} does not have a valid __file__ attribute")
34
+ package_path = os.path.abspath(package_path)
35
+
36
+ # Get the parent directory of the package directory
37
+ return os.path.dirname(os.path.dirname(package_path))
38
+
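get_parent_folder_of_package works for any importable package; the standalone illustration below uses the standard library's json package, and the printed path naturally depends on the local Python installation.

import os

package = __import__("json")
package_path = os.path.abspath(package.__file__)          # .../lib/pythonX.Y/json/__init__.py
print(os.path.dirname(os.path.dirname(package_path)))     # parent of the package directory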
phantom/phantom/utils/image_utils.py ADDED
@@ -0,0 +1,103 @@
1
+ import json
2
+ import numpy as np
3
+ import cv2
4
+ import os
5
+ import mediapy as media
6
+ from dataclasses import dataclass
7
+ from typing import Dict, List, Optional, Tuple
8
+
9
+ @dataclass
10
+ class BoundingBox:
11
+ xmin: int
12
+ ymin: int
13
+ xmax: int
14
+ ymax: int
15
+
16
+ @property
17
+ def xyxy(self) -> List[float]:
18
+ return [self.xmin, self.ymin, self.xmax, self.ymax]
19
+
20
+
21
+ @dataclass
22
+ class DetectionResult:
23
+ score: float
24
+ label: str
25
+ box: BoundingBox
26
+ mask: Optional[np.ndarray] = None
27
+
28
+ @classmethod
29
+ def from_dict(cls, detection_dict: Dict) -> "DetectionResult":
30
+ return cls(
31
+ score=detection_dict["score"],
32
+ label=detection_dict["label"],
33
+ box=BoundingBox(
34
+ xmin=detection_dict["box"]["xmin"],
35
+ ymin=detection_dict["box"]["ymin"],
36
+ xmax=detection_dict["box"]["xmax"],
37
+ ymax=detection_dict["box"]["ymax"],
38
+ ),
39
+ )
40
+
41
+ def get_transformation_matrix_from_extrinsics(camera_extrinsics: List[Dict]) -> np.ndarray:
42
+ """Get homogeneous transformation matrix from camera extrinsics."""
43
+ cam_base_pos = np.array(camera_extrinsics[0]["camera_base_pos"])
44
+ cam_base_ori = np.array(camera_extrinsics[0]["camera_base_ori"])
45
+ T_cam2robot = np.eye(4)
46
+ T_cam2robot[:3, 3] = cam_base_pos
47
+ T_cam2robot[:3, :3] = np.array(cam_base_ori).reshape(3, 3)
48
+ return T_cam2robot
49
+
50
+
51
+ def get_intrinsics_from_json(json_path: str) -> Tuple[np.ndarray, dict]:
52
+ with open(json_path, "r") as f:
53
+ camera_intrinsics = json.load(f)
54
+
55
+ # Get camera matrix
56
+ fx = camera_intrinsics["left"]["fx"]
57
+ fy = camera_intrinsics["left"]["fy"]
58
+ cx = camera_intrinsics["left"]["cx"]
59
+ cy = camera_intrinsics["left"]["cy"]
60
+ v_fov = camera_intrinsics["left"]["v_fov"]
61
+ intrinsics_matrix = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
62
+
63
+ intrinsics_dict = {
64
+ "fx": fx,
65
+ "fy": fy,
66
+ "cx": cx,
67
+ "cy": cy,
68
+ "v_fov": v_fov,
69
+ }
70
+
71
+ return intrinsics_matrix, intrinsics_dict
72
+
73
+ def resize_binary_image(image: np.ndarray, new_size: int) -> np.ndarray:
74
+ max_value = np.max(image)
75
+
76
+ # Resize the image
77
+ resized_image = cv2.resize(image, (new_size, new_size), interpolation=cv2.INTER_NEAREST)
78
+
79
+ if max_value == 1:
80
+ _, binary_image = cv2.threshold(resized_image, 0.5, 1, cv2.THRESH_BINARY)
81
+ else:
82
+ _, binary_image = cv2.threshold(resized_image, 127, 255, cv2.THRESH_BINARY)
83
+
84
+ return binary_image
85
+
86
+
87
+ def convert_video_to_images(video_path: str, save_folder: str, square=False, reverse=False):
88
+ """Save each frame of video as an image in save_folder."""
89
+ if not os.path.exists(save_folder):
90
+ os.makedirs(save_folder)
91
+
92
+ imgs = np.array(media.read_video(str(video_path)))
93
+ n_imgs = len(imgs)
94
+ if reverse:
95
+ imgs = imgs[::-1]
96
+ for idx in range(n_imgs):
97
+ img = imgs[idx]
98
+ if square:
99
+ delta = (img.shape[1] - img.shape[0]) // 2
100
+ img = img[:, delta:-delta, :]
101
+ media.write_image(f"{save_folder}/{idx:05d}.jpg", img)
102
+
103
+
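The intrinsics assembled in get_intrinsics_from_json form the usual 3x3 pinhole camera matrix. The sketch below uses placeholder values (not taken from the repo's JSON files) to show how such a matrix projects a camera-frame point to pixel coordinates:

import numpy as np

fx, fy, cx, cy = 1000.0, 1000.0, 960.0, 540.0   # placeholder intrinsics
K = np.array([[fx, 0, cx],
              [0, fy, cy],
              [0,  0,  1]])

pt_cam = np.array([0.1, -0.05, 2.0])            # a point in camera coordinates (meters)
u, v, w = K @ pt_cam
print(u / w, v / w)                             # pixel coordinates after the perspective divide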
phantom/phantom/utils/pcd_utils.py ADDED
@@ -0,0 +1,210 @@
1
+ import numpy as np
2
+ from typing import Tuple, Optional
3
+ import open3d as o3d # type: ignore
4
+ import trimesh
5
+ from sklearn.neighbors import NearestNeighbors # type: ignore
6
+
7
+ def preprocess_point_cloud(pcd: o3d.geometry.PointCloud,
8
+ voxel_size: float) -> Tuple[o3d.geometry.PointCloud, o3d.pipelines.registration.Feature]:
9
+ """
10
+ Downsample point cloud to desired voxel resolution and compute FPFH features.
11
+ """
12
+ pcd_down = pcd.voxel_down_sample(voxel_size)
13
+ radius_normal = voxel_size * 2
14
+ pcd_down.estimate_normals(o3d.geometry.KDTreeSearchParamHybrid(radius=radius_normal, max_nn=30))
15
+ radius_feature = voxel_size * 5
16
+ pcd_fpfh = o3d.pipelines.registration.compute_fpfh_feature(
17
+ pcd_down, o3d.geometry.KDTreeSearchParamHybrid(radius=radius_feature, max_nn=100))
18
+ return pcd_down, pcd_fpfh
19
+
20
+
21
+ def global_registration(source_pcd: o3d.geometry.PointCloud, target_pcd: o3d.geometry.PointCloud,
22
+ voxel_size: float) -> o3d.pipelines.registration.RegistrationResult:
23
+ """
24
+ Register two point clouds using global registration with RANSAC.
25
+ """
26
+ source_down, source_fpfh = preprocess_point_cloud(source_pcd, voxel_size)
27
+ target_down, target_fpfh = preprocess_point_cloud(target_pcd, voxel_size)
28
+
29
+ distance_threshold = voxel_size * 1.5
30
+ result_ransac = o3d.pipelines.registration.registration_ransac_based_on_feature_matching(
31
+ source_down, target_down, source_fpfh, target_fpfh, True,
32
+ distance_threshold,
33
+ o3d.pipelines.registration.TransformationEstimationPointToPoint(),
34
+ 4, # ransac_n: correspondences sampled per RANSAC iteration (not the iteration count)
35
+ [o3d.pipelines.registration.CorrespondenceCheckerBasedOnEdgeLength(0.9),
36
+ o3d.pipelines.registration.CorrespondenceCheckerBasedOnDistance(distance_threshold)],
37
+ o3d.pipelines.registration.RANSACConvergenceCriteria(4000000, 500))
38
+
39
+ return result_ransac
40
+
41
+
42
+ def icp_registration(source_pcd: o3d.geometry.PointCloud, target_pcd: o3d.geometry.PointCloud,
43
+ voxel_size: float=0.05, use_global_registration:bool=True,
44
+ init_transform:Optional[np.ndarray]=None) -> Tuple[o3d.geometry.PointCloud, np.ndarray]:
45
+ """
46
+ Register two point clouds using ICP algorithm.
47
+ """
48
+ # Optional global registration using RANSAC
49
+ if use_global_registration:
50
+ if init_transform is None:
51
+ result_ransac = global_registration(source_pcd, target_pcd, voxel_size)
52
+ init_transform = result_ransac.transformation
53
+ else:
54
+ init_transform = np.eye(4) if init_transform is None else init_transform
55
+
56
+ # Refine alignment using ICP
57
+ max_correspondence_distance = voxel_size * 5
58
+ result_icp = o3d.pipelines.registration.registration_icp(
59
+ source=source_pcd, target=target_pcd, max_correspondence_distance=max_correspondence_distance,
60
+ init=init_transform,
61
+ estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint())
62
+
63
+ if np.array_equal(init_transform, result_icp.transformation):
64
+ result_ransac = global_registration(source_pcd, target_pcd, voxel_size)
65
+ init_transform = result_ransac.transformation
66
+ result_icp = o3d.pipelines.registration.registration_icp(
67
+ source=source_pcd, target=target_pcd, max_correspondence_distance=max_correspondence_distance,
68
+ init=init_transform,
69
+ estimation_method=o3d.pipelines.registration.TransformationEstimationPointToPoint())
70
+
71
+ aligned_source_pcd = source_pcd.transform(result_icp.transformation)
72
+
73
+ return aligned_source_pcd, result_icp.transformation
74
+
75
+
76
+ def get_visible_points(mesh, origin: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
77
+ """
78
+ Return list of points in mesh that are visible from origin.
79
+ """
80
+ intersector = trimesh.ray.ray_triangle.RayMeshIntersector(mesh)
81
+ pts = mesh.vertices
82
+ vectors = pts - origin
83
+ directions = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
84
+ visible_triangle_indices = intersector.intersects_first(np.tile(origin, (pts.shape[0], 1)), directions)
85
+ visible_triangles = mesh.faces[visible_triangle_indices]
86
+ visible_vertex_indices = np.unique(visible_triangles)
87
+ visible_points = pts[visible_vertex_indices]
88
+ return np.array(visible_points).astype(np.float32), np.array(visible_vertex_indices)
89
+
90
+
91
+ def get_pcd_from_points(points: np.ndarray, colors: Optional[np.ndarray]=None) -> o3d.geometry.PointCloud:
92
+ """
93
+ Convert a list of points to an Open3D point cloud.
94
+ """
95
+ pcd = o3d.geometry.PointCloud()
96
+ pcd.points = o3d.utility.Vector3dVector(points)
97
+ if colors is not None:
98
+ pcd.colors = o3d.utility.Vector3dVector(colors)
99
+ pcd.remove_non_finite_points()
100
+ return pcd
101
+
102
+
103
+ def visualize_pcds(list_pcds: list, visible: bool=True) -> np.ndarray:
104
+ """
105
+ Visualize a list of point clouds.
106
+ """
107
+ visualization_image = None
108
+ vis = o3d.visualization.Visualizer()
109
+ vis.create_window(visible=visible)
110
+ opt = vis.get_render_option()
111
+ opt.background_color = np.asarray([0.2, 0.2, 0.2])
112
+ for pcd in list_pcds:
113
+ if pcd is not None:
114
+ vis.add_geometry(pcd)
115
+ vis.poll_events()
116
+ vis.update_renderer()
117
+ if not visible:
118
+ visualization_image = vis.capture_screen_float_buffer(do_render=True)
119
+ visualization_image = (255.0 * np.asarray(visualization_image)).astype(np.uint8)
120
+ if visible:
121
+ vis.run()
122
+ vis.destroy_window()
123
+ if visualization_image is None:
124
+ visualization_image = np.array([])
125
+ return visualization_image
126
+
127
+ def radius_outlier_detection(points: np.ndarray, radius: float=5,
128
+ min_neighbors: int=5) -> Tuple[np.ndarray, np.ndarray]:
129
+ """
130
+ Detect outliers in a point cloud using radius-based outlier detection.
131
+ """
132
+ # Fit the NearestNeighbors model
133
+ nbrs = NearestNeighbors(radius=radius).fit(points)
134
+
135
+ # Get the number of neighbors for each point within the specified radius
136
+ distances, indices = nbrs.radius_neighbors(points)
137
+
138
+ # Detect points with fewer neighbors than the minimum threshold
139
+ outliers_mask = np.array([len(neigh) < min_neighbors for neigh in indices])
140
+
141
+ outlier_pts = points[outliers_mask]
142
+
143
+ return outliers_mask, outlier_pts
144
+
145
+
146
+ def remove_outliers(pcd: o3d.geometry.PointCloud, radius: float=5,
147
+ min_neighbors: int=5) -> Tuple[o3d.geometry.PointCloud, np.ndarray]:
148
+ """
149
+ Remove outliers from a point cloud using radius-based outlier detection.
150
+ """
151
+ outlier_indices, outlier_pts = radius_outlier_detection(np.asarray(pcd.points),
152
+ radius=radius, min_neighbors=min_neighbors)
153
+ filtered_pts = np.asarray(pcd.points)[~outlier_indices]
154
+ filtered_colors = np.asarray(pcd.colors)[~outlier_indices]
155
+ filtered_pcd = get_pcd_from_points(filtered_pts, colors=filtered_colors)
156
+ return filtered_pcd, outlier_indices
157
+
158
+ def get_3D_points_from_pixels(pixels_2d: np.ndarray, depth_map: np.ndarray, intrinsics: dict) -> np.ndarray:
159
+ """
160
+ Convert an array of pixel coordinates and depth map to 3D points.
161
+ """
162
+ px = pixels_2d[:, 0]
163
+ py = pixels_2d[:, 1]
164
+
165
+ x = (px - intrinsics["cx"]) / intrinsics["fx"]
166
+ y = (py - intrinsics["cy"]) / intrinsics["fy"]
167
+
168
+ if len(depth_map.shape) == 3:
169
+ depth_map = depth_map[:, :, 0]
170
+
171
+ depth = depth_map[py, px]
172
+
173
+ X = x * depth
174
+ Y = y * depth
175
+
176
+ points_3d = np.stack((X, Y, depth), axis=1)
177
+ return points_3d
178
+
179
+ def get_point_cloud_of_segmask(mask: np.ndarray, depth_img: np.ndarray, img: np.ndarray,
180
+ intrinsics: dict, visualize: bool=False) -> o3d.geometry.PointCloud:
181
+ """
182
+ Return the point cloud that corresponds to the segmentation mask in the depth image.
183
+ """
184
+ idxs_y, idxs_x = mask.nonzero()
185
+ pixels_2d = np.stack((idxs_x, idxs_y), axis=1)
186
+ seg_points = get_3D_points_from_pixels(pixels_2d, depth_img, intrinsics)
187
+ seg_colors = img[idxs_y, idxs_x, :] / 255.0 # Normalize colors to [0, 1] for Open3D
188
+
189
+ pcd = get_pcd_from_points(seg_points, colors=seg_colors)
190
+
191
+ if visualize:
192
+ visualize_pcds([pcd])
193
+
194
+ return pcd
195
+
196
+ def get_bbox_of_3d_points(points: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
197
+ """
198
+ Return the bounding box of 3D points.
199
+ """
200
+ min_xyz = np.min(points, axis=0)
201
+ max_xyz = np.max(points, axis=0)
202
+ return min_xyz, max_xyz
203
+
204
+ def trim_pcd_to_bbox(pcd: o3d.geometry.PointCloud, bbox: Tuple[np.ndarray, np.ndarray]) -> o3d.geometry.PointCloud:
205
+ """
206
+ Trim a point cloud to the specified bounding box.
207
+ """
208
+ min_xyz, max_xyz = bbox
209
+ trimmed_pcd = pcd.crop(o3d.geometry.AxisAlignedBoundingBox(min_xyz, max_xyz))
210
+ return trimmed_pcd
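A hedged end-to-end sketch of how these helpers compose: back-project a segmentation mask into a coloured point cloud, prune stray points, then align it to a reference cloud. All inputs below are synthetic placeholders, and the radius/voxel values are illustrative rather than tuned.

import numpy as np

# Synthetic inputs: binary mask, depth in metres, RGB image, pinhole intrinsics.
H, W = 480, 640
mask = np.zeros((H, W), dtype=np.uint8)
mask[200:280, 300:380] = 1
depth_img = np.full((H, W), 0.8, dtype=np.float32)
img = np.random.randint(0, 255, (H, W, 3), dtype=np.uint8)
intrinsics = {"fx": 600.0, "fy": 600.0, "cx": W / 2, "cy": H / 2}

# 1. Masked pixels -> coloured point cloud.
obj_pcd = get_point_cloud_of_segmask(mask, depth_img, img, intrinsics)

# 2. Drop isolated points (radius in the same units as the depth map).
obj_pcd, _ = remove_outliers(obj_pcd, radius=0.02, min_neighbors=5)

# 3. Align to a reference cloud (e.g. sampled from an object mesh).
# aligned_pcd, T = icp_registration(obj_pcd, reference_pcd, voxel_size=0.01)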
phantom/phantom/utils/transform_utils.py ADDED
@@ -0,0 +1,43 @@
1
+ import numpy as np
2
+ import math
3
+
4
+ EPS = np.finfo(float).eps * 4.0
5
+
6
+ def transform_pts(pts: np.ndarray, T: np.ndarray) -> np.ndarray:
7
+ pts = np.hstack([pts, np.ones((len(pts), 1))])
8
+ pts = np.dot(T, pts.T).T
9
+ return pts[:, :3]
10
+
11
+ def project_point_to_plane(point: np.ndarray, plane_coeffs: np.ndarray) -> np.ndarray:
12
+ """
13
+ Projects a 3D point onto a plane defined by its coefficients.
14
+
15
+ Args:
16
+ point (array-like): Coordinates of the point to be projected (x0, y0, z0).
17
+ plane_coeffs (array-like): Coefficients of the plane (a, b, c, d) for ax + by + cz + d = 0.
18
+
19
+ Returns:
20
+ numpy.ndarray: The projected point's coordinates on the plane.
21
+ """
22
+ # Convert inputs to numpy arrays
23
+ point = np.array(point)
24
+ plane_coeffs = np.array(plane_coeffs)
25
+
26
+ # Extract the plane normal vector and constant term
27
+ normal = plane_coeffs[:3] # [a, b, c]
28
+ d = plane_coeffs[3]
29
+
30
+ # Normalize the plane normal vector
31
+ normal_magnitude = np.linalg.norm(normal)
32
+ if normal_magnitude == 0:
33
+ raise ValueError("Invalid plane coefficients: normal vector cannot have zero magnitude.")
34
+ normal /= normal_magnitude
35
+
36
+ # Calculate the signed distance from the point to the plane
37
+ distance = np.dot(normal, point) + d / normal_magnitude
38
+
39
+ # Project the point onto the plane
40
+ projected_point = point - distance * normal
41
+
42
+ return projected_point
43
+
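Two quick sanity checks for the helpers above; the numbers are made up.

import numpy as np

# Translate a point set by (1, 0, 0) with a homogeneous transform.
T = np.eye(4)
T[0, 3] = 1.0
pts = np.array([[0.0, 0.0, 0.0], [1.0, 2.0, 3.0]])
print(transform_pts(pts, T))  # -> [[1, 0, 0], [2, 2, 3]]

# Project a point onto the plane z = 0 (coefficients a, b, c, d of ax + by + cz + d = 0).
plane = np.array([0.0, 0.0, 2.0, 0.0])  # non-unit normal, to exercise the normalisation
print(project_point_to_plane([1.0, 2.0, 5.0], plane))  # -> [1, 2, 0]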
phantom/setup.py ADDED
@@ -0,0 +1,7 @@
1
+ import setuptools
2
+
3
+ setuptools.setup(
4
+ name="phantom",
5
+ version="0.1",
6
+ packages=setuptools.find_packages(exclude=["submodules", "submodules.*"]),
7
+ )
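With this setup.py, an editable install from the repository root (e.g. pip install -e phantom) should expose the package while find_packages keeps the submodules out of it. The import below is a sketch that assumes such an install has been done.

# After an editable install, the utilities above resolve as regular modules.
from phantom.utils import pcd_utils, transform_utils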
phantom/submodules/phantom-E2FGVI/.gitignore ADDED
@@ -0,0 +1,136 @@
1
+ # Customized
2
+ *.pth
3
+ *.pt
4
+ keys.txt
5
+ results/
6
+ .vscode/
7
+
8
+ # Byte-compiled / optimized / DLL files
9
+ __pycache__/
10
+ *.py[cod]
11
+ *$py.class
12
+
13
+ # C extensions
14
+ *.so
15
+
16
+ # Distribution / packaging
17
+ .Python
18
+ build/
19
+ develop-eggs/
20
+ dist/
21
+ downloads/
22
+ eggs/
23
+ .eggs/
24
+ lib/
25
+ lib64/
26
+ parts/
27
+ sdist/
28
+ var/
29
+ wheels/
30
+ pip-wheel-metadata/
31
+ share/python-wheels/
32
+ *.egg-info/
33
+ .installed.cfg
34
+ *.egg
35
+ MANIFEST
36
+
37
+ # PyInstaller
38
+ # Usually these files are written by a python script from a template
39
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
40
+ *.manifest
41
+ *.spec
42
+
43
+ # Installer logs
44
+ pip-log.txt
45
+ pip-delete-this-directory.txt
46
+
47
+ # Unit test / coverage reports
48
+ htmlcov/
49
+ .tox/
50
+ .nox/
51
+ .coverage
52
+ .coverage.*
53
+ .cache
54
+ nosetests.xml
55
+ coverage.xml
56
+ *.cover
57
+ *.py,cover
58
+ .hypothesis/
59
+ .pytest_cache/
60
+
61
+ # Translations
62
+ *.mo
63
+ *.pot
64
+
65
+ # Django stuff:
66
+ *.log
67
+ local_settings.py
68
+ db.sqlite3
69
+ db.sqlite3-journal
70
+
71
+ # Flask stuff:
72
+ instance/
73
+ .webassets-cache
74
+
75
+ # Scrapy stuff:
76
+ .scrapy
77
+
78
+ # Sphinx documentation
79
+ docs/_build/
80
+
81
+ # PyBuilder
82
+ target/
83
+
84
+ # Jupyter Notebook
85
+ .ipynb_checkpoints
86
+
87
+ # IPython
88
+ profile_default/
89
+ ipython_config.py
90
+
91
+ # pyenv
92
+ .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
102
+ __pypackages__/
103
+
104
+ # Celery stuff
105
+ celerybeat-schedule
106
+ celerybeat.pid
107
+
108
+ # SageMath parsed files
109
+ *.sage.py
110
+
111
+ # Environments
112
+ .env
113
+ .venv
114
+ env/
115
+ venv/
116
+ ENV/
117
+ env.bak/
118
+ venv.bak/
119
+
120
+ # Spyder project settings
121
+ .spyderproject
122
+ .spyproject
123
+
124
+ # Rope project settings
125
+ .ropeproject
126
+
127
+ # mkdocs documentation
128
+ /site
129
+
130
+ # mypy
131
+ .mypy_cache/
132
+ .dmypy.json
133
+ dmypy.json
134
+
135
+ # Pyre type checker
136
+ .pyre/
phantom/submodules/phantom-E2FGVI/E2FGVI/__init__.py ADDED
File without changes
phantom/submodules/phantom-E2FGVI/E2FGVI/configs/train_e2fgvi.json ADDED
@@ -0,0 +1,41 @@
1
+ {
2
+ "seed": 2021,
3
+ "save_dir": "release_model/",
4
+ "train_data_loader": {
5
+ "name": "youtube-vos",
6
+ "data_root": "datasets",
7
+ "w": 432,
8
+ "h": 240,
9
+ "num_local_frames": 5,
10
+ "num_ref_frames": 3
11
+ },
12
+ "losses": {
13
+ "hole_weight": 1,
14
+ "valid_weight": 1,
15
+ "flow_weight": 1,
16
+ "adversarial_weight": 0.01,
17
+ "GAN_LOSS": "hinge"
18
+ },
19
+ "model": {
20
+ "net": "e2fgvi",
21
+ "no_dis": 0
22
+ },
23
+ "trainer": {
24
+ "type": "Adam",
25
+ "beta1": 0,
26
+ "beta2": 0.99,
27
+ "lr": 1e-4,
28
+ "batch_size": 8,
29
+ "num_workers": 2,
30
+ "log_freq": 100,
31
+ "save_freq": 5e3,
32
+ "iterations": 50e4,
33
+ "scheduler": {
34
+ "type": "MultiStepLR",
35
+ "milestones": [
36
+ 40e4
37
+ ],
38
+ "gamma": 0.1
39
+ }
40
+ }
41
+ }
phantom/submodules/phantom-E2FGVI/E2FGVI/configs/train_e2fgvi_hq.json ADDED
@@ -0,0 +1,41 @@
1
+ {
2
+ "seed": 2021,
3
+ "save_dir": "release_model/",
4
+ "train_data_loader": {
5
+ "name": "youtube-vos",
6
+ "data_root": "datasets",
7
+ "w": 432,
8
+ "h": 240,
9
+ "num_local_frames": 5,
10
+ "num_ref_frames": 3
11
+ },
12
+ "losses": {
13
+ "hole_weight": 1,
14
+ "valid_weight": 1,
15
+ "flow_weight": 1,
16
+ "adversarial_weight": 0.01,
17
+ "GAN_LOSS": "hinge"
18
+ },
19
+ "model": {
20
+ "net": "e2fgvi_hq",
21
+ "no_dis": 0
22
+ },
23
+ "trainer": {
24
+ "type": "Adam",
25
+ "beta1": 0,
26
+ "beta2": 0.99,
27
+ "lr": 1e-4,
28
+ "batch_size": 8,
29
+ "num_workers": 2,
30
+ "log_freq": 100,
31
+ "save_freq": 5e3,
32
+ "iterations": 50e4,
33
+ "scheduler": {
34
+ "type": "MultiStepLR",
35
+ "milestones": [
36
+ 40e4
37
+ ],
38
+ "gamma": 0.1
39
+ }
40
+ }
41
+ }
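The two training configs are identical apart from model.net (e2fgvi vs. e2fgvi_hq). Below is a hedged sketch of reading one of them; values written as 5e3 / 50e4 / 40e4 parse as floats under json.load, so they need casting before use as step counts. The path is relative to the repository root as laid out in this commit.

import json

with open("phantom/submodules/phantom-E2FGVI/E2FGVI/configs/train_e2fgvi_hq.json") as f:
    cfg = json.load(f)

iterations = int(cfg["trainer"]["iterations"])  # 500000
save_freq = int(cfg["trainer"]["save_freq"])    # 5000
print(cfg["model"]["net"], iterations, save_freq)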