xinjie.wang committed on
Commit
7ecea11
·
1 Parent(s): 808531b
common.py CHANGED
@@ -189,7 +189,7 @@ os.makedirs(TMP_DIR, exist_ok=True)
 lighting_css = """
 <style>
 #lighter_mesh canvas {
-    filter: brightness(1.8) !important;
+    filter: brightness(1.9) !important;
 }
 </style>
 """
@@ -547,7 +547,9 @@ def extract_urdf(
 
     # Convert to URDF and recover attrs by GPT.
    filename = "sample"
-    urdf_convertor = URDFGenerator(GPT_CLIENT, render_view_num=4)
+    urdf_convertor = URDFGenerator(
+        GPT_CLIENT, render_view_num=4, decompose_convex=True
+    )
     asset_attrs = {
         "version": VERSION,
         "gs_model": f"{urdf_convertor.output_mesh_dir}/{filename}_gs.ply",
embodied_gen/data/convex_decomposer.py ADDED
@@ -0,0 +1,161 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+ import logging
18
+ import multiprocessing as mp
19
+ import os
20
+
21
+ import coacd
22
+ import trimesh
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ __all__ = [
27
+ "decompose_convex_coacd",
28
+ "decompose_convex_mesh",
29
+ "decompose_convex_mp",
30
+ ]
31
+
32
+
33
+ def decompose_convex_coacd(
34
+ filename: str, outfile: str, params: dict, verbose: bool = False
35
+ ) -> None:
36
+ coacd.set_log_level("info" if verbose else "warn")
37
+
38
+ mesh = trimesh.load(filename, force="mesh")
39
+ mesh = coacd.Mesh(mesh.vertices, mesh.faces)
40
+
41
+ result = coacd.run_coacd(mesh, **params)
42
+ combined = sum([trimesh.Trimesh(*m) for m in result])
43
+ combined.export(outfile)
44
+
45
+
46
+ def decompose_convex_mesh(
47
+ filename: str,
48
+ outfile: str,
49
+ threshold: float = 0.05,
50
+ max_convex_hull: int = -1,
51
+ preprocess_mode: str = "auto",
52
+ preprocess_resolution: int = 30,
53
+ resolution: int = 2000,
54
+ mcts_nodes: int = 20,
55
+ mcts_iterations: int = 150,
56
+ mcts_max_depth: int = 3,
57
+ pca: bool = False,
58
+ merge: bool = True,
59
+ seed: int = 0,
60
+ verbose: bool = False,
61
+ ) -> str:
62
+ """Decompose a mesh into convex parts using the CoACD algorithm."""
63
+ coacd.set_log_level("info" if verbose else "warn")
64
+
65
+ if os.path.exists(outfile):
66
+ logger.warning(f"Output file {outfile} already exists, removing it.")
67
+ os.remove(outfile)
68
+
69
+ params = dict(
70
+ threshold=threshold,
71
+ max_convex_hull=max_convex_hull,
72
+ preprocess_mode=preprocess_mode,
73
+ preprocess_resolution=preprocess_resolution,
74
+ resolution=resolution,
75
+ mcts_nodes=mcts_nodes,
76
+ mcts_iterations=mcts_iterations,
77
+ mcts_max_depth=mcts_max_depth,
78
+ pca=pca,
79
+ merge=merge,
80
+ seed=seed,
81
+ )
82
+
83
+ try:
84
+ decompose_convex_coacd(filename, outfile, params, verbose)
85
+ if os.path.exists(outfile):
86
+ return outfile
87
+ except Exception as e:
88
+ if verbose:
89
+ print(f"Decompose convex first attempt failed: {e}.")
90
+
91
+ if preprocess_mode != "on":
92
+ try:
93
+ params["preprocess_mode"] = "on"
94
+ decompose_convex_coacd(filename, outfile, params, verbose)
95
+ if os.path.exists(outfile):
96
+ return outfile
97
+ except Exception as e:
98
+ if verbose:
99
+ print(
100
+ f"Decompose convex second attempt with preprocess_mode='on' failed: {e}"
101
+ )
102
+
103
+ raise RuntimeError(f"Convex decomposition failed on {filename}")
104
+
105
+
106
+ def decompose_convex_mp(
107
+ filename: str,
108
+ outfile: str,
109
+ threshold: float = 0.05,
110
+ max_convex_hull: int = -1,
111
+ preprocess_mode: str = "auto",
112
+ preprocess_resolution: int = 30,
113
+ resolution: int = 2000,
114
+ mcts_nodes: int = 20,
115
+ mcts_iterations: int = 150,
116
+ mcts_max_depth: int = 3,
117
+ pca: bool = False,
118
+ merge: bool = True,
119
+ seed: int = 0,
120
+ verbose: bool = False,
121
+ ) -> str:
122
+ """Decompose a mesh into convex parts using the CoACD algorithm in a separate process.
123
+
124
+ See https://simulately.wiki/docs/toolkits/ConvexDecomp for details.
125
+ """
126
+ params = dict(
127
+ threshold=threshold,
128
+ max_convex_hull=max_convex_hull,
129
+ preprocess_mode=preprocess_mode,
130
+ preprocess_resolution=preprocess_resolution,
131
+ resolution=resolution,
132
+ mcts_nodes=mcts_nodes,
133
+ mcts_iterations=mcts_iterations,
134
+ mcts_max_depth=mcts_max_depth,
135
+ pca=pca,
136
+ merge=merge,
137
+ seed=seed,
138
+ )
139
+
140
+ ctx = mp.get_context("spawn")
141
+ p = ctx.Process(
142
+ target=decompose_convex_coacd,
143
+ args=(filename, outfile, params, verbose),
144
+ )
145
+ p.start()
146
+ p.join()
147
+ if p.exitcode == 0 and os.path.exists(outfile):
148
+ return outfile
149
+
150
+ if preprocess_mode != "on":
151
+ params["preprocess_mode"] = "on"
152
+ p = ctx.Process(
153
+ target=decompose_convex_coacd,
154
+ args=(filename, outfile, params, verbose),
155
+ )
156
+ p.start()
157
+ p.join()
158
+ if p.exitcode == 0 and os.path.exists(outfile):
159
+ return outfile
160
+
161
+ raise RuntimeError(f"Convex decomposition failed on {filename}")
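Usage sketch (not part of the commit) for the new module; the input and output paths below are placeholders. `decompose_convex_mesh` runs CoACD in-process with a `preprocess_mode="on"` retry, while `decompose_convex_mp` runs the same parameters in a spawned process so a native CoACD crash cannot take down the caller.

```python
# Hedged usage sketch; file paths are hypothetical.
from embodied_gen.data.convex_decomposer import (
    decompose_convex_mesh,
    decompose_convex_mp,
)

# In-process decomposition with automatic fallback retry.
collision_obj = decompose_convex_mesh(
    filename="outputs/sample/mesh.obj",
    outfile="outputs/sample/mesh_collision.obj",
    threshold=0.05,
)

# Same parameters, but CoACD runs in a spawned subprocess for isolation.
collision_obj = decompose_convex_mp(
    filename="outputs/sample/mesh.obj",
    outfile="outputs/sample/mesh_collision.obj",
)
```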
embodied_gen/data/mesh_operator.py CHANGED
@@ -16,13 +16,17 @@
 
 
 import logging
+import multiprocessing as mp
+import os
 from typing import Tuple, Union
 
+import coacd
 import igraph
 import numpy as np
 import pyvista as pv
 import spaces
 import torch
+import trimesh
 import utils3d
 from pymeshfix import _meshfix
 from tqdm import tqdm
@@ -33,7 +37,9 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 
-__all__ = ["MeshFixer"]
+__all__ = [
+    "MeshFixer",
+]
 
 
 def _radical_inverse(base, n):
embodied_gen/envs/pick_embodiedgen.py ADDED
@@ -0,0 +1,389 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+ import json
18
+ import os
19
+ from copy import deepcopy
20
+
21
+ import numpy as np
22
+ import sapien
23
+ import torch
24
+ import torchvision.transforms as transforms
25
+ from mani_skill.envs.sapien_env import BaseEnv
26
+ from mani_skill.sensors.camera import CameraConfig
27
+ from mani_skill.utils import sapien_utils
28
+ from mani_skill.utils.building import actors
29
+ from mani_skill.utils.registration import register_env
30
+ from mani_skill.utils.structs.actor import Actor
31
+ from mani_skill.utils.structs.pose import Pose
32
+ from mani_skill.utils.structs.types import (
33
+ GPUMemoryConfig,
34
+ SceneConfig,
35
+ SimConfig,
36
+ )
37
+ from mani_skill.utils.visualization.misc import tile_images
38
+ from tqdm import tqdm
39
+ from embodied_gen.models.gs_model import GaussianOperator
40
+ from embodied_gen.utils.enum import LayoutInfo, Scene3DItemEnum
41
+ from embodied_gen.utils.geometry import bfs_placement, quaternion_multiply
42
+ from embodied_gen.utils.log import logger
43
+ from embodied_gen.utils.process_media import alpha_blend_rgba
44
+ from embodied_gen.utils.simulation import (
45
+ SIM_COORD_ALIGN,
46
+ load_assets_from_layout_file,
47
+ )
48
+
49
+ __all__ = ["PickEmbodiedGen"]
50
+
51
+
52
+ @register_env("PickEmbodiedGen-v1", max_episode_steps=100)
53
+ class PickEmbodiedGen(BaseEnv):
54
+ SUPPORTED_ROBOTS = ["panda", "panda_wristcam", "fetch"]
55
+ goal_thresh = 0.0
56
+
57
+ def __init__(
58
+ self,
59
+ *args,
60
+ robot_uids: str | list[str] = "panda",
61
+ robot_init_qpos_noise: float = 0.02,
62
+ num_envs: int = 1,
63
+ reconfiguration_freq: int = None,
64
+ **kwargs,
65
+ ):
66
+ self.robot_init_qpos_noise = robot_init_qpos_noise
67
+ if reconfiguration_freq is None:
68
+ if num_envs == 1:
69
+ reconfiguration_freq = 1
70
+ else:
71
+ reconfiguration_freq = 0
72
+
73
+ # Init params from kwargs.
74
+ layout_file = kwargs.pop("layout_file", None)
75
+ replace_objs = kwargs.pop("replace_objs", True)
76
+ self.enable_grasp = kwargs.pop("enable_grasp", False)
77
+ self.init_quat = kwargs.pop("init_quat", [0.7071, 0, 0, 0.7071])
78
+ # Add small offset in z-axis to avoid collision.
79
+ self.objs_z_offset = kwargs.pop("objs_z_offset", 0.002)
80
+ self.robot_z_offset = kwargs.pop("robot_z_offset", 0.002)
81
+
82
+ self.layouts = self.init_env_layouts(
83
+ layout_file, num_envs, replace_objs
84
+ )
85
+ self.robot_pose = self.compute_robot_init_pose(
86
+ self.layouts, num_envs, self.robot_z_offset
87
+ )
88
+ self.env_actors = dict()
89
+ self.image_transform = transforms.PILToTensor()
90
+
91
+ super().__init__(
92
+ *args,
93
+ robot_uids=robot_uids,
94
+ reconfiguration_freq=reconfiguration_freq,
95
+ num_envs=num_envs,
96
+ **kwargs,
97
+ )
98
+
99
+ self.bg_images = dict()
100
+ if self.render_mode == "hybrid":
101
+ self.bg_images = self.render_gs3d_images(
102
+ self.layouts, num_envs, self.init_quat
103
+ )
104
+
105
+ @staticmethod
106
+ def init_env_layouts(
107
+ layout_file: str, num_envs: int, replace_objs: bool
108
+ ) -> list[LayoutInfo]:
109
+ layout = LayoutInfo.from_dict(json.load(open(layout_file, "r")))
110
+ layouts = []
111
+ for env_idx in range(num_envs):
112
+ if replace_objs and env_idx > 0:
113
+ layout = bfs_placement(deepcopy(layout))
114
+ layouts.append(layout)
115
+
116
+ return layouts
117
+
118
+ @staticmethod
119
+ def compute_robot_init_pose(
120
+ layouts: list[LayoutInfo], num_envs: int, z_offset: float = 0.0
121
+ ) -> list[list[float]]:
122
+ robot_pose = []
123
+ for env_idx in range(num_envs):
124
+ layout = layouts[env_idx]
125
+ robot_node = layout.relation[Scene3DItemEnum.ROBOT.value]
126
+ x, y, z, qx, qy, qz, qw = layout.position[robot_node]
127
+ robot_pose.append([x, y, z + z_offset, qw, qx, qy, qz])
128
+
129
+ return robot_pose
130
+
131
+ @property
132
+ def _default_sim_config(self):
133
+ return SimConfig(
134
+ scene_config=SceneConfig(
135
+ solver_position_iterations=30,
136
+ # contact_offset=0.04,
137
+ # rest_offset=0.001,
138
+ ),
139
+ # sim_freq=200,
140
+ control_freq=50,
141
+ gpu_memory_config=GPUMemoryConfig(
142
+ max_rigid_contact_count=2**20, max_rigid_patch_count=2**19
143
+ ),
144
+ )
145
+
146
+ @property
147
+ def _default_sensor_configs(self):
148
+ pose = sapien_utils.look_at(eye=[0.3, 0, 0.6], target=[-0.1, 0, 0.1])
149
+
150
+ return [
151
+ CameraConfig("base_camera", pose, 128, 128, np.pi / 2, 0.01, 100)
152
+ ]
153
+
154
+ @property
155
+ def _default_human_render_camera_configs(self):
156
+ pose = sapien_utils.look_at(
157
+ eye=[0.9, 0.0, 1.1], target=[0.0, 0.0, 0.9]
158
+ )
159
+
160
+ return CameraConfig(
161
+ "render_camera", pose, 256, 256, np.deg2rad(75), 0.01, 100
162
+ )
163
+
164
+ def _load_agent(self, options: dict):
165
+ super()._load_agent(options, sapien.Pose(p=[-10, 0, 10]))
166
+
167
+ def _load_scene(self, options: dict):
168
+ all_objects = []
169
+ logger.info("Loading assets and decomposing mesh collisions...")
170
+ for env_idx in range(self.num_envs):
171
+ env_actors = load_assets_from_layout_file(
172
+ self.scene,
173
+ self.layouts[env_idx],
174
+ z_offset=self.objs_z_offset,
175
+ init_quat=self.init_quat,
176
+ env_idx=env_idx,
177
+ )
178
+ self.env_actors[f"env{env_idx}"] = env_actors
179
+ all_objects.extend(env_actors.values())
180
+
181
+ self.obj = all_objects[-1]
182
+ for obj in all_objects:
183
+ self.remove_from_state_dict_registry(obj)
184
+
185
+ self.all_objects = Actor.merge(all_objects, name="all_objects")
186
+ self.add_to_state_dict_registry(self.all_objects)
187
+
188
+ self.goal_site = actors.build_sphere(
189
+ self.scene,
190
+ radius=self.goal_thresh,
191
+ color=[0, 1, 0, 0],
192
+ name="goal_site",
193
+ body_type="kinematic",
194
+ add_collision=False,
195
+ initial_pose=sapien.Pose(),
196
+ )
197
+ self._hidden_objects.append(self.goal_site)
198
+
199
+ def _initialize_episode(self, env_idx: torch.Tensor, options: dict):
200
+ with torch.device(self.device):
201
+ b = len(env_idx)
202
+ goal_xyz = torch.zeros((b, 3))
203
+ goal_xyz[:, :2] = torch.rand((b, 2)) * 0.2 - 0.1
204
+ self.goal_site.set_pose(Pose.create_from_pq(goal_xyz))
205
+
206
+ qpos = np.array(
207
+ [
208
+ 0.0,
209
+ np.pi / 8,
210
+ 0,
211
+ -np.pi * 3 / 8,
212
+ 0,
213
+ np.pi * 3 / 4,
214
+ np.pi / 4,
215
+ 0.04,
216
+ 0.04,
217
+ ]
218
+ )
219
+ qpos = (
220
+ np.random.normal(
221
+ 0, self.robot_init_qpos_noise, (self.num_envs, len(qpos))
222
+ )
223
+ + qpos
224
+ )
225
+ qpos[:, -2:] = 0.04
226
+ self.agent.robot.set_root_pose(np.array(self.robot_pose))
227
+ self.agent.reset(qpos)
228
+ self.agent.init_qpos = qpos
229
+ self.agent.controller.controllers["gripper"].reset()
230
+
231
+ def render_gs3d_images(
232
+ self, layouts: list[LayoutInfo], num_envs: int, init_quat: list[float]
233
+ ) -> dict[str, np.ndarray]:
234
+ sim_coord_align = (
235
+ torch.tensor(SIM_COORD_ALIGN).to(torch.float32).to(self.device)
236
+ )
237
+ cameras = self.scene.sensors.copy()
238
+ cameras.update(self.scene.human_render_cameras)
239
+
240
+ bg_node = layouts[0].relation[Scene3DItemEnum.BACKGROUND.value]
241
+ gs_path = os.path.join(layouts[0].assets[bg_node], "gs_model.ply")
242
+ raw_gs: GaussianOperator = GaussianOperator.load_from_ply(gs_path)
243
+ bg_images = dict()
244
+ for env_idx in tqdm(range(num_envs), desc="Pre-rendering Background"):
245
+ layout = layouts[env_idx]
246
+ x, y, z, qx, qy, qz, qw = layout.position[bg_node]
247
+ qx, qy, qz, qw = quaternion_multiply([qx, qy, qz, qw], init_quat)
248
+ init_pose = torch.tensor([x, y, z, qx, qy, qz, qw])
249
+ gs_model = raw_gs.get_gaussians(instance_pose=init_pose)
250
+ for key in cameras:
251
+ camera = cameras[key]
252
+ Ks = camera.camera.get_intrinsic_matrix() # (n_env, 3, 3)
253
+ c2w = camera.camera.get_model_matrix() # (n_env, 4, 4)
254
+ result = gs_model.render(
255
+ c2w[env_idx] @ sim_coord_align,
256
+ Ks[env_idx],
257
+ image_width=camera.config.width,
258
+ image_height=camera.config.height,
259
+ )
260
+ bg_images[f"{key}-env{env_idx}"] = result.rgb[..., ::-1]
261
+
262
+ return bg_images
263
+
264
+ def render(self):
265
+ if self.render_mode is None:
266
+ raise RuntimeError("render_mode is not set.")
267
+ if self.render_mode == "human":
268
+ return self.render_human()
269
+ elif self.render_mode == "rgb_array":
270
+ res = self.render_rgb_array()
271
+ return res
272
+ elif self.render_mode == "sensors":
273
+ res = self.render_sensors()
274
+ return res
275
+ elif self.render_mode == "all":
276
+ return self.render_all()
277
+ elif self.render_mode == "hybrid":
278
+ return self.hybrid_render()
279
+ else:
280
+ raise NotImplementedError(
281
+ f"Unsupported render mode {self.render_mode}."
282
+ )
283
+
284
+ def render_rgb_array(
285
+ self, camera_name: str = None, return_alpha: bool = False
286
+ ):
287
+ for obj in self._hidden_objects:
288
+ obj.show_visual()
289
+ self.scene.update_render(
290
+ update_sensors=False, update_human_render_cameras=True
291
+ )
292
+ images = []
293
+ render_images = self.scene.get_human_render_camera_images(
294
+ camera_name, return_alpha
295
+ )
296
+ for image in render_images.values():
297
+ images.append(image)
298
+ if len(images) == 0:
299
+ return None
300
+ if len(images) == 1:
301
+ return images[0]
302
+ for obj in self._hidden_objects:
303
+ obj.hide_visual()
304
+ return tile_images(images)
305
+
306
+ def render_sensors(self):
307
+ images = []
308
+ sensor_images = self.get_sensor_images()
309
+ for image in sensor_images.values():
310
+ for img in image.values():
311
+ images.append(img)
312
+ return tile_images(images)
313
+
314
+ def hybrid_render(self):
315
+ fg_images = self.render_rgb_array(
316
+ return_alpha=True
317
+ ) # (n_env, h, w, 3)
318
+ images = []
319
+ for key in self.bg_images:
320
+ if "render_camera" not in key:
321
+ continue
322
+ env_idx = int(key.split("-env")[-1])
323
+ rgba = alpha_blend_rgba(
324
+ fg_images[env_idx].cpu().numpy(), self.bg_images[key]
325
+ )
326
+ images.append(self.image_transform(rgba))
327
+
328
+ images = torch.stack(images, dim=0)
329
+ images = images.permute(0, 2, 3, 1)
330
+
331
+ return images[..., :3]
332
+
333
+ def evaluate(self):
334
+ obj_to_goal_pos = (
335
+ self.obj.pose.p
336
+ ) # self.goal_site.pose.p - self.obj.pose.p
337
+ is_obj_placed = (
338
+ torch.linalg.norm(obj_to_goal_pos, axis=1) <= self.goal_thresh
339
+ )
340
+ is_grasped = self.agent.is_grasping(self.obj)
341
+ is_robot_static = self.agent.is_static(0.2)
342
+
343
+ return dict(
344
+ is_grasped=is_grasped,
345
+ obj_to_goal_pos=obj_to_goal_pos,
346
+ is_obj_placed=is_obj_placed,
347
+ is_robot_static=is_robot_static,
348
+ is_grasping=self.agent.is_grasping(self.obj),
349
+ success=torch.logical_and(is_obj_placed, is_robot_static),
350
+ )
351
+
352
+ def _get_obs_extra(self, info: dict):
353
+
354
+ return dict()
355
+
356
+ def compute_dense_reward(self, obs: any, action: torch.Tensor, info: dict):
357
+ tcp_to_obj_dist = torch.linalg.norm(
358
+ self.obj.pose.p - self.agent.tcp.pose.p, axis=1
359
+ )
360
+ reaching_reward = 1 - torch.tanh(5 * tcp_to_obj_dist)
361
+ reward = reaching_reward
362
+
363
+ is_grasped = info["is_grasped"]
364
+ reward += is_grasped
365
+
366
+ # obj_to_goal_dist = torch.linalg.norm(
367
+ # self.goal_site.pose.p - self.obj.pose.p, axis=1
368
+ # )
369
+ obj_to_goal_dist = torch.linalg.norm(
370
+ self.obj.pose.p - self.obj.pose.p, axis=1
371
+ )
372
+ place_reward = 1 - torch.tanh(5 * obj_to_goal_dist)
373
+ reward += place_reward * is_grasped
374
+
375
+ reward += info["is_obj_placed"] * is_grasped
376
+
377
+ static_reward = 1 - torch.tanh(
378
+ 5
379
+ * torch.linalg.norm(self.agent.robot.get_qvel()[..., :-2], axis=1)
380
+ )
381
+ reward += static_reward * info["is_obj_placed"] * is_grasped
382
+
383
+ reward[info["success"]] = 6
384
+ return reward
385
+
386
+ def compute_normalized_dense_reward(
387
+ self, obs: any, action: torch.Tensor, info: dict
388
+ ):
389
+ return self.compute_dense_reward(obs=obs, action=action, info=info) / 6
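A minimal construction sketch for the newly registered environment (the layout path is a placeholder produced by the layout pipeline); the keyword arguments mirror the ones popped in `__init__` and the id registered via `register_env`.

```python
# Hedged sketch: build the registered environment; layout_file is hypothetical.
import gymnasium as gym

import embodied_gen.envs.pick_embodiedgen  # noqa: F401, registers "PickEmbodiedGen-v1"

env = gym.make(
    "PickEmbodiedGen-v1",
    num_envs=2,
    render_mode="hybrid",  # rasterized foreground blended over pre-rendered GS background
    layout_file="outputs/task_0000/layout.json",
    control_mode="pd_joint_pos",
)
obs, info = env.reset(seed=0)
```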
embodied_gen/models/gs_model.py CHANGED
@@ -51,17 +51,15 @@ class RenderResult:
 
     def __post_init__(self):
         if isinstance(self.rgb, torch.Tensor):
-            rgb = self.rgb.detach().cpu().numpy()
-            rgb = (rgb * 255).astype(np.uint8)
-            self.rgb = cv2.cvtColor(rgb, cv2.COLOR_BGR2RGB)
+            rgb = (self.rgb * 255).to(torch.uint8)
+            self.rgb = rgb.cpu().numpy()[..., ::-1]
         if isinstance(self.depth, torch.Tensor):
-            self.depth = self.depth.detach().cpu().numpy()
+            self.depth = self.depth.cpu().numpy()
         if isinstance(self.opacity, torch.Tensor):
-            opacity = self.opacity.detach().cpu().numpy()
-            opacity = (opacity * 255).astype(np.uint8)
-            self.opacity = cv2.cvtColor(opacity, cv2.COLOR_GRAY2RGB)
+            opacity = (self.opacity * 255).to(torch.uint8)
+            self.opacity = opacity.cpu().numpy()
         mask = np.where(self.opacity > self.mask_threshold, 255, 0)
-        self.mask = mask[..., 0:1].astype(np.uint8)
+        self.mask = mask.astype(np.uint8)
         self.rgba = np.concatenate([self.rgb, self.mask], axis=-1)
 
 
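The refactor replaces the OpenCV round trip with an on-tensor quantization plus a negative-stride channel flip. The two paths should agree, since `astype(np.uint8)` and `.to(torch.uint8)` both truncate; a small illustrative check (not part of the commit):

```python
import numpy as np
import torch

rgb = torch.rand(4, 4, 3)  # stand-in for a float render in [0, 1]

# Old path: to numpy, quantize, then swap channels (what cv2.COLOR_BGR2RGB does for 3 channels).
old = (rgb.detach().cpu().numpy() * 255).astype(np.uint8)[..., ::-1]
# New path: quantize on the tensor, then flip channels with a negative stride.
new = (rgb * 255).to(torch.uint8).cpu().numpy()[..., ::-1]

assert np.array_equal(old, new)
```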
embodied_gen/models/layout.py ADDED
@@ -0,0 +1,509 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+
18
+ import argparse
19
+ import json
20
+ import logging
21
+ import os
22
+ import re
23
+
24
+ import json_repair
25
+ from embodied_gen.utils.enum import (
26
+ LayoutInfo,
27
+ RobotItemEnum,
28
+ Scene3DItemEnum,
29
+ SpatialRelationEnum,
30
+ )
31
+ from embodied_gen.utils.gpt_clients import GPT_CLIENT, GPTclient
32
+ from embodied_gen.utils.process_media import SceneTreeVisualizer
33
+
34
+ logging.basicConfig(level=logging.INFO)
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ __all__ = [
39
+ "LayoutDesigner",
40
+ "LAYOUT_DISASSEMBLER",
41
+ "LAYOUT_GRAPHER",
42
+ "LAYOUT_DESCRIBER",
43
+ ]
44
+
45
+
46
+ DISTRACTOR_NUM = 3 # Maximum number of distractor objects allowed
47
+ LAYOUT_DISASSEMBLE_PROMPT = f"""
48
+ You are an intelligent 3D scene planner. Given a natural language
49
+ description of a robotic task, output a structured description of
50
+ an interactive 3D scene.
51
+
52
+ The output must include the following fields:
53
+ - task: A high-level task type (e.g., "single-arm pick",
54
+ "dual-arm grasping", "pick and place", "object sorting").
55
+ - {Scene3DItemEnum.ROBOT}: The name or type of robot involved. If not mentioned,
56
+ use {RobotItemEnum.FRANKA} as default.
57
+ - {Scene3DItemEnum.BACKGROUND}: The room or indoor environment where the task happens
58
+ (e.g., Kitchen, Bedroom, Living Room, Workshop, Office).
59
+ - {Scene3DItemEnum.CONTEXT}: An indoor object involved in the manipulation
60
+ (e.g., Table, Shelf, Desk, Bed, Cabinet).
61
+ - {Scene3DItemEnum.MANIPULATED_OBJS}: The main object(s) that the robot directly interacts with.
62
+ - {Scene3DItemEnum.DISTRACTOR_OBJS}: Other objects that naturally belong to the scene but are not part of the main task.
63
+
64
+ Constraints:
65
+ - The {Scene3DItemEnum.BACKGROUND} must logically match the described task.
66
+ - The {Scene3DItemEnum.CONTEXT} must fit within the {Scene3DItemEnum.BACKGROUND}. (e.g., a bedroom may include a table or bed, but not a workbench.)
67
+ - The {Scene3DItemEnum.CONTEXT} must be a concrete indoor object, such as a "table",
68
+ "shelf", "desk", or "bed". It must not be an abstract concept (e.g., "area", "space", "zone")
69
+ or structural surface (e.g., "floor", "ground"). If the input describes an interaction near
70
+ the floor or vague space, you must infer a plausible object like a "table", "cabinet", or "storage box" instead.
71
+ - {Scene3DItemEnum.MANIPULATED_OBJS} and {Scene3DItemEnum.DISTRACTOR_OBJS} objects must be plausible,
72
+ and semantically compatible with the {Scene3DItemEnum.CONTEXT} and {Scene3DItemEnum.BACKGROUND}.
73
+ - {Scene3DItemEnum.DISTRACTOR_OBJS} must not confuse or overlap with the manipulated objects.
74
+ - {Scene3DItemEnum.DISTRACTOR_OBJS} number limit: {DISTRACTOR_NUM} distractors maximum.
75
+ - All {Scene3DItemEnum.BACKGROUND} are limited to indoor environments.
76
+ - {Scene3DItemEnum.MANIPULATED_OBJS} and {Scene3DItemEnum.DISTRACTOR_OBJS} are rigid bodies and not include flexible objects.
77
+ - {Scene3DItemEnum.MANIPULATED_OBJS} and {Scene3DItemEnum.DISTRACTOR_OBJS} must be common
78
+ household or office items or furniture, not abstract concepts, not too small like needle.
79
+ - If the input includes a plural or grouped object (e.g., "pens", "bottles", "plates", "fruit"),
80
+ you must decompose it into multiple individual instances (e.g., ["pen", "pen"], ["apple", "pear"]).
81
+ - Containers that hold objects (e.g., "bowl of apples", "box of tools") must
82
+ be separated into individual items (e.g., ["bowl", "apple", "apple"]).
83
+ - Do not include transparent objects such as "glass", "plastic", etc.
84
+ - The output must be in compact JSON format and use Markdown syntax, just like the output in the example below.
85
+
86
+ Examples:
87
+
88
+ Input:
89
+ "Pick up the marker from the table and put it in the bowl robot {RobotItemEnum.UR5}."
90
+ Output:
91
+ ```json
92
+ {{
93
+ "task_desc": "Pick up the marker from the table and put it in the bowl.",
94
+ "task": "pick and place",
95
+ "{Scene3DItemEnum.ROBOT}": "{RobotItemEnum.UR5}",
96
+ "{Scene3DItemEnum.BACKGROUND}": "kitchen",
97
+ "{Scene3DItemEnum.CONTEXT}": "table",
98
+ "{Scene3DItemEnum.MANIPULATED_OBJS}": ["marker"],
99
+ "{Scene3DItemEnum.DISTRACTOR_OBJS}": ["mug", "notebook", "bowl"]
100
+ }}
101
+ ```
102
+
103
+ Input:
104
+ "Put the rubik's cube on the top of the shelf."
105
+ Output:
106
+ ```json
107
+ {{
108
+ "task_desc": "Put the rubik's cube on the top of the shelf.",
109
+ "task": "pick and place",
110
+ "{Scene3DItemEnum.ROBOT}": "{RobotItemEnum.FRANKA}",
111
+ "{Scene3DItemEnum.BACKGROUND}": "bedroom",
112
+ "{Scene3DItemEnum.CONTEXT}": "shelf",
113
+ "{Scene3DItemEnum.MANIPULATED_OBJS}": ["rubik's cube"],
114
+ "{Scene3DItemEnum.DISTRACTOR_OBJS}": ["pen", "cup", "toy car"]
115
+ }}
116
+ ```
117
+
118
+ Input:
119
+ "Remove all the objects from the white basket and put them on the table."
120
+ Output:
121
+ ```json
122
+ {{
123
+ "task_desc": "Remove all the objects from the white basket and put them on the table, robot {RobotItemEnum.PIPER}.",
124
+ "task": "pick and place",
125
+ "{Scene3DItemEnum.ROBOT}": "{RobotItemEnum.PIPER}",
126
+ "{Scene3DItemEnum.BACKGROUND}": "office",
127
+ "{Scene3DItemEnum.CONTEXT}": "table",
128
+ "{Scene3DItemEnum.MANIPULATED_OBJS}": ["banana", "mobile phone"],
129
+ "{Scene3DItemEnum.DISTRACTOR_OBJS}": ["plate", "white basket"]
130
+ }}
131
+ ```
132
+
133
+ Input:
134
+ "Pick up the rope on the chair and put it in the box."
135
+ Output:
136
+ ```json
137
+ {{
138
+ "task_desc": "Pick up the rope on the chair and put it in the box, robot {RobotItemEnum.FRANKA}.",
139
+ "task": "pick and place",
140
+ "{Scene3DItemEnum.ROBOT}": "{RobotItemEnum.FRANKA}",
141
+ "{Scene3DItemEnum.BACKGROUND}": "living room",
142
+ "{Scene3DItemEnum.CONTEXT}": "chair",
143
+ "{Scene3DItemEnum.MANIPULATED_OBJS}": ["rope", "box"],
144
+ "{Scene3DItemEnum.DISTRACTOR_OBJS}": ["magazine"]
145
+ }}
146
+ ```
147
+
148
+ Input:
149
+ "Pick up the seal tape and plastic from the counter and put them in the open drawer and close it."
150
+ Output:
151
+ ```json
152
+ {{
153
+ "task_desc": "Pick up the seal tape and plastic from the counter and put them in the open drawer and close it.",
154
+ "task": "pick and place",
155
+ "robot": "franka",
156
+ "background": "kitchen",
157
+ "context": "counter",
158
+ "manipulated_objs": ["seal tape", "plastic", "opened drawer"],
159
+ "distractor_objs": ["scissors"]
160
+ }}
161
+ ```
162
+
163
+ Input:
164
+ "Put the pens in the grey bowl."
165
+ Output:
166
+ ```json
167
+ {{
168
+ "task_desc": "Put the pens in the grey bowl.",
169
+ "task": "pick and place",
170
+ "robot": "franka",
171
+ "background": "office",
172
+ "context": "table",
173
+ "manipulated_objs": ["pen", "pen", "grey bowl"],
174
+ "distractor_objs": ["notepad", "cup"]
175
+ }}
176
+ ```
177
+
178
+ """
179
+
180
+
181
+ LAYOUT_HIERARCHY_PROMPT = f"""
182
+ You are a 3D scene layout reasoning expert.
183
+ Your task is to generate a spatial relationship dictionary in multiway tree
184
+ that describes how objects are arranged in a 3D environment
185
+ based on a given task description and object list.
186
+
187
+ Input in JSON format containing the task description, task type,
188
+ {Scene3DItemEnum.ROBOT}, {Scene3DItemEnum.BACKGROUND}, {Scene3DItemEnum.CONTEXT},
189
+ and a list of objects, including {Scene3DItemEnum.MANIPULATED_OBJS} and {Scene3DItemEnum.DISTRACTOR_OBJS}.
190
+
191
+ ### Supported Spatial Relations:
192
+ - "{SpatialRelationEnum.ON}": The child object bottom is directly on top of the parent object top.
193
+ - "{SpatialRelationEnum.INSIDE}": The child object is inside the context object.
194
+ - "{SpatialRelationEnum.IN}": The {Scene3DItemEnum.ROBOT} in the {Scene3DItemEnum.BACKGROUND}.
195
+ - "{SpatialRelationEnum.FLOOR}": The child object bottom is on the floor of the {Scene3DItemEnum.BACKGROUND}.
196
+
197
+ ### Rules:
198
+ - The {Scene3DItemEnum.CONTEXT} object must be "{SpatialRelationEnum.FLOOR}" the {Scene3DItemEnum.BACKGROUND}.
199
+ - {Scene3DItemEnum.MANIPULATED_OBJS} and {Scene3DItemEnum.DISTRACTOR_OBJS} must be either
200
+ "{SpatialRelationEnum.ON}" or "{SpatialRelationEnum.INSIDE}" the {Scene3DItemEnum.CONTEXT}
201
+ - Or "{SpatialRelationEnum.FLOOR}" {Scene3DItemEnum.BACKGROUND}.
202
+ - Use "{SpatialRelationEnum.INSIDE}" only if the parent is a container-like object (e.g., shelf, rack, cabinet).
203
+ - Do not define relationship edges between objects, only for the child and parent nodes.
204
+ - {Scene3DItemEnum.ROBOT} must "{SpatialRelationEnum.IN}" the {Scene3DItemEnum.BACKGROUND}.
205
+ - Ensure that each object appears only once in the layout tree, and its spatial relationship is defined with only one parent.
206
+ - Ensure a valid multiway tree structure with a maximum depth of 2 levels suitable for a 3D scene layout representation.
207
+ - Only output the final output in JSON format, using Markdown syntax as in examples.
208
+
209
+ ### Example
210
+ Input:
211
+ {{
212
+ "task_desc": "Pick up the marker from the table and put it in the bowl.",
213
+ "task": "pick and place",
214
+ "{Scene3DItemEnum.ROBOT}": "{RobotItemEnum.FRANKA}",
215
+ "{Scene3DItemEnum.BACKGROUND}": "kitchen",
216
+ "{Scene3DItemEnum.CONTEXT}": "table",
217
+ "{Scene3DItemEnum.MANIPULATED_OBJS}": ["marker", "bowl"],
218
+ "{Scene3DItemEnum.DISTRACTOR_OBJS}": ["mug", "chair"]
219
+ }}
220
+ Intermediate Think:
221
+ table {SpatialRelationEnum.FLOOR} kitchen
222
+ chair {SpatialRelationEnum.FLOOR} kitchen
223
+ {RobotItemEnum.FRANKA} {SpatialRelationEnum.IN} kitchen
224
+ marker {SpatialRelationEnum.ON} table
225
+ bowl {SpatialRelationEnum.ON} table
226
+ mug {SpatialRelationEnum.ON} table
227
+ Final Output:
228
+ ```json
229
+ {{
230
+ "kitchen": [
231
+ ["table", "{SpatialRelationEnum.FLOOR}"],
232
+ ["chair", "{SpatialRelationEnum.FLOOR}"],
233
+ ["{RobotItemEnum.FRANKA}", "{SpatialRelationEnum.IN}"]
234
+ ],
235
+ "table": [
236
+ ["marker", "{SpatialRelationEnum.ON}"],
237
+ ["bowl", "{SpatialRelationEnum.ON}"],
238
+ ["mug", "{SpatialRelationEnum.ON}"]
239
+ ]
240
+ }}
241
+ ```
242
+
243
+ Input:
244
+ {{
245
+ "task_desc": "Put the marker on top of the book.",
246
+ "task": "pick and place",
247
+ "{Scene3DItemEnum.ROBOT}": "{RobotItemEnum.UR5}",
248
+ "{Scene3DItemEnum.BACKGROUND}": "office",
249
+ "{Scene3DItemEnum.CONTEXT}": "desk",
250
+ "{Scene3DItemEnum.MANIPULATED_OBJS}": ["marker", "book"],
251
+ "{Scene3DItemEnum.DISTRACTOR_OBJS}": ["pen holder", "notepad"]
252
+ }}
253
+ Intermediate Think:
254
+ desk {SpatialRelationEnum.FLOOR} office
255
+ {RobotItemEnum.UR5} {SpatialRelationEnum.IN} office
256
+ marker {SpatialRelationEnum.ON} desk
257
+ book {SpatialRelationEnum.ON} desk
258
+ pen holder {SpatialRelationEnum.ON} desk
259
+ notepad {SpatialRelationEnum.ON} desk
260
+ Final Output:
261
+ ```json
262
+ {{
263
+ "office": [
264
+ ["desk", "{SpatialRelationEnum.FLOOR}"],
265
+ ["{RobotItemEnum.UR5}", "{SpatialRelationEnum.IN}"]
266
+ ],
267
+ "desk": [
268
+ ["marker", "{SpatialRelationEnum.ON}"],
269
+ ["book", "{SpatialRelationEnum.ON}"],
270
+ ["pen holder", "{SpatialRelationEnum.ON}"],
271
+ ["notepad", "{SpatialRelationEnum.ON}"]
272
+ ]
273
+ }}
274
+ ```
275
+
276
+ Input:
277
+ {{
278
+ "task_desc": "Put the rubik's cube on the top of the shelf.",
279
+ "task": "pick and place",
280
+ "{Scene3DItemEnum.ROBOT}": "{RobotItemEnum.UR5}",
281
+ "{Scene3DItemEnum.BACKGROUND}": "bedroom",
282
+ "{Scene3DItemEnum.CONTEXT}": "shelf",
283
+ "{Scene3DItemEnum.MANIPULATED_OBJS}": ["rubik's cube"],
284
+ "{Scene3DItemEnum.DISTRACTOR_OBJS}": ["toy car", "pen"]
285
+ }}
286
+ Intermediate Think:
287
+ shelf {SpatialRelationEnum.FLOOR} bedroom
288
+ {RobotItemEnum.UR5} {SpatialRelationEnum.IN} bedroom
289
+ rubik's cube {SpatialRelationEnum.INSIDE} shelf
290
+ toy car {SpatialRelationEnum.INSIDE} shelf
291
+ pen {SpatialRelationEnum.INSIDE} shelf
292
+ Final Output:
293
+ ```json
294
+ {{
295
+ "bedroom": [
296
+ ["shelf", "{SpatialRelationEnum.FLOOR}"],
297
+ ["{RobotItemEnum.UR5}", "{SpatialRelationEnum.IN}"]
298
+ ],
299
+ "shelf": [
300
+ ["rubik's cube", "{SpatialRelationEnum.INSIDE}"],
301
+ ["toy car", "{SpatialRelationEnum.INSIDE}"],
302
+ ["pen", "{SpatialRelationEnum.INSIDE}"]
303
+ ]
304
+ }}
305
+ ```
306
+
307
+ Input:
308
+ {{
309
+ "task_desc": "Put the marker in the cup on the counter.",
310
+ "task": "pick and place",
311
+ "robot": "franka",
312
+ "background": "kitchen",
313
+ "context": "counter",
314
+ "manipulated_objs": ["marker", "cup"],
315
+ "distractor_objs": ["plate", "spoon"]
316
+ }}
317
+ Intermediate Think:
318
+ counter {SpatialRelationEnum.FLOOR} kitchen
319
+ {RobotItemEnum.FRANKA} {SpatialRelationEnum.IN} kitchen
320
+ marker {SpatialRelationEnum.ON} counter
321
+ cup {SpatialRelationEnum.ON} counter
322
+ plate {SpatialRelationEnum.ON} counter
323
+ spoon {SpatialRelationEnum.ON} counter
324
+ Final Output:
325
+ ```json
326
+ {{
327
+ "kitchen": [
328
+ ["counter", "{SpatialRelationEnum.FLOOR}"],
329
+ ["{RobotItemEnum.FRANKA}", "{SpatialRelationEnum.IN}"]
330
+ ],
331
+ "counter": [
332
+ ["marker", "{SpatialRelationEnum.ON}"],
333
+ ["cup", "{SpatialRelationEnum.ON}"],
334
+ ["plate", "{SpatialRelationEnum.ON}"],
335
+ ["spoon", "{SpatialRelationEnum.ON}"]
336
+ ]
337
+ }}
338
+ ```
339
+ """
340
+
341
+
342
+ LAYOUT_DESCRIBER_PROMPT = """
343
+ You are a 3D asset style descriptor.
344
+
345
+ Given a task description and a dictionary where the key is the object content and
346
+ the value is the object type, output a JSON dictionary with each object paired
347
+ with a concise, styled visual description suitable for 3D asset generation.
348
+
349
+ Generation Guidelines:
350
+ - For each object, brainstorm multiple style candidates before selecting the final
351
+ description. Vary phrasing, material, texture, color, and spatial details.
352
+ - Each description must be a maximum of 15 words, including color, style, materials.
353
+ - Descriptions should be visually grounded, specific, and reflect surface texture and structure.
354
+ - For objects marked as "context", explicitly mention the object is standalone, has an empty top.
355
+ - Use rich style descriptors: e.g., "scratched brown wooden desk" etc.
356
+ - Ensure all object styles align with the task's overall context and environment.
357
+
358
+ Format your output in JSON like the example below.
359
+
360
+ Example Input:
361
+ "Pick up the rope on the chair and put it in the box. {'living room': 'background', 'chair': 'context',
362
+ 'rope': 'manipulated_objs', 'box': 'manipulated_objs', 'magazine': 'distractor_objs'}"
363
+
364
+ Example Output:
365
+ ```json
366
+ {
367
+ "living room": "modern cozy living room with soft sunlight and light grey carpet",
368
+ "chair": "standalone dark oak chair with no surroundings and clean empty seat",
369
+ "rope": "twisted hemp rope with rough fibers and dusty beige texture",
370
+ "box": "slightly crumpled cardboard box with open flaps and brown textured surface",
371
+ "magazine": "celebrity magazine with glossy red cover and large bold title"
372
+ }
373
+ ```
374
+ """
375
+
376
+
377
+ class LayoutDesigner(object):
378
+ def __init__(
379
+ self,
380
+ gpt_client: GPTclient,
381
+ system_prompt: str,
382
+ verbose: bool = False,
383
+ ) -> None:
384
+ self.prompt = system_prompt.strip()
385
+ self.verbose = verbose
386
+ self.gpt_client = gpt_client
387
+
388
+ def query(self, prompt: str, params: dict = None) -> str:
389
+ full_prompt = self.prompt + f"\n\nInput:\n\"{prompt}\""
390
+
391
+ response = self.gpt_client.query(
392
+ text_prompt=full_prompt,
393
+ params=params,
394
+ )
395
+
396
+ if self.verbose:
397
+ logger.info(f"Response: {response}")
398
+
399
+ return response
400
+
401
+ def format_response(self, response: str) -> dict:
402
+ cleaned = re.sub(r"^```json\s*|\s*```$", "", response.strip())
403
+ try:
404
+ output = json.loads(cleaned)
405
+ except json.JSONDecodeError as e:
406
+ raise json.JSONDecodeError(
407
+ f"Failed to parse JSON response: {response}", e.doc, e.pos
408
+ ) from e
409
+
410
+ return output
411
+
412
+ def format_response_repair(self, response: str) -> dict:
413
+ return json_repair.loads(response)
414
+
415
+ def save_output(self, output: dict, save_path: str) -> None:
416
+ os.makedirs(os.path.dirname(save_path), exist_ok=True)
417
+ with open(save_path, 'w') as f:
418
+ json.dump(output, f, indent=4)
419
+
420
+ def __call__(
421
+ self, prompt: str, save_path: str = None, params: dict = None
422
+ ) -> dict | str:
423
+ response = self.query(prompt, params=params)
424
+ output = self.format_response_repair(response)
425
+ self.save_output(output, save_path) if save_path else None
426
+
427
+ return output
428
+
429
+
430
+ LAYOUT_DISASSEMBLER = LayoutDesigner(
431
+ gpt_client=GPT_CLIENT, system_prompt=LAYOUT_DISASSEMBLE_PROMPT
432
+ )
433
+ LAYOUT_GRAPHER = LayoutDesigner(
434
+ gpt_client=GPT_CLIENT, system_prompt=LAYOUT_HIERARCHY_PROMPT
435
+ )
436
+ LAYOUT_DESCRIBER = LayoutDesigner(
437
+ gpt_client=GPT_CLIENT, system_prompt=LAYOUT_DESCRIBER_PROMPT
438
+ )
439
+
440
+
441
+ def build_scene_layout(
442
+ task_desc: str, output_path: str = None, gpt_params: dict = None
443
+ ) -> LayoutInfo:
444
+ layout_relation = LAYOUT_DISASSEMBLER(task_desc, params=gpt_params)
445
+ layout_tree = LAYOUT_GRAPHER(layout_relation, params=gpt_params)
446
+ object_mapping = Scene3DItemEnum.object_mapping(layout_relation)
447
+ obj_prompt = f'{layout_relation["task_desc"]} {object_mapping}'
448
+ objs_desc = LAYOUT_DESCRIBER(obj_prompt, params=gpt_params)
449
+ layout_info = LayoutInfo(
450
+ layout_tree, layout_relation, objs_desc, object_mapping
451
+ )
452
+
453
+ if output_path is not None:
454
+ visualizer = SceneTreeVisualizer(layout_info)
455
+ visualizer.render(save_path=output_path)
456
+ logger.info(f"Scene hierarchy tree saved to {output_path}")
457
+
458
+ return layout_info
459
+
460
+
461
+ def parse_args():
462
+ parser = argparse.ArgumentParser(description="3D Scene Layout Designer")
463
+ parser.add_argument(
464
+ "--task_desc",
465
+ type=str,
466
+ default="Put the apples on the table on the plate",
467
+ help="Natural language description of the robotic task",
468
+ )
469
+ parser.add_argument(
470
+ "--save_root",
471
+ type=str,
472
+ default="outputs/layout_tree",
473
+ help="Path to save the layout output",
474
+ )
475
+ return parser.parse_args()
476
+
477
+
478
+ if __name__ == "__main__":
479
+ from embodied_gen.utils.enum import LayoutInfo
480
+ from embodied_gen.utils.process_media import SceneTreeVisualizer
481
+
482
+ args = parse_args()
483
+ params = {
484
+ "temperature": 1.0,
485
+ "top_p": 0.95,
486
+ "frequency_penalty": 0.3,
487
+ "presence_penalty": 0.5,
488
+ }
489
+ layout_relation = LAYOUT_DISASSEMBLER(args.task_desc, params=params)
490
+ layout_tree = LAYOUT_GRAPHER(layout_relation, params=params)
491
+
492
+ object_mapping = Scene3DItemEnum.object_mapping(layout_relation)
493
+ obj_prompt = f'{layout_relation["task_desc"]} {object_mapping}'
494
+
495
+ objs_desc = LAYOUT_DESCRIBER(obj_prompt, params=params)
496
+
497
+ layout_info = LayoutInfo(layout_tree, layout_relation, objs_desc)
498
+
499
+ visualizer = SceneTreeVisualizer(layout_info)
500
+ os.makedirs(args.save_root, exist_ok=True)
501
+ scene_graph_path = f"{args.save_root}/scene_tree.jpg"
502
+ visualizer.render(save_path=scene_graph_path)
503
+ with open(f"{args.save_root}/layout.json", "w") as f:
504
+ json.dump(layout_info.to_dict(), f, indent=4)
505
+
506
+ print(f"Scene hierarchy tree saved to {scene_graph_path}")
507
+ print(f"Disassembled Layout: {layout_relation}")
508
+ print(f"Layout Graph: {layout_tree}")
509
+ print(f"Layout Descriptions: {objs_desc}")
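A short sketch mirroring the `__main__` block (output paths are placeholders): build the layout for one task description and persist it.

```python
import json

from embodied_gen.models.layout import build_scene_layout

layout_info = build_scene_layout(
    "Put the apples on the table on the plate",
    output_path="outputs/layout_tree/scene_tree.jpg",  # also renders the scene tree
)
with open("outputs/layout_tree/layout.json", "w") as f:
    json.dump(layout_info.to_dict(), f, indent=4)
```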
embodied_gen/scripts/compose_layout.py ADDED
@@ -0,0 +1,73 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+ import json
18
+ import os
19
+ from dataclasses import dataclass
20
+
21
+ import tyro
22
+ from embodied_gen.scripts.simulate_sapien import entrypoint as sim_cli
23
+ from embodied_gen.utils.enum import LayoutInfo
24
+ from embodied_gen.utils.geometry import bfs_placement, compose_mesh_scene
25
+ from embodied_gen.utils.log import logger
26
+
27
+
28
+ @dataclass
29
+ class LayoutPlacementConfig:
30
+ layout_path: str
31
+ output_dir: str | None = None
32
+ seed: int | None = None
33
+ max_attempts: int = 1000
34
+ output_iscene: bool = False
35
+ insert_robot: bool = False
36
+
37
+
38
+ def entrypoint(**kwargs):
39
+ if kwargs is None or len(kwargs) == 0:
40
+ args = tyro.cli(LayoutPlacementConfig)
41
+ else:
42
+ args = LayoutPlacementConfig(**kwargs)
43
+
44
+ output_dir = (
45
+ args.output_dir
46
+ if args.output_dir is not None
47
+ else os.path.dirname(args.layout_path)
48
+ )
49
+ os.makedirs(output_dir, exist_ok=True)
50
+ out_scene_path = f"{output_dir}/Iscene.glb"
51
+ out_layout_path = f"{output_dir}/layout.json"
52
+
53
+ with open(args.layout_path, "r") as f:
54
+ layout_info = LayoutInfo.from_dict(json.load(f))
55
+
56
+ layout_info = bfs_placement(layout_info, seed=args.seed)
57
+ with open(out_layout_path, "w") as f:
58
+ json.dump(layout_info.to_dict(), f, indent=4)
59
+
60
+ if args.output_iscene:
61
+ compose_mesh_scene(layout_info, out_scene_path)
62
+
63
+ sim_cli(
64
+ layout_path=out_layout_path,
65
+ output_dir=output_dir,
66
+ robot_name="franka" if args.insert_robot else None,
67
+ )
68
+
69
+ logger.info(f"Layout placement completed in {output_dir}")
70
+
71
+
72
+ if __name__ == "__main__":
73
+ entrypoint()
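Because `entrypoint` only falls back to `tyro.cli` when called without keyword arguments, the placement step can also be driven from Python; a sketch with placeholder paths:

```python
from embodied_gen.scripts.compose_layout import entrypoint

entrypoint(
    layout_path="outputs/task_0000/layout.json",  # placeholder layout file
    seed=0,
    output_iscene=True,   # additionally export the composed Iscene.glb
    insert_robot=True,    # re-simulate with a franka robot inserted
)
```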
embodied_gen/scripts/gen_layout.py ADDED
@@ -0,0 +1,156 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+ import gc
18
+ import json
19
+ import os
20
+ from dataclasses import dataclass, field
21
+ from shutil import copytree
22
+ from time import time
23
+ from typing import Optional
24
+
25
+ import torch
26
+ import tyro
27
+ from embodied_gen.models.layout import build_scene_layout
28
+ from embodied_gen.scripts.simulate_sapien import entrypoint as sim_cli
29
+ from embodied_gen.scripts.textto3d import text_to_3d
30
+ from embodied_gen.utils.config import GptParamsConfig
31
+ from embodied_gen.utils.enum import LayoutInfo, Scene3DItemEnum
32
+ from embodied_gen.utils.geometry import bfs_placement, compose_mesh_scene
33
+ from embodied_gen.utils.gpt_clients import GPT_CLIENT
34
+ from embodied_gen.utils.log import logger
35
+ from embodied_gen.utils.process_media import (
36
+ load_scene_dict,
37
+ parse_text_prompts,
38
+ )
39
+ from embodied_gen.validators.quality_checkers import SemanticMatcher
40
+
41
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
42
+
43
+
44
+ @dataclass
45
+ class LayoutGenConfig:
46
+ task_descs: list[str]
47
+ output_root: str
48
+ bg_list: str = "outputs/bg_scenes/scene_list.txt"
49
+ n_img_sample: int = 3
50
+ text_guidance_scale: float = 7.0
51
+ img_denoise_step: int = 25
52
+ n_image_retry: int = 4
53
+ n_asset_retry: int = 3
54
+ n_pipe_retry: int = 2
55
+ seed_img: Optional[int] = None
56
+ seed_3d: Optional[int] = None
57
+ seed_layout: Optional[int] = None
58
+ keep_intermediate: bool = False
59
+ output_iscene: bool = False
60
+ insert_robot: bool = False
61
+ gpt_params: GptParamsConfig = field(
62
+ default_factory=lambda: GptParamsConfig(
63
+ temperature=1.0,
64
+ top_p=0.95,
65
+ frequency_penalty=0.3,
66
+ presence_penalty=0.5,
67
+ )
68
+ )
69
+
70
+
71
+ def entrypoint() -> None:
72
+ args = tyro.cli(LayoutGenConfig)
73
+ SCENE_MATCHER = SemanticMatcher(GPT_CLIENT)
74
+ task_descs = parse_text_prompts(args.task_descs)
75
+ scene_dict = load_scene_dict(args.bg_list)
76
+ gpt_params = args.gpt_params.to_dict()
77
+ for idx, task_desc in enumerate(task_descs):
78
+ logger.info(f"Generate Layout and 3D scene for task: {task_desc}")
79
+ output_root = f"{args.output_root}/task_{idx:04d}"
80
+ scene_graph_path = f"{output_root}/scene_tree.jpg"
81
+ start_time = time()
82
+ layout_info: LayoutInfo = build_scene_layout(
83
+ task_desc, scene_graph_path, gpt_params
84
+ )
85
+ prompts_mapping = {v: k for k, v in layout_info.objs_desc.items()}
86
+ prompts = [
87
+ v
88
+ for k, v in layout_info.objs_desc.items()
89
+ if layout_info.objs_mapping[k] != Scene3DItemEnum.BACKGROUND.value
90
+ ]
91
+
92
+ for prompt in prompts:
93
+ node = prompts_mapping[prompt]
94
+ generation_log = text_to_3d(
95
+ prompts=[
96
+ prompt,
97
+ ],
98
+ output_root=output_root,
99
+ asset_names=[
100
+ node,
101
+ ],
102
+ n_img_sample=args.n_img_sample,
103
+ text_guidance_scale=args.text_guidance_scale,
104
+ img_denoise_step=args.img_denoise_step,
105
+ n_image_retry=args.n_image_retry,
106
+ n_asset_retry=args.n_asset_retry,
107
+ n_pipe_retry=args.n_pipe_retry,
108
+ seed_img=args.seed_img,
109
+ seed_3d=args.seed_3d,
110
+ keep_intermediate=args.keep_intermediate,
111
+ )
112
+ layout_info.assets.update(generation_log["assets"])
113
+ layout_info.quality.update(generation_log["quality"])
114
+
115
+ # Background GEN (for efficiency, temp use retrieval instead)
116
+ bg_node = layout_info.relation[Scene3DItemEnum.BACKGROUND.value]
117
+ text = layout_info.objs_desc[bg_node]
118
+ match_key = SCENE_MATCHER.query(text, str(scene_dict))
119
+ match_scene_path = f"{os.path.dirname(args.bg_list)}/{match_key}"
120
+ bg_save_dir = os.path.join(output_root, "background")
121
+ copytree(match_scene_path, bg_save_dir, dirs_exist_ok=True)
122
+ layout_info.assets[bg_node] = bg_save_dir
123
+
124
+ # BFS layout placement.
125
+ layout_info = bfs_placement(
126
+ layout_info,
127
+ limit_reach_range=True if args.insert_robot else False,
128
+ seed=args.seed_layout,
129
+ )
130
+ layout_path = f"{output_root}/layout.json"
131
+ with open(layout_path, "w") as f:
132
+ json.dump(layout_info.to_dict(), f, indent=4)
133
+
134
+ if args.output_iscene:
135
+ compose_mesh_scene(layout_info, f"{output_root}/Iscene.glb")
136
+
137
+ sim_cli(
138
+ layout_path=layout_path,
139
+ output_dir=output_root,
140
+ robot_name="franka" if args.insert_robot else None,
141
+ )
142
+
143
+ torch.cuda.empty_cache()
144
+ gc.collect()
145
+
146
+ elapsed_time = (time() - start_time) / 60
147
+ logger.info(
148
+ f"Layout generation done for {scene_graph_path}, layout result "
149
+ f"in {layout_path}, finished in {elapsed_time:.2f} mins."
150
+ )
151
+
152
+ logger.info(f"All tasks completed in {args.output_root}")
153
+
154
+
155
+ if __name__ == "__main__":
156
+ entrypoint()
embodied_gen/scripts/imageto3d.py CHANGED
@@ -58,7 +58,7 @@ os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"
 os.environ["SPCONV_ALGO"] = "native"
 random.seed(0)
 
-logger.info("Loading Models...")
+logger.info("Loading Image3D Models...")
 DELIGHT = DelightingModel()
 IMAGESR_MODEL = ImageRealESRGAN(outscale=4)
 RBG_REMOVER = RembgRemover()
@@ -107,6 +107,7 @@ def parse_args():
         type=int,
         default=2,
     )
+    parser.add_argument("--disable_decompose_convex", action="store_true")
     args, unknown = parser.parse_known_args()
 
     return args
@@ -151,6 +152,9 @@ def entrypoint(**kwargs):
         seg_image.save(seg_path)
 
         seed = args.seed
+        asset_node = "unknown"
+        if isinstance(args.asset_type, list) and args.asset_type[idx]:
+            asset_node = args.asset_type[idx]
         for try_idx in range(args.n_retry):
             logger.info(
                 f"Try: {try_idx + 1}/{args.n_retry}, Seed: {seed}, Prompt: {seg_path}"
@@ -207,7 +211,9 @@ def entrypoint(**kwargs):
             color_path = os.path.join(output_root, "color.png")
             render_gs_api(aligned_gs_path, color_path)
 
-            geo_flag, geo_result = GEO_CHECKER([color_path])
+            geo_flag, geo_result = GEO_CHECKER(
+                [color_path], text=asset_node
+            )
             logger.warning(
                 f"{GEO_CHECKER.__class__.__name__}: {geo_result} for {seg_path}"
             )
@@ -246,7 +252,11 @@ def entrypoint(**kwargs):
             mesh_glb_path = os.path.join(output_root, f"{filename}.glb")
             mesh.export(mesh_glb_path)
 
-            urdf_convertor = URDFGenerator(GPT_CLIENT, render_view_num=4)
+            urdf_convertor = URDFGenerator(
+                GPT_CLIENT,
+                render_view_num=4,
+                decompose_convex=not args.disable_decompose_convex,
+            )
             asset_attrs = {
                 "version": VERSION,
                 "gs_model": f"{urdf_convertor.output_mesh_dir}/{filename}_gs.ply",
embodied_gen/scripts/parallel_sim.py ADDED
@@ -0,0 +1,148 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+
18
+ from embodied_gen.utils.monkey_patches import monkey_patch_maniskill
19
+
20
+ monkey_patch_maniskill()
21
+ import json
22
+ from collections import defaultdict
23
+ from dataclasses import dataclass
24
+ from typing import Literal
25
+
26
+ import gymnasium as gym
27
+ import numpy as np
28
+ import torch
29
+ import tyro
30
+ from mani_skill.utils.wrappers import RecordEpisode
31
+ from tqdm import tqdm
32
+ import embodied_gen.envs.pick_embodiedgen
33
+ from embodied_gen.utils.enum import LayoutInfo, Scene3DItemEnum
34
+ from embodied_gen.utils.log import logger
35
+ from embodied_gen.utils.simulation import FrankaPandaGrasper
36
+
37
+
38
+ @dataclass
39
+ class ParallelSimConfig:
40
+ """CLI parameters for Parallel Sapien simulation."""
41
+
42
+ # Environment configuration
43
+ layout_file: str
44
+ """Path to the layout JSON file"""
45
+ output_dir: str
46
+ """Directory to save recorded videos"""
47
+ gym_env_name: str = "PickEmbodiedGen-v1"
48
+ """Name of the Gym environment to use"""
49
+ num_envs: int = 4
50
+ """Number of parallel environments"""
51
+ render_mode: Literal["rgb_array", "hybrid"] = "hybrid"
52
+ """Rendering mode: rgb_array or hybrid"""
53
+ enable_shadow: bool = True
54
+ """Whether to enable shadows in rendering"""
55
+ control_mode: str = "pd_joint_pos"
56
+ """Control mode for the agent"""
57
+
58
+ # Recording configuration
59
+ max_steps_per_video: int = 1000
60
+ """Maximum steps to record per video"""
61
+ save_trajectory: bool = False
62
+ """Whether to save trajectory data"""
63
+
64
+ # Simulation parameters
65
+ seed: int = 0
66
+ """Random seed for environment reset"""
67
+ warmup_steps: int = 50
68
+ """Number of warmup steps before action computation"""
69
+ reach_target_only: bool = True
70
+ """Whether to only reach target without full action"""
71
+
72
+
73
+ def entrypoint(**kwargs):
74
+ if kwargs is None or len(kwargs) == 0:
75
+ cfg = tyro.cli(ParallelSimConfig)
76
+ else:
77
+ cfg = ParallelSimConfig(**kwargs)
78
+
79
+ env = gym.make(
80
+ cfg.gym_env_name,
81
+ num_envs=cfg.num_envs,
82
+ render_mode=cfg.render_mode,
83
+ enable_shadow=cfg.enable_shadow,
84
+ layout_file=cfg.layout_file,
85
+ control_mode=cfg.control_mode,
86
+ )
87
+ env = RecordEpisode(
88
+ env,
89
+ cfg.output_dir,
90
+ max_steps_per_video=cfg.max_steps_per_video,
91
+ save_trajectory=cfg.save_trajectory,
92
+ )
93
+ env.reset(seed=cfg.seed)
94
+
95
+ default_action = env.unwrapped.agent.init_qpos[:, :8]
96
+ for _ in tqdm(range(cfg.warmup_steps), desc="SIM Warmup"):
97
+ # action = env.action_space.sample() # Random action
98
+ obs, reward, terminated, truncated, info = env.step(default_action)
99
+
100
+ grasper = FrankaPandaGrasper(
101
+ env.unwrapped.agent,
102
+ env.unwrapped.sim_config.control_freq,
103
+ )
104
+
105
+ layout_data = LayoutInfo.from_dict(json.load(open(cfg.layout_file, "r")))
106
+ actions = defaultdict(list)
107
+ # Plan a grasp reach pose for each manipulated object in each env.
108
+ for env_idx in range(env.num_envs):
109
+ actors = env.unwrapped.env_actors[f"env{env_idx}"]
110
+ for node in layout_data.relation[
111
+ Scene3DItemEnum.MANIPULATED_OBJS.value
112
+ ]:
113
+ action = grasper.compute_grasp_action(
114
+ actor=actors[node]._objs[0],
115
+ reach_target_only=True,
116
+ env_idx=env_idx,
117
+ )
118
+ actions[node].append(action)
119
+
120
+ # Execute the planned actions for each manipulated object in each env.
121
+ for node in actions:
122
+ max_env_steps = 0
123
+ for env_idx in range(env.num_envs):
124
+ if actions[node][env_idx] is None:
125
+ continue
126
+ max_env_steps = max(max_env_steps, len(actions[node][env_idx]))
127
+
128
+ action_tensor = np.ones(
129
+ (max_env_steps, env.num_envs, env.action_space.shape[-1])
130
+ )
131
+ action_tensor *= default_action[None, ...]
132
+ for env_idx in range(env.num_envs):
133
+ action = actions[node][env_idx]
134
+ if action is None:
135
+ continue
136
+ action_tensor[: len(action), env_idx, :] = action
137
+
138
+ for step in tqdm(range(max_env_steps), desc=f"Grasping: {node}"):
139
+ action = torch.Tensor(action_tensor[step]).to(env.unwrapped.device)
140
+ env.unwrapped.agent.set_action(action)
141
+ obs, reward, terminated, truncated, info = env.step(action)
142
+
143
+ env.close()
144
+ logger.info(f"Results saved in {cfg.output_dir}")
145
+
146
+
147
+ if __name__ == "__main__":
148
+ entrypoint()
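entrypoint() above either parses ParallelSimConfig from the command line via tyro or builds it directly from keyword arguments. A minimal programmatic usage sketch, assuming the package is importable and using placeholder paths:

    # Usage sketch only; layout_file and output_dir are placeholder paths.
    from embodied_gen.scripts.parallel_sim import entrypoint

    entrypoint(
        layout_file="outputs/layouts/task_0000/layout.json",  # assumed layout JSON from layout generation
        output_dir="outputs/parallel_sim_videos",             # recorded videos are written here
        num_envs=4,
        render_mode="hybrid",
    )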
embodied_gen/scripts/simulate_sapien.py ADDED
@@ -0,0 +1,195 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+
18
+ import json
19
+ import os
20
+ from collections import defaultdict
21
+ from dataclasses import dataclass, field
22
+ from typing import Literal
23
+
24
+ import imageio
25
+ import numpy as np
26
+ import torch
27
+ import tyro
28
+ from tqdm import tqdm
29
+ from embodied_gen.models.gs_model import GaussianOperator
30
+ from embodied_gen.utils.enum import LayoutInfo, Scene3DItemEnum
31
+ from embodied_gen.utils.geometry import quaternion_multiply
32
+ from embodied_gen.utils.log import logger
33
+ from embodied_gen.utils.process_media import alpha_blend_rgba
34
+ from embodied_gen.utils.simulation import (
35
+ SIM_COORD_ALIGN,
36
+ FrankaPandaGrasper,
37
+ SapienSceneManager,
38
+ load_assets_from_layout_file,
39
+ load_mani_skill_robot,
40
+ render_images,
41
+ )
42
+
43
+
44
+ @dataclass
45
+ class SapienSimConfig:
46
+ # Simulation settings.
47
+ layout_path: str
48
+ output_dir: str
49
+ sim_freq: int = 200
50
+ sim_step: int = 400
51
+ z_offset: float = 0.004
52
+ init_quat: list[float] = field(
53
+ default_factory=lambda: [0.7071, 0, 0, 0.7071]
54
+ ) # xyzw
55
+ device: str = "cuda"
56
+ control_freq: int = 50
57
+ insert_robot: bool = False
58
+ # Camera settings.
59
+ render_interval: int = 10
60
+ num_cameras: int = 3
61
+ camera_radius: float = 0.9
62
+ camera_height: float = 1.1
63
+ image_hw: tuple[int, int] = (512, 512)
64
+ ray_tracing: bool = True
65
+ fovy_deg: float = 75.0
66
+ camera_target_pt: list[float] = field(
67
+ default_factory=lambda: [0.0, 0.0, 0.9]
68
+ )
69
+ render_keys: list[
70
+ Literal[
71
+ "Color", "Foreground", "Segmentation", "Normal", "Mask", "Depth"
72
+ ]
73
+ ] = field(default_factory=lambda: ["Foreground"])
74
+
75
+
76
+ def entrypoint(**kwargs):
77
+ if kwargs is None or len(kwargs) == 0:
78
+ cfg = tyro.cli(SapienSimConfig)
79
+ else:
80
+ cfg = SapienSimConfig(**kwargs)
81
+
82
+ scene_manager = SapienSceneManager(
83
+ cfg.sim_freq, ray_tracing=cfg.ray_tracing
84
+ )
85
+ _ = scene_manager.initialize_circular_cameras(
86
+ num_cameras=cfg.num_cameras,
87
+ radius=cfg.camera_radius,
88
+ height=cfg.camera_height,
89
+ target_pt=cfg.camera_target_pt,
90
+ image_hw=cfg.image_hw,
91
+ fovy_deg=cfg.fovy_deg,
92
+ )
93
+ with open(cfg.layout_path, "r") as f:
94
+ layout_data = json.load(f)
95
+ layout_data: LayoutInfo = LayoutInfo.from_dict(layout_data)
96
+
97
+ actors = load_assets_from_layout_file(
98
+ scene_manager.scene,
99
+ layout_data,
100
+ cfg.z_offset,
101
+ cfg.init_quat,
102
+ )
103
+ agent = load_mani_skill_robot(
104
+ scene_manager.scene, layout_data, cfg.control_freq
105
+ )
106
+
107
+ frames = defaultdict(list)
108
+ image_cnt = 0
109
+ for step in tqdm(range(cfg.sim_step), desc="Simulation"):
110
+ scene_manager.scene.step()
111
+ agent.reset(agent.init_qpos)
112
+ if step % cfg.render_interval != 0:
113
+ continue
114
+ scene_manager.scene.update_render()
115
+ image_cnt += 1
116
+ for camera in scene_manager.cameras:
117
+ camera.take_picture()
118
+ images = render_images(camera, cfg.render_keys)
119
+ frames[camera.name].append(images)
120
+
121
+ actions = dict()
122
+ if cfg.insert_robot:
123
+ grasper = FrankaPandaGrasper(
124
+ agent,
125
+ cfg.control_freq,
126
+ )
127
+ for node in layout_data.relation[
128
+ Scene3DItemEnum.MANIPULATED_OBJS.value
129
+ ]:
130
+ actions[node] = grasper.compute_grasp_action(
131
+ actor=actors[node], reach_target_only=True
132
+ )
133
+
134
+ if "Foreground" not in cfg.render_keys:
135
+ return
136
+
137
+ bg_node = layout_data.relation[Scene3DItemEnum.BACKGROUND.value]
138
+ gs_path = f"{layout_data.assets[bg_node]}/gs_model.ply"
139
+ gs_model: GaussianOperator = GaussianOperator.load_from_ply(gs_path)
140
+ x, y, z, qx, qy, qz, qw = layout_data.position[bg_node]
141
+ qx, qy, qz, qw = quaternion_multiply([qx, qy, qz, qw], cfg.init_quat)
142
+ init_pose = torch.tensor([x, y, z, qx, qy, qz, qw])
143
+ gs_model = gs_model.get_gaussians(instance_pose=init_pose)
144
+
145
+ bg_images = dict()
146
+ for camera in scene_manager.cameras:
147
+ Ks = camera.get_intrinsic_matrix()
148
+ c2w = camera.get_model_matrix()
149
+ c2w = c2w @ SIM_COORD_ALIGN
150
+ result = gs_model.render(
151
+ torch.tensor(c2w, dtype=torch.float32).to(cfg.device),
152
+ torch.tensor(Ks, dtype=torch.float32).to(cfg.device),
153
+ image_width=cfg.image_hw[1],
154
+ image_height=cfg.image_hw[0],
155
+ )
156
+ bg_images[camera.name] = result.rgb[..., ::-1]
157
+
158
+ video_frames = []
159
+ for camera in scene_manager.cameras:
160
+ # Scene rendering
161
+ for step in range(image_cnt):
162
+ rgba = alpha_blend_rgba(
163
+ frames[camera.name][step]["Foreground"],
164
+ bg_images[camera.name],
165
+ )
166
+ video_frames.append(np.array(rgba))
167
+
168
+ # Grasp rendering
169
+ for node in actions:
170
+ if actions[node] is None:
171
+ continue
172
+ for action in tqdm(actions[node]):
173
+ grasp_frames = scene_manager.step_action(
174
+ agent,
175
+ torch.Tensor(action[None, ...]),
176
+ scene_manager.cameras,
177
+ cfg.render_keys,
178
+ sim_steps_per_control=cfg.sim_freq // cfg.control_freq,
179
+ )
180
+ rgba = alpha_blend_rgba(
181
+ grasp_frames[camera.name][0]["Foreground"],
182
+ bg_images[camera.name],
183
+ )
184
+ video_frames.append(np.array(rgba))
185
+
186
+ agent.reset(agent.init_qpos)
187
+
188
+ os.makedirs(cfg.output_dir, exist_ok=True)
189
+ video_path = f"{cfg.output_dir}/Iscene.mp4"
190
+ imageio.mimsave(video_path, video_frames, fps=30)
191
+ logger.info(f"Interative 3D Scene Visualization saved in {video_path}")
192
+
193
+
194
+ if __name__ == "__main__":
195
+ entrypoint()
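As with parallel_sim.py, entrypoint() accepts either CLI arguments (via tyro) or keyword arguments mapped onto SapienSimConfig. A minimal usage sketch with placeholder paths:

    # Usage sketch only; layout_path and output_dir are placeholder paths.
    from embodied_gen.scripts.simulate_sapien import entrypoint

    entrypoint(
        layout_path="outputs/layouts/task_0000/layout.json",  # assumed layout JSON
        output_dir="outputs/sapien_sim",                      # Iscene.mp4 is written here
        num_cameras=3,
        insert_robot=False,
    )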
embodied_gen/scripts/textto3d.py CHANGED
@@ -42,7 +42,7 @@ from embodied_gen.validators.quality_checkers import (
42
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
43
  random.seed(0)
44
 
45
- logger.info("Loading Models...")
46
  SEMANTIC_CHECKER = SemanticConsistChecker(GPT_CLIENT)
47
  SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
48
  TXTGEN_CHECKER = TextGenAlignChecker(GPT_CLIENT)
@@ -170,6 +170,7 @@ def text_to_3d(**kwargs) -> dict:
170
  seed=random.randint(0, 100000) if seed_3d is None else seed_3d,
171
  n_retry=args.n_asset_retry,
172
  keep_intermediate=args.keep_intermediate,
 
173
  )
174
  mesh_path = f"{node_save_dir}/result/mesh/{save_node}.obj"
175
  image_path = render_asset3d(
@@ -270,6 +271,7 @@ def parse_args():
270
  help="Random seed for 3D generation",
271
  )
272
  parser.add_argument("--keep_intermediate", action="store_true")
 
273
 
274
  args, unknown = parser.parse_known_args()
275
 
 
42
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
43
  random.seed(0)
44
 
45
+ logger.info("Loading TEXT2IMG_MODEL...")
46
  SEMANTIC_CHECKER = SemanticConsistChecker(GPT_CLIENT)
47
  SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
48
  TXTGEN_CHECKER = TextGenAlignChecker(GPT_CLIENT)
 
170
  seed=random.randint(0, 100000) if seed_3d is None else seed_3d,
171
  n_retry=args.n_asset_retry,
172
  keep_intermediate=args.keep_intermediate,
173
+ disable_decompose_convex=args.disable_decompose_convex,
174
  )
175
  mesh_path = f"{node_save_dir}/result/mesh/{save_node}.obj"
176
  image_path = render_asset3d(
 
271
  help="Random seed for 3D generation",
272
  )
273
  parser.add_argument("--keep_intermediate", action="store_true")
274
+ parser.add_argument("--disable_decompose_convex", action="store_true")
275
 
276
  args, unknown = parser.parse_known_args()
277
 
embodied_gen/scripts/textto3d.sh CHANGED
@@ -81,6 +81,7 @@ done
81
 
82
 
83
  # Step 1: Text-to-Image
 
84
  eval python3 embodied_gen/scripts/text2image.py \
85
  --prompts ${prompt_args} \
86
  --output_root "${output_root}/images" \
 
81
 
82
 
83
  # Step 1: Text-to-Image
84
+ echo ${prompt_args}
85
  eval python3 embodied_gen/scripts/text2image.py \
86
  --prompts ${prompt_args} \
87
  --output_root "${output_root}/images" \
embodied_gen/trainer/gsplat_trainer.py CHANGED
@@ -617,7 +617,7 @@ class Runner:
617
  for rgb, depth in images_cache:
618
  depth_normalized = torch.clip(
619
  (depth - depth_global_min)
620
- / (depth_global_max - depth_global_min),
621
  0,
622
  1,
623
  )
 
617
  for rgb, depth in images_cache:
618
  depth_normalized = torch.clip(
619
  (depth - depth_global_min)
620
+ / (depth_global_max - depth_global_min + 1e-8),
621
  0,
622
  1,
623
  )
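The added 1e-8 term guards against a zero denominator when a cached depth map is constant (depth_global_max == depth_global_min). A small self-contained illustration of the guarded normalization:

    # Illustration only: with a constant depth map the unguarded division would produce NaNs.
    import torch

    depth = torch.full((4, 4), 2.0)
    d_min, d_max = depth.min(), depth.max()          # identical values here
    normalized = torch.clip((depth - d_min) / (d_max - d_min + 1e-8), 0, 1)
    assert torch.isfinite(normalized).all()          # all zeros, no NaN/inf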
embodied_gen/trainer/pono2mesh_trainer.py CHANGED
@@ -30,7 +30,7 @@ from kornia.morphology import dilation
30
  from PIL import Image
31
  from embodied_gen.models.sr_model import ImageRealESRGAN
32
  from embodied_gen.utils.config import Pano2MeshSRConfig
33
- from embodied_gen.utils.gaussian import compute_pinhole_intrinsics
34
  from embodied_gen.utils.log import logger
35
  from thirdparty.pano2room.modules.geo_predictors import PanoJointPredictor
36
  from thirdparty.pano2room.modules.geo_predictors.PanoFusionDistancePredictor import (
 
30
  from PIL import Image
31
  from embodied_gen.models.sr_model import ImageRealESRGAN
32
  from embodied_gen.utils.config import Pano2MeshSRConfig
33
+ from embodied_gen.utils.geometry import compute_pinhole_intrinsics
34
  from embodied_gen.utils.log import logger
35
  from thirdparty.pano2room.modules.geo_predictors import PanoJointPredictor
36
  from thirdparty.pano2room.modules.geo_predictors.PanoFusionDistancePredictor import (
embodied_gen/utils/config.py CHANGED
@@ -17,15 +17,27 @@
17
  from dataclasses import dataclass, field
18
  from typing import List, Optional, Union
19
 
 
20
  from gsplat.strategy import DefaultStrategy, MCMCStrategy
21
  from typing_extensions import Literal, assert_never
22
 
23
  __all__ = [
 
24
  "Pano2MeshSRConfig",
25
  "GsplatTrainConfig",
26
  ]
27
 
28

29
  @dataclass
30
  class Pano2MeshSRConfig:
31
  mesh_file: str = "mesh_model.ply"
 
17
  from dataclasses import dataclass, field
18
  from typing import List, Optional, Union
19
 
20
+ from dataclasses_json import DataClassJsonMixin
21
  from gsplat.strategy import DefaultStrategy, MCMCStrategy
22
  from typing_extensions import Literal, assert_never
23
 
24
  __all__ = [
25
+ "GptParamsConfig",
26
  "Pano2MeshSRConfig",
27
  "GsplatTrainConfig",
28
  ]
29
 
30
 
31
+ @dataclass
32
+ class GptParamsConfig(DataClassJsonMixin):
33
+ temperature: float = 0.1
34
+ top_p: float = 0.1
35
+ frequency_penalty: float = 0.0
36
+ presence_penalty: float = 0.0
37
+ stop: int | None = None
38
+ max_tokens: int = 500
39
+
40
+
41
  @dataclass
42
  class Pano2MeshSRConfig:
43
  mesh_file: str = "mesh_model.ply"
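GptParamsConfig mixes in DataClassJsonMixin, so GPT sampling parameters can be round-tripped through plain dicts/JSON when they are logged or passed to the client. A minimal sketch of that round trip:

    # Sketch of the JSON round-trip provided by dataclasses_json.
    from embodied_gen.utils.config import GptParamsConfig

    params = GptParamsConfig(temperature=0.2, max_tokens=256)
    payload = params.to_dict()                     # {"temperature": 0.2, "top_p": 0.1, ...}
    restored = GptParamsConfig.from_dict(payload)  # back to a dataclass instance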
embodied_gen/utils/enum.py CHANGED
@@ -102,6 +102,7 @@ class LayoutInfo(DataClassJsonMixin):
102
  tree: dict[str, list]
103
  relation: dict[str, str | list[str]]
104
  objs_desc: dict[str, str] = field(default_factory=dict)
 
105
  assets: dict[str, str] = field(default_factory=dict)
106
  quality: dict[str, str] = field(default_factory=dict)
107
  position: dict[str, list[float]] = field(default_factory=dict)
 
102
  tree: dict[str, list]
103
  relation: dict[str, str | list[str]]
104
  objs_desc: dict[str, str] = field(default_factory=dict)
105
+ objs_mapping: dict[str, str] = field(default_factory=dict)
106
  assets: dict[str, str] = field(default_factory=dict)
107
  quality: dict[str, str] = field(default_factory=dict)
108
  position: dict[str, list[float]] = field(default_factory=dict)
embodied_gen/utils/gaussian.py CHANGED
@@ -35,7 +35,6 @@ __all__ = [
35
  "set_random_seed",
36
  "export_splats",
37
  "create_splats_with_optimizers",
38
- "compute_pinhole_intrinsics",
39
  "resize_pinhole_intrinsics",
40
  "restore_scene_scale_and_position",
41
  ]
@@ -265,12 +264,12 @@ def create_splats_with_optimizers(
265
  return splats, optimizers
266
 
267
 
268
- def compute_pinhole_intrinsics(
269
- image_w: int, image_h: int, fov_deg: float
270
  ) -> np.ndarray:
271
- fov_rad = np.deg2rad(fov_deg)
272
- fx = image_w / (2 * np.tan(fov_rad / 2))
273
- fy = fx # assuming square pixels
274
  cx = image_w / 2
275
  cy = image_h / 2
276
  K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
 
35
  "set_random_seed",
36
  "export_splats",
37
  "create_splats_with_optimizers",
 
38
  "resize_pinhole_intrinsics",
39
  "restore_scene_scale_and_position",
40
  ]
 
264
  return splats, optimizers
265
 
266
 
267
+ def compute_intrinsics_from_fovy(
268
+ image_w: int, image_h: int, fovy_deg: float
269
  ) -> np.ndarray:
270
+ fovy_rad = np.deg2rad(fovy_deg)
271
+ fy = image_h / (2 * np.tan(fovy_rad / 2))
272
+ fx = fy * (image_w / image_h)
273
  cx = image_w / 2
274
  cy = image_h / 2
275
  K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
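The renamed helper now derives the focal length from the vertical field of view and the image height (fy), then scales fx by the aspect ratio, matching cameras that are specified by fovy such as the ones created in simulate_sapien.py. A worked example with assumed values:

    # Worked example (assumed values): 512 x 512 image, fovy = 75 degrees.
    from embodied_gen.utils.gaussian import compute_intrinsics_from_fovy

    K = compute_intrinsics_from_fovy(image_w=512, image_h=512, fovy_deg=75.0)
    # fy = 512 / (2 * tan(37.5 deg)) ~= 333.7; fx = fy for a square image; cx = cy = 256.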
embodied_gen/utils/geometry.py ADDED
@@ -0,0 +1,458 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+ import os
18
+ import random
19
+ from collections import defaultdict, deque
20
+ from functools import wraps
21
+ from typing import Literal
22
+
23
+ import numpy as np
24
+ import torch
25
+ import trimesh
26
+ from matplotlib.path import Path
27
+ from pyquaternion import Quaternion
28
+ from scipy.spatial import ConvexHull
29
+ from scipy.spatial.transform import Rotation as R
30
+ from shapely.geometry import Polygon
31
+ from embodied_gen.utils.enum import LayoutInfo, Scene3DItemEnum
32
+ from embodied_gen.utils.log import logger
33
+
34
+ __all__ = [
35
+ "bfs_placement",
36
+ "with_seed",
37
+ "matrix_to_pose",
38
+ "pose_to_matrix",
39
+ "quaternion_multiply",
40
+ "check_reachable",
42
+ "compose_mesh_scene",
43
+ "compute_pinhole_intrinsics",
44
+ ]
45
+
46
+
47
+ def matrix_to_pose(matrix: np.ndarray) -> list[float]:
48
+ """Convert a 4x4 transformation matrix to a pose (x, y, z, qx, qy, qz, qw).
49
+
50
+ Args:
51
+ matrix (np.ndarray): 4x4 transformation matrix.
52
+
53
+ Returns:
54
+ List[float]: Pose as [x, y, z, qx, qy, qz, qw].
55
+ """
56
+ x, y, z = matrix[:3, 3]
57
+ rot_mat = matrix[:3, :3]
58
+ quat = R.from_matrix(rot_mat).as_quat()
59
+ qx, qy, qz, qw = quat
60
+
61
+ return [x, y, z, qx, qy, qz, qw]
62
+
63
+
64
+ def pose_to_matrix(pose: list[float]) -> np.ndarray:
65
+ """Convert pose (x, y, z, qx, qy, qz, qw) to a 4x4 transformation matrix.
66
+
67
+ Args:
68
+ pose (list[float]): Pose as [x, y, z, qx, qy, qz, qw].
69
+
70
+ Returns:
71
+ matrix (np.ndarray): 4x4 transformation matrix.
72
+ """
73
+ x, y, z, qx, qy, qz, qw = pose
74
+ r = R.from_quat([qx, qy, qz, qw])
75
+ matrix = np.eye(4)
76
+ matrix[:3, :3] = r.as_matrix()
77
+ matrix[:3, 3] = [x, y, z]
78
+
79
+ return matrix
80
+
81
+
82
+ def compute_xy_bbox(
83
+ vertices: np.ndarray, col_x: int = 0, col_y: int = 2
84
+ ) -> list[float]:
85
+ x_vals = vertices[:, col_x]
86
+ y_vals = vertices[:, col_y]
87
+ return x_vals.min(), x_vals.max(), y_vals.min(), y_vals.max()
88
+
89
+
90
+ def has_iou_conflict(
91
+ new_box: list[float],
92
+ placed_boxes: list[list[float]],
93
+ iou_threshold: float = 0.0,
94
+ ) -> bool:
95
+ new_min_x, new_max_x, new_min_y, new_max_y = new_box
96
+ for min_x, max_x, min_y, max_y in placed_boxes:
97
+ ix1 = max(new_min_x, min_x)
98
+ iy1 = max(new_min_y, min_y)
99
+ ix2 = min(new_max_x, max_x)
100
+ iy2 = min(new_max_y, max_y)
101
+ inter_area = max(0, ix2 - ix1) * max(0, iy2 - iy1)
102
+ if inter_area > iou_threshold:
103
+ return True
104
+ return False
105
+
106
+
107
+ def with_seed(seed_attr_name: str = "seed"):
108
+ """A parameterized decorator that temporarily sets the random seed."""
109
+
110
+ def decorator(func):
111
+ @wraps(func)
112
+ def wrapper(*args, **kwargs):
113
+ seed = kwargs.get(seed_attr_name, None)
114
+ if seed is not None:
115
+ py_state = random.getstate()
116
+ np_state = np.random.get_state()
117
+ torch_state = torch.get_rng_state()
118
+
119
+ random.seed(seed)
120
+ np.random.seed(seed)
121
+ torch.manual_seed(seed)
122
+ try:
123
+ result = func(*args, **kwargs)
124
+ finally:
125
+ random.setstate(py_state)
126
+ np.random.set_state(np_state)
127
+ torch.set_rng_state(torch_state)
128
+ return result
129
+ else:
130
+ return func(*args, **kwargs)
131
+
132
+ return wrapper
133
+
134
+ return decorator
135
+
136
+
137
+ def compute_convex_hull_path(
138
+ vertices: np.ndarray,
139
+ z_threshold: float = 0.05,
140
+ interp_per_edge: int = 3,
141
+ margin: float = -0.02,
142
+ ) -> Path:
143
+ top_vertices = vertices[
144
+ vertices[:, 1] > vertices[:, 1].max() - z_threshold
145
+ ]
146
+ top_xy = top_vertices[:, [0, 2]]
147
+
148
+ if len(top_xy) < 3:
149
+ raise ValueError("Not enough points to form a convex hull")
150
+
151
+ hull = ConvexHull(top_xy)
152
+ hull_points = top_xy[hull.vertices]
153
+
154
+ polygon = Polygon(hull_points)
155
+ polygon = polygon.buffer(margin)
156
+ hull_points = np.array(polygon.exterior.coords)
157
+
158
+ dense_points = []
159
+ for i in range(len(hull_points)):
160
+ p1 = hull_points[i]
161
+ p2 = hull_points[(i + 1) % len(hull_points)]
162
+ for t in np.linspace(0, 1, interp_per_edge, endpoint=False):
163
+ pt = (1 - t) * p1 + t * p2
164
+ dense_points.append(pt)
165
+
166
+ return Path(np.array(dense_points), closed=True)
167
+
168
+
169
+ def find_parent_node(node: str, tree: dict) -> str | None:
170
+ for parent, children in tree.items():
171
+ if any(child[0] == node for child in children):
172
+ return parent
173
+ return None
174
+
175
+
176
+ def all_corners_inside(hull: Path, box: list, threshold: int = 3) -> bool:
177
+ x1, x2, y1, y2 = box
178
+ corners = [[x1, y1], [x2, y1], [x1, y2], [x2, y2]]
179
+
180
+ num_inside = sum(hull.contains_point(c) for c in corners)
181
+ return num_inside >= threshold
182
+
183
+
184
+ def compute_axis_rotation_quat(
185
+ axis: Literal["x", "y", "z"], angle_rad: float
186
+ ) -> list[float]:
187
+ if axis.lower() == 'x':
188
+ q = Quaternion(axis=[1, 0, 0], angle=angle_rad)
189
+ elif axis.lower() == 'y':
190
+ q = Quaternion(axis=[0, 1, 0], angle=angle_rad)
191
+ elif axis.lower() == 'z':
192
+ q = Quaternion(axis=[0, 0, 1], angle=angle_rad)
193
+ else:
194
+ raise ValueError(f"Unsupported axis '{axis}', must be one of x, y, z")
195
+
196
+ return [q.x, q.y, q.z, q.w]
197
+
198
+
199
+ def quaternion_multiply(
200
+ init_quat: list[float], rotate_quat: list[float]
201
+ ) -> list[float]:
202
+ qx, qy, qz, qw = init_quat
203
+ q1 = Quaternion(w=qw, x=qx, y=qy, z=qz)
204
+ qx, qy, qz, qw = rotate_quat
205
+ q2 = Quaternion(w=qw, x=qx, y=qy, z=qz)
206
+ quat = q2 * q1
207
+
208
+ return [quat.x, quat.y, quat.z, quat.w]
209
+
210
+
211
+ def check_reachable(
212
+ base_xyz: np.ndarray,
213
+ reach_xyz: np.ndarray,
214
+ min_reach: float = 0.25,
215
+ max_reach: float = 0.85,
216
+ ) -> bool:
217
+ """Check if the target point is within the reachable range."""
218
+ distance = np.linalg.norm(reach_xyz - base_xyz)
219
+
220
+ return min_reach < distance < max_reach
221
+
222
+
223
+ @with_seed("seed")
224
+ def bfs_placement(
225
+ layout_info: LayoutInfo,
226
+ floor_margin: float = 0,
227
+ beside_margin: float = 0.1,
228
+ max_attempts: int = 3000,
229
+ rotate_objs: bool = True,
230
+ rotate_bg: bool = True,
231
+ limit_reach_range: bool = True,
232
+ robot_dim: float = 0.12,
233
+ seed: int = None,
234
+ ) -> LayoutInfo:
235
+ object_mapping = layout_info.objs_mapping
236
+ position = {} # node: [x, y, z, qx, qy, qz, qw]
237
+ parent_bbox_xy = {}
238
+ placed_boxes_map = defaultdict(list)
239
+ mesh_info = defaultdict(dict)
240
+ robot_node = layout_info.relation[Scene3DItemEnum.ROBOT.value]
241
+ for node in object_mapping:
242
+ if object_mapping[node] == Scene3DItemEnum.BACKGROUND.value:
243
+ bg_quat = (
244
+ compute_axis_rotation_quat(
245
+ axis="y",
246
+ angle_rad=np.random.uniform(0, 2 * np.pi),
247
+ )
248
+ if rotate_bg
249
+ else [0, 0, 0, 1]
250
+ )
251
+ bg_quat = [round(q, 4) for q in bg_quat]
252
+ continue
253
+
254
+ mesh_path = (
255
+ f"{layout_info.assets[node]}/mesh/{node.replace(' ', '_')}.obj"
256
+ )
257
+ mesh_info[node]["path"] = mesh_path
258
+ mesh = trimesh.load(mesh_path)
259
+ vertices = mesh.vertices
260
+ z1 = np.percentile(vertices[:, 1], 1)
261
+ z2 = np.percentile(vertices[:, 1], 99)
262
+
263
+ if object_mapping[node] == Scene3DItemEnum.CONTEXT.value:
264
+ object_quat = [0, 0, 0, 1]
265
+ mesh_info[node]["surface"] = compute_convex_hull_path(vertices)
266
+ # Place the robot at the edge of the CONTEXT surface.
267
+ x, y = random.choice(mesh_info[node]["surface"].vertices)
268
+ theta = np.arctan2(y, x)
269
+ quat_initial = Quaternion(axis=[0, 0, 1], angle=theta)
270
+ quat_extra = Quaternion(axis=[0, 0, 1], angle=np.pi)
271
+ quat = quat_extra * quat_initial
272
+ _pose = [x, y, z2 - z1, quat.x, quat.y, quat.z, quat.w]
273
+ position[robot_node] = [round(v, 4) for v in _pose]
274
+ node_box = [
275
+ x - robot_dim / 2,
276
+ x + robot_dim / 2,
277
+ y - robot_dim / 2,
278
+ y + robot_dim / 2,
279
+ ]
280
+ placed_boxes_map[node].append(node_box)
281
+ elif rotate_objs:
282
+ # For manipulated and distractor objects, apply random rotation
283
+ angle_rad = np.random.uniform(0, 2 * np.pi)
284
+ object_quat = compute_axis_rotation_quat(
285
+ axis="y", angle_rad=angle_rad
286
+ )
287
+ object_quat_scipy = np.roll(object_quat, 1) # [w, x, y, z]
288
+ rotation = R.from_quat(object_quat_scipy).as_matrix()
289
+ vertices = np.dot(mesh.vertices, rotation.T)
290
+ z1 = np.percentile(vertices[:, 1], 1)
291
+ z2 = np.percentile(vertices[:, 1], 99)
292
+
293
+ x1, x2, y1, y2 = compute_xy_bbox(vertices)
294
+ mesh_info[node]["pose"] = [x1, x2, y1, y2, z1, z2, *object_quat]
295
+ mesh_info[node]["area"] = max(1e-5, (x2 - x1) * (y2 - y1))
296
+
297
+ root = list(layout_info.tree.keys())[0]
298
+ queue = deque([((root, None), layout_info.tree.get(root, []))])
299
+ while queue:
300
+ (node, relation), children = queue.popleft()
301
+ if node not in object_mapping:
302
+ continue
303
+
304
+ if object_mapping[node] == Scene3DItemEnum.BACKGROUND.value:
305
+ position[node] = [0, 0, floor_margin, *bg_quat]
306
+ else:
307
+ x1, x2, y1, y2, z1, z2, qx, qy, qz, qw = mesh_info[node]["pose"]
308
+ if object_mapping[node] == Scene3DItemEnum.CONTEXT.value:
309
+ position[node] = [0, 0, -round(z1, 4), qx, qy, qz, qw]
310
+ parent_bbox_xy[node] = [x1, x2, y1, y2, z1, z2]
311
+ elif object_mapping[node] in [
312
+ Scene3DItemEnum.MANIPULATED_OBJS.value,
313
+ Scene3DItemEnum.DISTRACTOR_OBJS.value,
314
+ ]:
315
+ parent_node = find_parent_node(node, layout_info.tree)
316
+ parent_pos = position[parent_node]
317
+ (
318
+ p_x1,
319
+ p_x2,
320
+ p_y1,
321
+ p_y2,
322
+ p_z1,
323
+ p_z2,
324
+ ) = parent_bbox_xy[parent_node]
325
+
326
+ obj_dx = x2 - x1
327
+ obj_dy = y2 - y1
328
+ hull_path = mesh_info[parent_node].get("surface")
329
+ for _ in range(max_attempts):
330
+ node_x1 = random.uniform(p_x1, p_x2 - obj_dx)
331
+ node_y1 = random.uniform(p_y1, p_y2 - obj_dy)
332
+ node_box = [
333
+ node_x1,
334
+ node_x1 + obj_dx,
335
+ node_y1,
336
+ node_y1 + obj_dy,
337
+ ]
338
+ if hull_path and not all_corners_inside(
339
+ hull_path, node_box
340
+ ):
341
+ continue
342
+ # Make sure the manipulated object is reachable by robot.
343
+ if (
344
+ limit_reach_range
345
+ and object_mapping[node]
346
+ == Scene3DItemEnum.MANIPULATED_OBJS.value
347
+ ):
348
+ cx = parent_pos[0] + node_box[0] + obj_dx / 2
349
+ cy = parent_pos[1] + node_box[2] + obj_dy / 2
350
+ cz = parent_pos[2] + p_z2 - z1
351
+ robot_pose = position[robot_node][:3]
352
+ if not check_reachable(
353
+ base_xyz=np.array(robot_pose),
354
+ reach_xyz=np.array([cx, cy, cz]),
355
+ ):
356
+ continue
357
+
358
+ if not has_iou_conflict(
359
+ node_box, placed_boxes_map[parent_node]
360
+ ):
361
+ z_offset = 0
362
+ break
363
+ else:
364
+ logger.warning(
365
+ f"Cannot place {node} on {parent_node} without overlap"
366
+ f" after {max_attempts} attempts, place beside {parent_node}."
367
+ )
368
+ for _ in range(max_attempts):
369
+ node_x1 = random.choice(
370
+ [
371
+ random.uniform(
372
+ p_x1 - obj_dx - beside_margin,
373
+ p_x1 - obj_dx,
374
+ ),
375
+ random.uniform(p_x2, p_x2 + beside_margin),
376
+ ]
377
+ )
378
+ node_y1 = random.choice(
379
+ [
380
+ random.uniform(
381
+ p_y1 - obj_dy - beside_margin,
382
+ p_y1 - obj_dy,
383
+ ),
384
+ random.uniform(p_y2, p_y2 + beside_margin),
385
+ ]
386
+ )
387
+ node_box = [
388
+ node_x1,
389
+ node_x1 + obj_dx,
390
+ node_y1,
391
+ node_y1 + obj_dy,
392
+ ]
393
+ z_offset = -(parent_pos[2] + p_z2)
394
+ if not has_iou_conflict(
395
+ node_box, placed_boxes_map[parent_node]
396
+ ):
397
+ break
398
+
399
+ placed_boxes_map[parent_node].append(node_box)
400
+
401
+ abs_cx = parent_pos[0] + node_box[0] + obj_dx / 2
402
+ abs_cy = parent_pos[1] + node_box[2] + obj_dy / 2
403
+ abs_cz = parent_pos[2] + p_z2 - z1 + z_offset
404
+ position[node] = [
405
+ round(v, 4)
406
+ for v in [abs_cx, abs_cy, abs_cz, qx, qy, qz, qw]
407
+ ]
408
+ parent_bbox_xy[node] = [x1, x2, y1, y2, z1, z2]
409
+
410
+ sorted_children = sorted(
411
+ children, key=lambda x: -mesh_info[x[0]].get("area", 0)
412
+ )
413
+ for child, rel in sorted_children:
414
+ queue.append(((child, rel), layout_info.tree.get(child, [])))
415
+
416
+ layout_info.position = position
417
+
418
+ return layout_info
419
+
420
+
421
+ def compose_mesh_scene(
422
+ layout_info: LayoutInfo, out_scene_path: str, with_bg: bool = False
423
+ ) -> None:
424
+ object_mapping = Scene3DItemEnum.object_mapping(layout_info.relation)
425
+ scene = trimesh.Scene()
426
+ for node in layout_info.assets:
427
+ if object_mapping[node] == Scene3DItemEnum.BACKGROUND.value:
428
+ mesh_path = f"{layout_info.assets[node]}/mesh_model.ply"
429
+ if not with_bg:
430
+ continue
431
+ else:
432
+ mesh_path = (
433
+ f"{layout_info.assets[node]}/mesh/{node.replace(' ', '_')}.obj"
434
+ )
435
+
436
+ mesh = trimesh.load(mesh_path)
437
+ offset = np.array(layout_info.position[node])[[0, 2, 1]]
438
+ mesh.vertices += offset
439
+ scene.add_geometry(mesh, node_name=node)
440
+
441
+ os.makedirs(os.path.dirname(out_scene_path), exist_ok=True)
442
+ scene.export(out_scene_path)
443
+ logger.info(f"Composed interactive 3D layout saved in {out_scene_path}")
444
+
445
+ return
446
+
447
+
448
+ def compute_pinhole_intrinsics(
449
+ image_w: int, image_h: int, fov_deg: float
450
+ ) -> np.ndarray:
451
+ fov_rad = np.deg2rad(fov_deg)
452
+ fx = image_w / (2 * np.tan(fov_rad / 2))
453
+ fy = fx # assuming square pixels
454
+ cx = image_w / 2
455
+ cy = image_h / 2
456
+ K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
457
+
458
+ return K
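A quick round-trip sketch for the pose helpers defined above (values are arbitrary; quaternions are in x, y, z, w order throughout this module):

    # Round-trip sketch for the new pose utilities.
    from embodied_gen.utils.geometry import matrix_to_pose, pose_to_matrix, quaternion_multiply

    pose = [0.1, 0.2, 0.3, 0.0, 0.0, 0.7071, 0.7071]   # x, y, z, qx, qy, qz, qw
    T = pose_to_matrix(pose)                            # 4x4 homogeneous transform
    recovered = matrix_to_pose(T)                       # ~= pose, up to quaternion sign and rounding
    same = quaternion_multiply(pose[3:], [0, 0, 0, 1])  # multiplying by identity leaves the rotation unchanged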
embodied_gen/utils/monkey_patches.py CHANGED
@@ -18,6 +18,7 @@ import os
18
  import sys
19
  import zipfile
20
 
 
21
  import torch
22
  from huggingface_hub import hf_hub_download
23
  from omegaconf import OmegaConf
@@ -150,3 +151,68 @@ def monkey_patch_pano2room():
150
  self.inpaint_pipe = pipe
151
 
152
  SDFTInpainter.__init__ = patched_sd_inpaint_init

18
  import sys
19
  import zipfile
20
 
21
+ import numpy as np
22
  import torch
23
  from huggingface_hub import hf_hub_download
24
  from omegaconf import OmegaConf
 
151
  self.inpaint_pipe = pipe
152
 
153
  SDFTInpainter.__init__ = patched_sd_inpaint_init
154
+
155
+
156
+ def monkey_patch_maniskill():
157
+ from mani_skill.envs.scene import ManiSkillScene
158
+
159
+ def get_sensor_images(
160
+ self, obs: dict[str, any]
161
+ ) -> dict[str, dict[str, torch.Tensor]]:
162
+ sensor_data = dict()
163
+ for name, sensor in self.sensors.items():
164
+ sensor_data[name] = sensor.get_images(obs[name])
165
+ return sensor_data
166
+
167
+ def get_human_render_camera_images(
168
+ self, camera_name: str = None, return_alpha: bool = False
169
+ ) -> dict[str, torch.Tensor]:
170
+ def get_rgba_tensor(camera, return_alpha):
171
+ color = camera.get_obs(
172
+ rgb=True, depth=False, segmentation=False, position=False
173
+ )["rgb"]
174
+ if return_alpha:
175
+ seg_labels = camera.get_obs(
176
+ rgb=False, depth=False, segmentation=True, position=False
177
+ )["segmentation"]
178
+ masks = np.where((seg_labels.cpu() > 0), 255, 0).astype(
179
+ np.uint8
180
+ )
181
+ masks = torch.tensor(masks).to(color.device)
182
+ color = torch.concat([color, masks], dim=-1)
183
+
184
+ return color
185
+
186
+ image_data = dict()
187
+ if self.gpu_sim_enabled:
188
+ if self.parallel_in_single_scene:
189
+ for name, camera in self.human_render_cameras.items():
190
+ camera.camera._render_cameras[0].take_picture()
191
+ rgba = get_rgba_tensor(camera, return_alpha)
192
+ image_data[name] = rgba
193
+ else:
194
+ for name, camera in self.human_render_cameras.items():
195
+ if camera_name is not None and name != camera_name:
196
+ continue
197
+ assert camera.config.shader_config.shader_pack not in [
198
+ "rt",
199
+ "rt-fast",
200
+ "rt-med",
201
+ ], "ray tracing shaders do not work with parallel rendering"
202
+ camera.capture()
203
+ rgba = get_rgba_tensor(camera, return_alpha)
204
+ image_data[name] = rgba
205
+ else:
206
+ for name, camera in self.human_render_cameras.items():
207
+ if camera_name is not None and name != camera_name:
208
+ continue
209
+ camera.capture()
210
+ rgba = get_rgba_tensor(camera, return_alpha)
211
+ image_data[name] = rgba
212
+
213
+ return image_data
214
+
215
+ ManiSkillScene.get_sensor_images = get_sensor_images
216
+ ManiSkillScene.get_human_render_camera_images = (
217
+ get_human_render_camera_images
218
+ )
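monkey_patch_maniskill() swaps the ManiSkillScene image helpers above in place, so it has to run before any ManiSkill scene or environment is constructed, which is the import order used in parallel_sim.py. A hedged ordering sketch:

    # Ordering sketch: patch first, create environments afterwards.
    from embodied_gen.utils.monkey_patches import monkey_patch_maniskill

    monkey_patch_maniskill()                     # replaces ManiSkillScene.get_sensor_images, etc.
    import gymnasium as gym
    import embodied_gen.envs.pick_embodiedgen    # registers PickEmbodiedGen-v1 (as in parallel_sim.py)
    # env = gym.make("PickEmbodiedGen-v1", num_envs=1, layout_file="layout.json")  # placeholder args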
embodied_gen/utils/process_media.py CHANGED
@@ -166,7 +166,7 @@ def combine_images_to_grid(
166
  images: list[str | Image.Image],
167
  cat_row_col: tuple[int, int] = None,
168
  target_wh: tuple[int, int] = (512, 512),
169
- ) -> list[str | Image.Image]:
170
  n_images = len(images)
171
  if n_images == 1:
172
  return images
@@ -377,6 +377,42 @@ def parse_text_prompts(prompts: list[str]) -> list[str]:
377
  return prompts
378
 
379

380
  def check_object_edge_truncated(
381
  mask: np.ndarray, edge_threshold: int = 5
382
  ) -> bool:
@@ -400,8 +436,15 @@ def check_object_edge_truncated(
400
 
401
 
402
  if __name__ == "__main__":
403
- merge_video_video(
404
- "outputs/imageto3d/room_bottle7/room_bottle_007/URDF_room_bottle_007/mesh_glo_normal.mp4", # noqa
405
- "outputs/imageto3d/room_bottle7/room_bottle_007/URDF_room_bottle_007/mesh.mp4", # noqa
406
- "merge.mp4",
407
- )
 
 
 
 
 
 
 
 
166
  images: list[str | Image.Image],
167
  cat_row_col: tuple[int, int] = None,
168
  target_wh: tuple[int, int] = (512, 512),
169
+ ) -> list[Image.Image]:
170
  n_images = len(images)
171
  if n_images == 1:
172
  return images
 
377
  return prompts
378
 
379
 
380
+ def alpha_blend_rgba(
381
+ fg_image: Union[str, Image.Image, np.ndarray],
382
+ bg_image: Union[str, Image.Image, np.ndarray],
383
+ ) -> Image.Image:
384
+ """Alpha blends a foreground RGBA image over a background RGBA image.
385
+
386
+ Args:
387
+ fg_image: Foreground image. Can be a file path (str), a PIL Image,
388
+ or a NumPy ndarray.
389
+ bg_image: Background image. Can be a file path (str), a PIL Image,
390
+ or a NumPy ndarray.
391
+
392
+ Returns:
393
+ A PIL Image representing the alpha-blended result in RGBA mode.
394
+ """
395
+ if isinstance(fg_image, str):
396
+ fg_image = Image.open(fg_image)
397
+ elif isinstance(fg_image, np.ndarray):
398
+ fg_image = Image.fromarray(fg_image)
399
+
400
+ if isinstance(bg_image, str):
401
+ bg_image = Image.open(bg_image)
402
+ elif isinstance(bg_image, np.ndarray):
403
+ bg_image = Image.fromarray(bg_image)
404
+
405
+ if fg_image.size != bg_image.size:
406
+ raise ValueError(
407
+ f"Image sizes not match {fg_image.size} v.s. {bg_image.size}."
408
+ )
409
+
410
+ fg = fg_image.convert("RGBA")
411
+ bg = bg_image.convert("RGBA")
412
+
413
+ return Image.alpha_composite(bg, fg)
414
+
415
+
416
  def check_object_edge_truncated(
417
  mask: np.ndarray, edge_threshold: int = 5
418
  ) -> bool:
 
436
 
437
 
438
  if __name__ == "__main__":
439
+ image_paths = [
440
+ "outputs/layouts_sim/task_0000/images/pen.png",
441
+ "outputs/layouts_sim/task_0000/images/notebook.png",
442
+ "outputs/layouts_sim/task_0000/images/mug.png",
443
+ "outputs/layouts_sim/task_0000/images/lamp.png",
444
+ "outputs/layouts_sim2/task_0014/images/cloth.png", # TODO
445
+ ]
446
+ for image_path in image_paths:
447
+ image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
448
+ mask = image[..., -1]
449
+ flag = check_object_edge_truncated(mask)
450
+ print(flag, image_path)
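alpha_blend_rgba() accepts file paths, PIL images, or arrays, as long as both inputs have the same size. A minimal self-contained sketch compositing a transparent foreground over a solid background:

    # Minimal sketch: a fully transparent foreground leaves the background visible.
    import numpy as np
    from embodied_gen.utils.process_media import alpha_blend_rgba

    fg = np.zeros((512, 512, 4), dtype=np.uint8)       # RGBA foreground, alpha = 0 everywhere
    bg = np.full((512, 512, 3), 255, dtype=np.uint8)   # white RGB background
    blended = alpha_blend_rgba(fg, bg)                 # PIL.Image in RGBA mode, still white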
embodied_gen/utils/simulation.py ADDED
@@ -0,0 +1,633 @@
1
+ # Project EmbodiedGen
2
+ #
3
+ # Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14
+ # implied. See the License for the specific language governing
15
+ # permissions and limitations under the License.
16
+
17
+ import json
18
+ import logging
19
+ import os
20
+ import xml.etree.ElementTree as ET
21
+ from collections import defaultdict
22
+ from typing import Literal
23
+
24
+ import mplib
25
+ import numpy as np
26
+ import sapien.core as sapien
27
+ import sapien.physx as physx
28
+ import torch
29
+ from mani_skill.agents.base_agent import BaseAgent
30
+ from mani_skill.envs.scene import ManiSkillScene
31
+ from mani_skill.examples.motionplanning.panda.utils import (
32
+ compute_grasp_info_by_obb,
33
+ )
34
+ from mani_skill.utils.geometry.trimesh_utils import get_component_mesh
35
+ from PIL import Image, ImageColor
36
+ from scipy.spatial.transform import Rotation as R
37
+ from embodied_gen.data.utils import DiffrastRender
38
+ from embodied_gen.utils.enum import LayoutInfo, Scene3DItemEnum
39
+ from embodied_gen.utils.geometry import quaternion_multiply
40
+ from embodied_gen.utils.log import logger
41
+
42
+ COLORMAP = list(set(ImageColor.colormap.values()))
43
+ COLOR_PALETTE = np.array(
44
+ [ImageColor.getrgb(c) for c in COLORMAP], dtype=np.uint8
45
+ )
46
+ SIM_COORD_ALIGN = np.array(
47
+ [
48
+ [1.0, 0.0, 0.0, 0.0],
49
+ [0.0, -1.0, 0.0, 0.0],
50
+ [0.0, 0.0, -1.0, 0.0],
51
+ [0.0, 0.0, 0.0, 1.0],
52
+ ]
53
+ ) # Used to align SAPIEN, MuJoCo coordinate system with the world coordinate system
54
+
55
+ __all__ = [
56
+ "SIM_COORD_ALIGN",
57
+ "FrankaPandaGrasper",
58
+ "load_assets_from_layout_file",
59
+ "load_mani_skill_robot",
60
+ "render_images",
61
+ ]
62
+
63
+
64
+ def load_actor_from_urdf(
65
+ scene: ManiSkillScene | sapien.Scene,
66
+ file_path: str,
67
+ pose: sapien.Pose,
68
+ env_idx: int = None,
69
+ use_static: bool = False,
70
+ update_mass: bool = False,
71
+ ) -> sapien.pysapien.Entity:
72
+ tree = ET.parse(file_path)
73
+ root = tree.getroot()
74
+ node_name = root.get("name")
75
+ file_dir = os.path.dirname(file_path)
76
+ visual_file = root.find('.//visual/geometry/mesh').get("filename")
77
+ collision_file = root.find('.//collision/geometry/mesh').get("filename")
78
+ visual_file = os.path.join(file_dir, visual_file)
79
+ collision_file = os.path.join(file_dir, collision_file)
80
+ static_fric = root.find('.//collision/gazebo/mu1').text
81
+ dynamic_fric = root.find('.//collision/gazebo/mu2').text
82
+
83
+ material = physx.PhysxMaterial(
84
+ static_friction=np.clip(float(static_fric), 0.1, 0.7),
85
+ dynamic_friction=np.clip(float(dynamic_fric), 0.1, 0.6),
86
+ restitution=0.05,
87
+ )
88
+ builder = scene.create_actor_builder()
89
+
90
+ body_type = "static" if use_static else "dynamic"
91
+ builder.set_physx_body_type(body_type)
92
+ builder.add_multiple_convex_collisions_from_file(
93
+ collision_file if body_type == "dynamic" else visual_file,
94
+ material=material,
95
+ # decomposition="coacd",
96
+ # decomposition_params=dict(
97
+ # threshold=0.05, max_convex_hull=64, verbose=False
98
+ # ),
99
+ )
100
+
101
+ builder.add_visual_from_file(visual_file)
102
+ builder.set_initial_pose(pose)
103
+ if isinstance(scene, ManiSkillScene) and env_idx is not None:
104
+ builder.set_scene_idxs([env_idx])
105
+
106
+ actor = builder.build(name=f"{node_name}-{env_idx}")
107
+
108
+ if update_mass and hasattr(actor.components[1], "mass"):
109
+ node_mass = float(root.find('.//inertial/mass').get("value"))
110
+ actor.components[1].set_mass(node_mass)
111
+
112
+ return actor
113
+
114
+
115
+ def load_assets_from_layout_file(
116
+ scene: ManiSkillScene | sapien.Scene,
117
+ layout: LayoutInfo | str,
118
+ z_offset: float = 0.0,
119
+ init_quat: list[float] = [0, 0, 0, 1],
120
+ env_idx: int = None,
121
+ ) -> dict[str, sapien.pysapien.Entity]:
122
+ """Load assets from `EmbodiedGen` layout-gen output and create actors in the scene.
123
+
124
+ Args:
125
+ scene (sapien.Scene | ManiSkillScene): The SAPIEN or ManiSkill scene to load assets into.
126
+ layout (LayoutInfo): The layout information data.
127
+ z_offset (float): Offset to apply to the Z-coordinate of non-context objects.
128
+ init_quat (List[float]): Initial quaternion (x, y, z, w) for orientation adjustment.
129
+ env_idx (int): Environment index for multi-environment setup.
130
+ """
131
+ if isinstance(layout, str) and layout.endswith(".json"):
132
+ layout = LayoutInfo.from_dict(json.load(open(layout, "r")))
133
+
134
+ actors = dict()
135
+ for node in layout.assets:
136
+ file_dir = layout.assets[node]
137
+ file_name = f"{node.replace(' ', '_')}.urdf"
138
+ urdf_file = os.path.join(file_dir, file_name)
139
+
140
+ if layout.objs_mapping[node] == Scene3DItemEnum.BACKGROUND.value:
141
+ continue
142
+
143
+ position = layout.position[node].copy()
144
+ if layout.objs_mapping[node] != Scene3DItemEnum.CONTEXT.value:
145
+ position[2] += z_offset
146
+
147
+ use_static = (
148
+ layout.relation.get(Scene3DItemEnum.CONTEXT.value, None) == node
149
+ )
150
+
151
+ # Combine initial quaternion with object quaternion
152
+ x, y, z, qx, qy, qz, qw = position
153
+ qx, qy, qz, qw = quaternion_multiply([qx, qy, qz, qw], init_quat)
154
+ actor = load_actor_from_urdf(
155
+ scene,
156
+ urdf_file,
157
+ sapien.Pose(p=[x, y, z], q=[qw, qx, qy, qz]),
158
+ env_idx,
159
+ use_static=use_static,
160
+ update_mass=False,
161
+ )
162
+ actors[node] = actor
163
+
164
+ return actors
165
+
166
+
167
+ def load_mani_skill_robot(
168
+ scene: sapien.Scene | ManiSkillScene,
169
+ layout: LayoutInfo | str,
170
+ control_freq: int = 20,
171
+ robot_init_qpos_noise: float = 0.0,
172
+ control_mode: str = "pd_joint_pos",
173
+ backend_str: tuple[str, str] = ("cpu", "gpu"),
174
+ ) -> BaseAgent:
175
+ from mani_skill.agents import REGISTERED_AGENTS
176
+ from mani_skill.envs.scene import ManiSkillScene
177
+ from mani_skill.envs.utils.system.backend import (
178
+ parse_sim_and_render_backend,
179
+ )
180
+
181
+ if isinstance(layout, str) and layout.endswith(".json"):
182
+ layout = LayoutInfo.from_dict(json.load(open(layout, "r")))
183
+
184
+ robot_name = layout.relation[Scene3DItemEnum.ROBOT.value]
185
+ x, y, z, qx, qy, qz, qw = layout.position[robot_name]
186
+ delta_z = 0.002 # Add small offset to avoid collision.
187
+ pose = sapien.Pose([x, y, z + delta_z], [qw, qx, qy, qz])
188
+
189
+ if robot_name not in REGISTERED_AGENTS:
190
+ logger.warning(
191
+ f"Robot `{robot_name}` not registered, chosen from {REGISTERED_AGENTS.keys()}, use `panda` instead."
192
+ )
193
+ robot_name = "panda"
194
+
195
+ ROBOT_CLS = REGISTERED_AGENTS[robot_name].agent_cls
196
+ backend = parse_sim_and_render_backend(*backend_str)
197
+ if isinstance(scene, sapien.Scene):
198
+ scene = ManiSkillScene([scene], device=backend_str[0], backend=backend)
199
+ robot = ROBOT_CLS(
200
+ scene=scene,
201
+ control_freq=control_freq,
202
+ control_mode=control_mode,
203
+ initial_pose=pose,
204
+ )
205
+
206
+ # Set the robot's initial joint angles in radians (joint0 to joint6, plus 2 gripper fingers).
207
+ qpos = np.array(
208
+ [
209
+ 0.0,
210
+ np.pi / 8,
211
+ 0,
212
+ -np.pi * 3 / 8,
213
+ 0,
214
+ np.pi * 3 / 4,
215
+ np.pi / 4,
216
+ 0.04,
217
+ 0.04,
218
+ ]
219
+ )
220
+ qpos = (
221
+ np.random.normal(
222
+ 0, robot_init_qpos_noise, (len(scene.sub_scenes), len(qpos))
223
+ )
224
+ + qpos
225
+ )
226
+ qpos[:, -2:] = 0.04
227
+ robot.reset(qpos)
228
+ robot.init_qpos = robot.robot.qpos
229
+ robot.controller.controllers["gripper"].reset()
230
+
231
+ return robot
232
+
233
+
234
+ def render_images(
235
+ camera: sapien.render.RenderCameraComponent,
236
+ render_keys: list[
237
+ Literal[
238
+ "Color",
239
+ "Segmentation",
240
+ "Normal",
241
+ "Mask",
242
+ "Depth",
243
+ "Foreground",
244
+ ]
245
+ ] = None,
246
+ ) -> dict[str, Image.Image]:
247
+ """Render images from a given sapien camera.
248
+
249
+ Args:
250
+ camera (sapien.render.RenderCameraComponent): The camera to render from.
251
+ render_keys (List[str]): Types of images to render (e.g., Color, Segmentation).
252
+
253
+ Returns:
254
+ Dict[str, Image.Image]: Dictionary of rendered images.
255
+ """
256
+ if render_keys is None:
257
+ render_keys = [
258
+ "Color",
259
+ "Segmentation",
260
+ "Normal",
261
+ "Mask",
262
+ "Depth",
263
+ "Foreground",
264
+ ]
265
+
266
+ results: dict[str, Image.Image] = {}
267
+ if "Color" in render_keys:
268
+ color = camera.get_picture("Color")
269
+ color_rgb = (np.clip(color[..., :3], 0, 1) * 255).astype(np.uint8)
270
+ results["Color"] = Image.fromarray(color_rgb)
271
+
272
+ if "Mask" in render_keys:
273
+ alpha = (np.clip(color[..., 3], 0, 1) * 255).astype(np.uint8)
274
+ results["Mask"] = Image.fromarray(alpha)
275
+
276
+ if "Segmentation" in render_keys:
277
+ seg_labels = camera.get_picture("Segmentation")
278
+ label0 = seg_labels[..., 0].astype(np.uint8)
279
+ seg_color = COLOR_PALETTE[label0]
280
+ results["Segmentation"] = Image.fromarray(seg_color)
281
+
282
+ if "Foreground" in render_keys:
283
+ seg_labels = camera.get_picture("Segmentation")
284
+ label0 = seg_labels[..., 0]
285
+ mask = np.where((label0 > 1), 255, 0).astype(np.uint8)
286
+ color = camera.get_picture("Color")
287
+ color_rgb = (np.clip(color[..., :3], 0, 1) * 255).astype(np.uint8)
288
+ foreground = np.concatenate([color_rgb, mask[..., None]], axis=-1)
289
+ results["Foreground"] = Image.fromarray(foreground)
290
+
291
+ if "Normal" in render_keys:
292
+ normal = camera.get_picture("Normal")[..., :3]
293
+ normal_img = (((normal + 1) / 2) * 255).astype(np.uint8)
294
+ results["Normal"] = Image.fromarray(normal_img)
295
+
296
+ if "Depth" in render_keys:
297
+ position_map = camera.get_picture("Position")
298
+ depth = -position_map[..., 2]
299
+ alpha = torch.tensor(color[..., 3], dtype=torch.float32)
300
+ norm_depth = DiffrastRender.normalize_map_by_mask(
301
+ torch.tensor(depth), alpha
302
+ )
303
+ depth_img = (norm_depth * 255).to(torch.uint8).numpy()
304
+ results["Depth"] = Image.fromarray(depth_img)
305
+
306
+ return results
307
+
308
+
309
+ class SapienSceneManager:
310
+ """A class to manage SAPIEN simulator."""
311
+
312
+ def __init__(
313
+ self, sim_freq: int, ray_tracing: bool, device: str = "cuda"
314
+ ) -> None:
315
+ self.sim_freq = sim_freq
316
+ self.ray_tracing = ray_tracing
317
+ self.device = device
318
+ self.renderer = sapien.SapienRenderer()
319
+ self.scene = self._setup_scene()
320
+ self.cameras: list[sapien.render.RenderCameraComponent] = []
321
+ self.actors: dict[str, sapien.pysapien.Entity] = {}
322
+
323
+ def _setup_scene(self) -> sapien.Scene:
324
+ """Set up the SAPIEN scene with lighting and ground."""
325
+ # Ray tracing settings
326
+ if self.ray_tracing:
327
+ sapien.render.set_camera_shader_dir("rt")
328
+ sapien.render.set_ray_tracing_samples_per_pixel(64)
329
+ sapien.render.set_ray_tracing_path_depth(10)
330
+ sapien.render.set_ray_tracing_denoiser("oidn")
331
+
332
+ scene = sapien.Scene()
333
+ scene.set_timestep(1 / self.sim_freq)
334
+
335
+ # Add lighting
336
+ scene.set_ambient_light([0.2, 0.2, 0.2])
337
+ scene.add_directional_light(
338
+ direction=[0, 1, -1],
339
+ color=[1.5, 1.45, 1.4],
340
+ shadow=True,
341
+ shadow_map_size=2048,
342
+ )
343
+ scene.add_directional_light(
344
+ direction=[0, -0.5, 1], color=[0.8, 0.8, 0.85], shadow=False
345
+ )
346
+ scene.add_directional_light(
347
+ direction=[0, -1, 1], color=[1.0, 1.0, 1.0], shadow=False
348
+ )
349
+
350
+ ground_material = self.renderer.create_material()
351
+ ground_material.base_color = [0.5, 0.5, 0.5, 1] # rgba, gray
352
+ ground_material.roughness = 0.7
353
+ ground_material.metallic = 0.0
354
+ scene.add_ground(0, render_material=ground_material)
355
+
356
+ return scene
357
+
358
+ def step_action(
359
+ self,
360
+ agent: BaseAgent,
361
+ action: torch.Tensor,
362
+ cameras: list[sapien.render.RenderCameraComponent],
363
+ render_keys: list[str],
364
+ sim_steps_per_control: int = 1,
365
+ ) -> dict:
366
+ agent.set_action(action)
367
+ frames = defaultdict(list)
368
+ for _ in range(sim_steps_per_control):
369
+ self.scene.step()
370
+
371
+ self.scene.update_render()
372
+ for camera in cameras:
373
+ camera.take_picture()
374
+ images = render_images(camera, render_keys=render_keys)
375
+ frames[camera.name].append(images)
376
+
377
+ return frames
378
+
379
+ def create_camera(
380
+ self,
381
+ cam_name: str,
382
+ pose: sapien.Pose,
383
+ image_hw: tuple[int, int],
384
+ fovy_deg: float,
385
+ ) -> sapien.render.RenderCameraComponent:
386
+ """Create a single camera in the scene.
387
+
388
+ Args:
389
+ cam_name (str): Name of the camera.
390
+ pose (sapien.Pose): Camera pose p=(x, y, z), q=(w, x, y, z)
391
+ image_hw (Tuple[int, int]): Image resolution (height, width) for cameras.
392
+ fovy_deg (float): Field of view in degrees for cameras.
393
+
394
+ Returns:
395
+ sapien.render.RenderCameraComponent: The created camera.
396
+ """
397
+ cam_actor = self.scene.create_actor_builder().build_kinematic()
398
+ cam_actor.set_pose(pose)
399
+ camera = self.scene.add_mounted_camera(
400
+ name=cam_name,
401
+ mount=cam_actor,
402
+ pose=sapien.Pose(p=[0, 0, 0], q=[1, 0, 0, 0]),
403
+ width=image_hw[1],
404
+ height=image_hw[0],
405
+ fovy=np.deg2rad(fovy_deg),
406
+ near=0.01,
407
+ far=100,
408
+ )
409
+ self.cameras.append(camera)
410
+
411
+ return camera
412
+
413
+ def initialize_circular_cameras(
414
+ self,
415
+ num_cameras: int,
416
+ radius: float,
417
+ height: float,
418
+ target_pt: list[float],
419
+ image_hw: tuple[int, int],
420
+ fovy_deg: float,
421
+ ) -> list[sapien.render.RenderCameraComponent]:
422
+ """Initialize multiple cameras arranged in a circle.
423
+
424
+ Args:
425
+ num_cameras (int): Number of cameras to create.
426
+ radius (float): Radius of the camera circle.
427
+ height (float): Fixed Z-coordinate of the cameras.
428
+ target_pt (list[float]): 3D point (x, y, z) that cameras look at.
429
+ image_hw (Tuple[int, int]): Image resolution (height, width) for cameras.
430
+ fovy_deg (float): Field of view in degrees for cameras.
431
+
432
+ Returns:
433
+ List[sapien.render.RenderCameraComponent]: List of created cameras.
434
+ """
435
+ angle_step = 2 * np.pi / num_cameras
436
+ world_up_vec = np.array([0.0, 0.0, 1.0])
437
+ target_pt = np.array(target_pt)
438
+
439
+ for i in range(num_cameras):
440
+ angle = i * angle_step
441
+ cam_x = radius * np.cos(angle)
442
+ cam_y = radius * np.sin(angle)
443
+ cam_z = height
444
+ eye_pos = [cam_x, cam_y, cam_z]
445
+
446
+ forward_vec = target_pt - eye_pos
447
+ forward_vec = forward_vec / np.linalg.norm(forward_vec)
448
+ temp_right_vec = np.cross(forward_vec, world_up_vec)
449
+
450
+ if np.linalg.norm(temp_right_vec) < 1e-6:
451
+ temp_right_vec = np.array([1.0, 0.0, 0.0])
452
+ if np.abs(np.dot(temp_right_vec, forward_vec)) > 0.99:
453
+ temp_right_vec = np.array([0.0, 1.0, 0.0])
454
+
455
+ right_vec = temp_right_vec / np.linalg.norm(temp_right_vec)
456
+ up_vec = np.cross(right_vec, forward_vec)
457
+ rotation_matrix = np.array([forward_vec, -right_vec, up_vec]).T
458
+
459
+ rot = R.from_matrix(rotation_matrix)
460
+ scipy_quat = rot.as_quat() # (x, y, z, w)
461
+ quat = [
462
+ scipy_quat[3],
463
+ scipy_quat[0],
464
+ scipy_quat[1],
465
+ scipy_quat[2],
466
+ ] # (w, x, y, z)
467
+
468
+ self.create_camera(
469
+ f"camera_{i}",
470
+ sapien.Pose(p=eye_pos, q=quat),
471
+ image_hw,
472
+ fovy_deg,
473
+ )
474
+
475
+ return self.cameras
476
+
+
+
+class FrankaPandaGrasper(object):
+    def __init__(
+        self,
+        agent: BaseAgent,
+        control_freq: float,
+        joint_vel_limits: float = 2.0,
+        joint_acc_limits: float = 1.0,
+        finger_length: float = 0.025,
+    ) -> None:
+        self.agent = agent
+        self.robot = agent.robot
+        self.control_freq = control_freq
+        self.control_timestep = 1 / control_freq
+        self.joint_vel_limits = joint_vel_limits
+        self.joint_acc_limits = joint_acc_limits
+        self.finger_length = finger_length
+        self.planners = self._setup_planner()
+
+    def _setup_planner(self) -> list[mplib.Planner]:
+        # Build one motion planner per parallel environment.
+        planners = []
+        for pose in self.robot.pose:
+            link_names = [link.get_name() for link in self.robot.get_links()]
+            joint_names = [
+                joint.get_name() for joint in self.robot.get_active_joints()
+            ]
+            planner = mplib.Planner(
+                urdf=self.agent.urdf_path,
+                srdf=self.agent.urdf_path.replace(".urdf", ".srdf"),
+                user_link_names=link_names,
+                user_joint_names=joint_names,
+                move_group="panda_hand_tcp",
+                joint_vel_limits=np.ones(7) * self.joint_vel_limits,
+                joint_acc_limits=np.ones(7) * self.joint_acc_limits,
+            )
+            planner.set_base_pose(pose.raw_pose[0].tolist())
+            planners.append(planner)
+
+        return planners
+
+    def control_gripper(
+        self,
+        gripper_state: Literal[-1, 1],
+        n_step: int = 10,
+        env_idx: int = 0,
+    ) -> np.ndarray:
+        # Hold the current arm qpos and repeat the gripper command for n_step steps.
+        qpos = self.robot.get_qpos()[env_idx, :-2].cpu().numpy()
+        actions = []
+        for _ in range(n_step):
+            action = np.hstack([qpos, gripper_state])[None, ...]
+            actions.append(action)
+
+        return np.concatenate(actions, axis=0)
+
+    def move_to_pose(
+        self,
+        pose: sapien.Pose,
+        control_timestep: float,
+        gripper_state: Literal[-1, 1],
+        use_point_cloud: bool = False,
+        n_max_step: int = 100,
+        action_key: str = "position",
+        env_idx: int = 0,
+    ) -> np.ndarray | None:
+        result = self.planners[env_idx].plan_qpos_to_pose(
+            np.concatenate([pose.p, pose.q]),
+            self.robot.get_qpos().cpu().numpy()[0],
+            time_step=control_timestep,
+            use_point_cloud=use_point_cloud,
+        )
+
+        # Fall back to screw motion planning if sampling-based planning fails.
+        if result["status"] != "Success":
+            result = self.planners[env_idx].plan_screw(
+                np.concatenate([pose.p, pose.q]),
+                self.robot.get_qpos().cpu().numpy()[0],
+                time_step=control_timestep,
+                use_point_cloud=use_point_cloud,
+            )
+
+        if result["status"] != "Success":
+            return None
+
+        # Subsample the planned waypoints so at most ~n_max_step actions remain.
+        sample_ratio = (len(result[action_key]) // n_max_step) + 1
+        result[action_key] = result[action_key][::sample_ratio]
+
+        n_step = len(result[action_key])
+        actions = []
+        for i in range(n_step):
+            qpos = result[action_key][i]
+            action = np.hstack([qpos, gripper_state])[None, ...]
+            actions.append(action)
+
+        return np.concatenate(actions, axis=0)
+
+    def compute_grasp_action(
+        self,
+        actor: sapien.pysapien.Entity,
+        reach_target_only: bool = True,
+        offset: tuple[float, float, float] = (0, 0, -0.05),
+        env_idx: int = 0,
+    ) -> np.ndarray | None:
+        # Derive a grasp pose from the actor's oriented bounding box.
+        physx_rigid = actor.components[1]
+        mesh = get_component_mesh(physx_rigid, to_world_frame=True)
+        obb = mesh.bounding_box_oriented
+        approaching = np.array([0, 0, -1])
+        tcp_pose = self.agent.tcp.pose[env_idx]
+        target_closing = (
+            tcp_pose.to_transformation_matrix()[0, :3, 1].cpu().numpy()
+        )
+        grasp_info = compute_grasp_info_by_obb(
+            obb,
+            approaching=approaching,
+            target_closing=target_closing,
+            depth=self.finger_length,
+        )
+
+        closing, center = grasp_info["closing"], grasp_info["center"]
+        raw_tcp_pose = tcp_pose.sp
+        grasp_pose = self.agent.build_grasp_pose(approaching, closing, center)
+        reach_pose = grasp_pose * sapien.Pose(p=offset)
+        grasp_pose = grasp_pose * sapien.Pose(p=[0, 0, 0.01])
+        actions = []
+        reach_actions = self.move_to_pose(
+            reach_pose,
+            self.control_timestep,
+            gripper_state=1,
+            env_idx=env_idx,
+        )
+        if reach_actions is None:
+            logger.warning(
+                f"Failed to reach the grasp pose for node `{actor.name}`, skipping grasping."
+            )
+            return None
+        actions.append(reach_actions)
+
+        if not reach_target_only:
+            grasp_actions = self.move_to_pose(
+                grasp_pose,
+                self.control_timestep,
+                gripper_state=1,
+                env_idx=env_idx,
+            )
+            actions.append(grasp_actions)
+            close_actions = self.control_gripper(
+                gripper_state=-1,
+                env_idx=env_idx,
+            )
+            actions.append(close_actions)
+            back_actions = self.move_to_pose(
+                raw_tcp_pose,
+                self.control_timestep,
+                gripper_state=-1,
+                env_idx=env_idx,
+            )
+            actions.append(back_actions)
+
+        return np.concatenate(actions, axis=0)
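A hedged usage sketch of the grasper above; the env, actor, and control frequency are assumptions (a ManiSkill-style environment exposing a Panda agent), not part of this commit:

    # Hypothetical setup: `env` wraps a Panda agent, `actor` is a sapien.Entity
    # already placed in the scene.
    grasper = FrankaPandaGrasper(env.agent, control_freq=20)
    actions = grasper.compute_grasp_action(actor, reach_target_only=False)
    if actions is not None:      # None means motion planning failed
        for action in actions:   # reach -> grasp -> close -> retreat sequence
            env.step(action)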
embodied_gen/utils/tags.py CHANGED
@@ -1 +1 @@
-VERSION = "v0.1.2"
+VERSION = "v0.1.3"
embodied_gen/validators/quality_checkers.py CHANGED
@@ -109,7 +109,7 @@ class MeshGeoChecker(BaseChecker):
         if self.prompt is None:
             self.prompt = """
         You are an expert in evaluating the geometry quality of generated 3D asset.
-        You will be given rendered views of a generated 3D asset with black background.
+        You will be given rendered views of a generated 3D asset, type {}, with black background.
         Your task is to evaluate the quality of the 3D asset generation,
         including geometry, structure, and appearance, based on the rendered views.
         Criteria:
@@ -130,10 +130,13 @@ class MeshGeoChecker(BaseChecker):
         Image shows a chair with simplified back legs and soft edges → YES
         """

-    def query(self, image_paths: list[str | Image.Image]) -> str:
+    def query(
+        self, image_paths: list[str | Image.Image], text: str = "unknown"
+    ) -> str:
+        input_prompt = self.prompt.format(text)

         return self.gpt_client.query(
-            text_prompt=self.prompt,
+            text_prompt=input_prompt,
             image_base64=image_paths,
         )
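For reference, a minimal sketch of the reworked `query` call; the checker instance and image paths are hypothetical. The new `text` argument fills the "type {}" placeholder in the prompt via `str.format` before the GPT query:

    # `checker` is assumed to be an already-constructed MeshGeoChecker.
    response = checker.query(
        image_paths=["renders/view_0.png", "renders/view_1.png"],
        text="cup",  # asset category hint; defaults to "unknown"
    )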
embodied_gen/validators/urdf_convertor.py CHANGED
@@ -24,6 +24,7 @@ from xml.dom.minidom import parseString

 import numpy as np
 import trimesh
+from embodied_gen.data.convex_decomposer import decompose_convex_mesh
 from embodied_gen.utils.gpt_clients import GPT_CLIENT, GPTclient
 from embodied_gen.utils.process_media import render_asset3d
 from embodied_gen.utils.tags import VERSION
@@ -84,6 +85,7 @@ class URDFGenerator(object):
         attrs_name: list[str] = None,
         render_dir: str = "urdf_renders",
         render_view_num: int = 4,
+        decompose_convex: bool = False,
     ) -> None:
         if mesh_file_list is None:
             mesh_file_list = []
@@ -107,36 +109,37 @@ class URDFGenerator(object):
        already provided, use it directly), accurately describe this 3D object asset (within 15 words),
        Determine the pose of the object in the first image and estimate the true vertical height
        (vertical projection) range of the object (in meters), i.e., how tall the object appears from top
-       to bottom in the front view (first) image. also weight range (unit: kilogram), the average
+       to bottom in the first image. Also estimate the weight range (unit: kilogram), the average
        static friction coefficient of the object relative to rubber and the average dynamic friction
-       coefficient of the object relative to rubber. Return response format as shown in Output Example.
+       coefficient of the object relative to rubber. Return the response in the format shown in the Output Example.

        Output Example:
        Category: cup
        Description: shiny golden cup with floral design
-       Height: 0.1-0.15 m
+       Pose: <short_description_within_10_words>
+       Height: 0.10-0.15 m
        Weight: 0.3-0.6 kg
        Static friction coefficient: 0.6
        Dynamic friction coefficient: 0.5

-       IMPORTANT: Estimating Vertical Height from the First (Front View) Image.
+       IMPORTANT: Estimating Vertical Height from the First (Front View) Image, with pose estimation based on all views.
        - The "vertical height" refers to the real-world vertical size of the object
        as projected in the first image, aligned with the image's vertical axis.
        - For flat objects like plates or disks or book, if their face is visible in the front view,
        use the diameter as the vertical height. If the edge is visible, use the thickness instead.
        - This is not necessarily the full length of the object, but how tall it appears
-       in the first image vertically, based on its pose and orientation.
-       - For objects(e.g., spoons, forks, writing instruments etc.) at an angle showing in
-       the first image, tilted at 45Β° will appear shorter vertically than when upright.
+       in the first image vertically, based on its pose and orientation estimated from all views.
+       - For objects (e.g., spoons, forks, writing instruments, etc.) shown at an angle in the images,
+       e.g., tilted at 45°, they will appear shorter vertically than when upright.
        Estimate the vertical projection of their real length based on its pose.
        For example:
-       - A pen standing upright in the first view (aligned with the image's vertical axis)
+       - A pen standing upright in the first image (aligned with the image's vertical axis)
        full body visible in the first image: → vertical height ≈ 0.14-0.20 m
-       - A pen lying flat in the front view (showing thickness) β†’ vertical height β‰ˆ 0.018-0.025 m
+       - A pen lying flat in the first image (showing thickness or as a dot) → vertical height ≈ 0.018-0.025 m
        - Tilted pen in the first image (e.g., ~45° angle): vertical height ≈ 0.07-0.12 m
-       - Use the rest views(except the first image) to help determine the object's 3D pose and orientation.
+       - Use the rest of the views to help determine the object's 3D pose and orientation.
        Assume the object is in real-world scale and estimate the approximate vertical height
-       (in meters) based on how large it appears vertically in the first image.
+       based on the pose estimation and how large it appears vertically in the first image.
        """
        )
@@ -155,6 +158,7 @@ class URDFGenerator(object):
             "gs_model",
         ]
         self.attrs_name = attrs_name
+        self.decompose_convex = decompose_convex

     def parse_response(self, response: str) -> dict[str, any]:
         lines = response.split("\n")
@@ -163,14 +167,14 @@ class URDFGenerator(object):
         description = lines[1].split(": ")[1]
         min_height, max_height = map(
             lambda x: float(x.strip().replace(",", "").split()[0]),
-            lines[2].split(": ")[1].split("-"),
+            lines[3].split(": ")[1].split("-"),
         )
         min_mass, max_mass = map(
             lambda x: float(x.strip().replace(",", "").split()[0]),
-            lines[3].split(": ")[1].split("-"),
+            lines[4].split(": ")[1].split("-"),
         )
-        mu1 = float(lines[4].split(": ")[1].replace(",", ""))
-        mu2 = float(lines[5].split(": ")[1].replace(",", ""))
+        mu1 = float(lines[5].split(": ")[1].replace(",", ""))
+        mu2 = float(lines[6].split(": ")[1].replace(",", ""))

         return {
             "category": category.lower(),
@@ -257,9 +261,24 @@ class URDFGenerator(object):
         # Update collision geometry
         collision = link.find("collision/geometry/mesh")
         if collision is not None:
-            collision.set(
-                "filename", os.path.join(self.output_mesh_dir, obj_name)
-            )
+            collision_mesh = os.path.join(self.output_mesh_dir, obj_name)
+            if self.decompose_convex:
+                try:
+                    d_params = dict(
+                        threshold=0.05, max_convex_hull=64, verbose=False
+                    )
+                    filename = f"{os.path.splitext(obj_name)[0]}_collision.ply"
+                    output_path = os.path.join(mesh_folder, filename)
+                    decompose_convex_mesh(
+                        mesh_output_path, output_path, **d_params
+                    )
+                    collision_mesh = f"{self.output_mesh_dir}/{filename}"
+                except Exception as e:
+                    logger.warning(
+                        f"Convex decomposition failed for {output_path}: {e}. "
+                        "Using the original mesh for collision computation."
+                    )
+            collision.set("filename", collision_mesh)
             collision.set("scale", "1.0 1.0 1.0")

         # Update friction coefficients
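Because the prompt now asks for a `Pose:` line between `Description:` and `Height:`, `parse_response` reads height, weight, and the friction coefficients one line later than before. An illustrative response that the updated parser accepts (values made up):

    response = "\n".join([
        "Category: cup",                                     # lines[0]
        "Description: shiny golden cup with floral design",  # lines[1]
        "Pose: upright, handle facing the camera",           # lines[2], new
        "Height: 0.10-0.15 m",                               # lines[3]
        "Weight: 0.3-0.6 kg",                                # lines[4]
        "Static friction coefficient: 0.6",                  # lines[5]
        "Dynamic friction coefficient: 0.5",                 # lines[6]
    ])
    attrs = urdf_generator.parse_response(response)  # hypothetical instance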
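And a standalone sketch of the convex-decomposition path added above, reusing the same `decompose_convex_mesh` parameters; the input and output paths are hypothetical:

    from embodied_gen.data.convex_decomposer import decompose_convex_mesh

    decompose_convex_mesh(
        "outputs/mesh/sample.obj",            # hypothetical visual mesh
        "outputs/mesh/sample_collision.ply",  # merged convex hulls for collision
        threshold=0.05,
        max_convex_hull=64,
        verbose=False,
    )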