Spaces:

heheyas
/

V3D

Running on Zero

File size: 29,825 Bytes

cfb7702

import numpy as np
from pathlib import Path
from PIL import Image
import json
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, default_collate
from torchvision.transforms import ToTensor, Normalize, Compose, Resize
from torchvision.transforms.functional import to_tensor
from pytorch_lightning import LightningDataModule
from einops import rearrange


def read_camera_matrix_single(json_file):
    # for gobjaverse
    with open(json_file, "r", encoding="utf8") as reader:
        json_content = json.load(reader)

    # negative sign for opencv to opengl
    camera_matrix = torch.zeros(3, 4)
    camera_matrix[:3, 0] = torch.tensor(json_content["x"])
    camera_matrix[:3, 1] = -torch.tensor(json_content["y"])
    camera_matrix[:3, 2] = -torch.tensor(json_content["z"])
    camera_matrix[:3, 3] = torch.tensor(json_content["origin"])
    """
    camera_matrix = np.eye(4)
    camera_matrix[:3, 0] = np.array(json_content['x'])
    camera_matrix[:3, 1] = np.array(json_content['y'])
    camera_matrix[:3, 2] = np.array(json_content['z'])
    camera_matrix[:3, 3] = np.array(json_content['origin'])
    # print(camera_matrix)
    """

    return camera_matrix


def read_camera_instrinsics_single(json_file, h: int, w: int, scale: float = 1.0):
    with open(json_file, "r", encoding="utf8") as reader:
        json_content = json.load(reader)

    h = int(h * scale)
    w = int(w * scale)

    y_fov = json_content["y_fov"]
    x_fov = json_content["x_fov"]

    fy = h / 2 / np.tan(y_fov / 2)
    fx = w / 2 / np.tan(x_fov / 2)

    cx = w // 2
    cy = h // 2

    intrinsics = torch.tensor(
        [
            [fx, fy],
            [cx, cy],
            [w, h],
        ],
        dtype=torch.float32,
    )
    return intrinsics


def compose_extrinsic_RT(RT: torch.Tensor):
    """
    Compose the standard form extrinsic matrix from RT.
    Batched I/O.
    """
    return torch.cat(
        [
            RT,
            torch.tensor([[[0, 0, 0, 1]]], dtype=torch.float32).repeat(
                RT.shape[0], 1, 1
            ),
        ],
        dim=1,
    )


def get_normalized_camera_intrinsics(intrinsics: torch.Tensor):
    """
    intrinsics: (N, 3, 2), [[fx, fy], [cx, cy], [width, height]]
    Return batched fx, fy, cx, cy
    """
    fx, fy = intrinsics[:, 0, 0], intrinsics[:, 0, 1]
    cx, cy = intrinsics[:, 1, 0], intrinsics[:, 1, 1]
    width, height = intrinsics[:, 2, 0], intrinsics[:, 2, 1]
    fx, fy = fx / width, fy / height
    cx, cy = cx / width, cy / height
    return fx, fy, cx, cy


def build_camera_standard(RT: torch.Tensor, intrinsics: torch.Tensor):
    """
    RT: (N, 3, 4)
    intrinsics: (N, 3, 2), [[fx, fy], [cx, cy], [width, height]]
    """
    E = compose_extrinsic_RT(RT)
    fx, fy, cx, cy = get_normalized_camera_intrinsics(intrinsics)
    I = torch.stack(
        [
            torch.stack([fx, torch.zeros_like(fx), cx], dim=-1),
            torch.stack([torch.zeros_like(fy), fy, cy], dim=-1),
            torch.tensor([[0, 0, 1]], dtype=torch.float32).repeat(RT.shape[0], 1),
        ],
        dim=1,
    )
    return torch.cat(
        [
            E.reshape(-1, 16),
            I.reshape(-1, 9),
        ],
        dim=-1,
    )


def calc_elevation(c2w):
    ## works for single or batched c2w
    ## assume world up is (0, 0, 1)
    pos = c2w[..., :3, 3]

    return np.arcsin(pos[..., 2] / np.linalg.norm(pos, axis=-1, keepdims=False))


def read_camera_matrix_single(json_file):
    with open(json_file, "r", encoding="utf8") as reader:
        json_content = json.load(reader)

    # negative sign for opencv to opengl
    # camera_matrix = np.zeros([3, 4])
    # camera_matrix[:3, 0] = np.array(json_content["x"])
    # camera_matrix[:3, 1] = -np.array(json_content["y"])
    # camera_matrix[:3, 2] = -np.array(json_content["z"])
    # camera_matrix[:3, 3] = np.array(json_content["origin"])
    camera_matrix = torch.zeros([3, 4])
    camera_matrix[:3, 0] = torch.tensor(json_content["x"])
    camera_matrix[:3, 1] = -torch.tensor(json_content["y"])
    camera_matrix[:3, 2] = -torch.tensor(json_content["z"])
    camera_matrix[:3, 3] = torch.tensor(json_content["origin"])
    """
    camera_matrix = np.eye(4)
    camera_matrix[:3, 0] = np.array(json_content['x'])
    camera_matrix[:3, 1] = np.array(json_content['y'])
    camera_matrix[:3, 2] = np.array(json_content['z'])
    camera_matrix[:3, 3] = np.array(json_content['origin'])
    # print(camera_matrix)
    """

    return camera_matrix


def blend_white_bg(image):
    new_image = Image.new("RGB", image.size, (255, 255, 255))
    new_image.paste(image, mask=image.split()[3])

    return new_image


def flatten_for_video(input):
    return input.flatten()


FLATTEN_FIELDS = ["fps_id", "motion_bucket_id", "cond_aug", "elevation"]


def video_collate_fn(batch: list[dict], *args, **kwargs):
    out = {}
    for key in batch[0].keys():
        if key in FLATTEN_FIELDS:
            out[key] = default_collate([item[key] for item in batch])
            out[key] = flatten_for_video(out[key])
        elif key == "num_video_frames":
            out[key] = batch[0][key]
        elif key in ["frames", "latents", "rgb"]:
            out[key] = default_collate([item[key] for item in batch])
            out[key] = rearrange(out[key], "b t c h w -> (b t) c h w")
        else:
            out[key] = default_collate([item[key] for item in batch])

    if "pixelnerf_input" in out:
        out["pixelnerf_input"]["rgb"] = rearrange(
            out["pixelnerf_input"]["rgb"], "b t c h w -> (b t) c h w"
        )

    return out


class GObjaverse(Dataset):
    def __init__(
        self,
        root_dir,
        split="train",
        transform=None,
        random_front=False,
        max_item=None,
        cond_aug_mean=-3.0,
        cond_aug_std=0.5,
        condition_on_elevation=False,
        fps_id=0.0,
        motion_bucket_id=300.0,
        use_latents=False,
        load_caps=False,
        front_view_selection="random",
        load_pixelnerf=False,
        debug_base_idx=None,
        scale_pose: bool = False,
        max_n_cond: int = 1,
        **unused_kwargs,
    ):
        self.root_dir = Path(root_dir)
        self.split = split
        self.random_front = random_front
        self.transform = transform
        self.use_latents = use_latents

        self.ids = json.load(open(self.root_dir / "valid_uids.json", "r"))
        self.n_views = 24

        self.load_caps = load_caps
        if self.load_caps:
            self.caps = json.load(open(self.root_dir / "text_captions_cap3d.json", "r"))

        self.cond_aug_mean = cond_aug_mean
        self.cond_aug_std = cond_aug_std
        self.condition_on_elevation = condition_on_elevation
        self.fps_id = fps_id
        self.motion_bucket_id = motion_bucket_id
        self.load_pixelnerf = load_pixelnerf
        self.scale_pose = scale_pose
        self.max_n_cond = max_n_cond

        if self.use_latents:
            self.latents_dir = self.root_dir / "latents256"
            self.clip_dir = self.root_dir / "clip_emb256"

        self.front_view_selection = front_view_selection
        if self.front_view_selection == "random":
            pass
        elif self.front_view_selection == "fixed":
            pass
        elif self.front_view_selection.startswith("clip_score"):
            self.clip_scores = torch.load(self.root_dir / "clip_score_per_view.pt")
            self.ids = list(self.clip_scores.keys())
        else:
            raise ValueError(
                f"Unknown front view selection method {self.front_view_selection}"
            )

        if max_item is not None:
            self.ids = self.ids[:max_item]
            ## debug
            self.ids = self.ids * 10000

        if debug_base_idx is not None:
            print(f"debug mode with base idx: {debug_base_idx}")
            self.debug_base_idx = debug_base_idx

    def __getitem__(self, idx: int):
        if hasattr(self, "debug_base_idx"):
            idx = (idx + self.debug_base_idx) % len(self.ids)
        data = {}
        idx_list = np.arange(self.n_views)
        # if self.random_front:
        #     roll_idx = np.random.randint(self.n_views)
        #     idx_list = np.roll(idx_list, roll_idx)
        if self.front_view_selection == "random":
            roll_idx = np.random.randint(self.n_views)
            idx_list = np.roll(idx_list, roll_idx)
        elif self.front_view_selection == "fixed":
            pass
        elif self.front_view_selection == "clip_score_softmax":
            this_clip_score = (
                F.softmax(self.clip_scores[self.ids[idx]], dim=-1).cpu().numpy()
            )
            roll_idx = np.random.choice(idx_list, p=this_clip_score)
            idx_list = np.roll(idx_list, roll_idx)
        elif self.front_view_selection == "clip_score_max":
            this_clip_score = (
                F.softmax(self.clip_scores[self.ids[idx]], dim=-1).cpu().numpy()
            )
            roll_idx = np.argmax(this_clip_score)
            idx_list = np.roll(idx_list, roll_idx)
        frames = []
        if not self.use_latents:
            try:
                for view_idx in idx_list:
                    frame = Image.open(
                        self.root_dir
                        / "gobjaverse"
                        / self.ids[idx]
                        / f"{view_idx:05d}/{view_idx:05d}.png"
                    )
                    frames.append(self.transform(frame))
            except:
                idx = 0
                frames = []
                for view_idx in idx_list:
                    frame = Image.open(
                        self.root_dir
                        / "gobjaverse"
                        / self.ids[idx]
                        / f"{view_idx:05d}/{view_idx:05d}.png"
                    )
                    frames.append(self.transform(frame))
                # a workaround for some bugs in gobjaverse
                # use idx=0 and the repeat will be resolved when gathering results, valid number of items can be checked by the len of results
            frames = torch.stack(frames, dim=0)
            cond = frames[0]

            cond_aug = np.exp(
                np.random.randn(1)[0] * self.cond_aug_std + self.cond_aug_mean
            )

            data.update(
                {
                    "frames": frames,
                    "cond_frames_without_noise": cond,
                    "cond_aug": torch.as_tensor([cond_aug] * self.n_views),
                    "cond_frames": cond + cond_aug * torch.randn_like(cond),
                    "fps_id": torch.as_tensor([self.fps_id] * self.n_views),
                    "motion_bucket_id": torch.as_tensor(
                        [self.motion_bucket_id] * self.n_views
                    ),
                    "num_video_frames": 24,
                    "image_only_indicator": torch.as_tensor([0.0] * self.n_views),
                }
            )
        else:
            latents = torch.load(self.latents_dir / f"{self.ids[idx]}.pt")[idx_list]
            clip_emb = torch.load(self.clip_dir / f"{self.ids[idx]}.pt")[idx_list][0]

            cond = latents[0]

            cond_aug = np.exp(
                np.random.randn(1)[0] * self.cond_aug_std + self.cond_aug_mean
            )

            data.update(
                {
                    "latents": latents,
                    "cond_frames_without_noise": clip_emb,
                    "cond_aug": torch.as_tensor([cond_aug] * self.n_views),
                    "cond_frames": cond + cond_aug * torch.randn_like(cond),
                    "fps_id": torch.as_tensor([self.fps_id] * self.n_views),
                    "motion_bucket_id": torch.as_tensor(
                        [self.motion_bucket_id] * self.n_views
                    ),
                    "num_video_frames": 24,
                    "image_only_indicator": torch.as_tensor([0.0] * self.n_views),
                }
            )

        if self.condition_on_elevation:
            sample_c2w = read_camera_matrix_single(
                self.root_dir / self.ids[idx] / f"00000/00000.json"
            )
            elevation = calc_elevation(sample_c2w)
            data["elevation"] = torch.as_tensor([elevation] * self.n_views)

        if self.load_pixelnerf:
            assert "frames" in data, f"pixelnerf cannot work with latents only mode"
            data["pixelnerf_input"] = {}
            RTs = []
            intrinsics = []
            for view_idx in idx_list:
                meta = (
                    self.root_dir
                    / "gobjaverse"
                    / self.ids[idx]
                    / f"{view_idx:05d}/{view_idx:05d}.json"
                )
                RTs.append(read_camera_matrix_single(meta)[:3])
                intrinsics.append(read_camera_instrinsics_single(meta, 256, 256))
            RTs = torch.stack(RTs, dim=0)
            intrinsics = torch.stack(intrinsics, dim=0)
            cameras = build_camera_standard(RTs, intrinsics)
            data["pixelnerf_input"]["cameras"] = cameras

            downsampled = []
            for view_idx in idx_list:
                frame = Image.open(
                    self.root_dir
                    / "gobjaverse"
                    / self.ids[idx]
                    / f"{view_idx:05d}/{view_idx:05d}.png"
                ).resize((32, 32))
                downsampled.append(to_tensor(blend_white_bg(frame)))
            data["pixelnerf_input"]["rgb"] = torch.stack(downsampled, dim=0)
            data["pixelnerf_input"]["frames"] = data["frames"]
            if self.scale_pose:
                c2ws = cameras[..., :16].reshape(-1, 4, 4)
                center = c2ws[:, :3, 3].mean(0)
                radius = (c2ws[:, :3, 3] - center).norm(dim=-1).max()
                scale = 1.5 / radius
                c2ws[..., :3, 3] = (c2ws[..., :3, 3] - center) * scale
                cameras[..., :16] = c2ws.reshape(-1, 16)

        if self.load_caps:
            data["caption"] = self.caps[self.ids[idx]]
            data["ids"] = self.ids[idx]

        return data

    def __len__(self):
        return len(self.ids)

    def collate_fn(self, batch):
        if self.max_n_cond > 1:
            n_cond = np.random.randint(1, self.max_n_cond + 1)
            if n_cond > 1:
                for b in batch:
                    source_index = [0] + np.random.choice(
                        np.arange(1, self.n_views),
                        self.max_n_cond - 1,
                        replace=False,
                    ).tolist()
                    b["pixelnerf_input"]["source_index"] = torch.as_tensor(source_index)
                    b["pixelnerf_input"]["n_cond"] = n_cond
                    b["pixelnerf_input"]["source_images"] = b["frames"][source_index]
                    b["pixelnerf_input"]["source_cameras"] = b["pixelnerf_input"][
                        "cameras"
                    ][source_index]

        return video_collate_fn(batch)


class ObjaverseSpiral(Dataset):
    def __init__(
        self,
        root_dir,
        split="train",
        transform=None,
        random_front=False,
        max_item=None,
        cond_aug_mean=-3.0,
        cond_aug_std=0.5,
        condition_on_elevation=False,
        **unused_kwargs,
    ):
        self.root_dir = Path(root_dir)
        self.split = split
        self.random_front = random_front
        self.transform = transform

        self.ids = json.load(open(self.root_dir / f"{split}_ids.json", "r"))
        self.n_views = 24
        valid_ids = []
        for idx in self.ids:
            if (self.root_dir / idx).exists():
                valid_ids.append(idx)
        self.ids = valid_ids

        self.cond_aug_mean = cond_aug_mean
        self.cond_aug_std = cond_aug_std
        self.condition_on_elevation = condition_on_elevation

        if max_item is not None:
            self.ids = self.ids[:max_item]

            ## debug
            self.ids = self.ids * 10000

    def __getitem__(self, idx: int):
        frames = []
        idx_list = np.arange(self.n_views)
        if self.random_front:
            roll_idx = np.random.randint(self.n_views)
            idx_list = np.roll(idx_list, roll_idx)
        for view_idx in idx_list:
            frame = Image.open(
                self.root_dir / self.ids[idx] / f"{view_idx:05d}/{view_idx:05d}.png"
            )
            frames.append(self.transform(frame))

        # data = {"jpg": torch.stack(frames, dim=0)}  # [T, C, H, W]
        frames = torch.stack(frames, dim=0)
        cond = frames[0]

        cond_aug = np.exp(
            np.random.randn(1)[0] * self.cond_aug_std + self.cond_aug_mean
        )

        data = {
            "frames": frames,
            "cond_frames_without_noise": cond,
            "cond_aug": torch.as_tensor([cond_aug] * self.n_views),
            "cond_frames": cond + cond_aug * torch.randn_like(cond),
            "fps_id": torch.as_tensor([1.0] * self.n_views),
            "motion_bucket_id": torch.as_tensor([300.0] * self.n_views),
            "num_video_frames": 24,
            "image_only_indicator": torch.as_tensor([0.0] * self.n_views),
        }

        if self.condition_on_elevation:
            sample_c2w = read_camera_matrix_single(
                self.root_dir / self.ids[idx] / f"00000/00000.json"
            )
            elevation = calc_elevation(sample_c2w)
            data["elevation"] = torch.as_tensor([elevation] * self.n_views)

        return data

    def __len__(self):
        return len(self.ids)


class ObjaverseLVISSpiral(Dataset):
    def __init__(
        self,
        root_dir,
        split="train",
        transform=None,
        random_front=False,
        max_item=None,
        cond_aug_mean=-3.0,
        cond_aug_std=0.5,
        condition_on_elevation=False,
        use_precomputed_latents=False,
        **unused_kwargs,
    ):
        print("Using LVIS subset")
        self.root_dir = Path(root_dir)
        self.latent_dir = Path("/mnt/vepfs/3Ddataset/render_results/latents512")
        self.split = split
        self.random_front = random_front
        self.transform = transform
        self.use_precomputed_latents = use_precomputed_latents

        self.ids = json.load(open("./assets/lvis_uids.json", "r"))
        self.n_views = 18
        valid_ids = []
        for idx in self.ids:
            if (self.root_dir / idx).exists():
                valid_ids.append(idx)
        self.ids = valid_ids
        print("=" * 30)
        print("Number of valid ids: ", len(self.ids))
        print("=" * 30)

        self.cond_aug_mean = cond_aug_mean
        self.cond_aug_std = cond_aug_std
        self.condition_on_elevation = condition_on_elevation

        if max_item is not None:
            self.ids = self.ids[:max_item]

            ## debug
            self.ids = self.ids * 10000

    def __getitem__(self, idx: int):
        frames = []
        idx_list = np.arange(self.n_views)
        if self.random_front:
            roll_idx = np.random.randint(self.n_views)
            idx_list = np.roll(idx_list, roll_idx)
        for view_idx in idx_list:
            frame = Image.open(
                self.root_dir
                / self.ids[idx]
                / "elevations_0"
                / f"colors_{view_idx * 2}.png"
            )
            frames.append(self.transform(frame))

        frames = torch.stack(frames, dim=0)
        cond = frames[0]

        cond_aug = np.exp(
            np.random.randn(1)[0] * self.cond_aug_std + self.cond_aug_mean
        )

        data = {
            "frames": frames,
            "cond_frames_without_noise": cond,
            "cond_aug": torch.as_tensor([cond_aug] * self.n_views),
            "cond_frames": cond + cond_aug * torch.randn_like(cond),
            "fps_id": torch.as_tensor([0.0] * self.n_views),
            "motion_bucket_id": torch.as_tensor([300.0] * self.n_views),
            "num_video_frames": self.n_views,
            "image_only_indicator": torch.as_tensor([0.0] * self.n_views),
        }

        if self.use_precomputed_latents:
            data["latents"] = torch.load(self.latent_dir / f"{self.ids[idx]}.pt")

        if self.condition_on_elevation:
            # sample_c2w = read_camera_matrix_single(
            #     self.root_dir / self.ids[idx] / f"00000/00000.json"
            # )
            # elevation = calc_elevation(sample_c2w)
            # data["elevation"] = torch.as_tensor([elevation] * self.n_views)
            assert False, "currently assumes elevation 0"

        return data

    def __len__(self):
        return len(self.ids)


class ObjaverseALLSpiral(ObjaverseLVISSpiral):
    def __init__(
        self,
        root_dir,
        split="train",
        transform=None,
        random_front=False,
        max_item=None,
        cond_aug_mean=-3.0,
        cond_aug_std=0.5,
        condition_on_elevation=False,
        use_precomputed_latents=False,
        **unused_kwargs,
    ):
        print("Using ALL objects in Objaverse")
        self.root_dir = Path(root_dir)
        self.split = split
        self.random_front = random_front
        self.transform = transform
        self.use_precomputed_latents = use_precomputed_latents
        self.latent_dir = Path("/mnt/vepfs/3Ddataset/render_results/latents512")

        self.ids = json.load(open("./assets/all_ids.json", "r"))
        self.n_views = 18
        valid_ids = []
        for idx in self.ids:
            if (self.root_dir / idx).exists() and (self.root_dir / idx).is_dir():
                valid_ids.append(idx)
        self.ids = valid_ids
        print("=" * 30)
        print("Number of valid ids: ", len(self.ids))
        print("=" * 30)

        self.cond_aug_mean = cond_aug_mean
        self.cond_aug_std = cond_aug_std
        self.condition_on_elevation = condition_on_elevation

        if max_item is not None:
            self.ids = self.ids[:max_item]

            ## debug
            self.ids = self.ids * 10000


class ObjaverseWithPose(Dataset):
    def __init__(
        self,
        root_dir,
        split="train",
        transform=None,
        random_front=False,
        max_item=None,
        cond_aug_mean=-3.0,
        cond_aug_std=0.5,
        condition_on_elevation=False,
        use_precomputed_latents=False,
        **unused_kwargs,
    ):
        print("Using Objaverse with poses")
        self.root_dir = Path(root_dir)
        self.split = split
        self.random_front = random_front
        self.transform = transform
        self.use_precomputed_latents = use_precomputed_latents
        self.latent_dir = Path("/mnt/vepfs/3Ddataset/render_results/latents512")

        self.ids = json.load(open("./assets/all_ids.json", "r"))
        self.n_views = 18
        valid_ids = []
        for idx in self.ids:
            if (self.root_dir / idx).exists() and (self.root_dir / idx).is_dir():
                valid_ids.append(idx)
        self.ids = valid_ids
        print("=" * 30)
        print("Number of valid ids: ", len(self.ids))
        print("=" * 30)

        self.cond_aug_mean = cond_aug_mean
        self.cond_aug_std = cond_aug_std
        self.condition_on_elevation = condition_on_elevation

    def __getitem__(self, idx: int):
        frames = []
        idx_list = np.arange(self.n_views)
        if self.random_front:
            roll_idx = np.random.randint(self.n_views)
            idx_list = np.roll(idx_list, roll_idx)
        for view_idx in idx_list:
            frame = Image.open(
                self.root_dir
                / self.ids[idx]
                / "elevations_0"
                / f"colors_{view_idx * 2}.png"
            )
            frames.append(self.transform(frame))

        frames = torch.stack(frames, dim=0)
        cond = frames[0]

        cond_aug = np.exp(
            np.random.randn(1)[0] * self.cond_aug_std + self.cond_aug_mean
        )

        data = {
            "frames": frames,
            "cond_frames_without_noise": cond,
            "cond_aug": torch.as_tensor([cond_aug] * self.n_views),
            "cond_frames": cond + cond_aug * torch.randn_like(cond),
            "fps_id": torch.as_tensor([0.0] * self.n_views),
            "motion_bucket_id": torch.as_tensor([300.0] * self.n_views),
            "num_video_frames": self.n_views,
            "image_only_indicator": torch.as_tensor([0.0] * self.n_views),
        }

        if self.use_precomputed_latents:
            data["latents"] = torch.load(self.latent_dir / f"{self.ids[idx]}.pt")

        if self.condition_on_elevation:
            assert False, "currently assumes elevation 0"

        return data


class LatentObjaverse(Dataset):
    def __init__(
        self,
        root_dir,
        split="train",
        random_front=False,
        subset="lvis",
        fps_id=1.0,
        motion_bucket_id=300.0,
        cond_aug_mean=-3.0,
        cond_aug_std=0.5,
        **unused_kwargs,
    ):
        self.root_dir = Path(root_dir)
        self.split = split
        self.random_front = random_front
        self.ids = json.load(open(Path("./assets") / f"{subset}_ids.json", "r"))
        self.clip_emb_dir = self.root_dir / ".." / "clip_emb512"
        self.n_views = 18
        self.fps_id = fps_id
        self.motion_bucket_id = motion_bucket_id
        self.cond_aug_mean = cond_aug_mean
        self.cond_aug_std = cond_aug_std
        if self.random_front:
            print("Using a random view as front view")

        valid_ids = []
        for idx in self.ids:
            if (self.root_dir / f"{idx}.pt").exists() and (
                self.clip_emb_dir / f"{idx}.pt"
            ).exists():
                valid_ids.append(idx)
        self.ids = valid_ids
        print("=" * 30)
        print("Number of valid ids: ", len(self.ids))
        print("=" * 30)

    def __getitem__(self, idx: int):
        uid = self.ids[idx]
        idx_list = torch.arange(self.n_views)
        latents = torch.load(self.root_dir / f"{uid}.pt")
        clip_emb = torch.load(self.clip_emb_dir / f"{uid}.pt")
        if self.random_front:
            idx_list = torch.roll(idx_list, np.random.randint(self.n_views))
        latents = latents[idx_list]
        clip_emb = clip_emb[idx_list][0]

        cond_aug = np.exp(
            np.random.randn(1)[0] * self.cond_aug_std + self.cond_aug_mean
        )
        cond = latents[0]

        data = {
            "latents": latents,
            "cond_frames_without_noise": clip_emb,
            "cond_frames": cond + cond_aug * torch.randn_like(cond),
            "fps_id": torch.as_tensor([self.fps_id] * self.n_views),
            "motion_bucket_id": torch.as_tensor([self.motion_bucket_id] * self.n_views),
            "cond_aug": torch.as_tensor([cond_aug] * self.n_views),
            "num_video_frames": self.n_views,
            "image_only_indicator": torch.as_tensor([0.0] * self.n_views),
        }

        return data

    def __len__(self):
        return len(self.ids)


class ObjaverseSpiralDataset(LightningDataModule):
    def __init__(
        self,
        root_dir,
        random_front=False,
        batch_size=2,
        num_workers=10,
        prefetch_factor=2,
        shuffle=True,
        max_item=None,
        dataset_cls="richdreamer",
        reso: int = 256,
        **kwargs,
    ) -> None:
        super().__init__()

        self.batch_size = batch_size
        self.num_workers = num_workers
        self.prefetch_factor = prefetch_factor
        self.shuffle = shuffle
        self.max_item = max_item

        self.transform = Compose(
            [
                blend_white_bg,
                Resize((reso, reso)),
                ToTensor(),
                Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
            ]
        )

        data_cls = {
            "richdreamer": ObjaverseSpiral,
            "lvis": ObjaverseLVISSpiral,
            "shengshu_all": ObjaverseALLSpiral,
            "latent": LatentObjaverse,
            "gobjaverse": GObjaverse,
        }[dataset_cls]

        self.train_dataset = data_cls(
            root_dir=root_dir,
            split="train",
            random_front=random_front,
            transform=self.transform,
            max_item=self.max_item,
            **kwargs,
        )
        self.test_dataset = data_cls(
            root_dir=root_dir,
            split="val",
            random_front=random_front,
            transform=self.transform,
            max_item=self.max_item,
            **kwargs,
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            num_workers=self.num_workers,
            prefetch_factor=self.prefetch_factor,
            collate_fn=video_collate_fn
            if not hasattr(self.train_dataset, "collate_fn")
            else self.train_dataset.collate_fn,
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            num_workers=self.num_workers,
            prefetch_factor=self.prefetch_factor,
            collate_fn=video_collate_fn
            if not hasattr(self.test_dataset, "collate_fn")
            else self.train_dataset.collate_fn,
        )

    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            shuffle=self.shuffle,
            num_workers=self.num_workers,
            prefetch_factor=self.prefetch_factor,
            collate_fn=video_collate_fn
            if not hasattr(self.test_dataset, "collate_fn")
            else self.train_dataset.collate_fn,
        )