ReubenSun committed
Commit 2ac1c2d · 1 Parent(s): 13b826f
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +1 -0
  2. .gitignore +11 -0
  3. app.py +135 -0
  4. examples/images/000.png +3 -0
  5. examples/images/001.png +3 -0
  6. examples/images/004.png +3 -0
  7. examples/images/008.png +3 -0
  8. examples/images/028.png +3 -0
  9. examples/images/032.png +3 -0
  10. examples/images/061.png +3 -0
  11. examples/images/107.png +3 -0
  12. requirements.txt +50 -0
  13. step1x3d_geometry/__init__.py +52 -0
  14. step1x3d_geometry/data/Objaverse.py +73 -0
  15. step1x3d_geometry/data/__init__.py +1 -0
  16. step1x3d_geometry/data/base.py +350 -0
  17. step1x3d_geometry/models/__init__.py +1 -0
  18. step1x3d_geometry/models/attention.py +776 -0
  19. step1x3d_geometry/models/attention_processor.py +482 -0
  20. step1x3d_geometry/models/autoencoders/__init__.py +3 -0
  21. step1x3d_geometry/models/autoencoders/michelangelo_autoencoder.py +765 -0
  22. step1x3d_geometry/models/autoencoders/surface_extractors.py +137 -0
  23. step1x3d_geometry/models/autoencoders/transformers/attention.py +286 -0
  24. step1x3d_geometry/models/autoencoders/transformers/perceiver_1d.py +50 -0
  25. step1x3d_geometry/models/autoencoders/transformers/utils.py +21 -0
  26. step1x3d_geometry/models/autoencoders/volume_decoders.py +327 -0
  27. step1x3d_geometry/models/conditional_encoders/__init__.py +6 -0
  28. step1x3d_geometry/models/conditional_encoders/base.py +202 -0
  29. step1x3d_geometry/models/conditional_encoders/clip/modeling_clip.py +1597 -0
  30. step1x3d_geometry/models/conditional_encoders/clip/modeling_conditional_clip.py +443 -0
  31. step1x3d_geometry/models/conditional_encoders/dinov2/modeling_conditional_dinov2.py +248 -0
  32. step1x3d_geometry/models/conditional_encoders/dinov2/modeling_dinov2.py +978 -0
  33. step1x3d_geometry/models/conditional_encoders/dinov2_clip_encoder.py +514 -0
  34. step1x3d_geometry/models/conditional_encoders/dinov2_encoder.py +296 -0
  35. step1x3d_geometry/models/conditional_encoders/dinov2_with_registers/modeling_dinov2_with_registers.py +1088 -0
  36. step1x3d_geometry/models/conditional_encoders/label_encoder.py +167 -0
  37. step1x3d_geometry/models/conditional_encoders/t5_encoder.py +271 -0
  38. step1x3d_geometry/models/pipelines/pipeline.py +513 -0
  39. step1x3d_geometry/models/pipelines/pipeline_utils.py +404 -0
  40. step1x3d_geometry/models/transformers/__init__.py +1 -0
  41. step1x3d_geometry/models/transformers/flux_transformer_1d.py +600 -0
  42. step1x3d_geometry/models/transformers/pixart_transformer_1d.py +574 -0
  43. step1x3d_geometry/systems/__init__.py +1 -0
  44. step1x3d_geometry/systems/base.py +210 -0
  45. step1x3d_geometry/systems/shape_autoencoder.py +151 -0
  46. step1x3d_geometry/systems/shape_diffusion.py +425 -0
  47. step1x3d_geometry/systems/shape_rectified_flow.py +474 -0
  48. step1x3d_geometry/systems/utils.py +391 -0
  49. step1x3d_geometry/utils/__init__.py +1 -0
  50. step1x3d_geometry/utils/base.py +215 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,11 @@
+output
+outputs
+**__pycache__
+.DS_Store
+cache
+step1x3d_texture/custom_rasterizer/build
+step1x3d_texture/custom_rasterizer/dist
+step1x3d_texture/custom_rasterizer/custom_rasterizer.egg-info
+step1x3d_texture/differentiable_renderer/build
+step1x3d_texture/differentiable_renderer/dist
+step1x3d_texture/differentiable_renderer/mesh_processor.egg-info
app.py ADDED
@@ -0,0 +1,135 @@
+import os
+import time
+import uuid
+import torch
+import trimesh
+import argparse
+import numpy as np
+import gradio as gr
+from step1x3d_geometry.models.pipelines.pipeline import Step1X3DGeometryPipeline
+from step1x3d_texture.pipelines.step1x_3d_texture_synthesis_pipeline import (
+    Step1X3DTexturePipeline,
+)
+from step1x3d_texture.utils.shape_post_process import (
+    FaceReducer,
+    DegenerateFaceRemover,
+)
+
+
+def generate_func(
+    input_image_path, guidance_scale, inference_steps, max_facenum, symmetry, edge_type
+):
+    if "Label" in args.geometry_model:
+        out = geometry_model(
+            input_image_path,
+            label={"symmetry": symmetry, "edge_type": edge_type},
+            guidance_scale=float(guidance_scale),
+            octree_resolution=384,
+            max_facenum=int(max_facenum),
+            num_inference_steps=int(inference_steps),
+        )
+    else:
+        out = geometry_model(
+            input_image_path,
+            guidance_scale=float(guidance_scale),
+            num_inference_steps=int(inference_steps),
+            max_facenum=int(max_facenum),
+        )
+
+    save_name = str(uuid.uuid4())
+    print(save_name)
+    geometry_save_path = f"{args.cache_dir}/{save_name}.glb"
+    geometry_mesh = out.mesh[0]
+    geometry_mesh.export(geometry_save_path)
+
+    geometry_mesh = DegenerateFaceRemover()(geometry_mesh)
+    geometry_mesh = FaceReducer()(geometry_mesh)
+    textured_mesh = texture_model(input_image_path, geometry_mesh)
+    textured_save_path = f"{args.cache_dir}/{save_name}-textured.glb"
+    textured_mesh.export(textured_save_path)
+
+    torch.cuda.empty_cache()
+    print("Generate finished")
+    return geometry_save_path, textured_save_path
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--geometry_model", type=str, default="Step1X-3D-Geometry-Label-1300m"
+    )
+    parser.add_argument("--texture_model", type=str, default="Step1X-3D-Texture")
+    parser.add_argument("--cache_dir", type=str, default="cache")
+    parser.add_argument("--port", type=int, default=7861)
+    parser.add_argument("--host", type=str, default="0.0.0.0")
+    args = parser.parse_args()
+
+    os.makedirs(args.cache_dir, exist_ok=True)
+
+    geometry_model = Step1X3DGeometryPipeline.from_pretrained(
+        "stepfun-ai/Step1X-3D", subfolder=args.geometry_model
+    ).to("cuda")
+
+    texture_model = Step1X3DTexturePipeline.from_pretrained(
+        "stepfun-ai/Step1X-3D", subfolder=args.texture_model
+    )
+
+    with gr.Blocks(title="Step1X-3D demo") as demo:
+        gr.Markdown("# Step1X-3D")
+        with gr.Row():
+            with gr.Column(scale=2):
+                input_image = gr.Image(
+                    label="Image", type="filepath", image_mode="RGBA"
+                )
+                guidance_scale = gr.Number(label="Guidance Scale", value=7.5)
+                inference_steps = gr.Slider(
+                    label="Inference Steps", minimum=1, maximum=100, value=50
+                )
+                max_facenum = gr.Number(label="Max Face Num", value=400000)
+                symmetry = gr.Radio(
+                    choices=["x", "asymmetry"],
+                    label="Symmetry Type",
+                    value="x",
+                    type="value",
+                )
+                edge_type = gr.Radio(
+                    choices=["sharp", "normal", "smooth"],
+                    label="Edge Type",
+                    value="sharp",
+                    type="value",
+                )
+                btn = gr.Button("Start")
+            with gr.Column(scale=4):
+                textured_preview = gr.Model3D(label="Textured", height=380)
+                geometry_preview = gr.Model3D(label="Geometry", height=380)
+            with gr.Column(scale=1):
+                gr.Examples(
+                    examples=[
+                        ["examples/images/000.png"],
+                        ["examples/images/001.png"],
+                        ["examples/images/004.png"],
+                        ["examples/images/008.png"],
+                        ["examples/images/028.png"],
+                        ["examples/images/032.png"],
+                        ["examples/images/061.png"],
+                        ["examples/images/107.png"],
+                    ],
+                    inputs=[input_image],
+                    cache_examples=False,
+                )
+
+        btn.click(
+            generate_func,
+            inputs=[
+                input_image,
+                guidance_scale,
+                inference_steps,
+                max_facenum,
+                symmetry,
+                edge_type,
+            ],
+            outputs=[geometry_preview, textured_preview],
+        )
+
+    # Configure the queue before launching; Gradio 4+ replaced the removed
+    # `concurrency_count` argument with `default_concurrency_limit`.
+    demo.queue(default_concurrency_limit=3)
+    demo.launch(server_name=args.host, server_port=args.port)
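For reference, the two-stage flow wired into the UI above can also be driven directly from Python. The sketch below mirrors `generate_func` using the default checkpoints from this file; the input image and output path are placeholders, not additional APIs.

import torch
from step1x3d_geometry.models.pipelines.pipeline import Step1X3DGeometryPipeline
from step1x3d_texture.pipelines.step1x_3d_texture_synthesis_pipeline import Step1X3DTexturePipeline
from step1x3d_texture.utils.shape_post_process import DegenerateFaceRemover, FaceReducer

geometry = Step1X3DGeometryPipeline.from_pretrained(
    "stepfun-ai/Step1X-3D", subfolder="Step1X-3D-Geometry-Label-1300m"
).to("cuda")
texture = Step1X3DTexturePipeline.from_pretrained("stepfun-ai/Step1X-3D", subfolder="Step1X-3D-Texture")

# Stage 1: image -> untextured mesh (label-conditioned variant, as in generate_func)
out = geometry(
    "examples/images/000.png",
    label={"symmetry": "x", "edge_type": "sharp"},
    guidance_scale=7.5,
    octree_resolution=384,
    max_facenum=400000,
    num_inference_steps=50,
)
# Stage 2: clean up the mesh, then synthesize texture
mesh = FaceReducer()(DegenerateFaceRemover()(out.mesh[0]))
textured = texture("examples/images/000.png", mesh)
textured.export("textured.glb")
torch.cuda.empty_cache()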
examples/images/000.png ADDED

Git LFS Details

  • SHA256: 62284b41c010dd81524c51d12da4369fc458abd955011f59ce395266a02efb5f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.54 MB
examples/images/001.png ADDED

Git LFS Details

  • SHA256: e93cc2c9850b6ea7cf233ae2f8d96246d86de7fc1d9bf079f2455a47938e946a
  • Pointer size: 131 Bytes
  • Size of remote file: 608 kB
examples/images/004.png ADDED

Git LFS Details

  • SHA256: 19aa7e05ca0cb1eb4e7809eeded332cce8c21daf9e5458338b6ad3bfbba85679
  • Pointer size: 132 Bytes
  • Size of remote file: 1.3 MB
examples/images/008.png ADDED

Git LFS Details

  • SHA256: 67cf8e33b715641599c5489f06f6c5d1da312faf3c95196395d9d81a1aa112e1
  • Pointer size: 131 Bytes
  • Size of remote file: 367 kB
examples/images/028.png ADDED

Git LFS Details

  • SHA256: b12c3b18f615fb5c887bfbd946c69eff8934519182ee5ef13f3853ca64e0bc22
  • Pointer size: 132 Bytes
  • Size of remote file: 1.3 MB
examples/images/032.png ADDED

Git LFS Details

  • SHA256: 7f655fc199fed98a8d663e6e39baa94307af3e9494efa6389ac5b90c81b45b18
  • Pointer size: 132 Bytes
  • Size of remote file: 1.56 MB
examples/images/061.png ADDED

Git LFS Details

  • SHA256: e28ffd293ba94f8d92c7bef7db7125d6df5e05287f116d6f93617623aa5d7ecf
  • Pointer size: 131 Bytes
  • Size of remote file: 307 kB
examples/images/107.png ADDED

Git LFS Details

  • SHA256: 70c7d618bfd70125d0b61007e549f3369273b1de866b30c703a68045bceb8950
  • Pointer size: 132 Bytes
  • Size of remote file: 1.27 MB
requirements.txt ADDED
@@ -0,0 +1,50 @@
+datasets==2.19.0
+diffusers==0.32.2
+einops==0.8.0
+huggingface-hub==0.26.2
+imageio==2.34.1
+jaxtyping==0.2.28
+joblib==1.4.0
+lightning-utilities==0.11.2
+matplotlib==3.8.4
+numpy==1.26.4
+omegaconf==2.3.0
+opencv-python-headless==4.10.0.84
+pandas==2.2.2
+pillow==10.3.0
+plyfile==1.0.3
+PyMCubes==0.1.4
+pyparsing==3.1.2
+pytorch-lightning==2.2.4
+PyYAML==6.0.1
+safetensors==0.4.3
+scikit-image==0.23.2
+scipy==1.13.0
+tensorboard==2.16.2
+tensorboardX==2.6.2.2
+timm==0.9.16
+tokenizers==0.21.0
+tqdm==4.66.2
+transformers==4.48.0
+trimesh==4.3.2
+spaces==0.28.3
+accelerate==1.5.2
+rembg==2.0.65
+gradio==5.5.0
+wandb==0.18.6
+deepspeed==0.16.4
+sageattention==1.0.6
+mosaicml-streaming==0.11.0
+easydict==1.13
+open3d==0.19.0
+prodigyopt==1.1.2
+peft==0.15.1
+sentencepiece==0.2.0
+pymeshlab==2023.12.post3
+onnxruntime==1.21.0
+bs4==0.0.2
+xatlas==0.0.10
+pybind11==2.13.6
+pygltflib==1.16.4
+kornia==0.8.0
+git+https://github.com/NVlabs/nvdiffrast.git
step1x3d_geometry/__init__.py ADDED
@@ -0,0 +1,52 @@
+import importlib
+
+__modules__ = {}
+
+
+def register(name):
+    def decorator(cls):
+        if name in __modules__:
+            raise ValueError(
+                f"Module {name} already exists! Names of extensions conflict!"
+            )
+        else:
+            __modules__[name] = cls
+        return cls
+
+    return decorator
+
+
+def find(name):
+    if name in __modules__:
+        return __modules__[name]
+    else:
+        try:
+            module_string = ".".join(name.split(".")[:-1])
+            cls_name = name.split(".")[-1]
+            module = importlib.import_module(module_string, package=None)
+            return getattr(module, cls_name)
+        except Exception as e:
+            raise ValueError(f"Module {name} not found!") from e
+
+
+### syntactic sugar for logging utilities ###
+import logging
+
+logger = logging.getLogger("pytorch_lightning")
+
+from pytorch_lightning.utilities.rank_zero import (
+    rank_zero_debug,
+    rank_zero_info,
+    rank_zero_only,
+)
+
+debug = rank_zero_debug
+info = rank_zero_info
+
+
+@rank_zero_only
+def warn(*args, **kwargs):
+    logger.warning(*args, **kwargs)
+
+
+from . import data, models, systems
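The `register`/`find` pair above implements a simple name-to-class registry with a dotted-path fallback. A minimal usage sketch (the registered name and class are hypothetical):

from step1x3d_geometry import register, find

@register("toy-datamodule")  # stored in __modules__ under this name
class ToyDataModule:
    pass

cls = find("toy-datamodule")                      # returns ToyDataModule from the registry
loader_cls = find("torch.utils.data.DataLoader")  # not registered, so resolved as a dotted import path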
step1x3d_geometry/data/Objaverse.py ADDED
@@ -0,0 +1,73 @@
+import math
+import os
+import json
+import re
+import cv2
+from dataclasses import dataclass, field
+
+import pytorch_lightning as pl
+import torch
+import torch.nn.functional as F
+from torch.utils.data import DataLoader
+from step1x3d_geometry import register
+from step1x3d_geometry.utils.typing import *
+from step1x3d_geometry.utils.config import parse_structured
+
+from streaming import StreamingDataLoader
+from .base import BaseDataModuleConfig, BaseDataset
+
+
+@dataclass
+class ObjaverseDataModuleConfig(BaseDataModuleConfig):
+    pass
+
+
+class ObjaverseDataset(BaseDataset):
+    pass
+
+
+@register("Objaverse-datamodule")
+class ObjaverseDataModule(pl.LightningDataModule):
+    cfg: ObjaverseDataModuleConfig
+
+    def __init__(self, cfg: Optional[Union[dict, DictConfig]] = None) -> None:
+        super().__init__()
+        self.cfg = parse_structured(ObjaverseDataModuleConfig, cfg)
+
+    def setup(self, stage=None) -> None:
+        if stage in [None, "fit"]:
+            self.train_dataset = ObjaverseDataset(self.cfg, "train")
+        if stage in [None, "fit", "validate"]:
+            self.val_dataset = ObjaverseDataset(self.cfg, "val")
+        if stage in [None, "test", "predict"]:
+            self.test_dataset = ObjaverseDataset(self.cfg, "test")
+
+    def prepare_data(self):
+        pass
+
+    def general_loader(
+        self, dataset, batch_size, collate_fn=None, num_workers=0
+    ) -> DataLoader:
+        return DataLoader(
+            dataset,
+            batch_size=batch_size,
+            collate_fn=collate_fn,
+            num_workers=num_workers,
+        )
+
+    def train_dataloader(self) -> DataLoader:
+        return self.general_loader(
+            self.train_dataset,
+            batch_size=self.cfg.batch_size,
+            collate_fn=self.train_dataset.collate,
+            num_workers=self.cfg.num_workers,
+        )
+
+    def val_dataloader(self) -> DataLoader:
+        return self.general_loader(self.val_dataset, batch_size=1)
+
+    def test_dataloader(self) -> DataLoader:
+        return self.general_loader(self.test_dataset, batch_size=1)
+
+    def predict_dataloader(self) -> DataLoader:
+        return self.general_loader(self.test_dataset, batch_size=1)
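A minimal sketch of how this datamodule is typically driven; the config keys come from `BaseDataModuleConfig` in `base.py`, and the `root_dir` path is a placeholder.

from step1x3d_geometry.data.Objaverse import ObjaverseDataModule

dm = ObjaverseDataModule(
    {
        "root_dir": "data/objaverse",  # expects {root_dir}/train.json, val.json, test.json
        "batch_size": 4,
        "num_workers": 8,
        "load_geometry": True,
        "geo_data_type": "sdf",
    }
)
dm.setup("fit")
batch = next(iter(dm.train_dataloader()))  # dict with "uid", "surface", ...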
step1x3d_geometry/data/__init__.py ADDED
@@ -0,0 +1 @@
+from . import Objaverse
step1x3d_geometry/data/base.py ADDED
@@ -0,0 +1,350 @@
+import math
+import os
+import json
+import re
+import cv2
+from dataclasses import dataclass, field
+
+import random
+import imageio
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torchvision.transforms as transforms
+from torch.utils.data import DataLoader, Dataset
+from PIL import Image
+
+from step1x3d_geometry.utils.typing import *
+
+
+@dataclass
+class BaseDataModuleConfig:
+    root_dir: str = None
+    batch_size: int = 4
+    num_workers: int = 8
+
+    ################################# General augmentation #################################
+    random_flip: bool = (
+        False  # whether to randomly flip the input point cloud and the input images
+    )
+
+    ################################# Geometry part #################################
+    load_geometry: bool = True  # whether to load geometry data
+    with_sharp_data: bool = False
+    geo_data_type: str = "sdf"  # occupancy, sdf
+    # for occupancy or sdf supervision
+    n_samples: int = 4096  # number of points in input point cloud
+    upsample_ratio: int = 1  # upsample ratio for input point cloud
+    sampling_strategy: Optional[str] = (
+        "random"  # sampling strategy for input point cloud
+    )
+    scale: float = 1.0  # scale of the input point cloud and target supervision
+    noise_sigma: float = 0.0  # noise level of the input point cloud
+    rotate_points: bool = (
+        False  # whether to rotate the input point cloud and the supervision, for VAE aug.
+    )
+    load_geometry_supervision: bool = False  # whether to load supervision
+    supervision_type: str = "sdf"  # occupancy, sdf, tsdf, tsdf_w_surface
+    n_supervision: int = 10000  # number of points in supervision
+    tsdf_threshold: float = (
+        0.01  # threshold for truncating sdf values, used when input is sdf
+    )
+
+    ################################# Image part #################################
+    load_image: bool = False  # whether to load images
+    image_type: str = "rgb"  # rgb, normal, rgb_or_normal
+    image_file_type: str = "png"  # png, jpeg
+    image_type_ratio: float = (
+        1.0  # ratio of rgb for each dataset when image_type is "rgb_or_normal"
+    )
+    crop_image: bool = True  # whether to crop the input image
+    random_color_jitter: bool = (
+        False  # whether to randomly color jitter the input images
+    )
+    random_rotate: bool = (
+        False  # whether to randomly rotate the input images, default [-10 deg, 10 deg]
+    )
+    random_mask: bool = False  # whether to add random mask to the input image
+    background_color: Tuple[int, int, int] = field(
+        default_factory=lambda: (255, 255, 255)
+    )
+    idx: Optional[List[int]] = None  # index of the image to load
+    n_views: int = 1  # number of views
+    foreground_ratio: Optional[float] = 0.90
+
+    ################################# Caption part #################################
+    load_caption: bool = False  # whether to load captions
+    load_label: bool = False  # whether to load labels
+
+
+class BaseDataset(Dataset):
+    def __init__(self, cfg: Any, split: str) -> None:
+        super().__init__()
+        self.cfg: BaseDataModuleConfig = cfg
+        self.split = split
+
+        self.uids = json.load(open(f"{cfg.root_dir}/{split}.json"))
+        print(f"Loaded {len(self.uids)} {split} uids")
+
+        # add ColorJitter transforms for input images
+        if self.cfg.random_color_jitter:
+            self.color_jitter = transforms.ColorJitter(
+                brightness=0.4, contrast=0.4, saturation=0.4, hue=0.2
+            )
+
+        # add RandomRotation transforms for input images
+        if self.cfg.random_rotate:
+            self.rotate = transforms.RandomRotation(
+                degrees=10, fill=(*self.cfg.background_color, 0.0)
+            )  # by default 10 deg
+
+    def __len__(self):
+        return len(self.uids)
+
+    def _load_shape_from_occupancy_or_sdf(self, index: int) -> Dict[str, Any]:
+        if self.cfg.geo_data_type == "sdf":
+            data = np.load(f"{self.cfg.root_dir}/surfaces/{self.uids[index]}.npz")
+            # for input point cloud
+            surface = data["surface"]
+            if self.cfg.with_sharp_data:
+                sharp_surface = data["sharp_surface"]
+        else:
+            raise NotImplementedError(
+                f"Data type {self.cfg.geo_data_type} not implemented"
+            )
+
+        # random sampling
+        if self.cfg.sampling_strategy == "random":
+            rng = np.random.default_rng()
+            ind = rng.choice(
+                surface.shape[0],
+                self.cfg.upsample_ratio * self.cfg.n_samples,
+                replace=True,
+            )
+            surface = surface[ind]
+            if self.cfg.with_sharp_data:
+                sharp_surface = sharp_surface[ind]
+        elif self.cfg.sampling_strategy == "fps":
+            import fpsample
+
+            kdline_fps_samples_idx = fpsample.bucket_fps_kdline_sampling(
+                surface[:, :3], self.cfg.n_samples, h=5
+            )
+            surface = surface[kdline_fps_samples_idx]
+            if self.cfg.with_sharp_data:
+                kdline_fps_samples_idx = fpsample.bucket_fps_kdline_sampling(
+                    sharp_surface[:, :3], self.cfg.n_samples, h=5
+                )
+                sharp_surface = sharp_surface[kdline_fps_samples_idx]
+        else:
+            raise NotImplementedError(
+                f"sampling strategy {self.cfg.sampling_strategy} not implemented"
+            )
+
+        # rescale data
+        surface[:, :3] = surface[:, :3] * self.cfg.scale  # target scale
+        if self.cfg.with_sharp_data:
+            sharp_surface[:, :3] = sharp_surface[:, :3] * self.cfg.scale  # target scale
+            ret = {
+                "uid": self.uids[index].split("/")[-1],
+                "surface": surface.astype(np.float32),
+                "sharp_surface": sharp_surface.astype(np.float32),
+            }
+        else:
+            ret = {
+                "uid": self.uids[index].split("/")[-1],
+                "surface": surface.astype(np.float32),
+            }
+
+        return ret
+
+    def _load_shape_supervision_occupancy_or_sdf(self, index: int) -> Dict[str, Any]:
+        # for supervision
+        ret = {}
+        if self.cfg.geo_data_type == "sdf":
+            data = np.load(f"{self.cfg.root_dir}/surfaces/{self.uids[index]}.npz")
+            data = np.concatenate(
+                [data["volume_rand_points"], data["near_surface_points"]], axis=0
+            )
+            rand_points, sdfs = data[:, :3], data[:, 3:]
+        else:
+            raise NotImplementedError(
+                f"Data type {self.cfg.geo_data_type} not implemented"
+            )
+
+        # random sampling
+        rng = np.random.default_rng()
+        ind = rng.choice(rand_points.shape[0], self.cfg.n_supervision, replace=False)
+        rand_points = rand_points[ind]
+        rand_points = rand_points * self.cfg.scale
+        ret["rand_points"] = rand_points.astype(np.float32)
+
+        if self.cfg.geo_data_type == "sdf":
+            if self.cfg.supervision_type == "sdf":
+                ret["sdf"] = sdfs[ind].flatten().astype(np.float32)
+            elif self.cfg.supervision_type == "occupancy":
+                ret["occupancies"] = np.where(sdfs[ind].flatten() < 1e-3, 0, 1).astype(
+                    np.float32
+                )
+            elif self.cfg.supervision_type == "tsdf":
+                ret["sdf"] = (
+                    sdfs[ind]
+                    .flatten()
+                    .astype(np.float32)
+                    .clip(-self.cfg.tsdf_threshold, self.cfg.tsdf_threshold)
+                    / self.cfg.tsdf_threshold
+                )
+            else:
+                raise NotImplementedError(
+                    f"Supervision type {self.cfg.supervision_type} not implemented"
+                )
+
+        return ret
+
+    def _load_image(self, index: int) -> Dict[str, Any]:
+        def _process_img(image, background_color=(255, 255, 255), foreground_ratio=0.9):
+            alpha = image.getchannel("A")
+            background = Image.new("RGBA", image.size, (*background_color, 255))
+            image = Image.alpha_composite(background, image)
+            image = image.crop(alpha.getbbox())
+
+            new_size = tuple(int(dim * foreground_ratio) for dim in image.size)
+            resized_image = image.resize(new_size)
+            padded_image = Image.new("RGBA", image.size, (*background_color, 255))
+            paste_position = (
+                (image.width - resized_image.width) // 2,
+                (image.height - resized_image.height) // 2,
+            )
+            padded_image.paste(resized_image, paste_position)
+
+            # Expand image to 1:1
+            max_dim = max(padded_image.size)
+            image = Image.new("RGBA", (max_dim, max_dim), (*background_color, 255))
+            paste_position = (
+                (max_dim - padded_image.width) // 2,
+                (max_dim - padded_image.height) // 2,
+            )
+            image.paste(padded_image, paste_position)
+            image = image.resize((512, 512))
+            return image.convert("RGB"), alpha
+
+        ret = {}
+        if self.cfg.image_type == "rgb" or self.cfg.image_type == "normal":
+            assert (
+                self.cfg.n_views == 1
+            ), "Only single view is supported for single image"
+            sel_idx = random.choice(self.cfg.idx)
+            ret["sel_image_idx"] = sel_idx
+            if self.cfg.image_type == "rgb":
+                img_path = (
+                    f"{self.cfg.root_dir}/images/"
+                    + "/".join(self.uids[index].split("/")[-2:])
+                    + f"/{'{:04d}'.format(sel_idx)}_rgb.{self.cfg.image_file_type}"
+                )
+            elif self.cfg.image_type == "normal":
+                img_path = (
+                    f"{self.cfg.root_dir}/images/"
+                    + "/".join(self.uids[index].split("/")[-2:])
+                    + f"/{'{:04d}'.format(sel_idx)}_normal.{self.cfg.image_file_type}"
+                )
+            image = Image.open(img_path).copy()
+
+            # add random color jitter
+            if self.cfg.random_color_jitter:
+                rgb = self.color_jitter(image.convert("RGB"))
+                image = Image.merge("RGBA", (*rgb.split(), image.getchannel("A")))
+
+            # add random rotation
+            if self.cfg.random_rotate:
+                image = self.rotate(image)
+
+            # background color (defined here so both the crop and no-crop branches can use it)
+            background_color = (
+                torch.randint(0, 256, (3,))
+                if self.cfg.background_color is None
+                else torch.as_tensor(self.cfg.background_color)
+            )
+
+            # add crop
+            if self.cfg.crop_image:
+                image, alpha = _process_img(
+                    image, background_color, self.cfg.foreground_ratio
+                )
+            else:
+                alpha = image.getchannel("A")
+                background = Image.new("RGBA", image.size, background_color)
+                image = Image.alpha_composite(background, image).convert("RGB")
+
+            ret["image"] = torch.from_numpy(np.array(image) / 255.0)
+            ret["mask"] = torch.from_numpy(np.array(alpha) / 255.0).unsqueeze(0)
+        else:
+            raise NotImplementedError(
+                f"Image type {self.cfg.image_type} not implemented"
+            )
+
+        return ret
+
+    def _get_data(self, index):
+        ret = {"uid": self.uids[index]}
+
+        # random flip
+        flip = np.random.rand() < 0.5 if self.cfg.random_flip else False
+
+        # load geometry
+        if self.cfg.load_geometry:
+            if self.cfg.geo_data_type == "occupancy" or self.cfg.geo_data_type == "sdf":
+                # load shape
+                ret = self._load_shape_from_occupancy_or_sdf(index)
+                # load supervision for shape
+                if self.cfg.load_geometry_supervision:
+                    ret.update(self._load_shape_supervision_occupancy_or_sdf(index))
+            else:
+                raise NotImplementedError(
+                    f"Geo data type {self.cfg.geo_data_type} not implemented"
+                )
+
+            if flip:  # random flip the input point cloud and the supervision
+                for key in ret.keys():
+                    if key in ["surface", "sharp_surface"]:  # N x (xyz + normal)
+                        ret[key][:, 0] = -ret[key][:, 0]
+                        ret[key][:, 3] = -ret[key][:, 3]
+                    elif key in ["rand_points"]:
+                        ret[key][:, 0] = -ret[key][:, 0]
+
+        # load image
+        if self.cfg.load_image:
+            ret.update(self._load_image(index))
+            if flip:  # random flip the input image
+                for key in ret.keys():
+                    if key in ["image"]:  # random flip the input image
+                        ret[key] = torch.flip(ret[key], [2])
+                    if key in ["mask"]:  # random flip the input image
+                        ret[key] = torch.flip(ret[key], [2])
+
+        # load caption
+        meta = None
+        if self.cfg.load_caption:
+            with open(f"{self.cfg.root_dir}/metas/{self.uids[index]}.json", "r") as f:
+                meta = json.load(f)
+            ret.update({"caption": meta["caption"]})
+
+        # load label
+        if self.cfg.load_label:
+            if meta is None:
+                with open(
+                    f"{self.cfg.root_dir}/metas/{self.uids[index]}.json", "r"
+                ) as f:
+                    meta = json.load(f)
+            ret.update({"label": [meta["label"]]})
+
+        return ret
+
+    def __getitem__(self, index):
+        try:
+            return self._get_data(index)
+        except Exception as e:
+            print(f"Error in {self.uids[index]}: {e}")
+            return self.__getitem__(np.random.randint(len(self)))
+
+    def collate(self, batch):
+        from torch.utils.data._utils.collate import default_collate_fn_map
+
+        return torch.utils.data.default_collate(batch)
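As a worked example of the `tsdf` supervision branch above: raw SDF values are clipped to ±`tsdf_threshold` and then divided by the threshold, normalizing them to [-1, 1].

import numpy as np

tsdf_threshold = 0.01
sdfs = np.array([-0.05, -0.005, 0.0, 0.02], dtype=np.float32)
tsdf = sdfs.clip(-tsdf_threshold, tsdf_threshold) / tsdf_threshold
# -> [-1.0, -0.5, 0.0, 1.0]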
step1x3d_geometry/models/__init__.py ADDED
@@ -0,0 +1 @@
+from . import autoencoders, conditional_encoders, transformers
step1x3d_geometry/models/attention.py ADDED
@@ -0,0 +1,776 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Any, Dict, Optional, Tuple, Union
15
+ import collections.abc
16
+ from itertools import repeat
17
+
18
+ import torch
19
+ from torch import nn
20
+ import torch.nn.functional as F
21
+ import torch.distributed as dist
22
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
23
+ from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
24
+ from diffusers.models.attention import FeedForward
25
+ from diffusers.models.attention_processor import Attention, AttentionProcessor
26
+ from diffusers.models.normalization import (
27
+ AdaLayerNormContinuous,
28
+ AdaLayerNormZero,
29
+ AdaLayerNormZeroSingle,
30
+ FP32LayerNorm,
31
+ LayerNorm,
32
+ )
33
+
34
+ from .attention_processor import FluxAttnProcessor2_0, AttnProcessor2_0
35
+
36
+
37
+ @maybe_allow_in_graph
38
+ class MultiCondBasicTransformerBlock(nn.Module):
39
+ r"""
40
+ A basic Transformer block.
41
+
42
+ Parameters:
43
+ dim (`int`): The number of channels in the input and output.
44
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
45
+ attention_head_dim (`int`): The number of channels in each head.
46
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
47
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
48
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
49
+ num_embeds_ada_norm (:
50
+ obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
51
+ attention_bias (:
52
+ obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
53
+ only_cross_attention (`bool`, *optional*):
54
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
55
+ double_self_attention (`bool`, *optional*):
56
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
57
+ upcast_attention (`bool`, *optional*):
58
+ Whether to upcast the attention computation to float32. This is useful for mixed precision training.
59
+ norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
60
+ Whether to use learnable elementwise affine parameters for normalization.
61
+ norm_type (`str`, *optional*, defaults to `"layer_norm"`):
62
+ The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
63
+ final_dropout (`bool` *optional*, defaults to False):
64
+ Whether to apply a final dropout after the last feed-forward layer.
65
+ attention_type (`str`, *optional*, defaults to `"default"`):
66
+ The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
67
+ positional_embeddings (`str`, *optional*, defaults to `None`):
68
+ The type of positional embeddings to apply to.
69
+ num_positional_embeddings (`int`, *optional*, defaults to `None`):
70
+ The maximum number of positional embeddings to apply.
71
+ """
72
+
73
+ def __init__(
74
+ self,
75
+ dim: int,
76
+ num_attention_heads: int,
77
+ use_self_attention: bool = True,
78
+ use_cross_attention: bool = False,
79
+ self_attention_norm_type: Optional[
80
+ str
81
+ ] = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
82
+ cross_attention_dim: Optional[int] = None,
83
+ cross_attention_norm_type: Optional[str] = None,
84
+ # parallel second cross attention
85
+ use_cross_attention_2: bool = False,
86
+ cross_attention_2_dim: Optional[int] = None,
87
+ cross_attention_2_norm_type: Optional[str] = None,
88
+ # parallel third cross attention
89
+ use_cross_attention_3: bool = False,
90
+ cross_attention_3_dim: Optional[int] = None,
91
+ cross_attention_3_norm_type: Optional[str] = None,
92
+ dropout=0.0,
93
+ activation_fn: str = "geglu",
94
+ num_embeds_ada_norm: Optional[int] = None,
95
+ attention_bias: bool = False,
96
+ only_cross_attention: bool = False,
97
+ double_self_attention: bool = False,
98
+ upcast_attention: bool = False,
99
+ norm_elementwise_affine: bool = True,
100
+ norm_eps: float = 1e-5,
101
+ final_dropout: bool = False,
102
+ attention_type: str = "default",
103
+ positional_embeddings: Optional[str] = None,
104
+ num_positional_embeddings: Optional[int] = None,
105
+ ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
106
+ ada_norm_bias: Optional[int] = None,
107
+ ff_inner_dim: Optional[int] = None,
108
+ ff_bias: bool = True,
109
+ attention_out_bias: bool = True,
110
+ ):
111
+ super().__init__()
112
+ self.dim = dim
113
+ self.num_attention_heads = num_attention_heads
114
+ self.use_self_attention = use_self_attention
115
+ self.use_cross_attention = use_cross_attention
116
+ self.self_attention_norm_type = self_attention_norm_type
117
+ self.cross_attention_dim = cross_attention_dim
118
+ self.cross_attention_norm_type = cross_attention_norm_type
119
+ self.use_cross_attention_2 = use_cross_attention_2
120
+ self.cross_attention_2_dim = cross_attention_2_dim
121
+ self.cross_attention_2_norm_type = cross_attention_2_norm_type
122
+ self.use_cross_attention_3 = use_cross_attention_3
123
+ self.cross_attention_3_dim = cross_attention_3_dim
124
+ self.cross_attention_3_norm_type = cross_attention_3_norm_type
125
+ self.dropout = dropout
126
+ self.cross_attention_dim = cross_attention_dim
127
+ self.activation_fn = activation_fn
128
+ self.attention_bias = attention_bias
129
+ self.double_self_attention = double_self_attention
130
+ self.norm_elementwise_affine = norm_elementwise_affine
131
+ self.positional_embeddings = positional_embeddings
132
+ self.num_positional_embeddings = num_positional_embeddings
133
+ self.only_cross_attention = only_cross_attention
134
+
135
+ # We keep these boolean flags for backward-compatibility.
136
+ self.use_ada_layer_norm_zero = (
137
+ num_embeds_ada_norm is not None
138
+ ) and self_attention_norm_type == "ada_norm_zero"
139
+ self.use_ada_layer_norm = (
140
+ num_embeds_ada_norm is not None
141
+ ) and self_attention_norm_type == "ada_norm"
142
+ self.use_ada_layer_norm_single = self_attention_norm_type == "ada_norm_single"
143
+ self.use_layer_norm = self_attention_norm_type == "layer_norm"
144
+ self.use_ada_layer_norm_continuous = (
145
+ self_attention_norm_type == "ada_norm_continuous"
146
+ )
147
+
148
+ if (
149
+ self_attention_norm_type in ("ada_norm", "ada_norm_zero")
150
+ and num_embeds_ada_norm is None
151
+ ):
152
+ raise ValueError(
153
+ f"`self_attention_norm_type` is set to {self_attention_norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
154
+ f" define `num_embeds_ada_norm` if setting `self_attention_norm_type` to {self_attention_norm_type}."
155
+ )
156
+
157
+ self.self_attention_norm_type = self_attention_norm_type
158
+ self.num_embeds_ada_norm = num_embeds_ada_norm
159
+
160
+ if positional_embeddings and (num_positional_embeddings is None):
161
+ raise ValueError(
162
+ "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
163
+ )
164
+
165
+ if positional_embeddings == "sinusoidal":
166
+ self.pos_embed = SinusoidalPositionalEmbedding(
167
+ dim, max_seq_length=num_positional_embeddings
168
+ )
169
+ else:
170
+ self.pos_embed = None
171
+
172
+ # Define 3 blocks. Each block has its own normalization layer.
173
+ if use_self_attention:
174
+ # 1. Self-Attn
175
+ if self_attention_norm_type == "ada_norm":
176
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
177
+ elif self_attention_norm_type == "ada_norm_zero":
178
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
179
+ elif self_attention_norm_type == "ada_norm_continuous":
180
+ self.norm1 = AdaLayerNormContinuous(
181
+ dim,
182
+ ada_norm_continous_conditioning_embedding_dim,
183
+ norm_elementwise_affine,
184
+ norm_eps,
185
+ ada_norm_bias,
186
+ "rms_norm",
187
+ )
188
+ elif (
189
+ self_attention_norm_type == "fp32_layer_norm"
190
+ or self_attention_norm_type is None
191
+ ):
192
+ self.norm1 = FP32LayerNorm(dim, norm_eps, norm_elementwise_affine)
193
+ else:
194
+ self.norm1 = nn.RMSNorm(
195
+ dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps
196
+ )
197
+
198
+ self.attn1 = Attention(
199
+ query_dim=dim,
200
+ heads=num_attention_heads,
201
+ dim_head=dim // num_attention_heads,
202
+ dropout=dropout,
203
+ bias=attention_bias,
204
+ cross_attention_dim=(
205
+ cross_attention_dim if only_cross_attention else None
206
+ ),
207
+ upcast_attention=upcast_attention,
208
+ out_bias=attention_out_bias,
209
+ processor=AttnProcessor2_0(),
210
+ )
211
+
212
+ # 2. Cross-Attn
213
+ if use_cross_attention or double_self_attention:
214
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
215
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
216
+ # the second cross attention block.
217
+ if cross_attention_norm_type == "ada_norm":
218
+ self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
219
+ elif cross_attention_norm_type == "ada_norm_continuous":
220
+ self.norm2 = AdaLayerNormContinuous(
221
+ dim,
222
+ ada_norm_continous_conditioning_embedding_dim,
223
+ norm_elementwise_affine,
224
+ norm_eps,
225
+ ada_norm_bias,
226
+ "rms_norm",
227
+ )
228
+ elif (
229
+ cross_attention_norm_type == "fp32_layer_norm"
230
+ or cross_attention_norm_type is None
231
+ ):
232
+ self.norm2 = FP32LayerNorm(dim, norm_eps, norm_elementwise_affine)
233
+ else:
234
+ self.norm2 = nn.RMSNorm(
235
+ dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps
236
+ )
237
+
238
+ self.attn2 = Attention(
239
+ query_dim=dim,
240
+ cross_attention_dim=(
241
+ cross_attention_dim if not double_self_attention else None
242
+ ),
243
+ heads=num_attention_heads,
244
+ dim_head=dim // num_attention_heads,
245
+ dropout=dropout,
246
+ bias=attention_bias,
247
+ upcast_attention=upcast_attention,
248
+ out_bias=attention_out_bias,
249
+ processor=AttnProcessor2_0(),
250
+ ) # is self-attn if encoder_hidden_states is none
251
+ else:
252
+ self.norm2 = None
253
+ self.attn2 = None
254
+
255
+ # 2'. Parallel Second Cross-Attn
256
+ if use_cross_attention_2:
257
+ assert cross_attention_2_dim is not None
258
+ if cross_attention_2_norm_type == "ada_norm":
259
+ self.norm2_2 = AdaLayerNorm(dim, num_embeds_ada_norm)
260
+ elif cross_attention_2_norm_type == "ada_norm_continuous":
261
+ self.norm2_2 = AdaLayerNormContinuous(
262
+ dim,
263
+ ada_norm_continous_conditioning_embedding_dim,
264
+ norm_elementwise_affine,
265
+ norm_eps,
266
+ ada_norm_bias,
267
+ "rms_norm",
268
+ )
269
+ elif (
270
+ cross_attention_2_norm_type == "fp32_layer_norm"
271
+ or cross_attention_2_norm_type is None
272
+ ):
273
+ self.norm2_2 = FP32LayerNorm(dim, norm_eps, norm_elementwise_affine)
274
+ else:
275
+ self.norm2_2 = nn.RMSNorm(
276
+ dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps
277
+ )
278
+
279
+ self.attn2_2 = Attention(
280
+ query_dim=dim,
281
+ cross_attention_dim=cross_attention_2_dim,
282
+ heads=num_attention_heads,
283
+ dim_head=dim // num_attention_heads,
284
+ dropout=dropout,
285
+ bias=attention_bias,
286
+ upcast_attention=upcast_attention,
287
+ out_bias=attention_out_bias,
288
+ processor=AttnProcessor2_0(),
289
+ )
290
+
291
+ # self.attn2_2 = Attention(
292
+ # query_dim=dim,
293
+ # cross_attention_dim=cross_attention_2_dim,
294
+ # dim_head=dim // num_attention_heads,
295
+ # heads=num_attention_heads,
296
+ # qk_norm="rms_norm" if qk_norm else None,
297
+ # cross_attention_norm=cross_attention_2_norm_type,
298
+ # eps=1e-6,
299
+ # bias=qkv_bias,
300
+ # processor=AttnProcessor2_0(),
301
+ # )
302
+ else:
303
+ self.norm2_2 = None
304
+ self.attn2_2 = None
305
+
306
+ # 2'. Parallel Third Cross-Attn
307
+ if use_cross_attention_3:
308
+ assert cross_attention_3_dim is not None
309
+ if cross_attention_3_norm_type == "ada_norm":
310
+ self.norm2_3 = AdaLayerNorm(dim, num_embeds_ada_norm)
311
+ elif cross_attention_3_norm_type == "ada_norm_continuous":
312
+ self.norm2_3 = AdaLayerNormContinuous(
313
+ dim,
314
+ ada_norm_continous_conditioning_embedding_dim,
315
+ norm_elementwise_affine,
316
+ norm_eps,
317
+ ada_norm_bias,
318
+ "rms_norm",
319
+ )
320
+ elif (
321
+ cross_attention_3_norm_type == "fp32_layer_norm"
322
+ or cross_attention_3_norm_type is None
323
+ ):
324
+ self.norm2_3 = FP32LayerNorm(dim, norm_eps, norm_elementwise_affine)
325
+ else:
326
+ self.norm2_3 = nn.RMSNorm(
327
+ dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps
328
+ )
329
+
330
+ self.attn2_3 = Attention(
331
+ query_dim=dim,
332
+ cross_attention_dim=cross_attention_3_dim,
333
+ heads=num_attention_heads,
334
+ dim_head=dim // num_attention_heads,
335
+ dropout=dropout,
336
+ bias=attention_bias,
337
+ upcast_attention=upcast_attention,
338
+ out_bias=attention_out_bias,
339
+ processor=AttnProcessor2_0(),
340
+ )
341
+ else:
342
+ self.norm2_3 = None
343
+ self.attn2_3 = None
344
+
345
+ # 3. Feed-forward
346
+ if self_attention_norm_type == "ada_norm_continuous":
347
+ self.norm3 = AdaLayerNormContinuous(
348
+ dim,
349
+ ada_norm_continous_conditioning_embedding_dim,
350
+ norm_elementwise_affine,
351
+ norm_eps,
352
+ ada_norm_bias,
353
+ "layer_norm",
354
+ )
355
+
356
+ elif self_attention_norm_type in ["ada_norm_zero", "ada_norm", "layer_norm"]:
357
+ self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
358
+ elif self_attention_norm_type == "layer_norm_i2vgen":
359
+ self.norm3 = None
360
+
361
+ self.ff = FeedForward(
362
+ dim,
363
+ dropout=dropout,
364
+ activation_fn=activation_fn,
365
+ final_dropout=final_dropout,
366
+ inner_dim=ff_inner_dim,
367
+ bias=ff_bias,
368
+ )
369
+
370
+ # 4. Fuser
371
+ if attention_type == "gated" or attention_type == "gated-text-image":
372
+ self.fuser = GatedSelfAttentionDense(
373
+ dim, cross_attention_dim, num_attention_heads, attention_head_dim
374
+ )
375
+
376
+ # 5. Scale-shift for PixArt-Alpha.
377
+ if self_attention_norm_type == "ada_norm_single":
378
+ self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)
379
+
380
+ # let chunk size default to None
381
+ self._chunk_size = None
382
+ self._chunk_dim = 0
383
+
384
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
385
+ # Sets chunk feed-forward
386
+ self._chunk_size = chunk_size
387
+ self._chunk_dim = dim
388
+
389
+ def forward(
390
+ self,
391
+ hidden_states: torch.Tensor,
392
+ attention_mask: Optional[torch.Tensor] = None,
393
+ encoder_hidden_states: Optional[torch.Tensor] = None,
394
+ encoder_hidden_states_2: Optional[torch.Tensor] = None,
395
+ encoder_hidden_states_3: Optional[torch.Tensor] = None,
396
+ encoder_attention_mask: Optional[torch.Tensor] = None,
397
+ encoder_attention_mask_2: Optional[torch.Tensor] = None,
398
+ encoder_attention_mask_3: Optional[torch.Tensor] = None,
399
+ timestep: Optional[torch.LongTensor] = None,
400
+ cross_attention_kwargs: Dict[str, Any] = None,
401
+ class_labels: Optional[torch.LongTensor] = None,
402
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
403
+ ) -> torch.Tensor:
404
+ if cross_attention_kwargs is not None:
405
+ if cross_attention_kwargs.get("scale", None) is not None:
406
+ logger.warning(
407
+ "Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored."
408
+ )
409
+
410
+ # Notice that normalization is always applied before the real computation in the following blocks.
411
+ # 0. Self-Attention
412
+ batch_size = hidden_states.shape[0]
413
+
414
+ if self.self_attention_norm_type == "ada_norm":
415
+ norm_hidden_states = self.norm1(hidden_states, timestep)
416
+ elif self.self_attention_norm_type == "ada_norm_zero":
417
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
418
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
419
+ )
420
+ elif self.self_attention_norm_type in ["layer_norm", "layer_norm_i2vgen"]:
421
+ norm_hidden_states = self.norm1(hidden_states)
422
+ elif self.self_attention_norm_type == "ada_norm_continuous":
423
+ norm_hidden_states = self.norm1(
424
+ hidden_states, added_cond_kwargs["pooled_text_emb"]
425
+ )
426
+ elif self.self_attention_norm_type == "ada_norm_single":
427
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
428
+ self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
429
+ ).chunk(6, dim=1)
430
+ norm_hidden_states = self.norm1(hidden_states)
431
+ norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
432
+ else:
433
+ raise ValueError("Incorrect norm used")
434
+
435
+ if self.pos_embed is not None:
436
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
437
+
438
+ # 1. Prepare GLIGEN inputs
439
+ cross_attention_kwargs = (
440
+ cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
441
+ )
442
+ gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
443
+
444
+ attn_output = self.attn1(
445
+ norm_hidden_states,
446
+ encoder_hidden_states=(
447
+ encoder_hidden_states if self.only_cross_attention else None
448
+ ),
449
+ attention_mask=attention_mask,
450
+ **cross_attention_kwargs,
451
+ )
452
+
453
+ if self.self_attention_norm_type == "ada_norm_zero":
454
+ attn_output = gate_msa.unsqueeze(1) * attn_output
455
+ elif self.self_attention_norm_type == "ada_norm_single":
456
+ attn_output = gate_msa * attn_output
457
+
458
+ hidden_states = attn_output + hidden_states
459
+ if hidden_states.ndim == 4:
460
+ hidden_states = hidden_states.squeeze(1)
461
+
462
+ # 1.2 GLIGEN Control
463
+ if gligen_kwargs is not None:
464
+ hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
465
+
466
+ # 3. Cross-Attention
467
+ if self.attn2 is not None:
468
+ if self.cross_attention_norm_type == "ada_norm":
469
+ norm_hidden_states = self.norm2(hidden_states, timestep)
470
+ elif self.cross_attention_norm_type in [
471
+ "ada_norm_zero",
472
+ "layer_norm",
473
+ "layer_norm_i2vgen",
474
+ ]:
475
+ norm_hidden_states = self.norm2(hidden_states)
476
+ elif self.cross_attention_norm_type == "ada_norm_single":
477
+ # For PixArt norm2 isn't applied here:
478
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
479
+ norm_hidden_states = hidden_states
480
+ elif self.cross_attention_norm_type == "ada_norm_continuous":
481
+ norm_hidden_states = self.norm2(
482
+ hidden_states, added_cond_kwargs["pooled_text_emb"]
483
+ )
484
+ else:
485
+ raise ValueError("Incorrect norm")
486
+
487
+ if (
488
+ self.pos_embed is not None
489
+ and self.cross_attention_norm_type != "ada_norm_single"
490
+ ):
491
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
492
+
493
+ attn_output = self.attn2(
494
+ norm_hidden_states,
495
+ encoder_hidden_states=encoder_hidden_states,
496
+ attention_mask=encoder_attention_mask,
497
+ **cross_attention_kwargs,
498
+ )
499
+ hidden_states = attn_output + hidden_states
500
+
501
+ # 3.1 Parallel Second Cross-Attention
502
+ if self.attn2_2 is not None:
503
+ if self.cross_attention_2_norm_type == "ada_norm":
504
+ norm_hidden_states = self.norm2_2(hidden_states, timestep)
505
+ elif self.cross_attention_2_norm_type in [
506
+ "ada_norm_zero",
507
+ "layer_norm",
508
+ "layer_norm_i2vgen",
509
+ ]:
510
+ norm_hidden_states = self.norm2_2(hidden_states)
511
+ elif self.cross_attention_2_norm_type == "ada_norm_single":
512
+ # For PixArt norm2_2 isn't applied here:
513
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
514
+ norm_hidden_states = hidden_states
515
+ elif self.cross_attention_2_norm_type == "ada_norm_continuous":
516
+ norm_hidden_states = self.norm2_2(
517
+ hidden_states, added_cond_kwargs["pooled_text_emb"]
518
+ )
519
+ else:
520
+ raise ValueError("Incorrect norm")
521
+
522
+ if (
523
+ self.pos_embed is not None
524
+ and self.cross_attention_2_norm_type != "ada_norm_single"
525
+ ):
526
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
527
+
528
+ attn_output_2 = self.attn2_2(
529
+ norm_hidden_states,
530
+ encoder_hidden_states=encoder_hidden_states_2,
531
+ attention_mask=encoder_attention_mask_2,
532
+ **cross_attention_kwargs,
533
+ )
534
+ hidden_states = attn_output_2 + hidden_states
535
+
536
+ # 3.2 Parallel Third Cross-Attention
537
+ if self.attn2_3 is not None:
538
+ if self.cross_attention_3_norm_type == "ada_norm":
539
+ norm_hidden_states = self.norm2_3(hidden_states, timestep)
540
+ elif self.cross_attention_3_norm_type in [
541
+ "ada_norm_zero",
542
+ "layer_norm",
543
+ "layer_norm_i2vgen",
544
+ ]:
545
+ norm_hidden_states = self.norm2_3(hidden_states)
546
+ elif self.cross_attention_3_norm_type == "ada_norm_single":
547
+ # For PixArt norm2_3 isn't applied here:
548
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
549
+ norm_hidden_states = hidden_states
550
+ elif self.cross_attention_3_norm_type == "ada_norm_continuous":
551
+ norm_hidden_states = self.norm2_3(
552
+ hidden_states, added_cond_kwargs["pooled_text_emb"]
553
+ )
554
+ else:
555
+ raise ValueError("Incorrect norm")
556
+
557
+ if (
558
+ self.pos_embed is not None
559
+ and self.cross_attention_3_norm_type != "ada_norm_single"
560
+ ):
561
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
562
+
563
+ attn_output_3 = self.attn2_3(
564
+ norm_hidden_states,
565
+ encoder_hidden_states=encoder_hidden_states_3,
566
+ attention_mask=encoder_attention_mask_3,
567
+ **cross_attention_kwargs,
568
+ )
569
+ hidden_states = attn_output_3 + hidden_states
570
+
571
+ # 4. Feed-forward
572
+ # i2vgen doesn't have this norm 🤷‍♂️
573
+ if self.self_attention_norm_type == "ada_norm_continuous":
574
+ norm_hidden_states = self.norm3(
575
+ hidden_states, added_cond_kwargs["pooled_text_emb"]
576
+ )
577
+ elif not self.self_attention_norm_type == "ada_norm_single":
578
+ norm_hidden_states = self.norm3(hidden_states)
579
+
580
+ if self.self_attention_norm_type == "ada_norm_zero":
581
+ norm_hidden_states = (
582
+ norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
583
+ )
584
+
585
+ if self.self_attention_norm_type == "ada_norm_single":
586
+ norm_hidden_states = self.norm2(hidden_states)
587
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
588
+
589
+ if self._chunk_size is not None:
590
+ # "feed_forward_chunk_size" can be used to save memory
591
+ ff_output = _chunked_feed_forward(
592
+ self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size
593
+ )
594
+ else:
595
+ ff_output = self.ff(norm_hidden_states)
596
+
597
+ if self.self_attention_norm_type == "ada_norm_zero":
598
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
599
+ elif self.self_attention_norm_type == "ada_norm_single":
600
+ ff_output = gate_mlp * ff_output
601
+
602
+ hidden_states = ff_output + hidden_states
603
+
604
+ return hidden_states
605
+
606
+
607
+ @maybe_allow_in_graph
608
+ class FluxSingleTransformerBlock(nn.Module):
609
+ def __init__(
610
+ self,
611
+ dim: int,
612
+ num_attention_heads: int,
613
+ attention_head_dim: int,
614
+ mlp_ratio: float = 4.0,
615
+ ):
616
+ super().__init__()
617
+ self.mlp_hidden_dim = int(dim * mlp_ratio)
618
+
619
+ self.norm = AdaLayerNormZeroSingle(dim)
620
+ self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
621
+ self.act_mlp = nn.GELU(approximate="tanh")
622
+ self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
623
+
624
+ if is_torch_npu_available():
625
+ deprecation_message = (
626
+ "Defaulting to FluxAttnProcessor2_0_NPU for NPU devices will be removed. Attention processors "
627
+ "should be set explicitly using the `set_attn_processor` method."
628
+ )
629
+ deprecate("npu_processor", "0.34.0", deprecation_message)
630
+ processor = FluxAttnProcessor2_0_NPU()
631
+ else:
632
+ processor = FluxAttnProcessor2_0()
633
+
634
+ self.attn = Attention(
635
+ query_dim=dim,
636
+ cross_attention_dim=None,
637
+ dim_head=attention_head_dim,
638
+ heads=num_attention_heads,
639
+ out_dim=dim,
640
+ bias=True,
641
+ processor=processor,
642
+ qk_norm="rms_norm",
643
+ eps=1e-6,
644
+ pre_only=True,
645
+ )
646
+
647
+ def forward(
648
+ self,
649
+ hidden_states: torch.Tensor,
650
+ temb: torch.Tensor,
651
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
652
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
653
+ ) -> torch.Tensor:
654
+ residual = hidden_states
655
+ norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
656
+ mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
657
+ joint_attention_kwargs = joint_attention_kwargs or {}
658
+ attn_output = self.attn(
659
+ hidden_states=norm_hidden_states,
660
+ image_rotary_emb=image_rotary_emb,
661
+ **joint_attention_kwargs,
662
+ )
663
+ hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
664
+ gate = gate.unsqueeze(1)
665
+
666
+ hidden_states = gate * self.proj_out(hidden_states)
667
+ hidden_states = residual + hidden_states
668
+ if hidden_states.dtype == torch.float16:
669
+ hidden_states = hidden_states.clip(-65504, 65504)
670
+
671
+ return hidden_states
672
+
673
+
674
+ @maybe_allow_in_graph
675
+ class FluxTransformerBlock(nn.Module):
676
+ def __init__(
677
+ self,
678
+ dim: int,
679
+ num_attention_heads: int,
680
+ attention_head_dim: int,
681
+ qk_norm: str = "rms_norm",
682
+ eps: float = 1e-6,
683
+ ):
684
+ super().__init__()
685
+
686
+ self.norm1 = AdaLayerNormZero(dim)
687
+ self.norm1_context = AdaLayerNormZero(dim)
688
+
689
+ self.attn = Attention(
690
+ query_dim=dim,
691
+ cross_attention_dim=None,
692
+ added_kv_proj_dim=dim,
693
+ dim_head=attention_head_dim,
694
+ heads=num_attention_heads,
695
+ out_dim=dim,
696
+ context_pre_only=False,
697
+ bias=True,
698
+ processor=FluxAttnProcessor2_0(),
699
+ qk_norm=qk_norm,
700
+ eps=eps,
701
+ )
702
+
703
+ mlp_ratio = 4.0
704
+ self.mlp_hidden_dim = int(dim * mlp_ratio)
705
+ self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
706
+ self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
707
+
708
+ self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
709
+ self.ff_context = FeedForward(
710
+ dim=dim, dim_out=dim, activation_fn="gelu-approximate"
711
+ )
712
+
713
+ def forward(
714
+ self,
715
+ hidden_states: torch.Tensor,
716
+ encoder_hidden_states: Optional[torch.Tensor] = None,
717
+ temb: Optional[torch.Tensor] = None,
718
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
719
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
720
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
721
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
722
+ hidden_states, emb=temb
723
+ )
724
+
725
+ norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = (
726
+ self.norm1_context(encoder_hidden_states, emb=temb)
727
+ )
728
+ joint_attention_kwargs = joint_attention_kwargs or {}
729
+ # Attention.
730
+ attention_outputs = self.attn(
731
+ hidden_states=norm_hidden_states,
732
+ encoder_hidden_states=norm_encoder_hidden_states,
733
+ image_rotary_emb=image_rotary_emb,
734
+ **joint_attention_kwargs,
735
+ )
736
+
737
+ if len(attention_outputs) == 2:
738
+ attn_output, context_attn_output = attention_outputs
739
+ elif len(attention_outputs) == 3:
740
+ attn_output, context_attn_output, ip_attn_output = attention_outputs
741
+
742
+ # Process attention outputs for the `hidden_states`.
743
+ attn_output = gate_msa.unsqueeze(1) * attn_output
744
+ hidden_states = hidden_states + attn_output
745
+
746
+ norm_hidden_states = self.norm2(hidden_states)
747
+ norm_hidden_states = (
748
+ norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
749
+ )
750
+
751
+ ff_output = self.ff(norm_hidden_states)
752
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
753
+
754
+ hidden_states = hidden_states + ff_output
755
+ if len(attention_outputs) == 3:
756
+ hidden_states = hidden_states + ip_attn_output
757
+
758
+ # Process attention outputs for the `encoder_hidden_states`.
759
+
760
+ context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
761
+ encoder_hidden_states = encoder_hidden_states + context_attn_output
762
+
763
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
764
+ norm_encoder_hidden_states = (
765
+ norm_encoder_hidden_states * (1 + c_scale_mlp[:, None])
766
+ + c_shift_mlp[:, None]
767
+ )
768
+
769
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
770
+ encoder_hidden_states = (
771
+ encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
772
+ )
773
+ if encoder_hidden_states.dtype == torch.float16:
774
+ encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
775
+
776
+ return encoder_hidden_states, hidden_states
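Note (not part of the diff): a minimal sketch of how the two block types above compose in a Flux-style backbone. The module path and tensor sizes are assumptions based on this commit's layout; the dual-stream block updates latent and condition tokens jointly, and the single-stream block then runs on their concatenation.

import torch
from step1x3d_geometry.models.attention import (  # path assumed from this commit's file layout
    FluxTransformerBlock,
    FluxSingleTransformerBlock,
)

dim, heads, head_dim = 1024, 16, 64                      # illustrative sizes
double_block = FluxTransformerBlock(dim, heads, head_dim)
single_block = FluxSingleTransformerBlock(dim, heads, head_dim)

latents = torch.randn(2, 512, dim)                       # shape tokens
context = torch.randn(2, 77, dim)                        # conditioning tokens
temb = torch.randn(2, dim)                               # timestep embedding

context, latents = double_block(latents, encoder_hidden_states=context, temb=temb)
merged = single_block(torch.cat([context, latents], dim=1), temb=temb)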
step1x3d_geometry/models/attention_processor.py ADDED
@@ -0,0 +1,482 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Callable, List, Optional, Tuple, Union
15
+
16
+ import os
17
+ import torch
18
+ import torch.nn.functional as F
19
+ from diffusers.models.attention_processor import Attention
20
+ from diffusers.utils import deprecate, logging
21
+ from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
22
+ from diffusers.utils.torch_utils import is_torch_version, maybe_allow_in_graph
23
+ from einops import rearrange
24
+ from torch import nn
25
+
26
+ # add sageattention support
27
+ scaled_dot_product_attention = F.scaled_dot_product_attention
28
+ if os.environ.get("USE_SAGEATTN", "0") == "1":
29
+ try:
30
+ from sageattention import sageattn
31
+ except ImportError:
32
+ raise ImportError(
33
+ 'Please install the "sageattention" package to use USE_SAGEATTN=1.'
34
+ )
35
+ scaled_dot_product_attention = sageattn
36
+
37
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
38
+
39
+
40
+ class AttnProcessor2_0:
41
+ r"""
42
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
43
+ """
44
+
45
+ def __init__(self):
46
+ if not hasattr(F, "scaled_dot_product_attention"):
47
+ raise ImportError(
48
+ "AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
49
+ )
50
+
51
+ def __call__(
52
+ self,
53
+ attn: Attention,
54
+ hidden_states: torch.Tensor,
55
+ encoder_hidden_states: Optional[torch.Tensor] = None,
56
+ attention_mask: Optional[torch.Tensor] = None,
57
+ temb: Optional[torch.Tensor] = None,
58
+ *args,
59
+ **kwargs,
60
+ ) -> torch.Tensor:
61
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
62
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
63
+ deprecate("scale", "1.0.0", deprecation_message)
64
+
65
+ residual = hidden_states
66
+ if attn.spatial_norm is not None:
67
+ hidden_states = attn.spatial_norm(hidden_states, temb)
68
+
69
+ input_ndim = hidden_states.ndim
70
+
71
+ if input_ndim == 4:
72
+ batch_size, channel, height, width = hidden_states.shape
73
+ hidden_states = hidden_states.view(
74
+ batch_size, channel, height * width
75
+ ).transpose(1, 2)
76
+
77
+ batch_size, sequence_length, _ = (
78
+ hidden_states.shape
79
+ if encoder_hidden_states is None
80
+ else encoder_hidden_states.shape
81
+ )
82
+
83
+ if attention_mask is not None:
84
+ attention_mask = attn.prepare_attention_mask(
85
+ attention_mask, sequence_length, batch_size
86
+ )
87
+ # scaled_dot_product_attention expects attention_mask shape to be
88
+ # (batch, heads, source_length, target_length)
89
+ attention_mask = attention_mask.view(
90
+ batch_size, attn.heads, -1, attention_mask.shape[-1]
91
+ )
92
+
93
+ if attn.group_norm is not None:
94
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
95
+ 1, 2
96
+ )
97
+
98
+ query = attn.to_q(hidden_states)
99
+
100
+ if encoder_hidden_states is None:
101
+ encoder_hidden_states = hidden_states
102
+ elif attn.norm_cross:
103
+ encoder_hidden_states = attn.norm_encoder_hidden_states(
104
+ encoder_hidden_states
105
+ )
106
+
107
+ key = attn.to_k(encoder_hidden_states)
108
+ value = attn.to_v(encoder_hidden_states)
109
+
110
+ inner_dim = key.shape[-1]
111
+ head_dim = inner_dim // attn.heads
112
+
113
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
114
+
115
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
116
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
117
+
118
+ if attn.norm_q is not None:
119
+ query = attn.norm_q(query)
120
+ if attn.norm_k is not None:
121
+ key = attn.norm_k(key)
122
+
123
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
124
+ # TODO: add support for attn.scale when we move to Torch 2.1
125
+ hidden_states = scaled_dot_product_attention(
126
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
127
+ )
128
+
129
+ hidden_states = hidden_states.transpose(1, 2).reshape(
130
+ batch_size, -1, attn.heads * head_dim
131
+ )
132
+ hidden_states = hidden_states.to(query.dtype)
133
+
134
+ # linear proj
135
+ hidden_states = attn.to_out[0](hidden_states)
136
+ # dropout
137
+ hidden_states = attn.to_out[1](hidden_states)
138
+
139
+ if input_ndim == 4:
140
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
141
+ batch_size, channel, height, width
142
+ )
143
+
144
+ if attn.residual_connection:
145
+ hidden_states = hidden_states + residual
146
+
147
+ hidden_states = hidden_states / attn.rescale_output_factor
148
+
149
+ return hidden_states
150
+
151
+
152
+ class FusedAttnProcessor2_0:
153
+ r"""
154
+ Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). It uses
155
+ fused projection layers. For self-attention modules, all projection matrices (i.e., query, key, value) are fused.
156
+ For cross-attention modules, key and value projection matrices are fused.
157
+
158
+ <Tip warning={true}>
159
+
160
+ This API is currently 🧪 experimental in nature and can change in future.
161
+
162
+ </Tip>
163
+ """
164
+
165
+ def __init__(self):
166
+ if not hasattr(F, "scaled_dot_product_attention"):
167
+ raise ImportError(
168
+ "FusedAttnProcessor2_0 requires at least PyTorch 2.0, to use it. Please upgrade PyTorch to > 2.0."
169
+ )
170
+
171
+ def __call__(
172
+ self,
173
+ attn: Attention,
174
+ hidden_states: torch.Tensor,
175
+ encoder_hidden_states: Optional[torch.Tensor] = None,
176
+ attention_mask: Optional[torch.Tensor] = None,
177
+ temb: Optional[torch.Tensor] = None,
178
+ *args,
179
+ **kwargs,
180
+ ) -> torch.Tensor:
181
+ if len(args) > 0 or kwargs.get("scale", None) is not None:
182
+ deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
183
+ deprecate("scale", "1.0.0", deprecation_message)
184
+
185
+ residual = hidden_states
186
+ if attn.spatial_norm is not None:
187
+ hidden_states = attn.spatial_norm(hidden_states, temb)
188
+
189
+ input_ndim = hidden_states.ndim
190
+
191
+ if input_ndim == 4:
192
+ batch_size, channel, height, width = hidden_states.shape
193
+ hidden_states = hidden_states.view(
194
+ batch_size, channel, height * width
195
+ ).transpose(1, 2)
196
+
197
+ batch_size, sequence_length, _ = (
198
+ hidden_states.shape
199
+ if encoder_hidden_states is None
200
+ else encoder_hidden_states.shape
201
+ )
202
+
203
+ if attention_mask is not None:
204
+ attention_mask = attn.prepare_attention_mask(
205
+ attention_mask, sequence_length, batch_size
206
+ )
207
+ # scaled_dot_product_attention expects attention_mask shape to be
208
+ # (batch, heads, source_length, target_length)
209
+ attention_mask = attention_mask.view(
210
+ batch_size, attn.heads, -1, attention_mask.shape[-1]
211
+ )
212
+
213
+ if attn.group_norm is not None:
214
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
215
+ 1, 2
216
+ )
217
+
218
+ if encoder_hidden_states is None:
219
+ qkv = attn.to_qkv(hidden_states)
220
+ split_size = qkv.shape[-1] // 3
221
+ query, key, value = torch.split(qkv, split_size, dim=-1)
222
+ else:
223
+ if attn.norm_cross:
224
+ encoder_hidden_states = attn.norm_encoder_hidden_states(
225
+ encoder_hidden_states
226
+ )
227
+ query = attn.to_q(hidden_states)
228
+
229
+ kv = attn.to_kv(encoder_hidden_states)
230
+ split_size = kv.shape[-1] // 2
231
+ key, value = torch.split(kv, split_size, dim=-1)
232
+
233
+ inner_dim = key.shape[-1]
234
+ head_dim = inner_dim // attn.heads
235
+
236
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
237
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
238
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
239
+
240
+ if attn.norm_q is not None:
241
+ query = attn.norm_q(query)
242
+ if attn.norm_k is not None:
243
+ key = attn.norm_k(key)
244
+
245
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
246
+ # TODO: add support for attn.scale when we move to Torch 2.1
247
+ hidden_states = F.scaled_dot_product_attention(
248
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
249
+ )
250
+
251
+ hidden_states = hidden_states.transpose(1, 2).reshape(
252
+ batch_size, -1, attn.heads * head_dim
253
+ )
254
+ hidden_states = hidden_states.to(query.dtype)
255
+
256
+ # linear proj
257
+ hidden_states = attn.to_out[0](hidden_states)
258
+ # dropout
259
+ hidden_states = attn.to_out[1](hidden_states)
260
+
261
+ if input_ndim == 4:
262
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
263
+ batch_size, channel, height, width
264
+ )
265
+
266
+ if attn.residual_connection:
267
+ hidden_states = hidden_states + residual
268
+
269
+ hidden_states = hidden_states / attn.rescale_output_factor
270
+
271
+ return hidden_states
272
+
273
+
274
+ class FluxAttnProcessor2_0:
275
+ """Attention processor used typically in processing the SD3-like self-attention projections."""
276
+
277
+ def __init__(self):
278
+ if not hasattr(F, "scaled_dot_product_attention"):
279
+ raise ImportError(
280
+ "FluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
281
+ )
282
+
283
+ def __call__(
284
+ self,
285
+ attn: Attention,
286
+ hidden_states: torch.FloatTensor,
287
+ encoder_hidden_states: torch.FloatTensor = None,
288
+ attention_mask: Optional[torch.FloatTensor] = None,
289
+ image_rotary_emb: Optional[torch.Tensor] = None,
290
+ ) -> torch.FloatTensor:
291
+ batch_size, _, _ = (
292
+ hidden_states.shape
293
+ if encoder_hidden_states is None
294
+ else encoder_hidden_states.shape
295
+ )
296
+
297
+ # `sample` projections.
298
+ query = attn.to_q(hidden_states)
299
+ key = attn.to_k(hidden_states)
300
+ value = attn.to_v(hidden_states)
301
+
302
+ inner_dim = key.shape[-1]
303
+ head_dim = inner_dim // attn.heads
304
+
305
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
306
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
307
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
308
+
309
+ if attn.norm_q is not None:
310
+ query = attn.norm_q(query)
311
+ if attn.norm_k is not None:
312
+ key = attn.norm_k(key)
313
+
314
+ # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
315
+ if encoder_hidden_states is not None:
316
+ # `context` projections.
317
+ encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
318
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
319
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
320
+
321
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
322
+ batch_size, -1, attn.heads, head_dim
323
+ ).transpose(1, 2)
324
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
325
+ batch_size, -1, attn.heads, head_dim
326
+ ).transpose(1, 2)
327
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
328
+ batch_size, -1, attn.heads, head_dim
329
+ ).transpose(1, 2)
330
+
331
+ if attn.norm_added_q is not None:
332
+ encoder_hidden_states_query_proj = attn.norm_added_q(
333
+ encoder_hidden_states_query_proj
334
+ )
335
+ if attn.norm_added_k is not None:
336
+ encoder_hidden_states_key_proj = attn.norm_added_k(
337
+ encoder_hidden_states_key_proj
338
+ )
339
+
340
+ # attention
341
+ query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
342
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
343
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
344
+
345
+ if image_rotary_emb is not None:
346
+ from .embeddings import apply_rotary_emb
347
+
348
+ query = apply_rotary_emb(query, image_rotary_emb)
349
+ key = apply_rotary_emb(key, image_rotary_emb)
350
+
351
+ hidden_states = scaled_dot_product_attention(
352
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
353
+ )
354
+
355
+ hidden_states = hidden_states.transpose(1, 2).reshape(
356
+ batch_size, -1, attn.heads * head_dim
357
+ )
358
+ hidden_states = hidden_states.to(query.dtype)
359
+
360
+ if encoder_hidden_states is not None:
361
+ encoder_hidden_states, hidden_states = (
362
+ hidden_states[:, : encoder_hidden_states.shape[1]],
363
+ hidden_states[:, encoder_hidden_states.shape[1] :],
364
+ )
365
+
366
+ # linear proj
367
+ hidden_states = attn.to_out[0](hidden_states)
368
+ # dropout
369
+ hidden_states = attn.to_out[1](hidden_states)
370
+
371
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
372
+
373
+ return hidden_states, encoder_hidden_states
374
+ else:
375
+ return hidden_states
376
+
377
+
378
+ class FusedFluxAttnProcessor2_0:
379
+ """Attention processor used typically in processing the SD3-like self-attention projections."""
380
+
381
+ def __init__(self):
382
+ if not hasattr(F, "scaled_dot_product_attention"):
383
+ raise ImportError(
384
+ "FusedFluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
385
+ )
386
+
387
+ def __call__(
388
+ self,
389
+ attn: Attention,
390
+ hidden_states: torch.FloatTensor,
391
+ encoder_hidden_states: torch.FloatTensor = None,
392
+ attention_mask: Optional[torch.FloatTensor] = None,
393
+ image_rotary_emb: Optional[torch.Tensor] = None,
394
+ ) -> torch.FloatTensor:
395
+ batch_size, _, _ = (
396
+ hidden_states.shape
397
+ if encoder_hidden_states is None
398
+ else encoder_hidden_states.shape
399
+ )
400
+
401
+ # `sample` projections.
402
+ qkv = attn.to_qkv(hidden_states)
403
+ split_size = qkv.shape[-1] // 3
404
+ query, key, value = torch.split(qkv, split_size, dim=-1)
405
+
406
+ inner_dim = key.shape[-1]
407
+ head_dim = inner_dim // attn.heads
408
+
409
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
410
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
411
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
412
+
413
+ if attn.norm_q is not None:
414
+ query = attn.norm_q(query)
415
+ if attn.norm_k is not None:
416
+ key = attn.norm_k(key)
417
+
418
+ # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
419
+ # `context` projections.
420
+ if encoder_hidden_states is not None:
421
+ encoder_qkv = attn.to_added_qkv(encoder_hidden_states)
422
+ split_size = encoder_qkv.shape[-1] // 3
423
+ (
424
+ encoder_hidden_states_query_proj,
425
+ encoder_hidden_states_key_proj,
426
+ encoder_hidden_states_value_proj,
427
+ ) = torch.split(encoder_qkv, split_size, dim=-1)
428
+
429
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
430
+ batch_size, -1, attn.heads, head_dim
431
+ ).transpose(1, 2)
432
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
433
+ batch_size, -1, attn.heads, head_dim
434
+ ).transpose(1, 2)
435
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
436
+ batch_size, -1, attn.heads, head_dim
437
+ ).transpose(1, 2)
438
+
439
+ if attn.norm_added_q is not None:
440
+ encoder_hidden_states_query_proj = attn.norm_added_q(
441
+ encoder_hidden_states_query_proj
442
+ )
443
+ if attn.norm_added_k is not None:
444
+ encoder_hidden_states_key_proj = attn.norm_added_k(
445
+ encoder_hidden_states_key_proj
446
+ )
447
+
448
+ # attention
449
+ query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
450
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
451
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
452
+
453
+ if image_rotary_emb is not None:
454
+ from .embeddings import apply_rotary_emb
455
+
456
+ query = apply_rotary_emb(query, image_rotary_emb)
457
+ key = apply_rotary_emb(key, image_rotary_emb)
458
+
459
+ hidden_states = scaled_dot_product_attention(
460
+ query, key, value, dropout_p=0.0, is_causal=False
461
+ )
462
+
463
+ hidden_states = hidden_states.transpose(1, 2).reshape(
464
+ batch_size, -1, attn.heads * head_dim
465
+ )
466
+ hidden_states = hidden_states.to(query.dtype)
467
+
468
+ if encoder_hidden_states is not None:
469
+ encoder_hidden_states, hidden_states = (
470
+ hidden_states[:, : encoder_hidden_states.shape[1]],
471
+ hidden_states[:, encoder_hidden_states.shape[1] :],
472
+ )
473
+
474
+ # linear proj
475
+ hidden_states = attn.to_out[0](hidden_states)
476
+ # dropout
477
+ hidden_states = attn.to_out[1](hidden_states)
478
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
479
+
480
+ return hidden_states, encoder_hidden_states
481
+ else:
482
+ return hidden_states
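Note (not part of the diff): the processors above only pick up SageAttention when the USE_SAGEATTN environment variable is "1" before this module is imported, since the swap of `scaled_dot_product_attention` happens at import time. A hedged usage sketch with illustrative sizes:

import os
os.environ["USE_SAGEATTN"] = "1"          # optional; requires `pip install sageattention`

import torch
from diffusers.models.attention_processor import Attention
from step1x3d_geometry.models.attention_processor import FluxAttnProcessor2_0

# pre_only=True mirrors how the single-stream block wires its attention (no output projection)
attn = Attention(query_dim=1024, heads=16, dim_head=64, bias=True, pre_only=True,
                 processor=FluxAttnProcessor2_0())
out = attn(hidden_states=torch.randn(2, 512, 1024))     # self-attention path (no encoder states)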
step1x3d_geometry/models/autoencoders/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from . import (
2
+ michelangelo_autoencoder,
3
+ )
step1x3d_geometry/models/autoencoders/michelangelo_autoencoder.py ADDED
@@ -0,0 +1,765 @@
1
+ from dataclasses import dataclass
2
+ import math
3
+
4
+ import torch
5
+ import numpy as np
6
+ import random
7
+ import time
8
+ import trimesh
9
+ import torch.nn as nn
+ import torch.nn.functional as F  # used by Siren.forward below
10
+ from einops import repeat, rearrange
11
+ from tqdm import trange
12
+ from itertools import product
13
+ from diffusers.models.modeling_utils import ModelMixin
14
+
15
+ import step1x3d_geometry
16
+ from step1x3d_geometry.utils.checkpoint import checkpoint
17
+ from step1x3d_geometry.utils.base import BaseModule
18
+ from step1x3d_geometry.utils.typing import *
19
+ from step1x3d_geometry.utils.misc import get_world_size, get_device
20
+
21
+ from .transformers.perceiver_1d import Perceiver
22
+ from .transformers.attention import ResidualCrossAttentionBlock
23
+ from .volume_decoders import HierarchicalVolumeDecoder, VanillaVolumeDecoder
24
+ from .surface_extractors import MCSurfaceExtractor, DMCSurfaceExtractor
25
+
26
+ from ..pipelines.pipeline_utils import smart_load_model
27
+ from safetensors.torch import load_file
28
+
29
+ VALID_EMBED_TYPES = ["identity", "fourier", "learned_fourier", "siren"]
30
+
31
+
32
+ class FourierEmbedder(nn.Module):
33
+ def __init__(
34
+ self,
35
+ num_freqs: int = 6,
36
+ logspace: bool = True,
37
+ input_dim: int = 3,
38
+ include_input: bool = True,
39
+ include_pi: bool = True,
40
+ ) -> None:
41
+ super().__init__()
42
+
43
+ if logspace:
44
+ frequencies = 2.0 ** torch.arange(num_freqs, dtype=torch.float32)
45
+ else:
46
+ frequencies = torch.linspace(
47
+ 1.0, 2.0 ** (num_freqs - 1), num_freqs, dtype=torch.float32
48
+ )
49
+
50
+ if include_pi:
51
+ frequencies *= torch.pi
52
+
53
+ self.register_buffer("frequencies", frequencies, persistent=False)
54
+ self.include_input = include_input
55
+ self.num_freqs = num_freqs
56
+
57
+ self.out_dim = self.get_dims(input_dim)
58
+
59
+ def get_dims(self, input_dim):
60
+ temp = 1 if self.include_input or self.num_freqs == 0 else 0
61
+ out_dim = input_dim * (self.num_freqs * 2 + temp)
62
+
63
+ return out_dim
64
+
65
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
66
+ if self.num_freqs > 0:
67
+ embed = (x[..., None].contiguous() * self.frequencies).view(
68
+ *x.shape[:-1], -1
69
+ )
70
+ if self.include_input:
71
+ return torch.cat((x, embed.sin(), embed.cos()), dim=-1)
72
+ else:
73
+ return torch.cat((embed.sin(), embed.cos()), dim=-1)
74
+ else:
75
+ return x
76
+
77
+
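Note (not part of the diff): a quick sanity sketch of the embedding width this class produces; with include_input=True the output has input_dim * (2 * num_freqs + 1) channels, e.g. 3 * (2 * 8 + 1) = 51 for the defaults used later in this file.

import torch

embedder = FourierEmbedder(num_freqs=8, input_dim=3, include_input=True, include_pi=True)
pts = torch.rand(2, 4096, 3) * 2 - 1        # points in [-1, 1]^3
feat = embedder(pts)                        # -> (2, 4096, 51)
assert feat.shape[-1] == embedder.out_dim == 51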
78
+ class LearnedFourierEmbedder(nn.Module):
79
+ def __init__(self, input_dim, dim):
80
+ super().__init__()
81
+ assert (dim % 2) == 0
82
+ half_dim = dim // 2
83
+ per_channel_dim = half_dim // input_dim
84
+ self.weights = nn.Parameter(torch.randn(per_channel_dim))
85
+
86
+ self.out_dim = self.get_dims(input_dim)
87
+
88
+ def forward(self, x):
89
+ # [b, t, c, 1] * [1, d] = [b, t, c, d] -> [b, t, c * d]
90
+ freqs = (x[..., None] * self.weights[None] * 2 * np.pi).view(*x.shape[:-1], -1)
91
+ fouriered = torch.cat((x, freqs.sin(), freqs.cos()), dim=-1)
92
+ return fouriered
93
+
94
+ def get_dims(self, input_dim):
95
+ return input_dim * (self.weights.shape[0] * 2 + 1)
96
+
97
+
98
+ class Sine(nn.Module):
99
+ def __init__(self, w0=1.0):
100
+ super().__init__()
101
+ self.w0 = w0
102
+
103
+ def forward(self, x):
104
+ return torch.sin(self.w0 * x)
105
+
106
+
107
+ class Siren(nn.Module):
108
+ def __init__(
109
+ self,
110
+ in_dim,
111
+ out_dim,
112
+ w0=1.0,
113
+ c=6.0,
114
+ is_first=False,
115
+ use_bias=True,
116
+ activation=None,
117
+ dropout=0.0,
118
+ ):
119
+ super().__init__()
120
+ self.in_dim = in_dim
121
+ self.out_dim = out_dim
122
+ self.is_first = is_first
123
+
124
+ weight = torch.zeros(out_dim, in_dim)
125
+ bias = torch.zeros(out_dim) if use_bias else None
126
+ self.init_(weight, bias, c=c, w0=w0)
127
+
128
+ self.weight = nn.Parameter(weight)
129
+ self.bias = nn.Parameter(bias) if use_bias else None
130
+ self.activation = Sine(w0) if activation is None else activation
131
+ self.dropout = nn.Dropout(dropout)
132
+
133
+ def init_(self, weight, bias, c, w0):
134
+ dim = self.in_dim
135
+
136
+ w_std = (1 / dim) if self.is_first else (math.sqrt(c / dim) / w0)
137
+ weight.uniform_(-w_std, w_std)
138
+
139
+ if bias is not None:
140
+ bias.uniform_(-w_std, w_std)
141
+
142
+ def forward(self, x):
143
+ out = F.linear(x, self.weight, self.bias)
144
+ out = self.activation(out)
145
+ out = self.dropout(out)
146
+ return out
147
+
148
+
149
+ def get_embedder(embed_type="fourier", num_freqs=-1, input_dim=3, include_pi=True):
150
+ if embed_type == "identity" or (embed_type == "fourier" and num_freqs == -1):
151
+ return nn.Identity(), input_dim
152
+
153
+ elif embed_type == "fourier":
154
+ embedder_obj = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)
155
+
156
+ elif embed_type == "learned_fourier":
157
+ embedder_obj = LearnedFourierEmbedder(input_dim=input_dim, dim=num_freqs)
158
+
159
+ elif embed_type == "siren":
160
+ embedder_obj = Siren(
161
+ in_dim=input_dim, out_dim=num_freqs * input_dim * 2 + input_dim
162
+ )
163
+
164
+ else:
165
+ raise ValueError(
166
+ f"{embed_type} is not valid. Currently only supports {VALID_EMBED_TYPES}"
167
+ )
168
+ return embedder_obj
169
+
170
+
171
+ ###################### AutoEncoder
172
+ class DiagonalGaussianDistribution(ModelMixin, object):
173
+ def __init__(
174
+ self,
175
+ parameters: Union[torch.Tensor, List[torch.Tensor]],
176
+ deterministic=False,
177
+ feat_dim=1,
178
+ ):
179
+ self.feat_dim = feat_dim
180
+ self.parameters = parameters
181
+
182
+ if isinstance(parameters, list):
183
+ self.mean = parameters[0]
184
+ self.logvar = parameters[1]
185
+ else:
186
+ self.mean, self.logvar = torch.chunk(parameters, 2, dim=feat_dim)
187
+
188
+ self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
189
+ self.deterministic = deterministic
190
+ self.std = torch.exp(0.5 * self.logvar)
191
+ self.var = torch.exp(self.logvar)
192
+ if self.deterministic:
193
+ self.var = self.std = torch.zeros_like(self.mean)
194
+
195
+ def sample(self):
196
+ x = self.mean + self.std * torch.randn_like(self.mean)
197
+ return x
198
+
199
+ def kl(self, other=None, dims=(1, 2)):
200
+ if self.deterministic:
201
+ return torch.Tensor([0.0])
202
+ else:
203
+ if other is None:
204
+ return 0.5 * torch.mean(
205
+ torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, dim=dims
206
+ )
207
+ else:
208
+ return 0.5 * torch.mean(
209
+ torch.pow(self.mean - other.mean, 2) / other.var
210
+ + self.var / other.var
211
+ - 1.0
212
+ - self.logvar
213
+ + other.logvar,
214
+ dim=dims,
215
+ )
216
+
217
+ def nll(self, sample, dims=(1, 2)):
218
+ if self.deterministic:
219
+ return torch.Tensor([0.0])
220
+ logtwopi = np.log(2.0 * np.pi)
221
+ return 0.5 * torch.sum(
222
+ logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
223
+ dim=dims,
224
+ )
225
+
226
+ def mode(self):
227
+ return self.mean
228
+
229
+
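Note (not part of the diff): a hedged sketch of how this distribution is used by the VAE head further down; the `pre_kl` projection produces (mean, logvar) concatenated along the last axis, and feat_dim=-1 splits them. Shapes are illustrative.

import torch

moments = torch.randn(2, 256, 2 * 64)                  # [B, num_latents, 2 * embed_dim]
posterior = DiagonalGaussianDistribution(moments, feat_dim=-1)
z = posterior.sample()                                  # reparameterized sample, [2, 256, 64]
kl = posterior.kl()                                     # mean KL to N(0, I), one value per batch item
print(z.shape, kl.shape)                                # torch.Size([2, 256, 64]) torch.Size([2])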
230
+ class PerceiverCrossAttentionEncoder(ModelMixin, nn.Module):
231
+ def __init__(
232
+ self,
233
+ use_downsample: bool,
234
+ num_latents: int,
235
+ embedder: FourierEmbedder,
236
+ point_feats: int,
237
+ embed_point_feats: bool,
238
+ width: int,
239
+ heads: int,
240
+ layers: int,
241
+ init_scale: float = 0.25,
242
+ qkv_bias: bool = True,
243
+ qk_norm: bool = True,
244
+ use_ln_post: bool = False,
245
+ use_flash: bool = False,
246
+ use_checkpoint: bool = False,
247
+ use_multi_reso: bool = False,
248
+ resolutions: list = [],
249
+ sampling_prob: list = [],
250
+ with_sharp_data: bool = False,
251
+ ):
252
+
253
+ super().__init__()
254
+
255
+ self.use_checkpoint = use_checkpoint
256
+ self.num_latents = num_latents
257
+ self.use_downsample = use_downsample
258
+ self.embed_point_feats = embed_point_feats
259
+ self.use_multi_reso = use_multi_reso
260
+ self.resolutions = resolutions
261
+ self.sampling_prob = sampling_prob
262
+
263
+ if not self.use_downsample:
264
+ self.query = nn.Parameter(torch.randn((num_latents, width)) * 0.02)
265
+
266
+ self.embedder = embedder
267
+ if self.embed_point_feats:
268
+ self.input_proj = nn.Linear(self.embedder.out_dim * 2, width)
269
+ else:
270
+ self.input_proj = nn.Linear(self.embedder.out_dim + point_feats, width)
271
+
272
+ self.cross_attn = ResidualCrossAttentionBlock(
273
+ width=width,
274
+ heads=heads,
275
+ init_scale=init_scale,
276
+ qkv_bias=qkv_bias,
277
+ qk_norm=qk_norm,
278
+ use_flash=use_flash,
279
+ )
280
+
281
+ self.with_sharp_data = with_sharp_data
282
+ if with_sharp_data:
283
+ self.downsmaple_num_latents = num_latents // 2
284
+ self.input_proj_sharp = nn.Linear(
285
+ self.embedder.out_dim + point_feats, width
286
+ )
287
+ self.cross_attn_sharp = ResidualCrossAttentionBlock(
288
+ width=width,
289
+ heads=heads,
290
+ init_scale=init_scale,
291
+ qkv_bias=qkv_bias,
292
+ qk_norm=qk_norm,
293
+ use_flash=use_flash,
294
+ )
295
+ else:
296
+ self.downsmaple_num_latents = num_latents
297
+
298
+ self.self_attn = Perceiver(
299
+ n_ctx=num_latents,
300
+ width=width,
301
+ layers=layers,
302
+ heads=heads,
303
+ init_scale=init_scale,
304
+ qkv_bias=qkv_bias,
305
+ qk_norm=qk_norm,
306
+ use_flash=use_flash,
307
+ use_checkpoint=use_checkpoint,
308
+ )
309
+
310
+ if use_ln_post:
311
+ self.ln_post = nn.LayerNorm(width)
312
+ else:
313
+ self.ln_post = None
314
+
315
+ def _forward(self, pc, feats, sharp_pc=None, sharp_feat=None):
316
+ """
317
+
318
+ Args:
319
+ pc (torch.FloatTensor): [B, N, 3]
320
+ feats (torch.FloatTensor or None): [B, N, C]
321
+
322
+ Returns:
323
+
324
+ """
325
+
326
+ bs, N, D = pc.shape
327
+
328
+ data = self.embedder(pc)
329
+ if feats is not None:
330
+ if self.embed_point_feats:
331
+ feats = self.embedder(feats)
332
+ data = torch.cat([data, feats], dim=-1)
333
+ data = self.input_proj(data)
334
+
335
+ if self.with_sharp_data:
336
+ sharp_data = self.embedder(sharp_pc)
337
+ if sharp_feat is not None:
338
+ if self.embed_point_feats:
339
+ sharp_feat = self.embedder(sharp_feat)
340
+ sharp_data = torch.cat([sharp_data, sharp_feat], dim=-1)
341
+ sharp_data = self.input_proj_sharp(sharp_data)
342
+
343
+ if self.use_multi_reso:
344
+ resolution = np.random.choice(self.resolutions, size=1, p=self.sampling_prob)[0]
347
+
348
+ if resolution != N:
349
+ flattened = pc.view(bs * N, D)  # (bs*N, 3)
350
+ batch = torch.arange(bs).to(pc.device) # 103
351
+ batch = torch.repeat_interleave(batch, N) # bs*N. 421888
352
+ pos = flattened.to(torch.float16)
353
+ ratio = 1.0 * resolution / N # 0.0625
354
+ from torch_cluster import fps  # local import: this branch may run before the downsample import below
+ idx = fps(pos, batch, ratio=ratio)
355
+ pc = pc.view(bs * N, -1)[idx].view(bs, -1, D)
356
+ bs, N, D = feats.shape
357
+ flattened1 = feats.view(bs * N, D)
358
+ feats = flattened1.view(bs * N, -1)[idx].view(bs, -1, D)
359
+ bs, N, D = pc.shape
360
+
361
+ if self.use_downsample:
362
+ ###### fps
363
+ from torch_cluster import fps
364
+
365
+ flattened = pc.view(bs * N, D) # bs*N, 64
366
+
367
+ batch = torch.arange(bs).to(pc.device)
368
+ batch = torch.repeat_interleave(batch, N) # bs*N
369
+
370
+ pos = flattened.to(torch.float16)
371
+ ratio = 1.0 * self.downsmaple_num_latents / N
372
+ idx = fps(pos, batch, ratio=ratio).detach()
373
+ query = data.view(bs * N, -1)[idx].view(bs, -1, data.shape[-1])
374
+
375
+ if self.with_sharp_data:
376
+ bs, N, D = sharp_pc.shape
377
+ flattened = sharp_pc.view(bs * N, D) # bs*N, 64
378
+ pos = flattened.to(torch.float16)
379
+ ratio = 1.0 * self.downsmaple_num_latents / N
380
+ idx = fps(pos, batch, ratio=ratio).detach()
381
+ sharp_query = sharp_data.view(bs * N, -1)[idx].view(
382
+ bs, -1, sharp_data.shape[-1]
383
+ )
384
+ query = torch.cat([query, sharp_query], dim=1)
385
+ else:
386
+ query = self.query
387
+ query = repeat(query, "m c -> b m c", b=bs)
388
+
389
+ latents = self.cross_attn(query, data)
390
+ if self.with_sharp_data:
391
+ latents = latents + self.cross_attn_sharp(query, sharp_data)
392
+ latents = self.self_attn(latents)
393
+
394
+ if self.ln_post is not None:
395
+ latents = self.ln_post(latents)
396
+
397
+ return latents
398
+
399
+ def forward(
400
+ self,
401
+ pc: torch.FloatTensor,
402
+ feats: Optional[torch.FloatTensor] = None,
403
+ sharp_pc: Optional[torch.FloatTensor] = None,
404
+ sharp_feats: Optional[torch.FloatTensor] = None,
405
+ ):
406
+ """
407
+
408
+ Args:
409
+ pc (torch.FloatTensor): [B, N, 3]
410
+ feats (torch.FloatTensor or None): [B, N, C]
411
+
412
+ Returns:
413
+ dict
414
+ """
415
+
416
+ return checkpoint(
417
+ self._forward,
418
+ (pc, feats, sharp_pc, sharp_feats),
419
+ self.parameters(),
420
+ self.use_checkpoint,
421
+ )
422
+
423
+
424
+ class PerceiverCrossAttentionDecoder(ModelMixin, nn.Module):
425
+
426
+ def __init__(
427
+ self,
428
+ num_latents: int,
429
+ out_dim: int,
430
+ embedder: FourierEmbedder,
431
+ width: int,
432
+ heads: int,
433
+ init_scale: float = 0.25,
434
+ qkv_bias: bool = True,
435
+ qk_norm: bool = True,
436
+ use_flash: bool = False,
437
+ use_checkpoint: bool = False,
438
+ ):
439
+
440
+ super().__init__()
441
+
442
+ self.use_checkpoint = use_checkpoint
443
+ self.embedder = embedder
444
+
445
+ self.query_proj = nn.Linear(self.embedder.out_dim, width)
446
+
447
+ self.cross_attn_decoder = ResidualCrossAttentionBlock(
448
+ n_data=num_latents,
449
+ width=width,
450
+ heads=heads,
451
+ init_scale=init_scale,
452
+ qkv_bias=qkv_bias,
453
+ qk_norm=qk_norm,
454
+ use_flash=use_flash,
455
+ )
456
+
457
+ self.ln_post = nn.LayerNorm(width)
458
+ self.output_proj = nn.Linear(width, out_dim)
459
+
460
+ def _forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
461
+ queries = self.query_proj(self.embedder(queries))
462
+ x = self.cross_attn_decoder(queries, latents)
463
+ x = self.ln_post(x)
464
+ x = self.output_proj(x)
465
+ return x
466
+
467
+ def forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
468
+ return checkpoint(
469
+ self._forward, (queries, latents), self.parameters(), self.use_checkpoint
470
+ )
471
+
472
+
473
+ @step1x3d_geometry.register("michelangelo-autoencoder")
474
+ class MichelangeloAutoencoder(BaseModule):
475
+ r"""
476
+ A VAE model for encoding shapes into latents and decoding latent representations into shapes.
477
+ """
478
+
479
+ @dataclass
480
+ class Config(BaseModule.Config):
481
+ pretrained_model_name_or_path: str = ""
482
+ subfolder: str = ""
483
+ n_samples: int = 4096
484
+ use_downsample: bool = False
485
+ downsample_ratio: float = 0.0625
486
+ num_latents: int = 256
487
+ point_feats: int = 0
488
+ embed_point_feats: bool = False
489
+ out_dim: int = 1
490
+ embed_dim: int = 64
491
+ embed_type: str = "fourier"
492
+ num_freqs: int = 8
493
+ include_pi: bool = True
494
+ width: int = 768
495
+ heads: int = 12
496
+ num_encoder_layers: int = 8
497
+ num_decoder_layers: int = 16
498
+ init_scale: float = 0.25
499
+ qkv_bias: bool = True
500
+ qk_norm: bool = False
501
+ use_ln_post: bool = False
502
+ use_flash: bool = False
503
+ use_checkpoint: bool = True
504
+ use_multi_reso: Optional[bool] = False
505
+ resolutions: Optional[List[int]] = None
506
+ sampling_prob: Optional[List[float]] = None
507
+ with_sharp_data: Optional[bool] = True
508
+ volume_decoder_type: str = "hierarchical"
509
+ surface_extractor_type: str = "mc"
510
+ z_scale_factor: float = 1.0
511
+
512
+ cfg: Config
513
+
514
+ def configure(self) -> None:
515
+ super().configure()
516
+
517
+ self.embedder = get_embedder(
518
+ embed_type=self.cfg.embed_type,
519
+ num_freqs=self.cfg.num_freqs,
520
+ include_pi=self.cfg.include_pi,
521
+ )
522
+
523
+ # encoder
524
+ self.cfg.init_scale = self.cfg.init_scale * math.sqrt(1.0 / self.cfg.width)
525
+ self.encoder = PerceiverCrossAttentionEncoder(
526
+ use_downsample=self.cfg.use_downsample,
527
+ embedder=self.embedder,
528
+ num_latents=self.cfg.num_latents,
529
+ point_feats=self.cfg.point_feats,
530
+ embed_point_feats=self.cfg.embed_point_feats,
531
+ width=self.cfg.width,
532
+ heads=self.cfg.heads,
533
+ layers=self.cfg.num_encoder_layers,
534
+ init_scale=self.cfg.init_scale,
535
+ qkv_bias=self.cfg.qkv_bias,
536
+ qk_norm=self.cfg.qk_norm,
537
+ use_ln_post=self.cfg.use_ln_post,
538
+ use_flash=self.cfg.use_flash,
539
+ use_checkpoint=self.cfg.use_checkpoint,
540
+ use_multi_reso=self.cfg.use_multi_reso,
541
+ resolutions=self.cfg.resolutions,
542
+ sampling_prob=self.cfg.sampling_prob,
543
+ with_sharp_data=self.cfg.with_sharp_data,
544
+ )
545
+
546
+ if self.cfg.embed_dim > 0:
547
+ # VAE embed
548
+ self.pre_kl = nn.Linear(self.cfg.width, self.cfg.embed_dim * 2)
549
+ self.post_kl = nn.Linear(self.cfg.embed_dim, self.cfg.width)
550
+ self.latent_shape = (self.cfg.num_latents, self.cfg.embed_dim)
551
+ else:
552
+ self.latent_shape = (self.cfg.num_latents, self.cfg.width)
553
+
554
+ self.transformer = Perceiver(
555
+ n_ctx=self.cfg.num_latents,
556
+ width=self.cfg.width,
557
+ layers=self.cfg.num_decoder_layers,
558
+ heads=self.cfg.heads,
559
+ init_scale=self.cfg.init_scale,
560
+ qkv_bias=self.cfg.qkv_bias,
561
+ qk_norm=self.cfg.qk_norm,
562
+ use_flash=self.cfg.use_flash,
563
+ use_checkpoint=self.cfg.use_checkpoint,
564
+ )
565
+
566
+ # decoder
567
+ self.decoder = PerceiverCrossAttentionDecoder(
568
+ embedder=self.embedder,
569
+ out_dim=self.cfg.out_dim,
570
+ num_latents=self.cfg.num_latents,
571
+ width=self.cfg.width,
572
+ heads=self.cfg.heads,
573
+ init_scale=self.cfg.init_scale,
574
+ qkv_bias=self.cfg.qkv_bias,
575
+ qk_norm=self.cfg.qk_norm,
576
+ use_flash=self.cfg.use_flash,
577
+ use_checkpoint=self.cfg.use_checkpoint,
578
+ )
579
+
580
+ # volume decoder
581
+ if self.cfg.volume_decoder_type == "hierarchical":
582
+ self.volume_decoder = HierarchicalVolumeDecoder()
583
+ else:
584
+ self.volume_decoder = VanillaVolumeDecoder()
585
+
586
+ if self.cfg.pretrained_model_name_or_path != "":
587
+ local_model_path = f"{smart_load_model(self.cfg.pretrained_model_name_or_path, self.cfg.subfolder)}/vae/diffusion_pytorch_model.safetensors"
588
+ pretrain_safetensors = load_file(local_model_path)
589
+ print(f"Loading pretrained VAE model from {local_model_path}")
590
+
591
+ if "state_dict" in pretrain_safetensors:
592
+ _pretrained_safetensors = {}
593
+ for k, v in pretrain_safetensors["state_dict"].items():
594
+ if k.startswith("shape_model."):
595
+ if "proj1" in k:
596
+ _pretrained_safetensors[
597
+ k.replace("shape_model.", "").replace(
598
+ "proj1", "proj_sharp"
599
+ )
600
+ ] = v
601
+ elif "attn1" in k:
602
+ _pretrained_safetensors[
603
+ k.replace("shape_model.", "").replace(
604
+ "attn1", "attn_sharp"
605
+ )
606
+ ] = v
607
+ else:
608
+ _pretrained_safetensors[k.replace("shape_model.", "")] = v
609
+
610
+ pretrain_safetensors = _pretrained_safetensors
611
+ self.load_state_dict(pretrain_safetensors, strict=True)
612
+ else:
613
+ _pretrained_safetensors = {}
614
+ for k, v in pretrain_safetensors.items():
615
+ if k.startswith("shape_model"):
616
+ final_module = self
617
+ for key in k.replace("shape_model.", "").split("."):
618
+ final_module = getattr(final_module, key)
619
+ data = final_module.data
620
+ data_zero = torch.zeros_like(data).to(v)
621
+
622
+ if data.shape != v.shape:
623
+ if data.ndim == 1:
624
+ data_zero[: v.shape[0]] = v
625
+ elif data.ndim == 2:
626
+ data_zero[: v.shape[0], : v.shape[1]] = v
627
+ v = data_zero
628
+
629
+ _pretrained_safetensors[k.replace("shape_model.", "")] = v
630
+ else:
631
+ _pretrained_safetensors[k] = v
632
+ pretrain_safetensors = _pretrained_safetensors
633
+ self.load_state_dict(pretrain_safetensors, strict=True)
634
+ print("Successfully loaded pretrained VAE model")
635
+
636
+ def encode(
637
+ self,
638
+ surface: torch.FloatTensor,
639
+ sample_posterior: bool = True,
640
+ sharp_surface: torch.FloatTensor = None,
641
+ ):
642
+ """
643
+ Args:
644
+ surface (torch.FloatTensor): [B, N, 3+C]
645
+ sample_posterior (bool):
646
+
647
+ Returns:
648
+ shape_latents (torch.FloatTensor): [B, num_latents, width]
649
+ kl_embed (torch.FloatTensor): [B, num_latents, embed_dim]
650
+ posterior (DiagonalGaussianDistribution or None):
651
+ """
652
+ assert (
653
+ surface.shape[-1] == 3 + self.cfg.point_feats
654
+ ), f"\
655
+ Expected {3 + self.cfg.point_feats} channels, got {surface.shape[-1]}"
656
+
657
+ pc, feats = surface[..., :3], surface[..., 3:] # B, n_samples, 3
658
+ if sharp_surface is not None:
659
+ sharp_pc, sharp_feats = (
660
+ sharp_surface[..., :3],
661
+ sharp_surface[..., 3:],
662
+ ) # B, n_samples, 3
663
+ else:
664
+ sharp_pc, sharp_feats = None, None
665
+
666
+ shape_embeds = self.encoder(
667
+ pc, feats, sharp_pc, sharp_feats
668
+ ) # B, num_latents, width
669
+ kl_embed, posterior = self.encode_kl_embed(
670
+ shape_embeds, sample_posterior
671
+ ) # B, num_latents, embed_dim
672
+
673
+ kl_embed = kl_embed * self.cfg.z_scale_factor # encode with scale
674
+
675
+ return shape_embeds, kl_embed, posterior
676
+
677
+ def decode(self, latents: torch.FloatTensor):
678
+ """
679
+ Args:
680
+ latents (torch.FloatTensor): [B, embed_dim]
681
+
682
+ Returns:
683
+ latents (torch.FloatTensor): [B, embed_dim]
684
+ """
685
+ latents = self.post_kl(
686
+ latents / self.cfg.z_scale_factor
687
+ ) # [B, num_latents, embed_dim] -> [B, num_latents, width]
688
+
689
+ return self.transformer(latents)
690
+
691
+ def query(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
692
+ """
693
+ Args:
694
+ queries (torch.FloatTensor): [B, N, 3]
695
+ latents (torch.FloatTensor): [B, embed_dim]
696
+
697
+ Returns:
698
+ features (torch.FloatTensor): [B, N, C], output features
699
+ """
700
+
701
+ features = self.decoder(queries, latents)
702
+
703
+ return features
704
+
705
+ def encode_kl_embed(
706
+ self, latents: torch.FloatTensor, sample_posterior: bool = True
707
+ ):
708
+ posterior = None
709
+ if self.cfg.embed_dim > 0:
710
+ moments = self.pre_kl(latents)
711
+ posterior = DiagonalGaussianDistribution(moments, feat_dim=-1)
712
+ if sample_posterior:
713
+ kl_embed = posterior.sample()
714
+ else:
715
+ kl_embed = posterior.mode()
716
+ else:
717
+ kl_embed = latents
718
+ return kl_embed, posterior
719
+
720
+ def forward(
721
+ self,
722
+ surface: torch.FloatTensor,
723
+ sharp_surface: torch.FloatTensor = None,
724
+ rand_points: torch.FloatTensor = None,
725
+ sample_posterior: bool = True,
726
+ **kwargs,
727
+ ):
728
+ shape_latents, kl_embed, posterior = self.encode(
729
+ surface, sample_posterior=sample_posterior, sharp_surface=sharp_surface
730
+ )
731
+
732
+ latents = self.decode(kl_embed) # [B, num_latents, width]
733
+
734
+ meshes = self.extract_geometry(latents, **kwargs)
735
+
736
+ return shape_latents, latents, posterior, meshes
737
+
738
+ def extract_geometry(self, latents: torch.FloatTensor, **kwargs):
739
+
740
+ grid_logits_list = []
741
+ for i in range(latents.shape[0]):
742
+ grid_logits = self.volume_decoder(
743
+ latents[i].unsqueeze(0), self.query, **kwargs
744
+ )
745
+ grid_logits_list.append(grid_logits)
746
+ grid_logits = torch.cat(grid_logits_list, dim=0)
747
+
748
+ # extract mesh
749
+ surface_extractor_type = (
750
+ kwargs["surface_extractor_type"]
751
+ if "surface_extractor_type" in kwargs.keys()
752
+ and kwargs["surface_extractor_type"] is not None
753
+ else self.cfg.surface_extractor_type
754
+ )
755
+
756
+ if surface_extractor_type == "mc":
757
+ surface_extractor = MCSurfaceExtractor()
758
+ meshes = surface_extractor(grid_logits, **kwargs)
759
+ elif surface_extractor_type == "dmc":
760
+ surface_extractor = DMCSurfaceExtractor()
761
+ meshes = surface_extractor(grid_logits, **kwargs)
762
+ else:
763
+ raise NotImplementedError
764
+
765
+ return meshes
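Note (not part of the diff): a sketch of the round trip this class implements, written as a helper so no project-specific construction is assumed; the extraction kwargs below are illustrative, not shipped defaults.

import torch

def reconstruct(vae, surface: torch.Tensor):
    """surface: [B, N, 3 + point_feats]; vae: an already-configured MichelangeloAutoencoder."""
    shape_latents, kl_embed, _posterior = vae.encode(surface, sample_posterior=False)
    latents = vae.decode(kl_embed)                      # [B, num_latents, width]
    return vae.extract_geometry(
        latents,
        bounds=1.05,                                    # half-extent of the symmetric bounding box
        mc_level=0.0,                                   # iso-level for marching cubes
        octree_resolution=256,
        surface_extractor_type="mc",
    )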
step1x3d_geometry/models/autoencoders/surface_extractors.py ADDED
@@ -0,0 +1,137 @@
1
+ from typing import Union, Tuple, List
2
+
3
+ import numpy as np
4
+ import torch
5
+ from skimage import measure
6
+
7
+
8
+ class MeshExtractResult:
9
+ def __init__(self, verts, faces, vertex_attrs=None, res=64):
10
+ self.verts = verts
11
+ self.faces = faces.long()
12
+ self.vertex_attrs = vertex_attrs
13
+ self.face_normal = self.comput_face_normals()
14
+ self.vert_normal = self.comput_v_normals()
15
+ self.res = res
16
+ self.success = verts.shape[0] != 0 and faces.shape[0] != 0
17
+
18
+ # training only
19
+ self.tsdf_v = None
20
+ self.tsdf_s = None
21
+ self.reg_loss = None
22
+
23
+ def comput_face_normals(self):
24
+ i0 = self.faces[..., 0].long()
25
+ i1 = self.faces[..., 1].long()
26
+ i2 = self.faces[..., 2].long()
27
+
28
+ v0 = self.verts[i0, :]
29
+ v1 = self.verts[i1, :]
30
+ v2 = self.verts[i2, :]
31
+ face_normals = torch.cross(v1 - v0, v2 - v0, dim=-1)
32
+ face_normals = torch.nn.functional.normalize(face_normals, dim=1)
33
+ return face_normals[:, None, :].repeat(1, 3, 1)
34
+
35
+ def comput_v_normals(self):
36
+ i0 = self.faces[..., 0].long()
37
+ i1 = self.faces[..., 1].long()
38
+ i2 = self.faces[..., 2].long()
39
+
40
+ v0 = self.verts[i0, :]
41
+ v1 = self.verts[i1, :]
42
+ v2 = self.verts[i2, :]
43
+ face_normals = torch.cross(v1 - v0, v2 - v0, dim=-1)
44
+ v_normals = torch.zeros_like(self.verts)
45
+ v_normals.scatter_add_(0, i0[..., None].repeat(1, 3), face_normals)
46
+ v_normals.scatter_add_(0, i1[..., None].repeat(1, 3), face_normals)
47
+ v_normals.scatter_add_(0, i2[..., None].repeat(1, 3), face_normals)
48
+
49
+ v_normals = torch.nn.functional.normalize(v_normals, dim=1)
50
+ return v_normals
51
+
52
+
53
+ def center_vertices(vertices):
54
+ """Translate the vertices so that bounding box is centered at zero."""
55
+ vert_min = vertices.min(dim=0)[0]
56
+ vert_max = vertices.max(dim=0)[0]
57
+ vert_center = 0.5 * (vert_min + vert_max)
58
+ return vertices - vert_center
59
+
60
+
61
+ class SurfaceExtractor:
62
+ def _compute_box_stat(
63
+ self, bounds: Union[Tuple[float], List[float], float], octree_resolution: int
64
+ ):
65
+ if isinstance(bounds, float):
66
+ bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
67
+
68
+ bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6])
69
+ bbox_size = bbox_max - bbox_min
70
+ grid_size = [
71
+ int(octree_resolution) + 1,
72
+ int(octree_resolution) + 1,
73
+ int(octree_resolution) + 1,
74
+ ]
75
+ return grid_size, bbox_min, bbox_size
76
+
77
+ def run(self, *args, **kwargs):
78
+ raise NotImplementedError
79
+
80
+ def __call__(self, grid_logits, **kwargs):
81
+ outputs = []
82
+ for i in range(grid_logits.shape[0]):
83
+ try:
84
+ verts, faces = self.run(grid_logits[i], **kwargs)
85
+ outputs.append(
86
+ MeshExtractResult(
87
+ verts=verts.float(),
88
+ faces=faces,
89
+ res=kwargs["octree_resolution"],
90
+ )
91
+ )
92
+
93
+ except Exception:
94
+ import traceback
95
+
96
+ traceback.print_exc()
97
+ outputs.append(None)
98
+
99
+ return outputs
100
+
101
+
102
+ class MCSurfaceExtractor(SurfaceExtractor):
103
+ def run(self, grid_logit, *, mc_level, bounds, octree_resolution, **kwargs):
104
+ verts, faces, normals, _ = measure.marching_cubes(
105
+ grid_logit.float().cpu().numpy(), mc_level, method="lewiner"
106
+ )
107
+ grid_size, bbox_min, bbox_size = self._compute_box_stat(
108
+ bounds, octree_resolution
109
+ )
110
+ verts = verts / grid_size * bbox_size + bbox_min
111
+ verts = torch.tensor(verts, device=grid_logit.device, dtype=torch.float32)
112
+ faces = torch.tensor(
113
+ np.ascontiguousarray(faces), device=grid_logit.device, dtype=torch.long
114
+ )
115
+ faces = faces[:, [2, 1, 0]]
116
+ return verts, faces
117
+
118
+
119
+ class DMCSurfaceExtractor(SurfaceExtractor):
120
+ def run(self, grid_logit, *, octree_resolution, **kwargs):
121
+ device = grid_logit.device
122
+ if not hasattr(self, "dmc"):
123
+ try:
124
+ from diso import DiffDMC
125
+ except ImportError:
126
+ raise ImportError(
127
+ "Please install diso via `pip install diso`, or set surface_extractor_type to 'mc'"
128
+ )
129
+ self.dmc = DiffDMC(dtype=torch.float32).to(device)
130
+ sdf = -grid_logit / octree_resolution
131
+ sdf = sdf.to(torch.float32).contiguous()
132
+ verts, faces = self.dmc(sdf, deform=None, return_quads=False, normalize=True)
133
+ grid_size, bbox_min, bbox_size = self._compute_box_stat(
134
+ kwargs["bounds"], octree_resolution
135
+ )
136
+ verts = verts * kwargs["bounds"] * 2 - kwargs["bounds"]
137
+ return verts, faces
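Note (not part of the diff): the extractors above operate on a dense grid of occupancy/SDF-style logits; a synthetic sphere is enough to exercise the marching-cubes path. Resolution and bounds are arbitrary.

import torch

res = 64
axis = torch.linspace(-1.0, 1.0, res + 1)
x, y, z = torch.meshgrid(axis, axis, axis, indexing="ij")
grid_logits = 0.5 - (x**2 + y**2 + z**2).sqrt()        # positive inside a sphere of radius 0.5
grid_logits = grid_logits.unsqueeze(0)                  # [B, res+1, res+1, res+1]

meshes = MCSurfaceExtractor()(grid_logits, mc_level=0.0, bounds=1.0, octree_resolution=res)
print(meshes[0].verts.shape, meshes[0].faces.shape)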
step1x3d_geometry/models/autoencoders/transformers/attention.py ADDED
@@ -0,0 +1,286 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+ from step1x3d_geometry.utils.typing import *
7
+ from step1x3d_geometry.utils.checkpoint import checkpoint
8
+
9
+ from .utils import init_linear, MLP
10
+ from timm.models.vision_transformer import Attention
11
+
12
+
13
+ class MultiheadAttention(nn.Module):
14
+ def __init__(
15
+ self,
16
+ *,
17
+ n_ctx: int,
18
+ width: int,
19
+ heads: int,
20
+ init_scale: float,
21
+ qkv_bias: bool,
22
+ qk_norm: bool,
23
+ norm_layer=nn.LayerNorm,
24
+ use_flash: bool = False,
25
+ ):
26
+ super().__init__()
27
+         self.n_ctx = n_ctx
+         self.width = width
+         self.heads = heads
+         self.c_qkv = nn.Linear(width, width * 3, bias=qkv_bias)
+         self.c_proj = nn.Linear(width, width)
+         self.attention = QKVMultiheadAttention(
+             heads=heads,
+             n_ctx=n_ctx,
+             width=width,
+             norm_layer=norm_layer,
+             qk_norm=qk_norm,
+             use_flash=use_flash,
+         )
+         init_linear(self.c_qkv, init_scale)
+         init_linear(self.c_proj, init_scale)
+
+     def forward(self, x):
+         x = self.c_qkv(x)
+         x = checkpoint(self.attention, (x,), (), True)
+         x = self.c_proj(x)
+         return x
+
+
+ class QKVMultiheadAttention(nn.Module):
+     def __init__(
+         self,
+         *,
+         heads: int,
+         n_ctx: int,
+         width=None,
+         qk_norm: bool = False,
+         norm_layer=nn.LayerNorm,
+         use_flash: bool = False,
+     ):
+         super().__init__()
+         self.heads = heads
+         self.n_ctx = n_ctx
+         self.use_flash = use_flash
+
+         self.q_norm = (
+             norm_layer(width // heads, elementwise_affine=True, eps=1e-6)
+             if qk_norm
+             else nn.Identity()
+         )
+         self.k_norm = (
+             norm_layer(width // heads, elementwise_affine=True, eps=1e-6)
+             if qk_norm
+             else nn.Identity()
+         )
+
+     def forward(self, qkv):
+         bs, n_ctx, width = qkv.shape
+         attn_ch = width // self.heads // 3
+         scale = 1 / math.sqrt(math.sqrt(attn_ch))
+         qkv = qkv.view(bs, n_ctx, self.heads, -1)
+         q, k, v = torch.split(qkv, attn_ch, dim=-1)
+
+         q = self.q_norm(q)
+         k = self.k_norm(k)
+
+         if self.use_flash:
+             q = q.permute(0, 2, 1, 3)
+             k = k.permute(0, 2, 1, 3)
+             v = v.permute(0, 2, 1, 3)
+             out = (
+                 F.scaled_dot_product_attention(q, k, v)
+                 .permute(0, 2, 1, 3)
+                 .reshape(bs, n_ctx, -1)
+             )
+         else:
+             weight = torch.einsum(
+                 "bthc,bshc->bhts", q * scale, k * scale
+             )  # More stable with f16 than dividing afterwards
+             wdtype = weight.dtype
+             weight = torch.softmax(weight.float(), dim=-1).type(wdtype)
+             out = torch.einsum("bhts,bshc->bthc", weight, v).reshape(bs, n_ctx, -1)
+
+         return out
+
+
+ class ResidualAttentionBlock(nn.Module):
+     def __init__(
+         self,
+         *,
+         n_ctx: int,
+         width: int,
+         heads: int,
+         init_scale: float = 1.0,
+         qkv_bias: bool = True,
+         norm_layer=nn.LayerNorm,
+         qk_norm: bool = True,
+         use_flash: bool = False,
+         use_checkpoint: bool = False,
+     ):
+         super().__init__()
+
+         self.use_checkpoint = use_checkpoint
+
+         self.attn = MultiheadAttention(
+             n_ctx=n_ctx,
+             width=width,
+             heads=heads,
+             init_scale=init_scale,
+             qkv_bias=qkv_bias,
+             norm_layer=norm_layer,
+             qk_norm=qk_norm,
+             use_flash=use_flash,
+         )
+         self.ln_1 = nn.LayerNorm(width)
+         self.mlp = MLP(width=width, init_scale=init_scale)
+         self.ln_2 = nn.LayerNorm(width)
+
+     def _forward(self, x: torch.Tensor):
+         x = x + self.attn(self.ln_1(x))
+         x = x + self.mlp(self.ln_2(x))
+         return x
+
+     def forward(self, x: torch.Tensor):
+         return checkpoint(self._forward, (x,), self.parameters(), self.use_checkpoint)
+
+
+ class MultiheadCrossAttention(nn.Module):
+     def __init__(
+         self,
+         *,
+         width: int,
+         heads: int,
+         init_scale: float,
+         qkv_bias: bool = True,
+         norm_layer=nn.LayerNorm,
+         qk_norm: bool = True,
+         use_flash: bool = False,
+         n_data: Optional[int] = None,
+         data_width: Optional[int] = None,
+     ):
+         super().__init__()
+         self.n_data = n_data
+         self.width = width
+         self.heads = heads
+         self.data_width = width if data_width is None else data_width
+         self.c_q = nn.Linear(width, width, bias=qkv_bias)
+         self.c_kv = nn.Linear(self.data_width, width * 2, bias=qkv_bias)
+         self.c_proj = nn.Linear(width, width)
+         self.attention = QKVMultiheadCrossAttention(
+             heads=heads,
+             n_data=n_data,
+             width=width,
+             norm_layer=norm_layer,
+             qk_norm=qk_norm,
+             use_flash=use_flash,
+         )
+         init_linear(self.c_q, init_scale)
+         init_linear(self.c_kv, init_scale)
+         init_linear(self.c_proj, init_scale)
+
+     def forward(self, x, data):
+         x = self.c_q(x)
+         data = self.c_kv(data)
+         x = checkpoint(self.attention, (x, data), (), True)
+         x = self.c_proj(x)
+         return x
+
+
+ class QKVMultiheadCrossAttention(nn.Module):
+     def __init__(
+         self,
+         *,
+         heads: int,
+         n_data: Optional[int] = None,
+         width=None,
+         norm_layer=nn.LayerNorm,
+         qk_norm: bool = False,
+         use_flash: bool = False,
+     ):
+         super().__init__()
+         self.heads = heads
+         self.n_data = n_data
+         self.use_flash = use_flash
+
+         self.q_norm = (
+             norm_layer(width // heads, elementwise_affine=True, eps=1e-6)
+             if qk_norm
+             else nn.Identity()
+         )
+         self.k_norm = (
+             norm_layer(width // heads, elementwise_affine=True, eps=1e-6)
+             if qk_norm
+             else nn.Identity()
+         )
+
+     def forward(self, q, kv):
+         _, n_ctx, _ = q.shape
+         bs, n_data, width = kv.shape
+         attn_ch = width // self.heads // 2
+         scale = 1 / math.sqrt(math.sqrt(attn_ch))
+         q = q.view(bs, n_ctx, self.heads, -1)
+         kv = kv.view(bs, n_data, self.heads, -1)
+         k, v = torch.split(kv, attn_ch, dim=-1)
+
+         q = self.q_norm(q)
+         k = self.k_norm(k)
+
+         if self.use_flash:
+             q = q.permute(0, 2, 1, 3)
+             k = k.permute(0, 2, 1, 3)
+             v = v.permute(0, 2, 1, 3)
+             out = (
+                 F.scaled_dot_product_attention(q, k, v)
+                 .permute(0, 2, 1, 3)
+                 .reshape(bs, n_ctx, -1)
+             )
+         else:
+             weight = torch.einsum(
+                 "bthc,bshc->bhts", q * scale, k * scale
+             )  # More stable with f16 than dividing afterwards
+             wdtype = weight.dtype
+             weight = torch.softmax(weight.float(), dim=-1).type(wdtype)
+             out = torch.einsum("bhts,bshc->bthc", weight, v).reshape(bs, n_ctx, -1)
+
+         return out
+
+
+ class ResidualCrossAttentionBlock(nn.Module):
+     def __init__(
+         self,
+         *,
+         n_data: Optional[int] = None,
+         width: int,
+         heads: int,
+         data_width: Optional[int] = None,
+         init_scale: float = 0.25,
+         qkv_bias: bool = True,
+         qk_norm: bool = True,
+         use_flash: bool = False,
+     ):
+         super().__init__()
+
+         if data_width is None:
+             data_width = width
+
+         self.attn = MultiheadCrossAttention(
+             n_data=n_data,
+             width=width,
+             heads=heads,
+             data_width=data_width,
+             init_scale=init_scale,
+             qkv_bias=qkv_bias,
+             qk_norm=qk_norm,
+             use_flash=use_flash,
+         )
+         self.ln_1 = nn.LayerNorm(width)
+         self.ln_2 = nn.LayerNorm(data_width)
+         self.mlp = MLP(width=width, init_scale=init_scale)
+         self.ln_3 = nn.LayerNorm(width)
+
+     def forward(self, x: torch.Tensor, data: torch.Tensor):
+         x = x + self.attn(self.ln_1(x), self.ln_2(data))
+         x = x + self.mlp(self.ln_3(x))
+         return x
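A minimal, illustrative sketch of how the blocks above fit together; the tensor sizes are arbitrary and the import path is assumed from this file's location:

import torch
from step1x3d_geometry.models.autoencoders.transformers.attention import (
    ResidualAttentionBlock,
    ResidualCrossAttentionBlock,
)

block = ResidualAttentionBlock(n_ctx=256, width=512, heads=8)            # pre-LN self-attention + MLP
cross = ResidualCrossAttentionBlock(width=512, heads=8, data_width=768)  # attends to external tokens
x = torch.randn(2, 256, 512)      # (batch, tokens, width)
data = torch.randn(2, 1024, 768)  # (batch, data tokens, data_width)
assert block(x).shape == x.shape
assert cross(x, data).shape == x.shape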
step1x3d_geometry/models/autoencoders/transformers/perceiver_1d.py ADDED
@@ -0,0 +1,50 @@
+ import math
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from step1x3d_geometry.utils.typing import *
+ from step1x3d_geometry.utils.checkpoint import checkpoint
+
+ from .utils import init_linear
+ from .attention import ResidualAttentionBlock
+
+
+ class Perceiver(nn.Module):
+     def __init__(
+         self,
+         *,
+         n_ctx: int,
+         width: int,
+         layers: int,
+         heads: int,
+         init_scale: float = 0.25,
+         qkv_bias: bool = True,
+         qk_norm: bool = True,
+         use_flash: bool = False,
+         use_checkpoint: bool = False,
+     ):
+         super().__init__()
+         self.n_ctx = n_ctx
+         self.width = width
+         self.layers = layers
+         self.resblocks = nn.ModuleList(
+             [
+                 ResidualAttentionBlock(
+                     n_ctx=n_ctx,
+                     width=width,
+                     heads=heads,
+                     init_scale=init_scale,
+                     qkv_bias=qkv_bias,
+                     qk_norm=qk_norm,
+                     use_flash=use_flash,
+                     use_checkpoint=use_checkpoint,
+                 )
+                 for _ in range(layers)
+             ]
+         )
+
+     def forward(self, x: torch.Tensor):
+         for block in self.resblocks:
+             x = block(x)
+         return x
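An illustrative instantiation of the Perceiver above (a plain stack of `layers` ResidualAttentionBlock modules over a fixed-length token sequence); the sizes and import path are assumptions:

import torch
from step1x3d_geometry.models.autoencoders.transformers.perceiver_1d import Perceiver

model = Perceiver(n_ctx=512, width=768, layers=4, heads=12)
tokens = torch.randn(1, 512, 768)   # (batch, n_ctx, width)
out = model(tokens)                 # shape preserved: (1, 512, 768)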
step1x3d_geometry/models/autoencoders/transformers/utils.py ADDED
@@ -0,0 +1,21 @@
+ import torch.nn as nn
+
+
+ def init_linear(l, stddev):
+     nn.init.normal_(l.weight, std=stddev)
+     if l.bias is not None:
+         nn.init.constant_(l.bias, 0.0)
+
+
+ class MLP(nn.Module):
+     def __init__(self, *, width: int, init_scale: float):
+         super().__init__()
+         self.width = width
+         self.c_fc = nn.Linear(width, width * 4)
+         self.c_proj = nn.Linear(width * 4, width)
+         self.gelu = nn.GELU()
+         init_linear(self.c_fc, init_scale)
+         init_linear(self.c_proj, init_scale)
+
+     def forward(self, x):
+         return self.c_proj(self.gelu(self.c_fc(x)))
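A small, assumed usage sketch: MLP is the usual 4x-expansion feed-forward block, and init_linear draws weights from N(0, stddev^2) with zero bias.

import torch
from step1x3d_geometry.models.autoencoders.transformers.utils import MLP

mlp = MLP(width=256, init_scale=0.25)   # Linear(256 -> 1024) -> GELU -> Linear(1024 -> 256)
x = torch.randn(4, 64, 256)
assert mlp(x).shape == x.shape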
step1x3d_geometry/models/autoencoders/volume_decoders.py ADDED
@@ -0,0 +1,327 @@
+ # Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
+ # except for the third-party components listed below.
+ # Hunyuan 3D does not impose any additional limitations beyond what is outlined
+ # in the repsective licenses of these third-party components.
+ # Users must comply with all terms and conditions of original licenses of these third-party
+ # components and must ensure that the usage of the third party components adheres to
+ # all relevant laws and regulations.
+
+ # For avoidance of doubts, Hunyuan 3D means the large language models and
+ # their software and algorithms, including trained model weights, parameters (including
+ # optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
+ # fine-tuning enabling code and other elements of the foregoing made publicly available
+ # by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
+
+ from typing import Union, Tuple, List, Callable
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from einops import repeat
+ from tqdm import tqdm
+
+ cube_corners = torch.tensor(
+     [
+         [0, 0, 0],
+         [1, 0, 0],
+         [0, 1, 0],
+         [1, 1, 0],
+         [0, 0, 1],
+         [1, 0, 1],
+         [0, 1, 1],
+         [1, 1, 1],
+     ],
+     dtype=torch.int,
+ )
+
+
+ def extract_near_surface_volume_fn(input_tensor: torch.Tensor, alpha: float):
+     device = input_tensor.device
+     D = input_tensor.shape[0]
+     signed_val = 0.0
+
+     # Apply the offset and mask out invalid values
+     val = input_tensor + alpha
+     valid_mask = val > -9000  # -9000 is treated as the invalid-value sentinel
+
+     # Neighbor lookup that keeps the tensor dimensions unchanged
+     def get_neighbor(t, shift, axis):
+         """Shift along the given axis while keeping the shape unchanged."""
+         if shift == 0:
+             return t.clone()
+
+         # Determine the padding axis (the [D, D, D] input corresponds to the z, y, x axes)
+         pad_dims = [0, 0, 0, 0, 0, 0]  # format: [x_front, x_back, y_front, y_back, z_front, z_back]
+
+         # Set the padding according to the axis
+         if axis == 0:  # x axis (last dimension)
+             pad_idx = 0 if shift > 0 else 1
+             pad_dims[pad_idx] = abs(shift)
+         elif axis == 1:  # y axis (middle dimension)
+             pad_idx = 2 if shift > 0 else 3
+             pad_dims[pad_idx] = abs(shift)
+         elif axis == 2:  # z axis (first dimension)
+             pad_idx = 4 if shift > 0 else 5
+             pad_dims[pad_idx] = abs(shift)
+
+         # Apply the padding (add batch and channel dimensions so F.pad accepts the input)
+         padded = F.pad(
+             t.unsqueeze(0).unsqueeze(0), pad_dims[::-1], mode="replicate"
+         )  # the order is reversed to match F.pad's convention
+
+         # Build the dynamic slicing indices
+         slice_dims = [slice(None)] * 3  # start from full slices
+         if axis == 0:  # x axis (dim=2)
+             if shift > 0:
+                 slice_dims[0] = slice(shift, None)
+             else:
+                 slice_dims[0] = slice(None, shift)
+         elif axis == 1:  # y axis (dim=1)
+             if shift > 0:
+                 slice_dims[1] = slice(shift, None)
+             else:
+                 slice_dims[1] = slice(None, shift)
+         elif axis == 2:  # z axis (dim=0)
+             if shift > 0:
+                 slice_dims[2] = slice(shift, None)
+             else:
+                 slice_dims[2] = slice(None, shift)
+
+         # Apply the slices and restore the original dimensions
+         padded = padded.squeeze(0).squeeze(0)
+         sliced = padded[slice_dims]
+         return sliced
+
+     # Gather neighbors in every direction (keeping the dimensions consistent)
+     left = get_neighbor(val, 1, axis=0)  # x direction
+     right = get_neighbor(val, -1, axis=0)
+     back = get_neighbor(val, 1, axis=1)  # y direction
+     front = get_neighbor(val, -1, axis=1)
+     down = get_neighbor(val, 1, axis=2)  # z direction
+     up = get_neighbor(val, -1, axis=2)
+
+     # Handle invalid boundary values (torch.where keeps the dimensions consistent)
+     def safe_where(neighbor):
+         return torch.where(neighbor > -9000, neighbor, val)
+
+     left = safe_where(left)
+     right = safe_where(right)
+     back = safe_where(back)
+     front = safe_where(front)
+     down = safe_where(down)
+     up = safe_where(up)
+
+     # Compute sign consistency (cast to float32 for numerical safety)
+     sign = torch.sign(val.to(torch.float32))
+     neighbors_sign = torch.stack(
+         [
+             torch.sign(left.to(torch.float32)),
+             torch.sign(right.to(torch.float32)),
+             torch.sign(back.to(torch.float32)),
+             torch.sign(front.to(torch.float32)),
+             torch.sign(down.to(torch.float32)),
+             torch.sign(up.to(torch.float32)),
+         ],
+         dim=0,
+     )
+
+     # Check whether all neighbor signs agree with the center voxel
+     same_sign = torch.all(neighbors_sign == sign, dim=0)
+
+     # Build the final mask
+     mask = (~same_sign).to(torch.int32)
+     return mask * valid_mask.to(torch.int32)
+
+
+ def generate_dense_grid_points(
+     bbox_min: np.ndarray,
+     bbox_max: np.ndarray,
+     octree_resolution: int,
+     indexing: str = "ij",
+ ):
+     length = bbox_max - bbox_min
+     num_cells = octree_resolution
+
+     x = np.linspace(bbox_min[0], bbox_max[0], int(num_cells) + 1, dtype=np.float32)
+     y = np.linspace(bbox_min[1], bbox_max[1], int(num_cells) + 1, dtype=np.float32)
+     z = np.linspace(bbox_min[2], bbox_max[2], int(num_cells) + 1, dtype=np.float32)
+     [xs, ys, zs] = np.meshgrid(x, y, z, indexing=indexing)
+     xyz = np.stack((xs, ys, zs), axis=-1)
+     grid_size = [int(num_cells) + 1, int(num_cells) + 1, int(num_cells) + 1]
+
+     return xyz, grid_size, length
+
+
+ class VanillaVolumeDecoder:
+     @torch.no_grad()
+     def __call__(
+         self,
+         latents: torch.FloatTensor,
+         geo_decoder: Callable,
+         bounds: Union[Tuple[float], List[float], float] = 1.01,
+         num_chunks: int = 10000,
+         octree_resolution: int = 384,
+         enable_pbar: bool = True,
+         **kwargs,
+     ):
+         device = latents.device
+         dtype = latents.dtype
+         batch_size = latents.shape[0]
+
+         # 1. generate query points
+         if isinstance(bounds, float):
+             bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
+
+         bbox_min, bbox_max = np.array(bounds[0:3]), np.array(bounds[3:6])
+         xyz_samples, grid_size, length = generate_dense_grid_points(
+             bbox_min=bbox_min,
+             bbox_max=bbox_max,
+             octree_resolution=octree_resolution,
+             indexing="ij",
+         )
+         xyz_samples = (
+             torch.from_numpy(xyz_samples)
+             .to(device, dtype=dtype)
+             .contiguous()
+             .reshape(-1, 3)
+         )
+
+         # 2. latents to 3d volume
+         batch_features = []
+         for start in tqdm(
+             range(0, xyz_samples.shape[0], num_chunks),
+             desc="Volume Decoding",
+             disable=not enable_pbar,
+         ):
+             chunk_queries = xyz_samples[start : start + num_chunks, :]
+             chunk_queries = repeat(chunk_queries, "p c -> b p c", b=batch_size)
+             features = geo_decoder(queries=chunk_queries, latents=latents)
+             batch_features.append(features)
+
+         grid_features = torch.cat(batch_features, dim=1)
+         grid_logits, grid_features = grid_features[..., 0:1], grid_features[..., 1:]
+         grid_logits = grid_logits.view((batch_size, *grid_size)).float()
+
+         return grid_logits, xyz_samples, grid_features, None
+
+
+ class HierarchicalVolumeDecoder:
+     @torch.no_grad()
+     def __call__(
+         self,
+         latents: torch.FloatTensor,
+         geo_decoder: Callable,
+         bounds: Union[Tuple[float], List[float], float] = 1.01,
+         num_chunks: int = 65536,
+         mc_level: float = 0.0,
+         octree_resolution: int = 384,
+         min_resolution: int = 63,
+         enable_pbar: bool = True,
+         empty_value: float = float("nan"),
+         **kwargs,
+     ):
+         device = latents.device
+         dtype = latents.dtype
+
+         resolutions = []
+         if octree_resolution < min_resolution:
+             resolutions.append(octree_resolution)
+         while octree_resolution >= min_resolution:
+             resolutions.append(octree_resolution)
+             octree_resolution = octree_resolution // 2
+         resolutions.reverse()
+
+         # 1. generate query points
+         if isinstance(bounds, float):
+             bounds = [-bounds, -bounds, -bounds, bounds, bounds, bounds]
+         bbox_min = np.array(bounds[0:3])
+         bbox_max = np.array(bounds[3:6])
+         bbox_size = bbox_max - bbox_min
+
+         xyz_samples, grid_size, length = generate_dense_grid_points(
+             bbox_min=bbox_min,
+             bbox_max=bbox_max,
+             octree_resolution=resolutions[0],
+             indexing="ij",
+         )
+
+         dilate = nn.Conv3d(1, 1, 3, padding=1, bias=False, device=device, dtype=dtype)
+         dilate.weight = torch.nn.Parameter(
+             torch.ones(dilate.weight.shape, dtype=dtype, device=device)
+         )
+
+         grid_size = np.array(grid_size)
+         xyz_samples = (
+             torch.from_numpy(xyz_samples)
+             .to(device, dtype=dtype)
+             .contiguous()
+             .reshape(-1, 3)
+         )
+
+         # 2. latents to 3d volume
+         batch_features = []
+         batch_size = latents.shape[0]
+         for start in tqdm(
+             range(0, xyz_samples.shape[0], num_chunks),
+             desc=f"Hierarchical Volume Decoding [r{resolutions[0] + 1}]",
+             disable=not enable_pbar,
+         ):
+             queries = xyz_samples[start : start + num_chunks, :]
+             batch_queries = repeat(queries, "p c -> b p c", b=batch_size)
+             features = geo_decoder(queries=batch_queries, latents=latents)
+             batch_features.append(features)
+
+         grid_features = torch.cat(batch_features, dim=1).view(
+             (batch_size, grid_size[0], grid_size[1], grid_size[2], -1)
+         )
+         grid_logits = grid_features[..., 0]  # assume the first element is the logits
+
+         for octree_depth_now in resolutions[1:]:
+             grid_size = np.array([octree_depth_now + 1] * 3)
+             resolution = bbox_size / octree_depth_now
+             next_index = torch.zeros(tuple(grid_size), dtype=dtype, device=device)
+             next_logits = torch.full(
+                 next_index.shape, -10000.0, dtype=dtype, device=device
+             )
+             curr_points = extract_near_surface_volume_fn(
+                 grid_logits.squeeze(0), mc_level
+             )
+             curr_points += grid_logits.squeeze(0).abs() < 0.95
+
+             if octree_depth_now == resolutions[-1]:
+                 expand_num = 0
+             else:
+                 expand_num = 1
+             for i in range(expand_num):
+                 curr_points = dilate(curr_points.unsqueeze(0).to(dtype)).squeeze(0)
+             (cidx_x, cidx_y, cidx_z) = torch.where(curr_points > 0)
+             next_index[cidx_x * 2, cidx_y * 2, cidx_z * 2] = 1
+             for i in range(2 - expand_num):
+                 next_index = dilate(next_index.unsqueeze(0)).squeeze(0)
+             nidx = torch.where(next_index > 0)
+
+             next_points = torch.stack(nidx, dim=1)
+             next_points = next_points * torch.tensor(
+                 resolution, dtype=latents.dtype, device=device
+             ) + torch.tensor(bbox_min, dtype=latents.dtype, device=device)
+
+             batch_features = []
+             for start in tqdm(
+                 range(0, next_points.shape[0], num_chunks),
+                 desc=f"Hierarchical Volume Decoding [r{octree_depth_now + 1}]",
+                 disable=not enable_pbar,
+             ):
+                 queries = next_points[start : start + num_chunks, :]
+                 batch_queries = repeat(queries, "p c -> b p c", b=batch_size)
+                 features = geo_decoder(
+                     queries=batch_queries.to(latents.dtype), latents=latents
+                 )
+                 batch_features.append(features)
+             grid_features = torch.cat(batch_features, dim=1)
+             grid_logits = grid_features[..., 0:1]
+             next_logits[nidx] = grid_logits[0, ..., 0]
+             grid_logits = next_logits.unsqueeze(0)
+         grid_logits[grid_logits == -10000.0] = empty_value
+
+         return grid_logits
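An illustrative sketch of driving VanillaVolumeDecoder: `dummy_geo_decoder` is a hypothetical stand-in for the real geometry decoder, which maps (queries, latents) to per-point features whose first channel is the occupancy/SDF logit, and the latent width of 64 is arbitrary.

import torch
from step1x3d_geometry.models.autoencoders.volume_decoders import VanillaVolumeDecoder

def dummy_geo_decoder(queries, latents):
    # (B, P, 3) query points -> (B, P, 1) fake logits; a real decoder conditions on `latents`
    return torch.randn(*queries.shape[:2], 1, device=queries.device, dtype=queries.dtype)

latents = torch.randn(1, 512, 64)
decoder = VanillaVolumeDecoder()
grid_logits, xyz, feats, _ = decoder(
    latents, dummy_geo_decoder, bounds=1.01, octree_resolution=64, num_chunks=8192
)
# grid_logits: (1, 65, 65, 65) logits on the dense (octree_resolution + 1)^3 grid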
step1x3d_geometry/models/conditional_encoders/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from . import (
+     dinov2_encoder,
+     dinov2_clip_encoder,
+     t5_encoder,
+     label_encoder,
+ )
step1x3d_geometry/models/conditional_encoders/base.py ADDED
@@ -0,0 +1,202 @@
+ import random
+ import torch
+ import torch.nn as nn
+ import numpy as np
+ from PIL import Image
+ from dataclasses import dataclass
+ from torchvision.transforms import Normalize
+ from torchvision.transforms import InterpolationMode
+ from torchvision.transforms.transforms import _interpolation_modes_from_int
+
+ from transformers import CLIPModel, CLIPTokenizer, CLIPImageProcessor
+ from transformers.utils import ModelOutput
+ from typing import Iterable, Optional, Union, List
+
+ import step1x3d_geometry
+ from step1x3d_geometry.utils.base import BaseModule
+ from step1x3d_geometry.utils.typing import *
+
+ ImageType = Union[np.ndarray, torch.Tensor, Image.Image]
+
+
+ class BaseVisualEncoder(BaseModule):
+     @dataclass
+     class Config(BaseModule.Config):
+         pretrained_model_name_or_path: Optional[str] = (
+             None  # the pretrained model name or path
+         )
+
+         encode_camera: bool = False  # whether to encode camera
+         camera_embeds_type: str = "sincos"  # the type of camera embeds
+         camera_embeds_dim: Optional[int] = None  # the dimension of camera embeds
+         n_views: int = 1  # the number of views
+
+         empty_embeds_ratio: float = 0.1  # the ratio of empty embeds
+         normalize_embeds: bool = False  # whether to normalize the embeds
+         zero_uncond_embeds: bool = True
+
+     cfg: Config
+
+     def configure(self) -> None:
+         super().configure()
+
+         if self.cfg.encode_camera:
+             self.distance = 1.0
+             self.register_buffer(
+                 "cameras",
+                 torch.as_tensor(
+                     [
+                         [
+                             [1, 0, 0, 0],
+                             [0, 0, -1, -self.distance],
+                             [0, 1, 0, 0],
+                             [0, 0, 0, 1],
+                         ],  # front to back
+                         [
+                             [0, 0, 1, self.distance],
+                             [1, 0, 0, 0],
+                             [0, 1, 0, 0],
+                             [0, 0, 0, 1],
+                         ],  # right to left
+                         [
+                             [-1, 0, 0, 0],
+                             [0, 0, 1, self.distance],
+                             [0, 1, 0, 0],
+                             [0, 0, 0, 1],
+                         ],  # back to front
+                         [
+                             [0, 0, -1, -self.distance],
+                             [-1, 0, 0, 0],
+                             [0, 1, 0, 0],
+                             [0, 0, 0, 1],
+                         ],  # left to right
+                     ],
+                     dtype=torch.float32,
+                 ),
+             )
+
+     def encode_image(
+         self,
+         images: Iterable[Optional[ImageType]],
+         camera_embeds: Optional[torch.Tensor] = None,
+         **kwargs,
+     ) -> torch.FloatTensor:
+         raise NotImplementedError
+
+     def encode_camera(self, c2ws: torch.Tensor):
+         if self.cfg.camera_embeds_type == "sincos":
+             assert (
+                 c2ws.shape[-1] == 4 and c2ws.shape[-2] == 4
+             ), f"Invalid c2ws shape: {c2ws.shape}"
+             c2ws = c2ws.view(-1, 16)
+             return torch.cat([torch.sin(c2ws), torch.cos(c2ws)], dim=-1)
+         else:
+             raise NotImplementedError(
+                 f"Unknown camera_embeds_type: {self.cfg.camera_embeds_type}"
+             )
+
+     def forward(self, batch):
+         assert (
+             "image" in batch or "mvimages" in batch
+         ), "image or mvimages is required for visual embeds"
+         if batch["image"].dim() == 5:
+             bs = batch["image"].shape[0] * batch["image"].shape[1]
+         else:
+             bs = batch["image"].shape[0]
+
+         if random.random() < self.cfg.empty_embeds_ratio:
+             if "image" in batch or "image_embeds" in batch:
+                 visual_embeds = self.empty_image_embeds.repeat(bs, 1, 1)
+             elif "mvimages" in batch or "mvimage_embeds" in batch:
+                 visual_embeds = self.empty_image_embeds.unsqueeze(1).repeat(bs, 1, 1, 1)
+         else:
+             # for visual inputs
+             if "image" in batch:
+                 if self.cfg.encode_camera:
+                     visual_embeds = self.encode_image(
+                         batch["image"], cameras=batch["c2w"]
+                     )
+                 else:
+                     visual_embeds = self.encode_image(batch["image"])
+             elif "mvimages" in batch:
+                 n_views = batch["mvimages"].shape[1]
+                 if self.cfg.encode_camera:
+                     visual_embeds = self.encode_image(
+                         batch["mvimages"].view(-1, *batch["mvimages"].shape[-3:]),
+                         cameras=batch["c2ws"],
+                     ).view(bs, n_views, *self.empty_image_embeds.shape[-2:])
+                 else:
+                     visual_embeds = self.encode_image(
+                         batch["mvimages"].view(-1, *batch["mvimages"].shape[-3:])
+                     ).view(bs, n_views, *self.empty_image_embeds.shape[-2:])
+
+         if self.cfg.normalize_embeds:  # post-process the visual embeds
+             visual_embeds = visual_embeds / visual_embeds.norm(dim=-1, keepdim=True)
+
+         return visual_embeds
+
+
+ class BaseCaptionEncoder(BaseModule):
+     @dataclass
+     class Config(BaseModule.Config):
+         pretrained_model_name_or_path: Optional[str] = (
+             None  # the pretrained model name or path
+         )
+
+         text_max_length: int = 77
+
+         empty_embeds_ratio: float = 0.1  # the ratio of empty embeds
+         normalize_embeds: bool = False  # whether to normalize the embeds
+         zero_uncond_embeds: bool = True
+
+     cfg: Config
+
+     def configure(self) -> None:
+         super().configure()
+
+     def forward(self, batch, force_drop_ids=None):
+         assert "caption" in batch, "caption is required for caption embeds"
+
+         bs = len(batch["label"])
+         if random.random() < self.cfg.empty_embeds_ratio:
+             caption_embeds = self.empty_text_embeds.repeat(bs, 1, 1)
+         else:
+             caption_embeds = self.encode_text(batch["caption"])
+
+         if self.cfg.normalize_embeds:  # post-process the caption embeds
+             caption_embeds = caption_embeds / caption_embeds.norm(dim=-1, keepdim=True)
+
+         return caption_embeds
+
+
+ class BaseLabelEncoder(BaseModule):
+     @dataclass
+     class Config(BaseModule.Config):
+         pretrained_model_name_or_path: Optional[str] = (
+             None  # the pretrained model name or path
+         )
+
+         hidden_size: int = 1024
+
+         empty_embeds_ratio: float = 0.1  # the ratio of empty embeds
+         normalize_embeds: bool = False  # whether to normalize the embeds
+         zero_uncond_embeds: bool = True
+
+     cfg: Config
+
+     def configure(self) -> None:
+         super().configure()
+
+     def forward(self, batch, force_drop_ids=None):
+         assert "label" in batch, "label is required for label embeds"
+
+         bs = len(batch["label"])
+         if random.random() < self.cfg.empty_embeds_ratio:
+             label_embeds = self.empty_label_embeds.repeat(bs, 1, 1)
+         else:
+             label_embeds = self.encode_label(batch["label"])
+
+         if self.cfg.normalize_embeds:  # post-process the label embeds
+             label_embeds = label_embeds / label_embeds.norm(dim=-1, keepdim=True)
+
+         return label_embeds
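For reference, with camera_embeds_type == "sincos" each 4x4 camera-to-world matrix is flattened to 16 values and embedded as [sin, cos], giving 32 dimensions per view; a minimal sketch mirroring encode_camera above:

import torch

c2ws = torch.eye(4).unsqueeze(0)                      # (1, 4, 4) camera-to-world pose
flat = c2ws.view(-1, 16)
camera_embeds = torch.cat([torch.sin(flat), torch.cos(flat)], dim=-1)
assert camera_embeds.shape == (1, 32)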
step1x3d_geometry/models/conditional_encoders/clip/modeling_clip.py ADDED
@@ -0,0 +1,1597 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch CLIP model."""
16
+
17
+
18
+ from dataclasses import dataclass
19
+ from typing import Any, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.utils.checkpoint
23
+ from torch import nn
24
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
25
+
26
+ from transformers.activations import ACT2FN
27
+ from transformers.modeling_attn_mask_utils import (
28
+ _create_4d_causal_attention_mask,
29
+ _prepare_4d_attention_mask,
30
+ )
31
+ from transformers.modeling_outputs import (
32
+ BaseModelOutput,
33
+ BaseModelOutputWithPooling,
34
+ ImageClassifierOutput,
35
+ )
36
+ from transformers.modeling_utils import PreTrainedModel
37
+ from transformers.utils import (
38
+ ModelOutput,
39
+ add_code_sample_docstrings,
40
+ add_start_docstrings,
41
+ add_start_docstrings_to_model_forward,
42
+ logging,
43
+ replace_return_docstrings,
44
+ )
45
+ from transformers.models.clip.configuration_clip import (
46
+ CLIPConfig,
47
+ CLIPTextConfig,
48
+ CLIPVisionConfig,
49
+ )
50
+
51
+
52
+ logger = logging.get_logger(__name__)
53
+
54
+ # General docstring
55
+ _CONFIG_FOR_DOC = "CLIPConfig"
56
+ _CHECKPOINT_FOR_DOC = "openai/clip-vit-base-patch32"
57
+
58
+ # Image classification docstring
59
+ _IMAGE_CLASS_CHECKPOINT = "openai/clip-vit-base-patch32"
60
+ _IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_0"
61
+
62
+
63
+ # contrastive loss function, adapted from
64
+ # https://sachinruk.github.io/blog/2021-03-07-clip.html
65
+ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
66
+ return nn.functional.cross_entropy(
67
+ logits, torch.arange(len(logits), device=logits.device)
68
+ )
69
+
70
+
71
+ def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
72
+ caption_loss = contrastive_loss(similarity)
73
+ image_loss = contrastive_loss(similarity.t())
74
+ return (caption_loss + image_loss) / 2.0
75
+
76
+
77
+ @dataclass
78
+ class CLIPVisionModelOutput(ModelOutput):
79
+ """
80
+ Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
81
+
82
+ Args:
83
+ image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
84
+ The image embeddings obtained by applying the projection layer to the pooler_output.
85
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
86
+ Sequence of hidden-states at the output of the last layer of the model.
87
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
88
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
89
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
90
+
91
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
92
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
93
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
94
+ sequence_length)`.
95
+
96
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
97
+ heads.
98
+ """
99
+
100
+ image_embeds: Optional[torch.FloatTensor] = None
101
+ last_hidden_state: torch.FloatTensor = None
102
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
103
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
104
+
105
+
106
+ @dataclass
107
+ class CLIPTextModelOutput(ModelOutput):
108
+ """
109
+ Base class for text model's outputs that also contains a pooling of the last hidden states.
110
+
111
+ Args:
112
+ text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
113
+ The text embeddings obtained by applying the projection layer to the pooler_output.
114
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
115
+ Sequence of hidden-states at the output of the last layer of the model.
116
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
117
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
118
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
119
+
120
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
121
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
122
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
123
+ sequence_length)`.
124
+
125
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
126
+ heads.
127
+ """
128
+
129
+ text_embeds: Optional[torch.FloatTensor] = None
130
+ last_hidden_state: torch.FloatTensor = None
131
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
132
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
133
+
134
+
135
+ @dataclass
136
+ class CLIPOutput(ModelOutput):
137
+ """
138
+ Args:
139
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
140
+ Contrastive loss for image-text similarity.
141
+ logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
142
+ The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
143
+ similarity scores.
144
+ logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
145
+ The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
146
+ similarity scores.
147
+ text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
148
+ The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
149
+ image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
150
+ The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
151
+ text_model_output(`BaseModelOutputWithPooling`):
152
+ The output of the [`CLIPTextModel`].
153
+ vision_model_output(`BaseModelOutputWithPooling`):
154
+ The output of the [`CLIPVisionModel`].
155
+ """
156
+
157
+ loss: Optional[torch.FloatTensor] = None
158
+ logits_per_image: torch.FloatTensor = None
159
+ logits_per_text: torch.FloatTensor = None
160
+ text_embeds: torch.FloatTensor = None
161
+ image_embeds: torch.FloatTensor = None
162
+ text_model_output: BaseModelOutputWithPooling = None
163
+ vision_model_output: BaseModelOutputWithPooling = None
164
+
165
+ def to_tuple(self) -> Tuple[Any]:
166
+ return tuple(
167
+ (
168
+ self[k]
169
+ if k not in ["text_model_output", "vision_model_output"]
170
+ else getattr(self, k).to_tuple()
171
+ )
172
+ for k in self.keys()
173
+ )
174
+
175
+
176
+ class CLIPVisionEmbeddings(nn.Module):
177
+ def __init__(self, config: CLIPVisionConfig):
178
+ super().__init__()
179
+ self.config = config
180
+ self.embed_dim = config.hidden_size
181
+ self.image_size = config.image_size
182
+ self.patch_size = config.patch_size
183
+
184
+ self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
185
+
186
+ self.patch_embedding = nn.Conv2d(
187
+ in_channels=config.num_channels,
188
+ out_channels=self.embed_dim,
189
+ kernel_size=self.patch_size,
190
+ stride=self.patch_size,
191
+ bias=False,
192
+ )
193
+
194
+ self.num_patches = (self.image_size // self.patch_size) ** 2
195
+ self.num_positions = self.num_patches + 1
196
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
197
+ self.register_buffer(
198
+ "position_ids",
199
+ torch.arange(self.num_positions).expand((1, -1)),
200
+ persistent=False,
201
+ )
202
+
203
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
204
+ batch_size = pixel_values.shape[0]
205
+ target_dtype = self.patch_embedding.weight.dtype
206
+ patch_embeds = self.patch_embedding(
207
+ pixel_values.to(dtype=target_dtype)
208
+ ) # shape = [*, width, grid, grid]
209
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
210
+
211
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1)
212
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
213
+ embeddings = embeddings + self.position_embedding(self.position_ids)
214
+ return embeddings
215
+
216
+
217
+ class CLIPTextEmbeddings(nn.Module):
218
+ def __init__(self, config: CLIPTextConfig):
219
+ super().__init__()
220
+ embed_dim = config.hidden_size
221
+
222
+ self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
223
+ self.position_embedding = nn.Embedding(
224
+ config.max_position_embeddings, embed_dim
225
+ )
226
+
227
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
228
+ self.register_buffer(
229
+ "position_ids",
230
+ torch.arange(config.max_position_embeddings).expand((1, -1)),
231
+ persistent=False,
232
+ )
233
+
234
+ def forward(
235
+ self,
236
+ input_ids: Optional[torch.LongTensor] = None,
237
+ position_ids: Optional[torch.LongTensor] = None,
238
+ inputs_embeds: Optional[torch.FloatTensor] = None,
239
+ ) -> torch.Tensor:
240
+ seq_length = (
241
+ input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
242
+ )
243
+
244
+ if position_ids is None:
245
+ position_ids = self.position_ids[:, :seq_length]
246
+
247
+ if inputs_embeds is None:
248
+ inputs_embeds = self.token_embedding(input_ids)
249
+
250
+ position_embeddings = self.position_embedding(position_ids)
251
+ embeddings = inputs_embeds + position_embeddings
252
+
253
+ return embeddings
254
+
255
+
256
+ class CLIPAttention(nn.Module):
257
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
258
+
259
+ def __init__(self, config):
260
+ super().__init__()
261
+ self.config = config
262
+ self.embed_dim = config.hidden_size
263
+ self.num_heads = config.num_attention_heads
264
+ self.head_dim = self.embed_dim // self.num_heads
265
+ if self.head_dim * self.num_heads != self.embed_dim:
266
+ raise ValueError(
267
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
268
+ f" {self.num_heads})."
269
+ )
270
+ self.scale = self.head_dim**-0.5
271
+ self.dropout = config.attention_dropout
272
+
273
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
274
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
275
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
276
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
277
+
278
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
279
+ return (
280
+ tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
281
+ .transpose(1, 2)
282
+ .contiguous()
283
+ )
284
+
285
+ def forward(
286
+ self,
287
+ hidden_states: torch.Tensor,
288
+ attention_mask: Optional[torch.Tensor] = None,
289
+ causal_attention_mask: Optional[torch.Tensor] = None,
290
+ output_attentions: Optional[bool] = False,
291
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
292
+ """Input shape: Batch x Time x Channel"""
293
+
294
+ bsz, tgt_len, embed_dim = hidden_states.size()
295
+
296
+ # get query proj
297
+ query_states = self.q_proj(hidden_states) * self.scale
298
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
299
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
300
+
301
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
302
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
303
+ key_states = key_states.view(*proj_shape)
304
+ value_states = value_states.view(*proj_shape)
305
+
306
+ src_len = key_states.size(1)
307
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
308
+
309
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
310
+ raise ValueError(
311
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
312
+ f" {attn_weights.size()}"
313
+ )
314
+
315
+ # apply the causal_attention_mask first
316
+ if causal_attention_mask is not None:
317
+ if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
318
+ raise ValueError(
319
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
320
+ f" {causal_attention_mask.size()}"
321
+ )
322
+ attn_weights = (
323
+ attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
324
+ + causal_attention_mask
325
+ )
326
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
327
+
328
+ if attention_mask is not None:
329
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
330
+ raise ValueError(
331
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
332
+ )
333
+ attn_weights = (
334
+ attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
335
+ + attention_mask
336
+ )
337
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
338
+
339
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
340
+
341
+ if output_attentions:
342
+ # this operation is a bit akward, but it's required to
343
+ # make sure that attn_weights keeps its gradient.
344
+ # In order to do so, attn_weights have to reshaped
345
+ # twice and have to be reused in the following
346
+ attn_weights_reshaped = attn_weights.view(
347
+ bsz, self.num_heads, tgt_len, src_len
348
+ )
349
+ attn_weights = attn_weights_reshaped.view(
350
+ bsz * self.num_heads, tgt_len, src_len
351
+ )
352
+ else:
353
+ attn_weights_reshaped = None
354
+
355
+ attn_probs = nn.functional.dropout(
356
+ attn_weights, p=self.dropout, training=self.training
357
+ )
358
+
359
+ attn_output = torch.bmm(attn_probs, value_states)
360
+
361
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
362
+ raise ValueError(
363
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
364
+ f" {attn_output.size()}"
365
+ )
366
+
367
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
368
+ attn_output = attn_output.transpose(1, 2)
369
+ attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
370
+
371
+ attn_output = self.out_proj(attn_output)
372
+
373
+ return attn_output, attn_weights_reshaped
374
+
375
+
376
+ class CLIPMLP(nn.Module):
377
+ def __init__(self, config):
378
+ super().__init__()
379
+ self.config = config
380
+ self.activation_fn = ACT2FN[config.hidden_act]
381
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
382
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
383
+
384
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
385
+ hidden_states = self.fc1(hidden_states)
386
+ hidden_states = self.activation_fn(hidden_states)
387
+ hidden_states = self.fc2(hidden_states)
388
+ return hidden_states
389
+
390
+
391
+ class CLIPEncoderLayer(nn.Module):
392
+ def __init__(self, config: CLIPConfig):
393
+ super().__init__()
394
+ self.embed_dim = config.hidden_size
395
+ self.self_attn = CLIPAttention(config)
396
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
397
+ self.mlp = CLIPMLP(config)
398
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
399
+
400
+ def forward(
401
+ self,
402
+ hidden_states: torch.Tensor,
403
+ attention_mask: torch.Tensor,
404
+ causal_attention_mask: torch.Tensor,
405
+ output_attentions: Optional[bool] = False,
406
+ ) -> Tuple[torch.FloatTensor]:
407
+ """
408
+ Args:
409
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
410
+ attention_mask (`torch.FloatTensor`): attention mask of size
411
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
412
+ `(config.encoder_attention_heads,)`.
413
+ output_attentions (`bool`, *optional*):
414
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
415
+ returned tensors for more detail.
416
+ """
417
+ residual = hidden_states
418
+
419
+ hidden_states = self.layer_norm1(hidden_states)
420
+ hidden_states, attn_weights = self.self_attn(
421
+ hidden_states=hidden_states,
422
+ attention_mask=attention_mask,
423
+ causal_attention_mask=causal_attention_mask,
424
+ output_attentions=output_attentions,
425
+ )
426
+ hidden_states = residual + hidden_states
427
+
428
+ residual = hidden_states
429
+ hidden_states = self.layer_norm2(hidden_states)
430
+ hidden_states = self.mlp(hidden_states)
431
+ hidden_states = residual + hidden_states
432
+
433
+ outputs = (hidden_states,)
434
+
435
+ if output_attentions:
436
+ outputs += (attn_weights,)
437
+
438
+ return outputs
439
+
440
+
441
+ class CLIPPreTrainedModel(PreTrainedModel):
442
+ """
443
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
444
+ models.
445
+ """
446
+
447
+ config_class = CLIPConfig
448
+ base_model_prefix = "clip"
449
+ supports_gradient_checkpointing = True
450
+
451
+ def _init_weights(self, module):
452
+ """Initialize the weights"""
453
+ factor = self.config.initializer_factor
454
+ if isinstance(module, CLIPTextEmbeddings):
455
+ module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
456
+ module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
457
+ elif isinstance(module, CLIPVisionEmbeddings):
458
+ factor = self.config.initializer_factor
459
+ nn.init.normal_(
460
+ module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor
461
+ )
462
+ nn.init.normal_(
463
+ module.patch_embedding.weight,
464
+ std=module.config.initializer_range * factor,
465
+ )
466
+ nn.init.normal_(
467
+ module.position_embedding.weight,
468
+ std=module.config.initializer_range * factor,
469
+ )
470
+ elif isinstance(module, CLIPAttention):
471
+ factor = self.config.initializer_factor
472
+ in_proj_std = (
473
+ (module.embed_dim**-0.5)
474
+ * ((2 * module.config.num_hidden_layers) ** -0.5)
475
+ * factor
476
+ )
477
+ out_proj_std = (module.embed_dim**-0.5) * factor
478
+ nn.init.normal_(module.q_proj.weight, std=in_proj_std)
479
+ nn.init.normal_(module.k_proj.weight, std=in_proj_std)
480
+ nn.init.normal_(module.v_proj.weight, std=in_proj_std)
481
+ nn.init.normal_(module.out_proj.weight, std=out_proj_std)
482
+ elif isinstance(module, CLIPMLP):
483
+ factor = self.config.initializer_factor
484
+ in_proj_std = (
485
+ (module.config.hidden_size**-0.5)
486
+ * ((2 * module.config.num_hidden_layers) ** -0.5)
487
+ * factor
488
+ )
489
+ fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
490
+ nn.init.normal_(module.fc1.weight, std=fc_std)
491
+ nn.init.normal_(module.fc2.weight, std=in_proj_std)
492
+ elif isinstance(module, CLIPModel):
493
+ nn.init.normal_(
494
+ module.text_projection.weight,
495
+ std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
496
+ )
497
+ nn.init.normal_(
498
+ module.visual_projection.weight,
499
+ std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
500
+ )
501
+ elif isinstance(module, CLIPVisionModelWithProjection):
502
+ nn.init.normal_(
503
+ module.visual_projection.weight,
504
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
505
+ )
506
+ elif isinstance(module, CLIPTextModelWithProjection):
507
+ nn.init.normal_(
508
+ module.text_projection.weight,
509
+ std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
510
+ )
511
+ elif isinstance(module, CLIPForImageClassification):
512
+ nn.init.normal_(
513
+ module.classifier.weight,
514
+ std=self.config.vision_config.hidden_size**-0.5
515
+ * self.config.initializer_factor,
516
+ )
517
+
518
+ if isinstance(module, nn.LayerNorm):
519
+ module.bias.data.zero_()
520
+ module.weight.data.fill_(1.0)
521
+ if isinstance(module, nn.Linear) and module.bias is not None:
522
+ module.bias.data.zero_()
523
+
524
+
525
+ CLIP_START_DOCSTRING = r"""
526
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
527
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
528
+ etc.)
529
+
530
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
531
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
532
+ and behavior.
533
+
534
+ Parameters:
535
+ config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
536
+ Initializing with a config file does not load the weights associated with the model, only the
537
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
538
+ """
539
+
540
+ CLIP_TEXT_INPUTS_DOCSTRING = r"""
541
+ Args:
542
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
543
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
544
+ it.
545
+
546
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
547
+ [`PreTrainedTokenizer.__call__`] for details.
548
+
549
+ [What are input IDs?](../glossary#input-ids)
550
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
551
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
552
+
553
+ - 1 for tokens that are **not masked**,
554
+ - 0 for tokens that are **masked**.
555
+
556
+ [What are attention masks?](../glossary#attention-mask)
557
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
558
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
559
+ config.max_position_embeddings - 1]`.
560
+
561
+ [What are position IDs?](../glossary#position-ids)
562
+ output_attentions (`bool`, *optional*):
563
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
564
+ tensors for more detail.
565
+ output_hidden_states (`bool`, *optional*):
566
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
567
+ more detail.
568
+ return_dict (`bool`, *optional*):
569
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
570
+ """
571
+
572
+ CLIP_VISION_INPUTS_DOCSTRING = r"""
573
+ Args:
574
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
575
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
576
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
577
+ output_attentions (`bool`, *optional*):
578
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
579
+ tensors for more detail.
580
+ output_hidden_states (`bool`, *optional*):
581
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
582
+ more detail.
583
+ return_dict (`bool`, *optional*):
584
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
585
+ """
586
+
587
+ CLIP_INPUTS_DOCSTRING = r"""
588
+ Args:
589
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
590
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
591
+ it.
592
+
593
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
594
+ [`PreTrainedTokenizer.__call__`] for details.
595
+
596
+ [What are input IDs?](../glossary#input-ids)
597
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
598
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
599
+
600
+ - 1 for tokens that are **not masked**,
601
+ - 0 for tokens that are **masked**.
602
+
603
+ [What are attention masks?](../glossary#attention-mask)
604
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
605
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
606
+ config.max_position_embeddings - 1]`.
607
+
608
+ [What are position IDs?](../glossary#position-ids)
609
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
610
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
611
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
612
+ return_loss (`bool`, *optional*):
613
+ Whether or not to return the contrastive loss.
614
+ output_attentions (`bool`, *optional*):
615
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
616
+ tensors for more detail.
617
+ output_hidden_states (`bool`, *optional*):
618
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
619
+ more detail.
620
+ return_dict (`bool`, *optional*):
621
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
622
+ """
623
+
624
+
625
+ class CLIPEncoder(nn.Module):
626
+ """
627
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
628
+ [`CLIPEncoderLayer`].
629
+
630
+ Args:
631
+ config: CLIPConfig
632
+ """
633
+
634
+ def __init__(self, config: CLIPConfig):
635
+ super().__init__()
636
+ self.config = config
637
+ self.layers = nn.ModuleList(
638
+ [CLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]
639
+ )
640
+ self.gradient_checkpointing = False
641
+
642
+ def forward(
643
+ self,
644
+ inputs_embeds,
645
+ attention_mask: Optional[torch.Tensor] = None,
646
+ causal_attention_mask: Optional[torch.Tensor] = None,
647
+ output_attentions: Optional[bool] = None,
648
+ output_hidden_states: Optional[bool] = None,
649
+ return_dict: Optional[bool] = None,
650
+ ) -> Union[Tuple, BaseModelOutput]:
651
+ r"""
652
+ Args:
653
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
654
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
655
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
656
+ than the model's internal embedding lookup matrix.
657
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
658
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
659
+
660
+ - 1 for tokens that are **not masked**,
661
+ - 0 for tokens that are **masked**.
662
+
663
+ [What are attention masks?](../glossary#attention-mask)
664
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
665
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
666
+
667
+ - 1 for tokens that are **not masked**,
668
+ - 0 for tokens that are **masked**.
669
+
670
+ [What are attention masks?](../glossary#attention-mask)
671
+ output_attentions (`bool`, *optional*):
672
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
673
+ returned tensors for more detail.
674
+ output_hidden_states (`bool`, *optional*):
675
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
676
+ for more detail.
677
+ return_dict (`bool`, *optional*):
678
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
679
+ """
680
+ output_attentions = (
681
+ output_attentions
682
+ if output_attentions is not None
683
+ else self.config.output_attentions
684
+ )
685
+ output_hidden_states = (
686
+ output_hidden_states
687
+ if output_hidden_states is not None
688
+ else self.config.output_hidden_states
689
+ )
690
+ return_dict = (
691
+ return_dict if return_dict is not None else self.config.use_return_dict
692
+ )
693
+
694
+ encoder_states = () if output_hidden_states else None
695
+ all_attentions = () if output_attentions else None
696
+
697
+ hidden_states = inputs_embeds
698
+ for idx, encoder_layer in enumerate(self.layers):
699
+ if output_hidden_states:
700
+ encoder_states = encoder_states + (hidden_states,)
701
+ if self.gradient_checkpointing and self.training:
702
+ layer_outputs = self._gradient_checkpointing_func(
703
+ encoder_layer.__call__,
704
+ hidden_states,
705
+ attention_mask,
706
+ causal_attention_mask,
707
+ output_attentions,
708
+ )
709
+ else:
710
+ layer_outputs = encoder_layer(
711
+ hidden_states,
712
+ attention_mask,
713
+ causal_attention_mask,
714
+ output_attentions=output_attentions,
715
+ )
716
+
717
+ hidden_states = layer_outputs[0]
718
+
719
+ if output_attentions:
720
+ all_attentions = all_attentions + (layer_outputs[1],)
721
+
722
+ if output_hidden_states:
723
+ encoder_states = encoder_states + (hidden_states,)
724
+
725
+ if not return_dict:
726
+ return tuple(
727
+ v
728
+ for v in [hidden_states, encoder_states, all_attentions]
729
+ if v is not None
730
+ )
731
+ return BaseModelOutput(
732
+ last_hidden_state=hidden_states,
733
+ hidden_states=encoder_states,
734
+ attentions=all_attentions,
735
+ )
736
+
737
+
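A note on the encoder above: `encoder_states` is appended before every layer and once more after the last one, so `output_hidden_states=True` yields `num_hidden_layers + 1` tensors (the embedding output plus one per layer). Below is a minimal sketch of reading an intermediate layer's features; the checkpoint name is only an illustrative example, not something this file prescribes.

```python
# Sketch: inspect the per-layer hidden states returned by the CLIP encoder.
# Assumes the public "openai/clip-vit-base-patch32" checkpoint; any CLIP
# checkpoint exposing the same API would behave identically.
import torch
from transformers import AutoTokenizer, CLIPTextModel

tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")

inputs = tokenizer(["a photo of a cat"], return_tensors="pt")
with torch.no_grad():
    out = model(**inputs, output_hidden_states=True)

# num_hidden_layers + 1 entries: the embedding output plus one per layer.
print(len(out.hidden_states), model.config.num_hidden_layers + 1)

penultimate = out.hidden_states[-2]  # often used as a conditioning feature
print(penultimate.shape)             # (batch, seq_len, hidden_size)
```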
738
+ class CLIPTextTransformer(nn.Module):
739
+ def __init__(self, config: CLIPTextConfig):
740
+ super().__init__()
741
+ self.config = config
742
+ embed_dim = config.hidden_size
743
+ self.embeddings = CLIPTextEmbeddings(config)
744
+ self.encoder = CLIPEncoder(config)
745
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
746
+
747
+ # For `pooled_output` computation
748
+ self.eos_token_id = config.eos_token_id
749
+
750
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
751
+ @replace_return_docstrings(
752
+ output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig
753
+ )
754
+ def forward(
755
+ self,
756
+ input_ids: Optional[torch.Tensor] = None,
757
+ attention_mask: Optional[torch.Tensor] = None,
758
+ position_ids: Optional[torch.Tensor] = None,
759
+ output_attentions: Optional[bool] = None,
760
+ output_hidden_states: Optional[bool] = None,
761
+ return_dict: Optional[bool] = None,
762
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
763
+ r"""
764
+ Returns:
765
+
766
+ """
767
+ output_attentions = (
768
+ output_attentions
769
+ if output_attentions is not None
770
+ else self.config.output_attentions
771
+ )
772
+ output_hidden_states = (
773
+ output_hidden_states
774
+ if output_hidden_states is not None
775
+ else self.config.output_hidden_states
776
+ )
777
+ return_dict = (
778
+ return_dict if return_dict is not None else self.config.use_return_dict
779
+ )
780
+
781
+ if input_ids is None:
782
+ raise ValueError("You have to specify input_ids")
783
+
784
+ input_shape = input_ids.size()
785
+ input_ids = input_ids.view(-1, input_shape[-1])
786
+
787
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
788
+
789
+ # CLIP's text model uses causal mask, prepare it here.
790
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
791
+ causal_attention_mask = _create_4d_causal_attention_mask(
792
+ input_shape, hidden_states.dtype, device=hidden_states.device
793
+ )
794
+ # expand attention_mask
795
+ if attention_mask is not None:
796
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
797
+ attention_mask = _prepare_4d_attention_mask(
798
+ attention_mask, hidden_states.dtype
799
+ )
800
+
801
+ encoder_outputs = self.encoder(
802
+ inputs_embeds=hidden_states,
803
+ attention_mask=attention_mask,
804
+ causal_attention_mask=causal_attention_mask,
805
+ output_attentions=output_attentions,
806
+ output_hidden_states=output_hidden_states,
807
+ return_dict=return_dict,
808
+ )
809
+
810
+ last_hidden_state = encoder_outputs[0]
811
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
812
+
813
+ if self.eos_token_id == 2:
814
+ # The `eos_token_id` was incorrect before PR #24773: Let's keep what has been done here.
815
+ # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added
816
+ # ------------------------------------------------------------
817
+ # text_embeds.shape = [batch_size, sequence_length, transformer.width]
818
+ # take features from the eot embedding (eot_token is the highest number in each sequence)
819
+ # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
820
+ pooled_output = last_hidden_state[
821
+ torch.arange(
822
+ last_hidden_state.shape[0], device=last_hidden_state.device
823
+ ),
824
+ input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(
825
+ dim=-1
826
+ ),
827
+ ]
828
+ else:
829
+ # The config gets updated `eos_token_id` from PR #24773 (so the use of extra new tokens is possible)
830
+ pooled_output = last_hidden_state[
831
+ torch.arange(
832
+ last_hidden_state.shape[0], device=last_hidden_state.device
833
+ ),
834
+ # We need to get the first position of the `eos_token_id` value (`pad_token_ids` might be equal to `eos_token_id`)
835
+ # Note: we assume each sequence (along batch dim.) contains an `eos_token_id` (e.g. prepared by the tokenizer)
836
+ (
837
+ input_ids.to(dtype=torch.int, device=last_hidden_state.device)
838
+ == self.eos_token_id
839
+ )
840
+ .int()
841
+ .argmax(dim=-1),
842
+ ]
843
+
844
+ if not return_dict:
845
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
846
+
847
+ return BaseModelOutputWithPooling(
848
+ last_hidden_state=last_hidden_state,
849
+ pooler_output=pooled_output,
850
+ hidden_states=encoder_outputs.hidden_states,
851
+ attentions=encoder_outputs.attentions,
852
+ )
853
+
854
+
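The pooled text embedding above is taken at the end-of-sequence position: legacy configs with `eos_token_id == 2` fall back to a plain `argmax` over the token ids, while newer configs locate the first occurrence of `eos_token_id`. A tensor-only sketch of the two index computations, with made-up token ids:

```python
# Toy illustration of the two EOS-pooling index computations used above.
import torch

eos_token_id = 2
# made-up ids: 2 marks end of sequence; everything after it is padding with 2
input_ids = torch.tensor([[9, 7, 5, 2, 2],
                          [9, 4, 2, 2, 2]])

# current path: index of the *first* eos_token_id per sequence
pool_idx = (input_ids.to(torch.int) == eos_token_id).int().argmax(dim=-1)
print(pool_idx)    # tensor([3, 2])

# legacy path (eos_token_id == 2 configs): plain argmax over the ids, which
# only finds the EOS position if EOS happens to carry the largest token id
legacy_idx = input_ids.to(torch.int).argmax(dim=-1)
print(legacy_idx)  # tensor([0, 0]) -- wrong here, hence the first-match variant
```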
855
+ @add_start_docstrings(
856
+ """The text model from CLIP without any head or projection on top.""",
857
+ CLIP_START_DOCSTRING,
858
+ )
859
+ class CLIPTextModel(CLIPPreTrainedModel):
860
+ config_class = CLIPTextConfig
861
+
862
+ _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]
863
+
864
+ def __init__(self, config: CLIPTextConfig):
865
+ super().__init__(config)
866
+ self.text_model = CLIPTextTransformer(config)
867
+ # Initialize weights and apply final processing
868
+ self.post_init()
869
+
870
+ def get_input_embeddings(self) -> nn.Module:
871
+ return self.text_model.embeddings.token_embedding
872
+
873
+ def set_input_embeddings(self, value):
874
+ self.text_model.embeddings.token_embedding = value
875
+
876
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
877
+ @replace_return_docstrings(
878
+ output_type=BaseModelOutputWithPooling, config_class=CLIPTextConfig
879
+ )
880
+ def forward(
881
+ self,
882
+ input_ids: Optional[torch.Tensor] = None,
883
+ attention_mask: Optional[torch.Tensor] = None,
884
+ position_ids: Optional[torch.Tensor] = None,
885
+ output_attentions: Optional[bool] = None,
886
+ output_hidden_states: Optional[bool] = None,
887
+ return_dict: Optional[bool] = None,
888
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
889
+ r"""
890
+ Returns:
891
+
892
+ Examples:
893
+
894
+ ```python
895
+ >>> from transformers import AutoTokenizer, CLIPTextModel
896
+
897
+ >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
898
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
899
+
900
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
901
+
902
+ >>> outputs = model(**inputs)
903
+ >>> last_hidden_state = outputs.last_hidden_state
904
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
905
+ ```"""
906
+ return_dict = (
907
+ return_dict if return_dict is not None else self.config.use_return_dict
908
+ )
909
+
910
+ return self.text_model(
911
+ input_ids=input_ids,
912
+ attention_mask=attention_mask,
913
+ position_ids=position_ids,
914
+ output_attentions=output_attentions,
915
+ output_hidden_states=output_hidden_states,
916
+ return_dict=return_dict,
917
+ )
918
+
919
+
920
+ class CLIPVisionTransformer(nn.Module):
921
+ def __init__(self, config: CLIPVisionConfig):
922
+ super().__init__()
923
+ self.config = config
924
+ embed_dim = config.hidden_size
925
+
926
+ self.embeddings = CLIPVisionEmbeddings(config)
927
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
928
+ self.encoder = CLIPEncoder(config)
929
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
930
+
931
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
932
+ @replace_return_docstrings(
933
+ output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig
934
+ )
935
+ def forward(
936
+ self,
937
+ pixel_values: Optional[torch.FloatTensor] = None,
938
+ output_attentions: Optional[bool] = None,
939
+ output_hidden_states: Optional[bool] = None,
940
+ return_dict: Optional[bool] = None,
941
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
942
+ r"""
943
+ Returns:
944
+
945
+ """
946
+ output_attentions = (
947
+ output_attentions
948
+ if output_attentions is not None
949
+ else self.config.output_attentions
950
+ )
951
+ output_hidden_states = (
952
+ output_hidden_states
953
+ if output_hidden_states is not None
954
+ else self.config.output_hidden_states
955
+ )
956
+ return_dict = (
957
+ return_dict if return_dict is not None else self.config.use_return_dict
958
+ )
959
+
960
+ if pixel_values is None:
961
+ raise ValueError("You have to specify pixel_values")
962
+
963
+ hidden_states = self.embeddings(pixel_values)
964
+ hidden_states = self.pre_layrnorm(hidden_states)
965
+
966
+ encoder_outputs = self.encoder(
967
+ inputs_embeds=hidden_states,
968
+ output_attentions=output_attentions,
969
+ output_hidden_states=output_hidden_states,
970
+ return_dict=return_dict,
971
+ )
972
+
973
+ last_hidden_state = encoder_outputs[0]
974
+ pooled_output = last_hidden_state[:, 0, :]
975
+ pooled_output = self.post_layernorm(pooled_output)
976
+
977
+ if not return_dict:
978
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
979
+
980
+ return BaseModelOutputWithPooling(
981
+ last_hidden_state=last_hidden_state,
982
+ pooler_output=pooled_output,
983
+ hidden_states=encoder_outputs.hidden_states,
984
+ attentions=encoder_outputs.attentions,
985
+ )
986
+
987
+
988
+ @add_start_docstrings(
989
+ """The vision model from CLIP without any head or projection on top.""",
990
+ CLIP_START_DOCSTRING,
991
+ )
992
+ class CLIPVisionModel(CLIPPreTrainedModel):
993
+ config_class = CLIPVisionConfig
994
+ main_input_name = "pixel_values"
995
+ _no_split_modules = ["CLIPEncoderLayer"]
996
+
997
+ def __init__(self, config: CLIPVisionConfig):
998
+ super().__init__(config)
999
+ self.vision_model = CLIPVisionTransformer(config)
1000
+ # Initialize weights and apply final processing
1001
+ self.post_init()
1002
+
1003
+ def get_input_embeddings(self) -> nn.Module:
1004
+ return self.vision_model.embeddings.patch_embedding
1005
+
1006
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
1007
+ @replace_return_docstrings(
1008
+ output_type=BaseModelOutputWithPooling, config_class=CLIPVisionConfig
1009
+ )
1010
+ def forward(
1011
+ self,
1012
+ pixel_values: Optional[torch.FloatTensor] = None,
1013
+ output_attentions: Optional[bool] = None,
1014
+ output_hidden_states: Optional[bool] = None,
1015
+ return_dict: Optional[bool] = None,
1016
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
1017
+ r"""
1018
+ Returns:
1019
+
1020
+ Examples:
1021
+
1022
+ ```python
1023
+ >>> from PIL import Image
1024
+ >>> import requests
1025
+ >>> from transformers import AutoProcessor, CLIPVisionModel
1026
+
1027
+ >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
1028
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
1029
+
1030
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1031
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1032
+
1033
+ >>> inputs = processor(images=image, return_tensors="pt")
1034
+
1035
+ >>> outputs = model(**inputs)
1036
+ >>> last_hidden_state = outputs.last_hidden_state
1037
+ >>> pooled_output = outputs.pooler_output # pooled CLS states
1038
+ ```"""
1039
+ return_dict = (
1040
+ return_dict if return_dict is not None else self.config.use_return_dict
1041
+ )
1042
+
1043
+ return self.vision_model(
1044
+ pixel_values=pixel_values,
1045
+ output_attentions=output_attentions,
1046
+ output_hidden_states=output_hidden_states,
1047
+ return_dict=return_dict,
1048
+ )
1049
+
1050
+
1051
+ @add_start_docstrings(CLIP_START_DOCSTRING)
1052
+ class CLIPModel(CLIPPreTrainedModel):
1053
+ config_class = CLIPConfig
1054
+ _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]
1055
+
1056
+ def __init__(self, config: CLIPConfig):
1057
+ super().__init__(config)
1058
+
1059
+ if not isinstance(config.text_config, CLIPTextConfig):
1060
+ raise ValueError(
1061
+ "config.text_config is expected to be of type CLIPTextConfig but is of type"
1062
+ f" {type(config.text_config)}."
1063
+ )
1064
+
1065
+ if not isinstance(config.vision_config, CLIPVisionConfig):
1066
+ raise ValueError(
1067
+ "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
1068
+ f" {type(config.vision_config)}."
1069
+ )
1070
+
1071
+ text_config = config.text_config
1072
+ vision_config = config.vision_config
1073
+
1074
+ self.projection_dim = config.projection_dim
1075
+ self.text_embed_dim = text_config.hidden_size
1076
+ self.vision_embed_dim = vision_config.hidden_size
1077
+
1078
+ self.text_model = CLIPTextTransformer(text_config)
1079
+ self.vision_model = CLIPVisionTransformer(vision_config)
1080
+
1081
+ self.visual_projection = nn.Linear(
1082
+ self.vision_embed_dim, self.projection_dim, bias=False
1083
+ )
1084
+ self.text_projection = nn.Linear(
1085
+ self.text_embed_dim, self.projection_dim, bias=False
1086
+ )
1087
+ self.logit_scale = nn.Parameter(
1088
+ torch.tensor(self.config.logit_scale_init_value)
1089
+ )
1090
+
1091
+ # Initialize weights and apply final processing
1092
+ self.post_init()
1093
+
1094
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
1095
+ def get_text_features(
1096
+ self,
1097
+ input_ids: Optional[torch.Tensor] = None,
1098
+ attention_mask: Optional[torch.Tensor] = None,
1099
+ position_ids: Optional[torch.Tensor] = None,
1100
+ output_attentions: Optional[bool] = None,
1101
+ output_hidden_states: Optional[bool] = None,
1102
+ return_dict: Optional[bool] = None,
1103
+ ) -> torch.FloatTensor:
1104
+ r"""
1105
+ Returns:
1106
+ text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
1107
+ applying the projection layer to the pooled output of [`CLIPTextModel`].
1108
+
1109
+ Examples:
1110
+
1111
+ ```python
1112
+ >>> from transformers import AutoTokenizer, CLIPModel
1113
+
1114
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
1115
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
1116
+
1117
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
1118
+ >>> text_features = model.get_text_features(**inputs)
1119
+ ```"""
1120
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
1121
+ output_attentions = (
1122
+ output_attentions
1123
+ if output_attentions is not None
1124
+ else self.config.output_attentions
1125
+ )
1126
+ output_hidden_states = (
1127
+ output_hidden_states
1128
+ if output_hidden_states is not None
1129
+ else self.config.output_hidden_states
1130
+ )
1131
+ return_dict = (
1132
+ return_dict if return_dict is not None else self.config.use_return_dict
1133
+ )
1134
+
1135
+ text_outputs = self.text_model(
1136
+ input_ids=input_ids,
1137
+ attention_mask=attention_mask,
1138
+ position_ids=position_ids,
1139
+ output_attentions=output_attentions,
1140
+ output_hidden_states=output_hidden_states,
1141
+ return_dict=return_dict,
1142
+ )
1143
+
1144
+ pooled_output = text_outputs[1]
1145
+ text_features = self.text_projection(pooled_output)
1146
+
1147
+ return text_features
1148
+
1149
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
1150
+ def get_image_features(
1151
+ self,
1152
+ pixel_values: Optional[torch.FloatTensor] = None,
1153
+ output_attentions: Optional[bool] = None,
1154
+ output_hidden_states: Optional[bool] = None,
1155
+ return_dict: Optional[bool] = None,
1156
+ ) -> torch.FloatTensor:
1157
+ r"""
1158
+ Returns:
1159
+ image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
1160
+ applying the projection layer to the pooled output of [`CLIPVisionModel`].
1161
+
1162
+ Examples:
1163
+
1164
+ ```python
1165
+ >>> from PIL import Image
1166
+ >>> import requests
1167
+ >>> from transformers import AutoProcessor, CLIPModel
1168
+
1169
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
1170
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
1171
+
1172
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1173
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1174
+
1175
+ >>> inputs = processor(images=image, return_tensors="pt")
1176
+
1177
+ >>> image_features = model.get_image_features(**inputs)
1178
+ ```"""
1179
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
1180
+ output_attentions = (
1181
+ output_attentions
1182
+ if output_attentions is not None
1183
+ else self.config.output_attentions
1184
+ )
1185
+ output_hidden_states = (
1186
+ output_hidden_states
1187
+ if output_hidden_states is not None
1188
+ else self.config.output_hidden_states
1189
+ )
1190
+ return_dict = (
1191
+ return_dict if return_dict is not None else self.config.use_return_dict
1192
+ )
1193
+
1194
+ vision_outputs = self.vision_model(
1195
+ pixel_values=pixel_values,
1196
+ output_attentions=output_attentions,
1197
+ output_hidden_states=output_hidden_states,
1198
+ return_dict=return_dict,
1199
+ )
1200
+
1201
+ pooled_output = vision_outputs[1] # pooled_output
1202
+ image_features = self.visual_projection(pooled_output)
1203
+
1204
+ return image_features
1205
+
1206
+ @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
1207
+ @replace_return_docstrings(output_type=CLIPOutput, config_class=CLIPConfig)
1208
+ def forward(
1209
+ self,
1210
+ input_ids: Optional[torch.LongTensor] = None,
1211
+ pixel_values: Optional[torch.FloatTensor] = None,
1212
+ attention_mask: Optional[torch.Tensor] = None,
1213
+ position_ids: Optional[torch.LongTensor] = None,
1214
+ return_loss: Optional[bool] = None,
1215
+ output_attentions: Optional[bool] = None,
1216
+ output_hidden_states: Optional[bool] = None,
1217
+ return_dict: Optional[bool] = None,
1218
+ ) -> Union[Tuple, CLIPOutput]:
1219
+ r"""
1220
+ Returns:
1221
+
1222
+ Examples:
1223
+
1224
+ ```python
1225
+ >>> from PIL import Image
1226
+ >>> import requests
1227
+ >>> from transformers import AutoProcessor, CLIPModel
1228
+
1229
+ >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
1230
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
1231
+
1232
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1233
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1234
+
1235
+ >>> inputs = processor(
1236
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
1237
+ ... )
1238
+
1239
+ >>> outputs = model(**inputs)
1240
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
1241
+ >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
1242
+ ```"""
1243
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
1244
+ output_attentions = (
1245
+ output_attentions
1246
+ if output_attentions is not None
1247
+ else self.config.output_attentions
1248
+ )
1249
+ output_hidden_states = (
1250
+ output_hidden_states
1251
+ if output_hidden_states is not None
1252
+ else self.config.output_hidden_states
1253
+ )
1254
+ return_dict = (
1255
+ return_dict if return_dict is not None else self.config.use_return_dict
1256
+ )
1257
+
1258
+ vision_outputs = self.vision_model(
1259
+ pixel_values=pixel_values,
1260
+ output_attentions=output_attentions,
1261
+ output_hidden_states=output_hidden_states,
1262
+ return_dict=return_dict,
1263
+ )
1264
+
1265
+ text_outputs = self.text_model(
1266
+ input_ids=input_ids,
1267
+ attention_mask=attention_mask,
1268
+ position_ids=position_ids,
1269
+ output_attentions=output_attentions,
1270
+ output_hidden_states=output_hidden_states,
1271
+ return_dict=return_dict,
1272
+ )
1273
+
1274
+ image_embeds = vision_outputs[1]
1275
+ image_embeds = self.visual_projection(image_embeds)
1276
+
1277
+ text_embeds = text_outputs[1]
1278
+ text_embeds = self.text_projection(text_embeds)
1279
+
1280
+ # normalized features
1281
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
1282
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
1283
+
1284
+ # cosine similarity as logits
1285
+ logit_scale = self.logit_scale.exp()
1286
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
1287
+ logits_per_image = logits_per_text.t()
1288
+
1289
+ loss = None
1290
+ if return_loss:
1291
+ loss = clip_loss(logits_per_text)
1292
+
1293
+ if not return_dict:
1294
+ output = (
1295
+ logits_per_image,
1296
+ logits_per_text,
1297
+ text_embeds,
1298
+ image_embeds,
1299
+ text_outputs,
1300
+ vision_outputs,
1301
+ )
1302
+ return ((loss,) + output) if loss is not None else output
1303
+
1304
+ return CLIPOutput(
1305
+ loss=loss,
1306
+ logits_per_image=logits_per_image,
1307
+ logits_per_text=logits_per_text,
1308
+ text_embeds=text_embeds,
1309
+ image_embeds=image_embeds,
1310
+ text_model_output=text_outputs,
1311
+ vision_model_output=vision_outputs,
1312
+ )
1313
+
1314
+
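`clip_loss` is presumably defined earlier in this file, outside the excerpt shown here; in the reference CLIP formulation it is the symmetric cross-entropy over `logits_per_text` and its transpose. A hedged sketch of that loss (standard formulation, not copied from this diff):

```python
# Sketch of the symmetric contrastive loss that `clip_loss` is expected to
# compute: cross-entropy against the diagonal, averaged over both directions.
import torch
import torch.nn.functional as F


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    # the i-th row should score highest at column i (matched text/image pair)
    return F.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clip_loss_sketch(logits_per_text: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(logits_per_text)
    image_loss = contrastive_loss(logits_per_text.t())
    return (caption_loss + image_loss) / 2.0


# toy check with random, L2-normalized embeddings
text = F.normalize(torch.randn(4, 8), dim=-1)
image = F.normalize(torch.randn(4, 8), dim=-1)
print(clip_loss_sketch(text @ image.t() * 100.0))
```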
1315
+ @add_start_docstrings(
1316
+ """
1317
+ CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output).
1318
+ """,
1319
+ CLIP_START_DOCSTRING,
1320
+ )
1321
+ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
1322
+ config_class = CLIPTextConfig
1323
+
1324
+ _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]
1325
+
1326
+ def __init__(self, config: CLIPTextConfig):
1327
+ super().__init__(config)
1328
+
1329
+ self.text_model = CLIPTextTransformer(config)
1330
+
1331
+ self.text_projection = nn.Linear(
1332
+ config.hidden_size, config.projection_dim, bias=False
1333
+ )
1334
+
1335
+ # Initialize weights and apply final processing
1336
+ self.post_init()
1337
+
1338
+ def get_input_embeddings(self) -> nn.Module:
1339
+ return self.text_model.embeddings.token_embedding
1340
+
1341
+ def set_input_embeddings(self, value):
1342
+ self.text_model.embeddings.token_embedding = value
1343
+
1344
+ @add_start_docstrings_to_model_forward(CLIP_TEXT_INPUTS_DOCSTRING)
1345
+ @replace_return_docstrings(
1346
+ output_type=CLIPTextModelOutput, config_class=CLIPTextConfig
1347
+ )
1348
+ def forward(
1349
+ self,
1350
+ input_ids: Optional[torch.Tensor] = None,
1351
+ attention_mask: Optional[torch.Tensor] = None,
1352
+ position_ids: Optional[torch.Tensor] = None,
1353
+ output_attentions: Optional[bool] = None,
1354
+ output_hidden_states: Optional[bool] = None,
1355
+ return_dict: Optional[bool] = None,
1356
+ ) -> Union[Tuple, CLIPTextModelOutput]:
1357
+ r"""
1358
+ Returns:
1359
+
1360
+ Examples:
1361
+
1362
+ ```python
1363
+ >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection
1364
+
1365
+ >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
1366
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
1367
+
1368
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
1369
+
1370
+ >>> outputs = model(**inputs)
1371
+ >>> text_embeds = outputs.text_embeds
1372
+ ```"""
1373
+ return_dict = (
1374
+ return_dict if return_dict is not None else self.config.use_return_dict
1375
+ )
1376
+
1377
+ text_outputs = self.text_model(
1378
+ input_ids=input_ids,
1379
+ attention_mask=attention_mask,
1380
+ position_ids=position_ids,
1381
+ output_attentions=output_attentions,
1382
+ output_hidden_states=output_hidden_states,
1383
+ return_dict=return_dict,
1384
+ )
1385
+
1386
+ pooled_output = text_outputs[1]
1387
+
1388
+ text_embeds = self.text_projection(pooled_output)
1389
+
1390
+ if not return_dict:
1391
+ outputs = (text_embeds, text_outputs[0]) + text_outputs[2:]
1392
+ return tuple(output for output in outputs if output is not None)
1393
+
1394
+ return CLIPTextModelOutput(
1395
+ text_embeds=text_embeds,
1396
+ last_hidden_state=text_outputs.last_hidden_state,
1397
+ hidden_states=text_outputs.hidden_states,
1398
+ attentions=text_outputs.attentions,
1399
+ )
1400
+
1401
+
1402
+ @add_start_docstrings(
1403
+ """
1404
+ CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output).
1405
+ """,
1406
+ CLIP_START_DOCSTRING,
1407
+ )
1408
+ class CLIPVisionModelWithProjection(CLIPPreTrainedModel):
1409
+ config_class = CLIPVisionConfig
1410
+ main_input_name = "pixel_values"
1411
+
1412
+ def __init__(self, config: CLIPVisionConfig):
1413
+ super().__init__(config)
1414
+
1415
+ self.vision_model = CLIPVisionTransformer(config)
1416
+
1417
+ self.visual_projection = nn.Linear(
1418
+ config.hidden_size, config.projection_dim, bias=False
1419
+ )
1420
+
1421
+ # Initialize weights and apply final processing
1422
+ self.post_init()
1423
+
1424
+ def get_input_embeddings(self) -> nn.Module:
1425
+ return self.vision_model.embeddings.patch_embedding
1426
+
1427
+ @add_start_docstrings_to_model_forward(CLIP_VISION_INPUTS_DOCSTRING)
1428
+ @replace_return_docstrings(
1429
+ output_type=CLIPVisionModelOutput, config_class=CLIPVisionConfig
1430
+ )
1431
+ def forward(
1432
+ self,
1433
+ pixel_values: Optional[torch.FloatTensor] = None,
1434
+ output_attentions: Optional[bool] = None,
1435
+ output_hidden_states: Optional[bool] = None,
1436
+ return_dict: Optional[bool] = None,
1437
+ ) -> Union[Tuple, CLIPVisionModelOutput]:
1438
+ r"""
1439
+ Returns:
1440
+
1441
+ Examples:
1442
+
1443
+ ```python
1444
+ >>> from PIL import Image
1445
+ >>> import requests
1446
+ >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection
1447
+
1448
+ >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
1449
+ >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
1450
+
1451
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1452
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1453
+
1454
+ >>> inputs = processor(images=image, return_tensors="pt")
1455
+
1456
+ >>> outputs = model(**inputs)
1457
+ >>> image_embeds = outputs.image_embeds
1458
+ ```"""
1459
+ return_dict = (
1460
+ return_dict if return_dict is not None else self.config.use_return_dict
1461
+ )
1462
+
1463
+ vision_outputs = self.vision_model(
1464
+ pixel_values=pixel_values,
1465
+ output_attentions=output_attentions,
1466
+ output_hidden_states=output_hidden_states,
1467
+ return_dict=return_dict,
1468
+ )
1469
+
1470
+ pooled_output = vision_outputs[1] # pooled_output
1471
+
1472
+ image_embeds = self.visual_projection(pooled_output)
1473
+
1474
+ if not return_dict:
1475
+ outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:]
1476
+ return tuple(output for output in outputs if output is not None)
1477
+
1478
+ return CLIPVisionModelOutput(
1479
+ image_embeds=image_embeds,
1480
+ last_hidden_state=vision_outputs.last_hidden_state,
1481
+ hidden_states=vision_outputs.hidden_states,
1482
+ attentions=vision_outputs.attentions,
1483
+ )
1484
+
1485
+
1486
+ @add_start_docstrings(
1487
+ """
1488
+ CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
1489
+ the patch tokens) e.g. for ImageNet.
1490
+ """,
1491
+ CLIP_START_DOCSTRING,
1492
+ )
1493
+ class CLIPForImageClassification(CLIPPreTrainedModel):
1494
+ main_input_name = "pixel_values"
1495
+
1496
+ def __init__(self, config: CLIPConfig) -> None:
1497
+ super().__init__(config)
1498
+
1499
+ self.num_labels = config.num_labels
1500
+ self.vision_model = CLIPVisionTransformer(config.vision_config)
1501
+
1502
+ # Classifier head
1503
+ self.classifier = (
1504
+ nn.Linear(config.vision_config.hidden_size, config.num_labels)
1505
+ if config.num_labels > 0
1506
+ else nn.Identity()
1507
+ )
1508
+
1509
+ # Initialize weights and apply final processing
1510
+ self.post_init()
1511
+
1512
+ @add_start_docstrings_to_model_forward(CLIP_INPUTS_DOCSTRING)
1513
+ @add_code_sample_docstrings(
1514
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
1515
+ output_type=ImageClassifierOutput,
1516
+ config_class=_CONFIG_FOR_DOC,
1517
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
1518
+ )
1519
+ def forward(
1520
+ self,
1521
+ pixel_values: Optional[torch.Tensor] = None,
1522
+ labels: Optional[torch.Tensor] = None,
1523
+ output_attentions: Optional[bool] = None,
1524
+ output_hidden_states: Optional[bool] = None,
1525
+ return_dict: Optional[bool] = None,
1526
+ ) -> Union[tuple, ImageClassifierOutput]:
1527
+ r"""
1528
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1529
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
1530
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
1531
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1532
+ """
1533
+ output_attentions = (
1534
+ output_attentions
1535
+ if output_attentions is not None
1536
+ else self.config.output_attentions
1537
+ )
1538
+ output_hidden_states = (
1539
+ output_hidden_states
1540
+ if output_hidden_states is not None
1541
+ else self.config.output_hidden_states
1542
+ )
1543
+ return_dict = (
1544
+ return_dict if return_dict is not None else self.config.use_return_dict
1545
+ )
1546
+
1547
+ outputs = self.vision_model(
1548
+ pixel_values,
1549
+ output_attentions=output_attentions,
1550
+ output_hidden_states=output_hidden_states,
1551
+ return_dict=return_dict,
1552
+ )
1553
+
1554
+ sequence_output = outputs[0]
1555
+
1556
+ # average pool the patch tokens
1557
+ sequence_output = torch.mean(sequence_output[:, 1:, :], dim=1)
1558
+ # apply classifier
1559
+ logits = self.classifier(sequence_output)
1560
+
1561
+ loss = None
1562
+ if labels is not None:
1563
+ # move labels to correct device to enable model parallelism
1564
+ labels = labels.to(logits.device)
1565
+ if self.config.problem_type is None:
1566
+ if self.num_labels == 1:
1567
+ self.config.problem_type = "regression"
1568
+ elif self.num_labels > 1 and (
1569
+ labels.dtype == torch.long or labels.dtype == torch.int
1570
+ ):
1571
+ self.config.problem_type = "single_label_classification"
1572
+ else:
1573
+ self.config.problem_type = "multi_label_classification"
1574
+
1575
+ if self.config.problem_type == "regression":
1576
+ loss_fct = MSELoss()
1577
+ if self.num_labels == 1:
1578
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1579
+ else:
1580
+ loss = loss_fct(logits, labels)
1581
+ elif self.config.problem_type == "single_label_classification":
1582
+ loss_fct = CrossEntropyLoss()
1583
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1584
+ elif self.config.problem_type == "multi_label_classification":
1585
+ loss_fct = BCEWithLogitsLoss()
1586
+ loss = loss_fct(logits, labels)
1587
+
1588
+ if not return_dict:
1589
+ output = (logits,) + outputs[2:]
1590
+ return ((loss,) + output) if loss is not None else output
1591
+
1592
+ return ImageClassifierOutput(
1593
+ loss=loss,
1594
+ logits=logits,
1595
+ hidden_states=outputs.hidden_states,
1596
+ attentions=outputs.attentions,
1597
+ )
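`CLIPForImageClassification` above mean-pools the patch tokens (every position after the CLS token) and feeds the result to a linear classifier, with the loss chosen from `config.problem_type`. A small shape-check sketch of that pooling step, using toy sizes:

```python
# Sketch of the patch-token mean pooling used by CLIPForImageClassification.
import torch
import torch.nn as nn

batch, seq_len, hidden = 2, 50, 768   # 1 CLS token + 49 patch tokens (toy sizes)
num_labels = 10                       # illustrative label count

sequence_output = torch.randn(batch, seq_len, hidden)
classifier = nn.Linear(hidden, num_labels)

pooled = sequence_output[:, 1:, :].mean(dim=1)  # drop CLS, average patch tokens
logits = classifier(pooled)
print(logits.shape)  # torch.Size([2, 10])
```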
step1x3d_geometry/models/conditional_encoders/clip/modeling_conditional_clip.py ADDED
@@ -0,0 +1,443 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Reference:
16
+ # * transformers/models/dinov2/modeling_dinov2.py
17
+ # * https://github.com/facebookresearch/DiT/blob/main/models.py#L101
18
+ # * https://github.com/3DTopia/OpenLRM/tree/main/openlrm/models/encoders/dinov2
19
+ """PyTorch CLIP model."""
20
+
21
+ from typing import Dict, List, Optional, Set, Tuple, Union
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+
26
+ from .modeling_clip import (
27
+ CLIPConfig,
28
+ CLIPTextConfig,
29
+ CLIPVisionConfig,
30
+ CLIPEncoderLayer,
31
+ CLIPTextTransformer,
32
+ CLIPVisionTransformer,
33
+ CLIPModel,
34
+ CLIPVisionEmbeddings,
35
+ CLIPVisionModel,
36
+ CLIPOutput,
37
+ BaseModelOutput,
38
+ BaseModelOutputWithPooling,
39
+ )
40
+
41
+
42
+ class ModLN(nn.Module):
43
+ def __init__(self, inner_dim: int, mod_dim: int = 32):
44
+ super().__init__()
45
+ self.mlp = nn.Sequential(
46
+ nn.SiLU(),
47
+ nn.Linear(mod_dim, inner_dim * 2),
48
+ )
49
+
50
+ for m in self.modules():
51
+ if isinstance(m, nn.Linear):
52
+ nn.init.zeros_(m.weight)
53
+ nn.init.zeros_(m.bias)
54
+
55
+ def forward(self, x: torch.Tensor, condition: torch.Tensor):
56
+ """
57
+ x: [N, M, C_in], M: num of tokens
58
+ condition: [N, C_mod]
59
+ """
60
+ shift, scale = self.mlp(condition).unsqueeze(1).chunk(2, dim=-1)
61
+ return x * (1 + scale) + shift
62
+
63
+
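`ModLN` predicts a per-sample `(shift, scale)` pair from the condition vector and broadcasts it over all tokens; because its linear layer is zero-initialized, the module starts out as an identity. A toy check (sizes invented for illustration):

```python
# Toy check of the ModLN behaviour: zero-init => identity at the start,
# and the (shift, scale) pair broadcasts over the token dimension.
import torch
import torch.nn as nn


class ModLN(nn.Module):
    def __init__(self, inner_dim: int, mod_dim: int = 32):
        super().__init__()
        self.mlp = nn.Sequential(nn.SiLU(), nn.Linear(mod_dim, inner_dim * 2))
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.zeros_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x, condition):
        shift, scale = self.mlp(condition).unsqueeze(1).chunk(2, dim=-1)
        return x * (1 + scale) + shift


mod = ModLN(inner_dim=16, mod_dim=32)
x = torch.randn(2, 5, 16)       # [N, M, C_in]
cond = torch.randn(2, 32)       # [N, C_mod]
out = mod(x, cond)
print(torch.allclose(out, x))   # True: zero-initialized modulation is a no-op
```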
64
+ class ConditionalCLIPVisionConfig(CLIPVisionConfig):
65
+ def __init__(self, modulation_dim: int = 32, *args, **kwargs):
66
+ super().__init__(*args, **kwargs)
67
+ self.modulation_dim = modulation_dim
68
+
69
+
70
+ class ConditionalCLIPEncoderLayer(CLIPEncoderLayer):
71
+ """This corresponds to the Block class in the original implementation."""
72
+
73
+ def __init__(self, config: ConditionalCLIPVisionConfig) -> None:
74
+ super().__init__(config)
75
+ self.mod_norm1 = ModLN(config.hidden_size, config.modulation_dim)
76
+ self.mod_norm2 = ModLN(config.hidden_size, config.modulation_dim)
77
+
78
+ def forward(
79
+ self,
80
+ hidden_states: torch.Tensor,
81
+ attention_mask: torch.Tensor,
82
+ causal_attention_mask: torch.Tensor,
83
+ condition: Optional[torch.Tensor] = None,
84
+ output_attentions: bool = False,
85
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
86
+ residual = hidden_states
87
+
88
+ hidden_states = self.mod_norm1(self.layer_norm1(hidden_states), condition)
89
+ hidden_states, attn_weights = self.self_attn(
90
+ hidden_states=hidden_states,
91
+ attention_mask=attention_mask,
92
+ causal_attention_mask=causal_attention_mask,
93
+ output_attentions=output_attentions,
94
+ )
95
+ hidden_states = residual + hidden_states
96
+
97
+ residual = hidden_states
98
+ hidden_states = self.mod_norm2(self.layer_norm2(hidden_states), condition)
99
+ hidden_states = self.mlp(hidden_states)
100
+ hidden_states = residual + hidden_states
101
+
102
+ outputs = (hidden_states,)
103
+
104
+ if output_attentions:
105
+ outputs += (attn_weights,)
106
+
107
+ return outputs
108
+
109
+
110
+ class ConditionalCLIPEncoder(nn.Module):
111
+ def __init__(self, config: CLIPConfig) -> None:
112
+ super().__init__()
113
+ self.config = config
114
+ self.layers = nn.ModuleList(
115
+ [
116
+ ConditionalCLIPEncoderLayer(config)
117
+ for _ in range(config.num_hidden_layers)
118
+ ]
119
+ )
120
+ self.gradient_checkpointing = False
121
+
122
+ def forward(
123
+ self,
124
+ inputs_embeds,
125
+ attention_mask: Optional[torch.Tensor] = None,
126
+ causal_attention_mask: Optional[torch.Tensor] = None,
127
+ output_attentions: Optional[bool] = None,
128
+ output_hidden_states: Optional[bool] = None,
129
+ condition: Optional[torch.Tensor] = None,
130
+ return_dict: Optional[bool] = None,
131
+ ) -> Union[tuple, BaseModelOutput]:
132
+ output_attentions = (
133
+ output_attentions
134
+ if output_attentions is not None
135
+ else self.config.output_attentions
136
+ )
137
+ output_hidden_states = (
138
+ output_hidden_states
139
+ if output_hidden_states is not None
140
+ else self.config.output_hidden_states
141
+ )
142
+ return_dict = (
143
+ return_dict if return_dict is not None else self.config.use_return_dict
144
+ )
145
+
146
+ encoder_states = () if output_hidden_states else None
147
+ all_attentions = () if output_attentions else None
148
+
149
+ hidden_states = inputs_embeds
150
+ for idx, encoder_layer in enumerate(self.layers):
151
+ if output_hidden_states:
152
+ encoder_states = encoder_states + (hidden_states,)
153
+ if self.gradient_checkpointing and self.training:
154
+ layer_outputs = self._gradient_checkpointing_func(
155
+ encoder_layer.__call__,
156
+ hidden_states,
157
+ attention_mask,
158
+ causal_attention_mask,
159
+ condition=condition,
160
+ output_attentions=output_attentions,
161
+ )
162
+ else:
163
+ layer_outputs = encoder_layer(
164
+ hidden_states,
165
+ attention_mask,
166
+ causal_attention_mask,
167
+ condition=condition,
168
+ output_attentions=output_attentions,
169
+ )
170
+
171
+ hidden_states = layer_outputs[0]
172
+
173
+ if output_attentions:
174
+ all_attentions = all_attentions + (layer_outputs[1],)
175
+
176
+ if output_hidden_states:
177
+ encoder_states = encoder_states + (hidden_states,)
178
+
179
+ if not return_dict:
180
+ return tuple(
181
+ v
182
+ for v in [hidden_states, encoder_states, all_attentions]
183
+ if v is not None
184
+ )
185
+ return BaseModelOutput(
186
+ last_hidden_state=hidden_states,
187
+ hidden_states=encoder_states,
188
+ attentions=all_attentions,
189
+ )
190
+
191
+
192
+ class ConditionalCLIPVisionTransformer(CLIPVisionTransformer):
193
+ def __init__(self, config: ConditionalCLIPVisionConfig):
194
+ super().__init__(config)
195
+ self.config = config
196
+ embed_dim = config.hidden_size
197
+
198
+ self.embeddings = CLIPVisionEmbeddings(config)
199
+ self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
200
+ self.encoder = ConditionalCLIPEncoder(config)
201
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
202
+
203
+ def forward(
204
+ self,
205
+ pixel_values: Optional[torch.FloatTensor] = None,
206
+ condition: Optional[torch.Tensor] = None,
207
+ output_attentions: Optional[bool] = None,
208
+ output_hidden_states: Optional[bool] = None,
209
+ return_dict: Optional[bool] = None,
210
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
211
+ output_attentions = (
212
+ output_attentions
213
+ if output_attentions is not None
214
+ else self.config.output_attentions
215
+ )
216
+ output_hidden_states = (
217
+ output_hidden_states
218
+ if output_hidden_states is not None
219
+ else self.config.output_hidden_states
220
+ )
221
+ return_dict = (
222
+ return_dict if return_dict is not None else self.config.use_return_dict
223
+ )
224
+
225
+ if pixel_values is None:
226
+ raise ValueError("You have to specify pixel_values")
227
+
228
+ hidden_states = self.embeddings(pixel_values)
229
+ hidden_states = self.pre_layrnorm(hidden_states)
230
+
231
+ encoder_outputs = self.encoder(
232
+ inputs_embeds=hidden_states,
233
+ output_attentions=output_attentions,
234
+ output_hidden_states=output_hidden_states,
235
+ condition=condition,
236
+ return_dict=return_dict,
237
+ )
238
+
239
+ last_hidden_state = encoder_outputs[0]
240
+ pooled_output = last_hidden_state[:, 0, :]
241
+ pooled_output = self.post_layernorm(pooled_output)
242
+
243
+ if not return_dict:
244
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
245
+
246
+ return BaseModelOutputWithPooling(
247
+ last_hidden_state=last_hidden_state,
248
+ pooler_output=pooled_output,
249
+ hidden_states=encoder_outputs.hidden_states,
250
+ attentions=encoder_outputs.attentions,
251
+ )
252
+
253
+
254
+ class ConditionalCLIPVisionModel(CLIPVisionModel):
255
+ config_class = ConditionalCLIPVisionConfig
256
+
257
+ def __init__(self, config: ConditionalCLIPVisionConfig):
258
+ super().__init__(config)
259
+ self.vision_model = ConditionalCLIPVisionTransformer(config)
260
+ # Initialize weights and apply final processing
261
+ self.post_init()
262
+
263
+ def forward(
264
+ self,
265
+ pixel_values: Optional[torch.FloatTensor] = None,
266
+ condition: Optional[torch.Tensor] = None,
267
+ output_attentions: Optional[bool] = None,
268
+ output_hidden_states: Optional[bool] = None,
269
+ return_dict: Optional[bool] = None,
270
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
271
+ return_dict = (
272
+ return_dict if return_dict is not None else self.config.use_return_dict
273
+ )
274
+
275
+ return self.vision_model(
276
+ pixel_values=pixel_values,
277
+ condition=condition,
278
+ output_attentions=output_attentions,
279
+ output_hidden_states=output_hidden_states,
280
+ return_dict=return_dict,
281
+ )
282
+
283
+
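A hedged usage sketch for the conditional vision tower defined above: build a tiny `ConditionalCLIPVisionConfig` and pass a per-sample `condition` vector alongside `pixel_values`. The import path and every size below are assumptions made for a quick shape check, not values used by this repository.

```python
# Shape-check sketch for ConditionalCLIPVisionModel (toy config, random inputs).
# Assumes this file is importable as `modeling_conditional_clip` (assumption).
import torch
from modeling_conditional_clip import (
    ConditionalCLIPVisionConfig,
    ConditionalCLIPVisionModel,
)

config = ConditionalCLIPVisionConfig(
    hidden_size=64, intermediate_size=128, num_hidden_layers=2,
    num_attention_heads=4, image_size=32, patch_size=8,
    modulation_dim=32,                      # size of the condition vector
)
model = ConditionalCLIPVisionModel(config).eval()

pixel_values = torch.randn(2, 3, 32, 32)
condition = torch.randn(2, 32)              # e.g. a pooled conditioning embedding

with torch.no_grad():
    out = model(pixel_values=pixel_values, condition=condition)
print(out.last_hidden_state.shape)          # (2, 1 + (32 // 8) ** 2, 64)
```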
284
+ class ConditionalCLIPModel(CLIPModel):
285
+ config_class = CLIPConfig
286
+
287
+ def __init__(self, config: CLIPConfig):
288
+ super().__init__(config)
289
+
290
+ if not isinstance(config.text_config, CLIPTextConfig):
291
+ raise ValueError(
292
+ "config.text_config is expected to be of type CLIPTextConfig but is of type"
293
+ f" {type(config.text_config)}."
294
+ )
295
+
296
+ if not isinstance(config.vision_config, CLIPVisionConfig):
297
+ raise ValueError(
298
+ "config.vision_config is expected to be of type CLIPVisionConfig but is of type"
299
+ f" {type(config.vision_config)}."
300
+ )
301
+
302
+ text_config = config.text_config
303
+ vision_config = config.vision_config
304
+
305
+ self.projection_dim = config.projection_dim
306
+ self.text_embed_dim = text_config.hidden_size
307
+ self.vision_embed_dim = vision_config.hidden_size
308
+
309
+ self.text_model = CLIPTextTransformer(text_config)
310
+ self.vision_model = ConditionalCLIPVisionTransformer(vision_config)
311
+
312
+ self.visual_projection = nn.Linear(
313
+ self.vision_embed_dim, self.projection_dim, bias=False
314
+ )
315
+ self.text_projection = nn.Linear(
316
+ self.text_embed_dim, self.projection_dim, bias=False
317
+ )
318
+ self.logit_scale = nn.Parameter(
319
+ torch.tensor(self.config.logit_scale_init_value)
320
+ )
321
+
322
+ # Initialize weights and apply final processing
323
+ self.post_init()
324
+
325
+ def get_image_features(
326
+ self,
327
+ pixel_values: Optional[torch.FloatTensor] = None,
328
+ condition: Optional[torch.Tensor] = None,
329
+ output_attentions: Optional[bool] = None,
330
+ output_hidden_states: Optional[bool] = None,
331
+ return_dict: Optional[bool] = None,
332
+ ) -> torch.FloatTensor:
333
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
334
+ output_attentions = (
335
+ output_attentions
336
+ if output_attentions is not None
337
+ else self.config.output_attentions
338
+ )
339
+ output_hidden_states = (
340
+ output_hidden_states
341
+ if output_hidden_states is not None
342
+ else self.config.output_hidden_states
343
+ )
344
+ return_dict = (
345
+ return_dict if return_dict is not None else self.config.use_return_dict
346
+ )
347
+
348
+ vision_outputs = self.vision_model(
349
+ pixel_values=pixel_values,
350
+ condition=condition,
351
+ output_attentions=output_attentions,
352
+ output_hidden_states=output_hidden_states,
353
+ return_dict=return_dict,
354
+ )
355
+
356
+ pooled_output = vision_outputs[1] # pooled_output
357
+ image_features = self.visual_projection(pooled_output)
358
+
359
+ return image_features
360
+
361
+ def forward(
362
+ self,
363
+ input_ids: Optional[torch.LongTensor] = None,
364
+ pixel_values: Optional[torch.FloatTensor] = None,
365
+ condition: Optional[torch.Tensor] = None,
366
+ attention_mask: Optional[torch.Tensor] = None,
367
+ position_ids: Optional[torch.LongTensor] = None,
368
+ return_loss: Optional[bool] = None,
369
+ output_attentions: Optional[bool] = None,
370
+ output_hidden_states: Optional[bool] = None,
371
+ return_dict: Optional[bool] = None,
372
+ ) -> Union[Tuple, CLIPOutput]:
373
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
374
+ output_attentions = (
375
+ output_attentions
376
+ if output_attentions is not None
377
+ else self.config.output_attentions
378
+ )
379
+ output_hidden_states = (
380
+ output_hidden_states
381
+ if output_hidden_states is not None
382
+ else self.config.output_hidden_states
383
+ )
384
+ return_dict = (
385
+ return_dict if return_dict is not None else self.config.use_return_dict
386
+ )
387
+
388
+ vision_outputs = self.vision_model(
389
+ pixel_values=pixel_values,
390
+ condition=condition,
391
+ output_attentions=output_attentions,
392
+ output_hidden_states=output_hidden_states,
393
+ return_dict=return_dict,
394
+ )
395
+
396
+ text_outputs = self.text_model(
397
+ input_ids=input_ids,
398
+ attention_mask=attention_mask,
399
+ position_ids=position_ids,
400
+ output_attentions=output_attentions,
401
+ output_hidden_states=output_hidden_states,
402
+ return_dict=return_dict,
403
+ )
404
+
405
+ image_embeds = vision_outputs[1]
406
+ image_embeds = self.visual_projection(image_embeds)
407
+
408
+ text_embeds = text_outputs[1]
409
+ text_embeds = self.text_projection(text_embeds)
410
+
411
+ # normalized features
412
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
413
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
414
+
415
+ # cosine similarity as logits
416
+ logit_scale = self.logit_scale.exp()
417
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
418
+ logits_per_image = logits_per_text.t()
419
+
420
+ loss = None
421
+ if return_loss:
422
+ loss = clip_loss(logits_per_text)
423
+
424
+ if not return_dict:
425
+ output = (
426
+ logits_per_image,
427
+ logits_per_text,
428
+ text_embeds,
429
+ image_embeds,
430
+ text_outputs,
431
+ vision_outputs,
432
+ )
433
+ return ((loss,) + output) if loss is not None else output
434
+
435
+ return CLIPOutput(
436
+ loss=loss,
437
+ logits_per_image=logits_per_image,
438
+ logits_per_text=logits_per_text,
439
+ text_embeds=text_embeds,
440
+ image_embeds=image_embeds,
441
+ text_model_output=text_outputs,
442
+ vision_model_output=vision_outputs,
443
+ )
step1x3d_geometry/models/conditional_encoders/dinov2/modeling_conditional_dinov2.py ADDED
@@ -0,0 +1,248 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ # Reference:
16
+ # * transformers/models/dinov2/modeling_dinov2.py
17
+ # * https://github.com/facebookresearch/DiT/blob/main/models.py#L101
18
+ # * https://github.com/3DTopia/OpenLRM/tree/main/openlrm/models/encoders/dinov2
19
+ """PyTorch DINOv2 model."""
20
+
21
+ from typing import Dict, List, Optional, Set, Tuple, Union
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+
26
+ from .modeling_dinov2 import (
27
+ Dinov2Config,
28
+ Dinov2Layer,
29
+ Dinov2Model,
30
+ Dinov2Embeddings,
31
+ BaseModelOutput,
32
+ BaseModelOutputWithPooling,
33
+ )
34
+
35
+
36
+ class ModLN(nn.Module):
37
+ def __init__(self, inner_dim: int, mod_dim: int = 1024):
38
+ super().__init__()
39
+ self.mlp = nn.Sequential(
40
+ nn.SiLU(),
41
+ nn.Linear(mod_dim, inner_dim * 2),
42
+ )
43
+
44
+ for m in self.modules():
45
+ if isinstance(m, nn.Linear):
46
+ nn.init.zeros_(m.weight)
47
+ nn.init.zeros_(m.bias)
48
+
49
+ def forward(self, x: torch.Tensor, condition: torch.Tensor):
50
+ """
51
+ x: [N, M, C_in], M: num of tokens
52
+ condition: [N, C_mod]
53
+ """
54
+ shift, scale = self.mlp(condition).unsqueeze(1).chunk(2, dim=-1)
55
+ return x * (1 + scale) + shift
56
+
57
+
58
+ class ConditionalDinov2Config(Dinov2Config):
59
+ def __init__(self, modulation_dim: int = 1024, *args, **kwargs):
60
+ super().__init__(*args, **kwargs)
61
+ self.modulation_dim = modulation_dim
62
+
63
+
64
+ class ConditionalDinov2Layer(Dinov2Layer):
65
+ """This corresponds to the Block class in the original implementation."""
66
+
67
+ def __init__(self, config: ConditionalDinov2Config) -> None:
68
+ super().__init__(config)
69
+ self.mod_norm1 = ModLN(config.hidden_size, config.modulation_dim)
70
+ self.mod_norm2 = ModLN(config.hidden_size, config.modulation_dim)
71
+
72
+ def forward(
73
+ self,
74
+ hidden_states: torch.Tensor,
75
+ head_mask: Optional[torch.Tensor] = None,
76
+ condition: Optional[torch.Tensor] = None,
77
+ output_attentions: bool = False,
78
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
79
+ self_attention_outputs = self.attention(
80
+ self.mod_norm1(
81
+ self.norm1(hidden_states), condition
82
+ ), # in Dinov2, layernorm is applied before self-attention
83
+ head_mask,
84
+ output_attentions=output_attentions,
85
+ )
86
+ attention_output = self_attention_outputs[0]
87
+
88
+ attention_output = self.layer_scale1(attention_output)
89
+ outputs = self_attention_outputs[
90
+ 1:
91
+ ] # add self attentions if we output attention weights
92
+
93
+ # first residual connection
94
+ hidden_states = self.drop_path(attention_output) + hidden_states
95
+
96
+ # in Dinov2, layernorm is also applied after self-attention
97
+ layer_output = self.mod_norm2(self.norm2(hidden_states), condition)
98
+ layer_output = self.mlp(layer_output)
99
+ layer_output = self.layer_scale2(layer_output)
100
+
101
+ # second residual connection
102
+ layer_output = self.drop_path(layer_output) + hidden_states
103
+
104
+ outputs = (layer_output,) + outputs
105
+
106
+ return outputs
107
+
108
+
109
+ # Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->Dinov2
110
+ class ConditionalDinov2Encoder(nn.Module):
111
+ def __init__(self, config: ConditionalDinov2Config) -> None:
112
+ super().__init__()
113
+ self.config = config
114
+ self.layer = nn.ModuleList(
115
+ [ConditionalDinov2Layer(config) for _ in range(config.num_hidden_layers)]
116
+ )
117
+ self.gradient_checkpointing = False
118
+
119
+ def forward(
120
+ self,
121
+ hidden_states: torch.Tensor,
122
+ head_mask: Optional[torch.Tensor] = None,
123
+ output_attentions: bool = False,
124
+ output_hidden_states: bool = False,
125
+ condition: Optional[torch.Tensor] = None,
126
+ return_dict: bool = True,
127
+ ) -> Union[tuple, BaseModelOutput]:
128
+ all_hidden_states = () if output_hidden_states else None
129
+ all_self_attentions = () if output_attentions else None
130
+
131
+ for i, layer_module in enumerate(self.layer):
132
+ if output_hidden_states:
133
+ all_hidden_states = all_hidden_states + (hidden_states,)
134
+
135
+ layer_head_mask = head_mask[i] if head_mask is not None else None
136
+
137
+ if self.gradient_checkpointing and self.training:
138
+ layer_outputs = self._gradient_checkpointing_func(
139
+ layer_module.__call__,
140
+ hidden_states,
141
+ layer_head_mask,
142
+ condition,
143
+ output_attentions,
144
+ )
145
+ else:
146
+ layer_outputs = layer_module(
147
+ hidden_states,
148
+ layer_head_mask,
149
+ condition,
150
+ output_attentions,
151
+ )
152
+
153
+ hidden_states = layer_outputs[0]
154
+
155
+ if output_attentions:
156
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
157
+
158
+ if output_hidden_states:
159
+ all_hidden_states = all_hidden_states + (hidden_states,)
160
+
161
+ if not return_dict:
162
+ return tuple(
163
+ v
164
+ for v in [hidden_states, all_hidden_states, all_self_attentions]
165
+ if v is not None
166
+ )
167
+ return BaseModelOutput(
168
+ last_hidden_state=hidden_states,
169
+ hidden_states=all_hidden_states,
170
+ attentions=all_self_attentions,
171
+ )
172
+
173
+
174
+ class ConditionalDinov2Model(Dinov2Model):
175
+ config_class = ConditionalDinov2Config
176
+
177
+ def __init__(self, config: ConditionalDinov2Config):
178
+ super().__init__(config)
179
+ self.config = config
180
+
181
+ self.embeddings = Dinov2Embeddings(config)
182
+ self.encoder = ConditionalDinov2Encoder(config)
183
+
184
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
185
+
186
+ # Initialize weights and apply final processing
187
+ self.post_init()
188
+
189
+ def forward(
190
+ self,
191
+ pixel_values: Optional[torch.Tensor] = None,
192
+ bool_masked_pos: Optional[torch.Tensor] = None,
193
+ head_mask: Optional[torch.Tensor] = None,
194
+ condition: Optional[torch.Tensor] = None,
195
+ output_attentions: Optional[bool] = None,
196
+ output_hidden_states: Optional[bool] = None,
197
+ return_dict: Optional[bool] = None,
198
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
199
+ output_attentions = (
200
+ output_attentions
201
+ if output_attentions is not None
202
+ else self.config.output_attentions
203
+ )
204
+ output_hidden_states = (
205
+ output_hidden_states
206
+ if output_hidden_states is not None
207
+ else self.config.output_hidden_states
208
+ )
209
+ return_dict = (
210
+ return_dict if return_dict is not None else self.config.use_return_dict
211
+ )
212
+
213
+ if pixel_values is None:
214
+ raise ValueError("You have to specify pixel_values")
215
+
216
+ # Prepare head mask if needed
217
+ # 1.0 in head_mask indicate we keep the head
218
+ # attention_probs has shape bsz x n_heads x N x N
219
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
220
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
221
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
222
+
223
+ embedding_output = self.embeddings(
224
+ pixel_values, bool_masked_pos=bool_masked_pos
225
+ )
226
+
227
+ encoder_outputs = self.encoder(
228
+ embedding_output,
229
+ head_mask=head_mask,
230
+ output_attentions=output_attentions,
231
+ output_hidden_states=output_hidden_states,
232
+ condition=condition,
233
+ return_dict=return_dict,
234
+ )
235
+ sequence_output = encoder_outputs[0]
236
+ sequence_output = self.layernorm(sequence_output)
237
+ pooled_output = sequence_output[:, 0, :]
238
+
239
+ if not return_dict:
240
+ head_outputs = (sequence_output, pooled_output)
241
+ return head_outputs + encoder_outputs[1:]
242
+
243
+ return BaseModelOutputWithPooling(
244
+ last_hidden_state=sequence_output,
245
+ pooler_output=pooled_output,
246
+ hidden_states=encoder_outputs.hidden_states,
247
+ attentions=encoder_outputs.attentions,
248
+ )
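
The `ConditionalDinov2Model` above mirrors the vanilla DINOv2 encoder but threads an extra `condition` tensor (camera embeddings in this repo) through every encoder layer. Below is a minimal usage sketch, not taken from the repo: the checkpoint name and modulation width are illustrative, and it assumes the conditional config can be populated from a standard DINOv2 checkpoint, which is how `dinov2_clip_encoder.py` builds it further down.

```python
import torch
from step1x3d_geometry.models.conditional_encoders.dinov2.modeling_conditional_dinov2 import (
    ConditionalDinov2Model,
)

# Load a plain DINOv2 config, attach the modulation width expected by the
# conditional layers, then build the model (names and sizes are illustrative).
config = ConditionalDinov2Model.config_class.from_pretrained("facebook/dinov2-base")
config.modulation_dim = 32  # hypothetical camera-embedding dimension
model = ConditionalDinov2Model.from_pretrained("facebook/dinov2-base", config=config)

pixel_values = torch.randn(2, 3, 224, 224)   # (batch, channels, height, width)
condition = torch.randn(2, 32)               # one modulation vector per image
outputs = model(pixel_values=pixel_values, condition=condition)
print(outputs.last_hidden_state.shape)       # (2, 257, 768) for a base-sized model
```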
step1x3d_geometry/models/conditional_encoders/dinov2/modeling_dinov2.py ADDED
@@ -0,0 +1,978 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch DINOv2 model."""
16
+
17
+
18
+ import collections.abc
19
+ import math
20
+ from typing import Dict, List, Optional, Set, Tuple, Union
21
+
22
+ import torch
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
26
+
27
+ from transformers.activations import ACT2FN
28
+ from transformers.modeling_outputs import (
29
+ BackboneOutput,
30
+ BaseModelOutput,
31
+ BaseModelOutputWithPooling,
32
+ ImageClassifierOutput,
33
+ )
34
+ from transformers.modeling_utils import PreTrainedModel
35
+ from transformers.pytorch_utils import (
36
+ find_pruneable_heads_and_indices,
37
+ prune_linear_layer,
38
+ )
39
+ from transformers.utils import (
40
+ add_code_sample_docstrings,
41
+ add_start_docstrings,
42
+ add_start_docstrings_to_model_forward,
43
+ logging,
44
+ replace_return_docstrings,
45
+ )
46
+ from transformers.utils.backbone_utils import BackboneMixin
47
+ from transformers.models.dinov2.configuration_dinov2 import Dinov2Config
48
+
49
+
50
+ logger = logging.get_logger(__name__)
51
+
52
+ # General docstring
53
+ _CONFIG_FOR_DOC = "Dinov2Config"
54
+
55
+ # Base docstring
56
+ _CHECKPOINT_FOR_DOC = "facebook/dinov2-base"
57
+ _EXPECTED_OUTPUT_SHAPE = [1, 257, 768]
58
+
59
+ # Image classification docstring
60
+ _IMAGE_CLASS_CHECKPOINT = "facebook/dinov2-small-imagenet1k-1-layer"
61
+ _IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
62
+
63
+
64
+ DINOV2_PRETRAINED_MODEL_ARCHIVE_LIST = [
65
+ "facebook/dinov2-base",
66
+ # See all DINOv2 models at https://huggingface.co/models?filter=dinov2
67
+ ]
68
+
69
+
70
+ class Dinov2Embeddings(nn.Module):
71
+ """
72
+ Construct the CLS token, mask token, position and patch embeddings.
73
+ """
74
+
75
+ def __init__(self, config: Dinov2Config) -> None:
76
+ super().__init__()
77
+
78
+ self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
79
+ self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
80
+ self.patch_embeddings = Dinov2PatchEmbeddings(config)
81
+ num_patches = self.patch_embeddings.num_patches
82
+ self.position_embeddings = nn.Parameter(
83
+ torch.randn(1, num_patches + 1, config.hidden_size)
84
+ )
85
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
86
+ self.config = config
87
+
88
+ def interpolate_pos_encoding(
89
+ self, embeddings: torch.Tensor, height: int, width: int
90
+ ) -> torch.Tensor:
91
+ """
92
+ This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
93
+ resolution images.
94
+
95
+ Source:
96
+ https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
97
+ """
98
+
99
+ num_patches = embeddings.shape[1] - 1
100
+ num_positions = self.position_embeddings.shape[1] - 1
101
+ if num_patches == num_positions and height == width:
102
+ return self.position_embeddings
103
+ class_pos_embed = self.position_embeddings[:, 0]
104
+ patch_pos_embed = self.position_embeddings[:, 1:]
105
+ dim = embeddings.shape[-1]
106
+ height = height // self.config.patch_size
107
+ width = width // self.config.patch_size
108
+ # we add a small number to avoid floating point error in the interpolation
109
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
110
+ height, width = height + 0.1, width + 0.1
111
+ patch_pos_embed = patch_pos_embed.reshape(
112
+ 1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim
113
+ )
114
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
115
+ target_dtype = patch_pos_embed.dtype
116
+ patch_pos_embed = nn.functional.interpolate(
117
+ patch_pos_embed.to(dtype=torch.float32),
118
+ scale_factor=(
119
+ float(height / math.sqrt(num_positions)),
120
+ float(width / math.sqrt(num_positions)),
121
+ ),
122
+ mode="bicubic",
123
+ align_corners=False,
124
+ ).to(dtype=target_dtype)
125
+ if (
126
+ int(height) != patch_pos_embed.shape[-2]
127
+ or int(width) != patch_pos_embed.shape[-1]
128
+ ):
129
+ raise ValueError(
130
+ "Width or height does not match with the interpolated position embeddings"
131
+ )
132
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
133
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
134
+
135
+ def forward(
136
+ self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None
137
+ ) -> torch.Tensor:
138
+ batch_size, _, height, width = pixel_values.shape
139
+ target_dtype = self.patch_embeddings.projection.weight.dtype
140
+ embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
141
+
142
+ if bool_masked_pos is not None:
143
+ embeddings = torch.where(
144
+ bool_masked_pos.unsqueeze(-1),
145
+ self.mask_token.to(embeddings.dtype).unsqueeze(0),
146
+ embeddings,
147
+ )
148
+
149
+ # add the [CLS] token to the embedded patch tokens
150
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1)
151
+ embeddings = torch.cat((cls_tokens, embeddings), dim=1)
152
+
153
+ # add positional encoding to each token
154
+ embeddings = embeddings + self.interpolate_pos_encoding(
155
+ embeddings, height, width
156
+ )
157
+
158
+ embeddings = self.dropout(embeddings)
159
+
160
+ return embeddings
161
+
162
+
163
+ class Dinov2PatchEmbeddings(nn.Module):
164
+ """
165
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
166
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
167
+ Transformer.
168
+ """
169
+
170
+ def __init__(self, config):
171
+ super().__init__()
172
+ image_size, patch_size = config.image_size, config.patch_size
173
+ num_channels, hidden_size = config.num_channels, config.hidden_size
174
+
175
+ image_size = (
176
+ image_size
177
+ if isinstance(image_size, collections.abc.Iterable)
178
+ else (image_size, image_size)
179
+ )
180
+ patch_size = (
181
+ patch_size
182
+ if isinstance(patch_size, collections.abc.Iterable)
183
+ else (patch_size, patch_size)
184
+ )
185
+ num_patches = (image_size[1] // patch_size[1]) * (
186
+ image_size[0] // patch_size[0]
187
+ )
188
+ self.image_size = image_size
189
+ self.patch_size = patch_size
190
+ self.num_channels = num_channels
191
+ self.num_patches = num_patches
192
+
193
+ self.projection = nn.Conv2d(
194
+ num_channels, hidden_size, kernel_size=patch_size, stride=patch_size
195
+ )
196
+
197
+ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
198
+ num_channels = pixel_values.shape[1]
199
+ if num_channels != self.num_channels:
200
+ raise ValueError(
201
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
202
+ f" Expected {self.num_channels} but got {num_channels}."
203
+ )
204
+ embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
205
+ return embeddings
206
+
207
+
208
+ # Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->Dinov2
209
+ class Dinov2SelfAttention(nn.Module):
210
+ def __init__(self, config: Dinov2Config) -> None:
211
+ super().__init__()
212
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
213
+ config, "embedding_size"
214
+ ):
215
+ raise ValueError(
216
+ f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
217
+ f"heads {config.num_attention_heads}."
218
+ )
219
+
220
+ self.num_attention_heads = config.num_attention_heads
221
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
222
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
223
+
224
+ self.query = nn.Linear(
225
+ config.hidden_size, self.all_head_size, bias=config.qkv_bias
226
+ )
227
+ self.key = nn.Linear(
228
+ config.hidden_size, self.all_head_size, bias=config.qkv_bias
229
+ )
230
+ self.value = nn.Linear(
231
+ config.hidden_size, self.all_head_size, bias=config.qkv_bias
232
+ )
233
+
234
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
235
+
236
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
237
+ new_x_shape = x.size()[:-1] + (
238
+ self.num_attention_heads,
239
+ self.attention_head_size,
240
+ )
241
+ x = x.view(new_x_shape)
242
+ return x.permute(0, 2, 1, 3)
243
+
244
+ def forward(
245
+ self,
246
+ hidden_states,
247
+ head_mask: Optional[torch.Tensor] = None,
248
+ output_attentions: bool = False,
249
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
250
+ mixed_query_layer = self.query(hidden_states)
251
+
252
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
253
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
254
+ query_layer = self.transpose_for_scores(mixed_query_layer)
255
+
256
+ # Take the dot product between "query" and "key" to get the raw attention scores.
257
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
258
+
259
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
260
+
261
+ # Normalize the attention scores to probabilities.
262
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
263
+
264
+ # This is actually dropping out entire tokens to attend to, which might
265
+ # seem a bit unusual, but is taken from the original Transformer paper.
266
+ attention_probs = self.dropout(attention_probs)
267
+
268
+ # Mask heads if we want to
269
+ if head_mask is not None:
270
+ attention_probs = attention_probs * head_mask
271
+
272
+ context_layer = torch.matmul(attention_probs, value_layer)
273
+
274
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
275
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
276
+ context_layer = context_layer.view(new_context_layer_shape)
277
+
278
+ outputs = (
279
+ (context_layer, attention_probs) if output_attentions else (context_layer,)
280
+ )
281
+
282
+ return outputs
283
+
284
+
285
+ # Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->Dinov2
286
+ class Dinov2SelfOutput(nn.Module):
287
+ """
288
+ The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
289
+ layernorm applied before each block.
290
+ """
291
+
292
+ def __init__(self, config: Dinov2Config) -> None:
293
+ super().__init__()
294
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
295
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
296
+
297
+ def forward(
298
+ self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
299
+ ) -> torch.Tensor:
300
+ hidden_states = self.dense(hidden_states)
301
+ hidden_states = self.dropout(hidden_states)
302
+
303
+ return hidden_states
304
+
305
+
306
+ # Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->Dinov2
307
+ class Dinov2Attention(nn.Module):
308
+ def __init__(self, config: Dinov2Config) -> None:
309
+ super().__init__()
310
+ self.attention = Dinov2SelfAttention(config)
311
+ self.output = Dinov2SelfOutput(config)
312
+ self.pruned_heads = set()
313
+
314
+ def prune_heads(self, heads: Set[int]) -> None:
315
+ if len(heads) == 0:
316
+ return
317
+ heads, index = find_pruneable_heads_and_indices(
318
+ heads,
319
+ self.attention.num_attention_heads,
320
+ self.attention.attention_head_size,
321
+ self.pruned_heads,
322
+ )
323
+
324
+ # Prune linear layers
325
+ self.attention.query = prune_linear_layer(self.attention.query, index)
326
+ self.attention.key = prune_linear_layer(self.attention.key, index)
327
+ self.attention.value = prune_linear_layer(self.attention.value, index)
328
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
329
+
330
+ # Update hyper params and store pruned heads
331
+ self.attention.num_attention_heads = self.attention.num_attention_heads - len(
332
+ heads
333
+ )
334
+ self.attention.all_head_size = (
335
+ self.attention.attention_head_size * self.attention.num_attention_heads
336
+ )
337
+ self.pruned_heads = self.pruned_heads.union(heads)
338
+
339
+ def forward(
340
+ self,
341
+ hidden_states: torch.Tensor,
342
+ head_mask: Optional[torch.Tensor] = None,
343
+ output_attentions: bool = False,
344
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
345
+ self_outputs = self.attention(hidden_states, head_mask, output_attentions)
346
+
347
+ attention_output = self.output(self_outputs[0], hidden_states)
348
+
349
+ outputs = (attention_output,) + self_outputs[
350
+ 1:
351
+ ] # add attentions if we output them
352
+ return outputs
353
+
354
+
355
+ class Dinov2LayerScale(nn.Module):
356
+ def __init__(self, config) -> None:
357
+ super().__init__()
358
+ self.lambda1 = nn.Parameter(
359
+ config.layerscale_value * torch.ones(config.hidden_size)
360
+ )
361
+
362
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
363
+ return hidden_state * self.lambda1
364
+
365
+
366
+ # Copied from transformers.models.beit.modeling_beit.drop_path
367
+ def drop_path(
368
+ input: torch.Tensor, drop_prob: float = 0.0, training: bool = False
369
+ ) -> torch.Tensor:
370
+ """
371
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
372
+
373
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
374
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
375
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
376
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
377
+ argument.
378
+ """
379
+ if drop_prob == 0.0 or not training:
380
+ return input
381
+ keep_prob = 1 - drop_prob
382
+ shape = (input.shape[0],) + (1,) * (
383
+ input.ndim - 1
384
+ ) # work with diff dim tensors, not just 2D ConvNets
385
+ random_tensor = keep_prob + torch.rand(
386
+ shape, dtype=input.dtype, device=input.device
387
+ )
388
+ random_tensor.floor_() # binarize
389
+ output = input.div(keep_prob) * random_tensor
390
+ return output
391
+
392
+
393
+ # Copied from transformers.models.beit.modeling_beit.BeitDropPath
394
+ class Dinov2DropPath(nn.Module):
395
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
396
+
397
+ def __init__(self, drop_prob: Optional[float] = None) -> None:
398
+ super().__init__()
399
+ self.drop_prob = drop_prob
400
+
401
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
402
+ return drop_path(hidden_states, self.drop_prob, self.training)
403
+
404
+ def extra_repr(self) -> str:
405
+ return "p={}".format(self.drop_prob)
406
+
407
+
408
+ class Dinov2MLP(nn.Module):
409
+ def __init__(self, config) -> None:
410
+ super().__init__()
411
+ in_features = out_features = config.hidden_size
412
+ hidden_features = int(config.hidden_size * config.mlp_ratio)
413
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
414
+ if isinstance(config.hidden_act, str):
415
+ self.activation = ACT2FN[config.hidden_act]
416
+ else:
417
+ self.activation = config.hidden_act
418
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=True)
419
+
420
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
421
+ hidden_state = self.fc1(hidden_state)
422
+ hidden_state = self.activation(hidden_state)
423
+ hidden_state = self.fc2(hidden_state)
424
+ return hidden_state
425
+
426
+
427
+ class Dinov2SwiGLUFFN(nn.Module):
428
+ def __init__(self, config) -> None:
429
+ super().__init__()
430
+ in_features = out_features = config.hidden_size
431
+ hidden_features = int(config.hidden_size * config.mlp_ratio)
432
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
433
+
434
+ self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True)
435
+ self.weights_out = nn.Linear(hidden_features, out_features, bias=True)
436
+
437
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
438
+ hidden_state = self.weights_in(hidden_state)
439
+ x1, x2 = hidden_state.chunk(2, dim=-1)
440
+ hidden = nn.functional.silu(x1) * x2
441
+ return self.weights_out(hidden)
442
+
443
+
444
+ class Dinov2Layer(nn.Module):
445
+ """This corresponds to the Block class in the original implementation."""
446
+
447
+ def __init__(self, config: Dinov2Config) -> None:
448
+ super().__init__()
449
+
450
+ self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
451
+ self.attention = Dinov2Attention(config)
452
+ self.layer_scale1 = Dinov2LayerScale(config)
453
+ self.drop_path = (
454
+ Dinov2DropPath(config.drop_path_rate)
455
+ if config.drop_path_rate > 0.0
456
+ else nn.Identity()
457
+ )
458
+
459
+ self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
460
+
461
+ if config.use_swiglu_ffn:
462
+ self.mlp = Dinov2SwiGLUFFN(config)
463
+ else:
464
+ self.mlp = Dinov2MLP(config)
465
+ self.layer_scale2 = Dinov2LayerScale(config)
466
+
467
+ def forward(
468
+ self,
469
+ hidden_states: torch.Tensor,
470
+ head_mask: Optional[torch.Tensor] = None,
471
+ output_attentions: bool = False,
472
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
473
+ self_attention_outputs = self.attention(
474
+ self.norm1(
475
+ hidden_states
476
+ ), # in Dinov2, layernorm is applied before self-attention
477
+ head_mask,
478
+ output_attentions=output_attentions,
479
+ )
480
+ attention_output = self_attention_outputs[0]
481
+
482
+ attention_output = self.layer_scale1(attention_output)
483
+ outputs = self_attention_outputs[
484
+ 1:
485
+ ] # add self attentions if we output attention weights
486
+
487
+ # first residual connection
488
+ hidden_states = self.drop_path(attention_output) + hidden_states
489
+
490
+ # in Dinov2, layernorm is also applied after self-attention
491
+ layer_output = self.norm2(hidden_states)
492
+ layer_output = self.mlp(layer_output)
493
+ layer_output = self.layer_scale2(layer_output)
494
+
495
+ # second residual connection
496
+ layer_output = self.drop_path(layer_output) + hidden_states
497
+
498
+ outputs = (layer_output,) + outputs
499
+
500
+ return outputs
501
+
502
+
503
+ # Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->Dinov2
504
+ class Dinov2Encoder(nn.Module):
505
+ def __init__(self, config: Dinov2Config) -> None:
506
+ super().__init__()
507
+ self.config = config
508
+ self.layer = nn.ModuleList(
509
+ [Dinov2Layer(config) for _ in range(config.num_hidden_layers)]
510
+ )
511
+ self.gradient_checkpointing = False
512
+
513
+ def forward(
514
+ self,
515
+ hidden_states: torch.Tensor,
516
+ head_mask: Optional[torch.Tensor] = None,
517
+ output_attentions: bool = False,
518
+ output_hidden_states: bool = False,
519
+ return_dict: bool = True,
520
+ ) -> Union[tuple, BaseModelOutput]:
521
+ all_hidden_states = () if output_hidden_states else None
522
+ all_self_attentions = () if output_attentions else None
523
+
524
+ for i, layer_module in enumerate(self.layer):
525
+ if output_hidden_states:
526
+ all_hidden_states = all_hidden_states + (hidden_states,)
527
+
528
+ layer_head_mask = head_mask[i] if head_mask is not None else None
529
+
530
+ if self.gradient_checkpointing and self.training:
531
+ layer_outputs = self._gradient_checkpointing_func(
532
+ layer_module.__call__,
533
+ hidden_states,
534
+ layer_head_mask,
535
+ output_attentions,
536
+ )
537
+ else:
538
+ layer_outputs = layer_module(
539
+ hidden_states, layer_head_mask, output_attentions
540
+ )
541
+
542
+ hidden_states = layer_outputs[0]
543
+
544
+ if output_attentions:
545
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
546
+
547
+ if output_hidden_states:
548
+ all_hidden_states = all_hidden_states + (hidden_states,)
549
+
550
+ if not return_dict:
551
+ return tuple(
552
+ v
553
+ for v in [hidden_states, all_hidden_states, all_self_attentions]
554
+ if v is not None
555
+ )
556
+ return BaseModelOutput(
557
+ last_hidden_state=hidden_states,
558
+ hidden_states=all_hidden_states,
559
+ attentions=all_self_attentions,
560
+ )
561
+
562
+
563
+ class Dinov2PreTrainedModel(PreTrainedModel):
564
+ """
565
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
566
+ models.
567
+ """
568
+
569
+ config_class = Dinov2Config
570
+ base_model_prefix = "dinov2"
571
+ main_input_name = "pixel_values"
572
+ supports_gradient_checkpointing = True
573
+
574
+ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
575
+ """Initialize the weights"""
576
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
577
+ # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
578
+ # `trunc_normal_cpu` not implemented in `half` issues
579
+ module.weight.data = nn.init.trunc_normal_(
580
+ module.weight.data.to(torch.float32),
581
+ mean=0.0,
582
+ std=self.config.initializer_range,
583
+ ).to(module.weight.dtype)
584
+ if module.bias is not None:
585
+ module.bias.data.zero_()
586
+ elif isinstance(module, nn.LayerNorm):
587
+ module.bias.data.zero_()
588
+ module.weight.data.fill_(1.0)
589
+ elif isinstance(module, Dinov2Embeddings):
590
+ module.position_embeddings.data = nn.init.trunc_normal_(
591
+ module.position_embeddings.data.to(torch.float32),
592
+ mean=0.0,
593
+ std=self.config.initializer_range,
594
+ ).to(module.position_embeddings.dtype)
595
+
596
+ module.cls_token.data = nn.init.trunc_normal_(
597
+ module.cls_token.data.to(torch.float32),
598
+ mean=0.0,
599
+ std=self.config.initializer_range,
600
+ ).to(module.cls_token.dtype)
601
+
602
+
603
+ DINOV2_START_DOCSTRING = r"""
604
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
605
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
606
+ behavior.
607
+
608
+ Parameters:
609
+ config ([`Dinov2Config`]): Model configuration class with all the parameters of the model.
610
+ Initializing with a config file does not load the weights associated with the model, only the
611
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
612
+ """
613
+
614
+ DINOV2_BASE_INPUTS_DOCSTRING = r"""
615
+ Args:
616
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
617
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
618
+ [`BitImageProcessor.preprocess`] for details.
619
+
620
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
621
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
622
+ pre-training.
623
+
624
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
625
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
626
+
627
+ - 1 indicates the head is **not masked**,
628
+ - 0 indicates the head is **masked**.
629
+
630
+ output_attentions (`bool`, *optional*):
631
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
632
+ tensors for more detail.
633
+ output_hidden_states (`bool`, *optional*):
634
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
635
+ more detail.
636
+ return_dict (`bool`, *optional*):
637
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
638
+ """
639
+
640
+ DINOV2_INPUTS_DOCSTRING = r"""
641
+ Args:
642
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
643
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
644
+ [`BitImageProcessor.preprocess`] for details.
645
+
646
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
647
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
648
+
649
+ - 1 indicates the head is **not masked**,
650
+ - 0 indicates the head is **masked**.
651
+
652
+ output_attentions (`bool`, *optional*):
653
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
654
+ tensors for more detail.
655
+ output_hidden_states (`bool`, *optional*):
656
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
657
+ more detail.
658
+ return_dict (`bool`, *optional*):
659
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
660
+ """
661
+
662
+
663
+ @add_start_docstrings(
664
+ "The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top.",
665
+ DINOV2_START_DOCSTRING,
666
+ )
667
+ class Dinov2Model(Dinov2PreTrainedModel):
668
+ def __init__(self, config: Dinov2Config):
669
+ super().__init__(config)
670
+ self.config = config
671
+
672
+ self.embeddings = Dinov2Embeddings(config)
673
+ self.encoder = Dinov2Encoder(config)
674
+
675
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
676
+
677
+ # Initialize weights and apply final processing
678
+ self.post_init()
679
+
680
+ def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
681
+ return self.embeddings.patch_embeddings
682
+
683
+ def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
684
+ """
685
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
686
+ class PreTrainedModel
687
+ """
688
+ for layer, heads in heads_to_prune.items():
689
+ self.encoder.layer[layer].attention.prune_heads(heads)
690
+
691
+ @add_start_docstrings_to_model_forward(DINOV2_BASE_INPUTS_DOCSTRING)
692
+ @add_code_sample_docstrings(
693
+ checkpoint=_CHECKPOINT_FOR_DOC,
694
+ output_type=BaseModelOutputWithPooling,
695
+ config_class=_CONFIG_FOR_DOC,
696
+ modality="vision",
697
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
698
+ )
699
+ def forward(
700
+ self,
701
+ pixel_values: Optional[torch.Tensor] = None,
702
+ bool_masked_pos: Optional[torch.Tensor] = None,
703
+ head_mask: Optional[torch.Tensor] = None,
704
+ output_attentions: Optional[bool] = None,
705
+ output_hidden_states: Optional[bool] = None,
706
+ return_dict: Optional[bool] = None,
707
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
708
+ output_attentions = (
709
+ output_attentions
710
+ if output_attentions is not None
711
+ else self.config.output_attentions
712
+ )
713
+ output_hidden_states = (
714
+ output_hidden_states
715
+ if output_hidden_states is not None
716
+ else self.config.output_hidden_states
717
+ )
718
+ return_dict = (
719
+ return_dict if return_dict is not None else self.config.use_return_dict
720
+ )
721
+
722
+ if pixel_values is None:
723
+ raise ValueError("You have to specify pixel_values")
724
+
725
+ # Prepare head mask if needed
726
+ # 1.0 in head_mask indicate we keep the head
727
+ # attention_probs has shape bsz x n_heads x N x N
728
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
729
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
730
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
731
+
732
+ embedding_output = self.embeddings(
733
+ pixel_values, bool_masked_pos=bool_masked_pos
734
+ )
735
+
736
+ encoder_outputs = self.encoder(
737
+ embedding_output,
738
+ head_mask=head_mask,
739
+ output_attentions=output_attentions,
740
+ output_hidden_states=output_hidden_states,
741
+ return_dict=return_dict,
742
+ )
743
+ sequence_output = encoder_outputs[0]
744
+ sequence_output = self.layernorm(sequence_output)
745
+ pooled_output = sequence_output[:, 0, :]
746
+
747
+ if not return_dict:
748
+ head_outputs = (sequence_output, pooled_output)
749
+ return head_outputs + encoder_outputs[1:]
750
+
751
+ return BaseModelOutputWithPooling(
752
+ last_hidden_state=sequence_output,
753
+ pooler_output=pooled_output,
754
+ hidden_states=encoder_outputs.hidden_states,
755
+ attentions=encoder_outputs.attentions,
756
+ )
757
+
758
+
759
+ @add_start_docstrings(
760
+ """
761
+ Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
762
+ of the [CLS] token) e.g. for ImageNet.
763
+ """,
764
+ DINOV2_START_DOCSTRING,
765
+ )
766
+ class Dinov2ForImageClassification(Dinov2PreTrainedModel):
767
+ def __init__(self, config: Dinov2Config) -> None:
768
+ super().__init__(config)
769
+
770
+ self.num_labels = config.num_labels
771
+ self.dinov2 = Dinov2Model(config)
772
+
773
+ # Classifier head
774
+ self.classifier = (
775
+ nn.Linear(config.hidden_size * 2, config.num_labels)
776
+ if config.num_labels > 0
777
+ else nn.Identity()
778
+ )
779
+
780
+ # Initialize weights and apply final processing
781
+ self.post_init()
782
+
783
+ @add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
784
+ @add_code_sample_docstrings(
785
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
786
+ output_type=ImageClassifierOutput,
787
+ config_class=_CONFIG_FOR_DOC,
788
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
789
+ )
790
+ def forward(
791
+ self,
792
+ pixel_values: Optional[torch.Tensor] = None,
793
+ head_mask: Optional[torch.Tensor] = None,
794
+ labels: Optional[torch.Tensor] = None,
795
+ output_attentions: Optional[bool] = None,
796
+ output_hidden_states: Optional[bool] = None,
797
+ return_dict: Optional[bool] = None,
798
+ ) -> Union[tuple, ImageClassifierOutput]:
799
+ r"""
800
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
801
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
802
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
803
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
804
+ """
805
+ return_dict = (
806
+ return_dict if return_dict is not None else self.config.use_return_dict
807
+ )
808
+
809
+ outputs = self.dinov2(
810
+ pixel_values,
811
+ head_mask=head_mask,
812
+ output_attentions=output_attentions,
813
+ output_hidden_states=output_hidden_states,
814
+ return_dict=return_dict,
815
+ )
816
+
817
+ sequence_output = outputs[0] # batch_size, sequence_length, hidden_size
818
+
819
+ cls_token = sequence_output[:, 0]
820
+ patch_tokens = sequence_output[:, 1:]
821
+
822
+ linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)
823
+
824
+ logits = self.classifier(linear_input)
825
+
826
+ loss = None
827
+ if labels is not None:
828
+ # move labels to correct device to enable model parallelism
829
+ labels = labels.to(logits.device)
830
+ if self.config.problem_type is None:
831
+ if self.num_labels == 1:
832
+ self.config.problem_type = "regression"
833
+ elif self.num_labels > 1 and (
834
+ labels.dtype == torch.long or labels.dtype == torch.int
835
+ ):
836
+ self.config.problem_type = "single_label_classification"
837
+ else:
838
+ self.config.problem_type = "multi_label_classification"
839
+
840
+ if self.config.problem_type == "regression":
841
+ loss_fct = MSELoss()
842
+ if self.num_labels == 1:
843
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
844
+ else:
845
+ loss = loss_fct(logits, labels)
846
+ elif self.config.problem_type == "single_label_classification":
847
+ loss_fct = CrossEntropyLoss()
848
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
849
+ elif self.config.problem_type == "multi_label_classification":
850
+ loss_fct = BCEWithLogitsLoss()
851
+ loss = loss_fct(logits, labels)
852
+
853
+ if not return_dict:
854
+ output = (logits,) + outputs[2:]
855
+ return ((loss,) + output) if loss is not None else output
856
+
857
+ return ImageClassifierOutput(
858
+ loss=loss,
859
+ logits=logits,
860
+ hidden_states=outputs.hidden_states,
861
+ attentions=outputs.attentions,
862
+ )
863
+
864
+
865
+ @add_start_docstrings(
866
+ """
867
+ Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
868
+ """,
869
+ DINOV2_START_DOCSTRING,
870
+ )
871
+ class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
872
+ def __init__(self, config):
873
+ super().__init__(config)
874
+ super()._init_backbone(config)
875
+
876
+ self.num_features = [
877
+ config.hidden_size for _ in range(config.num_hidden_layers + 1)
878
+ ]
879
+ self.embeddings = Dinov2Embeddings(config)
880
+ self.encoder = Dinov2Encoder(config)
881
+
882
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
883
+
884
+ # Initialize weights and apply final processing
885
+ self.post_init()
886
+
887
+ def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
888
+ return self.embeddings.patch_embeddings
889
+
890
+ @add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
891
+ @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
892
+ def forward(
893
+ self,
894
+ pixel_values: torch.Tensor,
895
+ output_hidden_states: Optional[bool] = None,
896
+ output_attentions: Optional[bool] = None,
897
+ return_dict: Optional[bool] = None,
898
+ ) -> BackboneOutput:
899
+ """
900
+ Returns:
901
+
902
+ Examples:
903
+
904
+ ```python
905
+ >>> from transformers import AutoImageProcessor, AutoBackbone
906
+ >>> import torch
907
+ >>> from PIL import Image
908
+ >>> import requests
909
+
910
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
911
+ >>> image = Image.open(requests.get(url, stream=True).raw)
912
+
913
+ >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
914
+ >>> model = AutoBackbone.from_pretrained(
915
+ ... "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
916
+ ... )
917
+
918
+ >>> inputs = processor(image, return_tensors="pt")
919
+
920
+ >>> outputs = model(**inputs)
921
+ >>> feature_maps = outputs.feature_maps
922
+ >>> list(feature_maps[-1].shape)
923
+ [1, 768, 16, 16]
924
+ ```"""
925
+ return_dict = (
926
+ return_dict if return_dict is not None else self.config.use_return_dict
927
+ )
928
+ output_hidden_states = (
929
+ output_hidden_states
930
+ if output_hidden_states is not None
931
+ else self.config.output_hidden_states
932
+ )
933
+ output_attentions = (
934
+ output_attentions
935
+ if output_attentions is not None
936
+ else self.config.output_attentions
937
+ )
938
+
939
+ embedding_output = self.embeddings(pixel_values)
940
+
941
+ outputs = self.encoder(
942
+ embedding_output,
943
+ output_hidden_states=True,
944
+ output_attentions=output_attentions,
945
+ return_dict=return_dict,
946
+ )
947
+
948
+ hidden_states = outputs.hidden_states if return_dict else outputs[1]
949
+
950
+ feature_maps = ()
951
+ for stage, hidden_state in zip(self.stage_names, hidden_states):
952
+ if stage in self.out_features:
953
+ if self.config.apply_layernorm:
954
+ hidden_state = self.layernorm(hidden_state)
955
+ if self.config.reshape_hidden_states:
956
+ hidden_state = hidden_state[:, 1:]
957
+ # this was actually a bug in the original implementation that we copied here,
958
+ # cause normally the order is height, width
959
+ batch_size, _, height, width = pixel_values.shape
960
+ patch_size = self.config.patch_size
961
+ hidden_state = hidden_state.reshape(
962
+ batch_size, height // patch_size, width // patch_size, -1
963
+ )
964
+ hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
965
+ feature_maps += (hidden_state,)
966
+
967
+ if not return_dict:
968
+ if output_hidden_states:
969
+ output = (feature_maps,) + outputs[1:]
970
+ else:
971
+ output = (feature_maps,) + outputs[2:]
972
+ return output
973
+
974
+ return BackboneOutput(
975
+ feature_maps=feature_maps,
976
+ hidden_states=outputs.hidden_states if output_hidden_states else None,
977
+ attentions=outputs.attentions if output_attentions else None,
978
+ )
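
This file is a vendored copy of the Hugging Face DINOv2 implementation; the encoders in this package consume its `last_hidden_state`, which carries one [CLS] token followed by `(H / patch_size) * (W / patch_size)` patch tokens. A minimal sketch of that token layout, assuming a base-sized config and a 224x224 input (the config download is illustrative; the downstream encoders instantiate the model the same way):

```python
import torch
from step1x3d_geometry.models.conditional_encoders.dinov2.modeling_dinov2 import Dinov2Model

# Build the model from a base-sized config (random weights; enough to check shapes).
config = Dinov2Model.config_class.from_pretrained("facebook/dinov2-base")
model = Dinov2Model(config)
model.eval()

pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    out = model(pixel_values=pixel_values)

cls_token = out.last_hidden_state[:, 0]      # (1, 768), also exposed as pooler_output
patch_tokens = out.last_hidden_state[:, 1:]  # (1, 256, 768): (224 // 14) ** 2 patches
```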
step1x3d_geometry/models/conditional_encoders/dinov2_clip_encoder.py ADDED
@@ -0,0 +1,514 @@
1
+ import random
2
+ import torch
3
+ from torch import nn
4
+ import numpy as np
5
+ import re
6
+ from einops import rearrange
7
+ from dataclasses import dataclass
8
+ from torchvision import transforms
9
+
10
+ from diffusers.models.modeling_utils import ModelMixin
11
+ from transformers import CLIPTokenizer, CLIPImageProcessor
12
+ from transformers import AutoImageProcessor, AutoModel
13
+ from transformers import T5EncoderModel, T5Tokenizer, AutoTokenizer
14
+ from transformers.utils import ModelOutput
15
+ from typing import Iterable, Optional, Union, List
16
+
17
+ import step1x3d_geometry
18
+ from step1x3d_geometry.utils.typing import *
19
+ from .clip.modeling_clip import CLIPModel
20
+ from .clip.modeling_conditional_clip import ConditionalCLIPModel
21
+ from .base import BaseVisualEncoder, ImageType
22
+ from .dinov2.modeling_dinov2 import Dinov2Model
23
+ from .dinov2.modeling_conditional_dinov2 import ConditionalDinov2Model
24
+ from .dinov2_with_registers.modeling_dinov2_with_registers import (
25
+ Dinov2WithRegistersModel,
26
+ )
27
+
28
+ CLIP_IMAGE_SIZE = 224
29
+
30
+
31
+ @dataclass
32
+ class CLIPEmbedOutput(ModelOutput):
33
+ last_hidden_state: torch.FloatTensor = None
34
+ pooler_output: torch.FloatTensor = None
35
+ embeds: torch.FloatTensor = None
36
+
37
+
38
+ class DINOEmbedOutput(ModelOutput):
39
+ last_hidden_state: torch.FloatTensor = None
40
+ pooler_output: torch.FloatTensor = None
41
+
42
+
43
+ @step1x3d_geometry.register("dinov2-clip-encoder")
44
+ class Dinov2CLIPEncoder(BaseVisualEncoder, ModelMixin):
45
+
46
+ @dataclass
47
+ class Config(BaseVisualEncoder.Config):
48
+ pretrained_model_name_or_path: Optional[str] = (
49
+ None # the pretrained model name or path for condition model
50
+ )
51
+ pretrained_clip_name_or_path: Optional[str] = (
52
+ None # the pretrained model name or path for clip
53
+ )
54
+ pretrained_dino_name_or_path: Optional[str] = (
55
+ None # the pretrained model name or path for dino
56
+ )
57
+ pretrained_linear_proj: Optional[str] = None
58
+ freeze_modulation_clip: bool = False
59
+ freeze_modulation_dino: bool = False
60
+ enable_gradient_checkpointing: bool = False
61
+ image_size: int = CLIP_IMAGE_SIZE
62
+ fuse_type: str = "concat"
63
+
64
+ dino_type: Optional[str] = None
65
+ clip_type: Optional[str] = None
66
+ kwargs: Optional[dict] = None
67
+
68
+ cfg: Config
69
+
70
+ def configure(self) -> None:
71
+ super().configure()
72
+
73
+ # Load the CLIP model and processor
74
+ if not self.cfg.encode_camera:
75
+ if self.cfg.pretrained_clip_name_or_path is not None:
76
+ self.cfg.clip_type = f"openai/{self.cfg.pretrained_clip_name_or_path.split('openai--')[-1].split('/')[0]}"
77
+ self.clip_model: CLIPModel = CLIPModel.from_pretrained(
78
+ self.cfg.pretrained_clip_name_or_path
79
+ )
80
+ else:
81
+ print("Loading CLIP model from openai/clip-vit-large-patch14")
82
+ self.cfg.clip_type = "openai/clip-vit-large-patch14"
83
+ self.clip_model: CLIPModel = CLIPModel(
84
+ config=ConditionalCLIPModel.config_class.from_pretrained(
85
+ "openai/clip-vit-large-patch14",
86
+ )
87
+ )
88
+ if self.cfg.pretrained_dino_name_or_path is not None:
89
+ self.cfg.dino_type = f"facebook/{self.cfg.pretrained_dino_name_or_path.split('facebook--')[-1].split('/')[0]}"
90
+ self.dino_model: Dinov2Model = AutoModel.from_pretrained(
91
+ self.cfg.pretrained_dino_name_or_path
92
+ )
93
+ else:
94
+ if (
95
+ self.cfg.pretrained_model_name_or_path is None
96
+ ): # default to load Dinov2-base model
97
+ assert (
98
+ self.cfg.dino_type is not None
99
+ ), "The dino_type should be provided"
100
+ print(f"Loading Dinov2 model from {self.cfg.dino_type}")
101
+ if "reg" in self.cfg.dino_type:
102
+ self.dino_model: Dinov2WithRegistersModel = (
103
+ Dinov2WithRegistersModel(
104
+ config=Dinov2WithRegistersModel.config_class.from_pretrained(
105
+ self.cfg.dino_type,
106
+ )
107
+ )
108
+ )
109
+ else:
110
+ self.dino_model: Dinov2Model = Dinov2Model(
111
+ config=Dinov2Model.config_class.from_pretrained(
112
+ self.cfg.dino_type,
113
+ )
114
+ )
115
+ elif "dinov2base" in self.cfg.pretrained_model_name_or_path:
116
+ print("Loading Dinov2 model from facebook/dinov2-base")
117
+ self.cfg.dino_type = "facebook/dinov2-base"
118
+ self.dino_model: Dinov2Model = Dinov2Model(
119
+ config=Dinov2Model.config_class.from_pretrained(
120
+ "facebook/dinov2-base",
121
+ )
122
+ )
123
+ elif "dinov2regbase" in self.cfg.pretrained_model_name_or_path:
124
+ print(
125
+ "Loading Dinov2 model from facebook/dinov2-with-registers-base"
126
+ )
127
+ self.cfg.dino_type = "facebook/dinov2-with-registers-base"
128
+ self.dino_model: Dinov2WithRegistersModel = (
129
+ Dinov2WithRegistersModel(
130
+ config=Dinov2WithRegistersModel.config_class.from_pretrained(
131
+ "facebook/dinov2-with-registers-base",
132
+ )
133
+ )
134
+ )
135
+ elif "dinov2reglarge" in self.cfg.pretrained_model_name_or_path:
136
+ print(
137
+ "Loading Dinov2 model from facebook/dinov2-with-registers-large"
138
+ )
139
+ self.cfg.dino_type = "facebook/dinov2-with-registers-large"
140
+ self.dino_model: Dinov2WithRegistersModel = (
141
+ Dinov2WithRegistersModel(
142
+ config=Dinov2WithRegistersModel.config_class.from_pretrained(
143
+ "facebook/dinov2-with-registers-large",
144
+ )
145
+ )
146
+ )
147
+ else:
148
+ raise ValueError(
149
+ f"Unknown Dinov2 model: {self.cfg.pretrained_model_name_or_path}"
150
+ )
151
+ else:
152
+ # clip
153
+ conditional_clip_config = ConditionalCLIPModel.config_class.from_pretrained(
154
+ self.cfg.pretrained_clip_name_or_path,
155
+ )
156
+ conditional_clip_config.vision_config.modulation_dim = (
157
+ self.cfg.camera_embeds_dim
158
+ )
159
+ self.clip_model: CLIPModel = ConditionalCLIPModel.from_pretrained(
160
+ self.cfg.pretrained_clip_name_or_path,
161
+ vision_config=conditional_clip_config.vision_config,
162
+ )
163
+
164
+ # dino
165
+ conditional_vit_config = (
166
+ ConditionalDinov2Model.config_class.from_pretrained(
167
+ self.cfg.pretrained_dino_name_or_path,
168
+ )
169
+ )
170
+ conditional_vit_config.modulation_dim = self.cfg.camera_embeds_dim
171
+ self.dino_model: ConditionalDinov2Model = (
172
+ ConditionalDinov2Model.from_pretrained(
173
+ self.cfg.pretrained_dino_name_or_path, config=conditional_vit_config
174
+ )
175
+ )
176
+
177
+ self.image_preprocess_clip = CLIPImageProcessor()
178
+ self.image_preprocess_dino = AutoImageProcessor.from_pretrained(
179
+ self.cfg.dino_type
180
+ if self.cfg.pretrained_dino_name_or_path is None
181
+ else self.cfg.pretrained_dino_name_or_path
182
+ )
183
+ self.transform_clip = transforms.Compose(
184
+ [
185
+ transforms.Resize(
186
+ CLIP_IMAGE_SIZE,
187
+ transforms.InterpolationMode.BICUBIC,
188
+ antialias=True,
189
+ ), # clip is CLIP_IMAGE_SIZE
190
+ transforms.CenterCrop(CLIP_IMAGE_SIZE), # crop a square.
191
+ transforms.Normalize(
192
+ mean=[0.48145466, 0.4578275, 0.40821073],
193
+ std=[0.26862954, 0.26130258, 0.27577711],
194
+ ),
195
+ ]
196
+ )
197
+ self.transform_dino = transforms.Compose(
198
+ [
199
+ transforms.Resize(
200
+ self.cfg.image_size,
201
+ transforms.InterpolationMode.BICUBIC,
202
+ antialias=True,
203
+ ),
204
+ transforms.CenterCrop(self.cfg.image_size), # crop a square
205
+ transforms.Normalize(
206
+ mean=[0.485, 0.456, 0.406],
207
+ std=[0.229, 0.224, 0.225],
208
+ ),
209
+ ]
210
+ )
211
+
212
+ if self.cfg.enable_gradient_checkpointing:
213
+ self.dino_model.encoder.gradient_checkpointing = True
214
+
215
+ if self.cfg.zero_uncond_embeds:
216
+ image_size = self.cfg.image_size
217
+ self.empty_image_embeds_dino = torch.zeros(
218
+ (self.cfg.n_views, (image_size // 14) ** 2 + 1, 1024)
219
+ ).detach()
220
+ self.empty_image_embeds_clip = torch.zeros(
221
+ (self.cfg.n_views, (CLIP_IMAGE_SIZE // 14) ** 2 + 1, 1024)
222
+ ).detach()
223
+ if self.cfg.fuse_type == "concat":
224
+ self.empty_image_embeds = torch.cat(
225
+ [self.empty_image_embeds_dino, self.empty_image_embeds_clip], dim=1
226
+ )
227
+ else:
228
+ raise ValueError(f"Unsupported fuse_type: {self.cfg.fuse_type}")
229
+ else:
230
+ if self.cfg.encode_camera:
231
+ self.empty_image_embeds_dino = self.encode_image_dino(
232
+ torch.zeros(
233
+ self.cfg.n_views, self.cfg.image_size, self.cfg.image_size, 3
234
+ ),
235
+ self.cameras[: self.cfg.n_views],
236
+ ).detach()
237
+ self.empty_image_embeds_clip = self.encode_image_clip(
238
+ torch.zeros(
239
+ self.cfg.n_views, self.cfg.image_size, self.cfg.image_size, 3
240
+ ),
241
+ self.cameras[: self.cfg.n_views],
242
+ ).detach()
243
+ else:
244
+ self.empty_image_embeds_dino = self.encode_image_dino(
245
+ torch.zeros(
246
+ self.cfg.n_views, self.cfg.image_size, self.cfg.image_size, 3
247
+ )
248
+ ).detach()
249
+ self.empty_image_embeds_clip = self.encode_image_clip(
250
+ torch.zeros(
251
+ self.cfg.n_views, self.cfg.image_size, self.cfg.image_size, 3
252
+ )
253
+ ).detach()
254
+ self.empty_image_embeds_clip, self.empty_image_embeds_dino = (
255
+ self.align_clip_dino(
256
+ self.empty_image_embeds_clip, self.empty_image_embeds_dino
257
+ )
258
+ )
259
+ self.empty_image_embeds = torch.cat(
260
+ [self.empty_image_embeds_dino, self.empty_image_embeds_clip], dim=1
261
+ )
262
+
263
+ # Freeze the clip model parameters
264
+ self.clip_model.eval()
265
+ for k, p in self.clip_model.named_parameters():
266
+ ks = k.split(".")
267
+ if (
268
+ ("mod_norm1" in ks
269
+ or "mod_norm2" in ks)
270
+ and not self.cfg.freeze_modulation_clip
271
+ ):
272
+ p.requires_grad_(not self.cfg.freeze_modulation_clip)
273
+ else:
274
+ p.requires_grad_(False)
275
+
276
+ # freeze the dino model parameters
277
+ self.dino_model.eval()
278
+ for k, p in self.dino_model.named_parameters():
279
+ ks = k.split(".")
280
+ if (
281
+ ("mod_norm1" in ks
282
+ or "mod_norm2" in ks)
283
+ and not self.cfg.freeze_modulation_dino
284
+ ):
285
+ p.requires_grad_(not self.cfg.freeze_modulation_dino)
286
+ else:
287
+ p.requires_grad_(False)
288
+
289
+ # add a linear projection layer to project the dino embeddings to the same dimension as clip embeddings
290
+ if (
291
+ self.clip_model.config.vision_config.hidden_size
292
+ != self.dino_model.config.hidden_size
293
+ ):
294
+ self.linear_proj = nn.Linear(
295
+ self.clip_model.config.vision_config.hidden_size,
296
+ self.dino_model.config.hidden_size,
297
+ bias=False,
298
+ )
299
+ else:
300
+ self.linear_proj = nn.Identity()
301
+
302
+ if self.cfg.pretrained_model_name_or_path is not None:
303
+ print(f"Loading ckpt from {self.cfg.pretrained_model_name_or_path}")
304
+ ckpt = torch.load(
305
+ self.cfg.pretrained_model_name_or_path, map_location="cpu"
306
+ )["state_dict"]
307
+ pretrained_model_ckpt = {}
308
+ for k, v in ckpt.items():
309
+ if k.startswith("condition."):
310
+ pretrained_model_ckpt[k.replace("condition.", "")] = v
311
+ self.load_state_dict(pretrained_model_ckpt, strict=True)
312
+
313
+ def encode_image_clip(
314
+ self,
315
+ images: Iterable[Optional[ImageType]],
316
+ cameras: Optional[torch.Tensor] = None,
317
+ force_none_camera_embeds: bool = False,
318
+ return_dict: bool = False,
319
+ **kwargs,
320
+ ) -> torch.FloatTensor:
321
+ camera_embeds = None
322
+ if isinstance(images, (np.ndarray, torch.Tensor)): # for training process
323
+ assert (
324
+ images.min() >= 0.0 and images.max() <= 1.0
325
+ ), "The pixel values should be in the range of [0, 1]"
326
+ if self.cfg.encode_camera:
327
+ assert cameras is not None, "The cameras should be provided"
328
+ camera_embeds = self.encode_camera(cameras)
329
+ pixel_values = self.transform_clip(images.permute(0, 3, 1, 2))
330
+ else: # for inference process
331
+ if self.cfg.encode_camera:
332
+ if cameras is None:
333
+ bs = len(images) // self.cfg.n_views
334
+ cameras = (
335
+ self.cameras[: self.cfg.n_views]
336
+ .repeat(bs, 1, 1)
337
+ .to(self.clip_model.device)
338
+ )
339
+ camera_embeds = self.encode_camera(cameras)
340
+ pixel_values = self.image_preprocess_clip.preprocess(
341
+ images,
342
+ return_tensors="pt",
343
+ do_rescale=True,
344
+ do_resize=True,
345
+ size=CLIP_IMAGE_SIZE,
346
+ crop_size=CLIP_IMAGE_SIZE,
347
+ ).pixel_values
348
+
349
+ if force_none_camera_embeds:
350
+ camera_embeds = None
351
+
352
+ if pixel_values.ndim == 4:
353
+ pixel_values = pixel_values.unsqueeze(1)
354
+ if camera_embeds is not None:
355
+ camera_embeds = camera_embeds.unsqueeze(1)
356
+
357
+ if self.cfg.encode_camera and camera_embeds is not None:
358
+ vision_outputs = self.clip_model.vision_model(
359
+ pixel_values=rearrange(
360
+ pixel_values.to(self.clip_model.device), "B N C H W -> (B N) C H W"
361
+ ),
362
+ condition=rearrange(camera_embeds, "B N C -> (B N) C"),
363
+ )
364
+
365
+ else:
366
+ vision_outputs = self.clip_model.vision_model(
367
+ pixel_values=rearrange(
368
+ pixel_values.to(self.clip_model.device), "B N C H W -> (B N) C H W"
369
+ ),
370
+ )
371
+
372
+ if return_dict:
373
+ # clip
374
+ pooler_output = vision_outputs[1] # pooled_output
375
+ image_features = self.clip_model.visual_projection(pooler_output)
376
+ clip_embeds = vision_outputs.last_hidden_state
377
+
378
+ clip_embeds_dict = CLIPEmbedOutput(
379
+ last_hidden_state=clip_embeds,
380
+ pooler_output=pooler_output,
381
+ embeds=image_features,
382
+ )
383
+
384
+ return clip_embeds_dict
385
+ else:
386
+ return vision_outputs.last_hidden_state
387
+
388
+ def encode_image_dino(
389
+ self,
390
+ images: Iterable[Optional[ImageType]],
391
+ cameras: Optional[torch.Tensor] = None,
392
+ force_none_camera_embeds: bool = False,
393
+ return_dict: bool = False,
394
+ **kwargs,
395
+ ) -> torch.FloatTensor:
396
+ camera_embeds = None
397
+ if isinstance(images, (np.ndarray, torch.Tensor)): # for training process
398
+ assert (
399
+ images.min() >= 0.0 and images.max() <= 1.0
400
+ ), "The pixel values should be in the range of [0, 1]"
401
+ if self.cfg.encode_camera:
402
+ assert cameras is not None, "The cameras should be provided"
403
+ camera_embeds = self.encode_camera(cameras)
404
+ pixel_values = self.transform_dino(images.permute(0, 3, 1, 2))
405
+ else: # for inference process
406
+ if self.cfg.encode_camera:
407
+ if cameras is None:
408
+ bs = len(images) // self.cfg.n_views
409
+ cameras = (
410
+ self.cameras[: self.cfg.n_views]
411
+ .repeat(bs, 1, 1)
412
+ .to(self.dino_model.device)
413
+ )
414
+ camera_embeds = self.encode_camera(cameras)
415
+ pixel_values = self.image_preprocess_dino.preprocess(
416
+ images,
417
+ return_tensors="pt",
418
+ do_rescale=True,
419
+ do_resize=True,
420
+ size=self.cfg.image_size,
421
+ crop_size=self.cfg.image_size,
422
+ ).pixel_values
423
+
424
+ if force_none_camera_embeds:
425
+ camera_embeds = None
426
+
427
+ if pixel_values.ndim == 4:
428
+ pixel_values = pixel_values.unsqueeze(1)
429
+ if camera_embeds is not None:
430
+ camera_embeds = camera_embeds.unsqueeze(1)
431
+
432
+ if self.cfg.encode_camera and camera_embeds is not None:
433
+ vision_outputs = self.dino_model(
434
+ rearrange(
435
+ pixel_values.to(self.dino_model.device), "B N C H W -> (B N) C H W"
436
+ ),
437
+ condition=rearrange(camera_embeds, "B N C -> (B N) C"),
438
+ )
439
+ else:
440
+ vision_outputs = self.dino_model(
441
+ rearrange(
442
+ pixel_values.to(self.dino_model.device), "B N C H W -> (B N) C H W"
443
+ ),
444
+ )
445
+
446
+ if return_dict:
447
+ # dino
448
+ dino_embeds_dict = DINOEmbedOutput(
449
+ last_hidden_state=vision_outputs.last_hidden_state,
450
+ pooler_output=vision_outputs.pooler_output,
451
+ )
452
+ return dino_embeds_dict
453
+ else:
454
+ return vision_outputs.last_hidden_state
455
+
456
+ def align_clip_dino(self, clip_embeds, dino_embeds):
457
+ if (
458
+ clip_embeds.shape[-2] != dino_embeds.shape[-2]
459
+ ): # different shape, interpolate the clip embeddings to the same shape as dino embeddings
460
+ assert (
461
+ clip_embeds.shape[-2] == (self.cfg.image_size // 14) ** 2 + 1
462
+ ), "The clip embeddings should have the shape of (n_views, (image_size // 14) ** 2 + 1, 1024)"
463
+ clip_embeds_patch_tokens = clip_embeds[:, 1:].view(
464
+ clip_embeds.shape[0],
465
+ self.cfg.image_size // 14,
466
+ self.cfg.image_size // 14,
467
+ 1024,
468
+ )
469
+ clip_embeds_patch_tokens = (
470
+ torch.nn.functional.interpolate(
471
+ clip_embeds_patch_tokens.permute(0, 3, 1, 2),
472
+ size=(self.cfg.image_size // 14, self.cfg.image_size // 14),
473
+ mode="bilinear",
474
+ align_corners=False,
475
+ )
476
+ .permute(0, 2, 3, 1)
477
+ .view(clip_embeds.shape[0], -1, 1024)
478
+ )
479
+ clip_embeds = torch.cat(
480
+ [clip_embeds[:, :1], clip_embeds_patch_tokens], dim=1
481
+ )
482
+ return clip_embeds, dino_embeds
483
+
484
+ def encode_image(
485
+ self,
486
+ images: Iterable[Optional[ImageType]],
487
+ cameras: Optional[torch.Tensor] = None,
488
+ force_none_camera_embeds: bool = False,
489
+ return_dict: bool = False,
490
+ **kwargs,
491
+ ) -> torch.FloatTensor:
492
+ clip_embeds = self.encode_image_clip(images, cameras)
493
+ dino_embeds = self.encode_image_dino(images, cameras)
494
+ if (
495
+ self.dino_model.__class__.__name__ == "Dinov2WithRegistersModel"
496
+ ): # x_norm_clstoken, x_norm_regtokens, x_norm_patchtokens
497
+ dino_embeds = torch.cat(
498
+ [
499
+ dino_embeds[:, :1],
500
+ dino_embeds[:, self.dino_model.config.num_register_tokens + 1 :],
501
+ ],
502
+ dim=1,
503
+ )
504
+
505
+ clip_embeds = self.linear_proj(clip_embeds) # bs, 257, 1024
506
+
507
+ if self.cfg.fuse_type == "concat":
508
+ visual_embeds = torch.cat([dino_embeds, clip_embeds], dim=1)
509
+ # elif self.cfg.fuse_type == 'add':
510
+ # clip_embeds, dino_embeds = self.align_clip_dino(clip_embeds, dino_embeds)
511
+ else:
512
+ raise ValueError(f"Unsupported fuse_type: {self.cfg.fuse_type}")
513
+
514
+ return visual_embeds
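As a rough illustration of the `concat` fusion in `encode_image` above, here is a minimal, self-contained sketch of the shape bookkeeping. The hidden widths (1280 for raw CLIP tokens, 1024 after projection and for DINOv2) and the 224/14 patching are assumptions for illustration only and depend on the model variants actually configured.

```python
import torch
from torch import nn

# 224x224 input with 14x14 patches -> 256 patch tokens + 1 CLS token
n_tokens = (224 // 14) ** 2 + 1                # 257

dino_embeds = torch.randn(1, n_tokens, 1024)   # DINOv2 last_hidden_state (placeholder width)
clip_embeds = torch.randn(1, n_tokens, 1280)   # CLIP vision last_hidden_state (placeholder width)

linear_proj = nn.Linear(1280, 1024)            # stands in for self.linear_proj
clip_embeds = linear_proj(clip_embeds)         # -> (1, 257, 1024), cf. "# bs, 257, 1024" above

# fuse_type == "concat": token sequences are concatenated along the sequence axis
visual_embeds = torch.cat([dino_embeds, clip_embeds], dim=1)
print(visual_embeds.shape)                     # torch.Size([1, 514, 1024])
```

Concatenation simply doubles the conditioning sequence length; no spatial alignment between the two token sets is needed for this fuse type (the `align_clip_dino` path above is only relevant to the commented-out `add` fusion).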
step1x3d_geometry/models/conditional_encoders/dinov2_encoder.py ADDED
@@ -0,0 +1,296 @@
1
+ import random
2
+ import torch
3
+ from torch import nn
4
+ import numpy as np
5
+ import re
6
+ from einops import rearrange
7
+ from dataclasses import dataclass
8
+ from torchvision import transforms
9
+
10
+ from diffusers.models.modeling_utils import ModelMixin
11
+ from transformers import AutoImageProcessor, AutoModel
12
+ from transformers.utils import ModelOutput
13
+ from typing import Iterable, Optional, Union, List
14
+
15
+ import step1x3d_geometry
16
+ from step1x3d_geometry.utils.typing import *
17
+ from .base import BaseVisualEncoder, ImageType
18
+ from .dinov2.modeling_dinov2 import Dinov2Model
19
+ from .dinov2.modeling_conditional_dinov2 import ConditionalDinov2Model
20
+ from .dinov2_with_registers.modeling_dinov2_with_registers import (
21
+ Dinov2WithRegistersModel,
22
+ )
23
+
24
+
25
+ class DINOEmbedOutput(ModelOutput):
26
+ last_hidden_state: torch.FloatTensor = None
27
+ pooler_output: torch.FloatTensor = None
28
+
29
+
30
+ @step1x3d_geometry.register("dinov2-encoder")
31
+ class Dinov2Encoder(BaseVisualEncoder, ModelMixin):
32
+
33
+ @dataclass
34
+ class Config(BaseVisualEncoder.Config):
35
+ pretrained_model_name_or_path: Optional[str] = (
36
+ None # the pretrained model name or path for condition model
37
+ )
38
+ pretrained_dino_name_or_path: Optional[str] = (
39
+ None # the pretrained model name or path for dino
40
+ )
41
+ freeze_modulation_dino: bool = False
42
+ enable_gradient_checkpointing: bool = False
43
+ image_size: int = 224
44
+ dino_type: Optional[str] = None
45
+ kwargs: Optional[dict] = None
46
+
47
+ cfg: Config
48
+
49
+ def configure(self) -> None:
50
+ super().configure()
51
+
52
+ # Load the DINOV2 model and processor
53
+ if not self.cfg.encode_camera:
54
+ if self.cfg.pretrained_dino_name_or_path is not None:
55
+ self.cfg.dino_type = f"facebook/{self.cfg.pretrained_dino_name_or_path.split('facebook--')[-1].split('/')[0]}"
56
+ if self.cfg.kwargs is not None:
57
+ self.dino_model: Dinov2Model = AutoModel.from_pretrained(
58
+ self.cfg.pretrained_dino_name_or_path, **self.cfg.kwargs
59
+ )
60
+ else:
61
+ self.dino_model: Dinov2Model = AutoModel.from_pretrained(
62
+ self.cfg.pretrained_dino_name_or_path
63
+ )
64
+ else:
65
+ if (
66
+ self.cfg.pretrained_model_name_or_path is None
67
+ ): # default to load Dinov2-base model
68
+ assert (
69
+ self.cfg.dino_type is not None
70
+ ), "The dino_type should be provided"
71
+ print(f"Loading Dinov2 model from {self.cfg.dino_type}")
72
+ if "reg" in self.cfg.dino_type:
73
+ self.dino_model: Dinov2WithRegistersModel = (
74
+ Dinov2WithRegistersModel(
75
+ config=Dinov2WithRegistersModel.config_class.from_pretrained(
76
+ self.cfg.dino_type,
77
+ )
78
+ )
79
+ )
80
+ else:
81
+ self.dino_model: Dinov2Model = Dinov2Model(
82
+ config=Dinov2Model.config_class.from_pretrained(
83
+ self.dino_type,
84
+ )
85
+ )
86
+ elif "dinov2base" in self.cfg.pretrained_model_name_or_path:
87
+ print("Loading Dinov2 model from facebook/dinov2-base")
88
+ self.cfg.dino_type = "facebook/dinov2-base"
89
+ self.dino_model: Dinov2Model = Dinov2Model(
90
+ config=Dinov2Model.config_class.from_pretrained(
91
+ "facebook/dinov2-base",
92
+ )
93
+ )
94
+ elif "dinov2regbase" in self.cfg.pretrained_model_name_or_path:
95
+ print(
96
+ "Loading Dinov2 model from facebook/dinov2-with-registers-base"
97
+ )
98
+ self.cfg.dino_type = "facebook/dinov2-with-registers-base"
99
+ self.dino_model: Dinov2WithRegistersModel = (
100
+ Dinov2WithRegistersModel(
101
+ config=Dinov2WithRegistersModel.config_class.from_pretrained(
102
+ "facebook/dinov2-with-registers-base",
103
+ )
104
+ )
105
+ )
106
+ elif "dinov2reglarge" in self.cfg.pretrained_model_name_or_path:
107
+ print(
108
+ "Loading Dinov2 model from facebook/dinov2-with-registers-large"
109
+ )
110
+ self.cfg.dino_type = "facebook/dinov2-with-registers-large"
111
+ self.dino_model: Dinov2WithRegistersModel = (
112
+ Dinov2WithRegistersModel(
113
+ config=Dinov2WithRegistersModel.config_class.from_pretrained(
114
+ "facebook/dinov2-with-registers-large",
115
+ )
116
+ )
117
+ )
118
+ else:
119
+ raise ValueError(
120
+ f"Unknown Dinov2 model: {self.cfg.pretrained_model_name_or_path}"
121
+ )
122
+ else:
123
+ # dino
124
+ conditional_vit_config = (
125
+ ConditionalDinov2Model.config_class.from_pretrained(
126
+ self.cfg.pretrained_dino_name_or_path,
127
+ )
128
+ )
129
+ conditional_vit_config.modulation_dim = self.cfg.camera_embeds_dim
130
+ self.dino_model: ConditionalDinov2Model = (
131
+ ConditionalDinov2Model.from_pretrained(
132
+ self.cfg.pretrained_dino_name_or_path, config=conditional_vit_config
133
+ )
134
+ )
135
+
136
+ self.image_preprocess_dino = AutoImageProcessor.from_pretrained(
137
+ self.cfg.dino_type
138
+ if self.cfg.pretrained_dino_name_or_path is None
139
+ else self.cfg.pretrained_dino_name_or_path
140
+ )
141
+ self.transform_dino = transforms.Compose(
142
+ [
143
+ transforms.Resize(
144
+ self.cfg.image_size,
145
+ transforms.InterpolationMode.BICUBIC,
146
+ antialias=True,
147
+ ),
148
+ transforms.CenterCrop(
149
+ self.cfg.image_size
150
+ ), # crop a (image_size, image_size) square
151
+ transforms.Normalize(
152
+ mean=[0.485, 0.456, 0.406],
153
+ std=[0.229, 0.224, 0.225],
154
+ ),
155
+ ]
156
+ )
157
+
158
+ if self.cfg.enable_gradient_checkpointing:
159
+ self.dino_model.encoder.gradient_checkpointing = True
160
+
161
+ if self.cfg.zero_uncond_embeds:
162
+ self.empty_image_embeds = torch.zeros(
163
+ (
164
+ self.cfg.n_views,
165
+ (self.cfg.image_size // 14) ** 2 + 1,
166
+ self.dino_model.config.hidden_size,
167
+ )
168
+ ).detach()
169
+ else:
170
+ if self.cfg.encode_camera:
171
+ self.empty_image_embeds = self.encode_image_dino(
172
+ torch.zeros(
173
+ self.cfg.n_views, self.cfg.image_size, self.cfg.image_size, 3
174
+ ),
175
+ self.cameras[: self.cfg.n_views],
176
+ ).detach()
177
+ else:
178
+ self.empty_image_embeds = self.encode_image_dino(
179
+ torch.zeros(
180
+ self.cfg.n_views, self.cfg.image_size, self.cfg.image_size, 3
181
+ )
182
+ ).detach()
183
+
184
+ # freeze the dino model parameters
185
+ self.dino_model.eval()
186
+ for k, p in self.dino_model.named_parameters():
187
+ ks = k.split(".")
188
+ if (
189
+ "mod_norm1" in ks
190
+ or "mod_norm2" in ks
191
+ and not self.cfg.freeze_modulation_dino
192
+ ):
193
+ p.requires_grad_(not self.cfg.freeze_modulation_dino)
194
+ else:
195
+ p.requires_grad_(False)
196
+
197
+ # load pretrained_model_name_or_path
198
+ if self.cfg.pretrained_model_name_or_path is not None:
199
+ print(f"Loading ckpt from {self.cfg.pretrained_model_name_or_path}")
200
+ ckpt = torch.load(
201
+ self.cfg.pretrained_model_name_or_path, map_location="cpu"
202
+ )["state_dict"]
203
+ pretrained_model_ckpt = {}
204
+ for k, v in ckpt.items():
205
+ if k.startswith("visual_condition."):
206
+ pretrained_model_ckpt[k.replace("visual_condition.", "")] = v
207
+ self.load_state_dict(pretrained_model_ckpt, strict=True)
208
+
209
+ def encode_image_dino(
210
+ self,
211
+ images: Iterable[Optional[ImageType]],
212
+ cameras: Optional[torch.Tensor] = None,
213
+ force_none_camera_embeds: bool = False,
214
+ return_dict: bool = False,
215
+ **kwargs,
216
+ ) -> torch.FloatTensor:
217
+ camera_embeds = None
218
+ if isinstance(images, (np.ndarray, torch.Tensor)): # for training process
219
+ assert (
220
+ images.min() >= 0.0 and images.max() <= 1.0
221
+ ), "The pixel values should be in the range of [0, 1]"
222
+ if self.cfg.encode_camera:
223
+ assert cameras is not None, "The cameras should be provided"
224
+ camera_embeds = self.encode_camera(cameras)
225
+ pixel_values = self.transform_dino(images.permute(0, 3, 1, 2))
226
+ else: # for inference process
227
+ if self.cfg.encode_camera:
228
+ if cameras is None:
229
+ bs = len(images) // self.cfg.n_views
230
+ cameras = (
231
+ self.cameras[: self.cfg.n_views]
232
+ .repeat(bs, 1, 1)
233
+ .to(self.dino_model.device)
234
+ )
235
+ camera_embeds = self.encode_camera(cameras)
236
+ pixel_values = self.image_preprocess_dino.preprocess(
237
+ images,
238
+ return_tensors="pt",
239
+ do_rescale=True,
240
+ do_resize=True,
241
+ size=self.cfg.image_size,
242
+ crop_size=self.cfg.image_size,
243
+ ).pixel_values
244
+
245
+ if force_none_camera_embeds:
246
+ camera_embeds = None
247
+
248
+ if pixel_values.ndim == 4:
249
+ pixel_values = pixel_values.unsqueeze(1)
250
+ if camera_embeds is not None:
251
+ camera_embeds = camera_embeds.unsqueeze(1)
252
+
253
+ if self.cfg.encode_camera and camera_embeds is not None:
254
+ vision_outputs = self.dino_model(
255
+ rearrange(
256
+ pixel_values.to(self.dino_model.device), "B N C H W -> (B N) C H W"
257
+ ),
258
+ condition=rearrange(camera_embeds, "B N C -> (B N) C"),
259
+ )
260
+ else:
261
+ vision_outputs = self.dino_model(
262
+ rearrange(
263
+ pixel_values.to(self.dino_model.device), "B N C H W -> (B N) C H W"
264
+ ),
265
+ )
266
+
267
+ if return_dict:
268
+ # dino
269
+ dino_embeds_dict = DINOEmbedOutput(
270
+ last_hidden_state=vision_outputs.last_hidden_state,
271
+ pooler_output=vision_outputs.pooler_output,
272
+ )
273
+ return dino_embeds_dict
274
+ else:
275
+ return vision_outputs.last_hidden_state
276
+
277
+ def encode_image(
278
+ self,
279
+ images: Iterable[Optional[ImageType]],
280
+ cameras: Optional[torch.Tensor] = None,
281
+ force_none_camera_embeds: bool = False,
282
+ return_dict: bool = False,
283
+ **kwargs,
284
+ ) -> torch.FloatTensor:
285
+ dino_embeds = self.encode_image_dino(images, cameras)
286
+ if (
287
+ self.dino_model.__class__.__name__ == "Dinov2WithRegistersModel"
288
+ ): # x_norm_clstoken, x_norm_regtokens, x_norm_patchtokens
289
+ dino_embeds = torch.cat(
290
+ [
291
+ dino_embeds[:, :1],
292
+ dino_embeds[:, self.dino_model.config.num_register_tokens + 1 :],
293
+ ],
294
+ dim=1,
295
+ )
296
+ return dino_embeds
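The register-token stripping in `encode_image` above relies on the token layout `[CLS, register_1..register_R, patch_1..patch_P]` produced by the with-registers embeddings. A minimal sketch with placeholder sizes:

```python
import torch

num_register_tokens = 4                        # read from the model config in practice
batch, patches, hidden = 2, 256, 768           # placeholder sizes

# layout: [CLS, registers, patches]
tokens = torch.randn(batch, 1 + num_register_tokens + patches, hidden)

kept = torch.cat(
    [
        tokens[:, :1],                         # keep the CLS token
        tokens[:, num_register_tokens + 1:],   # keep the patch tokens, drop the registers
    ],
    dim=1,
)
print(kept.shape)                              # torch.Size([2, 257, 768])
```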
step1x3d_geometry/models/conditional_encoders/dinov2_with_registers/modeling_dinov2_with_registers.py ADDED
@@ -0,0 +1,1088 @@
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_dinov2_with_registers.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # coding=utf-8
8
+ # Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved.
9
+ #
10
+ #
11
+ # Licensed under the Apache License, Version 2.0 (the "License");
12
+ # you may not use this file except in compliance with the License.
13
+ # You may obtain a copy of the License at
14
+ #
15
+ # http://www.apache.org/licenses/LICENSE-2.0
16
+ #
17
+ # Unless required by applicable law or agreed to in writing, software
18
+ # distributed under the License is distributed on an "AS IS" BASIS,
19
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ # See the License for the specific language governing permissions and
21
+ # limitations under the License.
22
+
23
+ import collections.abc
24
+ import math
25
+ from typing import Dict, List, Optional, Set, Tuple, Union
26
+
27
+ import torch
28
+ from torch import nn
29
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
30
+
31
+ from transformers.activations import ACT2FN
32
+ from transformers.modeling_outputs import (
33
+ BackboneOutput,
34
+ BaseModelOutput,
35
+ BaseModelOutputWithPooling,
36
+ ImageClassifierOutput,
37
+ )
38
+ from transformers.modeling_utils import PreTrainedModel
39
+ from transformers.pytorch_utils import (
40
+ find_pruneable_heads_and_indices,
41
+ prune_linear_layer,
42
+ )
43
+ from transformers.utils import (
44
+ add_code_sample_docstrings,
45
+ add_start_docstrings,
46
+ add_start_docstrings_to_model_forward,
47
+ logging,
48
+ replace_return_docstrings,
49
+ torch_int,
50
+ )
51
+ from transformers.utils.backbone_utils import BackboneMixin
52
+ from transformers.models.dinov2_with_registers.configuration_dinov2_with_registers import (
53
+ Dinov2WithRegistersConfig,
54
+ )
55
+
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+ # Base docstring
60
+ _CHECKPOINT_FOR_DOC = "facebook/dinov2_with_registers-base"
61
+
62
+ # General docstring
63
+ _CONFIG_FOR_DOC = "Dinov2WithRegistersConfig"
64
+
65
+
66
+ class Dinov2WithRegistersPatchEmbeddings(nn.Module):
67
+ """
68
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
69
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
70
+ Transformer.
71
+ """
72
+
73
+ def __init__(self, config):
74
+ super().__init__()
75
+ image_size, patch_size = config.image_size, config.patch_size
76
+ num_channels, hidden_size = config.num_channels, config.hidden_size
77
+
78
+ image_size = (
79
+ image_size
80
+ if isinstance(image_size, collections.abc.Iterable)
81
+ else (image_size, image_size)
82
+ )
83
+ patch_size = (
84
+ patch_size
85
+ if isinstance(patch_size, collections.abc.Iterable)
86
+ else (patch_size, patch_size)
87
+ )
88
+ num_patches = (image_size[1] // patch_size[1]) * (
89
+ image_size[0] // patch_size[0]
90
+ )
91
+ self.image_size = image_size
92
+ self.patch_size = patch_size
93
+ self.num_channels = num_channels
94
+ self.num_patches = num_patches
95
+
96
+ self.projection = nn.Conv2d(
97
+ num_channels, hidden_size, kernel_size=patch_size, stride=patch_size
98
+ )
99
+
100
+ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
101
+ num_channels = pixel_values.shape[1]
102
+ if num_channels != self.num_channels:
103
+ raise ValueError(
104
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
105
+ f" Expected {self.num_channels} but got {num_channels}."
106
+ )
107
+ embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
108
+ return embeddings
109
+
110
+
111
+ class Dinov2WithRegistersEmbeddings(nn.Module):
112
+ """
113
+ Construct the CLS token, mask token, register tokens, position and patch embeddings.
114
+ """
115
+
116
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
117
+ super().__init__()
118
+
119
+ self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
120
+ self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
121
+ self.register_tokens = nn.Parameter(
122
+ torch.zeros(1, config.num_register_tokens, config.hidden_size)
123
+ )
124
+ self.patch_embeddings = Dinov2WithRegistersPatchEmbeddings(config)
125
+ num_patches = self.patch_embeddings.num_patches
126
+ self.position_embeddings = nn.Parameter(
127
+ torch.randn(1, num_patches + 1, config.hidden_size)
128
+ )
129
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
130
+ self.patch_size = config.patch_size
131
+ self.config = config
132
+
133
+ def interpolate_pos_encoding(
134
+ self, embeddings: torch.Tensor, height: int, width: int
135
+ ) -> torch.Tensor:
136
+ """
137
+ This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
138
+ resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility
139
+ with the original implementation.
140
+
141
+ Adapted from:
142
+ - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
143
+ - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py
144
+ """
145
+ num_patches = embeddings.shape[1] - 1
146
+ num_positions = self.position_embeddings.shape[1] - 1
147
+
148
+ # Skip interpolation for matching dimensions (unless tracing)
149
+ if (
150
+ not torch.jit.is_tracing()
151
+ and num_patches == num_positions
152
+ and height == width
153
+ ):
154
+ return self.position_embeddings
155
+
156
+ # Handle class token and patch embeddings separately
157
+ class_pos_embed = self.position_embeddings[:, 0]
158
+ patch_pos_embed = self.position_embeddings[:, 1:]
159
+ dim = embeddings.shape[-1]
160
+
161
+ # Calculate new dimensions
162
+ height = height // self.config.patch_size
163
+ width = width // self.config.patch_size
164
+
165
+ # Reshape for interpolation
166
+ sqrt_num_positions = torch_int(num_positions**0.5)
167
+ patch_pos_embed = patch_pos_embed.reshape(
168
+ 1, sqrt_num_positions, sqrt_num_positions, dim
169
+ )
170
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
171
+
172
+ # Store original dtype for restoration after interpolation
173
+ target_dtype = patch_pos_embed.dtype
174
+
175
+ # Interpolate at float32 precision
176
+ patch_pos_embed = nn.functional.interpolate(
177
+ patch_pos_embed.to(dtype=torch.float32),
178
+ size=(
179
+ torch_int(height),
180
+ torch_int(width),
181
+ ), # Explicit size instead of scale_factor
182
+ mode="bicubic",
183
+ align_corners=False,
184
+ antialias=True,
185
+ ).to(dtype=target_dtype)
186
+
187
+ # Validate output dimensions if not tracing
188
+ if not torch.jit.is_tracing():
189
+ if (
190
+ int(height) != patch_pos_embed.shape[-2]
191
+ or int(width) != patch_pos_embed.shape[-1]
192
+ ):
193
+ raise ValueError(
194
+ "Width or height does not match with the interpolated position embeddings"
195
+ )
196
+
197
+ # Reshape back to original format
198
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
199
+
200
+ # Combine class and patch embeddings
201
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
202
+
203
+ def forward(
204
+ self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None
205
+ ) -> torch.Tensor:
206
+ batch_size, _, height, width = pixel_values.shape
207
+ target_dtype = self.patch_embeddings.projection.weight.dtype
208
+ embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
209
+
210
+ if bool_masked_pos is not None:
211
+ embeddings = torch.where(
212
+ bool_masked_pos.unsqueeze(-1),
213
+ self.mask_token.to(embeddings.dtype).unsqueeze(0),
214
+ embeddings,
215
+ )
216
+
217
+ # add the [CLS] token to the embedded patch tokens
218
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1)
219
+ embeddings = torch.cat((cls_tokens, embeddings), dim=1)
220
+
221
+ # add positional encoding to each token
222
+ embeddings = embeddings + self.interpolate_pos_encoding(
223
+ embeddings, height, width
224
+ )
225
+
226
+ # add register tokens
227
+ embeddings = torch.cat(
228
+ (
229
+ embeddings[:, :1],
230
+ self.register_tokens.expand(embeddings.shape[0], -1, -1),
231
+ embeddings[:, 1:],
232
+ ),
233
+ dim=1,
234
+ )
235
+
236
+ embeddings = self.dropout(embeddings)
237
+
238
+ return embeddings
239
+
240
+
241
+ class Dinov2WithRegistersSelfAttention(nn.Module):
242
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
243
+ super().__init__()
244
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
245
+ config, "embedding_size"
246
+ ):
247
+ raise ValueError(
248
+ f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
249
+ f"heads {config.num_attention_heads}."
250
+ )
251
+
252
+ self.num_attention_heads = config.num_attention_heads
253
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
254
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
255
+
256
+ self.query = nn.Linear(
257
+ config.hidden_size, self.all_head_size, bias=config.qkv_bias
258
+ )
259
+ self.key = nn.Linear(
260
+ config.hidden_size, self.all_head_size, bias=config.qkv_bias
261
+ )
262
+ self.value = nn.Linear(
263
+ config.hidden_size, self.all_head_size, bias=config.qkv_bias
264
+ )
265
+
266
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
267
+
268
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
269
+ new_x_shape = x.size()[:-1] + (
270
+ self.num_attention_heads,
271
+ self.attention_head_size,
272
+ )
273
+ x = x.view(new_x_shape)
274
+ return x.permute(0, 2, 1, 3)
275
+
276
+ def forward(
277
+ self,
278
+ hidden_states,
279
+ head_mask: Optional[torch.Tensor] = None,
280
+ output_attentions: bool = False,
281
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
282
+ mixed_query_layer = self.query(hidden_states)
283
+
284
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
285
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
286
+ query_layer = self.transpose_for_scores(mixed_query_layer)
287
+
288
+ # Take the dot product between "query" and "key" to get the raw attention scores.
289
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
290
+
291
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
292
+
293
+ # Normalize the attention scores to probabilities.
294
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
295
+
296
+ # This is actually dropping out entire tokens to attend to, which might
297
+ # seem a bit unusual, but is taken from the original Transformer paper.
298
+ attention_probs = self.dropout(attention_probs)
299
+
300
+ # Mask heads if we want to
301
+ if head_mask is not None:
302
+ attention_probs = attention_probs * head_mask
303
+
304
+ context_layer = torch.matmul(attention_probs, value_layer)
305
+
306
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
307
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
308
+ context_layer = context_layer.view(new_context_layer_shape)
309
+
310
+ outputs = (
311
+ (context_layer, attention_probs) if output_attentions else (context_layer,)
312
+ )
313
+
314
+ return outputs
315
+
316
+
317
+ class Dinov2WithRegistersSdpaSelfAttention(Dinov2WithRegistersSelfAttention):
318
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
319
+ super().__init__(config)
320
+ self.attention_probs_dropout_prob = config.attention_probs_dropout_prob
321
+
322
+ def forward(
323
+ self,
324
+ hidden_states,
325
+ head_mask: Optional[torch.Tensor] = None,
326
+ output_attentions: bool = False,
327
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
328
+ if output_attentions:
329
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
330
+ logger.warning_once(
331
+ "Dinov2WithRegistersModel is using Dinov2WithRegistersSdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
332
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
333
+ )
334
+ return super().forward(
335
+ hidden_states=hidden_states,
336
+ head_mask=head_mask,
337
+ output_attentions=output_attentions,
338
+ )
339
+
340
+ mixed_query_layer = self.query(hidden_states)
341
+
342
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
343
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
344
+ query_layer = self.transpose_for_scores(mixed_query_layer)
345
+
346
+ context_layer = torch.nn.functional.scaled_dot_product_attention(
347
+ query_layer,
348
+ key_layer,
349
+ value_layer,
350
+ head_mask,
351
+ self.attention_probs_dropout_prob if self.training else 0.0,
352
+ is_causal=False,
353
+ scale=None,
354
+ )
355
+
356
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
357
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
358
+ context_layer = context_layer.view(new_context_layer_shape)
359
+
360
+ return context_layer, None
361
+
362
+
363
+ class Dinov2WithRegistersSelfOutput(nn.Module):
364
+ """
365
+ The residual connection is defined in Dinov2WithRegistersLayer instead of here (as is the case with other models), due to the
366
+ layernorm applied before each block.
367
+ """
368
+
369
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
370
+ super().__init__()
371
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
372
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
373
+
374
+ def forward(
375
+ self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
376
+ ) -> torch.Tensor:
377
+ hidden_states = self.dense(hidden_states)
378
+ hidden_states = self.dropout(hidden_states)
379
+
380
+ return hidden_states
381
+
382
+
383
+ class Dinov2WithRegistersAttention(nn.Module):
384
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
385
+ super().__init__()
386
+ self.attention = Dinov2WithRegistersSelfAttention(config)
387
+ self.output = Dinov2WithRegistersSelfOutput(config)
388
+ self.pruned_heads = set()
389
+
390
+ def prune_heads(self, heads: Set[int]) -> None:
391
+ if len(heads) == 0:
392
+ return
393
+ heads, index = find_pruneable_heads_and_indices(
394
+ heads,
395
+ self.attention.num_attention_heads,
396
+ self.attention.attention_head_size,
397
+ self.pruned_heads,
398
+ )
399
+
400
+ # Prune linear layers
401
+ self.attention.query = prune_linear_layer(self.attention.query, index)
402
+ self.attention.key = prune_linear_layer(self.attention.key, index)
403
+ self.attention.value = prune_linear_layer(self.attention.value, index)
404
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
405
+
406
+ # Update hyper params and store pruned heads
407
+ self.attention.num_attention_heads = self.attention.num_attention_heads - len(
408
+ heads
409
+ )
410
+ self.attention.all_head_size = (
411
+ self.attention.attention_head_size * self.attention.num_attention_heads
412
+ )
413
+ self.pruned_heads = self.pruned_heads.union(heads)
414
+
415
+ def forward(
416
+ self,
417
+ hidden_states: torch.Tensor,
418
+ head_mask: Optional[torch.Tensor] = None,
419
+ output_attentions: bool = False,
420
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
421
+ self_outputs = self.attention(hidden_states, head_mask, output_attentions)
422
+
423
+ attention_output = self.output(self_outputs[0], hidden_states)
424
+
425
+ outputs = (attention_output,) + self_outputs[
426
+ 1:
427
+ ] # add attentions if we output them
428
+ return outputs
429
+
430
+
431
+ class Dinov2WithRegistersSdpaAttention(Dinov2WithRegistersAttention):
432
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
433
+ super().__init__(config)
434
+ self.attention = Dinov2WithRegistersSdpaSelfAttention(config)
435
+
436
+
437
+ class Dinov2WithRegistersLayerScale(nn.Module):
438
+ def __init__(self, config) -> None:
439
+ super().__init__()
440
+ self.lambda1 = nn.Parameter(
441
+ config.layerscale_value * torch.ones(config.hidden_size)
442
+ )
443
+
444
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
445
+ return hidden_state * self.lambda1
446
+
447
+
448
+ def drop_path(
449
+ input: torch.Tensor, drop_prob: float = 0.0, training: bool = False
450
+ ) -> torch.Tensor:
451
+ """
452
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
453
+
454
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
455
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
456
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
457
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
458
+ argument.
459
+ """
460
+ if drop_prob == 0.0 or not training:
461
+ return input
462
+ keep_prob = 1 - drop_prob
463
+ shape = (input.shape[0],) + (1,) * (
464
+ input.ndim - 1
465
+ ) # work with diff dim tensors, not just 2D ConvNets
466
+ random_tensor = keep_prob + torch.rand(
467
+ shape, dtype=input.dtype, device=input.device
468
+ )
469
+ random_tensor.floor_() # binarize
470
+ output = input.div(keep_prob) * random_tensor
471
+ return output
472
+
473
+
474
+ class Dinov2WithRegistersDropPath(nn.Module):
475
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
476
+
477
+ def __init__(self, drop_prob: Optional[float] = None) -> None:
478
+ super().__init__()
479
+ self.drop_prob = drop_prob
480
+
481
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
482
+ return drop_path(hidden_states, self.drop_prob, self.training)
483
+
484
+ def extra_repr(self) -> str:
485
+ return "p={}".format(self.drop_prob)
486
+
487
+
488
+ class Dinov2WithRegistersMLP(nn.Module):
489
+ def __init__(self, config) -> None:
490
+ super().__init__()
491
+ in_features = out_features = config.hidden_size
492
+ hidden_features = int(config.hidden_size * config.mlp_ratio)
493
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
494
+ if isinstance(config.hidden_act, str):
495
+ self.activation = ACT2FN[config.hidden_act]
496
+ else:
497
+ self.activation = config.hidden_act
498
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=True)
499
+
500
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
501
+ hidden_state = self.fc1(hidden_state)
502
+ hidden_state = self.activation(hidden_state)
503
+ hidden_state = self.fc2(hidden_state)
504
+ return hidden_state
505
+
506
+
507
+ class Dinov2WithRegistersSwiGLUFFN(nn.Module):
508
+ def __init__(self, config) -> None:
509
+ super().__init__()
510
+ in_features = out_features = config.hidden_size
511
+ hidden_features = int(config.hidden_size * config.mlp_ratio)
512
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
513
+
514
+ self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True)
515
+ self.weights_out = nn.Linear(hidden_features, out_features, bias=True)
516
+
517
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
518
+ hidden_state = self.weights_in(hidden_state)
519
+ x1, x2 = hidden_state.chunk(2, dim=-1)
520
+ hidden = nn.functional.silu(x1) * x2
521
+ return self.weights_out(hidden)
522
+
523
+
524
+ DINOV2_WITH_REGISTERS_ATTENTION_CLASSES = {
525
+ "eager": Dinov2WithRegistersAttention,
526
+ "sdpa": Dinov2WithRegistersSdpaAttention,
527
+ }
528
+
529
+
530
+ class Dinov2WithRegistersLayer(nn.Module):
531
+ """This corresponds to the Block class in the original implementation."""
532
+
533
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
534
+ super().__init__()
535
+
536
+ self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
537
+ self.attention = DINOV2_WITH_REGISTERS_ATTENTION_CLASSES[
538
+ config._attn_implementation
539
+ ](config)
540
+ self.layer_scale1 = Dinov2WithRegistersLayerScale(config)
541
+ self.drop_path = (
542
+ Dinov2WithRegistersDropPath(config.drop_path_rate)
543
+ if config.drop_path_rate > 0.0
544
+ else nn.Identity()
545
+ )
546
+
547
+ self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
548
+
549
+ if config.use_swiglu_ffn:
550
+ self.mlp = Dinov2WithRegistersSwiGLUFFN(config)
551
+ else:
552
+ self.mlp = Dinov2WithRegistersMLP(config)
553
+ self.layer_scale2 = Dinov2WithRegistersLayerScale(config)
554
+
555
+ def forward(
556
+ self,
557
+ hidden_states: torch.Tensor,
558
+ head_mask: Optional[torch.Tensor] = None,
559
+ output_attentions: bool = False,
560
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
561
+ self_attention_outputs = self.attention(
562
+ self.norm1(
563
+ hidden_states
564
+ ), # in Dinov2WithRegisters, layernorm is applied before self-attention
565
+ head_mask,
566
+ output_attentions=output_attentions,
567
+ )
568
+ attention_output = self_attention_outputs[0]
569
+
570
+ attention_output = self.layer_scale1(attention_output)
571
+ outputs = self_attention_outputs[
572
+ 1:
573
+ ] # add self attentions if we output attention weights
574
+
575
+ # first residual connection
576
+ hidden_states = self.drop_path(attention_output) + hidden_states
577
+
578
+ # in Dinov2WithRegisters, layernorm is also applied after self-attention
579
+ layer_output = self.norm2(hidden_states)
580
+ layer_output = self.mlp(layer_output)
581
+ layer_output = self.layer_scale2(layer_output)
582
+
583
+ # second residual connection
584
+ layer_output = self.drop_path(layer_output) + hidden_states
585
+
586
+ outputs = (layer_output,) + outputs
587
+
588
+ return outputs
589
+
590
+
591
+ class Dinov2WithRegistersEncoder(nn.Module):
592
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
593
+ super().__init__()
594
+ self.config = config
595
+ self.layer = nn.ModuleList(
596
+ [Dinov2WithRegistersLayer(config) for _ in range(config.num_hidden_layers)]
597
+ )
598
+ self.gradient_checkpointing = False
599
+
600
+ def forward(
601
+ self,
602
+ hidden_states: torch.Tensor,
603
+ head_mask: Optional[torch.Tensor] = None,
604
+ output_attentions: bool = False,
605
+ output_hidden_states: bool = False,
606
+ return_dict: bool = True,
607
+ ) -> Union[tuple, BaseModelOutput]:
608
+ all_hidden_states = () if output_hidden_states else None
609
+ all_self_attentions = () if output_attentions else None
610
+
611
+ for i, layer_module in enumerate(self.layer):
612
+ if output_hidden_states:
613
+ all_hidden_states = all_hidden_states + (hidden_states,)
614
+
615
+ layer_head_mask = head_mask[i] if head_mask is not None else None
616
+
617
+ if self.gradient_checkpointing and self.training:
618
+ layer_outputs = self._gradient_checkpointing_func(
619
+ layer_module.__call__,
620
+ hidden_states,
621
+ layer_head_mask,
622
+ output_attentions,
623
+ )
624
+ else:
625
+ layer_outputs = layer_module(
626
+ hidden_states, layer_head_mask, output_attentions
627
+ )
628
+
629
+ hidden_states = layer_outputs[0]
630
+
631
+ if output_attentions:
632
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
633
+
634
+ if output_hidden_states:
635
+ all_hidden_states = all_hidden_states + (hidden_states,)
636
+
637
+ if not return_dict:
638
+ return tuple(
639
+ v
640
+ for v in [hidden_states, all_hidden_states, all_self_attentions]
641
+ if v is not None
642
+ )
643
+ return BaseModelOutput(
644
+ last_hidden_state=hidden_states,
645
+ hidden_states=all_hidden_states,
646
+ attentions=all_self_attentions,
647
+ )
648
+
649
+
650
+ class Dinov2WithRegistersPreTrainedModel(PreTrainedModel):
651
+ """
652
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
653
+ models.
654
+ """
655
+
656
+ config_class = Dinov2WithRegistersConfig
657
+ base_model_prefix = "dinov2_with_registers"
658
+ main_input_name = "pixel_values"
659
+ supports_gradient_checkpointing = True
660
+ _no_split_modules = ["Dinov2WithRegistersSwiGLUFFN"]
661
+ _supports_sdpa = True
662
+
663
+ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
664
+ """Initialize the weights"""
665
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
666
+ # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
667
+ # `trunc_normal_cpu` not implemented in `half` issues
668
+ module.weight.data = nn.init.trunc_normal_(
669
+ module.weight.data.to(torch.float32),
670
+ mean=0.0,
671
+ std=self.config.initializer_range,
672
+ ).to(module.weight.dtype)
673
+ if module.bias is not None:
674
+ module.bias.data.zero_()
675
+ elif isinstance(module, nn.LayerNorm):
676
+ module.bias.data.zero_()
677
+ module.weight.data.fill_(1.0)
678
+ elif isinstance(module, Dinov2WithRegistersEmbeddings):
679
+ module.position_embeddings.data = nn.init.trunc_normal_(
680
+ module.position_embeddings.data.to(torch.float32),
681
+ mean=0.0,
682
+ std=self.config.initializer_range,
683
+ ).to(module.position_embeddings.dtype)
684
+
685
+ module.cls_token.data = nn.init.trunc_normal_(
686
+ module.cls_token.data.to(torch.float32),
687
+ mean=0.0,
688
+ std=self.config.initializer_range,
689
+ ).to(module.cls_token.dtype)
690
+
691
+
692
+ _EXPECTED_OUTPUT_SHAPE = [1, 257, 768]
693
+
694
+
695
+ DINOV2_WITH_REGISTERS_START_DOCSTRING = r"""
696
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
697
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
698
+ behavior.
699
+
700
+ Parameters:
701
+ config ([`Dinov2WithRegistersConfig`]): Model configuration class with all the parameters of the model.
702
+ Initializing with a config file does not load the weights associated with the model, only the
703
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
704
+ """
705
+
706
+ DINOV2_WITH_REGISTERS_BASE_INPUTS_DOCSTRING = r"""
707
+ Args:
708
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
709
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
710
+ [`BitImageProcessor.preprocess`] for details.
711
+
712
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
713
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
714
+ pre-training.
715
+
716
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
717
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
718
+
719
+ - 1 indicates the head is **not masked**,
720
+ - 0 indicates the head is **masked**.
721
+
722
+ output_attentions (`bool`, *optional*):
723
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
724
+ tensors for more detail.
725
+ output_hidden_states (`bool`, *optional*):
726
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
727
+ more detail.
728
+ return_dict (`bool`, *optional*):
729
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
730
+ """
731
+
732
+
733
+ @add_start_docstrings(
734
+ "The bare Dinov2WithRegisters Model transformer outputting raw hidden-states without any specific head on top.",
735
+ DINOV2_WITH_REGISTERS_START_DOCSTRING,
736
+ )
737
+ class Dinov2WithRegistersModel(Dinov2WithRegistersPreTrainedModel):
738
+ def __init__(self, config: Dinov2WithRegistersConfig):
739
+ super().__init__(config)
740
+ self.config = config
741
+
742
+ self.embeddings = Dinov2WithRegistersEmbeddings(config)
743
+ self.encoder = Dinov2WithRegistersEncoder(config)
744
+
745
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
746
+
747
+ # Initialize weights and apply final processing
748
+ self.post_init()
749
+
750
+ def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings:
751
+ return self.embeddings.patch_embeddings
752
+
753
+ def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
754
+ """
755
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
756
+ class PreTrainedModel
757
+ """
758
+ for layer, heads in heads_to_prune.items():
759
+ self.encoder.layer[layer].attention.prune_heads(heads)
760
+
761
+ @add_start_docstrings_to_model_forward(DINOV2_WITH_REGISTERS_BASE_INPUTS_DOCSTRING)
762
+ @add_code_sample_docstrings(
763
+ checkpoint=_CHECKPOINT_FOR_DOC,
764
+ output_type=BaseModelOutputWithPooling,
765
+ config_class=_CONFIG_FOR_DOC,
766
+ modality="vision",
767
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
768
+ )
769
+ def forward(
770
+ self,
771
+ pixel_values: Optional[torch.Tensor] = None,
772
+ bool_masked_pos: Optional[torch.Tensor] = None,
773
+ head_mask: Optional[torch.Tensor] = None,
774
+ output_attentions: Optional[bool] = None,
775
+ output_hidden_states: Optional[bool] = None,
776
+ return_dict: Optional[bool] = None,
777
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
778
+ output_attentions = (
779
+ output_attentions
780
+ if output_attentions is not None
781
+ else self.config.output_attentions
782
+ )
783
+ output_hidden_states = (
784
+ output_hidden_states
785
+ if output_hidden_states is not None
786
+ else self.config.output_hidden_states
787
+ )
788
+ return_dict = (
789
+ return_dict if return_dict is not None else self.config.use_return_dict
790
+ )
791
+
792
+ if pixel_values is None:
793
+ raise ValueError("You have to specify pixel_values")
794
+
795
+ # Prepare head mask if needed
796
+ # 1.0 in head_mask indicate we keep the head
797
+ # attention_probs has shape bsz x n_heads x N x N
798
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
799
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
800
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
801
+
802
+ embedding_output = self.embeddings(
803
+ pixel_values, bool_masked_pos=bool_masked_pos
804
+ )
805
+
806
+ encoder_outputs = self.encoder(
807
+ embedding_output,
808
+ head_mask=head_mask,
809
+ output_attentions=output_attentions,
810
+ output_hidden_states=output_hidden_states,
811
+ return_dict=return_dict,
812
+ )
813
+ sequence_output = encoder_outputs[0]
814
+ sequence_output = self.layernorm(sequence_output)
815
+ pooled_output = sequence_output[:, 0, :]
816
+
817
+ if not return_dict:
818
+ head_outputs = (sequence_output, pooled_output)
819
+ return head_outputs + encoder_outputs[1:]
820
+
821
+ return BaseModelOutputWithPooling(
822
+ last_hidden_state=sequence_output,
823
+ pooler_output=pooled_output,
824
+ hidden_states=encoder_outputs.hidden_states,
825
+ attentions=encoder_outputs.attentions,
826
+ )
827
+
828
+
829
+ # Image classification docstring
830
+ _IMAGE_CLASS_CHECKPOINT = "facebook/dinov2_with_registers-small-imagenet1k-1-layer"
831
+ _IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
832
+
833
+ DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING = r"""
834
+ Args:
835
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
836
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
837
+ [`BitImageProcessor.preprocess`] for details.
838
+
839
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
840
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
841
+
842
+ - 1 indicates the head is **not masked**,
843
+ - 0 indicates the head is **masked**.
844
+
845
+ output_attentions (`bool`, *optional*):
846
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
847
+ tensors for more detail.
848
+ output_hidden_states (`bool`, *optional*):
849
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
850
+ more detail.
851
+ return_dict (`bool`, *optional*):
852
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
853
+ """
854
+
855
+
856
+ @add_start_docstrings(
857
+ """
858
+ Dinov2WithRegisters Model transformer with an image classification head on top (a linear layer on top of the final hidden state
859
+ of the [CLS] token) e.g. for ImageNet.
860
+ """,
861
+ DINOV2_WITH_REGISTERS_START_DOCSTRING,
862
+ )
863
+ class Dinov2WithRegistersForImageClassification(Dinov2WithRegistersPreTrainedModel):
864
+ def __init__(self, config: Dinov2WithRegistersConfig) -> None:
865
+ super().__init__(config)
866
+
867
+ self.num_labels = config.num_labels
868
+ self.dinov2_with_registers = Dinov2WithRegistersModel(config)
869
+
870
+ # Classifier head
871
+ self.classifier = (
872
+ nn.Linear(config.hidden_size * 2, config.num_labels)
873
+ if config.num_labels > 0
874
+ else nn.Identity()
875
+ )
876
+
877
+ # Initialize weights and apply final processing
878
+ self.post_init()
879
+
880
+ @add_start_docstrings_to_model_forward(DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING)
881
+ @add_code_sample_docstrings(
882
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
883
+ output_type=ImageClassifierOutput,
884
+ config_class=_CONFIG_FOR_DOC,
885
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
886
+ )
887
+ def forward(
888
+ self,
889
+ pixel_values: Optional[torch.Tensor] = None,
890
+ head_mask: Optional[torch.Tensor] = None,
891
+ labels: Optional[torch.Tensor] = None,
892
+ output_attentions: Optional[bool] = None,
893
+ output_hidden_states: Optional[bool] = None,
894
+ return_dict: Optional[bool] = None,
895
+ ) -> Union[tuple, ImageClassifierOutput]:
896
+ r"""
897
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
898
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
899
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
900
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
901
+ """
902
+ return_dict = (
903
+ return_dict if return_dict is not None else self.config.use_return_dict
904
+ )
905
+
906
+ outputs = self.dinov2_with_registers(
907
+ pixel_values,
908
+ head_mask=head_mask,
909
+ output_attentions=output_attentions,
910
+ output_hidden_states=output_hidden_states,
911
+ return_dict=return_dict,
912
+ )
913
+
914
+ sequence_output = outputs[0] # batch_size, sequence_length, hidden_size
915
+
916
+ cls_token = sequence_output[:, 0]
917
+ patch_tokens = sequence_output[:, 1:]
918
+
919
+ linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)
920
+
921
+ logits = self.classifier(linear_input)
922
+
923
+ loss = None
924
+ if labels is not None:
925
+ # move labels to correct device to enable model parallelism
926
+ labels = labels.to(logits.device)
927
+ if self.config.problem_type is None:
928
+ if self.num_labels == 1:
929
+ self.config.problem_type = "regression"
930
+ elif self.num_labels > 1 and (
931
+ labels.dtype == torch.long or labels.dtype == torch.int
932
+ ):
933
+ self.config.problem_type = "single_label_classification"
934
+ else:
935
+ self.config.problem_type = "multi_label_classification"
936
+
937
+ if self.config.problem_type == "regression":
938
+ loss_fct = MSELoss()
939
+ if self.num_labels == 1:
940
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
941
+ else:
942
+ loss = loss_fct(logits, labels)
943
+ elif self.config.problem_type == "single_label_classification":
944
+ loss_fct = CrossEntropyLoss()
945
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
946
+ elif self.config.problem_type == "multi_label_classification":
947
+ loss_fct = BCEWithLogitsLoss()
948
+ loss = loss_fct(logits, labels)
949
+
950
+ if not return_dict:
951
+ output = (logits,) + outputs[2:]
952
+ return ((loss,) + output) if loss is not None else output
953
+
954
+ return ImageClassifierOutput(
955
+ loss=loss,
956
+ logits=logits,
957
+ hidden_states=outputs.hidden_states,
958
+ attentions=outputs.attentions,
959
+ )
960
+
961
+
962
+ @add_start_docstrings(
963
+ """
964
+ Dinov2WithRegisters backbone, to be used with frameworks like DETR and MaskFormer.
965
+ """,
966
+ DINOV2_WITH_REGISTERS_START_DOCSTRING,
967
+ )
968
+ class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMixin):
969
+ def __init__(self, config):
970
+ super().__init__(config)
971
+ super()._init_backbone(config)
972
+ self.num_features = [
973
+ config.hidden_size for _ in range(config.num_hidden_layers + 1)
974
+ ]
975
+ self.embeddings = Dinov2WithRegistersEmbeddings(config)
976
+ self.encoder = Dinov2WithRegistersEncoder(config)
977
+
978
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
979
+
980
+ self.num_register_tokens = config.num_register_tokens
981
+
982
+ # Initialize weights and apply final processing
983
+ self.post_init()
984
+
985
+ def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings:
986
+ return self.embeddings.patch_embeddings
987
+
988
+ @add_start_docstrings_to_model_forward(DINOV2_WITH_REGISTERS_INPUTS_DOCSTRING)
989
+ @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
990
+ def forward(
991
+ self,
992
+ pixel_values: torch.Tensor,
993
+ output_hidden_states: Optional[bool] = None,
994
+ output_attentions: Optional[bool] = None,
995
+ return_dict: Optional[bool] = None,
996
+ ) -> BackboneOutput:
997
+ """
998
+ Returns:
999
+
1000
+ Examples:
+
1005
+
1006
+ ```python
1007
+ >>> from transformers import AutoImageProcessor, AutoBackbone
1008
+ >>> import torch
1009
+ >>> from PIL import Image
1010
+ >>> import requests
1011
+
1012
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1013
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1014
+
1015
+ >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
1016
+ >>> model = AutoBackbone.from_pretrained(
1017
+ ... "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"]
1018
+ ... )
1019
+
1020
+ >>> inputs = processor(image, return_tensors="pt")
1021
+
1022
+ >>> outputs = model(**inputs)
1023
+ >>> feature_maps = outputs.feature_maps
1024
+ >>> list(feature_maps[-1].shape)
1025
+ [1, 768, 16, 16]
1026
+ ```"""
1027
+ return_dict = (
1028
+ return_dict if return_dict is not None else self.config.use_return_dict
1029
+ )
1030
+ output_hidden_states = (
1031
+ output_hidden_states
1032
+ if output_hidden_states is not None
1033
+ else self.config.output_hidden_states
1034
+ )
1035
+ output_attentions = (
1036
+ output_attentions
1037
+ if output_attentions is not None
1038
+ else self.config.output_attentions
1039
+ )
1040
+
1041
+ embedding_output = self.embeddings(pixel_values)
1042
+
1043
+ outputs = self.encoder(
1044
+ embedding_output,
1045
+ output_hidden_states=True,
1046
+ output_attentions=output_attentions,
1047
+ return_dict=return_dict,
1048
+ )
1049
+
1050
+ hidden_states = outputs.hidden_states if return_dict else outputs[1]
1051
+
1052
+ feature_maps = ()
1053
+ for stage, hidden_state in zip(self.stage_names, hidden_states):
1054
+ if stage in self.out_features:
1055
+ if self.config.apply_layernorm:
1056
+ hidden_state = self.layernorm(hidden_state)
1057
+ if self.config.reshape_hidden_states:
1058
+ hidden_state = hidden_state[:, self.num_register_tokens + 1 :]
1059
+ # this was actually a bug in the original implementation that we copied here,
1060
+ # because normally the order is height, width
1061
+ batch_size, _, height, width = pixel_values.shape
1062
+ patch_size = self.config.patch_size
1063
+ hidden_state = hidden_state.reshape(
1064
+ batch_size, height // patch_size, width // patch_size, -1
1065
+ )
1066
+ hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
1067
+ feature_maps += (hidden_state,)
1068
+
1069
+ if not return_dict:
1070
+ if output_hidden_states:
1071
+ output = (feature_maps,) + outputs[1:]
1072
+ else:
1073
+ output = (feature_maps,) + outputs[2:]
1074
+ return output
1075
+
1076
+ return BackboneOutput(
1077
+ feature_maps=feature_maps,
1078
+ hidden_states=outputs.hidden_states if output_hidden_states else None,
1079
+ attentions=outputs.attentions if output_attentions else None,
1080
+ )
1081
+
1082
+
1083
+ __all__ = [
1084
+ "Dinov2WithRegistersPreTrainedModel",
1085
+ "Dinov2WithRegistersModel",
1086
+ "Dinov2WithRegistersForImageClassification",
1087
+ "Dinov2WithRegistersBackbone",
1088
+ ]
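For orientation, a small usage sketch of the vendored `Dinov2WithRegistersModel` defined above, built from a randomly initialised config so no weights are downloaded. The concrete sizes (224 input, patch size 14, 4 register tokens, hidden size 768) are assumptions chosen to match the encoder defaults used elsewhere in this repo, and the imports assume the repo root is on `PYTHONPATH` and a `transformers` version that ships the Dinov2-with-registers config.

```python
import torch
from transformers.models.dinov2_with_registers.configuration_dinov2_with_registers import (
    Dinov2WithRegistersConfig,
)
from step1x3d_geometry.models.conditional_encoders.dinov2_with_registers.modeling_dinov2_with_registers import (
    Dinov2WithRegistersModel,
)

config = Dinov2WithRegistersConfig(image_size=224, patch_size=14, num_register_tokens=4)
model = Dinov2WithRegistersModel(config).eval()

with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))

# sequence length = 1 CLS + num_register_tokens + (224 // 14) ** 2 patch tokens = 261
print(out.last_hidden_state.shape)  # torch.Size([1, 261, 768])
print(out.pooler_output.shape)      # torch.Size([1, 768])
```

With registers enabled the usual 257-token DINOv2 sequence grows by the register tokens, which is why the encoders above slice those tokens back out before fusing or returning the embeddings.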
step1x3d_geometry/models/conditional_encoders/label_encoder.py ADDED
@@ -0,0 +1,167 @@
1
+ import random
2
+ import torch
3
+ from torch import nn
4
+ import numpy as np
5
+ import re
6
+ from einops import rearrange
7
+ from dataclasses import dataclass
8
+ from torchvision import transforms
9
+ from diffusers.models.modeling_utils import ModelMixin
10
+
11
+ from transformers.utils import ModelOutput
12
+ from typing import Iterable, Optional, Union, List
13
+
14
+ import step1x3d_geometry
15
+ from step1x3d_geometry.utils.typing import *
16
+ from step1x3d_geometry.utils.misc import get_device
17
+
18
+ from .base import BaseLabelEncoder
19
+
20
+ DEFAULT_POSE = 0 # "unknown", "t-pose", "a-pose", uncond
21
+ NUM_POSE_CLASSES = 3
22
+ POSE_MAPPING = {"unknown": 0, "t-pose": 1, "a-pose": 2, "uncond": 3}
23
+
24
+ DEFAULT_SYMMETRY_TYPE = 0 # "asymmetry", "x", uncond
25
+ NUM_SYMMETRY_TYPE_CLASSES = 2
26
+ SYMMETRY_TYPE_MAPPING = {"asymmetry": 0, "x": 1, "y": 0, "z": 0, "uncond": 2}
27
+
28
+ DEFAULT_GEOMETRY_QUALITY = 0 # "normal", "smooth", "sharp", uncond,
29
+ NUM_GEOMETRY_QUALITY_CLASSES = 3
30
+ GEOMETRY_QUALITY_MAPPING = {"normal": 0, "smooth": 1, "sharp": 2, "uncod": 3}
31
+
32
+
33
+ @step1x3d_geometry.register("label-encoder")
34
+ class LabelEncoder(BaseLabelEncoder, ModelMixin):
35
+ """
36
+ Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.
37
+
38
+ Args:
39
+ num_classes (`int`): The number of classes.
40
+ hidden_size (`int`): The size of the vector embeddings.
41
+ """
42
+
43
+ def configure(self) -> None:
44
+ super().configure()
45
+
46
+ if self.cfg.zero_uncond_embeds:
47
+ self.embedding_table_tpose = nn.Embedding(
48
+ NUM_POSE_CLASSES, self.cfg.hidden_size
49
+ )
50
+ self.embedding_table_symmetry_type = nn.Embedding(
51
+ NUM_SYMMETRY_TYPE_CLASSES, self.cfg.hidden_size
52
+ )
53
+ self.embedding_table_geometry_quality = nn.Embedding(
54
+ NUM_GEOMETRY_QUALITY_CLASSES, self.cfg.hidden_size
55
+ )
56
+ else:
57
+ self.embedding_table_tpose = nn.Embedding(
58
+ NUM_POSE_CLASSES + 1, self.cfg.hidden_size
59
+ )
60
+ self.embedding_table_symmetry_type = nn.Embedding(
61
+ NUM_SYMMETRY_TYPE_CLASSES + 1, self.cfg.hidden_size
62
+ )
63
+ self.embedding_table_geometry_quality = nn.Embedding(
64
+ NUM_GEOMETRY_QUALITY_CLASSES + 1, self.cfg.hidden_size
65
+ )
66
+
67
+ if self.cfg.zero_uncond_embeds:
68
+ self.empty_label_embeds = torch.zeros((1, 3, self.cfg.hidden_size)).detach()
69
+ else:
70
+ self.empty_label_embeds = (
71
+ self.encode_label( # the last class label is for the uncond
72
+ [{"pose": "", "symmetry": "", "geometry_type": ""}]
73
+ ).detach()
74
+ )
75
+
76
+ # load pretrained_model_name_or_path
77
+ if self.cfg.pretrained_model_name_or_path is not None:
78
+ print(f"Loading ckpt from {self.cfg.pretrained_model_name_or_path}")
79
+ ckpt = torch.load(
80
+ self.cfg.pretrained_model_name_or_path, map_location="cpu"
81
+ )["state_dict"]
82
+ pretrained_model_ckpt = {}
83
+ for k, v in ckpt.items():
84
+ if k.startswith("label_condition."):
85
+ pretrained_model_ckpt[k.replace("label_condition.", "")] = v
86
+ self.load_state_dict(pretrained_model_ckpt, strict=True)
87
+
88
+ def encode_label(self, labels: List[dict]) -> torch.FloatTensor:
89
+ tpose_label_embeds = []
90
+ symmetry_type_label_embeds = []
91
+ geometry_quality_label_embeds = []
92
+
93
+ for label in labels:
94
+ if "pose" in label.keys():
95
+ if label["pose"] is None or label["pose"] == "":
96
+ tpose_label_embeds.append(
97
+ torch.zeros(self.cfg.hidden_size).detach().to(get_device())
98
+ )
99
+ else:
100
+ tpose_label_embeds.append(
101
+ self.embedding_table_tpose(
102
+ torch.tensor(POSE_MAPPING[label["pose"][0]]).to(
103
+ get_device()
104
+ )
105
+ )
106
+ )
107
+ else:
108
+ tpose_label_embeds.append(
109
+ self.embedding_table_tpose(
110
+ torch.tensor(DEFAULT_POSE).to(get_device())
111
+ )
112
+ )
113
+
114
+ if "symmetry" in label.keys():
115
+ if label["symmetry"] is None or label["symmetry"] == "":
116
+ symmetry_type_label_embeds.append(
117
+ torch.zeros(self.cfg.hidden_size).detach().to(get_device())
118
+ )
119
+ else:
120
+ symmetry_type_label_embeds.append(
121
+ self.embedding_table_symmetry_type(
122
+ torch.tensor(
123
+ SYMMETRY_TYPE_MAPPING[label["symmetry"][0]]
124
+ ).to(get_device())
125
+ )
126
+ )
127
+ else:
128
+ symmetry_type_label_embeds.append(
129
+ self.embedding_table_symmetry_type(
130
+ torch.tensor(DEFAULT_SYMMETRY_TYPE).to(get_device())
131
+ )
132
+ )
133
+
134
+ if "geometry_type" in label.keys():
135
+ if label["geometry_type"] is None or label["geometry_type"] == "":
136
+ geometry_quality_label_embeds.append(
137
+ torch.zeros(self.cfg.hidden_size).detach().to(get_device())
138
+ )
139
+ else:
140
+ geometry_quality_label_embeds.append(
141
+ self.embedding_table_geometry_quality(
142
+ torch.tensor(
143
+ GEOMETRY_QUALITY_MAPPING[label["geometry_type"][0]]
144
+ ).to(get_device())
145
+ )
146
+ )
147
+ else:
148
+ geometry_quality_label_embeds.append(
149
+ self.embedding_table_geometry_quality(
150
+ torch.tensor(DEFAULT_GEOMETRY_QUALITY).to(get_device())
151
+ )
152
+ )
153
+
154
+ tpose_label_embeds = torch.stack(tpose_label_embeds)
155
+ symmetry_type_label_embeds = torch.stack(symmetry_type_label_embeds)
156
+ geometry_quality_label_embeds = torch.stack(geometry_quality_label_embeds)
157
+
158
+ label_embeds = torch.stack(
159
+ [
160
+ tpose_label_embeds,
161
+ symmetry_type_label_embeds,
162
+ geometry_quality_label_embeds,
163
+ ],
164
+ dim=1,
165
+ ).to(self.dtype)
166
+
167
+ return label_embeds
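A brief usage sketch for LabelEncoder.encode_label above. Construction of the encoder goes through the step1x3d_geometry config system and is omitted here; `label_encoder` and the label values are assumptions. Note that encode_label indexes each attribute with [0], so values are supplied as single-element lists.

# assumed: `label_encoder` is an already-configured LabelEncoder instance
labels = [{"pose": ["t-pose"], "symmetry": ["x"], "geometry_type": ["sharp"]}]
label_embeds = label_encoder.encode_label(labels)
# one embedding per attribute (pose, symmetry, geometry quality): (batch, 3, hidden_size)
assert label_embeds.shape[:2] == (1, 3)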
step1x3d_geometry/models/conditional_encoders/t5_encoder.py ADDED
@@ -0,0 +1,271 @@
1
+ import random
2
+ import torch
3
+ from torch import nn
4
+ import numpy as np
5
+ import re
6
+ import urllib.parse as ul
7
+ from bs4 import BeautifulSoup
8
+ from einops import rearrange
9
+ from dataclasses import dataclass
10
+ from torchvision import transforms
11
+ from diffusers.models.modeling_utils import ModelMixin
12
+
13
+ from transformers import AutoImageProcessor, AutoModel
14
+ from transformers import T5EncoderModel, T5Tokenizer, AutoTokenizer
15
+ from transformers.utils import ModelOutput
16
+ from typing import Iterable, Optional, Union, List
17
+
18
+ import step1x3d_geometry
19
+ from step1x3d_geometry.utils.typing import *
20
+
21
+ from .base import BaseCaptionEncoder
22
+
23
+ bad_punct_regex = re.compile(
24
+ r"["
25
+ + "#®•©™&@·º½¾¿¡§~"
26
+ + "\)"
27
+ + "\("
28
+ + "\]"
29
+ + "\["
30
+ + "\}"
31
+ + "\{"
32
+ + "\|"
33
+ + "\\"
34
+ + "\/"
35
+ + "\*"
36
+ + r"]{1,}"
37
+ ) # noqa
38
+
39
+
40
+ @step1x3d_geometry.register("t5-encoder")
41
+ class T5Encoder(BaseCaptionEncoder, ModelMixin):
42
+
43
+ @dataclass
44
+ class Config(BaseCaptionEncoder.Config):
45
+ pretrained_model_name_or_path: Optional[str] = (
46
+ None # the pretrained model name or path for condition model
47
+ )
48
+ pretrained_t5_name_or_path: Optional[str] = (
49
+ None # the pretrained model name or path for T5
50
+ )
51
+ preprocessing_text: bool = False
52
+ text_max_length: int = 77
53
+ t5_type: Optional[str] = None
54
+
55
+ cfg: Config
56
+
57
+ def configure(self) -> None:
58
+ super().configure()
59
+
60
+ # Load the T5 model and tokenizer
61
+ if self.cfg.pretrained_t5_name_or_path is not None:
62
+ self.cfg.t5_type = f"google-t5/{self.cfg.pretrained_t5_name_or_path.split('google-t5--')[-1].split('/')[0]}"
63
+ self.tokenizer = T5Tokenizer.from_pretrained(
64
+ self.cfg.pretrained_t5_name_or_path
65
+ )
66
+ self.text_model = T5EncoderModel.from_pretrained(
67
+ self.cfg.pretrained_t5_name_or_path, torch_dtype=torch.bfloat16
68
+ )
69
+ else:
70
+ if (
71
+ self.cfg.pretrained_model_name_or_path is None
72
+ ): # default to load t5-base model
73
+ assert self.cfg.t5_type is not None, "The t5_type should be provided"
74
+ print(f"Loading T5 model from {self.cfg.t5_type}")
75
+ self.text_model = T5EncoderModel(
76
+ config=T5EncoderModel.config_class.from_pretrained(
77
+ self.cfg.t5_type,
78
+ )
79
+ ).to(torch.bfloat16)
80
+ elif "t5small" in self.cfg.pretrained_model_name_or_path:
81
+ print("Loading T5 model from google-t5/t5-small")
82
+ self.cfg.t5_type = "google-t5/t5-small"
83
+ self.text_model = T5EncoderModel.from_pretrained(
84
+ self.cfg.t5_type, torch_dtype=torch.bfloat16
85
+ )
86
+ elif "t5base" in self.cfg.pretrained_model_name_or_path:
87
+ print("Loading T5 model from google-t5/t5-base")
88
+ self.cfg.t5_type = "google-t5/t5-base"
89
+ self.text_model = T5EncoderModel.from_pretrained(
90
+ self.cfg.t5_type, torch_dtype=torch.bfloat16
91
+ )
92
+ else:
93
+ raise ValueError(
94
+ f"Unknown T5 model: {self.cfg.pretrained_model_name_or_path}"
95
+ )
96
+ self.tokenizer = T5Tokenizer.from_pretrained(self.cfg.t5_type)
97
+
98
+ # Set the empty image/text embeds
99
+ if self.cfg.zero_uncond_embeds:
100
+ self.empty_text_embeds = torch.zeros(
101
+ (1, self.cfg.text_max_length, self.text_model.config.hidden_size)
102
+ ).detach()
103
+ else:
104
+ self.empty_text_embeds = self.encode_text([""]).detach()
105
+
106
+ # load pretrained_model_name_or_path
107
+ if self.cfg.pretrained_model_name_or_path is not None:
108
+ print(f"Loading ckpt from {self.cfg.pretrained_model_name_or_path}")
109
+ ckpt = torch.load(
110
+ self.cfg.pretrained_model_name_or_path, map_location="cpu"
111
+ )["state_dict"]
112
+ pretrained_model_ckpt = {}
113
+ for k, v in ckpt.items():
114
+ if k.startswith("caption_condition."):
115
+ pretrained_model_ckpt[k.replace("caption_condition.", "")] = v
116
+ self.load_state_dict(pretrained_model_ckpt, strict=True)
117
+
118
+ def clean_caption(self, caption):
119
+ caption = str(caption)
120
+ caption = ul.unquote_plus(caption)
121
+ caption = caption.strip().lower()
122
+ caption = re.sub("<person>", "person", caption)
123
+ # urls:
124
+ caption = re.sub(
125
+ r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
126
+ "",
127
+ caption,
128
+ ) # regex for urls
129
+ caption = re.sub(
130
+ r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
131
+ "",
132
+ caption,
133
+ ) # regex for urls
134
+ # html:
135
+ caption = BeautifulSoup(caption, features="html.parser").text
136
+
137
+ # @<nickname>
138
+ caption = re.sub(r"@[\w\d]+\b", "", caption)
139
+
140
+ # 31C0—31EF CJK Strokes
141
+ # 31F0—31FF Katakana Phonetic Extensions
142
+ # 3200—32FF Enclosed CJK Letters and Months
143
+ # 3300—33FF CJK Compatibility
144
+ # 3400—4DBF CJK Unified Ideographs Extension A
145
+ # 4DC0—4DFF Yijing Hexagram Symbols
146
+ # 4E00—9FFF CJK Unified Ideographs
147
+ caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
148
+ caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
149
+ caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
150
+ caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
151
+ caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
152
+ caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
153
+ caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
154
+ #######################################################
155
+
156
+ # все виды тире / all types of dash --> "-"
157
+ caption = re.sub(
158
+ r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
159
+ "-",
160
+ caption,
161
+ )
162
+
163
+ # кавычки к одному стандарту
164
+ caption = re.sub(r"[`´«»“”¨]", '"', caption)
165
+ caption = re.sub(r"[‘’]", "'", caption)
166
+
167
+ # &quot;
168
+ caption = re.sub(r"&quot;?", "", caption)
169
+ # &amp
170
+ caption = re.sub(r"&amp", "", caption)
171
+
172
+ # ip adresses:
173
+ caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
174
+
175
+ # article ids:
176
+ caption = re.sub(r"\d:\d\d\s+$", "", caption)
177
+
178
+ # \n
179
+ caption = re.sub(r"\\n", " ", caption)
180
+
181
+ # "#123"
182
+ caption = re.sub(r"#\d{1,3}\b", "", caption)
183
+ # "#12345.."
184
+ caption = re.sub(r"#\d{5,}\b", "", caption)
185
+ # "123456.."
186
+ caption = re.sub(r"\b\d{6,}\b", "", caption)
187
+ # filenames:
188
+ caption = re.sub(
189
+ r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption
190
+ )
191
+
192
+ #
193
+ caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT"""
194
+ caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT"""
195
+
196
+ caption = re.sub(
197
+ bad_punct_regex, r" ", caption
198
+ ) # ***AUSVERKAUFT***, #AUSVERKAUFT
199
+ caption = re.sub(r"\s+\.\s+", r" ", caption) # " . "
200
+
201
+ # this-is-my-cute-cat / this_is_my_cute_cat
202
+ regex2 = re.compile(r"(?:\-|\_)")
203
+ if len(re.findall(regex2, caption)) > 3:
204
+ caption = re.sub(regex2, " ", caption)
205
+
206
+ caption = self.basic_clean(caption)
207
+
208
+ caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640
209
+ caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc
210
+ caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231
211
+
212
+ caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
213
+ caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
214
+ caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
215
+ caption = re.sub(
216
+ r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption
217
+ )
218
+ caption = re.sub(r"\bpage\s+\d+\b", "", caption)
219
+
220
+ caption = re.sub(
221
+ r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption
222
+ ) # j2d1a2a...
223
+
224
+ caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
225
+
226
+ caption = re.sub(r"\b\s+\:\s+", r": ", caption)
227
+ caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
228
+ caption = re.sub(r"\s+", " ", caption)
229
+
230
+ caption.strip()
231
+
232
+ caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
233
+ caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
234
+ caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
235
+ caption = re.sub(r"^\.\S+$", "", caption)
236
+
237
+ return caption.strip()
238
+
239
+ def text_preprocessing(self, text):
240
+ if self.cfg.preprocessing_text:
241
+ # The exact text cleaning as was in the training stage:
242
+ text = self.clean_caption(text)
243
+ return text
244
+ else:
245
+ return text.lower().strip()
246
+
247
+ def encode_text(self, texts: List[str]) -> torch.FloatTensor:
248
+ texts = [self.text_preprocessing(text) for text in texts]
249
+
250
+ text_tokens_and_mask = self.tokenizer(
251
+ texts,
252
+ max_length=self.cfg.text_max_length,
253
+ padding="max_length",
254
+ truncation=True,
255
+ return_attention_mask=True,
256
+ add_special_tokens=True,
257
+ return_tensors="pt",
258
+ )
259
+
260
+ text_tokens_and_mask["input_ids"] = text_tokens_and_mask["input_ids"] # N x 77
261
+ text_tokens_and_mask["attention_mask"] = text_tokens_and_mask["attention_mask"]
262
+
263
+ with torch.no_grad():
264
+ label_embeds = self.text_model(
265
+ input_ids=text_tokens_and_mask["input_ids"].to(self.text_model.device),
266
+ attention_mask=text_tokens_and_mask["attention_mask"].to(
267
+ self.text_model.device
268
+ ),
269
+ )["last_hidden_state"].detach()
270
+
271
+ return label_embeds
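For reference, the tokenize-then-encode pattern that T5Encoder.encode_text wraps, written as a standalone sketch; the `google-t5/t5-base` checkpoint and the 77-token maximum mirror the class defaults and are assumptions here, not values pinned by this commit.

import torch
from transformers import T5EncoderModel, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base")
text_model = T5EncoderModel.from_pretrained("google-t5/t5-base")

tokens = tokenizer(
    ["a low-poly fox sitting on a rock"],
    max_length=77, padding="max_length", truncation=True,
    return_attention_mask=True, return_tensors="pt",
)
with torch.no_grad():
    embeds = text_model(
        input_ids=tokens["input_ids"], attention_mask=tokens["attention_mask"]
    ).last_hidden_state
print(embeds.shape)  # torch.Size([1, 77, 768]) for t5-base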
step1x3d_geometry/models/pipelines/pipeline.py ADDED
@@ -0,0 +1,513 @@
1
+ # Some parts of this file are adapted from the Hugging Face Diffusers library.
2
+ import os
3
+ import json
4
+ import warnings
5
+ from typing import Callable, List, Optional, Union, Dict, Any
6
+ import PIL.Image
7
+ import trimesh
8
+ import rembg
9
+ import torch
10
+ import numpy as np
+ import torchvision.transforms.functional as TF  # needed for TF.to_pil_image below
11
+ from huggingface_hub import hf_hub_download
12
+
13
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
14
+ from diffusers.utils import BaseOutput
15
+ from diffusers.utils.torch_utils import randn_tensor
16
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
17
+ from diffusers.loaders import (
18
+ FluxIPAdapterMixin,
19
+ FluxLoraLoaderMixin,
20
+ FromSingleFileMixin,
21
+ TextualInversionLoaderMixin,
22
+ )
23
+ from .pipeline_utils import (
24
+ TransformerDiffusionMixin,
25
+ preprocess_image,
26
+ retrieve_timesteps,
27
+ remove_floater,
28
+ remove_degenerate_face,
29
+ reduce_face,
30
+ smart_load_model,
31
+ )
32
+ from transformers import (
33
+ BitImageProcessor,
34
+ )
35
+
36
+ import step1x3d_geometry
37
+ from step1x3d_geometry.models.autoencoders.surface_extractors import MeshExtractResult
38
+ from step1x3d_geometry.utils.config import ExperimentConfig, load_config
39
+ from ..autoencoders.michelangelo_autoencoder import MichelangeloAutoencoder
40
+ from ..conditional_encoders.dinov2_encoder import Dinov2Encoder
41
+ from ..conditional_encoders.t5_encoder import T5Encoder
42
+ from ..conditional_encoders.label_encoder import LabelEncoder
43
+ from ..transformers.flux_transformer_1d import FluxDenoiser
44
+
45
+
46
+ class Step1X3DGeometryPipelineOutput(BaseOutput):
47
+ """
48
+ Output class for the Step1X-3D geometry pipeline.
49
+
50
+ Args:
51
+ images (`List[PIL.Image.Image]` or `torch.Tensor`):
52
+ List of PIL images or a tensor representing the input images.
53
+ meshes (`List[trimesh.Trimesh]` or `np.ndarray`)
54
+ List of denoised trimesh meshes of length `batch_size`, or per-mesh tuples of NumPy arrays with shapes `(num_vertices, 3)` and `(num_faces, 3)`.
55
+ """
56
+
57
+ image: PIL.Image.Image
58
+ mesh: Union[trimesh.Trimesh, MeshExtractResult, np.ndarray]
59
+
60
+
61
+ class Step1X3DGeometryPipeline(
62
+ DiffusionPipeline, FromSingleFileMixin, TransformerDiffusionMixin
63
+ ):
64
+ """
65
+ Step1X-3D Geometry Pipeline, generate high-quality meshes conditioned on image/caption/label inputs
66
+
67
+ Args:
68
+ scheduler (FlowMatchEulerDiscreteScheduler):
69
+ The diffusion scheduler controlling the denoising process
70
+ vae (MichelangeloAutoencoder):
71
+ Variational Autoencoder for latent space compression/reconstruction
72
+ transformer (FluxDenoiser):
73
+ Transformer-based denoising model
74
+ visual_encoder (Dinov2Encoder):
75
+ Pretrained visual encoder for image feature extraction
76
+ caption_encoder (T5Encoder):
77
+ Text encoder for processing natural language captions
78
+ label_encoder (LabelEncoder):
79
+ Auxiliary text encoder for label conditioning
80
+ visual_eature_extractor (BitImageProcessor):
81
+ Preprocessor for input images
82
+
83
+ Note:
84
+ - CPU offloading sequence: visual_encoder → caption_encoder → label_encoder → transformer → vae
85
+ - Optional components: visual_encoder, visual_eature_extractor, caption_encoder, label_encoder
86
+ """
87
+
88
+ model_cpu_offload_seq = (
89
+ "visual_encoder->caption_encoder->label_encoder->transformer->vae"
90
+ )
91
+ _optional_components = [
92
+ "visual_encoder",
93
+ "visual_eature_extractor",
94
+ "caption_encoder",
95
+ "label_encoder",
96
+ ]
97
+
98
+ @classmethod
99
+ def from_pretrained(cls, model_path, subfolder='.', **kwargs):
100
+ local_model_path = smart_load_model(model_path, subfolder)
101
+ return super().from_pretrained(local_model_path, **kwargs)
102
+
103
+ def __init__(
104
+ self,
105
+ scheduler: FlowMatchEulerDiscreteScheduler,
106
+ vae: MichelangeloAutoencoder,
107
+ transformer: FluxDenoiser,
108
+ visual_encoder: Dinov2Encoder,
109
+ caption_encoder: T5Encoder,
110
+ label_encoder: LabelEncoder,
111
+ visual_eature_extractor: BitImageProcessor,
112
+ ):
113
+ super().__init__()
114
+
115
+ self.register_modules(
116
+ vae=vae,
117
+ transformer=transformer,
118
+ scheduler=scheduler,
119
+ visual_encoder=visual_encoder,
120
+ caption_encoder=caption_encoder,
121
+ label_encoder=label_encoder,
122
+ visual_eature_extractor=visual_eature_extractor,
123
+ )
124
+
125
+ @property
126
+ def guidance_scale(self):
127
+ return self._guidance_scale
128
+
129
+ @property
130
+ def do_classifier_free_guidance(self):
131
+ return self._guidance_scale > 1
132
+
133
+ @property
134
+ def num_timesteps(self):
135
+ return self._num_timesteps
136
+
137
+ def check_inputs(
138
+ self,
139
+ image,
140
+ ):
141
+ r"""
142
+ Check if the inputs are valid. Raise an error if not.
143
+ """
144
+ if isinstance(image, str):
145
+ assert os.path.isfile(image) or image.startswith(
146
+ "http"
147
+ ), "Input image must be a valid URL or a file path."
148
+ elif not isinstance(image, (torch.Tensor, PIL.Image.Image)):
149
+ raise ValueError(
150
+ "Input image must be a `torch.Tensor` or `PIL.Image.Image`."
151
+ )
152
+
153
+ def encode_image(self, image, device, num_meshes_per_prompt):
154
+ dtype = next(self.visual_encoder.parameters()).dtype
155
+
156
+ image_embeds = self.visual_encoder.encode_image(image)
157
+ image_embeds = image_embeds.repeat_interleave(num_meshes_per_prompt, dim=0)
158
+
159
+ uncond_image_embeds = self.visual_encoder.empty_image_embeds.repeat(
160
+ image_embeds.shape[0], 1, 1
161
+ ).to(image_embeds)
162
+
163
+ return image_embeds, uncond_image_embeds
164
+
165
+ def encode_caption(self, caption, device, num_meshes_per_prompt):
166
+ dtype = next(self.label_encoder.parameters()).dtype
167
+
168
+ caption_embeds = self.caption_encoder.encode_text([caption])
169
+ caption_embeds = caption_embeds.repeat_interleave(num_meshes_per_prompt, dim=0)
170
+
171
+ uncond_caption_embeds = self.caption_encoder.empty_text_embeds.repeat(
172
+ caption_embeds.shape[0], 1, 1
173
+ ).to(caption_embeds)
174
+
175
+ return caption_embeds, uncond_caption_embeds
176
+
177
+ def encode_label(self, label, device, num_meshes_per_prompt):
178
+ dtype = next(self.label_encoder.parameters()).dtype
179
+
180
+ label_embeds = self.label_encoder.encode_label([label])
181
+ label_embeds = label_embeds.repeat_interleave(num_meshes_per_prompt, dim=0)
182
+
183
+ uncond_label_embeds = self.label_encoder.empty_label_embeds.repeat(
184
+ label_embeds.shape[0], 1, 1
185
+ ).to(label_embeds)
186
+
187
+ return label_embeds, uncond_label_embeds
188
+
189
+ def prepare_latents(
190
+ self,
191
+ batch_size,
192
+ num_tokens,
193
+ num_channels_latents,
194
+ dtype,
195
+ device,
196
+ generator,
197
+ latents: Optional[torch.Tensor] = None,
198
+ ):
199
+ if latents is not None:
200
+ return latents.to(device=device, dtype=dtype)
201
+
202
+ shape = (batch_size, num_tokens, num_channels_latents)
203
+
204
+ if isinstance(generator, list) and len(generator) != batch_size:
205
+ raise ValueError(
206
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
207
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
208
+ )
209
+
210
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
211
+
212
+ return latents
213
+
214
+ @torch.no_grad()
215
+ def __call__(
216
+ self,
217
+ image: Union[torch.FloatTensor, PIL.Image.Image, str],
218
+ label: Optional[str] = None,
219
+ caption: Optional[str] = None,
220
+ num_inference_steps: int = 30,
221
+ timesteps: List[int] = None,
222
+ num_meshes_per_prompt: int = 1,
223
+ guidance_scale: float = 7.5,
224
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
225
+ latents: Optional[torch.FloatTensor] = None,
226
+ force_remove_background: bool = False,
227
+ background_color: List[int] = [255, 255, 255],
228
+ foreground_ratio: float = 0.95,
229
+ surface_extractor_type: Optional[str] = None,
230
+ bounds: float = 1.05,
231
+ mc_level: float = 0.0,
232
+ octree_resolution: int = 384,
233
+ output_type: str = "trimesh",
234
+ do_remove_floater: bool = True,
235
+ do_remove_degenerate_face: bool = False,
236
+ do_reduce_face: bool = True,
237
+ do_shade_smooth: bool = True,
238
+ max_facenum: int = 200000,
239
+ return_dict: bool = True,
240
+ use_zero_init: Optional[bool] = True,
241
+ zero_steps: Optional[int] = 0,
242
+ ):
243
+ r"""
244
+ Function invoked when calling the pipeline for generation.
245
+
246
+ Args:
247
+ image (`torch.FloatTensor` or `PIL.Image.Image` or `str`):
248
+ `Image`, or tensor representing an image batch, or path to an image file. The image will be encoded to
249
+ its CLIP/DINO-v2 embedding which the DiT will be conditioned on.
250
+ label (`str`):
251
+ The label of the generated mesh, like {"symmetry": "asymmetry", "edge_type": "smooth"}
252
+ num_inference_steps (`int`, *optional*, defaults to 30):
253
+ The number of denoising steps. More denoising steps usually lead to a higher quality mesh at the expense
254
+ of slower inference.
255
+ timesteps (`List[int]`, *optional*):
256
+ Custom timesteps to use for the denoising process. If not provided, will use equally spaced timesteps.
257
+ num_meshes_per_prompt (`int`, *optional*, defaults to 1):
258
+ The number of meshes to generate per input image.
259
+ guidance_scale (`float`, *optional*, defaults to 7.5):
260
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
261
+ Higher guidance scale encourages generation that closely matches the input image.
262
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
263
+ A torch generator (or a list of generators) to make the generation deterministic.
264
+ latents (`torch.FloatTensor`, *optional*):
265
+ Pre-generated noisy latents to use as inputs for mesh generation.
266
+ force_remove_background (`bool`, *optional*, defaults to `False`):
267
+ Whether to force remove the background from the input image before processing.
268
+ background_color (`List[int]`, *optional*, defaults to `[255, 255, 255]`):
269
+ RGB color values for the background if it needs to be removed or modified.
270
+ foreground_ratio (`float`, *optional*, defaults to 0.95):
271
+ Ratio of the image to consider as foreground when processing.
272
+ surface_extractor_type (`str`, *optional*, defaults to "mc"):
273
+ Type of surface extraction method to use ("mc" for Marching Cubes or other available methods).
274
+ bounds (`float`, *optional*, defaults to 1.05):
275
+ Bounding box size for the generated mesh.
276
+ mc_level (`float`, *optional*, defaults to 0.0):
277
+ Iso-surface level value for Marching Cubes extraction.
278
+ octree_resolution (`int`, *optional*, defaults to 384):
279
+ Resolution of the octree used for mesh generation.
280
+ output_type (`str`, *optional*, defaults to "trimesh"):
281
+ Type of output mesh format ("trimesh" or other supported formats).
282
+ return_dict (`bool`, *optional*, defaults to `True`):
283
+ Whether or not to return a `Step1X3DGeometryPipelineOutput` instead of a plain tuple.
284
+
285
+ Returns:
286
+ [`Step1X3DGeometryPipelineOutput`] or `tuple`:
287
+ If `return_dict` is `True`, [`Step1X3DGeometryPipelineOutput`] is returned, otherwise a `tuple` is returned where the
288
+ first element is the preprocessed input image and the second element is the list of generated meshes.
289
+ """
290
+ # 0. Check inputs. Raise error if not correct
291
+ self.check_inputs(
292
+ image=image,
293
+ )
294
+ device = self._execution_device
295
+ self._guidance_scale = guidance_scale
296
+
297
+ # 1. Define call parameters
298
+ if isinstance(image, torch.Tensor):
299
+ batch_size = image.shape[0]
300
+ elif isinstance(image, PIL.Image.Image) or isinstance(image, str):
301
+ batch_size = 1
302
+
303
+ # 2. Preprocess input image
304
+ if isinstance(image, torch.Tensor):
305
+ assert image.ndim == 3 # H, W, 3
306
+ image_pil = TF.to_pil_image(image)
307
+ elif isinstance(image, PIL.Image.Image):
308
+ image_pil = image
309
+ elif isinstance(image, str):
310
+ if image.startswith("http"):
311
+ import requests
312
+
313
+ image_pil = PIL.Image.open(requests.get(image, stream=True).raw)
314
+ else:
315
+ image_pil = PIL.Image.open(image)
316
+ image_pil = preprocess_image(image_pil, force=force_remove_background, background_color=background_color, foreground_ratio=foreground_ratio)  # remove the background from the input image
317
+
318
+ # 3. Encode condition
319
+ image_embeds, negative_image_embeds = self.encode_image(
320
+ image_pil, device, num_meshes_per_prompt
321
+ )
322
+ if self.do_classifier_free_guidance and image_embeds is not None:
323
+ image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0)
324
+ # 3.1 Encode label condition
325
+ label_embeds = None
326
+ if self.transformer.cfg.use_label_condition:
327
+ if label is not None:
328
+ label_embeds, negative_label_embeds = self.encode_label(
329
+ label, device, num_meshes_per_prompt
330
+ )
331
+ if self.do_classifier_free_guidance:
332
+ label_embeds = torch.cat(
333
+ [negative_label_embeds, label_embeds], dim=0
334
+ )
335
+ else:
336
+ uncond_label_embeds = self.label_encoder.empty_label_embeds.repeat(
337
+ num_meshes_per_prompt, 1, 1
338
+ ).to(image_embeds)
339
+ if self.do_classifier_free_guidance:
340
+ label_embeds = torch.cat(
341
+ [uncond_label_embeds, uncond_label_embeds], dim=0
342
+ )
343
+ # 3.3 Encode caption condition
344
+ caption_embeds = None
345
+ if self.transformer.cfg.use_caption_condition:
346
+ if caption is not None:
347
+ caption_embeds, negative_caption_embeds = self.encode_caption(
348
+ caption, device, num_meshes_per_prompt
349
+ )
350
+ if self.do_classifier_free_guidance:
351
+ caption_embeds = torch.cat(
352
+ [negative_caption_embeds, caption_embeds], dim=0
353
+ )
354
+ else:
355
+ uncond_caption_embeds = self.caption_encoder.empty_text_embeds.repeat(
356
+ num_meshes_per_prompt, 1, 1
357
+ ).to(image_embeds)
358
+ if self.do_classifier_free_guidance:
359
+ caption_embeds = torch.cat(
360
+ [uncond_caption_embeds, uncond_caption_embeds], dim=0
361
+ )
362
+
363
+ # 4. Prepare timesteps
364
+ timesteps, num_inference_steps = retrieve_timesteps(
365
+ self.scheduler, num_inference_steps, device, timesteps
366
+ )
367
+ num_warmup_steps = max(
368
+ len(timesteps) - num_inference_steps * self.scheduler.order, 0
369
+ )
370
+ self._num_timesteps = len(timesteps)
371
+
372
+ # 5. Prepare latent variables
373
+ num_latents = self.vae.cfg.num_latents
374
+ num_channels_latents = self.transformer.cfg.input_channels
375
+ latents = self.prepare_latents(
376
+ batch_size * num_meshes_per_prompt,
377
+ num_latents,
378
+ num_channels_latents,
379
+ image_embeds.dtype,
380
+ device,
381
+ generator,
382
+ latents,
383
+ )
384
+
385
+ # 6. Denoising loop
386
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
387
+ for i, t in enumerate(timesteps):
388
+ # expand the latents if we are doing classifier free guidance
389
+ latent_model_input = (
390
+ torch.cat([latents] * 2)
391
+ if self.do_classifier_free_guidance
392
+ else latents
393
+ )
394
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
395
+ timestep = t.expand(latent_model_input.shape[0])
396
+
397
+ noise_pred = self.transformer(
398
+ latent_model_input,
399
+ timestep,
400
+ visual_condition=image_embeds,
401
+ label_condition=label_embeds,
402
+ caption_condition=caption_embeds,
403
+ return_dict=False,
404
+ )[0]
405
+
406
+ # perform guidance
407
+ if self.do_classifier_free_guidance:
408
+ noise_pred_uncond, noise_pred_image = noise_pred.chunk(2)
409
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
410
+ noise_pred_image - noise_pred_uncond
411
+ )
412
+
413
+ if (i <= zero_steps) and use_zero_init:
414
+ noise_pred = noise_pred * 0.0
415
+
416
+ # compute the previous noisy sample x_t -> x_t-1
417
+ latents_dtype = latents.dtype
418
+ latents = self.scheduler.step(
419
+ noise_pred, t, latents, return_dict=False
420
+ )[0]
421
+
422
+ if latents.dtype != latents_dtype:
423
+ if torch.backends.mps.is_available():
424
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
425
+ latents = latents.to(latents_dtype)
426
+
427
+ if i == len(timesteps) - 1 or (
428
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
429
+ ):
430
+ progress_bar.update()
431
+
432
+ # 4. Post-processing
433
+ if not output_type == "latent":
434
+ if latents.dtype == torch.bfloat16:
435
+ self.vae.to(torch.float16)
436
+ latents = latents.to(torch.float16)
437
+ mesh = self.vae.extract_geometry(
438
+ self.vae.decode(latents),
439
+ surface_extractor_type=surface_extractor_type,
440
+ bounds=bounds,
441
+ mc_level=mc_level,
442
+ octree_resolution=octree_resolution,
443
+ enable_pbar=False,
444
+ )
445
+ if output_type != "raw":
446
+ mesh_list = []
447
+ for i, cur_mesh in enumerate(mesh):
448
+ print(f"Generating mesh {i+1}/{num_meshes_per_prompt}")
449
+ if output_type == "trimesh":
450
+ import trimesh
451
+
452
+ cur_mesh = trimesh.Trimesh(
453
+ vertices=cur_mesh.verts.cpu().numpy(),
454
+ faces=cur_mesh.faces.cpu().numpy(),
455
+ )
456
+ cur_mesh.fix_normals()
457
+ cur_mesh.face_normals
458
+ cur_mesh.vertex_normals
459
+ cur_mesh.visual = trimesh.visual.TextureVisuals(
460
+ material=trimesh.visual.material.PBRMaterial(
461
+ baseColorFactor=(255, 255, 255),
462
+ main_color=(255, 255, 255),
463
+ metallicFactor=0.05,
464
+ roughnessFactor=1.0,
465
+ )
466
+ )
467
+ if do_remove_floater:
468
+ cur_mesh = remove_floater(cur_mesh)
469
+ if do_remove_degenerate_face:
470
+ cur_mesh = remove_degenerate_face(cur_mesh)
471
+ if do_reduce_face and max_facenum > 0:
472
+ cur_mesh = reduce_face(cur_mesh, max_facenum)
473
+ if do_shade_smooth:
474
+ cur_mesh = cur_mesh.smooth_shaded
475
+ mesh_list.append(cur_mesh)
476
+ elif output_type == "np":
477
+ if do_remove_floater:
478
+ print(
479
+ 'remove floater is NOT used when output_type is "np". '
480
+ )
481
+ if do_remove_degenerate_face:
482
+ print(
483
+ 'remove degenerate face is NOT used when output_type is "np". '
484
+ )
485
+ if do_reduce_face:
486
+ print(
487
+ 'reduce face is NOT used when output_type is "np". '
488
+ )
489
+ if do_shade_smooth:
490
+ print('shade smooth is NOT used when output_type is "np". ')
491
+ mesh_list.append(
492
+ [
493
+ cur_mesh[0].verts.cpu().numpy(),
494
+ cur_mesh[0].faces.cpu().numpy(),
495
+ ]
496
+ )
497
+ mesh = mesh_list
498
+ else:
499
+ if do_remove_floater:
500
+ print('remove floater is NOT used when output_type is "raw". ')
501
+ if do_remove_degenerate_face:
502
+ print(
503
+ 'remove degenerate face is NOT used when output_type is "raw". '
504
+ )
505
+ if do_reduce_face:
506
+ print('reduce face is NOT used when output_type is "raw". ')
507
+
508
+ else:
509
+ mesh = latents
510
+
511
+ if not return_dict:
512
+ return tuple(image_pil), tuple(mesh)
513
+ return Step1X3DGeometryPipelineOutput(image=image_pil, mesh=mesh)
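An end-to-end usage sketch for the pipeline defined above. The call signature follows __call__ as written in this file; the model path, image path, and label values are placeholders, not values taken from this commit.

from step1x3d_geometry.models.pipelines.pipeline import Step1X3DGeometryPipeline

# placeholders: point these at the actual geometry checkpoint and input image
pipeline = Step1X3DGeometryPipeline.from_pretrained(
    "<model-repo-or-local-path>", subfolder="<geometry-subfolder>"
).to("cuda")

out = pipeline(
    "input_image.png",                                        # path, URL, PIL.Image or tensor
    label={"symmetry": "asymmetry", "edge_type": "smooth"},   # optional, see the docstring above
    guidance_scale=7.5,
    num_inference_steps=30,
    octree_resolution=384,
    max_facenum=200000,
)
out.mesh[0].export("output.glb")  # output_type defaults to "trimesh"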
step1x3d_geometry/models/pipelines/pipeline_utils.py ADDED
@@ -0,0 +1,404 @@
1
+ from typing import Callable, List, Optional, Union, Dict, Any
2
+ import os
+ import inspect  # used by retrieve_timesteps below
3
+ from diffusers.utils import logging
4
+ import PIL.Image
5
+ import torch
6
+ import trimesh
7
+ import pymeshlab
8
+ import tempfile
9
+ from step1x3d_geometry.models.autoencoders.surface_extractors import MeshExtractResult
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+
14
+ def preprocess_image(
15
+ images_pil: Union[List[PIL.Image.Image], PIL.Image.Image],
16
+ force: bool = False,
17
+ background_color: List[int] = [255, 255, 255],
18
+ foreground_ratio: float = 0.9,
19
+ rembg_backend: str = "bria",
20
+ **rembg_kwargs,
21
+ ):
22
+ r"""
23
+ Crop and remove the background of the input image.
24
+ Args:
25
+ image_pil (`List[PIL.Image.Image]`):
26
+ List of `PIL.Image.Image` objects representing the input image.
27
+ force (`bool`, *optional*, defaults to `False`):
28
+ Whether to force remove the background even if the image has an alpha channel.
29
+ Returns:
30
+ `List[PIL.Image.Image]`: List of `PIL.Image.Image` objects representing the preprocessed image.
31
+ """
32
+ is_single_image = False
33
+ if isinstance(images_pil, PIL.Image.Image):
34
+ images_pil = [images_pil]
35
+ is_single_image = True
36
+ preprocessed_images = []
37
+ for i in range(len(images_pil)):
38
+ image = images_pil[i]
39
+ width, height, size = image.width, image.height, image.size
40
+ do_remove = True
41
+ if image.mode == "RGBA" and image.getextrema()[3][0] < 255:
42
+ # the alpha channel already provides a mask, so skip background removal
43
+ print(
44
+ "alpha channel not empty, skipping background removal and using the alpha channel as the mask"
45
+ )
46
+ do_remove = False
47
+ do_remove = do_remove or force
48
+ if do_remove:
49
+ import rembg # lazy import
50
+
51
+ if rembg_backend == "default":
52
+ image = rembg.remove(image, **rembg_kwargs)
53
+ else:
54
+ image = rembg.remove(
55
+ image,
56
+ session=rembg.new_session(
57
+ model_name="bria",
58
+ providers=[
59
+ (
60
+ "CUDAExecutionProvider",
61
+ {
62
+ "device_id": 0,
63
+ "arena_extend_strategy": "kSameAsRequested",
64
+ "gpu_mem_limit": 6 * 1024 * 1024 * 1024,
65
+ "cudnn_conv_algo_search": "HEURISTIC",
66
+ },
67
+ ),
68
+ "CPUExecutionProvider",
69
+ ],
70
+ ),
71
+ **rembg_kwargs,
72
+ )
73
+
74
+ # calculate the min bbox of the image
75
+ alpha = image.split()[-1]
76
+ bboxs = alpha.getbbox()
77
+ x1, y1, x2, y2 = bboxs
78
+ dy, dx = y2 - y1, x2 - x1
79
+ s = min(height * foreground_ratio / dy, width * foreground_ratio / dx)
80
+ Ht, Wt = int(dy * s), int(dx * s)
81
+
82
+ background = PIL.Image.new("RGBA", image.size, (*background_color, 255))
83
+ image = PIL.Image.alpha_composite(background, image)
84
+ image = image.crop(alpha.getbbox())
85
+ alpha = alpha.crop(alpha.getbbox())
86
+
87
+ # Calculate the new size after rescaling
88
+ new_size = tuple(int(dim * foreground_ratio) for dim in size)
89
+ # Resize the image while maintaining the aspect ratio
90
+ resized_image = image.resize((Wt, Ht))
91
+ resized_alpha = alpha.resize((Wt, Ht))
92
+ # Create a new image with the original size and white background
93
+ padded_image = PIL.Image.new("RGB", size, tuple(background_color))
94
+ padded_alpha = PIL.Image.new("L", size, (0))
95
+ paste_position = (
96
+ (width - resized_image.width) // 2,
97
+ (height - resized_image.height) // 2,
98
+ )
99
+ padded_image.paste(resized_image, paste_position)
100
+ padded_alpha.paste(resized_alpha, paste_position)
101
+
102
+ # expand image to 1:1
103
+ width, height = padded_image.size
104
+ if width == height:
105
+ padded_image.putalpha(padded_alpha)
106
+ preprocessed_images.append(padded_image)
107
+ continue
108
+ new_size = (max(width, height), max(width, height))
109
+ new_image = PIL.Image.new("RGB", new_size, tuple(background_color))
110
+ new_alpha = PIL.Image.new("L", new_size, (0))
111
+ paste_position = ((new_size[0] - width) // 2, (new_size[1] - height) // 2)
112
+ new_image.paste(padded_image, paste_position)
113
+ new_alpha.paste(padded_alpha, paste_position)
114
+ new_image.putalpha(new_alpha)
115
+ preprocessed_images.append(new_image)
116
+
117
+ if is_single_image:
118
+ return preprocessed_images[0]
119
+ return preprocessed_images
120
+
121
+
122
+ def load_mesh(path):
123
+ if path.endswith(".glb"):
124
+ mesh = trimesh.load(path)
125
+ else:
126
+ mesh = pymeshlab.MeshSet()
127
+ mesh.load_new_mesh(path)
128
+ return mesh
129
+
130
+
131
+ def trimesh2pymeshlab(mesh: trimesh.Trimesh):
132
+ with tempfile.NamedTemporaryFile(suffix=".ply", delete=False) as temp_file:
133
+ if isinstance(mesh, trimesh.scene.Scene):
134
+ for idx, obj in enumerate(mesh.geometry.values()):
135
+ if idx == 0:
136
+ temp_mesh = obj
137
+ else:
138
+ temp_mesh = temp_mesh + obj
139
+ mesh = temp_mesh
140
+ mesh.export(temp_file.name)
141
+ mesh = pymeshlab.MeshSet()
142
+ mesh.load_new_mesh(temp_file.name)
143
+ return mesh
144
+
145
+
146
+ def pymeshlab2trimesh(mesh: pymeshlab.MeshSet):
147
+ with tempfile.NamedTemporaryFile(suffix=".ply", delete=False) as temp_file:
148
+ mesh.save_current_mesh(temp_file.name)
149
+ mesh = trimesh.load(temp_file.name)
150
+ if isinstance(mesh, trimesh.Scene):
151
+ combined_mesh = trimesh.Trimesh()
152
+ for geom in mesh.geometry.values():
153
+ combined_mesh = trimesh.util.concatenate([combined_mesh, geom])
154
+ mesh = combined_mesh
155
+ return mesh
156
+
157
+
158
+ def import_mesh(mesh):
159
+ mesh_type = type(mesh)
160
+ if isinstance(mesh, str):
161
+ mesh = load_mesh(mesh)
162
+ elif isinstance(mesh, MeshExtractResult):
163
+ mesh = pymeshlab.MeshSet()
164
+ mesh_pymeshlab = pymeshlab.Mesh(
165
+ vertex_matrix=mesh.verts.cpu().numpy(), face_matrix=mesh.faces.cpu().numpy()
166
+ )
167
+ mesh.add_mesh(mesh_pymeshlab, "converted_mesh")
168
+
169
+ if isinstance(mesh, (trimesh.Trimesh, trimesh.scene.Scene)):
170
+ mesh = trimesh2pymeshlab(mesh)
171
+
172
+ return mesh, mesh_type
173
+
174
+
175
+ def remove_floater(mesh):
176
+ mesh, mesh_type = import_mesh(mesh)
177
+
178
+ mesh.apply_filter(
179
+ "compute_selection_by_small_disconnected_components_per_face", nbfaceratio=0.001
180
+ )
181
+ mesh.apply_filter("compute_selection_transfer_face_to_vertex", inclusive=False)
182
+ mesh.apply_filter("meshing_remove_selected_vertices_and_faces")
183
+
184
+ return pymeshlab2trimesh(mesh)
185
+
186
+
187
+ def remove_degenerate_face(mesh):
188
+ mesh, mesh_type = import_mesh(mesh)
189
+
190
+ with tempfile.NamedTemporaryFile(suffix=".ply", delete=False) as temp_file:
191
+ mesh.save_current_mesh(temp_file.name)
192
+ mesh = pymeshlab.MeshSet()
193
+ mesh.load_new_mesh(temp_file.name)
194
+
195
+ return pymeshlab2trimesh(mesh)
196
+
197
+
198
+ def reduce_face(mesh, max_facenum=50000):
199
+ mesh, mesh_type = import_mesh(mesh)
200
+
201
+ if max_facenum > mesh.current_mesh().face_number():
202
+ return pymeshlab2trimesh(mesh)
203
+
204
+ mesh.apply_filter(
205
+ "meshing_decimation_quadric_edge_collapse",
206
+ targetfacenum=max_facenum,
207
+ qualitythr=1.0,
208
+ preserveboundary=True,
209
+ boundaryweight=3,
210
+ preservenormal=True,
211
+ preservetopology=True,
212
+ autoclean=True,
213
+ )
214
+
215
+ return pymeshlab2trimesh(mesh)
216
+
217
+
218
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
219
+ def retrieve_timesteps(
220
+ scheduler,
221
+ num_inference_steps: Optional[int] = None,
222
+ device: Optional[Union[str, torch.device]] = None,
223
+ timesteps: Optional[List[int]] = None,
224
+ sigmas: Optional[List[float]] = None,
225
+ **kwargs,
226
+ ):
227
+ r"""
228
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
229
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
230
+
231
+ Args:
232
+ scheduler (`SchedulerMixin`):
233
+ The scheduler to get timesteps from.
234
+ num_inference_steps (`int`):
235
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
236
+ must be `None`.
237
+ device (`str` or `torch.device`, *optional*):
238
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
239
+ timesteps (`List[int]`, *optional*):
240
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
241
+ `num_inference_steps` and `sigmas` must be `None`.
242
+ sigmas (`List[float]`, *optional*):
243
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
244
+ `num_inference_steps` and `timesteps` must be `None`.
245
+
246
+ Returns:
247
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
248
+ second element is the number of inference steps.
249
+ """
250
+ if timesteps is not None and sigmas is not None:
251
+ raise ValueError(
252
+ "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
253
+ )
254
+ if timesteps is not None:
255
+ accepts_timesteps = "timesteps" in set(
256
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
257
+ )
258
+ if not accepts_timesteps:
259
+ raise ValueError(
260
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
261
+ f" timestep schedules. Please check whether you are using the correct scheduler."
262
+ )
263
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
264
+ timesteps = scheduler.timesteps
265
+ num_inference_steps = len(timesteps)
266
+ elif sigmas is not None:
267
+ accept_sigmas = "sigmas" in set(
268
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
269
+ )
270
+ if not accept_sigmas:
271
+ raise ValueError(
272
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
273
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
274
+ )
275
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
276
+ timesteps = scheduler.timesteps
277
+ num_inference_steps = len(timesteps)
278
+ else:
279
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
280
+ timesteps = scheduler.timesteps
281
+ return timesteps, num_inference_steps
282
+
283
+
284
+ class TransformerDiffusionMixin:
285
+ r"""
286
+ Helper for DiffusionPipeline with vae and transformer.(mainly for DIT)
287
+ """
288
+
289
+ def enable_vae_slicing(self):
290
+ r"""
291
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
292
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
293
+ """
294
+ self.vae.enable_slicing()
295
+
296
+ def disable_vae_slicing(self):
297
+ r"""
298
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
299
+ computing decoding in one step.
300
+ """
301
+ self.vae.disable_slicing()
302
+
303
+ def enable_vae_tiling(self):
304
+ r"""
305
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
306
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
307
+ processing larger images.
308
+ """
309
+ self.vae.enable_tiling()
310
+
311
+ def disable_vae_tiling(self):
312
+ r"""
313
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
314
+ computing decoding in one step.
315
+ """
316
+ self.vae.disable_tiling()
317
+
318
+ def fuse_qkv_projections(self, transformer: bool = True, vae: bool = True):
319
+ """
320
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
321
+ are fused. For cross-attention modules, key and value projection matrices are fused.
322
+
323
+ <Tip warning={true}>
324
+
325
+ This API is 🧪 experimental.
326
+
327
+ </Tip>
328
+
329
+ Args:
330
+ transformer (`bool`, defaults to `True`): To apply fusion on the Transformer.
331
+ vae (`bool`, defaults to `True`): To apply fusion on the VAE.
332
+ """
333
+ self.fusing_transformer = False
334
+ self.fusing_vae = False
335
+
336
+ if transformer:
337
+ self.fusing_transformer = True
338
+ self.transformer.fuse_qkv_projections()
339
+
340
+ if vae:
341
+ self.fusing_vae = True
342
+ self.vae.fuse_qkv_projections()
343
+
344
+ def unfuse_qkv_projections(self, transformer: bool = True, vae: bool = True):
345
+ """Disable QKV projection fusion if enabled.
346
+
347
+ <Tip warning={true}>
348
+
349
+ This API is 🧪 experimental.
350
+
351
+ </Tip>
352
+
353
+ Args:
354
+ transformer (`bool`, defaults to `True`): To apply fusion on the Transformer.
355
+ vae (`bool`, defaults to `True`): To apply fusion on the VAE.
356
+
357
+ """
358
+ if transformer:
359
+ if not self.fusing_transformer:
360
+ logger.warning(
361
+ "The UNet was not initially fused for QKV projections. Doing nothing."
362
+ )
363
+ else:
364
+ self.transformer.unfuse_qkv_projections()
365
+ self.fusing_transformer = False
366
+
367
+ if vae:
368
+ if not self.fusing_vae:
369
+ logger.warning(
370
+ "The VAE was not initially fused for QKV projections. Doing nothing."
371
+ )
372
+ else:
373
+ self.vae.unfuse_qkv_projections()
374
+ self.fusing_vae = False
375
+
376
+ def try_download(model_id, subfolder):
377
+ try:
378
+ from huggingface_hub import snapshot_download
379
+
380
+ path = snapshot_download(
381
+ repo_id=model_id,
382
+ allow_patterns=[f"{subfolder}/*"],
383
+ )
384
+ print(path)
385
+ model_path = os.path.join(path, subfolder)
386
+ return model_path
387
+ except Exception as e:
388
+ raise e
389
+
390
+
391
+ def smart_load_model(model_path, subfolder = ""):
392
+ if subfolder == "":
393
+ if os.path.exists(model_path):
394
+ return model_path
395
+ else:
396
+ return try_download(model_path, '.')
397
+ else:
398
+ if os.path.exists(os.path.join(model_path, subfolder)):
399
+ return os.path.join(model_path, subfolder)
400
+ else:
401
+ return try_download(model_path, subfolder)
402
+
403
+
404
+
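A short sketch chaining the image and mesh helpers defined above; the file names are placeholders.

import PIL.Image
from step1x3d_geometry.models.pipelines.pipeline_utils import (
    preprocess_image, reduce_face, remove_degenerate_face, remove_floater,
)

image = preprocess_image(PIL.Image.open("input.png"), foreground_ratio=0.95)  # recenter on a flat background
mesh = remove_floater("raw_mesh.glb")          # drop small disconnected components
mesh = remove_degenerate_face(mesh)            # round-trip through pymeshlab to drop bad faces
mesh = reduce_face(mesh, max_facenum=200000)   # quadric edge-collapse decimation
mesh.export("clean_mesh.glb")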
step1x3d_geometry/models/transformers/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from . import flux_transformer_1d, pixart_transformer_1d
step1x3d_geometry/models/transformers/flux_transformer_1d.py ADDED
@@ -0,0 +1,600 @@
1
+ # Some parts of this file are adapted from Hugging Face Diffusers library.
2
+ from typing import Any, Dict, Optional, Union, Tuple
3
+ from dataclasses import dataclass
4
+
5
+ import re
6
+ import torch
7
+ from torch import nn
8
+
9
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
10
+ from diffusers.loaders import PeftAdapterMixin
11
+ from diffusers.models.attention_processor import (
12
+ Attention,
13
+ AttentionProcessor,
14
+ AttnProcessor,
15
+ )
16
+ from diffusers.models.modeling_utils import ModelMixin
17
+ from diffusers.models.embeddings import (
18
+ GaussianFourierProjection,
19
+ TimestepEmbedding,
20
+ Timesteps,
21
+ )
22
+ from diffusers.utils import (
23
+ USE_PEFT_BACKEND,
24
+ is_torch_version,
25
+ logging,
26
+ scale_lora_layers,
27
+ unscale_lora_layers,
28
+ )
29
+ from diffusers.models.normalization import (
30
+ AdaLayerNormSingle,
31
+ AdaLayerNormContinuous,
32
+ FP32LayerNorm,
33
+ LayerNorm,
34
+ )
35
+
36
+ from ..attention_processor import FusedFluxAttnProcessor2_0, FluxAttnProcessor2_0
37
+ from ..attention import FluxTransformerBlock, FluxSingleTransformerBlock
38
+
39
+ import step1x3d_geometry
40
+ from step1x3d_geometry.utils.base import BaseModule
41
+
42
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
43
+
44
+
45
+ @dataclass
46
+ class Transformer1DModelOutput:
47
+ sample: torch.FloatTensor
48
+
49
+
50
+ class FluxTransformer1DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
51
+ r"""
52
+ The Transformer model introduced in Flux.
53
+
54
+ Reference: https://blackforestlabs.ai/announcing-black-forest-la
55
+
56
+ Parameters:
57
+ num_attention_heads (`int`, *optional*, defaults to 16):
58
+ The number of heads to use for multi-head attention.
59
+ width (`int`, *optional*, defaults to 2048):
60
+ The inner (hidden) dimension of the transformer blocks; the attention head
61
+ dimension is `width // num_attention_heads`.
62
+ in_channels (`int`, *optional*, defaults to 4):
63
+ The number of channels in the input and output (specify if the input is **continuous**).
64
+ num_layers (`int`, *optional*, defaults to 19):
65
+ The number of layers of Transformer blocks to use.
66
+ cross_attention_dim (`int`, *optional*):
67
+ Dimensionality of conditional embeddings for cross-attention mechanisms
68
+ """
69
+
70
+ _supports_gradient_checkpointing = True
71
+ _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
72
+ _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
73
+
74
+ @register_to_config
75
+ def __init__(
76
+ self,
77
+ num_attention_heads: int = 16,
78
+ width: int = 2048,
79
+ in_channels: int = 4,
80
+ num_layers: int = 19,
81
+ num_single_layers: int = 38,
82
+ cross_attention_dim: int = 768,
83
+ ):
84
+ super().__init__()
85
+ # Set some common variables used across the board.
86
+ self.out_channels = in_channels
87
+ self.num_heads = num_attention_heads
88
+ self.inner_dim = width
89
+
90
+ # self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
91
+ # self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=self.inner_dim)
92
+ time_embed_dim, timestep_input_dim = self._set_time_proj(
93
+ "positional",
94
+ inner_dim=self.inner_dim,
95
+ flip_sin_to_cos=False,
96
+ freq_shift=0,
97
+ time_embedding_dim=None,
98
+ )
99
+ self.time_proj = TimestepEmbedding(
100
+ timestep_input_dim, time_embed_dim, act_fn="gelu", out_dim=self.inner_dim
101
+ )
102
+ self.proj_in = nn.Linear(self.config.in_channels, self.inner_dim, bias=True)
103
+ self.proj_cross_attention = nn.Linear(
104
+ self.config.cross_attention_dim, self.inner_dim, bias=True
105
+ )
106
+
107
+ # 2. Initialize the transformer blocks.
108
+ self.transformer_blocks = nn.ModuleList(
109
+ [
110
+ FluxTransformerBlock(
111
+ dim=self.inner_dim,
112
+ num_attention_heads=num_attention_heads,
113
+ attention_head_dim=width // num_attention_heads,
114
+ )
115
+ for _ in range(self.config.num_layers)
116
+ ]
117
+ )
118
+ self.single_transformer_blocks = nn.ModuleList(
119
+ [
120
+ FluxSingleTransformerBlock(
121
+ dim=self.inner_dim,
122
+ num_attention_heads=num_attention_heads,
123
+ attention_head_dim=width // num_attention_heads,
124
+ )
125
+ for _ in range(self.config.num_single_layers)
126
+ ]
127
+ )
128
+
129
+ # 3. Output blocks.
130
+ self.norm_out = AdaLayerNormContinuous(
131
+ self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6
132
+ )
133
+ self.proj_out = nn.Linear(self.inner_dim, self.out_channels, bias=True)
134
+
135
+ self.gradient_checkpointing = False
136
+
137
+ def _set_time_proj(
138
+ self,
139
+ time_embedding_type: str,
140
+ inner_dim: int,
141
+ flip_sin_to_cos: bool,
142
+ freq_shift: float,
143
+ time_embedding_dim: int,
144
+ ) -> Tuple[int, int]:
145
+ if time_embedding_type == "fourier":
146
+ time_embed_dim = time_embedding_dim or inner_dim * 2
147
+ if time_embed_dim % 2 != 0:
148
+ raise ValueError(
149
+ f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}."
150
+ )
151
+ self.time_embed = GaussianFourierProjection(
152
+ time_embed_dim // 2,
153
+ set_W_to_weight=False,
154
+ log=False,
155
+ flip_sin_to_cos=flip_sin_to_cos,
156
+ )
157
+ timestep_input_dim = time_embed_dim
158
+ elif time_embedding_type == "positional":
159
+ time_embed_dim = time_embedding_dim or inner_dim * 4
160
+
161
+ self.time_embed = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
162
+ timestep_input_dim = inner_dim
163
+ else:
164
+ raise ValueError(
165
+ f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
166
+ )
167
+
168
+ return time_embed_dim, timestep_input_dim
169
+
170
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
171
+ def fuse_qkv_projections(self):
172
+ """
173
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
174
+ are fused. For cross-attention modules, key and value projection matrices are fused.
175
+
176
+ <Tip warning={true}>
177
+
178
+ This API is 🧪 experimental.
179
+
180
+ </Tip>
181
+ """
182
+ self.original_attn_processors = None
183
+
184
+ for _, attn_processor in self.attn_processors.items():
185
+ if "Added" in str(attn_processor.__class__.__name__):
186
+ raise ValueError(
187
+ "`fuse_qkv_projections()` is not supported for models having added KV projections."
188
+ )
189
+
190
+ self.original_attn_processors = self.attn_processors
191
+
192
+ for module in self.modules():
193
+ if isinstance(module, Attention):
194
+ module.fuse_projections(fuse=True)
195
+
196
+ self.set_attn_processor(FusedFluxAttnProcessor2_0())
197
+
198
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
199
+ def unfuse_qkv_projections(self):
200
+ """Disables the fused QKV projection if enabled.
201
+
202
+ <Tip warning={true}>
203
+
204
+ This API is 🧪 experimental.
205
+
206
+ </Tip>
207
+
208
+ """
209
+ if self.original_attn_processors is not None:
210
+ self.set_attn_processor(self.original_attn_processors)
211
+
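+ # Hypothetical usage sketch for the pair of methods above (assumes `model` is an
+ # instantiated `FluxTransformer1DModel`):
+ #   model.fuse_qkv_projections()     # fuse Q/K/V weights before inference
+ #   ...run the sampling loop...
+ #   model.unfuse_qkv_projections()   # restore the original attention processors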
212
+ @property
213
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
214
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
215
+ r"""
216
+ Returns:
217
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
218
+ indexed by their weight names.
219
+ """
220
+ # set recursively
221
+ processors = {}
222
+
223
+ def fn_recursive_add_processors(
224
+ name: str,
225
+ module: torch.nn.Module,
226
+ processors: Dict[str, AttentionProcessor],
227
+ ):
228
+ if hasattr(module, "get_processor"):
229
+ processors[f"{name}.processor"] = module.get_processor()
230
+
231
+ for sub_name, child in module.named_children():
232
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
233
+
234
+ return processors
235
+
236
+ for name, module in self.named_children():
237
+ fn_recursive_add_processors(name, module, processors)
238
+
239
+ return processors
240
+
241
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
242
+ def set_attn_processor(
243
+ self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]
244
+ ):
245
+ r"""
246
+ Sets the attention processor to use to compute attention.
247
+
248
+ Parameters:
249
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
250
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
251
+ for **all** `Attention` layers.
252
+
253
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
254
+ processor. This is strongly recommended when setting trainable attention processors.
255
+
256
+ """
257
+ count = len(self.attn_processors.keys())
258
+
259
+ if isinstance(processor, dict) and len(processor) != count:
260
+ raise ValueError(
261
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
262
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
263
+ )
264
+
265
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
266
+ if hasattr(module, "set_processor"):
267
+ if not isinstance(processor, dict):
268
+ module.set_processor(processor)
269
+ else:
270
+ module.set_processor(processor.pop(f"{name}.processor"))
271
+
272
+ for sub_name, child in module.named_children():
273
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
274
+
275
+ for name, module in self.named_children():
276
+ fn_recursive_attn_processor(name, module, processor)
277
+
278
+ def set_default_attn_processor(self):
279
+ """
280
+ Disables custom attention processors and sets the default attention implementation.
281
+ """
282
+ self.set_attn_processor(FluxAttnProcessor2_0())
283
+
284
+ # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
285
+ def enable_forward_chunking(
286
+ self, chunk_size: Optional[int] = None, dim: int = 0
287
+ ) -> None:
288
+ """
289
+ Sets the attention processor to use [feed forward
290
+ chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
291
+
292
+ Parameters:
293
+ chunk_size (`int`, *optional*):
294
+ The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
295
+ over each tensor of dim=`dim`.
296
+ dim (`int`, *optional*, defaults to `0`):
297
+ The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
298
+ or dim=1 (sequence length).
299
+ """
300
+ if dim not in [0, 1]:
301
+ raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
302
+
303
+ # By default chunk size is 1
304
+ chunk_size = chunk_size or 1
305
+
306
+ def fn_recursive_feed_forward(
307
+ module: torch.nn.Module, chunk_size: int, dim: int
308
+ ):
309
+ if hasattr(module, "set_chunk_feed_forward"):
310
+ module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
311
+
312
+ for child in module.children():
313
+ fn_recursive_feed_forward(child, chunk_size, dim)
314
+
315
+ for module in self.children():
316
+ fn_recursive_feed_forward(module, chunk_size, dim)
317
+
318
+ # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.disable_forward_chunking
319
+ def disable_forward_chunking(self):
320
+ def fn_recursive_feed_forward(
321
+ module: torch.nn.Module, chunk_size: int, dim: int
322
+ ):
323
+ if hasattr(module, "set_chunk_feed_forward"):
324
+ module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
325
+
326
+ for child in module.children():
327
+ fn_recursive_feed_forward(child, chunk_size, dim)
328
+
329
+ for module in self.children():
330
+ fn_recursive_feed_forward(module, None, 0)
331
+
332
+ def forward(
333
+ self,
334
+ hidden_states: Optional[torch.Tensor],
335
+ timestep: Union[int, float, torch.LongTensor],
336
+ encoder_hidden_states: Optional[torch.Tensor] = None,
337
+ attention_kwargs: Optional[Dict[str, Any]] = None,
338
+ return_dict: bool = True,
339
+ ):
340
+ """
341
+ The [`FluxTransformer1DModel`] forward method.
342
+
343
+ Args:
344
+ hidden_states (`torch.Tensor` of shape `(batch size, num_latents, channels)`):
345
+ The input latent tensor.
346
+ timestep (`torch.LongTensor`, *optional*):
347
+ Used to indicate the denoising step.
348
+ encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
349
+ Conditional embeddings for the cross-attention layers.
350
+ attention_kwargs (`Dict[str, Any]`, *optional*):
351
+ Additional kwargs forwarded to the attention processors (e.g. a LoRA `scale`).
352
+ return_dict (`bool`, *optional*, defaults to `True`):
353
+ Whether to return a [`Transformer1DModelOutput`] instead of a plain tuple.
354
+ """
355
+
356
+ if attention_kwargs is not None:
357
+ attention_kwargs = attention_kwargs.copy()
358
+ lora_scale = attention_kwargs.pop("scale", 1.0)
359
+ else:
360
+ lora_scale = 1.0
361
+
362
+ if USE_PEFT_BACKEND:
363
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
364
+ scale_lora_layers(self, lora_scale)
365
+ else:
366
+ if (
367
+ attention_kwargs is not None
368
+ and attention_kwargs.get("scale", None) is not None
369
+ ):
370
+ logger.warning(
371
+ "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
372
+ )
373
+
374
+ _, N, _ = hidden_states.shape
375
+
379
+ temb = self.time_embed(timestep).to(hidden_states.dtype)  # sinusoidal embedding, (N, inner_dim)
380
+ temb = self.time_proj(temb)  # timestep MLP, (N, inner_dim)
381
+
382
+ hidden_states = self.proj_in(hidden_states)
383
+ encoder_hidden_states = self.proj_cross_attention(encoder_hidden_states)
384
+
385
+ for layer, block in enumerate(self.transformer_blocks):
386
+ if self.training and self.gradient_checkpointing:
387
+
388
+ def create_custom_forward(module):
389
+ def custom_forward(*inputs):
390
+ return module(*inputs)
391
+
392
+ return custom_forward
393
+
394
+ ckpt_kwargs: Dict[str, Any] = (
395
+ {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
396
+ )
397
+ encoder_hidden_states, hidden_states = (
398
+ torch.utils.checkpoint.checkpoint(
399
+ create_custom_forward(block),
400
+ hidden_states,
401
+ encoder_hidden_states,
402
+ temb,
403
+ None, # image_rotary_emb
404
+ attention_kwargs,
405
+ **ckpt_kwargs,
+ )
406
+ )
407
+ else:
408
+ encoder_hidden_states, hidden_states = block(
409
+ hidden_states,
410
+ encoder_hidden_states=encoder_hidden_states,
411
+ temb=temb,
412
+ image_rotary_emb=None,
413
+ joint_attention_kwargs=attention_kwargs,
414
+ ) # (N, L, D)
415
+
416
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
417
+
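+ # From this point the model runs single-stream: condition tokens and latent
+ # tokens are processed as one concatenated sequence, and the condition part is
+ # sliced off again right after the single-stream blocks below.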
418
+ for layer, block in enumerate(self.single_transformer_blocks):
419
+ if self.training and self.gradient_checkpointing:
420
+
421
+ def create_custom_forward(module):
422
+ def custom_forward(*inputs):
423
+ return module(*inputs)
424
+
425
+ return custom_forward
426
+
427
+ ckpt_kwargs: Dict[str, Any] = (
428
+ {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
429
+ )
430
+ hidden_states = torch.utils.checkpoint.checkpoint(
431
+ create_custom_forward(block),
432
+ hidden_states,
433
+ temb,
434
+ None, # image_rotary_emb
435
+ attention_kwargs,
436
+ **ckpt_kwargs,
+ )
437
+ else:
438
+ hidden_states = block(
439
+ hidden_states,
440
+ temb=temb,
441
+ image_rotary_emb=None,
442
+ joint_attention_kwargs=attention_kwargs,
443
+ ) # (N, L, D)
444
+
445
+ hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
446
+
447
+ # final layer
448
+ hidden_states = self.norm_out(hidden_states, temb)
449
+ hidden_states = self.proj_out(hidden_states)
450
+
451
+ if USE_PEFT_BACKEND:
452
+ # remove `lora_scale` from each PEFT layer
453
+ unscale_lora_layers(self, lora_scale)
454
+
455
+ if not return_dict:
456
+ return (hidden_states,)
457
+
458
+ return Transformer1DModelOutput(sample=hidden_states)
459
+
460
+
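+ # Minimal usage sketch for the transformer above (latent/condition shapes are
+ # illustrative assumptions, not values from a training config):
+ #   model = FluxTransformer1DModel(num_attention_heads=16, width=2048, in_channels=4,
+ #                                  num_layers=19, num_single_layers=38, cross_attention_dim=768)
+ #   x = torch.randn(2, 512, 4)        # (batch, num_latents, in_channels)
+ #   cond = torch.randn(2, 257, 768)   # (batch, cond_tokens, cross_attention_dim)
+ #   t = torch.randint(0, 1000, (2,))
+ #   out = model(x, t, encoder_hidden_states=cond).sample   # (2, 512, 4)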
461
+ @step1x3d_geometry.register("flux-denoiser")
462
+ class FluxDenoiser(BaseModule):
463
+ @dataclass
464
+ class Config(BaseModule.Config):
465
+ pretrained_model_name_or_path: Optional[str] = None
466
+ input_channels: int = 32
467
+ width: int = 768
468
+ layers: int = 12
469
+ num_single_layers: int = 12
470
+ num_heads: int = 16
471
+ condition_dim: int = 1024
472
+ multi_condition_type: str = "in_context"
473
+ use_visual_condition: bool = False
474
+ visual_condition_dim: int = 1024
475
+ n_views: int = 1
476
+ use_caption_condition: bool = False
477
+ caption_condition_dim: int = 1024
478
+ use_label_condition: bool = False
479
+ label_condition_dim: int = 1024
480
+
481
+ identity_init: bool = False
482
+
483
+ cfg: Config
484
+
485
+ def configure(self) -> None:
486
+ assert (
487
+ self.cfg.multi_condition_type == "in_context"
488
+ ), "FluxDenoiser only supports the 'in_context' combination of multiple conditions"
489
+ self.dit_model = FluxTransformer1DModel(
490
+ num_attention_heads=self.cfg.num_heads,
491
+ width=self.cfg.width,
492
+ in_channels=self.cfg.input_channels,
493
+ num_layers=self.cfg.layers,
494
+ num_single_layers=self.cfg.num_single_layers,
495
+ cross_attention_dim=self.cfg.condition_dim,
496
+ )
497
+ if (
498
+ self.cfg.use_visual_condition
499
+ and self.cfg.visual_condition_dim != self.cfg.condition_dim
500
+ ):
501
+ self.proj_visual_condtion = nn.Sequential(
502
+ nn.RMSNorm(self.cfg.visual_condition_dim),
503
+ nn.Linear(self.cfg.visual_condition_dim, self.cfg.condition_dim),
504
+ )
505
+ if (
506
+ self.cfg.use_caption_condition
507
+ and self.cfg.caption_condition_dim != self.cfg.condition_dim
508
+ ):
509
+ self.proj_caption_condtion = nn.Sequential(
510
+ nn.RMSNorm(self.cfg.caption_condition_dim),
511
+ nn.Linear(self.cfg.caption_condition_dim, self.cfg.condition_dim),
512
+ )
513
+ if (
514
+ self.cfg.use_label_condition
515
+ and self.cfg.label_condition_dim != self.cfg.condition_dim
516
+ ):
517
+ self.proj_label_condtion = nn.Sequential(
518
+ nn.RMSNorm(self.cfg.label_condition_dim),
519
+ nn.Linear(self.cfg.label_condition_dim, self.cfg.condition_dim),
520
+ )
521
+
522
+ if self.cfg.identity_init:
523
+ self.identity_initialize()
524
+
525
+ if self.cfg.pretrained_model_name_or_path:
526
+ print(
527
+ f"Loading pretrained DiT model from {self.cfg.pretrained_model_name_or_path}"
528
+ )
529
+ ckpt = torch.load(
530
+ self.cfg.pretrained_model_name_or_path,
531
+ map_location="cpu",
532
+ weights_only=True,
533
+ )
534
+ if "state_dict" in ckpt.keys():
535
+ ckpt = ckpt["state_dict"]
536
+
537
+ self.load_state_dict(ckpt, strict=True)
538
+
539
+ def identity_initialize(self):
540
+ for block in self.dit_model.blocks:
541
+ nn.init.constant_(block.attn.c_proj.weight, 0)
542
+ nn.init.constant_(block.attn.c_proj.bias, 0)
543
+ nn.init.constant_(block.cross_attn.c_proj.weight, 0)
544
+ nn.init.constant_(block.cross_attn.c_proj.bias, 0)
545
+ nn.init.constant_(block.mlp.c_proj.weight, 0)
546
+ nn.init.constant_(block.mlp.c_proj.bias, 0)
547
+
548
+ def forward(
549
+ self,
550
+ model_input: torch.FloatTensor,
551
+ timestep: torch.LongTensor,
552
+ visual_condition: Optional[torch.FloatTensor] = None,
553
+ caption_condition: Optional[torch.FloatTensor] = None,
554
+ label_condition: Optional[torch.FloatTensor] = None,
555
+ attention_kwargs: Dict[str, torch.Tensor] = None,
556
+ return_dict: bool = True,
557
+ ):
558
+ r"""
559
+ Args:
560
+ model_input (torch.FloatTensor): [bs, n_data, c]
561
+ timestep (torch.LongTensor): [bs,]
562
+ visual_condition (torch.FloatTensor): [bs, visual_context_tokens, c]
563
+ caption_condition (torch.FloatTensor): [bs, text_context_tokens, c]
564
+ label_condition (torch.FloatTensor): [bs, c]
565
+
566
+ Returns:
567
+ sample (torch.FloatTensor): [bs, n_data, c]
568
+
569
+ """
570
+
571
+ B, n_data, _ = model_input.shape
572
+
573
+ # 0. conditions projector
574
+ condition = []
575
+ if self.cfg.use_visual_condition:
576
+ assert visual_condition.shape[-1] == self.cfg.visual_condition_dim
577
+ if self.cfg.visual_condition_dim != self.cfg.condition_dim:
578
+ visual_condition = self.proj_visual_condtion(visual_condition)
579
+ condition.append(visual_condition)
580
+ if self.cfg.use_caption_condition:
581
+ assert caption_condition.shape[-1] == self.cfg.caption_condition_dim
582
+ if self.cfg.caption_condition_dim != self.cfg.condition_dim:
583
+ caption_condition = self.proj_caption_condtion(caption_condition)
584
+ condition.append(caption_condition)
585
+ if self.cfg.use_label_condition:
586
+ assert label_condition.shape[-1] == self.cfg.label_condition_dim
587
+ if self.cfg.label_condition_dim != self.cfg.condition_dim:
588
+ label_condition = self.proj_label_condtion(label_condition)
589
+ condition.append(label_condition)
590
+
591
+ # 1. denoise
592
+ output = self.dit_model(
593
+ model_input,
594
+ timestep,
595
+ torch.cat(condition, dim=1),
596
+ attention_kwargs,
597
+ return_dict=return_dict,
598
+ )
599
+
600
+ return output
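+ # Rough sketch of how this denoiser is typically driven from a diffusion system
+ # (config values and tensor shapes are assumptions):
+ #   denoiser = step1x3d_geometry.find("flux-denoiser")(cfg.denoiser_model)
+ #   pred = denoiser(noisy_latents,             # (bs, n_data, input_channels)
+ #                   timesteps,                 # (bs,)
+ #                   visual_condition=visual,   # (bs, visual_tokens, visual_condition_dim)
+ #                   ).sample                   # (bs, n_data, input_channels)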
step1x3d_geometry/models/transformers/pixart_transformer_1d.py ADDED
@@ -0,0 +1,574 @@
1
+ # Some parts of this file are adapted from Hugging Face Diffusers library.
2
+ from dataclasses import dataclass
3
+
4
+ import re
5
+ import math
6
+ import torch
7
+ from torch import nn
8
+ from typing import Callable, List, Optional, Union, Dict, Any
9
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
10
+ from diffusers.utils import logging
11
+ from diffusers.models.attention_processor import (
12
+ Attention,
13
+ AttentionProcessor,
14
+ AttnProcessor,
15
+ )
16
+ from diffusers.models.embeddings import PatchEmbed, PixArtAlphaTextProjection
17
+ from diffusers.models.modeling_utils import ModelMixin
18
+ from diffusers.models.normalization import AdaLayerNormSingle
19
+
20
+ from ..attention_processor import FusedAttnProcessor2_0, AttnProcessor2_0
21
+ from ..attention import MultiCondBasicTransformerBlock
22
+
23
+ import step1x3d_geometry
24
+ from step1x3d_geometry.utils.base import BaseModule
25
+
26
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
27
+
28
+
29
+ @dataclass
30
+ class Transformer1DModelOutput:
31
+ sample: torch.FloatTensor
32
+
33
+
34
+ class PixArtTransformer1DModel(ModelMixin, ConfigMixin):
35
+ r"""
36
+ A 1D Transformer model as introduced in PixArt family of models (https://arxiv.org/abs/2310.00426,
37
+ https://arxiv.org/abs/2403.04692).
38
+
39
+ Parameters:
40
+ num_attention_heads (`int`, *optional*, defaults to 16):
41
+ The number of heads to use for multi-head attention.
42
+ width (`int`, *optional*, defaults to 2048):
43
+ Maximum sequence length in latent space (equivalent to max_seq_length in Transformers).
44
+ Determines the first dimension size of positional embedding matrices[1](@ref).
45
+ in_channels (`int`, *optional*, defaults to 64):
46
+ The number of channels in the input and output (specify if the input is **continuous**).
47
+ num_layers (`int`, *optional*, defaults to 1):
48
+ The number of layers of Transformer blocks to use.
49
+ cross_attention_dim (`int`, *optional*):
50
+ Dimensionality of conditional embeddings for cross-attention mechanisms
51
+ use_cross_attention_2 (`bool`, *optional*):
52
+ Flag to enable secondary cross-attention mechanism. Used for multi-modal conditioning
53
+ when processing hybrid inputs (e.g., text + image prompts)[1](@ref).
54
+ cross_attention_2_dim (`int`, *optional*, defaults to 1024):
55
+ Dimensionality of secondary cross-attention embeddings. Specifies encoding dimensions
56
+ for additional conditional modalities when use_cross_attention_2 is enabled[1](@ref).
57
+ """
58
+
59
+ _supports_gradient_checkpointing = True
60
+ _no_split_modules = ["MultiCondBasicTransformerBlock", "PatchEmbed"]
61
+ _skip_layerwise_casting_patterns = ["pos_embed", "norm", "adaln_single"]
62
+
63
+ @register_to_config
64
+ def __init__(
65
+ self,
66
+ num_attention_heads: int = 16,
67
+ width: int = 2048,
68
+ in_channels: int = 4,
69
+ num_layers: int = 28,
70
+ cross_attention_dim: int = 768,
71
+ use_cross_attention_2: bool = True,
72
+ cross_attention_2_dim: int = 1024,
73
+ use_cross_attention_3: bool = True,
74
+ cross_attention_3_dim: int = 1024,
75
+ ):
76
+ super().__init__()
77
+ # Set some common variables used across the board.
78
+ self.out_channels = in_channels
79
+ self.num_heads = num_attention_heads
80
+ self.inner_dim = width
81
+
82
+ self.proj_in = nn.Linear(self.config.in_channels, self.inner_dim, bias=True)
83
+
84
+ # 2. Initialize the transformer blocks.
85
+ self.transformer_blocks = nn.ModuleList(
86
+ [
87
+ MultiCondBasicTransformerBlock(
88
+ self.inner_dim,
89
+ self.config.num_attention_heads,
90
+ use_self_attention=True,
91
+ use_cross_attention=True,
92
+ self_attention_norm_type="ada_norm_single",
93
+ cross_attention_dim=self.config.cross_attention_dim,
94
+ cross_attention_norm_type="ada_norm_single",
95
+ use_cross_attention_2=self.config.use_cross_attention_2,
96
+ cross_attention_2_dim=self.config.cross_attention_2_dim,
97
+ cross_attention_2_norm_type="ada_norm_single",
98
+ use_cross_attention_3=self.config.use_cross_attention_3,
99
+ cross_attention_3_dim=self.config.cross_attention_3_dim,
100
+ cross_attention_3_norm_type="ada_norm_single",
101
+ dropout=0.0,
102
+ attention_bias=False,
103
+ activation_fn="gelu-approximate",
104
+ num_embeds_ada_norm=1000,
105
+ norm_elementwise_affine=True,
106
+ upcast_attention=False,
107
+ norm_eps=1e-6,
108
+ attention_type="default",
109
+ )
110
+ for _ in range(self.config.num_layers)
111
+ ]
112
+ )
113
+
114
+ # 3. Output blocks.
115
+ self.norm_out = nn.RMSNorm(self.inner_dim, elementwise_affine=True, eps=1e-6)
116
+ self.scale_shift_table = nn.Parameter(
117
+ torch.randn(2, self.inner_dim) / self.inner_dim**0.5
118
+ )
119
+ self.proj_out = nn.Linear(self.inner_dim, self.out_channels)
120
+
121
+ self.adaln_single = AdaLayerNormSingle(
122
+ self.inner_dim, use_additional_conditions=None
123
+ )
124
+ self.gradient_checkpointing = False
125
+
126
+ @property
127
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
128
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
129
+ r"""
130
+ Returns:
131
+ `dict` of attention processors: A dictionary containing all attention processors used in the model with
132
+ indexed by its weight name.
133
+ """
134
+ # set recursively
135
+ processors = {}
136
+
137
+ def fn_recursive_add_processors(
138
+ name: str,
139
+ module: torch.nn.Module,
140
+ processors: Dict[str, AttentionProcessor],
141
+ ):
142
+ if hasattr(module, "get_processor"):
143
+ processors[f"{name}.processor"] = module.get_processor()
144
+
145
+ for sub_name, child in module.named_children():
146
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
147
+
148
+ return processors
149
+
150
+ for name, module in self.named_children():
151
+ fn_recursive_add_processors(name, module, processors)
152
+
153
+ return processors
154
+
155
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
156
+ def set_attn_processor(
157
+ self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]
158
+ ):
159
+ r"""
160
+ Sets the attention processor to use to compute attention.
161
+
162
+ Parameters:
163
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
164
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
165
+ for **all** `Attention` layers.
166
+
167
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
168
+ processor. This is strongly recommended when setting trainable attention processors.
169
+
170
+ """
171
+ count = len(self.attn_processors.keys())
172
+
173
+ if isinstance(processor, dict) and len(processor) != count:
174
+ raise ValueError(
175
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
176
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
177
+ )
178
+
179
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
180
+ if hasattr(module, "set_processor"):
181
+ if not isinstance(processor, dict):
182
+ module.set_processor(processor)
183
+ else:
184
+ module.set_processor(processor.pop(f"{name}.processor"))
185
+
186
+ for sub_name, child in module.named_children():
187
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
188
+
189
+ for name, module in self.named_children():
190
+ fn_recursive_attn_processor(name, module, processor)
191
+
192
+ def set_default_attn_processor(self):
193
+ """
194
+ Disables custom attention processors and sets the default attention implementation.
195
+ """
196
+ self.set_attn_processor(AttnProcessor2_0())
197
+
198
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
199
+ def fuse_qkv_projections(self):
200
+ """
201
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
202
+ are fused. For cross-attention modules, key and value projection matrices are fused.
203
+
204
+ <Tip warning={true}>
205
+
206
+ This API is 🧪 experimental.
207
+
208
+ </Tip>
209
+ """
210
+ self.original_attn_processors = None
211
+
212
+ for _, attn_processor in self.attn_processors.items():
213
+ if "Added" in str(attn_processor.__class__.__name__):
214
+ raise ValueError(
215
+ "`fuse_qkv_projections()` is not supported for models having added KV projections."
216
+ )
217
+
218
+ self.original_attn_processors = self.attn_processors
219
+
220
+ for module in self.modules():
221
+ if isinstance(module, Attention):
222
+ module.fuse_projections(fuse=True)
223
+
224
+ self.set_attn_processor(FusedAttnProcessor2_0())
225
+
226
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
227
+ def unfuse_qkv_projections(self):
228
+ """Disables the fused QKV projection if enabled.
229
+
230
+ <Tip warning={true}>
231
+
232
+ This API is 🧪 experimental.
233
+
234
+ </Tip>
235
+
236
+ """
237
+ if self.original_attn_processors is not None:
238
+ self.set_attn_processor(self.original_attn_processors)
239
+
240
+ def forward(
241
+ self,
242
+ hidden_states: torch.Tensor,
243
+ timestep: Optional[torch.LongTensor] = None,
244
+ encoder_hidden_states: Optional[torch.Tensor] = None,
245
+ encoder_hidden_states_2: Optional[torch.Tensor] = None,
246
+ encoder_hidden_states_3: Optional[torch.Tensor] = None,
247
+ cross_attention_kwargs: Dict[str, Any] = None,
248
+ attention_mask: Optional[torch.Tensor] = None,
249
+ encoder_attention_mask: Optional[torch.Tensor] = None,
250
+ encoder_attention_mask_2: Optional[torch.Tensor] = None,
251
+ encoder_attention_mask_3: Optional[torch.Tensor] = None,
252
+ return_dict: bool = True,
253
+ ):
254
+ """
255
+ The [`PixArtTransformer2DModel`] forward method.
256
+
257
+ Args:
258
+ hidden_states (`torch.FloatTensor` of shape `(batch size, channel, n_tokens)`):
259
+ Input `hidden_states`.
260
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
261
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
262
+ self-attention.
263
+ encoder_hidden_states_2 (`torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
264
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
265
+ self-attention.
266
+ encoder_hidden_states_3 (`torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
267
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
268
+ self-attention.
269
+ timestep (`torch.LongTensor`, *optional*):
270
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
271
+ cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
272
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
273
+ `self.processor` in
274
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
275
+ attention_mask ( `torch.Tensor`, *optional*):
276
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
277
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
278
+ negative values to the attention scores corresponding to "discard" tokens.
279
+ encoder_attention_mask ( `torch.Tensor`, *optional*):
280
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
281
+
282
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
283
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
284
+
285
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
286
+ above. This bias will be added to the cross-attention scores.
287
+ encoder_attention_mask_2 ( `torch.Tensor`, *optional*):
288
+ Cross-attention mask applied to `encoder_hidden_states_2`. Two formats supported:
289
+
290
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
291
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
292
+
293
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
294
+ above. This bias will be added to the cross-attention scores.
295
+ encoder_attention_mask_3 ( `torch.Tensor`, *optional*):
296
+ Cross-attention mask applied to `encoder_hidden_states_3`. Two formats supported:
297
+
298
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
299
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
300
+
301
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
302
+ above. This bias will be added to the cross-attention scores.
303
+ return_dict (`bool`, *optional*, defaults to `True`):
304
+ Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
305
+ tuple.
306
+
307
+ Returns:
308
+ If `return_dict` is True, an [`~Transformer1DModelOutput`] is returned, otherwise a
309
+ `tuple` where the first element is the sample tensor.
310
+ """
311
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
312
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
313
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
314
+ # expects mask of shape:
315
+ # [batch, key_tokens]
316
+ # adds singleton query_tokens dimension:
317
+ # [batch, 1, key_tokens]
318
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
319
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
320
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
321
+ if attention_mask is not None and attention_mask.ndim == 2:
322
+ # assume that mask is expressed as:
323
+ # (1 = keep, 0 = discard)
324
+ # convert mask into a bias that can be added to attention scores:
325
+ # (keep = +0, discard = -10000.0)
326
+ attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
327
+ attention_mask = attention_mask.unsqueeze(1)
328
+
329
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
330
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
331
+ encoder_attention_mask = (
332
+ 1 - encoder_attention_mask.to(hidden_states.dtype)
333
+ ) * -10000.0
334
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
335
+
336
+ # convert encoder_attention_mask_2 to a bias the same way we do for attention_mask
337
+ if encoder_attention_mask_2 is not None and encoder_attention_mask_2.ndim == 2:
338
+ encoder_attention_mask_2 = (
339
+ 1 - encoder_attention_mask_2.to(hidden_states.dtype)
340
+ ) * -10000.0
341
+ encoder_attention_mask_2 = encoder_attention_mask_2.unsqueeze(1)
342
+
343
+ # convert encoder_attention_mask_2 to a bias the same way we do for attention_mask
344
+ if encoder_attention_mask_3 is not None and encoder_attention_mask_3.ndim == 2:
345
+ encoder_attention_mask_3 = (
346
+ 1 - encoder_attention_mask_3.to(hidden_states.dtype)
347
+ ) * -10000.0
348
+ encoder_attention_mask_3 = encoder_attention_mask_3.unsqueeze(1)
349
+
350
+ # 1. Input
351
+ batch_size = hidden_states.shape[0]
352
+ timestep, embedded_timestep = self.adaln_single(
353
+ timestep, batch_size=batch_size, hidden_dtype=hidden_states.dtype
354
+ )
355
+
356
+ hidden_states = self.proj_in(hidden_states)
357
+
358
+ # 2. Blocks
359
+ for block in self.transformer_blocks:
360
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
361
+ hidden_states = self._gradient_checkpointing_func(
362
+ block,
363
+ hidden_states,
364
+ attention_mask,
365
+ encoder_hidden_states,
366
+ encoder_hidden_states_2,
367
+ encoder_hidden_states_3,
368
+ encoder_attention_mask,
369
+ encoder_attention_mask_2,
370
+ encoder_attention_mask_3,
371
+ timestep,
372
+ cross_attention_kwargs,
373
+ None,
374
+ )
375
+ else:
376
+ hidden_states = block(
377
+ hidden_states,
378
+ attention_mask=attention_mask,
379
+ encoder_hidden_states=encoder_hidden_states,
380
+ encoder_hidden_states_2=encoder_hidden_states_2,
381
+ encoder_hidden_states_3=encoder_hidden_states_3,
382
+ encoder_attention_mask=encoder_attention_mask,
383
+ encoder_attention_mask_2=encoder_attention_mask_2,
384
+ encoder_attention_mask_3=encoder_attention_mask_3,
385
+ timestep=timestep,
386
+ cross_attention_kwargs=cross_attention_kwargs,
387
+ class_labels=None,
388
+ )
389
+
390
+ # 3. Output
391
+ shift, scale = (
392
+ self.scale_shift_table[None]
393
+ + embedded_timestep[:, None].to(self.scale_shift_table.device)
394
+ ).chunk(2, dim=1)
395
+ hidden_states = self.norm_out(hidden_states)
396
+ # Modulation
397
+ hidden_states = hidden_states * (1 + scale.to(hidden_states.device)) + shift.to(
398
+ hidden_states.device
399
+ )
400
+ hidden_states = self.proj_out(hidden_states)
401
+ hidden_states = hidden_states.squeeze(1)
402
+
403
+ if not return_dict:
404
+ return (hidden_states,)
405
+
406
+ return Transformer1DModelOutput(sample=hidden_states)
407
+
408
+
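+ # Minimal usage sketch for the transformer above (all shapes and sizes are
+ # illustrative assumptions): each condition stream feeds its own cross-attention.
+ #   model = PixArtTransformer1DModel(width=768, in_channels=32, cross_attention_dim=1024,
+ #                                    cross_attention_2_dim=1024, cross_attention_3_dim=1024)
+ #   x = torch.randn(2, 2048, 32); t = torch.randint(0, 1000, (2,))
+ #   img = torch.randn(2, 257, 1024)   # e.g. visual tokens
+ #   cap = torch.randn(2, 77, 1024)    # e.g. caption tokens
+ #   lab = torch.randn(2, 1, 1024)     # e.g. label embedding
+ #   out = model(x, t, img, cap, lab).sample   # (2, 2048, 32)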
409
+ @step1x3d_geometry.register("pixart-denoiser")
410
+ class PixArtDenoiser(BaseModule):
411
+ @dataclass
412
+ class Config(BaseModule.Config):
413
+ pretrained_model_name_or_path: Optional[str] = None
414
+ input_channels: int = 32
415
+ width: int = 768
416
+ layers: int = 28
417
+ num_heads: int = 16
418
+ condition_dim: int = 1024
419
+ multi_condition_type: str = "cross_attention"
420
+ use_visual_condition: bool = False
421
+ visual_condition_dim: int = 1024
422
+ n_views: int = 1 # for multi-view condition
423
+ use_caption_condition: bool = False
424
+ caption_condition_dim: int = 1024
425
+ use_label_condition: bool = False
426
+ label_condition_dim: int = 1024
427
+
428
+ identity_init: bool = False
429
+
430
+ cfg: Config
431
+
432
+ def configure(self) -> None:
433
+ self.dit_model = PixArtTransformer1DModel(
434
+ num_attention_heads=self.cfg.num_heads,
435
+ width=self.cfg.width,
436
+ in_channels=self.cfg.input_channels,
437
+ num_layers=self.cfg.layers,
438
+ cross_attention_dim=self.cfg.condition_dim,
439
+ use_cross_attention_2=self.cfg.use_caption_condition
440
+ and self.cfg.multi_condition_type == "cross_attention",
441
+ cross_attention_2_dim=self.cfg.condition_dim,
442
+ use_cross_attention_3=self.cfg.use_label_condition
443
+ and self.cfg.multi_condition_type == "cross_attention",
444
+ cross_attention_3_dim=self.cfg.condition_dim,
445
+ )
446
+ if (
447
+ self.cfg.use_visual_condition
448
+ and self.cfg.visual_condition_dim != self.cfg.condition_dim
449
+ ):
450
+ self.proj_visual_condtion = nn.Sequential(
451
+ nn.RMSNorm(self.cfg.visual_condition_dim),
452
+ nn.Linear(self.cfg.visual_condition_dim, self.cfg.condition_dim),
453
+ )
454
+ if (
455
+ self.cfg.use_caption_condition
456
+ and self.cfg.caption_condition_dim != self.cfg.condition_dim
457
+ ):
458
+ self.proj_caption_condtion = nn.Sequential(
459
+ nn.RMSNorm(self.cfg.caption_condition_dim),
460
+ nn.Linear(self.cfg.caption_condition_dim, self.cfg.condition_dim),
461
+ )
462
+ if (
463
+ self.cfg.use_label_condition
464
+ and self.cfg.label_condition_dim != self.cfg.condition_dim
465
+ ):
466
+ self.proj_label_condtion = nn.Sequential(
467
+ nn.RMSNorm(self.cfg.label_condition_dim),
468
+ nn.Linear(self.cfg.label_condition_dim, self.cfg.condition_dim),
469
+ )
470
+
471
+ if self.cfg.identity_init:
472
+ self.identity_initialize()
473
+
474
+ if self.cfg.pretrained_model_name_or_path:
475
+ print(
476
+ f"Loading pretrained DiT model from {self.cfg.pretrained_model_name_or_path}"
477
+ )
478
+ ckpt = torch.load(
479
+ self.cfg.pretrained_model_name_or_path,
480
+ map_location="cpu",
481
+ weights_only=False,
482
+ )
483
+ if "state_dict" in ckpt.keys():
484
+ ckpt = ckpt["state_dict"]
485
+ self.load_state_dict(ckpt, strict=True)
486
+
487
+ def identity_initialize(self):
488
+ for block in self.dit_model.blocks:
489
+ nn.init.constant_(block.attn.c_proj.weight, 0)
490
+ nn.init.constant_(block.attn.c_proj.bias, 0)
491
+ nn.init.constant_(block.cross_attn.c_proj.weight, 0)
492
+ nn.init.constant_(block.cross_attn.c_proj.bias, 0)
493
+ nn.init.constant_(block.mlp.c_proj.weight, 0)
494
+ nn.init.constant_(block.mlp.c_proj.bias, 0)
495
+
496
+ def forward(
497
+ self,
498
+ model_input: torch.FloatTensor,
499
+ timestep: torch.LongTensor,
500
+ visual_condition: Optional[torch.FloatTensor] = None,
501
+ caption_condition: Optional[torch.FloatTensor] = None,
502
+ label_condition: Optional[torch.FloatTensor] = None,
503
+ attention_kwargs: Dict[str, torch.Tensor] = None,
504
+ cross_attention_kwargs: Dict[str, Any] = None,
505
+ return_dict: bool = True,
506
+ ):
507
+ r"""
508
+ Args:
509
+ model_input (torch.FloatTensor): [bs, n_data, c]
510
+ timestep (torch.LongTensor): [bs,]
511
+ visual_condition (torch.FloatTensor): [bs, visual_context_tokens, c]
512
+ text_condition (torch.FloatTensor): [bs, text_context_tokens, c]
513
+
514
+ Returns:
515
+ sample (torch.FloatTensor): [bs, n_data, c]
516
+
517
+ """
518
+
519
+ B, n_data, _ = model_input.shape
520
+
521
+ # 0. conditions projector
522
+ condition = []
523
+ if self.cfg.use_visual_condition:
524
+ assert visual_condition.shape[-1] == self.cfg.visual_condition_dim
525
+ if self.cfg.visual_condition_dim != self.cfg.condition_dim:
526
+ visual_condition = self.proj_visual_condtion(visual_condition)
527
+ condition.append(visual_condition)
528
+ else:
529
+ visual_condition = None
530
+ if self.cfg.use_caption_condition:
531
+ assert caption_condition.shape[-1] == self.cfg.caption_condition_dim
532
+ if self.cfg.caption_condition_dim != self.cfg.condition_dim:
533
+ caption_condition = self.proj_caption_condtion(caption_condition)
534
+ condition.append(caption_condition)
535
+ else:
536
+ caption_condition = None
537
+ if self.cfg.use_label_condition:
538
+ assert label_condition.shape[-1] == self.cfg.label_condition_dim
539
+ if self.cfg.label_condition_dim != self.cfg.condition_dim:
540
+ label_condition = self.proj_label_condtion(label_condition)
541
+ condition.append(label_condition)
542
+ else:
543
+ label_condition = None
544
+ assert not (
545
+ visual_condition is None
546
+ and caption_condition is None
547
+ and label_condition is None
548
+ )
549
+
550
+ # 1. denoise
551
+ if self.cfg.multi_condition_type == "cross_attention":
552
+ output = self.dit_model(
553
+ model_input,
554
+ timestep,
555
+ visual_condition,
556
+ caption_condition,
557
+ label_condition,
558
+ cross_attention_kwargs,
559
+ return_dict=return_dict,
560
+ )
561
+ elif self.cfg.multi_condition_type == "in_context":
562
+ output = self.dit_model(
563
+ model_input,
564
+ timestep,
565
+ torch.cat(condition, dim=1),
566
+ None,
567
+ None,
568
+ cross_attention_kwargs,
569
+ return_dict=return_dict,
570
+ )
571
+ else:
572
+ raise ValueError
573
+
574
+ return output
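+ # The two `multi_condition_type` modes above differ only in routing: with
+ # "cross_attention" each condition keeps its own cross-attention stream
+ # (encoder_hidden_states, _2, _3), while "in_context" concatenates all projected
+ # conditions along the token axis and feeds them through the first stream only.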
step1x3d_geometry/systems/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from . import shape_autoencoder, shape_diffusion, shape_rectified_flow
step1x3d_geometry/systems/base.py ADDED
@@ -0,0 +1,210 @@
1
+ import os
2
+ from dataclasses import dataclass, field
3
+
4
+ import pytorch_lightning as pl
5
+ import torch.nn.functional as F
6
+
7
+ import step1x3d_geometry
8
+ from step1x3d_geometry.utils.base import (
9
+ Updateable,
10
+ update_end_if_possible,
11
+ update_if_possible,
12
+ )
13
+ from step1x3d_geometry.utils.scheduler import parse_optimizer, parse_scheduler
14
+ from step1x3d_geometry.utils.config import parse_structured
15
+ from step1x3d_geometry.utils.misc import C, cleanup, get_device, load_module_weights
16
+ from step1x3d_geometry.utils.saving import SaverMixin
17
+ from step1x3d_geometry.utils.typing import *
18
+
19
+
20
+ class BaseSystem(pl.LightningModule, Updateable, SaverMixin):
21
+ @dataclass
22
+ class Config:
23
+ loggers: dict = field(default_factory=dict)
24
+ loss: dict = field(default_factory=dict)
25
+ optimizer: dict = field(default_factory=dict)
26
+ scheduler: Optional[dict] = None
27
+ weights: Optional[str] = None
28
+ weights_ignore_modules: Optional[List[str]] = None
29
+ cleanup_after_validation_step: bool = False
30
+ cleanup_after_test_step: bool = False
31
+
32
+ pretrained_model_path: Optional[str] = None
33
+ strict_load: bool = True
34
+
35
+ cfg: Config
36
+
37
+ def __init__(self, cfg, resumed=False) -> None:
38
+ super().__init__()
39
+ self.cfg = parse_structured(self.Config, cfg)
40
+ self._save_dir: Optional[str] = None
41
+ self._resumed: bool = resumed
42
+ self._resumed_eval: bool = False
43
+ self._resumed_eval_status: dict = {"global_step": 0, "current_epoch": 0}
44
+ if "loggers" in cfg:
45
+ self.create_loggers(cfg.loggers)
46
+
47
+ self.configure()
48
+ if self.cfg.weights is not None:
49
+ self.load_weights(self.cfg.weights, self.cfg.weights_ignore_modules)
50
+ self.post_configure()
51
+
52
+ def load_weights(self, weights: str, ignore_modules: Optional[List[str]] = None):
53
+ state_dict, epoch, global_step = load_module_weights(
54
+ weights, ignore_modules=ignore_modules, map_location="cpu"
55
+ )
56
+ self.load_state_dict(state_dict, strict=False)
57
+ # restore step-dependent states
58
+ self.do_update_step(epoch, global_step, on_load_weights=True)
59
+
60
+ def set_resume_status(self, current_epoch: int, global_step: int):
61
+ # restore correct epoch and global step in eval
62
+ self._resumed_eval = True
63
+ self._resumed_eval_status["current_epoch"] = current_epoch
64
+ self._resumed_eval_status["global_step"] = global_step
65
+
66
+ @property
67
+ def resumed(self):
68
+ # whether from resumed checkpoint
69
+ return self._resumed
70
+
71
+ @property
72
+ def true_global_step(self):
73
+ if self._resumed_eval:
74
+ return self._resumed_eval_status["global_step"]
75
+ else:
76
+ return self.global_step
77
+
78
+ @property
79
+ def true_current_epoch(self):
80
+ if self._resumed_eval:
81
+ return self._resumed_eval_status["current_epoch"]
82
+ else:
83
+ return self.current_epoch
84
+
85
+ def configure(self) -> None:
86
+ pass
87
+
88
+ def post_configure(self) -> None:
89
+ """
90
+ executed after weights are loaded
91
+ """
92
+ pass
93
+
94
+ def C(self, value: Any) -> float:
95
+ return C(value, self.true_current_epoch, self.true_global_step)
96
+
97
+ def configure_optimizers(self):
98
+ optim = parse_optimizer(self.cfg.optimizer, self)
99
+ ret = {
100
+ "optimizer": optim,
101
+ }
102
+ if self.cfg.scheduler is not None:
103
+ ret.update(
104
+ {
105
+ "lr_scheduler": parse_scheduler(self.cfg.scheduler, optim),
106
+ }
107
+ )
108
+ return ret
109
+
110
+ def training_step(self, batch, batch_idx):
111
+ raise NotImplementedError
112
+
113
+ def validation_step(self, batch, batch_idx):
114
+ raise NotImplementedError
115
+
116
+ def on_train_batch_end(self, outputs, batch, batch_idx):
117
+ self.dataset = self.trainer.train_dataloader.dataset
118
+ update_end_if_possible(
119
+ self.dataset, self.true_current_epoch, self.true_global_step
120
+ )
121
+ self.do_update_step_end(self.true_current_epoch, self.true_global_step)
122
+
123
+ def on_validation_batch_end(self, outputs, batch, batch_idx):
124
+ self.dataset = self.trainer.val_dataloaders.dataset
125
+ update_end_if_possible(
126
+ self.dataset, self.true_current_epoch, self.true_global_step
127
+ )
128
+ self.do_update_step_end(self.true_current_epoch, self.true_global_step)
129
+ if self.cfg.cleanup_after_validation_step:
130
+ # cleanup to save vram
131
+ cleanup()
132
+
133
+ def on_validation_epoch_end(self):
134
+ raise NotImplementedError
135
+
136
+ def test_step(self, batch, batch_idx):
137
+ raise NotImplementedError
138
+
139
+ def on_test_batch_end(self, outputs, batch, batch_idx):
140
+ self.dataset = self.trainer.test_dataloaders.dataset
141
+ update_end_if_possible(
142
+ self.dataset, self.true_current_epoch, self.true_global_step
143
+ )
144
+ self.do_update_step_end(self.true_current_epoch, self.true_global_step)
145
+ if self.cfg.cleanup_after_test_step:
146
+ # cleanup to save vram
147
+ cleanup()
148
+
149
+ def on_test_epoch_end(self):
150
+ pass
151
+
152
+ def predict_step(self, batch, batch_idx):
153
+ raise NotImplementedError
154
+
155
+ def on_predict_batch_end(self, outputs, batch, batch_idx):
156
+ self.dataset = self.trainer.predict_dataloaders.dataset
157
+ update_end_if_possible(
158
+ self.dataset, self.true_current_epoch, self.true_global_step
159
+ )
160
+ self.do_update_step_end(self.true_current_epoch, self.true_global_step)
161
+ if self.cfg.cleanup_after_test_step:
162
+ # cleanup to save vram
163
+ cleanup()
164
+
165
+ def on_predict_epoch_end(self):
166
+ pass
167
+
168
+ def preprocess_data(self, batch, stage):
169
+ pass
170
+
171
+ """
172
+ Implementing on_after_batch_transfer of DataModule does the same.
173
+ But on_after_batch_transfer does not support DP.
174
+ """
175
+
176
+ def on_train_batch_start(self, batch, batch_idx, unused=0):
177
+ self.preprocess_data(batch, "train")
178
+ self.dataset = self.trainer.train_dataloader.dataset
179
+ update_if_possible(self.dataset, self.true_current_epoch, self.true_global_step)
180
+ self.do_update_step(self.true_current_epoch, self.true_global_step)
181
+
182
+ def on_validation_batch_start(self, batch, batch_idx, dataloader_idx=0):
183
+ self.preprocess_data(batch, "validation")
184
+ self.dataset = self.trainer.val_dataloaders.dataset
185
+ update_if_possible(self.dataset, self.true_current_epoch, self.true_global_step)
186
+ self.do_update_step(self.true_current_epoch, self.true_global_step)
187
+
188
+ def on_test_batch_start(self, batch, batch_idx, dataloader_idx=0):
189
+ self.preprocess_data(batch, "test")
190
+ self.dataset = self.trainer.test_dataloaders.dataset
191
+ update_if_possible(self.dataset, self.true_current_epoch, self.true_global_step)
192
+ self.do_update_step(self.true_current_epoch, self.true_global_step)
193
+
194
+ def on_predict_batch_start(self, batch, batch_idx, dataloader_idx=0):
195
+ self.preprocess_data(batch, "predict")
196
+ self.dataset = self.trainer.predict_dataloaders.dataset
197
+ update_if_possible(self.dataset, self.true_current_epoch, self.true_global_step)
198
+ self.do_update_step(self.true_current_epoch, self.true_global_step)
199
+
200
+ def update_step(self, epoch: int, global_step: int, on_load_weights: bool = False):
201
+ pass
202
+
203
+ def on_before_optimizer_step(self, optimizer):
204
+ """
205
+ # some gradient-related debugging goes here, example:
206
+ from lightning.pytorch.utilities import grad_norm
207
+ norms = grad_norm(self.geometry, norm_type=2)
208
+ print(norms)
209
+ """
210
+ pass
step1x3d_geometry/systems/shape_autoencoder.py ADDED
@@ -0,0 +1,151 @@
1
+ from dataclasses import dataclass, field
2
+ import numpy as np
3
+ import torch
4
+ from skimage import measure
5
+ from einops import repeat, rearrange
6
+
7
+ import step1x3d_geometry
8
+ from step1x3d_geometry.systems.base import BaseSystem
9
+ from step1x3d_geometry.utils.ops import generate_dense_grid_points
10
+ from step1x3d_geometry.utils.typing import *
11
+ from step1x3d_geometry.utils.misc import get_rank
12
+
13
+
14
+ @step1x3d_geometry.register("shape-autoencoder-system")
15
+ class ShapeAutoEncoderSystem(BaseSystem):
16
+ @dataclass
17
+ class Config(BaseSystem.Config):
18
+ shape_model_type: str = None
19
+ shape_model: dict = field(default_factory=dict)
20
+
21
+ sample_posterior: bool = True
22
+
23
+ # for mesh extraction
24
+ bounds: float = 1.05
25
+ mc_level: float = 0.0
26
+ octree_resolution: int = 256
27
+
28
+ cfg: Config
29
+
30
+ def configure(self):
31
+ super().configure()
32
+
33
+ self.shape_model = step1x3d_geometry.find(self.cfg.shape_model_type)(
34
+ self.cfg.shape_model
35
+ )
36
+
37
+ def forward(self, batch: Dict[str, Any]) -> Dict[str, Any]:
38
+ rand_points = batch["rand_points"]
39
+ if "sdf" in batch:
40
+ target = batch["sdf"]
41
+ criteria = torch.nn.MSELoss()
42
+ elif "occupancies" in batch:
43
+ target = batch["occupancies"]
44
+ criteria = torch.nn.BCEWithLogitsLoss()
45
+ else:
46
+ raise NotImplementedError
47
+
48
+ # forward pass
49
+ num_point_feats = 3 + self.cfg.shape_model.point_feats
50
+ shape_latents, kl_embed, posterior = self.shape_model.encode(
51
+ batch["surface"][..., :num_point_feats],
52
+ sharp_surface=(
53
+ batch["sharp_surface"][..., :num_point_feats]
54
+ if "sharp_surface" in batch
55
+ else None
56
+ ),
57
+ sample_posterior=self.cfg.sample_posterior,
58
+ )
59
+ latents = self.shape_model.decode(kl_embed) # [B, num_latents, width]
60
+ logits = self.shape_model.query(rand_points, latents).squeeze(
61
+ -1
62
+ ) # [B, num_rand_points]
63
+
64
+ if self.cfg.sample_posterior:
65
+ loss_kl = posterior.kl()
66
+ loss_kl = torch.sum(loss_kl) / loss_kl.shape[0]
67
+
68
+ return {
69
+ "loss_logits": criteria(logits, target).mean(),
70
+ "loss_kl": loss_kl,
71
+ "logits": logits,
72
+ "target": target,
73
+ "latents": latents,
74
+ }
75
+ else:
76
+ return {
77
+ "loss_logits": criteria(logits, target).mean(),
78
+ "latents": latents,
79
+ "logits": logits,
80
+ }
81
+
82
+ def training_step(self, batch, batch_idx):
83
+ """
84
+ Run a single training step of the shape autoencoder.
85
+
86
+ Args:
87
+ batch: dict with surface samples, query points and occupancy/SDF targets.
88
+ batch_idx: index of the current batch.
89
+ Returns:
90
+ dict holding the weighted total loss under the key "loss".
91
+ """
92
+ out = self(batch)
93
+
94
+ loss = 0.0
95
+ for name, value in out.items():
96
+ if name.startswith("loss_"):
97
+ self.log(f"train/{name}", value)
98
+ loss += value * self.C(self.cfg.loss[name.replace("loss_", "lambda_")])
99
+
100
+ for name, value in self.cfg.loss.items():
101
+ self.log(f"train_params/{name}", self.C(value))
102
+
103
+ return {"loss": loss}
104
+
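+ # The weighting above assumes one schedule entry per loss term in `cfg.loss`,
+ # keyed with a `lambda_` prefix, e.g. `lambda_logits` and `lambda_kl` matching
+ # the `loss_logits` / `loss_kl` keys returned by `forward`.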
105
+ @torch.no_grad()
106
+ def validation_step(self, batch, batch_idx):
107
+ self.eval()
108
+ out = self(batch)
109
+
110
+ meshes = self.shape_model.extract_geometry(
111
+ out["latents"],
112
+ bounds=self.cfg.bounds,
113
+ mc_level=self.cfg.mc_level,
114
+ octree_resolution=self.cfg.octree_resolution,
115
+ enable_pbar=False,
116
+ )
117
+ for idx, name in enumerate(batch["uid"]):
118
+ self.save_mesh(
119
+ f"it{self.true_global_step}/{name}.obj",
120
+ meshes[idx].verts,
121
+ meshes[idx].faces,
122
+ )
123
+
124
+ threshold = 0
125
+ outputs = out["logits"]
126
+ labels = out["target"]
127
+ pred = torch.zeros_like(outputs)
128
+ pred[outputs >= threshold] = 1
129
+
130
+ accuracy = (pred == labels).float().sum(dim=1) / labels.shape[1]
131
+ accuracy = accuracy.mean()
132
+ intersection = (pred * labels).sum(dim=1)
133
+ union = (pred + labels).gt(0).sum(dim=1)
134
+ iou = intersection * 1.0 / (union + 1e-5)
135
+ iou = iou.mean()
136
+ self.log("val/accuracy", accuracy)
137
+ self.log("val/iou", iou)
138
+
139
+ torch.cuda.empty_cache()
140
+
141
+ return {
142
+ "val/loss": out["loss_logits"],
143
+ "val/accuracy": accuracy,
144
+ "val/iou": iou,
145
+ }
146
+
147
+ def on_validation_epoch_end(self):
148
+ pass
149
+
150
+ def test_step(self, batch, batch_idx):
151
+ return
step1x3d_geometry/systems/shape_diffusion.py ADDED
@@ -0,0 +1,425 @@
1
+ from dataclasses import dataclass, field
2
+
3
+ from step1x3d_geometry.models.pipelines.pipeline import Step1X3DGeometryPipeline
4
+ import numpy as np
5
+ import json
6
+ import copy
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from skimage import measure
10
+ from einops import repeat
11
+ from tqdm import tqdm
12
+ from PIL import Image
13
+
14
+ from diffusers import (
15
+ DDPMScheduler,
16
+ DDIMScheduler,
17
+ UniPCMultistepScheduler,
18
+ KarrasVeScheduler,
19
+ DPMSolverMultistepScheduler,
20
+ )
21
+ from diffusers.training_utils import (
22
+ compute_snr,
23
+ free_memory,
24
+ )
25
+ import step1x3d_geometry
26
+ from step1x3d_geometry.systems.base import BaseSystem
27
+ from step1x3d_geometry.utils.misc import get_rank
28
+ from step1x3d_geometry.utils.typing import *
30
+ from step1x3d_geometry.systems.utils import read_image, ddim_sample
31
+
32
+
34
+ @step1x3d_geometry.register("diffusion-system")
35
+ class DiffusionSystem(BaseSystem):
36
+ @dataclass
37
+ class Config(BaseSystem.Config):
38
+ val_samples_json: str = ""
39
+ bounds: float = 1.05
40
+ mc_level: float = 0.0
41
+ octree_resolution: int = 256
42
+ skip_validation: bool = True
43
+
44
+ # diffusion config
45
+ z_scale_factor: float = 1.0
46
+ guidance_scale: float = 7.5
47
+ num_inference_steps: int = 50
48
+ eta: float = 0.0
49
+ snr_gamma: float = 5.0
50
+
51
+ # shape vae model
52
+ shape_model_type: str = None
53
+ shape_model: dict = field(default_factory=dict)
54
+
55
+ # condition model
56
+ visual_condition_type: Optional[str] = None
57
+ visual_condition: dict = field(default_factory=dict)
58
+ caption_condition_type: Optional[str] = None
59
+ caption_condition: dict = field(default_factory=dict)
60
+ label_condition_type: Optional[str] = None
61
+ label_condition: dict = field(default_factory=dict)
62
+
63
+ # diffusion model
64
+ denoiser_model_type: str = None
65
+ denoiser_model: dict = field(default_factory=dict)
66
+
67
+ # noise scheduler
68
+ noise_scheduler_type: str = None
69
+ noise_scheduler: dict = field(default_factory=dict)
70
+
71
+ # denoise scheduler
72
+ denoise_scheduler_type: str = None
73
+ denoise_scheduler: dict = field(default_factory=dict)
74
+
75
+ cfg: Config
76
+
77
+ def configure(self):
78
+ super().configure()
79
+
80
+ self.shape_model = step1x3d_geometry.find(self.cfg.shape_model_type)(
81
+ self.cfg.shape_model
82
+ )
83
+ self.shape_model.eval()
84
+ self.shape_model.requires_grad_(False)
85
+
86
+ if self.cfg.visual_condition_type is not None:
87
+ self.visual_condition = step1x3d_geometry.find(
88
+ self.cfg.visual_condition_type
89
+ )(self.cfg.visual_condition)
90
+
91
+ if self.cfg.caption_condition_type is not None:
92
+ self.caption_condition = step1x3d_geometry.find(
93
+ self.cfg.caption_condition_type
94
+ )(self.cfg.caption_condition)
95
+
96
+ if self.cfg.label_condition_type is not None:
97
+ self.label_condition = step1x3d_geometry.find(
98
+ self.cfg.label_condition_type
99
+ )(self.cfg.label_condition)
100
+
101
+ self.denoiser_model = step1x3d_geometry.find(self.cfg.denoiser_model_type)(
102
+ self.cfg.denoiser_model
103
+ )
104
+
105
+ self.noise_scheduler = step1x3d_geometry.find(self.cfg.noise_scheduler_type)(
106
+ **self.cfg.noise_scheduler
107
+ )
108
+
109
+ self.denoise_scheduler = step1x3d_geometry.find(
110
+ self.cfg.denoise_scheduler_type
111
+ )(**self.cfg.denoise_scheduler)
112
+
113
+ def forward(self, batch: Dict[str, Any], skip_noise=False) -> Dict[str, Any]:
114
+ # 1. encode shape latents
115
+ if "sharp_surface" in batch.keys():
116
+ sharp_surface = batch["sharp_surface"][
117
+ ..., : 3 + self.cfg.shape_model.point_feats
118
+ ]
119
+ else:
120
+ sharp_surface = None
121
+ shape_embeds, kl_embed, _ = self.shape_model.encode(
122
+ batch["surface"][..., : 3 + self.cfg.shape_model.point_feats],
123
+ sample_posterior=True,
124
+ sharp_surface=sharp_surface,
125
+ )
126
+
127
+ latents = kl_embed * self.cfg.z_scale_factor
128
+
129
+ # 2. get the visual condition
130
+ visual_cond_latents = None
131
+ if self.cfg.visual_condition_type is not None:
132
+ if "image" in batch and batch["image"].dim() == 5:
133
+ if self.training:
134
+ bs, n_images = batch["image"].shape[:2]
135
+ batch["image"] = batch["image"].view(
136
+ bs * n_images, *batch["image"].shape[-3:]
137
+ )
138
+ else:
139
+ batch["image"] = batch["image"][:, 0, ...]
140
+ n_images = 1
141
+ bs = batch["image"].shape[0]
142
+ visual_cond_latents = self.visual_condition(batch).to(latents)
143
+ latents = latents.unsqueeze(1).repeat(1, n_images, 1, 1)
144
+ latents = latents.view(bs * n_images, *latents.shape[-2:])
145
+ else:
146
+ visual_cond_latents = self.visual_condition(batch).to(latents)
147
+
148
+ ## 2.1 text condition if provided
149
+ caption_cond_latents = None
150
+ if self.cfg.caption_condition_type is not None:
151
+ assert "caption" in batch.keys(), "caption is required for caption encoder"
152
+ assert bs == len(
153
+ batch["caption"]
154
+ ), "Batch size must be the same as the caption length."
155
+ caption_cond_latents = (
156
+ self.caption_condition(batch)
157
+ .repeat_interleave(n_images, dim=0)
158
+ .to(latents)
159
+ )
160
+
161
+ ## 2.2 label condition if provided
162
+ label_cond_latents = None
163
+ if self.cfg.label_condition_type is not None:
164
+ assert "label" in batch.keys(), "label is required for label encoder"
165
+ assert bs == len(
166
+ batch["label"]
167
+ ), "Batch size must be the same as the label length."
168
+ label_cond_latents = (
169
+ self.label_condition(batch)
170
+ .repeat_interleave(n_images, dim=0)
171
+ .to(latents)
172
+ )
173
+
174
+ # 3. sample noise that we'll add to the latents
175
+ noise = torch.randn_like(latents).to(
176
+ latents
177
+ ) # [batch_size, n_token, latent_dim]
178
+ bs = latents.shape[0]
179
+
180
+ # 4. sample a random timestep for each latent
181
+ timesteps = torch.randint(
182
+ 0,
183
+ self.cfg.noise_scheduler.num_train_timesteps,
184
+ (bs,),
185
+ device=latents.device,
186
+ )
187
+ timesteps = timesteps.long()
188
+
189
+ # 5. add noise
190
+ noisy_z = self.noise_scheduler.add_noise(latents, noise, timesteps)
191
+
192
+ # 6. diffusion model forward
193
+ output = self.denoiser_model(
194
+ noisy_z,
195
+ timesteps.long(),
196
+ visual_cond_latents,
197
+ caption_cond_latents,
198
+ label_cond_latents,
199
+ ).sample
200
+
201
+ # 7. compute loss
202
+ if self.noise_scheduler.config.prediction_type == "epsilon":
203
+ target = noise
204
+ elif self.noise_scheduler.config.prediction_type == "v_prediction":
205
+ target = self.noise_scheduler.get_velocity(latents, noise, timesteps)
206
+ else:
207
+ raise ValueError(
208
+ f"Prediction Type: {self.noise_scheduler.prediction_type} not supported."
209
+ )
210
+ if self.cfg.snr_gamma == 0:
211
+ if self.cfg.loss.loss_type == "l1":
212
+ loss = F.l1_loss(output, target, reduction="mean")
213
+ elif self.cfg.loss.loss_type in ["mse", "l2"]:
214
+ loss = F.mse_loss(output, target, reduction="mean")
215
+ else:
216
+ raise ValueError(f"Loss Type: {self.cfg.loss.loss_type} not supported.")
217
+ else:
218
+ # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
219
+ # Since we predict the noise instead of x_0, the original formulation is slightly changed.
220
+ # This is discussed in Section 4.2 of the same paper.
221
+ snr = compute_snr(self.noise_scheduler, timesteps)
222
+ mse_loss_weights = torch.stack(
223
+ [snr, self.cfg.snr_gamma * torch.ones_like(timesteps)], dim=1
224
+ ).min(dim=1)[0]
225
+ if self.noise_scheduler.config.prediction_type == "epsilon":
226
+ mse_loss_weights = mse_loss_weights / snr
227
+ elif self.noise_scheduler.config.prediction_type == "v_prediction":
228
+ mse_loss_weights = mse_loss_weights / (snr + 1)
229
+
230
+ if self.cfg.loss.loss_type == "l1":
231
+ loss = F.l1_loss(output, target, reduction="none")
232
+ elif self.cfg.loss.loss_type in ["mse", "l2"]:
233
+ loss = F.mse_loss(output, target, reduction="none")
234
+ else:
235
+ raise ValueError(f"Loss Type: {self.cfg.loss.loss_type} not supported.")
236
+ loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
237
+ loss = loss.mean()
238
+
239
+ return {
240
+ "loss_diffusion": loss,
241
+ "latents": latents,
242
+ "x_t": noisy_z,
243
+ "noise": noise,
244
+ "noise_pred": output,
245
+ "timesteps": timesteps,
246
+ }
247
+
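The min-SNR-gamma weighting used in the loss branch above can be exercised in isolation. The sketch below is illustrative only: it assumes a stock diffusers DDPMScheduler rather than the scheduler configured via noise_scheduler_type, and the per-sample losses are random stand-ins.

import torch
from diffusers import DDPMScheduler
from diffusers.training_utils import compute_snr

scheduler = DDPMScheduler(num_train_timesteps=1000)
timesteps = torch.randint(0, 1000, (4,))
snr = compute_snr(scheduler, timesteps)
gamma = 5.0
# min(SNR, gamma) / SNR, the epsilon-prediction weighting applied in forward()
weights = torch.stack([snr, gamma * torch.ones_like(snr)], dim=1).min(dim=1)[0] / snr
per_sample_mse = torch.rand(4)  # stand-in for the per-sample reduced MSE
loss = (per_sample_mse * weights).mean()
print(loss)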
248
+ def training_step(self, batch, batch_idx):
249
+ out = self(batch)
250
+
251
+ loss = 0.0
252
+ for name, value in out.items():
253
+ if name.startswith("loss_"):
254
+ self.log(f"train/{name}", value)
255
+ loss += value * self.C(self.cfg.loss[name.replace("loss_", "lambda_")])
256
+
257
+ for name, value in self.cfg.loss.items():
258
+ if name.startswith("lambda_"):
259
+ self.log(f"train_params/{name}", self.C(value))
260
+
261
+ return {"loss": loss}
262
+
263
+ @torch.no_grad()
264
+ def validation_step(self, batch, batch_idx):
265
+ if self.cfg.skip_validation:
266
+ return {}
267
+ self.eval()
268
+
269
+ if get_rank() == 0:
270
+ sample_inputs = json.loads(
271
+ open(self.cfg.val_samples_json).read()
272
+ ) # condition
273
+ sample_inputs_ = copy.deepcopy(sample_inputs)
274
+ sample_outputs = self.sample(sample_inputs) # list
275
+ for i, latents in enumerate(sample_outputs["latents"]):
276
+ meshes = self.shape_model.extract_geometry(
277
+ latents,
278
+ bounds=self.cfg.bounds,
279
+ mc_level=self.cfg.mc_level,
280
+ octree_resolution=self.cfg.octree_resolution,
281
+ enable_pbar=False,
282
+ )
283
+
284
+ for j in range(len(meshes)):
285
+ name = ""
286
+ if "image" in sample_inputs_:
287
+ name += (
288
+ sample_inputs_["image"][j]
289
+ .split("/")[-1]
290
+ .replace(".png", "")
291
+ )
292
+ elif "mvimages" in sample_inputs_:
293
+ name += (
294
+ sample_inputs_["mvimages"][j][0]
295
+ .split("/")[-2]
296
+ .replace(".png", "")
297
+ )
298
+
299
+ if "caption" in sample_inputs_:
300
+ name += "_" + sample_inputs_["caption"][j].replace(" ", "_")
301
+
302
+ if "label" in sample_inputs_:
303
+ name += (
304
+ "_"
305
+ + sample_inputs_["label"][j]["symmetry"]
306
+ + sample_inputs_["label"][j]["edge_type"]
307
+ )
308
+
309
+ if (
310
+ meshes[j].verts is not None
311
+ and meshes[j].verts.shape[0] > 0
312
+ and meshes[j].faces is not None
313
+ and meshes[j].faces.shape[0] > 0
314
+ ):
315
+ self.save_mesh(
316
+ f"it{self.true_global_step}/{name}_{i}.obj",
317
+ meshes[j].verts,
318
+ meshes[j].faces,
319
+ )
320
+ torch.cuda.empty_cache()
321
+
322
+ out = self(batch)
323
+ if self.global_step == 0:
324
+ latents = self.shape_model.decode(out["latents"])
325
+ meshes = self.shape_model.extract_geometry(
326
+ latents,
327
+ bounds=self.cfg.bounds,
328
+ mc_level=self.cfg.mc_level,
329
+ octree_resolution=self.cfg.octree_resolution,
330
+ enable_pbar=False,
331
+ )
332
+
333
+ for i, mesh in enumerate(meshes):
334
+ self.save_mesh(
335
+ f"it{self.true_global_step}/{batch['uid'][i]}.obj",
336
+ mesh.verts,
337
+ mesh.faces,
338
+ )
339
+
340
+ return {"val/loss": out["loss_diffusion"]}
341
+
342
+ @torch.no_grad()
343
+ def sample(
344
+ self,
345
+ sample_inputs: Dict[str, Union[torch.FloatTensor, List[str]]],
346
+ sample_times: int = 1,
347
+ steps: Optional[int] = None,
348
+ guidance_scale: Optional[float] = None,
349
+ eta: float = 0.0,
350
+ seed: Optional[int] = None,
351
+ **kwargs,
352
+ ):
353
+
354
+ if steps is None:
355
+ steps = self.cfg.num_inference_steps
356
+ if guidance_scale is None:
357
+ guidance_scale = self.cfg.guidance_scale
358
+ do_classifier_free_guidance = guidance_scale != 1.0
359
+
360
+ # conditional encode
361
+ visual_cond = None
362
+ if "image" in sample_inputs:
363
+ sample_inputs["image"] = [
364
+ Image.open(img) if type(img) == str else img
365
+ for img in sample_inputs["image"]
366
+ ]
367
+ sample_inputs["image"] = Step1X3DGeometryPipeline.preprocess_image(
368
+ sample_inputs["image"], **kwargs
369
+ )
370
+ cond = self.visual_condition.encode_image(sample_inputs["image"])
371
+ if do_classifier_free_guidance:
372
+ un_cond = self.visual_condition.empty_image_embeds.repeat(
373
+ len(sample_inputs["image"]), 1, 1
374
+ ).to(cond)
375
+ visual_cond = torch.cat([un_cond, cond], dim=0)
376
+ caption_cond = None
377
+ if "caption" in sample_inputs:
378
+ cond = self.label_condition.encode_label(sample_inputs["caption"])
379
+ if do_classifier_free_guidance:
380
+ un_cond = self.caption_condition.empty_caption_embeds.repeat(
381
+ len(sample_inputs["caption"]), 1, 1
382
+ ).to(cond)
383
+ caption_cond = torch.cat([un_cond, cond], dim=0)
384
+ label_cond = None
385
+ if "label" in sample_inputs:
386
+ cond = self.label_condition.encode_label(sample_inputs["label"])
387
+ if do_classifier_free_guidance:
388
+ un_cond = self.label_condition.empty_label_embeds.repeat(
389
+ len(sample_inputs["label"]), 1
390
+ ).to(cond)
391
+ label_cond = torch.cat([un_cond, cond], dim=0)
392
+
393
+ latents_list = []
394
+ if seed is not None:
395
+ generator = torch.Generator(device="cuda").manual_seed(seed)
396
+ else:
397
+ generator = None
398
+
399
+ for _ in range(sample_times):
400
+ sample_loop = ddim_sample(
401
+ self.denoise_scheduler,
402
+ self.denoiser_model.eval(),
403
+ shape=self.shape_model.latent_shape,
404
+ visual_cond=visual_cond,
405
+ caption_cond=caption_cond,
406
+ label_cond=label_cond,
407
+ steps=steps,
408
+ guidance_scale=guidance_scale,
409
+ do_classifier_free_guidance=do_classifier_free_guidance,
410
+ device=self.device,
411
+ eta=eta,
412
+ disable_prog=False,
413
+ generator=generator,
414
+ )
415
+ for sample, t in sample_loop:
416
+ latents = sample
417
+ latents_list.append(self.shape_model.decode(latents))
418
+
419
+ return {"latents": latents_list, "inputs": sample_inputs}
420
+
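A hypothetical call sketch for the sample() method above. Here `system` stands for an already-configured, GPU-resident DiffusionSystem (an assumption, not something the diff defines), and the image path is one of the bundled examples.

# `system` is assumed to be an instantiated DiffusionSystem on GPU.
inputs = {"image": ["examples/images/000.png"]}
outputs = system.sample(inputs, sample_times=1, steps=50, guidance_scale=7.5, seed=0)
for decoded in outputs["latents"]:
    meshes = system.shape_model.extract_geometry(
        decoded, bounds=1.05, mc_level=0.0, octree_resolution=256
    )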
421
+ def on_validation_epoch_end(self):
422
+ pass
423
+
424
+ def test_step(self, batch, batch_idx):
425
+ return
step1x3d_geometry/systems/shape_rectified_flow.py ADDED
@@ -0,0 +1,474 @@
1
+ from dataclasses import dataclass, field
2
+
3
+ import numpy as np
4
+ import json
5
+ import copy
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from skimage import measure
10
+ from einops import repeat
11
+ from tqdm import tqdm
12
+ from PIL import Image
13
+
14
+ from diffusers import (
15
+ DDPMScheduler,
16
+ DDIMScheduler,
17
+ UniPCMultistepScheduler,
18
+ KarrasVeScheduler,
19
+ DPMSolverMultistepScheduler,
20
+ )
21
+ from diffusers.training_utils import (
22
+ compute_density_for_timestep_sampling,
23
+ compute_loss_weighting_for_sd3,
24
+ free_memory,
25
+ )
26
+ import step1x3d_geometry
27
+ from step1x3d_geometry.systems.base import BaseSystem
28
+ from step1x3d_geometry.utils.misc import get_rank
29
+ from step1x3d_geometry.utils.typing import *
30
+ from step1x3d_geometry.systems.utils import read_image, preprocess_image, flow_sample
31
+
32
+
33
+ def get_sigmas(noise_scheduler, timesteps, n_dim=4, dtype=torch.float32):
34
+ sigmas = noise_scheduler.sigmas.to(device=timesteps.device, dtype=dtype)
35
+ schedule_timesteps = noise_scheduler.timesteps.to(timesteps.device)
36
+ step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps]
37
+
38
+ sigma = sigmas[step_indices].flatten()
39
+ while len(sigma.shape) < n_dim:
40
+ sigma = sigma.unsqueeze(-1)
41
+ return sigma
42
+
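get_sigmas just looks up the sigma for each sampled timestep and reshapes it for broadcasting. The sketch below assumes the configured noise scheduler is diffusers' FlowMatchEulerDiscreteScheduler (an assumption; the real class comes from noise_scheduler_type) and that get_sigmas defined above is in scope.

import torch
from diffusers import FlowMatchEulerDiscreteScheduler  # requires a recent diffusers

sched = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000)
timesteps = sched.timesteps[:2]            # two training timesteps
sigmas = get_sigmas(sched, timesteps, n_dim=3)
assert sigmas.shape == (2, 1, 1)           # broadcastable against [B, n_token, dim] latents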
43
+
44
+ @step1x3d_geometry.register("rectified-flow-system")
45
+ class RectifiedFlowSystem(BaseSystem):
46
+ @dataclass
47
+ class Config(BaseSystem.Config):
48
+ skip_validation: bool = True
49
+ val_samples_json: str = ""
50
+ bounds: float = 1.05
51
+ mc_level: float = 0.0
52
+ octree_resolution: int = 256
53
+
54
+ # diffusion config
55
+ guidance_scale: float = 7.5
56
+ num_inference_steps: int = 30
57
+ eta: float = 0.0
58
+ snr_gamma: float = 5.0
59
+
60
+ # flow
61
+ weighting_scheme: str = "logit_normal"
62
+ logit_mean: float = 0
63
+ logit_std: float = 1.0
64
+ mode_scale: float = 1.29
65
+ precondition_outputs: bool = True
66
+ precondition_t: int = 1000
67
+
68
+ # shape vae model
69
+ shape_model_type: str = None
70
+ shape_model: dict = field(default_factory=dict)
71
+
72
+ # condition model
73
+ visual_condition_type: Optional[str] = None
74
+ visual_condition: dict = field(default_factory=dict)
75
+ caption_condition_type: Optional[str] = None
76
+ caption_condition: dict = field(default_factory=dict)
77
+ label_condition_type: Optional[str] = None
78
+ label_condition: dict = field(default_factory=dict)
79
+
80
+ # diffusion model
81
+ denoiser_model_type: str = None
82
+ denoiser_model: dict = field(default_factory=dict)
83
+
84
+ # noise scheduler
85
+ noise_scheduler_type: str = None
86
+ noise_scheduler: dict = field(default_factory=dict)
87
+
88
+ # denoise scheduler
89
+ denoise_scheduler_type: str = None
90
+ denoise_scheduler: dict = field(default_factory=dict)
91
+
92
+ # lora
93
+ use_lora: bool = False
94
+ lora_layers: Optional[str] = None
95
+ rank: int = 128 # The dimension of the LoRA update matrices.
96
+ alpha: int = 128
97
+
98
+ cfg: Config
99
+
100
+ def configure(self):
101
+ super().configure()
102
+
103
+ self.shape_model = step1x3d_geometry.find(self.cfg.shape_model_type)(
104
+ self.cfg.shape_model
105
+ )
106
+ self.shape_model.eval()
107
+ self.shape_model.requires_grad_(False)
108
+
109
+ if self.cfg.visual_condition_type is not None:
110
+ self.visual_condition = step1x3d_geometry.find(
111
+ self.cfg.visual_condition_type
112
+ )(self.cfg.visual_condition)
113
+ self.visual_condition.requires_grad_(False)
114
+
115
+ if self.cfg.caption_condition_type is not None:
116
+ self.caption_condition = step1x3d_geometry.find(
117
+ self.cfg.caption_condition_type
118
+ )(self.cfg.caption_condition)
119
+ self.caption_condition.requires_grad_(False)
120
+
121
+ if self.cfg.label_condition_type is not None:
122
+ self.label_condition = step1x3d_geometry.find(
123
+ self.cfg.label_condition_type
124
+ )(self.cfg.label_condition)
125
+
126
+ self.denoiser_model = step1x3d_geometry.find(self.cfg.denoiser_model_type)(
127
+ self.cfg.denoiser_model
128
+ )
129
+ if self.cfg.use_lora: # We only train the additional adapter LoRA layers
130
+ self.denoiser_model.requires_grad_(False)
131
+
132
+ self.noise_scheduler = step1x3d_geometry.find(self.cfg.noise_scheduler_type)(
133
+ **self.cfg.noise_scheduler
134
+ )
135
+ self.noise_scheduler_copy = copy.deepcopy(self.noise_scheduler)
136
+
137
+ self.denoise_scheduler = step1x3d_geometry.find(
138
+ self.cfg.denoise_scheduler_type
139
+ )(**self.cfg.denoise_scheduler)
140
+
141
+ if self.cfg.use_lora:
142
+ from peft import LoraConfig, set_peft_model_state_dict
143
+
144
+ if self.cfg.lora_layers is not None:
145
+ self.target_modules = [
146
+ layer.strip() for layer in self.cfg.lora_layers.split(",")
147
+ ]
148
+ else:
149
+ self.target_modules = [
150
+ "attn.to_k",
151
+ "attn.to_q",
152
+ "attn.to_v",
153
+ "attn.to_out.0",
154
+ "attn.add_k_proj",
155
+ "attn.add_q_proj",
156
+ "attn.add_v_proj",
157
+ "attn.to_add_out",
158
+ "ff.net.0.proj",
159
+ "ff.net.2",
160
+ "ff_context.net.0.proj",
161
+ "ff_context.net.2",
162
+ ]
163
+ self.transformer_lora_config = LoraConfig(
164
+ r=self.cfg.rank,
165
+ lora_alpha=self.cfg.alpha,
166
+ init_lora_weights="gaussian",
167
+ target_modules=self.target_modules,
168
+ )
169
+ self.denoiser_model.dit_model.add_adapter(self.transformer_lora_config)
170
+
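With use_lora enabled, only the injected adapter matrices should remain trainable after add_adapter. A small helper like the one below (illustrative, works on any nn.Module) can be used to confirm that.

import torch.nn as nn

def trainable_lora_parameters(model: nn.Module):
    # names and sizes of parameters that will actually receive gradients
    return [(n, p.numel()) for n, p in model.named_parameters()
            if p.requires_grad and "lora_" in n]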
171
+ def forward(self, batch: Dict[str, Any], skip_noise=False) -> Dict[str, Any]:
172
+ # 1. encode shape latents
173
+ if "sharp_surface" in batch.keys():
174
+ sharp_surface = batch["sharp_surface"][
175
+ ..., : 3 + self.cfg.shape_model.point_feats
176
+ ]
177
+ else:
178
+ sharp_surface = None
179
+ shape_embeds, latents, _ = self.shape_model.encode(
180
+ batch["surface"][..., : 3 + self.cfg.shape_model.point_feats],
181
+ sample_posterior=True,
182
+ sharp_surface=sharp_surface,
183
+ )
184
+
185
+ # 2. get the visual condition
186
+ visual_cond = None
187
+ if self.cfg.visual_condition_type is not None:
188
+ assert "image" in batch.keys(), "image is required for label encoder"
189
+ if "image" in batch and batch["image"].dim() == 5:
190
+ if self.training:
191
+ bs, n_images = batch["image"].shape[:2]
192
+ batch["image"] = batch["image"].view(
193
+ bs * n_images, *batch["image"].shape[-3:]
194
+ )
195
+ else:
196
+ batch["image"] = batch["image"][:, 0, ...]
197
+ n_images = 1
198
+ bs = batch["image"].shape[0]
199
+ visual_cond = self.visual_condition(batch).to(latents)
200
+ latents = latents.unsqueeze(1).repeat(1, n_images, 1, 1)
201
+ latents = latents.view(bs * n_images, *latents.shape[-2:])
202
+ else:
203
+ visual_cond = self.visual_condition(batch).to(latents)
204
+ bs = visual_cond.shape[0]
205
+ n_images = 1
206
+
207
+ ## 2.1 text condition if provided
208
+ caption_cond = None
209
+ if self.cfg.caption_condition_type is not None:
210
+ assert "caption" in batch.keys(), "caption is required for caption encoder"
211
+ assert bs == len(
212
+ batch["caption"]
213
+ ), "Batch size must be the same as the caption length."
214
+ caption_cond = (
215
+ self.caption_condition(batch)
216
+ .repeat_interleave(n_images, dim=0)
217
+ .to(latents)
218
+ )
219
+
220
+ ## 2.2 label condition if provided
221
+ label_cond = None
222
+ if self.cfg.label_condition_type is not None:
223
+ assert "label" in batch.keys(), "label is required for label encoder"
224
+ assert bs == len(
225
+ batch["label"]
226
+ ), "Batch size must be the same as the label length."
227
+ label_cond = (
228
+ self.label_condition(batch)
229
+ .repeat_interleave(n_images, dim=0)
230
+ .to(latents)
231
+ )
232
+
233
+ # 3. sample noise that we'll add to the latents
234
+ noise = torch.randn_like(latents).to(
235
+ latents
236
+ ) # [batch_size, n_token, latent_dim]
237
+
238
+ # 4. Sample a random timestep
239
+ u = compute_density_for_timestep_sampling(
240
+ weighting_scheme=self.cfg.weighting_scheme,
241
+ batch_size=bs * n_images,
242
+ logit_mean=self.cfg.logit_mean,
243
+ logit_std=self.cfg.logit_std,
244
+ mode_scale=self.cfg.mode_scale,
245
+ )
246
+ indices = (u * self.cfg.noise_scheduler.num_train_timesteps).long()
247
+ timesteps = self.noise_scheduler_copy.timesteps[indices].to(
248
+ device=latents.device
249
+ )
250
+
251
+ # 5. add noise
252
+ sigmas = get_sigmas(
253
+ self.noise_scheduler_copy, timesteps, n_dim=3, dtype=latents.dtype
254
+ )
255
+ noisy_z = (1.0 - sigmas) * latents + sigmas * noise
256
+
257
+ # 6. diffusion model forward
258
+ output = self.denoiser_model(
259
+ noisy_z, timesteps.long(), visual_cond, caption_cond, label_cond
260
+ ).sample
261
+
262
+ # 7. compute loss
263
+ if self.cfg.precondition_outputs:
264
+ output = output * (-sigmas) + noisy_z
265
+ # these weighting schemes use a uniform timestep sampling
266
+ # and instead post-weight the loss
267
+ weighting = compute_loss_weighting_for_sd3(
268
+ weighting_scheme=self.cfg.weighting_scheme, sigmas=sigmas
269
+ )
270
+ # flow matching loss
271
+ if self.cfg.precondition_outputs:
272
+ target = latents
273
+ else:
274
+ target = noise - latents
275
+
276
+ # Compute regular loss.
277
+ loss = torch.mean(
278
+ (weighting.float() * (output.float() - target.float()) ** 2).reshape(
279
+ target.shape[0], -1
280
+ ),
281
+ 1,
282
+ )
283
+ loss = loss.mean()
284
+
285
+ return {
286
+ "loss_diffusion": loss,
287
+ "latents": latents,
288
+ "x_t": noisy_z,
289
+ "noise": noise,
290
+ "noise_pred": output,
291
+ "timesteps": timesteps,
292
+ }
293
+
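A self-contained sketch of the rectified-flow construction used in forward() above: the noisy latent is a straight-line interpolation between data and noise, and the regression target is either the velocity (noise - latents) or, with precondition_outputs, the clean latent recovered from the predicted velocity. Shapes and values are illustrative only.

import torch

x0 = torch.randn(2, 512, 64)       # clean latents [B, n_token, dim] (illustrative shape)
eps = torch.randn_like(x0)         # Gaussian noise
sigma = torch.rand(2, 1, 1)        # per-sample noise level in (0, 1)

x_t = (1.0 - sigma) * x0 + sigma * eps   # noisy latents fed to the denoiser
v_target = eps - x0                      # target when precondition_outputs is False
v_pred = torch.randn_like(x0)            # stand-in for the model output
x0_pred = v_pred * (-sigma) + x_t        # equals x0 exactly when v_pred == eps - x0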
294
+ def training_step(self, batch, batch_idx):
295
+ out = self(batch)
296
+
297
+ loss = 0.0
298
+ for name, value in out.items():
299
+ if name.startswith("loss_"):
300
+ self.log(f"train/{name}", value)
301
+ loss += value * self.C(self.cfg.loss[name.replace("loss_", "lambda_")])
302
+ if name.startswith("log_"):
303
+ self.log(f"log/{name.replace('log_', '')}", value.mean())
304
+
305
+ for name, value in self.cfg.loss.items():
306
+ if name.startswith("lambda_"):
307
+ self.log(f"train_params/{name}", self.C(value))
308
+
309
+ return {"loss": loss}
310
+
311
+ @torch.no_grad()
312
+ def validation_step(self, batch, batch_idx):
313
+ if self.cfg.skip_validation:
314
+ return {}
315
+ self.eval()
316
+
317
+ if get_rank() == 0:
318
+ sample_inputs = json.loads(
319
+ open(self.cfg.val_samples_json).read()
320
+ ) # condition
321
+ sample_inputs_ = copy.deepcopy(sample_inputs)
322
+ sample_outputs = self.sample(sample_inputs) # list
323
+ for i, latents in enumerate(sample_outputs["latents"]):
324
+ meshes = self.shape_model.extract_geometry(
325
+ latents,
326
+ bounds=self.cfg.bounds,
327
+ mc_level=self.cfg.mc_level,
328
+ octree_resolution=self.cfg.octree_resolution,
329
+ enable_pbar=False,
330
+ )
331
+
332
+ for j in range(len(meshes)):
333
+ name = ""
334
+ if "image" in sample_inputs_:
335
+ name += (
336
+ sample_inputs_["image"][j]
337
+ .split("/")[-1]
338
+ .replace(".png", "")
339
+ )
340
+
341
+ elif "mvimages" in sample_inputs_:
342
+ name += (
343
+ sample_inputs_["mvimages"][j][0]
344
+ .split("/")[-2]
345
+ .replace(".png", "")
346
+ )
347
+
348
+ if "caption" in sample_inputs_:
349
+ name += "_" + sample_inputs_["caption"][j].replace(
350
+ " ", "_"
351
+ ).replace(".", "")
352
+
353
+ if "label" in sample_inputs_:
354
+ name += (
355
+ "_"
356
+ + sample_inputs_["label"][j]["symmetry"]
357
+ + sample_inputs_["label"][j]["edge_type"]
358
+ )
359
+
360
+ if (
361
+ meshes[j].verts is not None
362
+ and meshes[j].verts.shape[0] > 0
363
+ and meshes[j].faces is not None
364
+ and meshes[j].faces.shape[0] > 0
365
+ ):
366
+ self.save_mesh(
367
+ f"it{self.true_global_step}/{name}_{i}.obj",
368
+ meshes[j].verts,
369
+ meshes[j].faces,
370
+ )
371
+ torch.cuda.empty_cache()
372
+
373
+ out = self(batch)
374
+ if self.global_step == 0:
375
+ latents = self.shape_model.decode(out["latents"])
376
+ meshes = self.shape_model.extract_geometry(
377
+ latents,
378
+ bounds=self.cfg.bounds,
379
+ mc_level=self.cfg.mc_level,
380
+ octree_resolution=self.cfg.octree_resolution,
381
+ enable_pbar=False,
382
+ )
383
+
384
+ for i, mesh in enumerate(meshes):
385
+ self.save_mesh(
386
+ f"it{self.true_global_step}/{batch['uid'][i]}.obj",
387
+ mesh.verts,
388
+ mesh.faces,
389
+ )
390
+
391
+ return {"val/loss": out["loss_diffusion"]}
392
+
393
+ @torch.no_grad()
394
+ def sample(
395
+ self,
396
+ sample_inputs: Dict[str, Union[torch.FloatTensor, List[str]]],
397
+ sample_times: int = 1,
398
+ steps: Optional[int] = None,
399
+ guidance_scale: Optional[float] = None,
400
+ eta: float = 0.0,
401
+ seed: Optional[int] = None,
402
+ **kwargs,
403
+ ):
404
+
405
+ if steps is None:
406
+ steps = self.cfg.num_inference_steps
407
+ if guidance_scale is None:
408
+ guidance_scale = self.cfg.guidance_scale
409
+ do_classifier_free_guidance = guidance_scale != 1.0
410
+
411
+ # conditional encode
412
+ visual_cond = None
413
+ if "image" in sample_inputs:
414
+ sample_inputs["image"] = [
415
+ Image.open(img) if type(img) == str else img
416
+ for img in sample_inputs["image"]
417
+ ]
418
+ sample_inputs["image"] = preprocess_image(sample_inputs["image"], **kwargs)
419
+ cond = self.visual_condition.encode_image(sample_inputs["image"])
420
+ if do_classifier_free_guidance:
421
+ un_cond = self.visual_condition.empty_image_embeds.repeat(
422
+ len(sample_inputs["image"]), 1, 1
423
+ ).to(cond)
424
+ visual_cond = torch.cat([un_cond, cond], dim=0)
425
+ caption_cond = None
426
+ if "caption" in sample_inputs:
427
+ cond = self.label_condition.encode_label(sample_inputs["caption"])
428
+ if do_classifier_free_guidance:
429
+ un_cond = self.caption_condition.empty_caption_embeds.repeat(
430
+ len(sample_inputs["caption"]), 1, 1
431
+ ).to(cond)
432
+ caption_cond = torch.cat([un_cond, cond], dim=0)
433
+ label_cond = None
434
+ if "label" in sample_inputs:
435
+ cond = self.label_condition.encode_label(sample_inputs["label"])
436
+ if do_classifier_free_guidance:
437
+ un_cond = self.label_condition.empty_label_embeds.repeat(
438
+ len(sample_inputs["label"]), 1, 1
439
+ ).to(cond)
440
+ label_cond = torch.cat([un_cond, cond], dim=0)
441
+
442
+ latents_list = []
443
+ if seed is not None:
444
+ generator = torch.Generator(device="cuda").manual_seed(seed)
445
+ else:
446
+ generator = None
447
+
448
+ for _ in range(sample_times):
449
+ sample_loop = flow_sample(
450
+ self.denoise_scheduler,
451
+ self.denoiser_model.eval(),
452
+ shape=self.shape_model.latent_shape,
453
+ visual_cond=visual_cond,
454
+ caption_cond=caption_cond,
455
+ label_cond=label_cond,
456
+ steps=steps,
457
+ guidance_scale=guidance_scale,
458
+ do_classifier_free_guidance=do_classifier_free_guidance,
459
+ device=self.device,
460
+ eta=eta,
461
+ disable_prog=False,
462
+ generator=generator,
463
+ )
464
+ for sample, t in sample_loop:
465
+ latents = sample
466
+ latents_list.append(self.shape_model.decode(latents))
467
+
468
+ return {"latents": latents_list, "inputs": sample_inputs}
469
+
470
+ def on_validation_epoch_end(self):
471
+ pass
472
+
473
+ def test_step(self, batch, batch_idx):
474
+ return
step1x3d_geometry/systems/utils.py ADDED
@@ -0,0 +1,391 @@
1
+ import inspect
+ import torch
2
+ import numpy as np
3
+
4
+ import rembg
5
+ from PIL import Image
6
+ from tqdm import tqdm
7
+ from diffusers import DDIMScheduler
8
+ from torchvision import transforms
9
+
10
+ from step1x3d_geometry.utils.typing import *
11
+ from step1x3d_geometry.utils.misc import get_device
12
+
13
+
14
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
15
+ def retrieve_timesteps(
16
+ scheduler,
17
+ num_inference_steps: Optional[int] = None,
18
+ device: Optional[Union[str, torch.device]] = None,
19
+ timesteps: Optional[List[int]] = None,
20
+ sigmas: Optional[List[float]] = None,
21
+ **kwargs,
22
+ ):
23
+ r"""
24
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
25
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
26
+
27
+ Args:
28
+ scheduler (`SchedulerMixin`):
29
+ The scheduler to get timesteps from.
30
+ num_inference_steps (`int`):
31
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
32
+ must be `None`.
33
+ device (`str` or `torch.device`, *optional*):
34
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
35
+ timesteps (`List[int]`, *optional*):
36
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
37
+ `num_inference_steps` and `sigmas` must be `None`.
38
+ sigmas (`List[float]`, *optional*):
39
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
40
+ `num_inference_steps` and `timesteps` must be `None`.
41
+
42
+ Returns:
43
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
44
+ second element is the number of inference steps.
45
+ """
46
+ if timesteps is not None and sigmas is not None:
47
+ raise ValueError(
48
+ "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
49
+ )
50
+ if timesteps is not None:
51
+ accepts_timesteps = "timesteps" in set(
52
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
53
+ )
54
+ if not accepts_timesteps:
55
+ raise ValueError(
56
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
57
+ f" timestep schedules. Please check whether you are using the correct scheduler."
58
+ )
59
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
60
+ timesteps = scheduler.timesteps
61
+ num_inference_steps = len(timesteps)
62
+ elif sigmas is not None:
63
+ accept_sigmas = "sigmas" in set(
64
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
65
+ )
66
+ if not accept_sigmas:
67
+ raise ValueError(
68
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
69
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
70
+ )
71
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
72
+ timesteps = scheduler.timesteps
73
+ num_inference_steps = len(timesteps)
74
+ else:
75
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
76
+ timesteps = scheduler.timesteps
77
+ return timesteps, num_inference_steps
78
+
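Usage sketch for retrieve_timesteps with a stock DDIMScheduler (the project may configure a different denoise scheduler; this only shows the plain num_inference_steps path).

from diffusers import DDIMScheduler

sched = DDIMScheduler(num_train_timesteps=1000)
timesteps, n_steps = retrieve_timesteps(sched, num_inference_steps=50, device="cpu")
assert n_steps == 50 and timesteps.shape[0] == 50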
79
+
80
+ @torch.no_grad()
81
+ def ddim_sample(
82
+ ddim_scheduler: DDIMScheduler,
83
+ diffusion_model: torch.nn.Module,
84
+ shape: Union[List[int], Tuple[int]],
85
+ visual_cond: torch.FloatTensor,
86
+ caption_cond: torch.FloatTensor,
87
+ label_cond: torch.FloatTensor,
88
+ steps: int,
89
+ eta: float = 0.0,
90
+ guidance_scale: float = 3.0,
91
+ do_classifier_free_guidance: bool = True,
92
+ generator: Optional[torch.Generator] = None,
93
+ device: torch.device = "cuda:0",
94
+ disable_prog: bool = True,
95
+ ):
96
+
97
+ assert steps > 0, f"{steps} must > 0."
98
+
99
+ # init latents
100
+ if visual_cond is not None:
101
+ bsz = visual_cond.shape[0]
102
+ device = visual_cond.device
103
+ dtype = visual_cond.dtype
104
+ if caption_cond is not None:
105
+ bsz = caption_cond.shape[0]
106
+ device = caption_cond.device
107
+ dtype = caption_cond.dtype
108
+ if label_cond is not None:
109
+ bsz = label_cond.shape[0]
110
+ device = label_cond.device
111
+ dtype = label_cond.dtype
112
+
113
+ if do_classifier_free_guidance:
114
+ bsz = bsz // 2
115
+ latents = torch.randn(
116
+ (bsz, *shape),
117
+ generator=generator,
118
+ device=device,
119
+ dtype=dtype,
120
+ )
121
+ try:
122
+ # scale the initial noise by the standard deviation required by the scheduler
123
+ latents = latents * ddim_scheduler.init_noise_sigma
124
+ except AttributeError:
125
+ pass
126
+
127
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
128
+ extra_step_kwargs = {"generator": generator}
129
+
130
+ # set timesteps
131
+ timesteps, num_inference_steps = retrieve_timesteps(
132
+ ddim_scheduler,
133
+ steps,
134
+ device,
135
+ )
136
+ if eta > 0:
137
+ assert 0 <= eta <= 1, f"eta must be between [0, 1]. Got {eta}."
138
+ assert (
139
+ ddim_scheduler.__class__.__name__ == "DDIMScheduler"
140
+ ), f"eta is only used with the DDIMScheduler."
141
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
142
+ # eta (η) is only used with the DDIMScheduler, and between [0, 1]
143
+ extra_step_kwargs["eta"] = eta
144
+
145
+ # reverse
146
+ for i, t in enumerate(
147
+ tqdm(timesteps, disable=disable_prog, desc="DDIM Sampling:", leave=False)
148
+ ):
149
+ # expand the latents if we are doing classifier free guidance
150
+ latent_model_input = (
151
+ torch.cat([latents] * 2) if do_classifier_free_guidance else latents
152
+ )
153
+
154
+ # predict the noise residual
155
+ timestep_tensor = torch.tensor([t], dtype=torch.long, device=device)
156
+ timestep_tensor = timestep_tensor.expand(latent_model_input.shape[0])
157
+ noise_pred = diffusion_model.forward(
158
+ latent_model_input, timestep_tensor, visual_cond, caption_cond, label_cond
159
+ ).sample
160
+
161
+ # perform guidance
162
+ if do_classifier_free_guidance:
163
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
164
+ noise_pred = noise_pred_uncond + guidance_scale * (
165
+ noise_pred_text - noise_pred_uncond
166
+ )
167
+
168
+ # compute the previous noisy sample x_t -> x_t-1
169
+ latents = ddim_scheduler.step(
170
+ noise_pred, t, latents, **extra_step_kwargs
171
+ ).prev_sample
172
+
173
+ yield latents, t
174
+
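Sketch of the classifier-free guidance combination applied inside the loop above: the batch is laid out as [unconditional; conditional] and the two halves are recombined with the guidance scale. Tensor shapes are illustrative.

import torch

guidance_scale = 7.5
noise_pred = torch.randn(2 * 3, 512, 64)             # 3 samples, doubled batch
uncond, cond = noise_pred.chunk(2)
guided = uncond + guidance_scale * (cond - uncond)   # reduces to cond when the scale is 1.0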
175
+
176
+ @torch.no_grad()
177
+ def flow_sample(
178
+ scheduler: DDIMScheduler,
179
+ diffusion_model: torch.nn.Module,
180
+ shape: Union[List[int], Tuple[int]],
181
+ visual_cond: torch.FloatTensor,
182
+ caption_cond: torch.FloatTensor,
183
+ label_cond: torch.FloatTensor,
184
+ steps: int,
185
+ eta: float = 0.0,
186
+ guidance_scale: float = 3.0,
187
+ do_classifier_free_guidance: bool = True,
188
+ generator: Optional[torch.Generator] = None,
189
+ device: torch.device = "cuda:0",
190
+ disable_prog: bool = True,
191
+ ):
192
+
193
+ assert steps > 0, f"{steps} must > 0."
194
+
195
+ # init latents
196
+ if visual_cond is not None:
197
+ bsz = visual_cond.shape[0]
198
+ device = visual_cond.device
199
+ dtype = visual_cond.dtype
200
+ if caption_cond is not None:
201
+ bsz = caption_cond.shape[0]
202
+ device = caption_cond.device
203
+ dtype = caption_cond.dtype
204
+ if label_cond is not None:
205
+ bsz = label_cond.shape[0]
206
+ device = label_cond.device
207
+ dtype = label_cond.dtype
208
+
209
+ if do_classifier_free_guidance:
210
+ bsz = bsz // 2
211
+ latents = torch.randn(
212
+ (bsz, *shape),
213
+ generator=generator,
214
+ device=device,
215
+ dtype=dtype,
216
+ )
217
+ try:
218
+ # scale the initial noise by the standard deviation required by the scheduler
219
+ latents = latents * scheduler.init_noise_sigma
220
+ except AttributeError:
221
+ pass
222
+
223
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
224
+ extra_step_kwargs = {"generator": generator}
225
+
226
+ # set timesteps
227
+ timesteps, num_inference_steps = retrieve_timesteps(
228
+ scheduler,
229
+ steps + 1,
230
+ device,
231
+ )
232
+ if eta > 0:
233
+ assert 0 <= eta <= 1, f"eta must be between [0, 1]. Got {eta}."
234
+ assert (
235
+ scheduler.__class__.__name__ == "DDIMScheduler"
236
+ ), f"eta is only used with the DDIMScheduler."
237
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
238
+ # eta (η) is only used with the DDIMScheduler, and between [0, 1]
239
+ extra_step_kwargs["eta"] = eta
240
+
241
+ # reverse
242
+ distance = (timesteps[:-1] - timesteps[1:]) / scheduler.config.num_train_timesteps
243
+ for i, t in enumerate(
244
+ tqdm(timesteps[:-1], disable=disable_prog, desc="Flow Sampling:", leave=False)
245
+ ):
246
+ # expand the latents if we are doing classifier free guidance
247
+ latent_model_input = (
248
+ torch.cat([latents] * 2) if do_classifier_free_guidance else latents
249
+ )
250
+ # predict the noise residual
251
+ timestep_tensor = torch.tensor([t], dtype=latents.dtype, device=device)
252
+ timestep_tensor = timestep_tensor.expand(latent_model_input.shape[0])
253
+ noise_pred = diffusion_model.forward(
254
+ latent_model_input, timestep_tensor, visual_cond, caption_cond, label_cond
255
+ ).sample
256
+ if isinstance(noise_pred, tuple):
257
+ noise_pred, layer_idx_list, ones_list, pred_c_list = noise_pred
258
+
259
+ # perform guidance
260
+ if do_classifier_free_guidance:
261
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
262
+ noise_pred = noise_pred_uncond + guidance_scale * (
263
+ noise_pred_text - noise_pred_uncond
264
+ )
265
+
266
+ # compute the previous noisy sample x_t -> x_t-1
267
+ latents = latents - distance[i] * noise_pred
268
+
269
+ yield latents, t
270
+
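The update latents - distance[i] * noise_pred above is an explicit Euler step along the predicted velocity. The sketch below replays it on a uniform sigma grid with a zero-velocity stand-in for the model (the real grid comes from the configured flow-matching scheduler).

import torch

steps = 30
sigmas = torch.linspace(1.0, 0.0, steps + 1)   # illustrative grid
x = torch.randn(1, 512, 64)                    # start from pure noise at sigma = 1
for i in range(steps):
    v = torch.zeros_like(x)                    # stand-in for the denoiser's velocity prediction
    x = x - (sigmas[i] - sigmas[i + 1]) * v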
271
+
272
+ def compute_snr(noise_scheduler, timesteps):
273
+ """
274
+ Computes SNR as per
275
+ https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
276
+ """
277
+ alphas_cumprod = noise_scheduler.alphas_cumprod
278
+ sqrt_alphas_cumprod = alphas_cumprod**0.5
279
+ sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5
280
+
281
+ # Expand the tensors.
282
+ # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
283
+ sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[
284
+ timesteps
285
+ ].float()
286
+ while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
287
+ sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
288
+ alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
289
+
290
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(
291
+ device=timesteps.device
292
+ )[timesteps].float()
293
+ while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
294
+ sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None]
295
+ sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
296
+
297
+ # Compute SNR.
298
+ snr = (alpha / sigma) ** 2
299
+ return snr
300
+
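compute_snr mirrors the diffusers helper; for a DDPM-style scheduler it reduces to alpha_bar / (1 - alpha_bar). The quick check below verifies that against a stock DDPMScheduler, using the compute_snr defined above.

import torch
from diffusers import DDPMScheduler

sched = DDPMScheduler(num_train_timesteps=1000)
t = torch.tensor([10, 500, 990])
snr = compute_snr(sched, t)
alpha_bar = sched.alphas_cumprod[t]
assert torch.allclose(snr, alpha_bar / (1.0 - alpha_bar), rtol=1e-3, atol=1e-6)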
301
+
302
+ def read_image(img, img_size=224):
303
+ transform = transforms.Compose(
304
+ [
305
+ transforms.Resize(
306
+ img_size, transforms.InterpolationMode.BICUBIC, antialias=True
307
+ ),
308
+ transforms.CenterCrop(img_size), # crop a (224, 224) square
309
+ transforms.ToTensor(),
310
+ ]
311
+ )
312
+ rgb = Image.open(img)
313
+ rgb = transform(rgb)[:3, ...].permute(1, 2, 0)
314
+ return rgb
315
+
316
+
317
+ def preprocess_image(
318
+ images_pil: List[Image.Image],
319
+ force: bool = False,
320
+ background_color: List[int] = [255, 255, 255],
321
+ foreground_ratio: float = 0.95,
322
+ ):
323
+ r"""
324
+ Crop and remove the background of the input images.
325
+ Args:
326
+ images_pil (`List[PIL.Image.Image]`):
327
+ List of `PIL.Image.Image` objects representing the input images.
328
+ force (`bool`, *optional*, defaults to `False`):
329
+ Whether to force remove the background even if the image has an alpha channel.
330
+ Returns:
331
+ `List[PIL.Image.Image]`: List of `PIL.Image.Image` objects representing the preprocessed image.
332
+ """
333
+ preprocessed_images = []
334
+ for i in range(len(images_pil)):
335
+ image = images_pil[i]
336
+ width, height, size = image.width, image.height, image.size
337
+ do_remove = True
338
+ if image.mode == "RGBA" and image.getextrema()[3][0] < 255:
339
+ # the image already carries a usable alpha channel, so skip background removal
340
+ print(
341
+ "alhpa channl not empty, skip remove background, using alpha channel as mask"
342
+ )
343
+ do_remove = False
344
+ do_remove = do_remove or force
345
+ if do_remove:
346
+ image = rembg.remove(image)
347
+
348
+ # calculate the min bbox of the image
349
+ alpha = image.split()[-1]
350
+ bboxs = alpha.getbbox()
351
+ x1, y1, x2, y2 = bboxs
352
+ dy, dx = y2 - y1, x2 - x1
353
+ s = min(height * foreground_ratio / dy, width * foreground_ratio / dx)
354
+ Ht, Wt = int(dy * s), int(dx * s)
355
+
356
+ background = Image.new("RGBA", image.size, (*background_color, 255))
357
+ image = Image.alpha_composite(background, image)
358
+ image = image.crop(alpha.getbbox())
359
+ alpha = alpha.crop(alpha.getbbox())
360
+
361
+ # Calculate the new size after rescaling
362
+ new_size = tuple(int(dim * foreground_ratio) for dim in size)
363
+ # Resize the image while maintaining the aspect ratio
364
+ resized_image = image.resize((Wt, Ht))
365
+ resized_alpha = alpha.resize((Wt, Ht))
366
+ # Create a new image with the original size and white background
367
+ padded_image = Image.new("RGB", size, tuple(background_color))
368
+ padded_alpha = Image.new("L", size, (0))
369
+ paste_position = (
370
+ (width - resized_image.width) // 2,
371
+ (height - resized_image.height) // 2,
372
+ )
373
+ padded_image.paste(resized_image, paste_position)
374
+ padded_alpha.paste(resized_alpha, paste_position)
375
+
376
+ # expand image to 1:1
377
+ width, height = padded_image.size
378
+ if width == height:
379
+ padded_image.putalpha(padded_alpha)
380
+ preprocessed_images.append(padded_image)
381
+ continue
382
+ new_size = (max(width, height), max(width, height))
383
+ new_image = Image.new("RGB", new_size, tuple(background_color))
384
+ new_alpha = Image.new("L", new_size, (0))
385
+ paste_position = ((new_size[0] - width) // 2, (new_size[1] - height) // 2)
386
+ new_image.paste(padded_image, paste_position)
387
+ new_alpha.paste(padded_alpha, paste_position)
388
+ new_image.putalpha(new_alpha)
389
+ preprocessed_images.append(new_image)
390
+
391
+ return preprocessed_images
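Usage sketch for preprocess_image. The path is one of the bundled examples, and rembg downloads its segmentation model on first use when background removal is triggered.

from PIL import Image

img = Image.open("examples/images/000.png").convert("RGBA")
out = preprocess_image([img], foreground_ratio=0.95)[0]   # square RGBA image, object recentered
out.convert("RGB").save("preprocessed_000.png")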
step1x3d_geometry/utils/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from . import base
step1x3d_geometry/utils/base.py ADDED
@@ -0,0 +1,215 @@
1
+ from dataclasses import dataclass
2
+
3
+ import os
4
+ import copy
5
+ import json
6
+ from omegaconf import OmegaConf
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+ from diffusers.models.modeling_utils import ModelMixin
11
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
12
+ from diffusers.utils import (
13
+ extract_commit_hash,
14
+ )
15
+
16
+ from step1x3d_geometry.utils.config import parse_structured
17
+ from step1x3d_geometry.utils.misc import get_device, load_module_weights
18
+ from step1x3d_geometry.utils.typing import *
19
+
20
+
21
+ class Configurable:
22
+ @dataclass
23
+ class Config:
24
+ pass
25
+
26
+ def __init__(self, cfg: Optional[dict] = None) -> None:
27
+ super().__init__()
28
+ self.cfg = parse_structured(self.Config, cfg)
29
+
30
+
31
+ class Updateable:
32
+ def do_update_step(
33
+ self, epoch: int, global_step: int, on_load_weights: bool = False
34
+ ):
35
+ for attr in self.__dir__():
36
+ if attr.startswith("_"):
37
+ continue
38
+ try:
39
+ module = getattr(self, attr)
40
+ except:
41
+ continue # ignore attributes like properties, which can't be retrieved using getattr
42
+ if isinstance(module, Updateable):
43
+ module.do_update_step(
44
+ epoch, global_step, on_load_weights=on_load_weights
45
+ )
46
+ self.update_step(epoch, global_step, on_load_weights=on_load_weights)
47
+
48
+ def do_update_step_end(self, epoch: int, global_step: int):
49
+ for attr in self.__dir__():
50
+ if attr.startswith("_"):
51
+ continue
52
+ try:
53
+ module = getattr(self, attr)
54
+ except:
55
+ continue # ignore attributes like properties, which can't be retrieved using getattr
56
+ if isinstance(module, Updateable):
57
+ module.do_update_step_end(epoch, global_step)
58
+ self.update_step_end(epoch, global_step)
59
+
60
+ def update_step(self, epoch: int, global_step: int, on_load_weights: bool = False):
61
+ # override this method to implement custom update logic
62
+ # if on_load_weights is True, you should be careful doing things related to model evaluations,
63
+ # as the models and tensors are not guaranteed to be on the same device
64
+ pass
65
+
66
+ def update_step_end(self, epoch: int, global_step: int):
67
+ pass
68
+
69
+
70
+ def update_if_possible(module: Any, epoch: int, global_step: int) -> None:
71
+ if isinstance(module, Updateable):
72
+ module.do_update_step(epoch, global_step)
73
+
74
+
75
+ def update_end_if_possible(module: Any, epoch: int, global_step: int) -> None:
76
+ if isinstance(module, Updateable):
77
+ module.do_update_step_end(epoch, global_step)
78
+
79
+
80
+ class BaseObject(Updateable):
81
+ @dataclass
82
+ class Config:
83
+ pass
84
+
85
+ cfg: Config # add this to every subclass of BaseObject to enable static type checking
86
+
87
+ def __init__(
88
+ self, cfg: Optional[Union[dict, DictConfig]] = None, *args, **kwargs
89
+ ) -> None:
90
+ super().__init__()
91
+ self.cfg = parse_structured(self.Config, cfg)
92
+ self.device = get_device()
93
+ self.configure(*args, **kwargs)
94
+
95
+ def configure(self, *args, **kwargs) -> None:
96
+ pass
97
+
98
+
99
+ class BaseModule(ModelMixin, Updateable, nn.Module):
100
+ @dataclass
101
+ class Config:
102
+ weights: Optional[str] = None
103
+
104
+ cfg: Config # add this to every subclass of BaseModule to enable static type checking
105
+ config_name = "config.json"
106
+
107
+ def __init__(
108
+ self, cfg: Optional[Union[dict, DictConfig]] = None, *args, **kwargs
109
+ ) -> None:
110
+ super().__init__()
111
+ self.cfg = parse_structured(self.Config, cfg)
112
+ # self.device = get_device()
113
+ self.configure(*args, **kwargs)
114
+ if self.cfg.weights is not None:
115
+ # format: path/to/weights:module_name
116
+ weights_path, module_name = self.cfg.weights.split(":")
117
+ state_dict, epoch, global_step = load_module_weights(
118
+ weights_path, module_name=module_name, map_location="cpu"
119
+ )
120
+ self.load_state_dict(state_dict)
121
+ self.do_update_step(
122
+ epoch, global_step, on_load_weights=True
123
+ ) # restore states
124
+ # dummy tensor to indicate model state
125
+ self._dummy: Float[Tensor, "..."]
126
+ self.register_buffer("_dummy", torch.zeros(0).float(), persistent=False)
127
+
128
+ def configure(self, *args, **kwargs) -> None:
129
+ pass
130
+
131
+ @classmethod
132
+ def load_config(
133
+ cls,
134
+ pretrained_model_name_or_path: Union[str, os.PathLike],
135
+ return_unused_kwargs=False,
136
+ return_commit_hash=False,
137
+ **kwargs,
138
+ ):
139
+ subfolder = kwargs.pop("subfolder", None)
140
+
141
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path)
142
+ if os.path.isfile(pretrained_model_name_or_path):
143
+ config_file = pretrained_model_name_or_path
144
+ elif os.path.isdir(pretrained_model_name_or_path):
145
+ if subfolder is not None and os.path.isfile(
146
+ os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name)
147
+ ):
148
+ config_file = os.path.join(
149
+ pretrained_model_name_or_path, subfolder, cls.config_name
150
+ )
151
+ elif os.path.isfile(
152
+ os.path.join(pretrained_model_name_or_path, cls.config_name)
153
+ ):
154
+ # Load from a PyTorch checkpoint
155
+ config_file = os.path.join(
156
+ pretrained_model_name_or_path, cls.config_name
157
+ )
158
+ else:
159
+ raise EnvironmentError(
160
+ f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}."
161
+ )
162
+ else:
163
+ raise ValueError(
f"{pretrained_model_name_or_path} is neither a local file nor a local directory."
+ )
164
+
165
+ config_dict = json.load(open(config_file, "r"))
166
+ commit_hash = extract_commit_hash(config_file)
167
+
168
+ outputs = (config_dict,)
169
+
170
+ if return_unused_kwargs:
171
+ outputs += (kwargs,)
172
+
173
+ if return_commit_hash:
174
+ outputs += (commit_hash,)
175
+
176
+ return outputs
177
+
178
+ @classmethod
179
+ def from_config(cls, config: Dict[str, Any] = None, **kwargs):
180
+ model = cls(config)
181
+ return model
182
+
183
+ def register_to_config(self, **kwargs):
184
+ pass
185
+
186
+ def save_config(self, save_directory: Union[str, os.PathLike], **kwargs):
187
+ """
188
+ Save a configuration object to the directory specified in `save_directory` so that it can be reloaded using the
189
+ [`~ConfigMixin.from_config`] class method.
190
+
191
+ Args:
192
+ save_directory (`str` or `os.PathLike`):
193
+ Directory where the configuration JSON file is saved (will be created if it does not exist).
194
+ kwargs (`Dict[str, Any]`, *optional*):
195
+ Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
196
+ """
197
+ if os.path.isfile(save_directory):
198
+ raise AssertionError(
199
+ f"Provided path ({save_directory}) should be a directory, not a file"
200
+ )
201
+
202
+ os.makedirs(save_directory, exist_ok=True)
203
+
204
+ # If we save using the predefined names, we can load using `from_config`
205
+ output_config_file = os.path.join(save_directory, self.config_name)
206
+
207
+ config_dict = OmegaConf.to_container(self.cfg, resolve=True)
208
+ for k in copy.deepcopy(config_dict).keys():
209
+ if k.startswith("pretrained"):
210
+ config_dict.pop(k)
211
+ config_dict.pop("weights")
212
+ with open(output_config_file, "w", encoding="utf-8") as f:
213
+ json.dump(config_dict, f, ensure_ascii=False, indent=4)
214
+
215
+ print(f"Configuration saved in {output_config_file}")
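An illustrative sketch of the Config-dataclass pattern that BaseObject / BaseModule subclasses in this repo follow. The class name and field are made up, and it assumes the step1x3d_geometry package (and its parse_structured helper) is importable; it is a sketch, not a definitive API example.

from dataclasses import dataclass

from step1x3d_geometry.utils.base import BaseObject


class ToyComponent(BaseObject):
    @dataclass
    class Config(BaseObject.Config):
        hidden_dim: int = 64   # hypothetical field

    cfg: Config

    def configure(self) -> None:
        # cfg is parsed from a dict/DictConfig by BaseObject.__init__
        self.hidden_dim = self.cfg.hidden_dim


component = ToyComponent({"hidden_dim": 128})  # assumes parse_structured accepts a plain dict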