Spaces:

dylanebert
/

splat-to-mesh

Sleeping

App Files Files Community

dylanebert HF staff committed on Feb 26

Commit

93f5bda

•

1 Parent(s): 02afac0

initial commit

Browse files

Files changed (19) hide show

.gitattributes +1 -0
.gitignore +1 -0
Dockerfile +46 -0
LICENSE +21 -0
app.py +25 -0
convert.py +462 -0
core/__init__.py +0 -0
core/__pycache__/__init__.cpython-310.pyc +0 -0
core/__pycache__/gs.cpython-310.pyc +0 -0
core/__pycache__/options.cpython-310.pyc +0 -0
core/attention.py +156 -0
core/gs.py +190 -0
core/models.py +174 -0
core/options.py +120 -0
core/provider_objaverse.py +172 -0
core/unet.py +319 -0
core/utils.py +109 -0
data_test/catstatue.ply +3 -0
requirements.txt +20 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data_test/catstatue.ply filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ venv/

Dockerfile ADDED Viewed

	@@ -0,0 +1,46 @@

+FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
+# Set the environment variable
+ENV DEBIAN_FRONTEND=noninteractive
+# Install the required packages
+RUN apt-get update && apt-get install -y \
+    software-properties-common
+# Add the deadsnakes PPA
+RUN add-apt-repository ppa:deadsnakes/ppa
+# Install Python 3.10
+RUN apt-get update && apt-get install -y \
+    python3.10 \
+    python3.10-dev \
+    python3.10-distutils \
+    python3.10-venv \
+    python3-pip
+# Install other dependencies.
+# Refresh the package index in the same layer so the install never runs
+# against a stale index left behind in a cached earlier layer.
+RUN apt-get update && apt-get install -y \
+    git \
+    gcc \
+    g++ \
+    libgl1 \
+    libglib2.0.0 \
+    ffmpeg \
+    cmake \
+    libgtk2.0.0
+# Working directory
+WORKDIR /app
+COPY requirements.txt .
+# Install the required Python packages
+RUN python3.10 -m pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu121
+# Copy all files to the working directory
+COPY . .
+EXPOSE 7860
+# Run the gradio app
+CMD ["python3.10", "app.py"]

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 3D Topia
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

app.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import gradio as gr
+import subprocess
+def run(input_ply):
+    subprocess.run(
+        "python3.10 convert.py big --force-cuda-rast --test_path " + input_ply,
+        shell=True,
+    )
+    return input_ply.replace(".ply", ".glb")
+def main():
+    demo = gr.Interface(
+        fn=run,
+        inputs=gr.Model3D(label="Input Splat"),
+        outputs=gr.Model3D(label="Output GLB"),
+        examples=
+    )
+    demo.launch(server_name="0.0.0.0", server_port=7860)
+if __name__ == "__main__":
+    main()

convert.py ADDED Viewed

	@@ -0,0 +1,462 @@

+import os
+import tyro
+import tqdm
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from core.options import AllConfigs, Options
+from core.gs import GaussianRenderer
+import mcubes
+import nerfacc
+import nvdiffrast.torch as dr
+import kiui
+from kiui.mesh import Mesh
+from kiui.mesh_utils import clean_mesh, decimate_mesh
+from kiui.mesh_utils import laplacian_smooth_loss, normal_consistency
+from kiui.op import uv_padding, safe_normalize, inverse_sigmoid
+from kiui.cam import orbit_camera, get_perspective
+from kiui.nn import MLP, trunc_exp
+from kiui.gridencoder import GridEncoder
+def get_rays(pose, h, w, fovy, opengl=True):
+    """Generate one ray per pixel of an ``h`` x ``w`` image.
+
+    Args:
+        pose: camera pose tensor; ``pose[:3, :3]`` rotates the camera-space
+            directions and ``pose[:3, 3]`` is used as the origin of every ray.
+        h: image height in pixels.
+        w: image width in pixels.
+        fovy: vertical field of view in degrees.
+        opengl: if True, the y component of each camera-space direction is
+            negated and its z component is -1 instead of +1.
+
+    Returns:
+        ``(rays_o, rays_d)``, each of shape [hw, 3]; ``rays_d`` is passed
+        through ``safe_normalize``.
+    """
+    # pixel coordinate grids, created on the same device as the pose
+    x, y = torch.meshgrid(
+        torch.arange(w, device=pose.device),
+        torch.arange(h, device=pose.device),
+        indexing="xy",
+    )
+    x = x.flatten()
+    y = y.flatten()
+    # principal point at the image center
+    cx = w * 0.5
+    cy = h * 0.5
+    # focal length in pixels, derived from the vertical field of view
+    focal = h * 0.5 / np.tan(0.5 * np.deg2rad(fovy))
+    # (x, y) through pixel centers (+0.5), padded with a constant z component
+    camera_dirs = F.pad(
+        torch.stack(
+            [
+                (x - cx + 0.5) / focal,
+                (y - cy + 0.5) / focal * (-1.0 if opengl else 1.0),
+            ],
+            dim=-1,
+        ),
+        (0, 1),
+        value=(-1.0 if opengl else 1.0),
+    )  # [hw, 3]
+    rays_d = camera_dirs @ pose[:3, :3].transpose(0, 1)  # [hw, 3]
+    rays_o = pose[:3, 3].unsqueeze(0).expand_as(rays_d) # [hw, 3]
+    rays_d = safe_normalize(rays_d)
+    return rays_o, rays_d
+# Triple renderer: Gaussian splats, NeRF, and mesh.
+# gaussian --> nerf --> mesh
+class Converter(nn.Module):
+    def __init__(self, opt: Options):
+        """Load the splat at ``opt.test_path`` and set up the three renderers.
+
+        Builds the Gaussian-splat renderer, the NeRF components (occupancy
+        grid estimator, grid encoders and MLPs) and the state used by the
+        mesh renderer. ``self.device`` is hard-coded to CUDA.
+
+        Args:
+            opt: options; this method reads ``fovy``, ``znear``, ``zfar``,
+                ``test_path`` and ``force_cuda_rast``.
+        """
+        super().__init__()
+        self.opt = opt
+        self.device = torch.device("cuda")
+        # gs renderer
+        self.tan_half_fov = np.tan(0.5 * np.deg2rad(opt.fovy))
+        # perspective projection matrix built from fovy, znear and zfar
+        self.proj_matrix = torch.zeros(4, 4, dtype=torch.float32, device=self.device)
+        self.proj_matrix[0, 0] = 1 / self.tan_half_fov
+        self.proj_matrix[1, 1] = 1 / self.tan_half_fov
+        self.proj_matrix[2, 2] = (opt.zfar + opt.znear) / (opt.zfar - opt.znear)
+        self.proj_matrix[3, 2] = - (opt.zfar * opt.znear) / (opt.zfar - opt.znear)
+        self.proj_matrix[2, 3] = 1
+        self.gs_renderer = GaussianRenderer(opt)
+        self.gaussians = self.gs_renderer.load_ply(opt.test_path).to(self.device)
+        # nerf renderer
+        # rasterization context: OpenGL by default, CUDA when force_cuda_rast is set
+        if not self.opt.force_cuda_rast:
+            self.glctx = dr.RasterizeGLContext()
+        else:
+            self.glctx = dr.RasterizeCudaContext()
+        self.step = 0
+        self.render_step_size = 5e-3
+        # scene bounds passed to the occupancy grid as its region of interest
+        self.aabb = torch.tensor([-1.0, -1.0, -1.0, 1.0, 1.0, 1.0], device=self.device)
+        self.estimator = nerfacc.OccGridEstimator(roi_aabb=self.aabb, resolution=64, levels=1)
+        # separate encoder + MLP pairs for density (1 output) and color (3 outputs)
+        self.encoder_density = GridEncoder(num_levels=12) # VMEncoder(output_dim=16, mode='sum')
+        self.encoder = GridEncoder(num_levels=12)
+        self.mlp_density = MLP(self.encoder_density.output_dim, 1, 32, 2, bias=False)
+        self.mlp = MLP(self.encoder.output_dim, 3, 32, 2, bias=False)
+        # mesh renderer
+        self.proj = torch.from_numpy(get_perspective(self.opt.fovy)).float().to(self.device)
+        # mesh state: vertices/faces, uv coords/faces, vertex offsets and texture;
+        # filled in later by fit_mesh() and fit_mesh_uv()
+        self.v = self.f = None
+        self.vt = self.ft = None
+        self.deform = None
+        self.albedo = None
+    @torch.no_grad()
+    def render_gs(self, pose):
+        """Render the loaded Gaussians from a single camera pose.
+
+        Runs without gradient tracking.
+
+        Args:
+            pose: numpy camera pose matrix (inverted below, so it must be
+                square; presumably 4x4 camera-to-world -- confirm with caller).
+
+        Returns:
+            ``(image, alpha)`` with shapes [C, H, W] and [H, W].
+        """
+        cam_poses = torch.from_numpy(pose).unsqueeze(0).to(self.device)
+        cam_poses[:, :3, 1:3] *= -1 # invert up & forward direction
+        # cameras needed by gaussian rasterizer
+        cam_view = torch.inverse(cam_poses).transpose(1, 2) # [V, 4, 4]
+        cam_view_proj = cam_view @ self.proj_matrix # [V, 4, 4]
+        cam_pos = - cam_poses[:, :3, 3] # [V, 3]
+        # add the leading batch dimension expected by GaussianRenderer.render
+        out = self.gs_renderer.render(self.gaussians.unsqueeze(0), cam_view.unsqueeze(0), cam_view_proj.unsqueeze(0), cam_pos.unsqueeze(0))
+        image = out['image'].squeeze(1).squeeze(0) # [C, H, W]
+        alpha = out['alpha'].squeeze(2).squeeze(1).squeeze(0) # [H, W]
+        return image, alpha
+    def get_density(self, xs):
+        """Evaluate the NeRF density at the given points.
+
+        Args:
+            xs: point coordinates of shape [..., 3].
+
+        Returns:
+            Densities of shape [..., 1]: ``trunc_exp`` of the density MLP output.
+        """
+        # xs: [..., 3]
+        prefix = xs.shape[:-1]
+        # flatten leading dims for the encoder, restore them afterwards
+        xs = xs.view(-1, 3)
+        feats = self.encoder_density(xs)
+        density = trunc_exp(self.mlp_density(feats))
+        density = density.view(*prefix, 1)
+        return density
+    def render_nerf(self, pose):
+        """Volume-render the NeRF from a single camera pose.
+
+        In training mode this also updates the occupancy grid and increments
+        ``self.step``.
+
+        Args:
+            pose: numpy camera pose matrix (cast to float32 here).
+
+        Returns:
+            ``(color, alpha)`` with shapes [3, R, R] and [R, R], where R is
+            ``self.opt.output_size``; both are clamped to [0, 1].
+        """
+        pose = torch.from_numpy(pose.astype(np.float32)).to(self.device)
+        # get rays
+        resolution = self.opt.output_size
+        rays_o, rays_d = get_rays(pose, resolution, resolution, self.opt.fovy)
+        # update occ grid
+        if self.training:
+            def occ_eval_fn(xs):
+                sigmas = self.get_density(xs)
+                return self.render_step_size * sigmas
+            self.estimator.update_every_n_steps(self.step, occ_eval_fn=occ_eval_fn, occ_thre=0.01, n=8)
+            self.step += 1
+        # render
+        def sigma_fn(t_starts, t_ends, ray_indices):
+            # density at the midpoint of each sample interval
+            t_origins = rays_o[ray_indices]
+            t_dirs = rays_d[ray_indices]
+            xs = t_origins + t_dirs * (t_starts + t_ends)[:, None] / 2.0
+            sigmas = self.get_density(xs)
+            return sigmas.squeeze(-1)
+        # sample positions are chosen without gradient tracking
+        with torch.no_grad():
+            ray_indices, t_starts, t_ends = self.estimator.sampling(
+                rays_o,
+                rays_d,
+                sigma_fn=sigma_fn,
+                near_plane=0.01,
+                far_plane=100,
+                render_step_size=self.render_step_size,
+                stratified=self.training,
+                cone_angle=0,
+            )
+        # re-evaluate density and color at the samples, this time with gradients
+        t_origins = rays_o[ray_indices]
+        t_dirs = rays_d[ray_indices]
+        xs = t_origins + t_dirs * (t_starts + t_ends)[:, None] / 2.0
+        sigmas = self.get_density(xs).squeeze(-1)
+        rgbs = torch.sigmoid(self.mlp(self.encoder(xs)))
+        n_rays=rays_o.shape[0]
+        weights, trans, alphas = nerfacc.render_weight_from_density(t_starts, t_ends, sigmas, ray_indices=ray_indices, n_rays=n_rays)
+        color = nerfacc.accumulate_along_rays(weights, values=rgbs, ray_indices=ray_indices, n_rays=n_rays)
+        alpha = nerfacc.accumulate_along_rays(weights, values=None, ray_indices=ray_indices, n_rays=n_rays)
+        # composite onto a white (value 1) background
+        color = color + 1 * (1.0 - alpha)
+        color = color.view(resolution, resolution, 3).clamp(0, 1).permute(2, 0, 1).contiguous()
+        alpha = alpha.view(resolution, resolution).clamp(0, 1).contiguous()
+        return color, alpha
+    def fit_nerf(self, iters=512, resolution=128):
+        """Fit the NeRF to renders of the Gaussians from random orbit cameras.
+
+        Side effect: overwrites ``self.opt.output_size`` with ``resolution``.
+
+        Args:
+            iters: number of optimization steps.
+            resolution: square render resolution used for both renderers.
+        """
+        self.opt.output_size = resolution
+        # grid encoders use a 10x larger learning rate than the MLPs
+        optimizer = torch.optim.Adam([
+            {'params': self.encoder_density.parameters(), 'lr': 1e-2},
+            {'params': self.encoder.parameters(), 'lr': 1e-2},
+            {'params': self.mlp_density.parameters(), 'lr': 1e-3},
+            {'params': self.mlp.parameters(), 'lr': 1e-3},
+        ])
+        print(f"[INFO] fitting nerf...")
+        pbar = tqdm.trange(iters)
+        for i in pbar:
+            # random camera: vertical/horizontal angle and radius
+            ver = np.random.randint(-45, 45)
+            hor = np.random.randint(-180, 180)
+            rad = np.random.uniform(1.5, 3.0)
+            pose = orbit_camera(ver, hor, rad)
+            # the splat render is the target, the NeRF render the prediction
+            image_gt, alpha_gt = self.render_gs(pose)
+            image_pred, alpha_pred = self.render_nerf(pose)
+            # if i % 200 == 0:
+            #     kiui.vis.plot_image(image_gt, alpha_gt, image_pred, alpha_pred)
+            loss_mse = F.mse_loss(image_pred, image_gt) + 0.1 * F.mse_loss(alpha_pred, alpha_gt)
+            loss = loss_mse #+ 0.1 * self.encoder_density.tv_loss() #+ 0.0001 * self.encoder_density.density_loss()
+            loss.backward()
+            # called between backward() and step(); presumably adds a
+            # total-variation term directly to the encoder gradients -- confirm
+            self.encoder_density.grad_total_variation(1e-8)
+            optimizer.step()
+            optimizer.zero_grad()
+            pbar.set_description(f"MSE = {loss_mse.item():.6f}")
+        print(f"[INFO] finished fitting nerf!")
+    def render_mesh(self, pose):
+        """Rasterize the current mesh (``self.v + self.deform``) from one pose.
+
+        Colors come from the color field (encoder + MLP) while ``self.albedo``
+        is None, and from the ``self.albedo`` texture afterwards.
+
+        Args:
+            pose: numpy camera pose matrix (cast to float32 here).
+
+        Returns:
+            ``(image, alpha)`` with shapes [3, H, W] and [H, W], where
+            H = W = ``self.opt.output_size``.
+        """
+        h = w = self.opt.output_size
+        v = self.v + self.deform
+        f = self.f
+        pose = torch.from_numpy(pose.astype(np.float32)).to(v.device)
+        # get v_clip and render rgb
+        v_cam = torch.matmul(F.pad(v, pad=(0, 1), mode='constant', value=1.0), torch.inverse(pose).T).float().unsqueeze(0)
+        v_clip = v_cam @ self.proj.T
+        rast, rast_db = dr.rasterize(self.glctx, v_clip, f, (h, w))
+        alpha = torch.clamp(rast[..., -1:], 0, 1).contiguous() # [1, H, W, 1]
+        alpha = dr.antialias(alpha, rast, v_clip, f).clamp(0, 1).squeeze(-1).squeeze(0) # [H, W] important to enable gradients!
+        if self.albedo is None:
+            # no texture yet: query the color field at the covered surface points
+            xyzs, _ = dr.interpolate(v.unsqueeze(0), rast, f) # [1, H, W, 3]
+            xyzs = xyzs.view(-1, 3)
+            mask = (alpha > 0).view(-1)
+            image = torch.zeros_like(xyzs, dtype=torch.float32)
+            if mask.any():
+                # positions are detached, so the color loss does not move vertices here
+                masked_albedo = torch.sigmoid(self.mlp(self.encoder(xyzs[mask].detach(), bound=1)))
+                image[mask] = masked_albedo.float()
+        else:
+            # textured path: sample the albedo map at interpolated uv coordinates
+            texc, texc_db = dr.interpolate(self.vt.unsqueeze(0), rast, self.ft, rast_db=rast_db, diff_attrs='all')
+            image = torch.sigmoid(dr.texture(self.albedo.unsqueeze(0), texc, uv_da=texc_db)) # [1, H, W, 3]
+        image = image.view(1, h, w, 3)
+        # image = dr.antialias(image, rast, v_clip, f).clamp(0, 1)
+        image = image.squeeze(0).permute(2, 0, 1).contiguous() # [3, H, W]
+        # blend onto a white (value 1) background
+        image = alpha * image + (1 - alpha)
+        return image, alpha
+    def fit_mesh(self, iters=2048, resolution=512, decimate_target=5e4):
+        """Extract a mesh from the NeRF density and refine it against the splat.
+
+        The density is sampled on a 256^3 grid over [-1, 1]^3, meshed with
+        marching cubes, cleaned and optionally decimated. Per-vertex offsets
+        and the color field are then optimized against splat renders, with a
+        remesh every 512 iterations. Results are stored in ``self.v``,
+        ``self.f`` and ``self.deform``.
+
+        Side effect: overwrites ``self.opt.output_size`` with ``resolution``.
+
+        Args:
+            iters: number of optimization steps.
+            resolution: square render resolution.
+            decimate_target: meshes with more faces than this are decimated
+                with this value as the target.
+        """
+        self.opt.output_size = resolution
+        # init mesh from nerf
+        grid_size = 256
+        sigmas = np.zeros([grid_size, grid_size, grid_size], dtype=np.float32)
+        # the grid is evaluated in chunks of S samples per axis
+        S = 128
+        density_thresh = 10
+        X = torch.linspace(-1, 1, grid_size).split(S)
+        Y = torch.linspace(-1, 1, grid_size).split(S)
+        Z = torch.linspace(-1, 1, grid_size).split(S)
+        for xi, xs in enumerate(X):
+            for yi, ys in enumerate(Y):
+                for zi, zs in enumerate(Z):
+                    xx, yy, zz = torch.meshgrid(xs, ys, zs, indexing='ij')
+                    pts = torch.cat([xx.reshape(-1, 1), yy.reshape(-1, 1), zz.reshape(-1, 1)], dim=-1) # [S, 3]
+                    val = self.get_density(pts.to(self.device))
+                    sigmas[xi * S: xi * S + len(xs), yi * S: yi * S + len(ys), zi * S: zi * S + len(zs)] = val.reshape(len(xs), len(ys), len(zs)).detach().cpu().numpy() # [S, 1] --> [x, y, z]
+        print(f'[INFO] marching cubes thresh: {density_thresh} ({sigmas.min()} ~ {sigmas.max()})')
+        vertices, triangles = mcubes.marching_cubes(sigmas, density_thresh)
+        # map grid indices [0, grid_size - 1] back to coordinates in [-1, 1]
+        vertices = vertices / (grid_size - 1.0) * 2 - 1
+        # clean
+        vertices = vertices.astype(np.float32)
+        triangles = triangles.astype(np.int32)
+        vertices, triangles = clean_mesh(vertices, triangles, remesh=True, remesh_size=0.01)
+        if triangles.shape[0] > decimate_target:
+            vertices, triangles = decimate_mesh(vertices, triangles, decimate_target, optimalplacement=False)
+        self.v = torch.from_numpy(vertices).contiguous().float().to(self.device)
+        self.f = torch.from_numpy(triangles).contiguous().int().to(self.device)
+        # learnable per-vertex offsets, initialized to zero
+        self.deform = nn.Parameter(torch.zeros_like(self.v)).to(self.device)
+        # fit mesh from gs
+        lr_factor = 1
+        optimizer = torch.optim.Adam([
+            {'params': self.encoder.parameters(), 'lr': 1e-3 * lr_factor},
+            {'params': self.mlp.parameters(), 'lr': 1e-3 * lr_factor},
+            {'params': self.deform, 'lr': 1e-4},
+        ])
+        print(f"[INFO] fitting mesh...")
+        pbar = tqdm.trange(iters)
+        for i in pbar:
+            # random camera at a fixed radius
+            ver = np.random.randint(-10, 10)
+            hor = np.random.randint(-180, 180)
+            rad = self.opt.cam_radius # np.random.uniform(1, 2)
+            pose = orbit_camera(ver, hor, rad)
+            image_gt, alpha_gt = self.render_gs(pose)
+            image_pred, alpha_pred = self.render_mesh(pose)
+            loss_mse = F.mse_loss(image_pred, image_gt) + 0.1 * F.mse_loss(alpha_pred, alpha_gt)
+            # loss_lap = laplacian_smooth_loss(self.v + self.deform, self.f)
+            loss_normal = normal_consistency(self.v + self.deform, self.f)
+            # penalize large vertex offsets
+            loss_offsets = (self.deform ** 2).sum(-1).mean()
+            loss = loss_mse + 0.001 * loss_normal + 0.1 * loss_offsets
+            loss.backward()
+            optimizer.step()
+            optimizer.zero_grad()
+            # remesh periodically
+            if i > 0 and i % 512 == 0:
+                # bake the current offsets into the vertices, then clean and decimate
+                vertices = (self.v + self.deform).detach().cpu().numpy()
+                triangles = self.f.detach().cpu().numpy()
+                vertices, triangles = clean_mesh(vertices, triangles, remesh=True, remesh_size=0.01)
+                if triangles.shape[0] > decimate_target:
+                    vertices, triangles = decimate_mesh(vertices, triangles, decimate_target, optimalplacement=False)
+                self.v = torch.from_numpy(vertices).contiguous().float().to(self.device)
+                self.f = torch.from_numpy(triangles).contiguous().int().to(self.device)
+                self.deform = nn.Parameter(torch.zeros_like(self.v)).to(self.device)
+                # the vertex count changed, so rebuild the optimizer around the new
+                # offsets and halve the color-field learning rate
+                lr_factor *= 0.5
+                optimizer = torch.optim.Adam([
+                    {'params': self.encoder.parameters(), 'lr': 1e-3 * lr_factor},
+                    {'params': self.mlp.parameters(), 'lr': 1e-3 * lr_factor},
+                    {'params': self.deform, 'lr': 1e-4},
+                ])
+            pbar.set_description(f"MSE = {loss_mse.item():.6f}")
+        # last clean
+        vertices = (self.v + self.deform).detach().cpu().numpy()
+        triangles = self.f.detach().cpu().numpy()
+        vertices, triangles = clean_mesh(vertices, triangles, remesh=False)
+        self.v = torch.from_numpy(vertices).contiguous().float().to(self.device)
+        self.f = torch.from_numpy(triangles).contiguous().int().to(self.device)
+        # offsets are baked into self.v, so reset them to zero
+        self.deform = nn.Parameter(torch.zeros_like(self.v).to(self.device))
+        print(f"[INFO] finished fitting mesh!")
+    # uv mesh refine
+    def fit_mesh_uv(self, iters=512, resolution=512, texture_resolution=1024, padding=2):
+        """UV-unwrap the mesh, bake a texture from the color field and refine it.
+
+        Stores the uv layout in ``self.vt`` / ``self.ft`` and the learnable
+        texture (kept in pre-sigmoid space) in ``self.albedo``.
+
+        Side effect: overwrites ``self.opt.output_size`` with ``resolution``.
+
+        Args:
+            iters: number of texture optimization steps.
+            resolution: square render resolution used during optimization.
+            texture_resolution: side length of the square texture.
+            padding: padding amount passed to ``uv_padding``.
+        """
+        self.opt.output_size = resolution
+        # unwrap uv
+        print(f"[INFO] uv unwrapping...")
+        mesh = Mesh(v=self.v, f=self.f, albedo=None, device=self.device)
+        mesh.auto_normal()
+        mesh.auto_uv()
+        self.vt = mesh.vt
+        self.ft = mesh.ft
+        # render uv maps
+        h = w = texture_resolution
+        uv = mesh.vt * 2.0 - 1.0 # uvs to range [-1, 1]
+        # append z = 0 and w = 1 so the uv coordinates can be rasterized directly
+        uv = torch.cat((uv, torch.zeros_like(uv[..., :1]), torch.ones_like(uv[..., :1])), dim=-1) # [N, 4]
+        rast, _ = dr.rasterize(self.glctx, uv.unsqueeze(0), mesh.ft, (h, w)) # [1, h, w, 4]
+        xyzs, _ = dr.interpolate(mesh.v.unsqueeze(0), rast, mesh.f) # [1, h, w, 3]
+        mask, _ = dr.interpolate(torch.ones_like(mesh.v[:, :1]).unsqueeze(0), rast, mesh.f) # [1, h, w, 1]
+        # masked query
+        xyzs = xyzs.view(-1, 3)
+        mask = (mask > 0).view(-1)
+        albedo = torch.zeros(h * w, 3, device=self.device, dtype=torch.float32)
+        if mask.any():
+            print(f"[INFO] querying texture...")
+            xyzs = xyzs[mask] # [M, 3]
+            # batched inference to avoid OOM
+            batch = []
+            head = 0
+            while head < xyzs.shape[0]:
+                tail = min(head + 640000, xyzs.shape[0])
+                batch.append(torch.sigmoid(self.mlp(self.encoder(xyzs[head:tail]))).float())
+                head += 640000
+            albedo[mask] = torch.cat(batch, dim=0)
+        albedo = albedo.view(h, w, -1)
+        mask = mask.view(h, w)
+        albedo = uv_padding(albedo, mask, padding)
+        # optimize texture
+        # stored pre-sigmoid; render_mesh and export_mesh apply sigmoid when reading it
+        self.albedo = nn.Parameter(inverse_sigmoid(albedo)).to(self.device)
+        optimizer = torch.optim.Adam([
+            {'params': self.albedo, 'lr': 1e-3},
+        ])
+        print(f"[INFO] fitting mesh texture...")
+        pbar = tqdm.trange(iters)
+        for i in pbar:
+            # shrink to front view as we care more about it...
+            ver = np.random.randint(-5, 5)
+            hor = np.random.randint(-15, 15)
+            rad = self.opt.cam_radius # np.random.uniform(1, 2)
+            pose = orbit_camera(ver, hor, rad)
+            image_gt, alpha_gt = self.render_gs(pose)
+            image_pred, alpha_pred = self.render_mesh(pose)
+            # only the color term is used here (no alpha term)
+            loss_mse = F.mse_loss(image_pred, image_gt)
+            loss = loss_mse
+            loss.backward()
+            optimizer.step()
+            optimizer.zero_grad()
+            pbar.set_description(f"MSE = {loss_mse.item():.6f}")
+        print(f"[INFO] finished fitting mesh texture!")
+    @torch.no_grad()
+    def export_mesh(self, path):
+        """Write the textured mesh to ``path``.
+
+        Reads ``self.albedo`` (through a sigmoid), so it must have been set,
+        as ``fit_mesh_uv()`` does.
+
+        Args:
+            path: output file path handed to ``Mesh.write``.
+        """
+        mesh = Mesh(v=self.v, f=self.f, vt=self.vt, ft=self.ft, albedo=torch.sigmoid(self.albedo), device=self.device)
+        mesh.auto_normal()
+        mesh.write(path)
+# Parse the command-line options.
+opt = tyro.cli(AllConfigs)
+# load a saved ply and convert to mesh
+assert opt.test_path.endswith('.ply'), '--test_path must be a .ply file saved by infer.py'
+converter = Converter(opt).cuda()
+# Pipeline: fit a NeRF to the splat, extract and refine a mesh from it,
+# then bake and refine the mesh texture.
+converter.fit_nerf()
+converter.fit_mesh()
+converter.fit_mesh_uv()
+# The output is written to the input path with '.ply' replaced by '.glb'.
+converter.export_mesh(opt.test_path.replace('.ply', '.glb'))

core/__init__.py ADDED Viewed

File without changes

core/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (125 Bytes). View file

core/__pycache__/gs.cpython-310.pyc ADDED Viewed

Binary file (5.44 kB). View file

core/__pycache__/options.cpython-310.pyc ADDED Viewed

Binary file (2.48 kB). View file

core/attention.py ADDED Viewed

	@@ -0,0 +1,156 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+import os
+import warnings
+from torch import Tensor
+from torch import nn
+# xFormers is used unless the XFORMERS_DISABLED environment variable is set
+# (to any value).
+XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
+try:
+    if XFORMERS_ENABLED:
+        from xformers.ops import memory_efficient_attention, unbind
+        XFORMERS_AVAILABLE = True
+        warnings.warn("xFormers is available (Attention)")
+    else:
+        warnings.warn("xFormers is disabled (Attention)")
+        # Reuse the except branch below to set XFORMERS_AVAILABLE = False.
+        raise ImportError
+except ImportError:
+    XFORMERS_AVAILABLE = False
+    warnings.warn("xFormers is not available (Attention)")
+class Attention(nn.Module):
+    """Multi-head self-attention with a fused qkv projection."""
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+    ) -> None:
+        """
+        Args:
+            dim: input and output channel count.
+            num_heads: number of attention heads; each head gets ``dim // num_heads`` channels.
+            qkv_bias: whether the fused qkv projection has a bias.
+            proj_bias: whether the output projection has a bias.
+            attn_drop: dropout probability applied to the attention weights.
+            proj_drop: dropout probability applied after the output projection.
+        """
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        # 1 / sqrt(head_dim) scaling applied to the queries
+        self.scale = head_dim**-0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x: Tensor) -> Tensor:
+        """Apply self-attention to ``x`` of shape [B, N, C]; returns [B, N, C]."""
+        B, N, C = x.shape
+        # [B, N, 3C] --> [3, B, nh, N, C/nh]
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
+        attn = q @ k.transpose(-2, -1)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        # merge the heads back into the channel dimension
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class MemEffAttention(Attention):
+    """Self-attention that uses xFormers' ``memory_efficient_attention`` when available."""
+    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+        """Apply self-attention to ``x`` of shape [B, N, C].
+
+        Falls back to ``Attention.forward`` when xFormers is unavailable.
+
+        Raises:
+            AssertionError: if ``attn_bias`` is given but xFormers is unavailable.
+        """
+        if not XFORMERS_AVAILABLE:
+            if attn_bias is not None:
+                raise AssertionError("xFormers is required for using nested tensors")
+            return super().forward(x)
+        B, N, C = x.shape
+        # [B, N, 3, nh, C/nh]; no permute here, unlike the plain implementation
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
+        q, k, v = unbind(qkv, 2)
+        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
+        x = x.reshape([B, N, C])
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class CrossAttention(nn.Module):
+    """Multi-head cross-attention with separate q/k/v projections."""
+    def __init__(
+        self,
+        dim: int,
+        dim_q: int,
+        dim_k: int,
+        dim_v: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+    ) -> None:
+        """
+        Args:
+            dim: internal and output channel count.
+            dim_q: channel count of the query input.
+            dim_k: channel count of the key input.
+            dim_v: channel count of the value input.
+            num_heads: number of attention heads; each head gets ``dim // num_heads`` channels.
+            qkv_bias: whether the q/k/v projections have a bias.
+            proj_bias: whether the output projection has a bias.
+            attn_drop: dropout probability applied to the attention weights.
+            proj_drop: dropout probability applied after the output projection.
+        """
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        # 1 / sqrt(head_dim) scaling applied to the queries
+        self.scale = head_dim**-0.5
+        self.to_q = nn.Linear(dim_q, dim, bias=qkv_bias)
+        self.to_k = nn.Linear(dim_k, dim, bias=qkv_bias)
+        self.to_v = nn.Linear(dim_v, dim, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
+        """Attend from ``q`` to ``k`` / ``v``; shapes are given in the comments below."""
+        # q: [B, N, Cq]
+        # k: [B, M, Ck]
+        # v: [B, M, Cv]
+        # return: [B, N, C]
+        B, N, _ = q.shape
+        M = k.shape[1]
+        q = self.scale * self.to_q(q).reshape(B, N, self.num_heads, self.dim // self.num_heads).permute(0, 2, 1, 3) # [B, nh, N, C/nh]
+        k = self.to_k(k).reshape(B, M, self.num_heads, self.dim // self.num_heads).permute(0, 2, 1, 3) # [B, nh, M, C/nh]
+        v = self.to_v(v).reshape(B, M, self.num_heads, self.dim // self.num_heads).permute(0, 2, 1, 3) # [B, nh, M, C/nh]
+        attn = q @ k.transpose(-2, -1) # [B, nh, N, M]
+        attn = attn.softmax(dim=-1) # [B, nh, N, M]
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, -1) # [B, nh, N, M] @ [B, nh, M, C/nh] --> [B, nh, N, C/nh] --> [B, N, nh, C/nh] --> [B, N, C]
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class MemEffCrossAttention(CrossAttention):
+    def forward(self, q: Tensor, k: Tensor, v: Tensor, attn_bias=None) -> Tensor:
+        if not XFORMERS_AVAILABLE:
+            if attn_bias is not None:
+                raise AssertionError("xFormers is required for using nested tensors")
+            return super().forward(x)
+        B, N, _ = q.shape
+        M = k.shape[1]
+        q = self.scale * self.to_q(q).reshape(B, N, self.num_heads, self.dim // self.num_heads) # [B, N, nh, C/nh]
+        k = self.to_k(k).reshape(B, M, self.num_heads, self.dim // self.num_heads) # [B, M, nh, C/nh]
+        v = self.to_v(v).reshape(B, M, self.num_heads, self.dim // self.num_heads) # [B, M, nh, C/nh]
+        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
+        x = x.reshape(B, N, -1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x

core/gs.py ADDED Viewed

	@@ -0,0 +1,190 @@

+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from diff_gaussian_rasterization import (
+    GaussianRasterizationSettings,
+    GaussianRasterizer,
+)
+from core.options import Options
+import kiui
+class GaussianRenderer:
+    def __init__(self, opt: Options):
+        """Store the options and precompute the camera intrinsics.
+
+        Args:
+            opt: options; this method reads ``fovy``, ``znear`` and ``zfar``.
+        """
+        self.opt = opt
+        # default background is white; the tensor is created on "cuda"
+        self.bg_color = torch.tensor([1, 1, 1], dtype=torch.float32, device="cuda")
+        # intrinsics
+        self.tan_half_fov = np.tan(0.5 * np.deg2rad(self.opt.fovy))
+        # perspective projection matrix built from fovy, znear and zfar
+        self.proj_matrix = torch.zeros(4, 4, dtype=torch.float32)
+        self.proj_matrix[0, 0] = 1 / self.tan_half_fov
+        self.proj_matrix[1, 1] = 1 / self.tan_half_fov
+        self.proj_matrix[2, 2] = (opt.zfar + opt.znear) / (opt.zfar - opt.znear)
+        self.proj_matrix[3, 2] = - (opt.zfar * opt.znear) / (opt.zfar - opt.znear)
+        self.proj_matrix[2, 3] = 1
+    def render(self, gaussians, cam_view, cam_view_proj, cam_pos, bg_color=None, scale_modifier=1):
+        """Rasterize each batch of Gaussians from each of its camera views.
+
+        Args:
+            gaussians: [B, N, 14] tensor laid out as position (3), opacity (1),
+                scale (3), rotation (4) and rgb (3).
+            cam_view: [B, V, 4, 4] view matrices.
+            cam_view_proj: [B, V, 4, 4] view-projection matrices.
+            cam_pos: [B, V, 3] camera positions.
+            bg_color: background color; defaults to ``self.bg_color``.
+            scale_modifier: forwarded to the rasterization settings.
+
+        Returns:
+            Dict with ``"image"`` of shape [B, V, 3, H, W] and ``"alpha"`` of
+            shape [B, V, 1, H, W], where H = W = ``self.opt.output_size``.
+        """
+        # gaussians: [B, N, 14]
+        # cam_view, cam_view_proj: [B, V, 4, 4]
+        # cam_pos: [B, V, 3]
+        device = gaussians.device
+        B, V = cam_view.shape[:2]
+        # loop of loop...
+        images = []
+        alphas = []
+        for b in range(B):
+            # pos, opacity, scale, rotation, shs
+            means3D = gaussians[b, :, 0:3].contiguous().float()
+            opacity = gaussians[b, :, 3:4].contiguous().float()
+            scales = gaussians[b, :, 4:7].contiguous().float()
+            rotations = gaussians[b, :, 7:11].contiguous().float()
+            rgbs = gaussians[b, :, 11:].contiguous().float() # [N, 3]
+            for v in range(V):
+                # render novel views
+                view_matrix = cam_view[b, v].float()
+                view_proj_matrix = cam_view_proj[b, v].float()
+                campos = cam_pos[b, v].float()
+                # square image, same field of view in x and y
+                raster_settings = GaussianRasterizationSettings(
+                    image_height=self.opt.output_size,
+                    image_width=self.opt.output_size,
+                    tanfovx=self.tan_half_fov,
+                    tanfovy=self.tan_half_fov,
+                    bg=self.bg_color if bg_color is None else bg_color,
+                    scale_modifier=scale_modifier,
+                    viewmatrix=view_matrix,
+                    projmatrix=view_proj_matrix,
+                    sh_degree=0,
+                    campos=campos,
+                    prefiltered=False,
+                    debug=False,
+                )
+                rasterizer = GaussianRasterizer(raster_settings=raster_settings)
+                # Rasterize visible Gaussians to image, obtain their radii (on screen).
+                # colors are passed precomputed (colors_precomp), not as spherical harmonics
+                rendered_image, radii, rendered_depth, rendered_alpha = rasterizer(
+                    means3D=means3D,
+                    means2D=torch.zeros_like(means3D, dtype=torch.float32, device=device),
+                    shs=None,
+                    colors_precomp=rgbs,
+                    opacities=opacity,
+                    scales=scales,
+                    rotations=rotations,
+                    cov3D_precomp=None,
+                )
+                rendered_image = rendered_image.clamp(0, 1)
+                images.append(rendered_image)
+                alphas.append(rendered_alpha)
+        # the B * V renders were appended batch-major; reshape them to [B, V, ...]
+        images = torch.stack(images, dim=0).view(B, V, 3, self.opt.output_size, self.opt.output_size)
+        alphas = torch.stack(alphas, dim=0).view(B, V, 1, self.opt.output_size, self.opt.output_size)
+        return {
+            "image": images, # [B, V, 3, H, W]
+            "alpha": alphas, # [B, V, 1, H, W]
+        }
+    def save_ply(self, gaussians, path, compatible=True):
+        """Write a single batch of Gaussians to a ``.ply`` file.
+
+        Gaussians with opacity below 0.005 are dropped before writing.
+
+        Args:
+            gaussians: [1, N, 14] tensor laid out as position (3), opacity (1),
+                scale (3), rotation (4) and rgb (3).
+            path: output file path.
+            compatible: if True, invert the activations (inverse sigmoid on
+                opacity, log on scale, rgb converted to an SH DC coefficient)
+                before saving.
+
+        Raises:
+            AssertionError: if the batch size is not 1.
+        """
+        # gaussians: [B, N, 14]
+        # compatible: save pre-activated gaussians as in the original paper
+        assert gaussians.shape[0] == 1, 'only support batch size 1'
+        from plyfile import PlyData, PlyElement
+        means3D = gaussians[0, :, 0:3].contiguous().float()
+        opacity = gaussians[0, :, 3:4].contiguous().float()
+        scales = gaussians[0, :, 4:7].contiguous().float()
+        rotations = gaussians[0, :, 7:11].contiguous().float()
+        shs = gaussians[0, :, 11:].unsqueeze(1).contiguous().float() # [N, 1, 3]
+        # prune by opacity
+        mask = opacity.squeeze(-1) >= 0.005
+        means3D = means3D[mask]
+        opacity = opacity[mask]
+        scales = scales[mask]
+        rotations = rotations[mask]
+        shs = shs[mask]
+        # invert activation to make it compatible with the original ply format
+        if compatible:
+            opacity = kiui.op.inverse_sigmoid(opacity)
+            # the small epsilon keeps log() finite for zero scales
+            scales = torch.log(scales + 1e-8)
+            shs = (shs - 0.5) / 0.28209479177387814
+        xyzs = means3D.detach().cpu().numpy()
+        f_dc = shs.detach().transpose(1, 2).flatten(start_dim=1).contiguous().cpu().numpy()
+        opacities = opacity.detach().cpu().numpy()
+        scales = scales.detach().cpu().numpy()
+        rotations = rotations.detach().cpu().numpy()
+        # property names, in the same order as the columns concatenated below
+        l = ['x', 'y', 'z']
+        # All channels except the 3 DC
+        for i in range(f_dc.shape[1]):
+            l.append('f_dc_{}'.format(i))
+        l.append('opacity')
+        for i in range(scales.shape[1]):
+            l.append('scale_{}'.format(i))
+        for i in range(rotations.shape[1]):
+            l.append('rot_{}'.format(i))
+        # every property is stored as a 32-bit float
+        dtype_full = [(attribute, 'f4') for attribute in l]
+        elements = np.empty(xyzs.shape[0], dtype=dtype_full)
+        attributes = np.concatenate((xyzs, f_dc, opacities, scales, rotations), axis=1)
+        elements[:] = list(map(tuple, attributes))
+        el = PlyElement.describe(elements, 'vertex')
+        PlyData([el]).write(path)
+    def load_ply(self, path, compatible=True):
+        from plyfile import PlyData, PlyElement
+        plydata = PlyData.read(path)
+        xyz = np.stack((np.asarray(plydata.elements[0]["x"]),
+                        np.asarray(plydata.elements[0]["y"]),
+                        np.asarray(plydata.elements[0]["z"])),  axis=1)
+        print("Number of points at loading : ", xyz.shape[0])
+        opacities = np.asarray(plydata.elements[0]["opacity"])[..., np.newaxis]
+        shs = np.zeros((xyz.shape[0], 3))
+        shs[:, 0] = np.asarray(plydata.elements[0]["f_dc_0"])
+        shs[:, 1] = np.asarray(plydata.elements[0]["f_dc_1"])
+        shs[:, 2] = np.asarray(plydata.elements[0]["f_dc_2"])
+        scale_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("scale_")]
+        scales = np.zeros((xyz.shape[0], len(scale_names)))
+        for idx, attr_name in enumerate(scale_names):
+            scales[:, idx] = np.asarray(plydata.elements[0][attr_name])
+        rot_names = [p.name for p in plydata.elements[0].properties if p.name.startswith("rot_")]
+        rots = np.zeros((xyz.shape[0], len(rot_names)))
+        for idx, attr_name in enumerate(rot_names):
+            rots[:, idx] = np.asarray(plydata.elements[0][attr_name])
+        gaussians = np.concatenate([xyz, opacities, scales, rots, shs], axis=1)
+        gaussians = torch.from_numpy(gaussians).float() # cpu
+        if compatible:
+            gaussians[..., 3:4] = torch.sigmoid(gaussians[..., 3:4])
+            gaussians[..., 4:7] = torch.exp(gaussians[..., 4:7])
+            gaussians[..., 11:] = 0.28209479177387814 * gaussians[..., 11:] + 0.5
+        return gaussians

core/models.py ADDED Viewed

	@@ -0,0 +1,174 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import kiui
+from kiui.lpips import LPIPS
+from core.unet import UNet
+from core.options import Options
+from core.gs import GaussianRenderer
+class LGM(nn.Module):
+    def __init__(
+        self,
+        opt: Options,
+    ):
+        super().__init__()
+        self.opt = opt
+        # unet
+        self.unet = UNet(
+            9, 14,
+            down_channels=self.opt.down_channels,
+            down_attention=self.opt.down_attention,
+            mid_attention=self.opt.mid_attention,
+            up_channels=self.opt.up_channels,
+            up_attention=self.opt.up_attention,
+        )
+        # last conv
+        self.conv = nn.Conv2d(14, 14, kernel_size=1) # NOTE: maybe remove it if train again
+        # Gaussian Renderer
+        self.gs = GaussianRenderer(opt)
+        # activations...
+        self.pos_act = lambda x: x.clamp(-1, 1)
+        self.scale_act = lambda x: 0.1 * F.softplus(x)
+        self.opacity_act = lambda x: torch.sigmoid(x)
+        self.rot_act = F.normalize
+        self.rgb_act = lambda x: 0.5 * torch.tanh(x) + 0.5 # NOTE: may use sigmoid if train again
+        # LPIPS loss
+        if self.opt.lambda_lpips > 0:
+            self.lpips_loss = LPIPS(net='vgg')
+            self.lpips_loss.requires_grad_(False)
+    def state_dict(self, **kwargs):
+        # remove lpips_loss
+        state_dict = super().state_dict(**kwargs)
+        for k in list(state_dict.keys()):
+            if 'lpips_loss' in k:
+                del state_dict[k]
+        return state_dict
+    def prepare_default_rays(self, device, elevation=0):
+        from kiui.cam import orbit_camera
+        from core.utils import get_rays
+        cam_poses = np.stack([
+            orbit_camera(elevation, 0, radius=self.opt.cam_radius),
+            orbit_camera(elevation, 90, radius=self.opt.cam_radius),
+            orbit_camera(elevation, 180, radius=self.opt.cam_radius),
+            orbit_camera(elevation, 270, radius=self.opt.cam_radius),
+        ], axis=0) # [4, 4, 4]
+        cam_poses = torch.from_numpy(cam_poses)
+        rays_embeddings = []
+        for i in range(cam_poses.shape[0]):
+            rays_o, rays_d = get_rays(cam_poses[i], self.opt.input_size, self.opt.input_size, self.opt.fovy) # [h, w, 3]
+            rays_plucker = torch.cat([torch.cross(rays_o, rays_d, dim=-1), rays_d], dim=-1) # [h, w, 6]
+            rays_embeddings.append(rays_plucker)
+            ## visualize rays for plotting figure
+            # kiui.vis.plot_image(rays_d * 0.5 + 0.5, save=True)
+        rays_embeddings = torch.stack(rays_embeddings, dim=0).permute(0, 3, 1, 2).contiguous().to(device) # [V, 6, h, w]
+        return rays_embeddings
+    def forward_gaussians(self, images):
+        # images: [B, 4, 9, H, W]
+        # return: Gaussians: [B, dim_t]
+        B, V, C, H, W = images.shape
+        images = images.view(B*V, C, H, W)
+        x = self.unet(images) # [B*4, 14, h, w]
+        x = self.conv(x) # [B*4, 14, h, w]
+        x = x.reshape(B, 4, 14, self.opt.splat_size, self.opt.splat_size)
+        ## visualize multi-view gaussian features for plotting figure
+        # tmp_alpha = self.opacity_act(x[0, :, 3:4])
+        # tmp_img_rgb = self.rgb_act(x[0, :, 11:]) * tmp_alpha + (1 - tmp_alpha)
+        # tmp_img_pos = self.pos_act(x[0, :, 0:3]) * 0.5 + 0.5
+        # kiui.vis.plot_image(tmp_img_rgb, save=True)
+        # kiui.vis.plot_image(tmp_img_pos, save=True)
+        x = x.permute(0, 1, 3, 4, 2).reshape(B, -1, 14)
+        pos = self.pos_act(x[..., 0:3]) # [B, N, 3]
+        opacity = self.opacity_act(x[..., 3:4])
+        scale = self.scale_act(x[..., 4:7])
+        rotation = self.rot_act(x[..., 7:11])
+        rgbs = self.rgb_act(x[..., 11:])
+        gaussians = torch.cat([pos, opacity, scale, rotation, rgbs], dim=-1) # [B, N, 14]
+        return gaussians
+    def forward(self, data, step_ratio=1):
+        # data: output of the dataloader
+        # return: loss
+        results = {}
+        loss = 0
+        images = data['input'] # [B, 4, 9, h, W], input features
+        # use the first view to predict gaussians
+        gaussians = self.forward_gaussians(images) # [B, N, 14]
+        results['gaussians'] = gaussians
+        # random bg for training
+        if self.training:
+            bg_color = torch.rand(3, dtype=torch.float32, device=gaussians.device)
+        else:
+            bg_color = torch.ones(3, dtype=torch.float32, device=gaussians.device)
+        # use the other views for rendering and supervision
+        results = self.gs.render(gaussians, data['cam_view'], data['cam_view_proj'], data['cam_pos'], bg_color=bg_color)
+        pred_images = results['image'] # [B, V, C, output_size, output_size]
+        pred_alphas = results['alpha'] # [B, V, 1, output_size, output_size]
+        results['images_pred'] = pred_images
+        results['alphas_pred'] = pred_alphas
+        gt_images = data['images_output'] # [B, V, 3, output_size, output_size], ground-truth novel views
+        gt_masks = data['masks_output'] # [B, V, 1, output_size, output_size], ground-truth masks
+        gt_images = gt_images * gt_masks + bg_color.view(1, 1, 3, 1, 1) * (1 - gt_masks)
+        loss_mse = F.mse_loss(pred_images, gt_images) + F.mse_loss(pred_alphas, gt_masks)
+        loss = loss + loss_mse
+        if self.opt.lambda_lpips > 0:
+            loss_lpips = self.lpips_loss(
+                # gt_images.view(-1, 3, self.opt.output_size, self.opt.output_size) * 2 - 1,
+                # pred_images.view(-1, 3, self.opt.output_size, self.opt.output_size) * 2 - 1,
+                # downsampled to at most 256 to reduce memory cost
+                F.interpolate(gt_images.view(-1, 3, self.opt.output_size, self.opt.output_size) * 2 - 1, (256, 256), mode='bilinear', align_corners=False),
+                F.interpolate(pred_images.view(-1, 3, self.opt.output_size, self.opt.output_size) * 2 - 1, (256, 256), mode='bilinear', align_corners=False),
+            ).mean()
+            results['loss_lpips'] = loss_lpips
+            loss = loss + self.opt.lambda_lpips * loss_lpips
+        results['loss'] = loss
+        # metric
+        with torch.no_grad():
+            psnr = -10 * torch.log10(torch.mean((pred_images.detach() - gt_images) ** 2))
+            results['psnr'] = psnr
+        return results

core/options.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import tyro
+from dataclasses import dataclass
+from typing import Tuple, Literal, Dict, Optional
+@dataclass
+class Options:
+    """Configuration shared by the model, the dataset and the training / testing scripts.
+    Fields are grouped as model, dataset, training, testing and misc; the
+    module-level presets override a subset of these defaults.
+    NOTE(review): the comment above each field is presumably surfaced by tyro as
+    its command-line help text - confirm before rewording them.
+    """
+    ### model
+    # Unet image input size
+    input_size: int = 256
+    # Unet definition
+    down_channels: Tuple[int, ...] = (64, 128, 256, 512, 1024, 1024)
+    down_attention: Tuple[bool, ...] = (False, False, False, True, True, True)
+    mid_attention: bool = True
+    up_channels: Tuple[int, ...] = (1024, 1024, 512, 256)
+    up_attention: Tuple[bool, ...] = (True, True, True, False)
+    # Unet output size, dependent on the input_size and U-Net structure!
+    splat_size: int = 64
+    # gaussian render size
+    output_size: int = 256
+    ### dataset
+    # data mode (only support s3 now)
+    data_mode: Literal['s3'] = 's3'
+    # fovy of the dataset
+    fovy: float = 49.1
+    # camera near plane
+    znear: float = 0.5
+    # camera far plane
+    zfar: float = 2.5
+    # number of all views (input + output)
+    num_views: int = 12
+    # number of input views
+    num_input_views: int = 4
+    # camera radius
+    cam_radius: float = 1.5 # to better use [-1, 1]^3 space
+    # num workers
+    num_workers: int = 8
+    ### training
+    # workspace
+    workspace: str = './workspace'
+    # resume
+    resume: Optional[str] = None
+    # batch size (per-GPU)
+    batch_size: int = 8
+    # gradient accumulation
+    gradient_accumulation_steps: int = 1
+    # training epochs
+    num_epochs: int = 30
+    # lpips loss weight
+    lambda_lpips: float = 1.0
+    # gradient clip
+    gradient_clip: float = 1.0
+    # mixed precision
+    mixed_precision: str = 'bf16'
+    # learning rate
+    lr: float = 4e-4
+    # augmentation prob for grid distortion
+    prob_grid_distortion: float = 0.5
+    # augmentation prob for camera jitter
+    prob_cam_jitter: float = 0.5
+    ### testing
+    # test image path
+    test_path: Optional[str] = None
+    ### misc
+    # nvdiffrast backend setting
+    force_cuda_rast: bool = False
+    # render fancy video with gaussian scaling effect
+    fancy_video: bool = False
+# all the default settings
+config_defaults: Dict[str, Options] = {}
+config_doc: Dict[str, str] = {}
+config_doc['lrm'] = 'the default settings for LGM'
+config_defaults['lrm'] = Options()
+config_doc['small'] = 'small model with lower resolution Gaussians'
+config_defaults['small'] = Options(
+    input_size=256,
+    splat_size=64,
+    output_size=256,
+    batch_size=8,
+    gradient_accumulation_steps=1,
+    mixed_precision='bf16',
+)
+config_doc['big'] = 'big model with higher resolution Gaussians'
+config_defaults['big'] = Options(
+    input_size=256,
+    up_channels=(1024, 1024, 512, 256, 128), # one more decoder
+    up_attention=(True, True, True, False, False),
+    splat_size=128,
+    output_size=512, # render & supervise Gaussians at a higher resolution.
+    batch_size=8,
+    num_views=8,
+    gradient_accumulation_steps=1,
+    mixed_precision='bf16',
+)
+config_doc['tiny'] = 'tiny model for ablation'
+config_defaults['tiny'] = Options(
+    input_size=256,
+    down_channels=(32, 64, 128, 256, 512),
+    down_attention=(False, False, False, False, True),
+    up_channels=(512, 256, 128),
+    up_attention=(True, False, False, False),
+    splat_size=64,
+    output_size=256,
+    batch_size=16,
+    num_views=8,
+    gradient_accumulation_steps=1,
+    mixed_precision='bf16',
+)
+AllConfigs = tyro.extras.subcommand_type_from_defaults(config_defaults, config_doc)

core/provider_objaverse.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import os
+import cv2
+import random
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision.transforms.functional as TF
+from torch.utils.data import Dataset
+import kiui
+from core.options import Options
+from core.utils import get_rays, grid_distortion, orbit_camera_jitter
+IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
+class ObjaverseDataset(Dataset):
+    """Example multi-view dataset yielding UNet inputs, supervision images and rasterizer cameras.
+    This is a template: the constructor raises until the TODO placeholders (the
+    item list file and ``self.client``, which is never assigned in this class)
+    are filled in.
+    """
+    def _warn(self):
+        # unconditional barrier; __init__ calls this before doing anything else
+        raise NotImplementedError('this dataset is just an example and cannot be used directly, you should modify it to your own setting! (search keyword TODO)')
+    def __init__(self, opt: Options, training=True):
+        self.opt = opt
+        self.training = training
+        # TODO: remove this barrier
+        self._warn()
+        # TODO: load the list of objects for training
+        self.items = []
+        with open('TODO: file containing the list', 'r') as f:
+            for line in f.readlines():
+                self.items.append(line.strip())
+        # naive split: the last batch_size items are held out for evaluation
+        if self.training:
+            self.items = self.items[:-self.opt.batch_size]
+        else:
+            self.items = self.items[-self.opt.batch_size:]
+        # default camera intrinsics
+        self.tan_half_fov = np.tan(0.5 * np.deg2rad(self.opt.fovy))
+        # projection matrix built from fovy / znear / zfar; it is applied as
+        # `cam_view @ self.proj_matrix` in __getitem__
+        self.proj_matrix = torch.zeros(4, 4, dtype=torch.float32)
+        self.proj_matrix[0, 0] = 1 / self.tan_half_fov
+        self.proj_matrix[1, 1] = 1 / self.tan_half_fov
+        self.proj_matrix[2, 2] = (self.opt.zfar + self.opt.znear) / (self.opt.zfar - self.opt.znear)
+        self.proj_matrix[3, 2] = - (self.opt.zfar * self.opt.znear) / (self.opt.zfar - self.opt.znear)
+        self.proj_matrix[2, 3] = 1
+    def __len__(self):
+        return len(self.items)
+    def __getitem__(self, idx):
+        """Load one object and return a dict with the keys 'input', 'images_output',
+        'masks_output', 'cam_view', 'cam_view_proj' and 'cam_pos'."""
+        uid = self.items[idx]
+        results = {}
+        # load num_views images
+        images = []
+        masks = []
+        cam_poses = []
+        vid_cnt = 0
+        # TODO: choose views, based on your rendering settings
+        if self.training:
+            # input views are drawn from ids 36..72, followed by a random permutation of ids 0..99
+            vids = np.random.permutation(np.arange(36, 73))[:self.opt.num_input_views].tolist() + np.random.permutation(100).tolist()
+        else:
+            # fixed views: every 4th id in 36..72, followed by ids 0..99 in order
+            vids = np.arange(36, 73, 4).tolist() + np.arange(100).tolist()
+        for vid in vids:
+            image_path = os.path.join(uid, 'rgb', f'{vid:03d}.png')
+            camera_path = os.path.join(uid, 'pose', f'{vid:03d}.txt')
+            try:
+                # TODO: load data (modify self.client here)
+                image = np.frombuffer(self.client.get(image_path), np.uint8)
+                image = torch.from_numpy(cv2.imdecode(image, cv2.IMREAD_UNCHANGED).astype(np.float32) / 255) # [512, 512, 4] in [0, 1]
+                c2w = [float(t) for t in self.client.get(camera_path).decode().strip().split(' ')]
+                c2w = torch.tensor(c2w, dtype=torch.float32).reshape(4, 4)
+            except Exception as e:
+                # any view that fails to load or parse is skipped without a message
+                # print(f'[WARN] dataset {uid} {vid}: {e}')
+                continue
+            # TODO: you may have a different camera system
+            # blender world + opencv cam --> opengl world & cam
+            c2w[1] *= -1
+            c2w[[1, 2]] = c2w[[2, 1]]
+            c2w[:3, 1:3] *= -1 # invert up and forward direction
+            # scale up radius to fully use the [-1, 1]^3 space!
+            c2w[:3, 3] *= self.opt.cam_radius / 1.5 # 1.5 is the default scale
+            image = image.permute(2, 0, 1) # [4, 512, 512]
+            mask = image[3:4] # [1, 512, 512]
+            image = image[:3] * mask + (1 - mask) # [3, 512, 512], to white bg
+            image = image[[2,1,0]].contiguous() # bgr to rgb
+            images.append(image)
+            masks.append(mask.squeeze(0))
+            cam_poses.append(c2w)
+            vid_cnt += 1
+            # stop as soon as enough valid views have been collected
+            if vid_cnt == self.opt.num_views:
+                break
+        if vid_cnt < self.opt.num_views:
+            print(f'[WARN] dataset {uid}: not enough valid views, only {vid_cnt} views found!')
+            # pad by repeating the last valid view; note that images[-1] raises
+            # IndexError when no view at all could be loaded
+            n = self.opt.num_views - vid_cnt
+            images = images + [images[-1]] * n
+            masks = masks + [masks[-1]] * n
+            cam_poses = cam_poses + [cam_poses[-1]] * n
+        images = torch.stack(images, dim=0) # [V, C, H, W]
+        masks = torch.stack(masks, dim=0) # [V, H, W]
+        cam_poses = torch.stack(cam_poses, dim=0) # [V, 4, 4]
+        # normalized camera feats as in paper (transform the first pose to a fixed position)
+        # i.e. after this, cam_poses[0] is identity rotation with translation (0, 0, cam_radius)
+        transform = torch.tensor([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, self.opt.cam_radius], [0, 0, 0, 1]], dtype=torch.float32) @ torch.inverse(cam_poses[0])
+        cam_poses = transform.unsqueeze(0) @ cam_poses  # [V, 4, 4]
+        # the first num_input_views views are the network input
+        images_input = F.interpolate(images[:self.opt.num_input_views].clone(), size=(self.opt.input_size, self.opt.input_size), mode='bilinear', align_corners=False) # [V, C, H, W]
+        cam_poses_input = cam_poses[:self.opt.num_input_views].clone()
+        # data augmentation (the first input view is never augmented)
+        if self.training:
+            # apply random grid distortion to simulate 3D inconsistency
+            if random.random() < self.opt.prob_grid_distortion:
+                images_input[1:] = grid_distortion(images_input[1:])
+            # apply camera jittering (only to input!)
+            if random.random() < self.opt.prob_cam_jitter:
+                cam_poses_input[1:] = orbit_camera_jitter(cam_poses_input[1:])
+        images_input = TF.normalize(images_input, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)
+        # resize render ground-truth images, range still in [0, 1]
+        results['images_output'] = F.interpolate(images, size=(self.opt.output_size, self.opt.output_size), mode='bilinear', align_corners=False) # [V, C, output_size, output_size]
+        results['masks_output'] = F.interpolate(masks.unsqueeze(1), size=(self.opt.output_size, self.opt.output_size), mode='bilinear', align_corners=False) # [V, 1, output_size, output_size]
+        # build rays for input views
+        rays_embeddings = []
+        for i in range(self.opt.num_input_views):
+            rays_o, rays_d = get_rays(cam_poses_input[i], self.opt.input_size, self.opt.input_size, self.opt.fovy) # [h, w, 3]
+            # Plucker coordinates: (origin x direction, direction)
+            rays_plucker = torch.cat([torch.cross(rays_o, rays_d, dim=-1), rays_d], dim=-1) # [h, w, 6]
+            rays_embeddings.append(rays_plucker)
+        rays_embeddings = torch.stack(rays_embeddings, dim=0).permute(0, 3, 1, 2).contiguous() # [V, 6, h, w]
+        final_input = torch.cat([images_input, rays_embeddings], dim=1) # [V=4, 9, H, W]
+        results['input'] = final_input
+        # opengl to colmap camera for gaussian renderer
+        cam_poses[:, :3, 1:3] *= -1 # invert up & forward direction
+        # cameras needed by gaussian rasterizer
+        cam_view = torch.inverse(cam_poses).transpose(1, 2) # [V, 4, 4]
+        cam_view_proj = cam_view @ self.proj_matrix # [V, 4, 4]
+        # NOTE(review): this is the negated translation column of the pose - confirm
+        # it matches the camera position the renderer expects
+        cam_pos = - cam_poses[:, :3, 3] # [V, 3]
+        results['cam_view'] = cam_view
+        results['cam_view_proj'] = cam_view_proj
+        results['cam_pos'] = cam_pos
+        return results

core/unet.py ADDED Viewed

	@@ -0,0 +1,319 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from typing import Tuple, Literal
+from functools import partial
+from core.attention import MemEffAttention
+class MVAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+        groups: int = 32,
+        eps: float = 1e-5,
+        residual: bool = True,
+        skip_scale: float = 1,
+        num_frames: int = 4, # WARN: hardcoded!
+    ):
+        super().__init__()
+        self.residual = residual
+        self.skip_scale = skip_scale
+        self.num_frames = num_frames
+        self.norm = nn.GroupNorm(num_groups=groups, num_channels=dim, eps=eps, affine=True)
+        self.attn = MemEffAttention(dim, num_heads, qkv_bias, proj_bias, attn_drop, proj_drop)
+    def forward(self, x):
+        # x: [B*V, C, H, W]
+        BV, C, H, W = x.shape
+        B = BV // self.num_frames # assert BV % self.num_frames == 0
+        res = x
+        x = self.norm(x)
+        x = x.reshape(B, self.num_frames, C, H, W).permute(0, 1, 3, 4, 2).reshape(B, -1, C)
+        x = self.attn(x)
+        x = x.reshape(B, self.num_frames, H, W, C).permute(0, 1, 4, 2, 3).reshape(BV, C, H, W)
+        if self.residual:
+            x = (x + res) * self.skip_scale
+        return x
+class ResnetBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        resample: Literal['default', 'up', 'down'] = 'default',
+        groups: int = 32,
+        eps: float = 1e-5,
+        skip_scale: float = 1, # multiplied to output
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.skip_scale = skip_scale
+        self.norm1 = nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.norm2 = nn.GroupNorm(num_groups=groups, num_channels=out_channels, eps=eps, affine=True)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.act = F.silu
+        self.resample = None
+        if resample == 'up':
+            self.resample = partial(F.interpolate, scale_factor=2.0, mode="nearest")
+        elif resample == 'down':
+            self.resample = nn.AvgPool2d(kernel_size=2, stride=2)
+        self.shortcut = nn.Identity()
+        if self.in_channels != self.out_channels:
+            self.shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=True)
+    def forward(self, x):
+        res = x
+        x = self.norm1(x)
+        x = self.act(x)
+        if self.resample:
+            res = self.resample(res)
+            x = self.resample(x)
+        x = self.conv1(x)
+        x = self.norm2(x)
+        x = self.act(x)
+        x = self.conv2(x)
+        x = (x + self.shortcut(res)) * self.skip_scale
+        return x
+class DownBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        num_layers: int = 1,
+        downsample: bool = True,
+        attention: bool = True,
+        attention_heads: int = 16,
+        skip_scale: float = 1,
+    ):
+        super().__init__()
+        nets = []
+        attns = []
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            nets.append(ResnetBlock(in_channels, out_channels, skip_scale=skip_scale))
+            if attention:
+                attns.append(MVAttention(out_channels, attention_heads, skip_scale=skip_scale))
+            else:
+                attns.append(None)
+        self.nets = nn.ModuleList(nets)
+        self.attns = nn.ModuleList(attns)
+        self.downsample = None
+        if downsample:
+            self.downsample = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2, padding=1)
+    def forward(self, x):
+        xs = []
+        for attn, net in zip(self.attns, self.nets):
+            x = net(x)
+            if attn:
+                x = attn(x)
+            xs.append(x)
+        if self.downsample:
+            x = self.downsample(x)
+            xs.append(x)
+        return x, xs
+class MidBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        num_layers: int = 1,
+        attention: bool = True,
+        attention_heads: int = 16,
+        skip_scale: float = 1,
+    ):
+        super().__init__()
+        nets = []
+        attns = []
+        # first layer
+        nets.append(ResnetBlock(in_channels, in_channels, skip_scale=skip_scale))
+        # more layers
+        for i in range(num_layers):
+            nets.append(ResnetBlock(in_channels, in_channels, skip_scale=skip_scale))
+            if attention:
+                attns.append(MVAttention(in_channels, attention_heads, skip_scale=skip_scale))
+            else:
+                attns.append(None)
+        self.nets = nn.ModuleList(nets)
+        self.attns = nn.ModuleList(attns)
+    def forward(self, x):
+        x = self.nets[0](x)
+        for attn, net in zip(self.attns, self.nets[1:]):
+            if attn:
+                x = attn(x)
+            x = net(x)
+        return x
+class UpBlock(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_out_channels: int,
+        out_channels: int,
+        num_layers: int = 1,
+        upsample: bool = True,
+        attention: bool = True,
+        attention_heads: int = 16,
+        skip_scale: float = 1,
+    ):
+        super().__init__()
+        nets = []
+        attns = []
+        for i in range(num_layers):
+            cin = in_channels if i == 0 else out_channels
+            cskip = prev_out_channels if (i == num_layers - 1) else out_channels
+            nets.append(ResnetBlock(cin + cskip, out_channels, skip_scale=skip_scale))
+            if attention:
+                attns.append(MVAttention(out_channels, attention_heads, skip_scale=skip_scale))
+            else:
+                attns.append(None)
+        self.nets = nn.ModuleList(nets)
+        self.attns = nn.ModuleList(attns)
+        self.upsample = None
+        if upsample:
+            self.upsample = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x, xs):
+        for attn, net in zip(self.attns, self.nets):
+            res_x = xs[-1]
+            xs = xs[:-1]
+            x = torch.cat([x, res_x], dim=1)
+            x = net(x)
+            if attn:
+                x = attn(x)
+        if self.upsample:
+            x = F.interpolate(x, scale_factor=2.0, mode='nearest')
+            x = self.upsample(x)
+        return x
+# it could be asymmetric!
+class UNet(nn.Module):
+    def __init__(
+        self,
+        in_channels: int = 3,
+        out_channels: int = 3,
+        down_channels: Tuple[int, ...] = (64, 128, 256, 512, 1024),
+        down_attention: Tuple[bool, ...] = (False, False, False, True, True),
+        mid_attention: bool = True,
+        up_channels: Tuple[int, ...] = (1024, 512, 256),
+        up_attention: Tuple[bool, ...] = (True, True, False),
+        layers_per_block: int = 2,
+        skip_scale: float = np.sqrt(0.5),
+    ):
+        super().__init__()
+        # first
+        self.conv_in = nn.Conv2d(in_channels, down_channels[0], kernel_size=3, stride=1, padding=1)
+        # down
+        down_blocks = []
+        cout = down_channels[0]
+        for i in range(len(down_channels)):
+            cin = cout
+            cout = down_channels[i]
+            down_blocks.append(DownBlock(
+                cin, cout,
+                num_layers=layers_per_block,
+                downsample=(i != len(down_channels) - 1), # not final layer
+                attention=down_attention[i],
+                skip_scale=skip_scale,
+            ))
+        self.down_blocks = nn.ModuleList(down_blocks)
+        # mid
+        self.mid_block = MidBlock(down_channels[-1], attention=mid_attention, skip_scale=skip_scale)
+        # up
+        up_blocks = []
+        cout = up_channels[0]
+        for i in range(len(up_channels)):
+            cin = cout
+            cout = up_channels[i]
+            cskip = down_channels[max(-2 - i, -len(down_channels))] # for assymetric
+            up_blocks.append(UpBlock(
+                cin, cskip, cout,
+                num_layers=layers_per_block + 1, # one more layer for up
+                upsample=(i != len(up_channels) - 1), # not final layer
+                attention=up_attention[i],
+                skip_scale=skip_scale,
+            ))
+        self.up_blocks = nn.ModuleList(up_blocks)
+        # last
+        self.norm_out = nn.GroupNorm(num_channels=up_channels[-1], num_groups=32, eps=1e-5)
+        self.conv_out = nn.Conv2d(up_channels[-1], out_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x):
+        # x: [B, Cin, H, W]
+        # first
+        x = self.conv_in(x)
+        # down
+        xss = [x]
+        for block in self.down_blocks:
+            x, xs = block(x)
+            xss.extend(xs)
+        # mid
+        x = self.mid_block(x)
+        # up
+        for block in self.up_blocks:
+            xs = xss[-len(block.nets):]
+            xss = xss[:-len(block.nets)]
+            x = block(x, xs)
+        # last
+        x = self.norm_out(x)
+        x = F.silu(x)
+        x = self.conv_out(x) # [B, Cout, H', W']
+        return x

core/utils.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import roma
+from kiui.op import safe_normalize
+def get_rays(pose, h, w, fovy, opengl=True):
+    x, y = torch.meshgrid(
+        torch.arange(w, device=pose.device),
+        torch.arange(h, device=pose.device),
+        indexing="xy",
+    )
+    x = x.flatten()
+    y = y.flatten()
+    cx = w * 0.5
+    cy = h * 0.5
+    focal = h * 0.5 / np.tan(0.5 * np.deg2rad(fovy))
+    camera_dirs = F.pad(
+        torch.stack(
+            [
+                (x - cx + 0.5) / focal,
+                (y - cy + 0.5) / focal * (-1.0 if opengl else 1.0),
+            ],
+            dim=-1,
+        ),
+        (0, 1),
+        value=(-1.0 if opengl else 1.0),
+    )  # [hw, 3]
+    rays_d = camera_dirs @ pose[:3, :3].transpose(0, 1)  # [hw, 3]
+    rays_o = pose[:3, 3].unsqueeze(0).expand_as(rays_d) # [hw, 3]
+    rays_o = rays_o.view(h, w, 3)
+    rays_d = safe_normalize(rays_d).view(h, w, 3)
+    return rays_o, rays_d
+def orbit_camera_jitter(poses, strength=0.1):
+    # poses: [B, 4, 4], assume orbit camera in opengl format
+    # random orbital rotate
+    B = poses.shape[0]
+    rotvec_x = poses[:, :3, 1] * strength * np.pi * (torch.rand(B, 1, device=poses.device) * 2 - 1)
+    rotvec_y = poses[:, :3, 0] * strength * np.pi / 2 * (torch.rand(B, 1, device=poses.device) * 2 - 1)
+    rot = roma.rotvec_to_rotmat(rotvec_x) @ roma.rotvec_to_rotmat(rotvec_y)
+    R = rot @ poses[:, :3, :3]
+    T = rot @ poses[:, :3, 3:]
+    new_poses = poses.clone()
+    new_poses[:, :3, :3] = R
+    new_poses[:, :3, 3:] = T
+    return new_poses
+def grid_distortion(images, strength=0.5):
+    # images: [B, C, H, W]
+    # num_steps: int, grid resolution for distortion
+    # strength: float in [0, 1], strength of distortion
+    B, C, H, W = images.shape
+    num_steps = np.random.randint(8, 17)
+    grid_steps = torch.linspace(-1, 1, num_steps)
+    # have to loop batch...
+    grids = []
+    for b in range(B):
+        # construct displacement
+        x_steps = torch.linspace(0, 1, num_steps) # [num_steps], inclusive
+        x_steps = (x_steps + strength * (torch.rand_like(x_steps) - 0.5) / (num_steps - 1)).clamp(0, 1) # perturb
+        x_steps = (x_steps * W).long() # [num_steps]
+        x_steps[0] = 0
+        x_steps[-1] = W
+        xs = []
+        for i in range(num_steps - 1):
+            xs.append(torch.linspace(grid_steps[i], grid_steps[i + 1], x_steps[i + 1] - x_steps[i]))
+        xs = torch.cat(xs, dim=0) # [W]
+        y_steps = torch.linspace(0, 1, num_steps) # [num_steps], inclusive
+        y_steps = (y_steps + strength * (torch.rand_like(y_steps) - 0.5) / (num_steps - 1)).clamp(0, 1) # perturb
+        y_steps = (y_steps * H).long() # [num_steps]
+        y_steps[0] = 0
+        y_steps[-1] = H
+        ys = []
+        for i in range(num_steps - 1):
+            ys.append(torch.linspace(grid_steps[i], grid_steps[i + 1], y_steps[i + 1] - y_steps[i]))
+        ys = torch.cat(ys, dim=0) # [H]
+        # construct grid
+        grid_x, grid_y = torch.meshgrid(xs, ys, indexing='xy') # [H, W]
+        grid = torch.stack([grid_x, grid_y], dim=-1) # [H, W, 2]
+        grids.append(grid)
+    grids = torch.stack(grids, dim=0).to(images.device) # [B, H, W, 2]
+    # grid sample
+    images = F.grid_sample(images, grids, align_corners=False)
+    return images

data_test/catstatue.ply ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:29d36a368577579338f897b6098b4d96ab7d0bcf0c61ebb8249af56b72b0c7aa
+size 2399737

requirements.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+torch==2.1.0+cu121
+torchvision==0.16.0+cu121
+torchaudio==2.1.0+cu121
+tyro
+PyMCubes
+nerfacc
+trimesh
+pymeshlab
+wheel
+tqdm
+opencv-python
+ninja
+plyfile
+xatlas
+scikit-learn
+pygltflib
+gradio
+git+https://github.com/ashawkey/kiuikit.git
+https://github.com/camenduru/LGM-replicate/releases/download/replicate/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl
+https://github.com/camenduru/wheels/releases/download/colab/nvdiffrast-0.3.1-py3-none-any.whl