JiantaoLin committed on
Commit 98bebfc · 0 Parent(s):

initial commit

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. app.py +319 -0
  2. extension/put_here.txt +0 -0
  3. image_to_mesh.py +437 -0
  4. models/ISOMER/__init__.py +0 -0
  5. models/ISOMER/data/__init__.py +0 -0
  6. models/ISOMER/data/utils.py +87 -0
  7. models/ISOMER/mesh_reconstruction/__init__.py +0 -0
  8. models/ISOMER/mesh_reconstruction/func.py +227 -0
  9. models/ISOMER/mesh_reconstruction/opt.py +191 -0
  10. models/ISOMER/mesh_reconstruction/recon.py +58 -0
  11. models/ISOMER/mesh_reconstruction/refine.py +86 -0
  12. models/ISOMER/mesh_reconstruction/remesh.py +363 -0
  13. models/ISOMER/mesh_reconstruction/render.py +142 -0
  14. models/ISOMER/model/__init__.py +0 -0
  15. models/ISOMER/model/inference_pipeline.py +189 -0
  16. models/ISOMER/projection_func.py +86 -0
  17. models/ISOMER/reconstruction_func.py +88 -0
  18. models/ISOMER/scripts/__init__.py +0 -0
  19. models/ISOMER/scripts/all_typing.py +42 -0
  20. models/ISOMER/scripts/fast_geo.py +86 -0
  21. models/ISOMER/scripts/load_onnx.py +48 -0
  22. models/ISOMER/scripts/mesh_init.py +142 -0
  23. models/ISOMER/scripts/normal_to_height_map.py +205 -0
  24. models/ISOMER/scripts/proj_commands.py +69 -0
  25. models/ISOMER/scripts/project_mesh.py +401 -0
  26. models/ISOMER/scripts/refine_lr_to_sr.py +60 -0
  27. models/ISOMER/scripts/sd_model_zoo.py +131 -0
  28. models/ISOMER/scripts/upsampler.py +260 -0
  29. models/ISOMER/scripts/utils.py +611 -0
  30. models/lrm/config/PRM_inference.yaml +22 -0
  31. models/lrm/models/__init__.py +0 -0
  32. models/lrm/models/decoder/__init__.py +0 -0
  33. models/lrm/models/decoder/transformer.py +123 -0
  34. models/lrm/models/encoder/__init__.py +0 -0
  35. models/lrm/models/encoder/dino.py +550 -0
  36. models/lrm/models/encoder/dino_wrapper.py +80 -0
  37. models/lrm/models/geometry/__init__.py +7 -0
  38. models/lrm/models/geometry/camera/__init__.py +16 -0
  39. models/lrm/models/geometry/camera/perspective_camera.py +35 -0
  40. models/lrm/models/geometry/render/__init__.py +8 -0
  41. models/lrm/models/geometry/render/neural_render.py +293 -0
  42. models/lrm/models/geometry/render/renderutils/__init__.py +11 -0
  43. models/lrm/models/geometry/render/renderutils/bsdf.py +151 -0
  44. models/lrm/models/geometry/render/renderutils/c_src/bsdf.cu +710 -0
  45. models/lrm/models/geometry/render/renderutils/c_src/bsdf.h +84 -0
  46. models/lrm/models/geometry/render/renderutils/c_src/common.cpp +74 -0
  47. models/lrm/models/geometry/render/renderutils/c_src/common.h +41 -0
  48. models/lrm/models/geometry/render/renderutils/c_src/cubemap.cu +350 -0
  49. models/lrm/models/geometry/render/renderutils/c_src/cubemap.h +38 -0
  50. models/lrm/models/geometry/render/renderutils/c_src/loss.cu +210 -0
app.py ADDED
@@ -0,0 +1,319 @@
import gradio as gr
import os
import subprocess
import shlex
subprocess.run(
    shlex.split(
        "pip install ./extension/nvdiffrast-0.3.1.torch-cp310-cp310-linux_x86_64.whl --force-reinstall --no-deps"
    )
)

subprocess.run(
    shlex.split(
        "pip install ./extension/renderutils_plugin-1.0-cp310-cp310-linux_x86_64.whl --force-reinstall --no-deps"
    )
)
import torch
import numpy as np
from PIL import Image
from einops import rearrange
from diffusers import FluxPipeline
from models.lrm.utils.camera_util import get_flux_input_cameras
from models.lrm.utils.infer_util import save_video
from models.lrm.utils.mesh_util import save_obj, save_obj_with_mtl
from models.lrm.utils.render_utils import rotate_x, rotate_y
from models.lrm.utils.train_util import instantiate_from_config
from models.ISOMER.reconstruction_func import reconstruction
from models.ISOMER.projection_func import projection
import os
from einops import rearrange
from omegaconf import OmegaConf
import spaces
import torch
import numpy as np
import trimesh
import torchvision
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms
from torchvision.transforms import v2
from diffusers import HeunDiscreteScheduler
from diffusers import FluxPipeline
from pytorch_lightning import seed_everything
import os
from huggingface_hub import hf_hub_download


from utils.tool import NormalTransfer, get_background, get_render_cameras_video, load_mipmap, render_frames

device = "cuda"
resolution = 512
save_dir = "./outputs"
normal_transfer = NormalTransfer()
isomer_azimuths = torch.from_numpy(np.array([0, 90, 180, 270])).float().to(device)
isomer_elevations = torch.from_numpy(np.array([5, 5, 5, 5])).float().to(device)
isomer_radius = 4.5
isomer_geo_weights = torch.from_numpy(np.array([1, 0.9, 1, 0.9])).float().to(device)
isomer_color_weights = torch.from_numpy(np.array([1, 0.5, 1, 0.5])).float().to(device)

# model initialization and loading
# flux
flux_pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to(device=device, dtype=torch.bfloat16)
flux_lora_ckpt_path = hf_hub_download(repo_id="LTT/xxx-ckpt", filename="rgb_normal_large.safetensors", repo_type="model")
flux_pipe.load_lora_weights(flux_lora_ckpt_path)

flux_pipe.to(device=device, dtype=torch.bfloat16)
generator = torch.Generator(device=device).manual_seed(10)

# lrm
config = OmegaConf.load("./models/lrm/config/PRM_inference.yaml")
model_config = config.model_config
infer_config = config.infer_config
model = instantiate_from_config(model_config)
model_ckpt_path = hf_hub_download(repo_id="LTT/PRM", filename="final_ckpt.ckpt", repo_type="model")
state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.')}
model.load_state_dict(state_dict, strict=True)

model = model.to(device)
model.init_flexicubes_geometry(device, fovy=50.0)
model = model.eval()

@spaces.GPU
def lrm_reconstructions(image, input_cameras, save_path=None, name="temp", export_texmap=False, if_save_video=False):
    images = image.unsqueeze(0).to(device)
    images = v2.functional.resize(images, 512, interpolation=3, antialias=True).clamp(0, 1)
    # breakpoint()
    with torch.no_grad():
        # get triplane
        planes = model.forward_planes(images, input_cameras)

        mesh_path_idx = os.path.join(save_path, f'{name}.obj')

        mesh_out = model.extract_mesh(
            planes,
            use_texture_map=export_texmap,
            **infer_config,
        )
        if export_texmap:
            vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out
            save_obj_with_mtl(
                vertices.data.cpu().numpy(),
                uvs.data.cpu().numpy(),
                faces.data.cpu().numpy(),
                mesh_tex_idx.data.cpu().numpy(),
                tex_map.permute(1, 2, 0).data.cpu().numpy(),
                mesh_path_idx,
            )
        else:
            vertices, faces, vertex_colors = mesh_out
            save_obj(vertices, faces, vertex_colors, mesh_path_idx)
        print(f"Mesh saved to {mesh_path_idx}")

        render_size = 512
        if if_save_video:
            video_path_idx = os.path.join(save_path, f'{name}.mp4')
            render_size = infer_config.render_resolution
            ENV = load_mipmap("models/lrm/env_mipmap/6")
            materials = (0.0, 0.9)

            all_mv, all_mvp, all_campos = get_render_cameras_video(
                batch_size=1,
                M=240,
                radius=4.5,
                elevation=(90, 60.0),
                is_flexicubes=True,
                fov=30
            )

            frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
                model,
                planes,
                render_cameras=all_mvp,
                camera_pos=all_campos,
                env=ENV,
                materials=materials,
                render_size=render_size,
                chunk_size=20,
                is_flexicubes=True,
            )
            normals = (torch.nn.functional.normalize(normals) + 1) / 2
            normals = normals * alphas + (1 - alphas)
            all_frames = torch.cat([frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals], dim=3)

            save_video(
                all_frames,
                video_path_idx,
                fps=30,
            )
            print(f"Video saved to {video_path_idx}")

    return vertices, faces


def local_normal_global_transform(local_normal_images, azimuths_deg, elevations_deg):
    if local_normal_images.min() >= 0:
        local_normal = local_normal_images.float() * 2 - 1
    else:
        local_normal = local_normal_images.float()
    global_normal = normal_transfer.trans_local_2_global(local_normal, azimuths_deg, elevations_deg, radius=4.5, for_lotus=False)
    global_normal[..., 0] *= -1
    global_normal = (global_normal + 1) / 2
    global_normal = global_normal.permute(0, 3, 1, 2)
    return global_normal

# generate multi-view images
@spaces.GPU
def generate_multi_view_images(prompt, seed):
    generator = torch.manual_seed(seed)
    with torch.no_grad():
        images = flux_pipe(
            prompt=prompt,
            num_inference_steps=30,
            guidance_scale=3.5,
            num_images_per_prompt=1,
            width=resolution * 4,
            height=resolution * 2,
            output_type='np',
            generator=generator
        ).images
    return images

# reconstruct the 3D model
@spaces.GPU
def reconstruct_3d_model(images, prompt):
    rgb_normal_grid = images
    save_dir_path = os.path.join(save_dir, prompt.replace(" ", "_"))
    os.makedirs(save_dir_path, exist_ok=True)

    images = torch.from_numpy(rgb_normal_grid).squeeze(0).permute(2, 0, 1).contiguous().float()  # (3, 1024, 2048)
    images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=2, m=4)  # (8, 3, 512, 512)
    rgb_multi_view = images[:4, :3, :, :]
    normal_multi_view = images[4:, :3, :, :]
    multi_view_mask = get_background(normal_multi_view)
    rgb_multi_view = rgb_multi_view * multi_view_mask + (1 - multi_view_mask)  # composite onto white with the mask (was `rgb_multi_view * rgb_multi_view`; matches image_to_mesh.py)
    input_cameras = get_flux_input_cameras(batch_size=1, radius=4.2, fov=30).to(device)
    vertices, faces = lrm_reconstructions(rgb_multi_view, input_cameras, save_path=save_dir_path, name='lrm', export_texmap=False, if_save_video=False)
    # local normal to global normal

    global_normal = local_normal_global_transform(normal_multi_view.permute(0, 2, 3, 1), isomer_azimuths, isomer_elevations)
    global_normal = global_normal * multi_view_mask + (1 - multi_view_mask)

    global_normal = global_normal.permute(0, 2, 3, 1)
    rgb_multi_view = rgb_multi_view.permute(0, 2, 3, 1)
    multi_view_mask = multi_view_mask.permute(0, 2, 3, 1).squeeze(-1)
    vertices = torch.from_numpy(vertices).to(device)
    faces = torch.from_numpy(faces).to(device)
    vertices = vertices @ rotate_x(np.pi / 2, device=vertices.device)[:3, :3]
    vertices = vertices @ rotate_y(np.pi / 2, device=vertices.device)[:3, :3]

    # global_normal: B,H,W,3
    # multi_view_mask: B,H,W
    # rgb_multi_view: B,H,W,3

    meshes = reconstruction(
        normal_pils=global_normal,
        masks=multi_view_mask,
        weights=isomer_geo_weights,
        fov=30,
        radius=isomer_radius,
        camera_angles_azi=isomer_azimuths,
        camera_angles_ele=isomer_elevations,
        expansion_weight_stage1=0.1,
        init_type="file",
        init_verts=vertices,
        init_faces=faces,
        stage1_steps=0,
        stage2_steps=50,
        start_edge_len_stage1=0.1,
        end_edge_len_stage1=0.02,
        start_edge_len_stage2=0.02,
        end_edge_len_stage2=0.005,
    )


    save_glb_addr = projection(
        meshes,
        masks=multi_view_mask,
        images=rgb_multi_view,
        azimuths=isomer_azimuths,
        elevations=isomer_elevations,
        weights=isomer_color_weights,
        fov=30,
        radius=isomer_radius,
        save_dir=f"{save_dir_path}/ISOMER/",
    )

    return save_glb_addr

# Gradio interface function
def gradio_pipeline(prompt, seed):
    # generate multi-view images
    rgb_normal_grid = generate_multi_view_images(prompt, seed)
    image_preview = Image.fromarray((rgb_normal_grid[0] * 255).astype(np.uint8))  # take the single image out of the batch before converting to PIL

    # 3d reconstruction


    # reconstruct the 3D model and return the glb path
    save_glb_addr = reconstruct_3d_model(rgb_normal_grid, prompt)

    return image_preview, save_glb_addr

# Gradio Blocks app
with gr.Blocks() as demo:
    with gr.Row(variant="panel"):
        # left-hand input column
        with gr.Column():
            with gr.Row():
                prompt_input = gr.Textbox(
                    label="Enter Prompt",
                    placeholder="Describe your 3D model...",
                    lines=2,
                    elem_id="prompt_input"
                )

            with gr.Row():
                sample_seed = gr.Number(value=42, label="Seed Value", precision=0)

            with gr.Row():
                submit = gr.Button("Generate", elem_id="generate", variant="primary")

            with gr.Row(variant="panel"):
                gr.Markdown("Examples:")
                gr.Examples(
                    examples=[
                        ["a castle on a hill"],
                        ["an owl wearing a hat"],
                        ["a futuristic car"]
                    ],
                    inputs=[prompt_input],
                    label="Prompt Examples"
                )

        # right-hand output column
        with gr.Column():
            with gr.Row():
                rgb_normal_grid_image = gr.Image(
                    label="RGB Normal Grid",
                    type="pil",
                    interactive=False
                )

            with gr.Row():
                with gr.Tab("GLB"):
                    output_glb_model = gr.Model3D(
                        label="Generated 3D Model (GLB Format)",
                        interactive=False
                    )
                    gr.Markdown("Download the model for proper visualization.")

    # wiring
    submit.click(
        fn=gradio_pipeline, inputs=[prompt_input, sample_seed],
        outputs=[rgb_normal_grid_image, output_glb_model]
    )

# launch the app
demo.queue(max_size=10)
demo.launch(server_port=1211)
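The grid bookkeeping in reconstruct_3d_model is easy to sanity-check in isolation. A minimal sketch with a dummy tensor (not part of the commit): the Flux LoRA emits a 2×4 grid of 512-pixel tiles, the top row holding the four RGB views and the bottom row the four normal views.

import torch
from einops import rearrange

grid = torch.zeros(3, 2 * 512, 4 * 512)                     # (c, n*h, m*w): the 1024x2048 RGB/normal grid
views = rearrange(grid, 'c (n h) (m w) -> (n m) c h w', n=2, m=4)
assert views.shape == (8, 3, 512, 512)                       # views 0-3: RGB, views 4-7: normals
rgb_views, normal_views = views[:4], views[4:]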
extension/put_here.txt ADDED
File without changes
image_to_mesh.py ADDED
@@ -0,0 +1,437 @@
import os
from einops import rearrange
from omegaconf import OmegaConf
import torch
import numpy as np
import trimesh
import torchvision
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms
from torchvision.transforms import v2
from transformers import AutoProcessor, AutoModelForCausalLM
import rembg
from diffusers import FluxPipeline, FluxControlNetImg2ImgPipeline
from diffusers.models.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler, HeunDiscreteScheduler
from pytorch_lightning import seed_everything
import os

from models.ISOMER.reconstruction_func import reconstruction
from models.ISOMER.projection_func import projection
from models.lrm.utils.infer_util import remove_background, resize_foreground, save_video
from models.lrm.utils.mesh_util import save_obj, save_obj_with_mtl
from models.lrm.utils.render_utils import rotate_x, rotate_y
from models.lrm.utils.train_util import instantiate_from_config
from models.lrm.utils.camera_util import get_zero123plus_input_cameras, get_custom_zero123plus_input_cameras, get_flux_input_cameras
from utils.tool import NormalTransfer, get_render_cameras_frames, load_mipmap
from utils.tool import get_background, get_render_cameras_video, render_frames
import time

device = "cuda"
resolution = 512
save_dir = "./outputs"
zero123plus_diffusion_steps = 75
normal_transfer = NormalTransfer()
rembg_session = rembg.new_session()
isomer_azimuths = torch.from_numpy(np.array([270, 0, 90, 180])).to(device)
isomer_elevations = torch.from_numpy(np.array([5, 5, 5, 5])).to(device)
isomer_radius = 4.1
isomer_geo_weights = torch.from_numpy(np.array([1, 0.9, 1, 0.9])).float().to(device)
isomer_color_weights = torch.from_numpy(np.array([1, 0.5, 1, 0.5])).float().to(device)
# seed_everything(42)

# model initialization and loading
# flux
print('==> Loading Flux model ...')
flux_base_model_pth = "/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/models--black-forest-labs--FLUX.1-dev"
flux_controlnet = FluxControlNetModel.from_pretrained("/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/flux_controlnets/FLUX.1-dev-ControlNet-Union-Pro")
flux_pipe = FluxControlNetImg2ImgPipeline.from_pretrained(flux_base_model_pth, controlnet=[flux_controlnet], torch_dtype=torch.bfloat16).to(device=device, dtype=torch.bfloat16)

flux_pipe.load_lora_weights('./checkpoint/flux_lora/rgb_normal_large.safetensors')


flux_pipe.to(device=device, dtype=torch.bfloat16)
generator = torch.Generator(device=device).manual_seed(0)

# lrm
print('==> Loading LRM model ...')
config = OmegaConf.load("./models/lrm/config/PRM_inference.yaml")
model_config = config.model_config
infer_config = config.infer_config
model = instantiate_from_config(model_config)
model_ckpt_path = "./checkpoint/lrm/final_ckpt.ckpt"
state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.')}
model.load_state_dict(state_dict, strict=True)

model = model.to(device)
model.init_flexicubes_geometry(device, fovy=50.0)
model = model.eval()

# zero123++
print('==> Loading diffusion model ...')
zero123plus_pipeline = DiffusionPipeline.from_pretrained(
    "sudo-ai/zero123plus-v1.2",
    custom_pipeline="./models/zero123plus",
    torch_dtype=torch.float16,
)
zero123plus_pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
    zero123plus_pipeline.scheduler.config, timestep_spacing='trailing'
)
unet_ckpt_path = "./checkpoint/zero123++/flexgen_19w.ckpt"
state_dict = torch.load(unet_ckpt_path, map_location='cpu')['state_dict']
state_dict = {k[10:]: v for k, v in state_dict.items() if k.startswith('unet.unet.')}
zero123plus_pipeline.unet.load_state_dict(state_dict, strict=True)
zero123plus_pipeline = zero123plus_pipeline.to(device)

# unet_ckpt_path = "checkpoint/zero123++/diffusion_pytorch_model.bin"
# state_dict = torch.load(unet_ckpt_path, map_location='cpu')
# zero123plus_pipeline.unet.load_state_dict(state_dict, strict=True)
# zero123plus_pipeline = zero123plus_pipeline.to(device)

# florence
caption_model = AutoModelForCausalLM.from_pretrained(
    "/hpc2hdd/home/jlin695/.cache/huggingface/hub/models--multimodalart--Florence-2-large-no-flash-attn/snapshots/8db3793cf5b453b2ccfb3a4f613b403b2e6b7ca2", torch_dtype=torch.bfloat16, trust_remote_code=True,
).to(device)
caption_processor = AutoProcessor.from_pretrained("/hpc2hdd/home/jlin695/.cache/huggingface/hub/models--multimodalart--Florence-2-large-no-flash-attn/snapshots/8db3793cf5b453b2ccfb3a4f613b403b2e6b7ca2", trust_remote_code=True)

# Flux multi-view generation
def multi_view_rgb_normal_generation_with_controlnet(prompt, image, strength=1.0,
                                                     control_image=[],
                                                     control_mode=[],
                                                     control_guidance_start=None,
                                                     control_guidance_end=None,
                                                     controlnet_conditioning_scale=None,
                                                     lora_scale=1.0
                                                     ):
    control_mode_dict = {
        'canny': 0,
        'tile': 1,
        'depth': 2,
        'blur': 3,
        'pose': 4,
        'gray': 5,
        'lq': 6,
    }  # for https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Union only

    hparam_dict = {
        'prompt': prompt,
        'image': image,
        'strength': strength,
        'num_inference_steps': 30,
        'guidance_scale': 3.5,
        'num_images_per_prompt': 1,
        'width': resolution*4,
        'height': resolution*2,
        'output_type': 'np',
        'generator': generator,
        'joint_attention_kwargs': {"scale": lora_scale}
    }

    # append controlnet hparams
    if len(control_image) > 0:
        assert len(control_mode) == len(control_image)  # the number of control images must match the number of control modes

        ctrl_hparams = {
            'control_mode': [control_mode_dict[mode_] for mode_ in control_mode],
            'control_image': control_image,
            'control_guidance_start': control_guidance_start or [0.0 for i in range(len(control_image))],
            'control_guidance_end': control_guidance_end or [1.0 for i in range(len(control_image))],
            'controlnet_conditioning_scale': controlnet_conditioning_scale or [1.0 for i in range(len(control_image))],
        }

        hparam_dict.update(ctrl_hparams)

    # generate multi-view images
    with torch.no_grad():
        image = flux_pipe(
            **hparam_dict
        ).images
    return image

# captioning
def run_captioning(image):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.bfloat16

    if isinstance(image, str):  # If image is a file path
        image = Image.open(image).convert("RGB")

    prompt = "<MORE_DETAILED_CAPTION>"
    inputs = caption_processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
    # print(f"inputs {inputs}")

    generated_ids = caption_model.generate(
        input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], max_new_tokens=1024, num_beams=3
    )

    generated_text = caption_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = caption_processor.post_process_generation(
        generated_text, task=prompt, image_size=(image.width, image.height)
    )
    # print(f"parsed_answer = {parsed_answer}")
    caption_text = parsed_answer["<MORE_DETAILED_CAPTION>"].replace("The image is ", "")
    return caption_text


# zero123++ multi-view generation
def multi_view_rgb_generation(cond_img):
    # generate multi-view images
    with torch.no_grad():
        output_image = zero123plus_pipeline(
            cond_img,
            num_inference_steps=zero123plus_diffusion_steps,
            width=resolution*2,
            height=resolution*2,
        ).images[0]
    return output_image

# lrm reconstructions
def lrm_reconstructions(image, input_cameras, save_path=None, name="temp", export_texmap=False, if_save_video=False, render_azimuths=None, render_elevations=None, render_radius=None, render_fov=30):
    images = image.unsqueeze(0).to(device)
    images = v2.functional.resize(images, 512, interpolation=3, antialias=True).clamp(0, 1)
    # breakpoint()
    with torch.no_grad():
        # get triplane
        planes = model.forward_planes(images, input_cameras)

        mesh_path_idx = os.path.join(save_path, f'{name}.obj')

        mesh_out = model.extract_mesh(
            planes,
            use_texture_map=export_texmap,
            **infer_config,
        )
        if export_texmap:
            vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out
            save_obj_with_mtl(
                vertices.data.cpu().numpy(),
                uvs.data.cpu().numpy(),
                faces.data.cpu().numpy(),
                mesh_tex_idx.data.cpu().numpy(),
                tex_map.permute(1, 2, 0).data.cpu().numpy(),
                mesh_path_idx,
            )
        else:
            vertices, faces, vertex_colors = mesh_out
            save_obj(vertices, faces, vertex_colors, mesh_path_idx)
        print(f"Mesh saved to {mesh_path_idx}")

        render_size = 512
        if if_save_video:
            video_path_idx = os.path.join(save_path, f'{name}.mp4')
            render_size = infer_config.render_resolution
            ENV = load_mipmap("models/lrm/env_mipmap/6")
            materials = (0.0, 0.9)

            all_mv, all_mvp, all_campos = get_render_cameras_video(
                batch_size=1,
                M=240,
                radius=4.5,
                elevation=(90, 60.0),
                is_flexicubes=True,
                fov=30
            )

            frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
                model,
                planes,
                render_cameras=all_mvp,
                camera_pos=all_campos,
                env=ENV,
                materials=materials,
                render_size=render_size,
                chunk_size=20,
                is_flexicubes=True,
            )
            normals = (torch.nn.functional.normalize(normals) + 1) / 2
            normals = normals * alphas + (1 - alphas)
            all_frames = torch.cat([frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals], dim=3)

            # breakpoint()
            save_video(
                all_frames,
                video_path_idx,
                fps=30,
            )
            print(f"Video saved to {video_path_idx}")

        if render_azimuths is not None and render_elevations is not None and render_radius is not None:
            render_size = infer_config.render_resolution
            ENV = load_mipmap("models/lrm/env_mipmap/6")
            materials = (0.0, 0.9)
            all_mv, all_mvp, all_campos, identity_mv = get_render_cameras_frames(
                batch_size=1,
                radius=render_radius,
                azimuths=render_azimuths,
                elevations=render_elevations,
                fov=30
            )
            frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
                model,
                planes,
                render_cameras=all_mvp,
                camera_pos=all_campos,
                env=ENV,
                materials=materials,
                render_size=render_size,
                render_mv=all_mv,
                local_normal=True,
                identity_mv=identity_mv,
            )
        else:
            normals = None
            frames = None
            albedos = None

    return vertices, faces, normals, frames, albedos


def transform_normal(input_normal, azimuths_deg, elevations_deg, radius=4.5, is_global_to_local=False):
    """
    input_normal: in range [-1, 1], shape (b c h w)
    """

    input_normal = input_normal.permute(0, 2, 3, 1).cpu()

    azimuths_deg = np.array(azimuths_deg)
    elevations_deg = np.array(elevations_deg)

    if is_global_to_local:
        local_normal = normal_transfer.trans_global_2_local(input_normal, azimuths_deg, elevations_deg)
        return local_normal.permute(0, 3, 1, 2)
    else:
        global_normal = normal_transfer.trans_local_2_global(input_normal, azimuths_deg, elevations_deg, radius=radius, for_lotus=False)
        global_normal[..., 0] *= -1
        return global_normal.permute(0, 3, 1, 2)

def local_normal_global_transform(local_normal_images, azimuths_deg, elevations_deg):
    if local_normal_images.min() >= 0:
        local_normal = local_normal_images.float() * 2 - 1
    else:
        local_normal = local_normal_images.float()
    global_normal = normal_transfer.trans_local_2_global(local_normal, azimuths_deg, elevations_deg, radius=4.5, for_lotus=False)
    global_normal[..., 0] *= -1
    global_normal = (global_normal + 1) / 2
    global_normal = global_normal.permute(0, 3, 1, 2)
    return global_normal

def main():
    image_pth = "examples/蓝色小怪物.webp"
    save_dir_path = os.path.join(save_dir, image_pth.split("/")[-1].split(".")[0])
    os.makedirs(save_dir_path, exist_ok=True)
    input_image = Image.open(image_pth)
    # if not args.no_rembg:
    input_image = remove_background(input_image, rembg_session)
    input_image = resize_foreground(input_image, 0.85)

    # generate caption
    image_caption = run_captioning(image_pth)

    # generate multi-view images
    output_image = multi_view_rgb_generation(input_image)

    # lrm reconstructions
    rgb_multi_view = np.asarray(output_image, dtype=np.float32) / 255.0
    rgb_multi_view = torch.from_numpy(rgb_multi_view).squeeze(0).permute(2, 0, 1).contiguous().float()  # (3, 1024, 1024)
    rgb_multi_view = rearrange(rgb_multi_view, 'c (n h) (m w) -> (n m) c h w', n=2, m=2)  # (4, 3, 512, 512)

    input_cameras = get_custom_zero123plus_input_cameras(batch_size=1, radius=3.5, fov=30).to(device)

    vertices, faces, lrm_multi_view_normals, lrm_multi_view_rgb, lrm_multi_view_albedo = \
        lrm_reconstructions(rgb_multi_view, input_cameras, save_path=save_dir_path, name='lrm',
                            export_texmap=False, if_save_video=False, render_azimuths=isomer_azimuths,
                            render_elevations=isomer_elevations, render_radius=isomer_radius, render_fov=30)

    vertices = torch.from_numpy(vertices).to(device)
    faces = torch.from_numpy(faces).to(device)
    vertices = vertices @ rotate_x(np.pi / 2, device=vertices.device)[:3, :3]
    vertices = vertices @ rotate_y(np.pi / 2, device=vertices.device)[:3, :3]


    # lrm_3D_bundle_image = torchvision.utils.make_grid(torch.cat([lrm_multi_view_rgb.cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0).unsqueeze(0) # range [0, 1]
    lrm_3D_bundle_image = torchvision.utils.make_grid(torch.cat([rgb_multi_view[[3, 0, 1, 2]].cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0).unsqueeze(0)  # range [0, 1]
    # rgb_multi_view[[3,0,1,2]] : (B,3,H,W)
    # lrm_multi_view_normals : (B,3,H,W)
    # combined_images = 0.5 * rgb_multi_view[[3,0,1,2]].cpu() + 0.5 * (lrm_multi_view_normals.cpu() + 1) / 2
    # torchvision.utils.save_image(combined_images, os.path.join("debug_output", 'combined.png'))
    # breakpoint()
    # Use the low-quality controlnet by default, feel free to try the others
    control_image = [lrm_3D_bundle_image * 2 - 1]
    control_mode = ['tile']
    control_guidance_start = [0.0]
    control_guidance_end = [0.3]
    controlnet_conditioning_scale = [0.8]

    flux_pipe.controlnet = FluxMultiControlNetModel([flux_controlnet for _ in control_mode])
    # breakpoint()
    rgb_normal_grid = multi_view_rgb_normal_generation_with_controlnet(
        prompt=' '.join(['A grid of 2x4 multi-view image, elevation 5. White background.', image_caption]),
        image=lrm_3D_bundle_image,
        strength=0.6,
        control_image=control_image,
        control_mode=control_mode,
        control_guidance_start=control_guidance_start,
        control_guidance_end=control_guidance_end,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        lora_scale=1.0
    )  # note that rgb_normal_grid is a (b, h, w, c) numpy array

    rgb_normal_grid = torch.from_numpy(rgb_normal_grid).contiguous().float()
    rgb_normal_grid = rearrange(rgb_normal_grid.squeeze(0), '(n h) (m w) c -> (n m) c h w', n=2, m=4)  # (8, 3, 512, 512)
    rgb_multi_view = rgb_normal_grid[:4, :3, :, :].cuda()
    normal_multi_view = rgb_normal_grid[4:, :3, :, :].cuda()
    multi_view_mask = get_background(normal_multi_view).cuda()
    rgb_multi_view = rgb_multi_view * multi_view_mask + (1 - multi_view_mask)

    # local normal to global normal
    global_normal = local_normal_global_transform(normal_multi_view.permute(0, 2, 3, 1).cpu(), isomer_azimuths, isomer_elevations).cuda()

    global_normal = global_normal * multi_view_mask + (1 - multi_view_mask)

    global_normal = global_normal.permute(0, 2, 3, 1)
    multi_view_mask = multi_view_mask.squeeze(1)
    rgb_multi_view = rgb_multi_view.permute(0, 2, 3, 1)
    # global_normal: B,H,W,3
    # multi_view_mask: B,H,W
    # rgb_multi_view: B,H,W,3


    meshes = reconstruction(
        normal_pils=global_normal,
        masks=multi_view_mask,
        weights=isomer_geo_weights,
        fov=30,
        radius=isomer_radius,
        camera_angles_azi=isomer_azimuths,
        camera_angles_ele=isomer_elevations,
        expansion_weight_stage1=0.1,
        init_type="file",
        init_verts=vertices,
        init_faces=faces,
        stage1_steps=0,
        stage2_steps=50,
        start_edge_len_stage1=0.1,
        end_edge_len_stage1=0.02,
        start_edge_len_stage2=0.02,
        end_edge_len_stage2=0.005,
    )

    save_glb_addr = projection(
        meshes=meshes,
        masks=multi_view_mask,
        images=rgb_multi_view,
        azimuths=isomer_azimuths,
        elevations=isomer_elevations,
        weights=isomer_color_weights,
        fov=30,
        radius=isomer_radius,
        save_dir=f"{save_dir_path}/ISOMER/",
    )
    print(f'saved to {save_glb_addr}')



if __name__ == '__main__':
    main()
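For the zero123++ branch above, the grid is 2×2 rather than 2×4 (width and height are both resolution*2), so the split yields four views. A quick shape check with a dummy tensor standing in for the 1024×1024 output:

import torch
from einops import rearrange

grid = torch.zeros(3, 2 * 512, 2 * 512)                     # zero123++ output arranged as a 2x2 grid of 512-px views
views = rearrange(grid, 'c (n h) (m w) -> (n m) c h w', n=2, m=2)
assert views.shape == (4, 3, 512, 512)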
models/ISOMER/__init__.py ADDED
File without changes
models/ISOMER/data/__init__.py ADDED
File without changes
models/ISOMER/data/utils.py ADDED
@@ -0,0 +1,87 @@
import torch
import numpy as np
from PIL import Image
import os
from pytorch3d.io import load_obj
import trimesh
from pytorch3d.structures import Meshes
# from rembg import remove

def remove_color(arr):
    if arr.shape[-1] == 4:
        arr = arr[..., :3]

    # Convert to torch tensor
    if type(arr) is not torch.Tensor:
        arr = torch.tensor(arr, dtype=torch.int32)

    # Calculate diffs
    base = arr[0, 0]
    diffs = torch.abs(arr - base).sum(dim=-1)
    alpha = (diffs <= 80)

    arr[alpha] = 255
    alpha = ~alpha
    alpha = alpha.unsqueeze(-1).int() * 255
    arr = torch.cat([arr, alpha], dim=-1)

    return arr

def simple_remove_bkg_normal(imgs, rm_bkg_with_rembg, return_Image=False):
    """Only works for normal"""
    rets = []
    for img in imgs:
        if rm_bkg_with_rembg:
            from rembg import remove
            image = Image.fromarray(img.to(torch.uint8).detach().cpu().numpy()) if isinstance(img, torch.Tensor) else img
            removed_image = remove(image)
            arr = np.array(removed_image)
            arr = torch.tensor(arr, dtype=torch.uint8)
        else:
            arr = remove_color(img)

        if return_Image:
            rets.append(Image.fromarray(arr.to(torch.uint8).detach().cpu().numpy()))
        else:
            rets.append(arr.to(torch.uint8))

    return rets


def load_glb(file_path):
    # Load the .glb file as a scene and merge all meshes
    scene_or_mesh = trimesh.load(file_path)

    mesh = scene_or_mesh.dump(concatenate=True) if isinstance(scene_or_mesh, trimesh.Scene) else scene_or_mesh

    # Extract vertices and faces from the merged mesh
    verts = torch.tensor(mesh.vertices, dtype=torch.float32)
    faces = torch.tensor(mesh.faces, dtype=torch.int64)


    textured_mesh = Meshes(verts=[verts], faces=[faces])


    return textured_mesh

def load_obj_with_verts_faces(file_path, return_mesh=True):
    verts, faces, _ = load_obj(file_path)

    verts = torch.tensor(verts, dtype=torch.float32)
    faces = faces.verts_idx
    faces = torch.tensor(faces, dtype=torch.int64)

    if return_mesh:
        return Meshes(verts=[verts], faces=[faces])
    else:
        return verts, faces

def normalize_mesh(vertices):
    min_vals, _ = torch.min(vertices, axis=0)
    max_vals, _ = torch.max(vertices, axis=0)
    center = (max_vals + min_vals) / 2
    vertices = vertices - center
    max_extent = torch.max(max_vals - min_vals)
    scale = 2.0 / max_extent
    vertices = vertices * scale
    return vertices
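normalize_mesh recentres the vertex bounding box at the origin and scales its longest side to length 2. A small worked check (assuming the module's pytorch3d/trimesh dependencies are installed so it can be imported):

import torch
from models.ISOMER.data.utils import normalize_mesh

pts = torch.tensor([[0., 0., 0.],
                    [4., 2., 1.]])
print(normalize_mesh(pts))
# center (2, 1, 0.5) is subtracted, then everything is scaled by 2 / max_extent = 0.5:
# tensor([[-1.0000, -0.5000, -0.2500],
#         [ 1.0000,  0.5000,  0.2500]])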
models/ISOMER/mesh_reconstruction/__init__.py ADDED
File without changes
models/ISOMER/mesh_reconstruction/func.py ADDED
@@ -0,0 +1,227 @@
# modified from https://github.com/Profactor/continuous-remeshing
import torch
import numpy as np
import trimesh
from typing import Tuple
from pytorch3d.renderer.cameras import camera_position_from_spherical_angles, look_at_rotation
from pytorch3d.renderer import (
    FoVOrthographicCameras,
    look_at_view_transform,
)

def to_numpy(*args):
    def convert(a):
        if isinstance(a, torch.Tensor):
            return a.detach().cpu().numpy()
        assert a is None or isinstance(a, np.ndarray)
        return a

    return convert(args[0]) if len(args) == 1 else tuple(convert(a) for a in args)

def laplacian(
        num_verts: int,
        edges: torch.Tensor  # E,2
        ) -> torch.Tensor:  # sparse V,V
    """create sparse Laplacian matrix"""
    V = num_verts
    E = edges.shape[0]

    # adjacency matrix
    idx = torch.cat([edges, edges.fliplr()], dim=0).type(torch.long).T  # (2, 2*E)
    ones = torch.ones(2*E, dtype=torch.float32, device=edges.device)
    A = torch.sparse.FloatTensor(idx, ones, (V, V))

    # degree matrix
    deg = torch.sparse.sum(A, dim=1).to_dense()
    idx = torch.arange(V, device=edges.device)
    idx = torch.stack([idx, idx], dim=0)
    D = torch.sparse.FloatTensor(idx, deg, (V, V))

    return D - A

def _translation(x, y, z, device):
    return torch.tensor([[1., 0, 0, x],
                         [0, 1, 0, y],
                         [0, 0, 1, z],
                         [0, 0, 0, 1]], device=device)  # 4,4


def _perspective(fovy, aspect=1.0, n=0.1, f=1000.0, device=None):
    fovy = fovy * torch.pi / 180
    y = np.tan(fovy / 2)
    return torch.tensor([[1/(y*aspect), 0, 0, 0],
                         [0, 1/-y, 0, 0],
                         [0, 0, -(f+n)/(f-n), -(2*f*n)/(f-n)],
                         [0, 0, -1, 0]], dtype=torch.float32, device=device)

def _projection(r, device, l=None, t=None, b=None, n=1.0, f=50.0, flip_y=True):
    """
    see https://blog.csdn.net/wodownload2/article/details/85069240/
    """
    if l is None:
        l = -r
    if t is None:
        t = r
    if b is None:
        b = -t
    p = torch.zeros([4, 4], device=device)
    p[0, 0] = 2*n/(r-l)
    p[0, 2] = (r+l)/(r-l)
    p[1, 1] = 2*n/(t-b) * (-1 if flip_y else 1)
    p[1, 2] = (t+b)/(t-b)
    p[2, 2] = -(f+n)/(f-n)
    p[2, 3] = -(2*f*n)/(f-n)
    p[3, 2] = -1
    return p  # 4,4

def _orthographic(r, device, l=None, t=None, b=None, n=1.0, f=50.0, flip_y=True):
    if l is None:
        l = -r
    if t is None:
        t = r
    if b is None:
        b = -t
    o = torch.zeros([4, 4], device=device)
    o[0, 0] = 2/(r-l)
    o[0, 3] = -(r+l)/(r-l)
    o[1, 1] = 2/(t-b) * (-1 if flip_y else 1)
    o[1, 3] = -(t+b)/(t-b)
    o[2, 2] = -2/(f-n)
    o[2, 3] = -(f+n)/(f-n)
    o[3, 3] = 1
    return o  # 4,4

def make_star_cameras_orig(phis, pol_count, distance: float = 10., r=None, image_size=[512, 512], device='cuda'):
    if r is None:
        r = 1/distance
    A = len(phis)
    P = pol_count
    C = A * P  # total number of cameras

    phi = phis * torch.pi / 180
    phi_rot = torch.eye(3, device=device)[None, None].expand(A, 1, 3, 3).clone()
    phi_rot[:, 0, 2, 2] = phi.cos()
    phi_rot[:, 0, 2, 0] = -phi.sin()
    phi_rot[:, 0, 0, 2] = phi.sin()
    phi_rot[:, 0, 0, 0] = phi.cos()

    theta = torch.arange(1, P+1) * (torch.pi/(P+1)) - torch.pi/2
    theta_rot = torch.eye(3, device=device)[None, None].expand(1, P, 3, 3).clone()
    theta_rot[0, :, 1, 1] = theta.cos()
    theta_rot[0, :, 1, 2] = -theta.sin()
    theta_rot[0, :, 2, 1] = theta.sin()
    theta_rot[0, :, 2, 2] = theta.cos()

    mv = torch.empty((C, 4, 4), device=device)
    mv[:] = torch.eye(4, device=device)
    mv[:, :3, :3] = (theta_rot @ phi_rot).reshape(C, 3, 3)
    mv_ = _translation(0, 0, -distance, device) @ mv

    return mv_, _projection(r, device)

def make_star_cameras_mv_new(phis, eles, distance: float = 10., r=None, fov=None, image_size=[512, 512], device='cuda', translation=True):
    import glm
    def sample_spherical(phi, theta, cam_radius):
        theta = torch.deg2rad(theta)
        phi = torch.deg2rad(phi)

        z = cam_radius * torch.cos(phi) * torch.sin(theta)
        x = cam_radius * torch.sin(phi) * torch.sin(theta)
        y = cam_radius * torch.cos(theta)

        return x, y, z

    all_mvs = []
    for i in range(len(phis)):
        azimuth = - phis[i] + 1e-10
        ele = - eles[i] + 1e-10 + 90
        x, y, z = sample_spherical(azimuth, ele, distance)
        eye = glm.vec3(x, y, z)
        at = glm.vec3(0.0, 0.0, 0.0)
        up = glm.vec3(0.0, 1.0, 0.0)
        view_matrix = glm.lookAt(eye, at, up)
        all_mvs.append(torch.from_numpy(np.array(view_matrix)).cuda())
    mv = torch.stack(all_mvs)

    return mv

def make_star_cameras_mv(phis, eles, distance: float = 10., r=None, fov=None, image_size=[512, 512], device='cuda', translation=True):
    if r is None:
        r = 0.15
    A = len(phis)
    assert len(eles) == A, f'len(phis): {len(phis)}, len(eles): {len(eles)}'

    phi = phis * torch.pi / 180
    phi_rot = torch.eye(3, device=device)[None].expand(A, 3, 3).clone()
    phi_rot[:, 2, 2] = phi.cos()
    phi_rot[:, 2, 0] = -phi.sin()
    phi_rot[:, 0, 2] = phi.sin()
    phi_rot[:, 0, 0] = phi.cos()


    theta = eles * torch.pi / 180
    theta_rot = torch.eye(3, device=device)[None].expand(A, 3, 3).clone()
    theta_rot[:, 1, 1] = theta.cos()
    theta_rot[:, 1, 2] = -theta.sin()
    theta_rot[:, 2, 1] = theta.sin()
    theta_rot[:, 2, 2] = theta.cos()

    mv = torch.empty((A, 4, 4), device=device)
    mv[:] = torch.eye(4, device=device)
    mv[:, :3, :3] = (theta_rot @ phi_rot).reshape(A, 3, 3)

    if translation:
        mv_ = _translation(0, 0, -distance, device) @ mv
    else:
        mv_ = mv
    return mv_

def make_star_cameras(phis, eles, distance: float = 10., r=None, fov=None, image_size=[512, 512], device='cuda', translation=True):
    mv_ = make_star_cameras_mv_new(phis, eles, distance, r, device=device, translation=translation)
    return mv_, _perspective(fov, device=device)

def make_star_cameras_perspective(phis, eles, distance: float = 10., r=None, fov=None, device='cuda'):

    return make_star_cameras(phis, eles, distance, r, fov=fov, device=device, translation=True)

def make_star_cameras_orthographic(phis, eles, distance: float = 10., r=None, device='cuda'):

    mv = make_star_cameras_mv_new(phis, eles, distance, r, device=device)
    if r is None:
        r = 1
    return mv, _orthographic(r, device)

def make_sphere(level: int = 2, radius=1., device='cuda') -> Tuple[torch.Tensor, torch.Tensor]:
    sphere = trimesh.creation.icosphere(subdivisions=level, radius=1.0, color=None)
    vertices = torch.tensor(sphere.vertices, device=device, dtype=torch.float32) * radius
    faces = torch.tensor(sphere.faces, device=device, dtype=torch.long)
    return vertices, faces


def get_camera(R, T, focal_length=1 / (2**0.5)):
    focal_length = 1 / focal_length
    camera = FoVOrthographicCameras(device=R.device, R=R, T=T, min_x=-focal_length, max_x=focal_length, min_y=-focal_length, max_y=focal_length)
    return camera

def make_star_cameras_orthographic_py3d(azim_list, device, focal=2/1.35, dist=1.1):
    R, T = look_at_view_transform(dist, 0, azim_list)
    focal_length = 1 / focal
    return FoVOrthographicCameras(device=R.device, R=R, T=T, min_x=-focal_length, max_x=focal_length, min_y=-focal_length, max_y=focal_length).to(device)


def rotation_matrix_to_euler_angles(R, return_degrees=True):
    sy = torch.sqrt(R[0, 0] * R[0, 0] + R[1, 0] * R[1, 0])
    singular = sy < 1e-6
    if not singular:
        x = torch.atan2(R[2, 1], R[2, 2])
        y = torch.atan2(-R[2, 0], sy)
        z = torch.atan2(R[1, 0], R[0, 0])
    else:
        x = torch.atan2(-R[1, 2], R[1, 1])
        y = torch.atan2(-R[2, 0], sy)
        z = 0

    if return_degrees:
        return torch.tensor([x, y, z]) * 180 / np.pi
    else:
        return torch.tensor([x, y, z])
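A short sketch of how two of these helpers compose, run on CPU to sidestep the CUDA defaults (edges are derived directly from the faces here rather than with the repo's calc_edges helper in remesh.py):

import torch
from models.ISOMER.mesh_reconstruction.func import make_sphere, laplacian

verts, faces = make_sphere(level=1, radius=1.0, device='cpu')     # 42 vertices, 80 faces
# build the unique undirected edge list from the faces
edges = torch.cat([faces[:, [0, 1]], faces[:, [1, 2]], faces[:, [2, 0]]], dim=0)
edges = torch.unique(torch.sort(edges, dim=1).values, dim=0)
L = laplacian(verts.shape[0], edges)                              # sparse (V, V) graph Laplacian, D - A
print(L.shape, edges.shape)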
models/ISOMER/mesh_reconstruction/opt.py ADDED
@@ -0,0 +1,191 @@
# modified from https://github.com/Profactor/continuous-remeshing
import time
import torch
import torch_scatter
from typing import Tuple
from ..mesh_reconstruction.remesh import calc_edge_length, calc_edges, calc_face_collapses, calc_face_normals, calc_vertex_normals, collapse_edges, flip_edges, pack, prepend_dummies, remove_dummies, split_edges

@torch.no_grad()
def remesh(
        vertices_etc: torch.Tensor,  # V,D
        faces: torch.Tensor,  # F,3 long
        min_edgelen: torch.Tensor,  # V
        max_edgelen: torch.Tensor,  # V
        flip: bool,
        max_vertices=1e6
        ):

    # dummies
    vertices_etc, faces = prepend_dummies(vertices_etc, faces)
    vertices = vertices_etc[:, :3]  # V,3
    nan_tensor = torch.tensor([torch.nan], device=min_edgelen.device)
    min_edgelen = torch.concat((nan_tensor, min_edgelen))
    max_edgelen = torch.concat((nan_tensor, max_edgelen))

    # collapse
    edges, face_to_edge = calc_edges(faces)  # E,2 F,3
    edge_length = calc_edge_length(vertices, edges)  # E
    face_normals = calc_face_normals(vertices, faces, normalize=False)  # F,3
    vertex_normals = calc_vertex_normals(vertices, faces, face_normals)  # V,3
    # then calculates the face collapses, which are the faces that can be removed without changing the overall shape of the object.
    face_collapse = calc_face_collapses(vertices, faces, edges, face_to_edge, edge_length, face_normals, vertex_normals, min_edgelen, area_ratio=0.5)
    shortness = (1 - edge_length / min_edgelen[edges].mean(dim=-1)).clamp_min_(0)  # e[0,1] 0...ok, 1...edgelen=0
    priority = face_collapse.float() + shortness
    vertices_etc, faces = collapse_edges(vertices_etc, faces, edges, priority)

    # split: If the number of vertices is less than the maximum allowed, the function splits the edges that are longer than the maximum edge length.
    if vertices.shape[0] < max_vertices:
        edges, face_to_edge = calc_edges(faces)  # E,2 F,3
        vertices = vertices_etc[:, :3]  # V,3
        edge_length = calc_edge_length(vertices, edges)  # E
        splits = edge_length > max_edgelen[edges].mean(dim=-1)
        vertices_etc, faces = split_edges(vertices_etc, faces, edges, face_to_edge, splits, pack_faces=False)

    vertices_etc, faces = pack(vertices_etc, faces)
    vertices = vertices_etc[:, :3]

    if flip:  # flips the edges of the faces
        edges, _, edge_to_face = calc_edges(faces, with_edge_to_face=True)  # E,2 F,3
        flip_edges(vertices, faces, edges, edge_to_face, with_border=False)

    return remove_dummies(vertices_etc, faces)

def lerp_unbiased(a: torch.Tensor, b: torch.Tensor, weight: float, step: int):
    """lerp with adam's bias correction"""
    c_prev = 1 - weight**(step-1)
    c = 1 - weight**step
    a_weight = weight * c_prev / c
    b_weight = (1 - weight) / c
    a.mul_(a_weight).add_(b, alpha=b_weight)


class MeshOptimizer:
    """Use this like a pytorch Optimizer, but after calling opt.step(), do vertices,faces = opt.remesh()."""

    def __init__(self,
                 vertices: torch.Tensor,  # V,3
                 faces: torch.Tensor,  # F,3
                 lr=0.3,  # learning rate
                 betas=(0.8, 0.8, 0),  # betas[0:2] are the same as in Adam, betas[2] may be used to time-smooth the relative velocity nu
                 gammas=(0, 0, 0),  # optional spatial smoothing for m1,m2,nu, values between 0 (no smoothing) and 1 (max. smoothing)
                 nu_ref=0.3,  # reference velocity for edge length controller
                 edge_len_lims=(.01, .15),  # smallest and largest allowed reference edge length
                 edge_len_tol=.5,  # edge length tolerance for split and collapse
                 gain=.2,  # gain value for edge length controller
                 laplacian_weight=.02,  # for laplacian smoothing/regularization
                 ramp=1,  # learning rate ramp, actual ramp width is ramp/(1-betas[0])
                 grad_lim=10.,  # gradients are clipped to m1.abs()*grad_lim
                 remesh_interval=1,  # larger intervals are faster but with worse mesh quality
                 local_edgelen=True,  # set to False to use a global scalar reference edge length instead
                 ):
        self._vertices = vertices
        self._faces = faces
        self._lr = lr
        self._betas = betas
        self._gammas = gammas
        self._nu_ref = nu_ref
        self._edge_len_lims = edge_len_lims
        self._edge_len_tol = edge_len_tol
        self._gain = gain
        self._laplacian_weight = laplacian_weight
        self._ramp = ramp
        self._grad_lim = grad_lim
        self._remesh_interval = remesh_interval
        self._local_edgelen = local_edgelen
        self._step = 0

        V = self._vertices.shape[0]
        # prepare continuous tensor for all vertex-based data
        self._vertices_etc = torch.zeros([V, 9], device=vertices.device)
        self._split_vertices_etc()
        self.vertices.copy_(vertices)  # initialize vertices
        self._vertices.requires_grad_()
        self._ref_len.fill_(edge_len_lims[1])

    @property
    def vertices(self):
        return self._vertices

    @property
    def faces(self):
        return self._faces

    def _split_vertices_etc(self):
        self._vertices = self._vertices_etc[:, :3]
        self._m2 = self._vertices_etc[:, 3]
        self._nu = self._vertices_etc[:, 4]
        self._m1 = self._vertices_etc[:, 5:8]
        self._ref_len = self._vertices_etc[:, 8]

        with_gammas = any(g != 0 for g in self._gammas)
        self._smooth = self._vertices_etc[:, :8] if with_gammas else self._vertices_etc[:, :3]

    def zero_grad(self):
        self._vertices.grad = None

    @torch.no_grad()
    def step(self):

        eps = 1e-8

        self._step += 1

        # spatial smoothing
        edges, _ = calc_edges(self._faces)  # E,2
        E = edges.shape[0]
        edge_smooth = self._smooth[edges]  # E,2,S
        neighbor_smooth = torch.zeros_like(self._smooth)  # V,S
        torch_scatter.scatter_mean(src=edge_smooth.flip(dims=[1]).reshape(E*2, -1), index=edges.reshape(E*2, 1), dim=0, out=neighbor_smooth)

        # apply optional smoothing of m1,m2,nu
        if self._gammas[0]:
            self._m1.lerp_(neighbor_smooth[:, 5:8], self._gammas[0])
        if self._gammas[1]:
            self._m2.lerp_(neighbor_smooth[:, 3], self._gammas[1])
        if self._gammas[2]:
            self._nu.lerp_(neighbor_smooth[:, 4], self._gammas[2])

        # add laplace smoothing to gradients
        laplace = self._vertices - neighbor_smooth[:, :3]
        grad = torch.addcmul(self._vertices.grad, laplace, self._nu[:, None], value=self._laplacian_weight)

        # gradient clipping
        if self._step > 1:
            grad_lim = self._m1.abs().mul_(self._grad_lim)
            grad.clamp_(min=-grad_lim, max=grad_lim)

        # moment updates
        lerp_unbiased(self._m1, grad, self._betas[0], self._step)
        lerp_unbiased(self._m2, (grad**2).sum(dim=-1), self._betas[1], self._step)

        velocity = self._m1 / self._m2[:, None].sqrt().add_(eps)  # V,3
        speed = velocity.norm(dim=-1)  # V

        if self._betas[2]:
            lerp_unbiased(self._nu, speed, self._betas[2], self._step)  # V
        else:
            self._nu.copy_(speed)  # V

        # update vertices
        ramped_lr = self._lr * min(1, self._step * (1-self._betas[0]) / self._ramp)
        self._vertices.add_(velocity * self._ref_len[:, None], alpha=-ramped_lr)

        # update target edge length
        if self._step % self._remesh_interval == 0:
            if self._local_edgelen:
                len_change = (1 + (self._nu - self._nu_ref) * self._gain)
            else:
                len_change = (1 + (self._nu.mean() - self._nu_ref) * self._gain)
            self._ref_len *= len_change
            self._ref_len.clamp_(*self._edge_len_lims)

    def remesh(self, flip: bool = True, poisson=False) -> Tuple[torch.Tensor, torch.Tensor]:
        min_edge_len = self._ref_len * (1 - self._edge_len_tol)
        max_edge_len = self._ref_len * (1 + self._edge_len_tol)

        self._vertices_etc, self._faces = remesh(self._vertices_etc, self._faces, min_edge_len, max_edge_len, flip, max_vertices=1e6)

        self._split_vertices_etc()
        self._vertices.requires_grad_()

        return self._vertices, self._faces
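The MeshOptimizer docstring summarizes the intended loop; a minimal usage sketch mirroring recon.py and refine.py, with a toy loss standing in for the rendering losses (assumes the repo is importable, torch_scatter is installed, and a CUDA device is available):

import torch
from models.ISOMER.mesh_reconstruction.func import make_sphere
from models.ISOMER.mesh_reconstruction.opt import MeshOptimizer

verts, faces = make_sphere(level=2, radius=0.7)        # start from a small icosphere on the default CUDA device
opt = MeshOptimizer(verts, faces)
verts = opt.vertices                                   # optimize the optimizer's own vertex tensor

for step in range(50):
    opt.zero_grad()
    loss = (verts.norm(dim=-1) - 1.0).pow(2).mean()    # toy objective: pull the vertices onto the unit sphere
    loss.backward()
    opt.step()                                         # Adam-like update plus the edge-length controller
    verts, faces = opt.remesh()                        # collapse/split/flip edges, then keep optimizing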
models/ISOMER/mesh_reconstruction/recon.py ADDED
@@ -0,0 +1,58 @@
from tqdm import tqdm
from PIL import Image
import numpy as np
import torch
from torchvision.utils import make_grid
from typing import List
from ..mesh_reconstruction.remesh import calc_vertex_normals
from ..mesh_reconstruction.opt import MeshOptimizer
from ..mesh_reconstruction.func import make_star_cameras_orthographic, make_star_cameras_orthographic_py3d
from ..mesh_reconstruction.render import NormalsRenderer, Pytorch3DNormalsRenderer
from ..scripts.utils import to_py3d_mesh, init_target

def reconstruct_stage1(pils: List[Image.Image], mv, proj, steps=100, vertices=None, faces=None, start_edge_len=0.15, end_edge_len=0.005, decay=0.995, return_mesh=True, loss_expansion_weight=0.1, gain=0.1, use_remesh=True):

    vertices, faces = vertices.to("cuda"), faces.to("cuda")

    renderer = NormalsRenderer(mv, proj, list(pils[0].size))


    target_images = init_target(pils, new_bkgd=(0., 0., 0.))  # 4s

    opt = MeshOptimizer(vertices, faces, local_edgelen=False, gain=gain, edge_len_lims=(end_edge_len, start_edge_len))

    vertices = opt.vertices

    mask = target_images[..., -1] < 0.5

    for i in tqdm(range(steps)):
        opt._lr *= decay

        normals = calc_vertex_normals(vertices, faces)
        images = renderer.render(vertices, normals, faces)

        loss_expand = 0.5 * ((vertices + normals).detach() - vertices).pow(2).mean()

        t_mask = images[..., -1] > 0.5
        loss_target_l2 = (images[t_mask] - target_images[t_mask]).abs().pow(2).mean()

        loss_alpha_target_mask_l2 = (images[..., -1][mask] - target_images[..., -1][mask]).pow(2).mean()

        loss = loss_target_l2 + loss_alpha_target_mask_l2 + loss_expand * loss_expansion_weight

        loss_oob = (vertices.abs() > 0.99).float().mean() * 10
        loss = loss + loss_oob


        loss.backward()
        opt.step()

        if use_remesh:
            vertices, faces = opt.remesh(poisson=False)

    vertices, faces = vertices.detach(), faces.detach()

    if return_mesh:
        return to_py3d_mesh(vertices, faces)
    else:
        return vertices, faces
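A note on the expansion term used above (an observation, not in the original comments): with c = (v + n).detach(), the loss 0.5·‖c − v‖² has gradient ∂L/∂v = −(c − v) = −n at the current iterate, so a descent step on this term alone moves each vertex along +n, i.e. slightly outward along its normal; loss_expansion_weight scales this inflation pressure.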
models/ISOMER/mesh_reconstruction/refine.py ADDED
@@ -0,0 +1,86 @@
1
+ from tqdm import tqdm
2
+ from PIL import Image
3
+ import torch
4
+ import numpy as np
5
+ from typing import List
6
+ from ..mesh_reconstruction.remesh import calc_vertex_normals
7
+ from ..mesh_reconstruction.opt import MeshOptimizer
8
+ from ..mesh_reconstruction.func import make_star_cameras_orthographic, make_star_cameras_orthographic_py3d
9
+ from ..mesh_reconstruction.render import NormalsRenderer, Pytorch3DNormalsRenderer
10
+ from ..scripts.project_mesh import multiview_color_projection, get_cameras_list
11
+ from ..scripts.utils import to_py3d_mesh, from_py3d_mesh, init_target
12
+
13
+ def run_mesh_refine(vertices, faces, pils: List[Image.Image], mv, proj, weights, cameras, steps=100, start_edge_len=0.02, end_edge_len=0.005, decay=0.99, update_normal_interval=10, update_warmup=10, return_mesh=True, process_inputs=True, process_outputs=True, use_remesh=True, loss_expansion_weight=0):
14
+
15
+ if process_inputs:
16
+ vertices = vertices * 2 / 1.35
17
+ vertices[..., [0, 2]] = - vertices[..., [0, 2]]
18
+
19
+ poisson_steps = []
20
+
21
+ renderer = NormalsRenderer(mv,proj,list(pils[0].size))
22
+
23
+
24
+ target_images = init_target(pils, new_bkgd=(0., 0., 0.)) # 4s
25
+
26
+ opt = MeshOptimizer(vertices,faces, ramp=5, edge_len_lims=(end_edge_len, start_edge_len), local_edgelen=False, laplacian_weight=0.02)
27
+
28
+ vertices = opt.vertices
29
+ alpha_init = None
30
+
31
+ mask = target_images[..., -1] < 0.5
32
+
33
+ for i in tqdm(range(steps)):
34
+ opt.zero_grad()
35
+ opt._lr *= decay
36
+ normals = calc_vertex_normals(vertices,faces)
37
+ images = renderer.render(vertices,normals,faces)
38
+
39
+ if alpha_init is None:
40
+ alpha_init = images.detach()
41
+
42
+ # update explicit target and render images for L_ET calculation
43
+ if i < update_warmup or i % update_normal_interval == 0:
44
+ with torch.no_grad():
45
+
46
+ py3d_mesh = to_py3d_mesh(vertices, faces, normals)
47
+
48
+ _, _, target_normal = from_py3d_mesh(multiview_color_projection(py3d_mesh, pils, cameras_list=cameras, weights=weights, confidence_threshold=0.1, complete_unseen=False, below_confidence_strategy='original', reweight_with_cosangle='linear'))
49
+
50
+ target_normal = target_normal * 2 - 1
51
+ target_normal = torch.nn.functional.normalize(target_normal, dim=-1)
52
+ debug_images = renderer.render(vertices,target_normal,faces)
53
+
54
+ d_mask = images[..., -1] > 0.5
55
+ loss_debug_l2 = (images[..., :3][d_mask] - debug_images[..., :3][d_mask]).pow(2).mean()
56
+
57
+ loss_alpha_target_mask_l2 = (images[..., -1][mask] - target_images[..., -1][mask]).pow(2).mean()
58
+
59
+ loss = loss_debug_l2 + loss_alpha_target_mask_l2
60
+
61
+ loss_oob = (vertices.abs() > 0.99).float().mean() * 10
62
+
63
+ loss = loss + loss_oob
64
+
65
+
66
+ # this loss_expand does not exist in original ISOMER. we add it here (but default loss_expansion_weight is 0)
67
+ loss_expand = 0.5 * ((vertices+normals).detach() - vertices).pow(2).mean()
68
+ loss += loss_expand * loss_expansion_weight
69
+
70
+ loss.backward()
71
+ opt.step()
72
+
73
+
74
+ if use_remesh:
75
+ vertices,faces = opt.remesh(poisson=(i in poisson_steps))
76
+
77
+ vertices, faces = vertices.detach(), faces.detach()
78
+
79
+ if process_outputs:
80
+ vertices = vertices / 2 * 1.35
81
+ vertices[..., [0, 2]] = - vertices[..., [0, 2]]
82
+
83
+ if return_mesh:
84
+ return to_py3d_mesh(vertices, faces)
85
+ else:
86
+ return vertices, faces
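Note: the projected normal target in the loop above is only refreshed during warm-up and then every update_normal_interval steps; a standalone restatement of that schedule, mirroring the condition used in the loop:

    def refreshes_target(i, update_warmup=10, update_normal_interval=10):
        # True on the steps where multiview_color_projection is re-run
        return i < update_warmup or i % update_normal_interval == 0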
models/ISOMER/mesh_reconstruction/remesh.py ADDED
@@ -0,0 +1,363 @@
1
+ # modified from https://github.com/Profactor/continuous-remeshing
2
+ import torch
3
+ import torch.nn.functional as tfunc
4
+ import torch_scatter
5
+ from typing import Tuple
6
+
7
+ def prepend_dummies(
8
+ vertices:torch.Tensor, #V,D
9
+ faces:torch.Tensor, #F,3 long
10
+ )->Tuple[torch.Tensor,torch.Tensor]:
11
+ """prepend dummy elements to vertices and faces to enable "masked" scatter operations"""
12
+ V,D = vertices.shape
13
+ vertices = torch.concat((torch.full((1,D),fill_value=torch.nan,device=vertices.device),vertices),dim=0)
14
+ faces = torch.concat((torch.zeros((1,3),dtype=torch.long,device=faces.device),faces+1),dim=0)
15
+ return vertices,faces
16
+
17
+ def remove_dummies(
18
+ vertices:torch.Tensor, #V,D - first vertex all nan and unreferenced
19
+ faces:torch.Tensor, #F,3 long - first face all zeros
20
+ )->Tuple[torch.Tensor,torch.Tensor]:
21
+ """remove dummy elements added with prepend_dummies()"""
22
+ return vertices[1:],faces[1:]-1
23
+
24
+
25
+ def calc_edges(
26
+ faces: torch.Tensor, # F,3 long - first face may be dummy with all zeros
27
+ with_edge_to_face: bool = False
28
+ ) -> Tuple[torch.Tensor, ...]:
29
+ """
30
+ returns Tuple of
31
+ - edges E,2 long, 0 for unused, lower vertex index first
32
+ - face_to_edge F,3 long
33
+ - (optional) edge_to_face shape=E,[left,right],[face,side]
34
+
35
+ o-<-----e1 e0,e1...edge, e0<e1
36
+ | /A L,R....left and right face
37
+ | L / | both triangles ordered counter clockwise
38
+ | / R | normals pointing out of screen
39
+ V/ |
40
+ e0---->-o
41
+ """
42
+
43
+ F = faces.shape[0]
44
+
45
+ # make full edges, lower vertex index first
46
+ face_edges = torch.stack((faces,faces.roll(-1,1)),dim=-1) #F*3,3,2
47
+ full_edges = face_edges.reshape(F*3,2)
48
+ sorted_edges,_ = full_edges.sort(dim=-1) #F*3,2
49
+
50
+ # make unique edges
51
+ edges,full_to_unique = torch.unique(input=sorted_edges,sorted=True,return_inverse=True,dim=0) #(E,2),(F*3)
52
+ E = edges.shape[0]
53
+ face_to_edge = full_to_unique.reshape(F,3) #F,3
54
+
55
+ if not with_edge_to_face:
56
+ return edges, face_to_edge
57
+
58
+ is_right = full_edges[:,0]!=sorted_edges[:,0] #F*3
59
+ edge_to_face = torch.zeros((E,2,2),dtype=torch.long,device=faces.device) #E,LR=2,S=2
60
+ scatter_src = torch.cartesian_prod(torch.arange(0,F,device=faces.device),torch.arange(0,3,device=faces.device)) #F*3,2
61
+ edge_to_face.reshape(2*E,2).scatter_(dim=0,index=(2*full_to_unique+is_right)[:,None].expand(F*3,2),src=scatter_src) #E,LR=2,S=2
62
+ edge_to_face[0] = 0
63
+ return edges, face_to_edge, edge_to_face
64
+
65
+ def calc_edge_length(
66
+ vertices:torch.Tensor, #V,3 first may be dummy
67
+ edges:torch.Tensor, #E,2 long, lower vertex index first, (0,0) for unused
68
+ )->torch.Tensor: #E
69
+
70
+ full_vertices = vertices[edges] #E,2,3
71
+ a,b = full_vertices.unbind(dim=1) #E,3
72
+ return torch.norm(a-b,p=2,dim=-1)
73
+
74
+ def calc_face_normals(
75
+ vertices:torch.Tensor, #V,3 first vertex may be unreferenced
76
+ faces:torch.Tensor, #F,3 long, first face may be all zero
77
+ normalize:bool=False,
78
+ )->torch.Tensor: #F,3
79
+ """
80
+ n
81
+ |
82
+ c0 corners ordered counterclockwise when
83
+ / \ looking onto surface (in neg normal direction)
84
+ c1---c2
85
+ """
86
+ full_vertices = vertices[faces] #F,C=3,3
87
+ v0,v1,v2 = full_vertices.unbind(dim=1) #F,3
88
+ face_normals = torch.cross(v1-v0,v2-v0, dim=1) #F,3
89
+ if normalize:
90
+ face_normals = tfunc.normalize(face_normals, eps=1e-6, dim=1)
91
+ return face_normals #F,3
92
+
93
+ def calc_vertex_normals(
94
+ vertices:torch.Tensor, #V,3 first vertex may be unreferenced
95
+ faces:torch.Tensor, #F,3 long, first face may be all zero
96
+ face_normals:torch.Tensor=None, #F,3, not normalized
97
+ )->torch.Tensor: #V,3
98
+
99
+ F = faces.shape[0]
100
+
101
+ if face_normals is None:
102
+ face_normals = calc_face_normals(vertices,faces) # this has no grad
103
+
104
+ vertex_normals = torch.zeros((vertices.shape[0],3,3),dtype=vertices.dtype,device=vertices.device) #V,C=3,3
105
+
106
+
107
+ vertex_normals.scatter_add_(dim=0,index=faces[:,:,None].expand(F,3,3),src=face_normals[:,None,:].expand(F,3,3)) # this has no grad
108
+ vertex_normals = vertex_normals.sum(dim=1) #V,3
109
+ return tfunc.normalize(vertex_normals, eps=1e-6, dim=1)
110
+
111
+ def calc_face_ref_normals(
112
+ faces:torch.Tensor, #F,3 long, 0 for unused
113
+ vertex_normals:torch.Tensor, #V,3 first unused
114
+ normalize:bool=False,
115
+ )->torch.Tensor: #F,3
116
+ """calculate reference normals for face flip detection"""
117
+ full_normals = vertex_normals[faces] #F,C=3,3
118
+ ref_normals = full_normals.sum(dim=1) #F,3
119
+ if normalize:
120
+ ref_normals = tfunc.normalize(ref_normals, eps=1e-6, dim=1)
121
+ return ref_normals
122
+
123
+ def pack(
124
+ vertices:torch.Tensor, #V,3 first unused and nan
125
+ faces:torch.Tensor, #F,3 long, 0 for unused
126
+ )->Tuple[torch.Tensor,torch.Tensor]: #(vertices,faces), keeps first vertex unused
127
+ """removes unused elements in vertices and faces"""
128
+ V = vertices.shape[0]
129
+
130
+ # remove unused faces
131
+ used_faces = faces[:,0]!=0
132
+ used_faces[0] = True
133
+ faces = faces[used_faces] #sync
134
+
135
+ # remove unused vertices
136
+ used_vertices = torch.zeros(V,3,dtype=torch.bool,device=vertices.device)
137
+ used_vertices.scatter_(dim=0,index=faces,value=True,reduce='add')
138
+ used_vertices = used_vertices.any(dim=1)
139
+ used_vertices[0] = True
140
+ vertices = vertices[used_vertices] #sync
141
+
142
+ # update used faces
143
+ ind = torch.zeros(V,dtype=torch.long,device=vertices.device)
144
+ V1 = used_vertices.sum()
145
+ ind[used_vertices] = torch.arange(0,V1,device=vertices.device) #sync
146
+ faces = ind[faces]
147
+
148
+ return vertices,faces
149
+
150
+ def split_edges(
151
+ vertices:torch.Tensor, #V,3 first unused
152
+ faces:torch.Tensor, #F,3 long, 0 for unused
153
+ edges:torch.Tensor, #E,2 long 0 for unused, lower vertex index first
154
+ face_to_edge:torch.Tensor, #F,3 long 0 for unused
155
+ splits, #E bool
156
+ pack_faces:bool=True,
157
+ )->Tuple[torch.Tensor,torch.Tensor]: #(vertices,faces)
158
+
159
+ # c2 c2 c...corners = faces
160
+ # . . . . s...side_vert, 0 means no split
161
+ # . . .N2 . S...shrunk_face
162
+ # . . . . Ni...new_faces
163
+ # s2 s1 s2|c2...s1|c1
164
+ # . . . . .
165
+ # . . . S . .
166
+ # . . . . N1 .
167
+ # c0...(s0=0)....c1 s0|c0...........c1
168
+ #
169
+ # pseudo-code:
170
+ # S = [s0|c0,s1|c1,s2|c2] example:[c0,s1,s2]
171
+ # split = side_vert!=0 example:[False,True,True]
172
+ # N0 = split[0]*[c0,s0,s2|c2] example:[0,0,0]
173
+ # N1 = split[1]*[c1,s1,s0|c0] example:[c1,s1,c0]
174
+ # N2 = split[2]*[c2,s2,s1|c1] example:[c2,s2,s1]
175
+
176
+ V = vertices.shape[0]
177
+ F = faces.shape[0]
178
+ S = splits.sum().item() #sync
179
+
180
+ if S==0:
181
+ return vertices,faces
182
+
183
+ edge_vert = torch.zeros_like(splits, dtype=torch.long) #E
184
+ edge_vert[splits] = torch.arange(V,V+S,dtype=torch.long,device=vertices.device) #E 0 for no split, sync
185
+ side_vert = edge_vert[face_to_edge] #F,3 long, 0 for no split
186
+ split_edges = edges[splits] #S sync
187
+
188
+ #vertices
189
+ split_vertices = vertices[split_edges].mean(dim=1) #S,3
190
+ vertices = torch.concat((vertices,split_vertices),dim=0)
191
+
192
+ #faces
193
+ side_split = side_vert!=0 #F,3
194
+ shrunk_faces = torch.where(side_split,side_vert,faces) #F,3 long, 0 for no split
195
+ new_faces = side_split[:,:,None] * torch.stack((faces,side_vert,shrunk_faces.roll(1,dims=-1)),dim=-1) #F,N=3,C=3
196
+ faces = torch.concat((shrunk_faces,new_faces.reshape(F*3,3))) #4F,3
197
+ if pack_faces:
198
+ mask = faces[:,0]!=0
199
+ mask[0] = True
200
+ faces = faces[mask] #F',3 sync
201
+
202
+ return vertices,faces
203
+
204
+ def collapse_edges(
205
+ vertices:torch.Tensor, #V,3 first unused
206
+ faces:torch.Tensor, #F,3 long 0 for unused
207
+ edges:torch.Tensor, #E,2 long 0 for unused, lower vertex index first
208
+ priorities:torch.Tensor, #E float
209
+ stable:bool=False, #only for unit testing
210
+ )->Tuple[torch.Tensor,torch.Tensor]: #(vertices,faces)
211
+
212
+ V = vertices.shape[0]
213
+
214
+ # check spacing
215
+ _,order = priorities.sort(stable=stable) #E
216
+ rank = torch.zeros_like(order)
217
+ rank[order] = torch.arange(0,len(rank),device=rank.device)
218
+ vert_rank = torch.zeros(V,dtype=torch.long,device=vertices.device) #V
219
+ edge_rank = rank #E
220
+ for i in range(3):
221
+ torch_scatter.scatter_max(src=edge_rank[:,None].expand(-1,2).reshape(-1),index=edges.reshape(-1),dim=0,out=vert_rank)
222
+ edge_rank,_ = vert_rank[edges].max(dim=-1) #E
223
+ candidates = edges[(edge_rank==rank).logical_and_(priorities>0)] #E',2
224
+
225
+ # check connectivity
226
+ vert_connections = torch.zeros(V,dtype=torch.long,device=vertices.device) #V
227
+ vert_connections[candidates[:,0]] = 1 #start
228
+ edge_connections = vert_connections[edges].sum(dim=-1) #E, edge connected to start
229
+ vert_connections.scatter_add_(dim=0,index=edges.reshape(-1),src=edge_connections[:,None].expand(-1,2).reshape(-1))# one edge from start
230
+ vert_connections[candidates] = 0 #clear start and end
231
+ edge_connections = vert_connections[edges].sum(dim=-1) #E, one or two edges from start
232
+ vert_connections.scatter_add_(dim=0,index=edges.reshape(-1),src=edge_connections[:,None].expand(-1,2).reshape(-1)) #one or two edges from start
233
+ collapses = candidates[vert_connections[candidates[:,1]] <= 2] # E" not more than two connections between start and end
234
+
235
+ # mean vertices
236
+ vertices[collapses[:,0]] = vertices[collapses].mean(dim=1)
237
+
238
+ # update faces
239
+ dest = torch.arange(0,V,dtype=torch.long,device=vertices.device) #V
240
+ dest[collapses[:,1]] = dest[collapses[:,0]]
241
+ faces = dest[faces] #F,3
242
+ c0,c1,c2 = faces.unbind(dim=-1)
243
+ collapsed = (c0==c1).logical_or_(c1==c2).logical_or_(c0==c2)
244
+ faces[collapsed] = 0
245
+
246
+ return vertices,faces
247
+
248
+ def calc_face_collapses(
249
+ vertices:torch.Tensor, #V,3 first unused
250
+ faces:torch.Tensor, #F,3 long, 0 for unused
251
+ edges:torch.Tensor, #E,2 long 0 for unused, lower vertex index first
252
+ face_to_edge:torch.Tensor, #F,3 long 0 for unused
253
+ edge_length:torch.Tensor, #E
254
+ face_normals:torch.Tensor, #F,3
255
+ vertex_normals:torch.Tensor, #V,3 first unused
256
+ min_edge_length:torch.Tensor=None, #V
257
+ area_ratio = 0.5, #collapse if area < min_edge_length**2 * area_ratio
258
+ shortest_probability = 0.8
259
+ )->torch.Tensor: #E edges to collapse
260
+
261
+ E = edges.shape[0]
262
+ F = faces.shape[0]
263
+
264
+ # face flips
265
+ ref_normals = calc_face_ref_normals(faces,vertex_normals,normalize=False) #F,3
266
+ face_collapses = (face_normals*ref_normals).sum(dim=-1)<0 #F
267
+
268
+ # small faces
269
+ if min_edge_length is not None:
270
+ min_face_length = min_edge_length[faces].mean(dim=-1) #F
271
+ min_area = min_face_length**2 * area_ratio #F
272
+ face_collapses.logical_or_(face_normals.norm(dim=-1) < min_area*2) #F
273
+ face_collapses[0] = False
274
+
275
+ # faces to edges
276
+ face_length = edge_length[face_to_edge] #F,3
277
+
278
+ if shortest_probability<1:
279
+ #select shortest edge with shortest_probability chance
280
+ randlim = round(2/(1-shortest_probability))
281
+ rand_ind = torch.randint(0,randlim,size=(F,),device=faces.device).clamp_max_(2) #selected edge local index in face
282
+ sort_ind = torch.argsort(face_length,dim=-1,descending=True) #F,3
283
+ local_ind = sort_ind.gather(dim=-1,index=rand_ind[:,None])
284
+ else:
285
+ local_ind = torch.argmin(face_length,dim=-1)[:,None] #F,1 0...2 shortest edge local index in face
286
+
287
+ edge_ind = face_to_edge.gather(dim=1,index=local_ind)[:,0] #F 0...E selected edge global index
288
+ edge_collapses = torch.zeros(E,dtype=torch.long,device=vertices.device)
289
+ edge_collapses.scatter_add_(dim=0,index=edge_ind,src=face_collapses.long())
290
+
291
+ return edge_collapses.bool()
292
+
293
+ def flip_edges(
294
+ vertices:torch.Tensor, #V,3 first unused
295
+ faces:torch.Tensor, #F,3 long, first must be 0, 0 for unused
296
+ edges:torch.Tensor, #E,2 long, first must be 0, 0 for unused, lower vertex index first
297
+ edge_to_face:torch.Tensor, #E,[left,right],[face,side]
298
+ with_border:bool=True, #handle border edges (D=4 instead of D=6)
299
+ with_normal_check:bool=True, #check face normal flips
300
+ stable:bool=False, #only for unit testing
301
+ ):
302
+ V = vertices.shape[0]
303
+ E = edges.shape[0]
304
+ device=vertices.device
305
+ vertex_degree = torch.zeros(V,dtype=torch.long,device=device) #V long
306
+ vertex_degree.scatter_(dim=0,index=edges.reshape(E*2),value=1,reduce='add')
307
+ neighbor_corner = (edge_to_face[:,:,1] + 2) % 3 #go from side to corner
308
+ neighbors = faces[edge_to_face[:,:,0],neighbor_corner] #E,LR=2
309
+ edge_is_inside = neighbors.all(dim=-1) #E
310
+
311
+ if with_border:
312
+ # inside vertices should have D=6, border edges D=4, so we subtract 2 for all inside vertices
313
+ # need to use float for masks in order to use scatter(reduce='multiply')
314
+ vertex_is_inside = torch.ones(V,2,dtype=torch.float32,device=vertices.device) #V,2 float
315
+ src = edge_is_inside.type(torch.float32)[:,None].expand(E,2) #E,2 float
316
+ vertex_is_inside.scatter_(dim=0,index=edges,src=src,reduce='multiply')
317
+ vertex_is_inside = vertex_is_inside.prod(dim=-1,dtype=torch.long) #V long
318
+ vertex_degree -= 2 * vertex_is_inside #V long
319
+
320
+ neighbor_degrees = vertex_degree[neighbors] #E,LR=2
321
+ edge_degrees = vertex_degree[edges] #E,2
322
+ #
323
+ # loss = Sum_over_affected_vertices((new_degree-6)**2)
324
+ # loss_change = Sum_over_neighbor_vertices((degree+1-6)**2-(degree-6)**2)
325
+ # + Sum_over_edge_vertices((degree-1-6)**2-(degree-6)**2)
326
+ # = 2 * (2 + Sum_over_neighbor_vertices(degree) - Sum_over_edge_vertices(degree))
327
+ #
328
+ loss_change = 2 + neighbor_degrees.sum(dim=-1) - edge_degrees.sum(dim=-1) #E
329
+ candidates = torch.logical_and(loss_change<0, edge_is_inside) #E
330
+ loss_change = loss_change[candidates] #E'
331
+ if loss_change.shape[0]==0:
332
+ return
333
+
334
+ edges_neighbors = torch.concat((edges[candidates],neighbors[candidates]),dim=-1) #E',4
335
+ _,order = loss_change.sort(descending=True, stable=stable) #E'
336
+ rank = torch.zeros_like(order)
337
+ rank[order] = torch.arange(0,len(rank),device=rank.device)
338
+ vertex_rank = torch.zeros((V,4),dtype=torch.long,device=device) #V,4
339
+ torch_scatter.scatter_max(src=rank[:,None].expand(-1,4),index=edges_neighbors,dim=0,out=vertex_rank)
340
+ vertex_rank,_ = vertex_rank.max(dim=-1) #V
341
+ neighborhood_rank,_ = vertex_rank[edges_neighbors].max(dim=-1) #E'
342
+ flip = rank==neighborhood_rank #E'
343
+
344
+ if with_normal_check:
345
+ # cl-<-----e1 e0,e1...edge, e0<e1
346
+ # | /A L,R....left and right face
347
+ # | L / | both triangles ordered counter clockwise
348
+ # | / R | normals pointing out of screen
349
+ # V/ |
350
+ # e0---->-cr
351
+ v = vertices[edges_neighbors] #E",4,3
352
+ v = v - v[:,0:1] #make relative to e0
353
+ e1 = v[:,1]
354
+ cl = v[:,2]
355
+ cr = v[:,3]
356
+ n = torch.cross(e1,cl) + torch.cross(cr,e1) #sum of old normal vectors
357
+ flip.logical_and_(torch.sum(n*torch.cross(cr,cl),dim=-1)>0) #first new face
358
+ flip.logical_and_(torch.sum(n*torch.cross(cl-e1,cr-e1),dim=-1)>0) #second new face
359
+
360
+ flip_edges_neighbors = edges_neighbors[flip] #E",4
361
+ flip_edge_to_face = edge_to_face[candidates,:,0][flip] #E",2
362
+ flip_faces = flip_edges_neighbors[:,[[0,3,2],[1,2,3]]] #E",2,3
363
+ faces.scatter_(dim=0,index=flip_edge_to_face.reshape(-1,1).expand(-1,3),src=flip_faces.reshape(-1,3))
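Note: a small sanity check of calc_edges on a two-triangle quad (dummy padding via prepend_dummies is only needed for the scatter-based passes, not for a plain edge query):

    import torch
    faces = torch.tensor([[0, 1, 2], [0, 2, 3]], dtype=torch.long)
    edges, face_to_edge = calc_edges(faces)
    print(edges.shape, face_to_edge.shape)   # torch.Size([5, 2]) torch.Size([2, 3])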
models/ISOMER/mesh_reconstruction/render.py ADDED
@@ -0,0 +1,142 @@
1
+ # modified from https://github.com/Profactor/continuous-remeshing
2
+ import nvdiffrast.torch as dr
3
+ import torch
4
+ from typing import Tuple
5
+
6
+ def _warmup(glctx, device=None):
7
+ device = 'cuda' if device is None else device
8
+ #windows workaround for https://github.com/NVlabs/nvdiffrast/issues/59
9
+ def tensor(*args, **kwargs):
10
+ return torch.tensor(*args, device=device, **kwargs)
11
+
12
+ # defines a triangle in homogeneous coordinates and calls dr.rasterize to render this triangle, which may help to initialize or warm up the GPU context
13
+ pos = tensor([[[-0.8, -0.8, 0, 1], [0.8, -0.8, 0, 1], [-0.8, 0.8, 0, 1]]], dtype=torch.float32)
14
+ tri = tensor([[0, 1, 2]], dtype=torch.int32)
15
+ dr.rasterize(glctx, pos, tri, resolution=[256, 256])
16
+
17
+ # glctx = dr.RasterizeGLContext(output_db=False, device="cuda")
18
+ glctx = dr.RasterizeCudaContext(device="cuda")
19
+
20
+ class NormalsRenderer:
21
+
22
+ _glctx:dr.RasterizeCudaContext = None
23
+
24
+ def __init__(
25
+ self,
26
+ mv: torch.Tensor, #C,4,4 # normal column-major (unlike pytorch3d)
27
+ proj: torch.Tensor, #C,4,4
28
+ image_size: Tuple[int,int],
29
+ mvp = None,
30
+ device=None,
31
+ ):
32
+ if mvp is None:
33
+ self._mvp = proj @ mv #C,4,4
34
+ else:
35
+ self._mvp = mvp
36
+ self._image_size = image_size
37
+ self._glctx = glctx
38
+ _warmup(self._glctx, device)
39
+
40
+ def render(self,
41
+ vertices: torch.Tensor, #V,3 float
42
+ normals: torch.Tensor, #V,3 float in [-1, 1]
43
+ faces: torch.Tensor, #F,3 long
44
+ ) ->torch.Tensor: #C,H,W,4
45
+
46
+ V = vertices.shape[0]
47
+ faces = faces.type(torch.int32)
48
+ vert_hom = torch.cat((vertices, torch.ones(V,1,device=vertices.device)),axis=-1) #V,3 -> V,4
49
+ # transforms the vertices into clip space using the mvp matrix.
50
+ vertices_clip = vert_hom @ self._mvp.transpose(-2,-1) #C,V,4 # the .transpose(-2,-1) operation ensures that the matrix multiplication aligns with the row-major convention.
51
+ rast_out,_ = dr.rasterize(self._glctx, vertices_clip, faces, resolution=self._image_size, grad_db=False) #C,H,W,4 -> 4 includes the barycentric coordinates and other data.
52
+ vert_col = (normals+1)/2 #V,3
53
+ # this function takes the attributes (colors) defined at the vertices and computes their values at each pixel (or fragment) within the triangles
54
+ col,_ = dr.interpolate(vert_col, rast_out, faces) #C,H,W,3
55
+ alpha = torch.clamp(rast_out[..., -1:], max=1) #C,H,W,1
56
+ col = torch.concat((col,alpha),dim=-1) #C,H,W,4
57
+ col = dr.antialias(col, rast_out, vertices_clip, faces) #C,H,W,4
58
+ return col #C,H,W,4
59
+
60
+
61
+
62
+ from pytorch3d.structures import Meshes
63
+ from pytorch3d.renderer.mesh.shader import ShaderBase
64
+ from pytorch3d.renderer import (
65
+ RasterizationSettings,
66
+ MeshRendererWithFragments,
67
+ TexturesVertex,
68
+ MeshRasterizer,
69
+ BlendParams,
70
+ FoVOrthographicCameras,
71
+ look_at_view_transform,
72
+ hard_rgb_blend,
73
+ )
74
+
75
+ class VertexColorShader(ShaderBase):
76
+ def forward(self, fragments, meshes, **kwargs) -> torch.Tensor:
77
+ blend_params = kwargs.get("blend_params", self.blend_params)
78
+ texels = meshes.sample_textures(fragments)
79
+ return hard_rgb_blend(texels, fragments, blend_params)
80
+
81
+ def render_mesh_vertex_color(mesh, cameras, H, W, blur_radius=0.0, faces_per_pixel=1, bkgd=(0., 0., 0.), dtype=torch.float32, device="cuda"):
82
+ if len(mesh) != len(cameras):
83
+ if len(cameras) % len(mesh) == 0:
84
+ mesh = mesh.extend(len(cameras))
85
+ else:
86
+ raise NotImplementedError()
87
+
88
+ # render requires everything in float16 or float32
89
+ input_dtype = dtype
90
+ blend_params = BlendParams(1e-4, 1e-4, bkgd)
91
+
92
+ # Define the settings for rasterization and shading
93
+ raster_settings = RasterizationSettings(
94
+ image_size=(H, W),
95
+ blur_radius=blur_radius,
96
+ faces_per_pixel=faces_per_pixel,
97
+ clip_barycentric_coords=True,
98
+ bin_size=None,
99
+ max_faces_per_bin=None,
100
+ )
101
+
102
+ # Create a renderer by composing a rasterizer and a shader
103
+ # We simply render vertex colors through the custom VertexColorShader (no lighting or materials are used)
104
+ renderer = MeshRendererWithFragments(
105
+ rasterizer=MeshRasterizer(
106
+ cameras=cameras,
107
+ raster_settings=raster_settings
108
+ ),
109
+ shader=VertexColorShader(
110
+ device=device,
111
+ cameras=cameras,
112
+ blend_params=blend_params
113
+ )
114
+ )
115
+
116
+ # render RGB and depth, get mask
117
+ with torch.autocast(dtype=input_dtype, device_type=torch.device(device).type):
118
+ images, _ = renderer(mesh)
119
+ return images # BHW4
120
+
121
+ class Pytorch3DNormalsRenderer: # 100 times slower!!!
122
+ def __init__(self, cameras, image_size, device):
123
+ self.cameras = cameras.to(device)
124
+ self._image_size = image_size
125
+ self.device = device
126
+
127
+ def render(self,
128
+ vertices: torch.Tensor, #V,3 float
129
+ normals: torch.Tensor, #V,3 float in [-1, 1]
130
+ faces: torch.Tensor, #F,3 long
131
+ ) ->torch.Tensor: #C,H,W,4
132
+ mesh = Meshes(verts=[vertices], faces=[faces], textures=TexturesVertex(verts_features=[(normals + 1) / 2])).to(self.device)
133
+ return render_mesh_vertex_color(mesh, self.cameras, self._image_size[0], self._image_size[1], device=self.device)
134
+
135
+ def save_tensor_to_img(tensor, save_dir):
136
+ from PIL import Image
137
+ import numpy as np
138
+ for idx, img in enumerate(tensor):
139
+ img = img[..., :3].cpu().numpy()
140
+ img = (img * 255).astype(np.uint8)
141
+ img = Image.fromarray(img)
142
+ img.save(save_dir + f"{idx}.png")
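Note: a hedged sketch of driving NormalsRenderer directly; mv and proj come from the star-camera helpers in func.py, and the resolution is illustrative:

    normals = calc_vertex_normals(vertices, faces)      # V,3 in [-1, 1]
    renderer = NormalsRenderer(mv, proj, (512, 512))
    images = renderer.render(vertices, normals, faces)  # C,H,W,4; RGB stores (n + 1) / 2, alpha is coverage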
models/ISOMER/model/__init__.py ADDED
File without changes
models/ISOMER/model/inference_pipeline.py ADDED
@@ -0,0 +1,189 @@
1
+ import os
2
+ import numpy as np
3
+ import torch
4
+ from PIL import Image
5
+
6
+ from pytorch3d.structures import Meshes
7
+ from pytorch3d.renderer import TexturesVertex
8
+
9
+ from ..scripts.fast_geo import fast_geo, create_sphere, create_box
10
+ from ..scripts.project_mesh import get_cameras_list_azi_ele
11
+ from ..mesh_reconstruction.recon import reconstruct_stage1
12
+ from ..mesh_reconstruction.refine import run_mesh_refine
13
+ from ..mesh_reconstruction.func import make_star_cameras_orthographic, make_star_cameras_perspective
14
+
15
+ from ..data.utils import (
16
+ simple_remove_bkg_normal,
17
+ load_glb,
18
+ load_obj_with_verts_faces)
19
+ from ..scripts.utils import (
20
+ to_pyml_mesh,
21
+ simple_clean_mesh,
22
+ normal_rotation_img2img_c2w,
23
+ rotate_normal_R,
24
+ get_rotation_matrix_azi_ele,
25
+ manage_elevation_azimuth)
26
+
27
+ @torch.enable_grad()
28
+ def reconstruction_pipe(normal_pils,
29
+ rotation_angles_azi,
30
+ rotation_angles_ele,
31
+ front_index=0,
32
+ back_index=2,
33
+ side_index=1,
34
+ weights=None,
35
+ expansion_weight=0.1,
36
+ expansion_weight_stage2=0.0,
37
+ init_type="ball",
38
+ sphere_r=None, # only used if init_type=="ball"
39
+ box_width=1.0, # only used if init_type=="box"
40
+ box_length=1.0, # only used if init_type=="box"
41
+ box_height=1.0, # only used if init_type=="box"
42
+ init_verts=None,
43
+ init_faces=None,
44
+ init_mesh_from_file="",
45
+ stage1_steps=200,
46
+ stage2_steps=200,
47
+ projection_type="orthographic",
48
+ fovy=None,
49
+ radius=None,
50
+ ortho_dist=1.1,
51
+ camera_angles_azi=None,
52
+ camera_angles_ele=None,
53
+ rm_bkg=False,
54
+ rm_bkg_with_rembg=False, # only used if rm_bkg
55
+ normal_rotation_R=None,
56
+ train_stage1=True,
57
+ train_stage2=True,
58
+ use_remesh_stage1=True,
59
+ use_remesh_stage2=True,
60
+ start_edge_len_stage1=0.1,
61
+ end_edge_len_stage1=0.02,
62
+ start_edge_len_stage2=0.02,
63
+ end_edge_len_stage2=0.005,
64
+ ):
65
+
66
+ assert projection_type in ['perspective', 'orthographic'], f"projection_type ({projection_type}) should be one of ['perspective', 'orthographic']"
67
+
68
+ if stage1_steps == 0:
69
+ train_stage1 = False
70
+ if stage2_steps == 0:
71
+ train_stage2 = False
72
+
73
+ if normal_rotation_R is not None:
74
+ assert normal_rotation_R.shape[-2] == 3 and normal_rotation_R.shape[-1] == 3
75
+ assert len(normal_rotation_R.shape) == 2
76
+ normal_rotation_R = normal_rotation_R.float()
77
+
78
+ camera_angles_azi = camera_angles_azi.float()
79
+ camera_angles_ele = camera_angles_ele.float()
80
+
81
+ camera_angles_ele, camera_angles_azi = manage_elevation_azimuth(camera_angles_ele, camera_angles_azi)
82
+
83
+ if init_type in ["std", "thin"]:
84
+ assert camera_angles_azi[front_index]%360==0, f"the camera_angles_azi associated with front image (index {front_index}) should be 0 not {camera_angles_azi[front_index]}"
85
+ assert camera_angles_azi[back_index]%360==180, f"the camera_angles_azi associated with back image (index {back_index}) should be 180 not {camera_angles_azi[back_index]}"
86
+ assert camera_angles_azi[side_index]%360==90, f"the camera_angles_azi associated with left side image (index {side_index}) should be 90, not {camera_angles_azi[side_index]}"
87
+
88
+ if rm_bkg:
89
+ if rm_bkg_with_rembg:
90
+ os.environ["OMP_NUM_THREADS"] = '8'
91
+ normal_pils = simple_remove_bkg_normal(normal_pils,rm_bkg_with_rembg)
92
+
93
+ if rotation_angles_azi is not None:
94
+ rotation_angles_azi = -rotation_angles_azi.float()
95
+ rotation_angles_ele = rotation_angles_ele.float()
96
+
97
+ rotation_angles_ele, rotation_angles_azi = manage_elevation_azimuth(rotation_angles_ele, rotation_angles_azi)
98
+
99
+ assert len(normal_pils) == len(rotation_angles_azi), f'len(normal_pils) ({len(normal_pils)}) != len(rotation_angles_azi) ({len(rotation_angles_azi)})'
100
+ if rotation_angles_ele is None:
101
+ rotation_angles_ele = [0] * len(normal_pils)
102
+
103
+ normal_pils_rotated = []
104
+ for i in range(len(normal_pils)):
105
+ c2w_R = get_rotation_matrix_azi_ele(rotation_angles_azi[i], rotation_angles_ele[i])
106
+
107
+ rotated_ = normal_rotation_img2img_c2w(normal_pils[i], c2w=c2w_R)
108
+ normal_pils_rotated.append(rotated_)
109
+
110
+ normal_pils = normal_pils_rotated
111
+
112
+ if normal_rotation_R is not None:
113
+ normal_pils_rotated = []
114
+ for i in range(len(normal_pils)):
115
+ rotated_ = rotate_normal_R(normal_pils[i], normal_rotation_R, save_addr="", device="cuda")
116
+ normal_pils_rotated.append(rotated_)
117
+
118
+ normal_pils = normal_pils_rotated
119
+
120
+ normal_stg1 = [img for img in normal_pils]
121
+
122
+ if init_type in ['thin', 'std']:
123
+ front_ = normal_stg1[front_index]
124
+ back_ = normal_stg1[back_index]
125
+ side_ = normal_stg1[side_index]
126
+ meshes, depth_front, depth_back, mesh_front, mesh_back = fast_geo(front_, back_, side_, init_type=init_type, return_depth_and_sep_mesh=True)
127
+
128
+
129
+ elif init_type in ["ball", "box"]:
130
+
131
+ if init_type == "ball":
132
+ assert sphere_r is not None, f"sphere_r ({sphere_r}) should not be None when init_type is 'ball'"
133
+ meshes = create_sphere(sphere_r)
134
+
135
+ if init_type == "box":
136
+ assert box_width is not None and box_length is not None and box_height is not None, f"box_width ({box_width}), box_length ({box_length}), and box_height ({box_height}) should not be None when init_type is 'box'"
137
+ meshes = create_box(width=box_width, length=box_length, height=box_height)
138
+
139
+ # add texture just in case
140
+ num_meshes = len(meshes)
141
+ num_verts_per_mesh = meshes.verts_packed().shape[0] // num_meshes
142
+ black_texture = torch.zeros((num_meshes, num_verts_per_mesh, 3), device="cuda")
143
+ textures = TexturesVertex(verts_features=black_texture)
144
+ meshes.textures = textures
145
+
146
+ elif init_type == "file":
147
+ assert init_mesh_from_file or (init_verts is not None and init_faces is not None), f"init_mesh_from_file ({init_mesh_from_file}) should not be None when init_type is 'file', else init_verts and init_faces should not be None"
148
+
149
+ if init_verts is not None and init_faces is not None:
150
+ meshes = Meshes(verts=[init_verts], faces=[init_faces]).to('cuda')
151
+ elif init_mesh_from_file.endswith('.glb'):
152
+ meshes = load_glb(init_mesh_from_file).to('cuda')
153
+ else:
154
+ meshes = load_obj_with_verts_faces(init_mesh_from_file).to('cuda')
155
+
156
+ # add texture just in case
157
+ num_meshes = len(meshes)
158
+ num_verts_per_mesh = meshes.verts_packed().shape[0] // num_meshes
159
+ black_texture = torch.zeros((num_meshes, num_verts_per_mesh, 3), device="cuda")
160
+ textures = TexturesVertex(verts_features=black_texture)
161
+ meshes.textures = textures
162
+
163
+ if projection_type == 'perspective':
164
+ assert fovy is not None and radius is not None, f"fovy ({fovy}) and radius ({radius}) should not be None when projection_type is 'perspective'"
165
+ cameras = get_cameras_list_azi_ele(camera_angles_azi, camera_angles_ele, fov_in_degrees=fovy,device="cuda", dist=radius, cam_type='fov')
166
+
167
+ elif projection_type == 'orthographic':
168
+ cameras = get_cameras_list_azi_ele(camera_angles_azi, camera_angles_ele, fov_in_degrees=fovy, device="cuda", focal=1., dist=ortho_dist, cam_type='orthographic')
169
+
170
+ vertices, faces = meshes.verts_list()[0], meshes.faces_list()[0]
171
+
172
+ render_camera_angles_azi = -camera_angles_azi
173
+ render_camera_angles_ele = camera_angles_ele
174
+ if projection_type == 'orthographic':
175
+ mv, proj = make_star_cameras_orthographic(render_camera_angles_azi, render_camera_angles_ele)
176
+ else:
177
+ mv, proj = make_star_cameras_perspective(render_camera_angles_azi, render_camera_angles_ele, distance=radius, r=radius, fov=fovy, device='cuda')
178
+
179
+ # stage 1
180
+ if train_stage1:
181
+ vertices, faces = reconstruct_stage1(normal_stg1, mv=mv, proj=proj, steps=stage1_steps, vertices=vertices, faces=faces, start_edge_len=start_edge_len_stage1, end_edge_len=end_edge_len_stage1, gain=0.05, return_mesh=False, loss_expansion_weight=expansion_weight, use_remesh=use_remesh_stage1)
182
+
183
+ # stage 2
184
+ if train_stage2:
185
+ vertices, faces = run_mesh_refine(vertices, faces, normal_pils, mv=mv, proj=proj, weights=weights, steps=stage2_steps, start_edge_len=start_edge_len_stage2, end_edge_len=end_edge_len_stage2, decay=0.99, update_normal_interval=20, update_warmup=5, return_mesh=False, process_inputs=False, process_outputs=False, cameras=cameras, use_remesh=use_remesh_stage2, loss_expansion_weight=expansion_weight_stage2)
186
+
187
+ meshes = simple_clean_mesh(to_pyml_mesh(vertices, faces), apply_smooth=True, stepsmoothnum=1, apply_sub_divide=True, sub_divide_threshold=0.25).to("cuda")
188
+
189
+ return meshes
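Note: a minimal, hedged invocation of reconstruction_pipe with the sphere initialisation (angles, weights and radius are illustrative; perspective mode additionally needs fovy and radius):

    import torch
    meshes = reconstruction_pipe(
        normal_pils=normal_pils,                         # list of RGBA normal maps (PIL)
        rotation_angles_azi=None, rotation_angles_ele=None,
        camera_angles_azi=torch.tensor([0., 90., 180., 270.]),
        camera_angles_ele=torch.zeros(4),
        weights=[1., 1., 1., 1.],                        # per-view projection weights, illustrative
        init_type="ball", sphere_r=0.5,
        projection_type="orthographic",
    )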
models/ISOMER/projection_func.py ADDED
@@ -0,0 +1,86 @@
1
+ import os
2
+ import numpy as np
3
+ import torch
4
+ from PIL import Image
5
+ import os
6
+ from .scripts.proj_commands import projection as isomer_projection
7
+ from .data.utils import simple_remove_bkg_normal
8
+
9
+ # mesh_address,
10
+ def projection(
11
+ meshes,
12
+ masks,
13
+ images,
14
+ azimuths,
15
+ elevations,
16
+ weights,
17
+ fov,
18
+ radius,
19
+ save_dir,
20
+ save_glb_addr=None,
21
+ remove_background=False,
22
+ auto_center=False,
23
+ projection_type="perspective",
24
+ below_confidence_strategy="smooth",
25
+ complete_unseen=True,
26
+ mesh_scale_factor=1.0,
27
+ rm_bkg_with_rembg=True,
28
+ ):
29
+
30
+ if save_glb_addr is None:
31
+ os.makedirs(save_dir, exist_ok=True)
32
+ save_glb_addr=os.path.join(save_dir, "rgb_projected.glb")
33
+
34
+ bs = len(images)
35
+ assert len(azimuths) == bs, f'len(azimuths) ({len(azimuths)} != batchsize ({bs}))'
36
+ assert len(elevations) == bs, f'len(elevations) ({len(elevations)} != batchsize ({bs}))'
37
+ assert len(weights) == bs, f'len(weights) ({len(weights)} != batchsize ({bs}))'
38
+
39
+ image_rgba = torch.cat([images[:,:,:,:3], masks.unsqueeze(-1)], dim=-1)
40
+
41
+ assert image_rgba.shape[-1] == 4, f'image_rgba.shape is {image_rgba.shape}'
42
+
43
+ img_list = [Image.fromarray((image.cpu()*255).numpy().astype(np.uint8)) for image in image_rgba]
44
+
45
+
46
+ if remove_background:
47
+ if rm_bkg_with_rembg:
48
+ os.environ["OMP_NUM_THREADS"] = '8'
49
+ img_list = simple_remove_bkg_normal(img_list, rm_bkg_with_rembg, return_Image=True)
50
+
51
+ resolution = img_list[0].size[0]
52
+ new_img_list = []
53
+ for i in range(len(img_list)):
54
+ new_img = img_list[i].resize((resolution,resolution))
55
+
56
+ path_dir = os.path.join(save_dir, f'projection_images')
57
+ os.makedirs(path_dir, exist_ok=True)
58
+
59
+ path_ = os.path.join(path_dir, f'ProjectionImg{i}.png')
60
+
61
+ new_img.save(path_)
62
+
63
+ new_img_list.append(new_img)
64
+
65
+ img_list = new_img_list
66
+
67
+ isomer_projection(meshes,
68
+ img_list=img_list,
69
+ weights=weights,
70
+ azimuths=azimuths,
71
+ elevations=elevations,
72
+ projection_type=projection_type,
73
+ auto_center=auto_center,
74
+ resolution=resolution,
75
+ fovy=fov,
76
+ radius=radius,
77
+ scale_factor=mesh_scale_factor,
78
+ save_glb_addr=save_glb_addr,
79
+ scale_verts=True,
80
+ complete_unseen=complete_unseen,
81
+ below_confidence_strategy=below_confidence_strategy
82
+ )
83
+
84
+ return save_glb_addr
85
+
86
+
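Note: a hedged call sketch for projection(); images is expected as a B,H,W,3 float tensor in [0, 1] and masks as B,H,W, matching the RGBA concatenation above (fov, radius and the output directory are illustrative):

    glb_path = projection(meshes, masks=masks, images=images,
                          azimuths=azimuths, elevations=elevations, weights=weights,
                          fov=30, radius=4.0, save_dir="./isomer_out")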
models/ISOMER/reconstruction_func.py ADDED
@@ -0,0 +1,88 @@
1
+ import os
2
+ import numpy as np
3
+ import torch
4
+ from PIL import Image
5
+ import os
6
+ from .model.inference_pipeline import reconstruction_pipe
7
+
8
+ def reconstruction(
9
+ normal_pils,
10
+ masks,
11
+ weights,
12
+ fov,
13
+ radius,
14
+ camera_angles_azi,
15
+ camera_angles_ele,
16
+ expansion_weight_stage1=0.1,
17
+ init_type="ball",
18
+ init_verts=None,
19
+ init_faces=None,
20
+ init_mesh_from_file="",
21
+ stage1_steps=200,
22
+ stage2_steps=200,
23
+ projection_type="perspective",
24
+ need_normal_rotation=False,
25
+ rotation_angles_azi=None, # only used if need_normal_rotation
26
+ rotation_angles_ele=None, # only used if need_normal_rotation
27
+ normal_rotation_R=None, # only used if need_normal_rotation
28
+ rm_bkg=False,
29
+ rm_bkg_with_rembg=True, # only used if rm_bkg
30
+ start_edge_len_stage1=0.1,
31
+ end_edge_len_stage1=0.02,
32
+ start_edge_len_stage2=0.02,
33
+ end_edge_len_stage2=0.005,
34
+ expansion_weight_stage2=0.0,
35
+ ):
36
+
37
+ if init_type == "file":
38
+ assert ((init_verts is not None and init_faces is not None) or init_mesh_from_file), f'init_mesh_from_file or (init_verts and init_faces) must be provided if init_type=="file"'
39
+
40
+ if not need_normal_rotation:
41
+ rotation_angles_azi = None
42
+ rotation_angles_ele = None
43
+ normal_rotation_R = None
44
+
45
+ bs = len(normal_pils)
46
+
47
+ assert len(camera_angles_azi) == bs, f'len(camera_angles_azi) ({len(camera_angles_azi)} != batchsize ({bs}))'
48
+ assert len(camera_angles_ele) == bs, f'len(camera_angles_ele) ({len(camera_angles_ele)} != batchsize ({bs}))'
49
+
50
+ normal_pils_rgba = torch.cat([normal_pils[:,:,:,:3], masks.unsqueeze(-1)], dim=-1)
51
+
52
+ assert normal_pils_rgba.shape[-1] == 4, f'normal_pils_rgba.shape is {normal_pils_rgba.shape}'
53
+
54
+
55
+ normal_pils = [Image.fromarray((normal_pil.cpu()*255).numpy().astype(np.uint8)) for normal_pil in normal_pils_rgba]
56
+
57
+
58
+ meshes = reconstruction_pipe(
59
+ normal_pils=normal_pils,
60
+ rotation_angles_azi=rotation_angles_azi,
61
+ rotation_angles_ele=rotation_angles_ele,
62
+ weights=weights,
63
+ expansion_weight=expansion_weight_stage1,
64
+ init_type=init_type,
65
+ stage1_steps=stage1_steps,
66
+ stage2_steps=stage2_steps,
67
+ projection_type=projection_type,
68
+ fovy=fov,
69
+ radius=radius,
70
+ camera_angles_azi=camera_angles_azi,
71
+ camera_angles_ele=camera_angles_ele,
72
+ rm_bkg=rm_bkg, rm_bkg_with_rembg=rm_bkg_with_rembg,
73
+ normal_rotation_R=normal_rotation_R,
74
+ init_mesh_from_file=init_mesh_from_file,
75
+ start_edge_len_stage1=start_edge_len_stage1,
76
+ end_edge_len_stage1=end_edge_len_stage1,
77
+ start_edge_len_stage2=start_edge_len_stage2,
78
+ end_edge_len_stage2=end_edge_len_stage2,
79
+ expansion_weight_stage2=expansion_weight_stage2,
80
+ init_verts=init_verts,
81
+ init_faces=init_faces,
82
+
83
+ )
84
+
85
+
86
+ return meshes
87
+
88
+
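Note: a hedged call sketch for the reconstruction wrapper above; normal_pils here is a B,H,W,3 tensor in [0, 1] and masks a B,H,W mask, and with init_type="std" the views must include the 0/90/180-degree front/side/back azimuths asserted in inference_pipeline.py (values illustrative):

    import torch
    meshes = reconstruction(
        normal_pils=normal_maps,                        # B,H,W,3 tensor in [0, 1]
        masks=masks,                                    # B,H,W foreground masks
        weights=[2.0, 0.05, 1.0, 0.05],                 # per-view weights, illustrative
        fov=30, radius=4.0,
        camera_angles_azi=torch.tensor([0., 90., 180., 270.]),
        camera_angles_ele=torch.zeros(4),
        init_type="std",
    )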
models/ISOMER/scripts/__init__.py ADDED
File without changes
models/ISOMER/scripts/all_typing.py ADDED
@@ -0,0 +1,42 @@
1
+ # code from https://github.com/threestudio-project
2
+
3
+ """
4
+ This module contains type annotations for the project, using
5
+ 1. Python type hints (https://docs.python.org/3/library/typing.html) for Python objects
6
+ 2. jaxtyping (https://github.com/google/jaxtyping/blob/main/API.md) for PyTorch tensors
7
+
8
+ Two kinds of type checking can be used:
9
+ 1. Static type checking with mypy (install with pip and enabled as the default linter in VSCode)
10
+ 2. Runtime type checking with typeguard (install with pip and triggered at runtime, mainly for tensor dtype and shape checking)
11
+ """
12
+
13
+ # Basic types
14
+ from typing import (
15
+ Any,
16
+ Callable,
17
+ Dict,
18
+ Iterable,
19
+ List,
20
+ Literal,
21
+ NamedTuple,
22
+ NewType,
23
+ Optional,
24
+ Sized,
25
+ Tuple,
26
+ Type,
27
+ TypeVar,
28
+ Union,
29
+ )
30
+
31
+ # Tensor dtype
32
+ # for jaxtyping usage, see https://github.com/google/jaxtyping/blob/main/API.md
33
+ from jaxtyping import Bool, Complex, Float, Inexact, Int, Integer, Num, Shaped, UInt
34
+
35
+ # Config type
36
+ from omegaconf import DictConfig
37
+
38
+ # PyTorch Tensor type
39
+ from torch import Tensor
40
+
41
+ # Runtime type checking decorator
42
+ from typeguard import typechecked as typechecker
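Note: a small illustration of the annotation style this module enables (the function is hypothetical):

    from torch import Tensor
    from jaxtyping import Float

    def scale_verts(verts: Float[Tensor, "V 3"], s: float) -> Float[Tensor, "V 3"]:
        # shaped-tensor annotation; typeguard can check it at runtime
        return verts * s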
models/ISOMER/scripts/fast_geo.py ADDED
@@ -0,0 +1,86 @@
1
+ import os
2
+ from PIL import Image
3
+ from .mesh_init import build_mesh, calc_w_over_h, fix_border_with_pymeshlab_fast
4
+ from pytorch3d.structures import Meshes, join_meshes_as_scene
5
+ import numpy as np
6
+
7
+ import torch
8
+ from pytorch3d.structures import Meshes
9
+ from pytorch3d.utils import ico_sphere
10
+
11
+ def create_sphere(radius, device='cuda'):
12
+
13
+ sphere_mesh = ico_sphere(3, device=device) # Increase the subdivision level (e.g., 2) for higher resolution sphere
14
+ sphere_mesh = sphere_mesh.scale_verts(radius)
15
+
16
+ meshes = Meshes(verts=[sphere_mesh.verts_list()[0]], faces=[sphere_mesh.faces_list()[0]])
17
+ return meshes
18
+
19
+
20
+ def create_box(width, length, height, device='cuda'):
21
+ """
22
+ Create a box mesh given the width, length, and height.
23
+
24
+ Args:
25
+ width (float): Width of the box.
26
+ length (float): Length of the box.
27
+ height (float): Height of the box.
28
+ device (str): Device for the tensor operations, default is 'cuda'.
29
+
30
+ Returns:
31
+ Meshes: A PyTorch3D Meshes object representing the box.
32
+ """
33
+ # Define the 8 vertices of the box
34
+ verts = torch.tensor([
35
+ [-width / 2, -length / 2, -height / 2],
36
+ [ width / 2, -length / 2, -height / 2],
37
+ [ width / 2, length / 2, -height / 2],
38
+ [-width / 2, length / 2, -height / 2],
39
+ [-width / 2, -length / 2, height / 2],
40
+ [ width / 2, -length / 2, height / 2],
41
+ [ width / 2, length / 2, height / 2],
42
+ [-width / 2, length / 2, height / 2]
43
+ ], device=device)
44
+
45
+ # Define the 12 triangles (faces) of the box using vertex indices
46
+ faces = torch.tensor([
47
+ [0, 1, 2], [0, 2, 3], # Bottom face
48
+ [4, 5, 6], [4, 6, 7], # Top face
49
+ [0, 1, 5], [0, 5, 4], # Front face
50
+ [1, 2, 6], [1, 6, 5], # Right face
51
+ [2, 3, 7], [2, 7, 6], # Back face
52
+ [3, 0, 4], [3, 4, 7] # Left face
53
+ ], device=device)
54
+
55
+ # Create the Meshes object
56
+ meshes = Meshes(verts=[verts], faces=[faces])
57
+
58
+ return meshes
59
+
60
+
61
+ # stage 0: initial mesh estimation
62
+ def fast_geo(front_normal: Image.Image, back_normal: Image.Image, side_normal: Image.Image, clamp=0., init_type="std", return_depth_and_sep_mesh=False):
63
+
64
+ import time
65
+ assert front_normal.mode != "RGB"
66
+ assert back_normal.mode != "RGB"
67
+ assert side_normal.mode != "RGB"
68
+
69
+ front_normal = front_normal.resize((192, 192))
70
+ back_normal = back_normal.resize((192, 192))
71
+ side_normal = side_normal.resize((192, 192))
72
+
73
+ # build mesh with front back projection # ~3s
74
+ side_w_over_h = calc_w_over_h(side_normal)
75
+ mesh_front, depth_front = build_mesh(front_normal, front_normal, clamp_min=clamp, scale=side_w_over_h, init_type=init_type, return_depth=True)
76
+ mesh_back, depth_back = build_mesh(back_normal, back_normal, is_back=True, clamp_min=clamp, scale=side_w_over_h, init_type=init_type, return_depth=True)
77
+ meshes = join_meshes_as_scene([mesh_front, mesh_back])
78
+
79
+ # poisson reconstruction which guarantees a smooth connection between meshes
80
+ # and simplify down to 2000 faces
81
+ meshes = fix_border_with_pymeshlab_fast(meshes, poissson_depth=6, simplification=2000)
82
+
83
+
84
+ if return_depth_and_sep_mesh:
85
+ return meshes, depth_front, depth_back, mesh_front, mesh_back
86
+ return meshes
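Note: quick usage of the primitive initialisers above (the radius and box extents are illustrative):

    sphere = create_sphere(0.5)                           # ico-sphere scaled to radius 0.5
    box = create_box(width=1.0, length=1.0, height=1.5)   # 8 vertices, 12 triangles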
models/ISOMER/scripts/load_onnx.py ADDED
@@ -0,0 +1,48 @@
1
+ import onnxruntime
2
+ import torch
3
+
4
+ providers = [
5
+ ('TensorrtExecutionProvider', {
6
+ 'device_id': 0,
7
+ 'trt_max_workspace_size': 8 * 1024 * 1024 * 1024,
8
+ 'trt_fp16_enable': True,
9
+ 'trt_engine_cache_enable': True,
10
+ }),
11
+ ('CUDAExecutionProvider', {
12
+ 'device_id': 0,
13
+ 'arena_extend_strategy': 'kSameAsRequested',
14
+ 'gpu_mem_limit': 8 * 1024 * 1024 * 1024,
15
+ 'cudnn_conv_algo_search': 'HEURISTIC',
16
+ })
17
+ ]
18
+
19
+ def load_onnx(file_path: str):
20
+ assert file_path.endswith(".onnx")
21
+ sess_opt = onnxruntime.SessionOptions()
22
+ ort_session = onnxruntime.InferenceSession(file_path, sess_opt=sess_opt, providers=providers)
23
+ return ort_session
24
+
25
+
26
+ def load_onnx_caller(file_path: str, single_output=False):
27
+ ort_session = load_onnx(file_path)
28
+ def caller(*args):
29
+ torch_input = isinstance(args[0], torch.Tensor)
30
+ if torch_input:
31
+ torch_input_dtype = args[0].dtype
32
+ torch_input_device = args[0].device
33
+ # check all are torch.Tensor and have same dtype and device
34
+ assert all([isinstance(arg, torch.Tensor) for arg in args]), "All inputs should be torch.Tensor, if first input is torch.Tensor"
35
+ assert all([arg.dtype == torch_input_dtype for arg in args]), "All inputs should have same dtype, if first input is torch.Tensor"
36
+ assert all([arg.device == torch_input_device for arg in args]), "All inputs should have same device, if first input is torch.Tensor"
37
+ args = [arg.cpu().float().numpy() for arg in args]
38
+
39
+ ort_inputs = {ort_session.get_inputs()[idx].name: args[idx] for idx in range(len(args))}
40
+ ort_outs = ort_session.run(None, ort_inputs)
41
+
42
+ if torch_input:
43
+ ort_outs = [torch.tensor(ort_out, dtype=torch_input_dtype, device=torch_input_device) for ort_out in ort_outs]
44
+
45
+ if single_output:
46
+ return ort_outs[0]
47
+ return ort_outs
48
+ return caller
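Note: a hedged sketch of the ONNX wrapper above; the model path is hypothetical, and torch inputs are converted to numpy then returned with the original dtype and device:

    import torch
    run_model = load_onnx_caller("checkpoints/example.onnx", single_output=True)  # hypothetical path
    out = run_model(torch.rand(1, 3, 256, 256, device="cuda"))                    # comes back as a CUDA tensor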
models/ISOMER/scripts/mesh_init.py ADDED
@@ -0,0 +1,142 @@
1
+ from PIL import Image
2
+ import torch
3
+ import numpy as np
4
+ from pytorch3d.structures import Meshes
5
+ from pytorch3d.renderer import TexturesVertex
6
+ from .utils import meshlab_mesh_to_py3dmesh, py3dmesh_to_meshlab_mesh
7
+ import pymeshlab
8
+
9
+ _MAX_THREAD = 8
10
+
11
+ # rgb and depth to mesh
12
+ def get_ortho_ray_directions_origins(W, H, use_pixel_centers=True, device="cuda"):
13
+ pixel_center = 0.5 if use_pixel_centers else 0
14
+ i, j = np.meshgrid(
15
+ np.arange(W, dtype=np.float32) + pixel_center,
16
+ np.arange(H, dtype=np.float32) + pixel_center,
17
+ indexing='xy'
18
+ )
19
+ i, j = torch.from_numpy(i).to(device), torch.from_numpy(j).to(device)
20
+
21
+ origins = torch.stack([(i/W-0.5)*2, (j/H-0.5)*2 * H / W, torch.zeros_like(i)], dim=-1) # W, H, 3
22
+ directions = torch.stack([torch.zeros_like(i), torch.zeros_like(j), torch.ones_like(i)], dim=-1) # W, H, 3
23
+
24
+ return origins, directions
25
+
26
+ def depth_and_color_to_mesh(rgb_BCHW, pred_HWC, valid_HWC=None, is_back=False):
27
+ if valid_HWC is None:
28
+ valid_HWC = torch.ones_like(pred_HWC).bool()
29
+ H, W = rgb_BCHW.shape[-2:]
30
+ rgb_BCHW = rgb_BCHW.flip(-2)
31
+ pred_HWC = pred_HWC.flip(0)
32
+ valid_HWC = valid_HWC.flip(0)
33
+ rays_o, rays_d = get_ortho_ray_directions_origins(W, H, device=rgb_BCHW.device)
34
+ verts = rays_o + rays_d * pred_HWC # [H, W, 3]
35
+ verts = verts.reshape(-1, 3) # [V, 3]
36
+ indexes = torch.arange(H * W).reshape(H, W).to(rgb_BCHW.device)
37
+ faces1 = torch.stack([indexes[:-1, :-1], indexes[:-1, 1:], indexes[1:, :-1]], dim=-1)
38
+ # faces1_valid = valid_HWC[:-1, :-1] | valid_HWC[:-1, 1:] | valid_HWC[1:, :-1]
39
+ faces1_valid = valid_HWC[:-1, :-1] & valid_HWC[:-1, 1:] & valid_HWC[1:, :-1]
40
+ faces2 = torch.stack([indexes[1:, 1:], indexes[1:, :-1], indexes[:-1, 1:]], dim=-1)
41
+ # faces2_valid = valid_HWC[1:, 1:] | valid_HWC[1:, :-1] | valid_HWC[:-1, 1:]
42
+ faces2_valid = valid_HWC[1:, 1:] & valid_HWC[1:, :-1] & valid_HWC[:-1, 1:]
43
+ faces = torch.cat([faces1[faces1_valid.expand_as(faces1)].reshape(-1, 3),
44
+ faces2[faces2_valid.expand_as(faces2)].reshape(-1, 3)],
45
+ dim=0) # (F, 3)
46
+ colors = (rgb_BCHW[0].permute((1,2,0)) / 2 + 0.5).reshape(-1, 3) # (V, 3)
47
+ if is_back:
48
+ verts = verts * torch.tensor([-1, 1, -1], dtype=verts.dtype, device=verts.device)
49
+
50
+ used_verts = faces.unique()
51
+ old_to_new_mapping = torch.zeros_like(verts[..., 0]).long()
52
+ old_to_new_mapping[used_verts] = torch.arange(used_verts.shape[0], device=verts.device)
53
+ new_faces = old_to_new_mapping[faces]
54
+ mesh = Meshes(verts=[verts[used_verts]], faces=[new_faces], textures=TexturesVertex(verts_features=[colors[used_verts]]))
55
+ return mesh
56
+
57
+ def normalmap_to_depthmap(normal_np):
58
+ from .normal_to_height_map import estimate_height_map
59
+ height = estimate_height_map(normal_np, raw_values=True, thread_count=_MAX_THREAD, target_iteration_count=96)
60
+ return height
61
+
62
+ def transform_back_normal_to_front(normal_pil):
63
+ arr = np.array(normal_pil) # in [0, 255]
64
+ arr[..., 0] = 255-arr[..., 0]
65
+ arr[..., 2] = 255-arr[..., 2]
66
+ return Image.fromarray(arr.astype(np.uint8))
67
+
68
+ def calc_w_over_h(normal_pil):
69
+ if isinstance(normal_pil, Image.Image):
70
+ arr = np.array(normal_pil)
71
+ else:
72
+ assert isinstance(normal_pil, np.ndarray)
73
+ arr = normal_pil
74
+ if arr.shape[-1] == 4:
75
+ alpha = arr[..., -1] / 255.
76
+ alpha[alpha >= 0.5] = 1
77
+ alpha[alpha < 0.5] = 0
78
+ else:
79
+ alpha = ~(arr.min(axis=-1) >= 250)
80
+ h_min, w_min = np.min(np.where(alpha), axis=1)
81
+ h_max, w_max = np.max(np.where(alpha), axis=1)
82
+ return (w_max - w_min) / (h_max - h_min)
83
+
84
+ def build_mesh(normal_pil, rgb_pil, is_back=False, clamp_min=-1, scale=0.3, init_type="std", offset=0, return_depth=False):
85
+ if is_back:
86
+ normal_pil = transform_back_normal_to_front(normal_pil)
87
+ normal_img = np.array(normal_pil)
88
+ rgb_img = np.array(rgb_pil)
89
+ if normal_img.shape[-1] == 4:
90
+ valid_HWC = normal_img[..., [3]] / 255
91
+ elif rgb_img.shape[-1] == 4:
92
+ valid_HWC = rgb_img[..., [3]] / 255
93
+ else:
94
+ raise ValueError("invalid input, either normal or rgb should have alpha channel")
95
+
96
+ # object area pixels height
97
+ real_height_pix = np.max(np.where(valid_HWC>0.5)[0]) - np.min(np.where(valid_HWC>0.5)[0])
98
+
99
+ heights = normalmap_to_depthmap(normal_img)
100
+ rgb_BCHW = torch.from_numpy(rgb_img[..., :3] / 255.).permute((2,0,1))[None]
101
+ valid_HWC[valid_HWC < 0.5] = 0
102
+ valid_HWC[valid_HWC >= 0.5] = 1
103
+ valid_HWC = torch.from_numpy(valid_HWC).bool()
104
+
105
+ if init_type == "std":
106
+ # accurate but not stable
107
+ pred_HWC = torch.from_numpy(heights / heights.max() * (real_height_pix / heights.shape[0]) * scale * 2).float()[..., None]
108
+ elif init_type == "thin":
109
+ heights = heights - heights.min()
110
+ heights = (heights / heights.max() * 0.2)
111
+ pred_HWC = torch.from_numpy(heights * scale).float()[..., None]
112
+ else:
113
+ # stable but not accurate
114
+ heights = heights - heights.min()
115
+ heights = (heights / heights.max() * (1-offset)) + offset # to [offset, 1]
116
+ pred_HWC = torch.from_numpy(heights * scale).float()[..., None]
117
+
118
+ # set the border pixels to 0 height
119
+ import cv2
120
+ # edge filter
121
+ edge = cv2.Canny((valid_HWC[..., 0] * 255).numpy().astype(np.uint8), 0, 255)
122
+ edge = torch.from_numpy(edge).bool()[..., None]
123
+ pred_HWC[edge] = 0
124
+
125
+ valid_HWC[pred_HWC < clamp_min] = False
126
+ rt_mesh = depth_and_color_to_mesh(rgb_BCHW.cuda(), pred_HWC.cuda(), valid_HWC.cuda(), is_back)
127
+
128
+ if return_depth:
129
+ return rt_mesh, pred_HWC
130
+ return rt_mesh
131
+
132
+ # poisson reconstruction which guarantees a smooth connection between meshes
133
+ # and optionally simplify down to the requested target face count
134
+ def fix_border_with_pymeshlab_fast(meshes: Meshes, poissson_depth=6, simplification=0):
135
+ ms = pymeshlab.MeshSet()
136
+ ms.add_mesh(py3dmesh_to_meshlab_mesh(meshes), "cube_vcolor_mesh")
137
+ if simplification > 0:
138
+ ms.apply_filter('meshing_decimation_quadric_edge_collapse', targetfacenum=simplification, preservetopology=True)
139
+ ms.apply_filter('generate_surface_reconstruction_screened_poisson', threads = 6, depth = poissson_depth, preclean = True)
140
+ if simplification > 0:
141
+ ms.apply_filter('meshing_decimation_quadric_edge_collapse', targetfacenum=simplification, preservetopology=True)
142
+ return meshlab_mesh_to_py3dmesh(ms.current_mesh())
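Note: a hedged sketch of the per-view mesh initialisation above; fast_geo.py resizes its inputs to 192x192 before calling this, and the file name and scale here are illustrative:

    from PIL import Image
    front = Image.open("front_normal.png")   # RGBA normal map, alpha = foreground mask (hypothetical file)
    mesh_front, depth_front = build_mesh(front, front, clamp_min=0., scale=0.8,
                                         init_type="std", return_depth=True)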
models/ISOMER/scripts/normal_to_height_map.py ADDED
@@ -0,0 +1,205 @@
1
+ # code modified from https://github.com/YertleTurtleGit/depth-from-normals
2
+ import numpy as np
3
+ import cv2 as cv
4
+ from multiprocessing.pool import ThreadPool as Pool
5
+ from multiprocessing import cpu_count
6
+ from typing import Tuple, List, Union
7
+ import numba
8
+
9
+
10
+ def calculate_gradients(
11
+ normals: np.ndarray, mask: np.ndarray
12
+ ) -> Tuple[np.ndarray, np.ndarray]:
13
+ horizontal_angle_map = np.arccos(np.clip(normals[:, :, 0], -1, 1))
14
+ left_gradients = np.zeros(normals.shape[:2])
15
+ left_gradients[mask != 0] = (1 - np.sin(horizontal_angle_map[mask != 0])) * np.sign(
16
+ horizontal_angle_map[mask != 0] - np.pi / 2
17
+ )
18
+
19
+ vertical_angle_map = np.arccos(np.clip(normals[:, :, 1], -1, 1))
20
+ top_gradients = np.zeros(normals.shape[:2])
21
+ top_gradients[mask != 0] = -(1 - np.sin(vertical_angle_map[mask != 0])) * np.sign(
22
+ vertical_angle_map[mask != 0] - np.pi / 2
23
+ )
24
+
25
+ return left_gradients, top_gradients
26
+
27
+
28
+ @numba.jit(nopython=True)
29
+ def integrate_gradient_field(
30
+ gradient_field: np.ndarray, axis: int, mask: np.ndarray
31
+ ) -> np.ndarray:
32
+ heights = np.zeros(gradient_field.shape)
33
+
34
+ for d1 in numba.prange(heights.shape[1 - axis]): # numba.prange: executes the loop in parallel
35
+ sum_value = 0
36
+ for d2 in range(heights.shape[axis]):
37
+ coordinates = (d1, d2) if axis == 1 else (d2, d1)
38
+
39
+ if mask[coordinates] != 0:
40
+ sum_value = sum_value + gradient_field[coordinates] # equation 1 in paper along `axis` axis
41
+ heights[coordinates] = sum_value
42
+ else:
43
+ sum_value = 0
44
+
45
+ return heights
46
+
47
+ # equation 1 in paper wrt these directions
48
+ def calculate_heights(
49
+ left_gradients: np.ndarray, top_gradients, mask: np.ndarray
50
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
51
+ left_heights = integrate_gradient_field(left_gradients, 1, mask)
52
+ right_heights = np.fliplr(
53
+ integrate_gradient_field(np.fliplr(-left_gradients), 1, np.fliplr(mask))
54
+ )
55
+ top_heights = integrate_gradient_field(top_gradients, 0, mask)
56
+ bottom_heights = np.flipud(
57
+ integrate_gradient_field(np.flipud(-top_gradients), 0, np.flipud(mask))
58
+ )
59
+ return left_heights, right_heights, top_heights, bottom_heights
60
+
61
+
62
+ def combine_heights(*heights: np.ndarray) -> np.ndarray:
63
+ return np.mean(np.stack(heights, axis=0), axis=0)
64
+
65
+
66
+ def rotate(matrix: np.ndarray, angle: float) -> np.ndarray:
67
+ h, w = matrix.shape[:2]
68
+ center = (w / 2, h / 2)
69
+
70
+ rotation_matrix = cv.getRotationMatrix2D(center, angle, 1.0)
71
+ corners = cv.transform(
72
+ np.array([[[0, 0], [w, 0], [w, h], [0, h]]]), rotation_matrix
73
+ )[0]
74
+
75
+ _, _, w, h = cv.boundingRect(corners)
76
+
77
+ rotation_matrix[0, 2] += w / 2 - center[0]
78
+ rotation_matrix[1, 2] += h / 2 - center[1]
79
+ result = cv.warpAffine(matrix, rotation_matrix, (w, h), flags=cv.INTER_LINEAR)
80
+
81
+ return result
82
+
83
+
84
+ def rotate_vector_field_normals(normals: np.ndarray, angle: float) -> np.ndarray:
85
+ angle = np.radians(angle)
86
+ cos_angle = np.cos(angle)
87
+ sin_angle = np.sin(angle)
88
+
89
+ rotated_normals = np.empty_like(normals)
90
+ rotated_normals[:, :, 0] = (
91
+ normals[:, :, 0] * cos_angle - normals[:, :, 1] * sin_angle
92
+ )
93
+ rotated_normals[:, :, 1] = (
94
+ normals[:, :, 0] * sin_angle + normals[:, :, 1] * cos_angle
95
+ )
96
+
97
+ return rotated_normals
98
+
99
+
100
+ def centered_crop(image: np.ndarray, target_resolution: Tuple[int, int]) -> np.ndarray:
101
+ return image[
102
+ (image.shape[0] - target_resolution[0])
103
+ // 2 : (image.shape[0] - target_resolution[0])
104
+ // 2
105
+ + target_resolution[0],
106
+ (image.shape[1] - target_resolution[1])
107
+ // 2 : (image.shape[1] - target_resolution[1])
108
+ // 2
109
+ + target_resolution[1],
110
+ ]
111
+
112
+
113
+ def integrate_vector_field(
114
+ vector_field: np.ndarray,
115
+ mask: np.ndarray,
116
+ target_iteration_count: int,
117
+ thread_count: int,
118
+ ) -> np.ndarray:
119
+ shape = vector_field.shape[:2]
120
+ angles = np.linspace(0, 90, target_iteration_count, endpoint=False)
121
+
122
+ def integrate_vector_field_angles(angles: List[float]) -> np.ndarray:
123
+ all_combined_heights = np.zeros(shape)
124
+
125
+ for angle in angles:
126
+ rotated_vector_field = rotate_vector_field_normals(
127
+ rotate(vector_field, angle), angle
128
+ ) # rotate twice: first rotate the whole in image level, then rotate the individual normal vectors
129
+
130
+ rotated_mask = rotate(mask, angle)
131
+
132
+ left_gradients, top_gradients = calculate_gradients(
133
+ rotated_vector_field, rotated_mask
134
+ )
135
+ (
136
+ left_heights,
137
+ right_heights,
138
+ top_heights,
139
+ bottom_heights,
140
+ ) = calculate_heights(left_gradients, top_gradients, rotated_mask)
141
+
142
+ combined_heights = combine_heights(
143
+ left_heights, right_heights, top_heights, bottom_heights
144
+ ) # = mean of these heights
145
+ combined_heights = centered_crop(rotate(combined_heights, -angle), shape)
146
+ all_combined_heights += combined_heights / len(angles)
147
+
148
+ return all_combined_heights
149
+
150
+ with Pool(processes=thread_count) as pool:
151
+ heights = pool.map(
152
+ integrate_vector_field_angles,
153
+ np.array(
154
+ np.array_split(angles, thread_count),
155
+ dtype=object,
156
+ ),
157
+ )
158
+ pool.close()
159
+ pool.join()
160
+
161
+ isotropic_height = np.zeros(shape)
162
+ for height in heights:
163
+ isotropic_height += height / thread_count
164
+
165
+ return isotropic_height
166
+
167
+
168
+ def estimate_height_map(
169
+ normal_map: np.ndarray,
170
+ mask: Union[np.ndarray, None] = None,
171
+ height_divisor: float = 1,
172
+ target_iteration_count: int = 250,
173
+ thread_count: int = cpu_count(),
174
+ raw_values: bool = False,
175
+ ) -> np.ndarray:
176
+ if mask is None:
177
+ if normal_map.shape[-1] == 4:
178
+ mask = normal_map[:, :, 3] / 255
179
+ mask[mask < 0.5] = 0
180
+ mask[mask >= 0.5] = 1
181
+ else:
182
+ mask = np.ones(normal_map.shape[:2], dtype=np.uint8)
183
+
184
+ normals = ((normal_map[:, :, :3].astype(np.float64) / 255) - 0.5) * 2
185
+ heights = integrate_vector_field(
186
+ normals, mask, target_iteration_count, thread_count
187
+ ) # equation 1 in paper, repeat `target_iteration_count` (8?) times with rotation in angle np.linspace(0, 90, target_iteration_count), then find mean
188
+ # target_iteration_count=8 ? defined _MAX_THREAD = 8 in mesh_init.py
189
+
190
+ if raw_values:
191
+ return heights
192
+
193
+ heights /= height_divisor
194
+ heights[mask > 0] += 1 / 2
195
+ heights[mask == 0] = 1 / 2
196
+
197
+ heights *= 2**16 - 1
198
+
199
+ if np.min(heights) < 0 or np.max(heights) > 2**16 - 1:
200
+ raise OverflowError("Height values are clipping.")
201
+
202
+ heights = np.clip(heights, 0, 2**16 - 1)
203
+ heights = heights.astype(np.uint16)
204
+
205
+ return heights
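A minimal usage sketch of `estimate_height_map` (the file name, iteration count and thread count are assumptions; any RGBA normal map stored as a uint8 array works the same way):

    import numpy as np
    from PIL import Image
    normal_rgba = np.array(Image.open("front_normal.png"))  # hypothetical RGBA normal map, HxWx4 uint8
    height_u16 = estimate_height_map(normal_rgba, target_iteration_count=8, thread_count=4)
    Image.fromarray(height_u16).save("front_height.png")    # 16-bit height map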
models/ISOMER/scripts/proj_commands.py ADDED
@@ -0,0 +1,69 @@
1
+ import numpy as np
2
+ import torch
3
+ from PIL import Image
4
+ from pytorch3d.renderer import (
5
+ TexturesVertex,
6
+ )
7
+ from .project_mesh import (
8
+ get_cameras_list_azi_ele,
9
+ multiview_color_projection
10
+
11
+ )
12
+ from .utils import save_py3dmesh_with_trimesh_fast
13
+
14
+ def projection(meshes,
15
+ img_list,
16
+ weights,
17
+ azimuths,
18
+ elevations,
19
+ projection_type='orthographic',
20
+ auto_center=True,
21
+ resolution=1024,
22
+ fovy=None,
23
+ radius=None,
24
+ ortho_dist=1.1,
25
+ scale_factor=1.0,
26
+ save_glb_addr=None,
27
+ scale_verts=True,
28
+ complete_unseen=True,
29
+ below_confidence_strategy="smooth"
30
+ ):
31
+
32
+ assert len(img_list) == len(azimuths) == len(elevations) == len(weights), f"len(img_list) ({len(img_list)}) != len(azimuths) ({len(azimuths)}) != len(elevations) ({len(elevations)}) != len(weights) ({len(weights)})"
33
+
34
+ projection_types = ['perspective', 'orthographic']
35
+ assert projection_type in projection_types, f"projection_type ({projection_type}) should be one of {projection_types}"
36
+
37
+ if auto_center:
38
+ verts = meshes.verts_packed()
39
+ max_bb = (verts - 0).max(0)[0]
40
+ min_bb = (verts - 0).min(0)[0]
41
+ scale = (max_bb - min_bb).max() / 2
42
+ center = (max_bb + min_bb) / 2
43
+ meshes.offset_verts_(-center)
44
+ if scale_verts:
45
+ meshes.scale_verts_((scale_factor / float(scale)))
46
+ elif scale_verts:
47
+ meshes.scale_verts_((scale_factor))
48
+
49
+ if projection_type == 'perspective':
50
+ assert fovy is not None and radius is not None, f"fovy ({fovy}) and radius ({radius}) should not be None when projection_type is 'perspective'"
51
+ cameras = get_cameras_list_azi_ele(azimuths, elevations, fov_in_degrees=fovy,device="cuda", dist=radius, cam_type='fov')
52
+ elif projection_type == 'orthographic':
53
+ cameras = get_cameras_list_azi_ele(azimuths, elevations, fov_in_degrees=fovy, device="cuda", focal=2/1.35, dist=ortho_dist, cam_type='orthographic')
54
+
55
+
56
+ num_meshes = len(meshes)
57
+ num_verts_per_mesh = meshes.verts_packed().shape[0] // num_meshes
58
+ black_texture = torch.zeros((num_meshes, num_verts_per_mesh, 3), device="cuda")
59
+ textures = TexturesVertex(verts_features=black_texture)
60
+ meshes.textures = textures
61
+
62
+
63
+ proj_mesh = multiview_color_projection(meshes, img_list, cameras, weights=weights, eps=0.05, resolution=resolution, device="cuda", reweight_with_cosangle="square", use_alpha=True, confidence_threshold=0.1, complete_unseen=complete_unseen, below_confidence_strategy=below_confidence_strategy)
64
+
65
+
66
+ if save_glb_addr is not None:
67
+ save_py3dmesh_with_trimesh_fast(proj_mesh, save_glb_addr)
68
+
69
+
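A minimal usage sketch of `projection` (the mesh, image files, weights and view angles are assumptions; in the full pipeline they come from the multi-view generation stage):

    from PIL import Image
    # `recon_mesh`: hypothetical pytorch3d Meshes from the reconstruction stage
    views = [Image.open(f"rgb_{i}.png") for i in range(4)]   # hypothetical RGBA view images
    projection(recon_mesh,
               img_list=views,
               weights=[2.0, 0.5, 1.0, 0.5],
               azimuths=[0, 90, 180, 270],
               elevations=[0, 0, 0, 0],
               projection_type='orthographic',
               save_glb_addr="proj_mesh.glb")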
models/ISOMER/scripts/project_mesh.py ADDED
@@ -0,0 +1,401 @@
1
+ from typing import List
2
+ import torch
3
+ import numpy as np
4
+ from PIL import Image
5
+ from pytorch3d.renderer.cameras import look_at_view_transform, OrthographicCameras, CamerasBase
6
+ from pytorch3d.io import load_objs_as_meshes
7
+ from pytorch3d.renderer.mesh.rasterizer import Fragments
8
+ from pytorch3d.structures import Meshes
9
+ from pytorch3d.renderer import (
10
+ RasterizationSettings,
11
+ TexturesVertex,
12
+ FoVPerspectiveCameras,
13
+ FoVOrthographicCameras,
14
+ )
15
+ from pytorch3d.renderer import MeshRasterizer
16
+
17
+ def get_camera(world_to_cam, fov_in_degrees=60, focal_length=1 / (2**0.5), cam_type='fov'):
18
+ # pytorch3d expects transforms as row-vectors, so flip rotation: https://github.com/facebookresearch/pytorch3d/issues/1183
19
+ R = world_to_cam[:3, :3].t()[None, ...]
20
+ T = world_to_cam[:3, 3][None, ...]
21
+ if cam_type == 'fov':
22
+ assert fov_in_degrees is not None, "fov_in_degrees should not be None when cam_type is fov"
23
+ camera = FoVPerspectiveCameras(device=world_to_cam.device, R=R, T=T, fov=fov_in_degrees, degrees=True)
24
+ else:
25
+ focal_length = 1 / focal_length
26
+ camera = FoVOrthographicCameras(device=world_to_cam.device, R=R, T=T, min_x=-focal_length, max_x=focal_length, min_y=-focal_length, max_y=focal_length)
27
+ return camera
28
+
29
+ def render_pix2faces_py3d(meshes, cameras, H=512, W=512, blur_radius=0.0, faces_per_pixel=1):
30
+ """
31
+ Renders pix2face of visible faces.
32
+
33
+ :param mesh: Pytorch3d.structures.Meshes
34
+ :param cameras: pytorch3d.renderer.Cameras
35
+ :param H: target image height
36
+ :param W: target image width
37
+ :param blur_radius: Float distance in the range [0, 2] used to expand the face
38
+ bounding boxes for rasterization. Setting blur radius
39
+ results in blurred edges around the shape instead of a
40
+ hard boundary. Set to 0 for no blur.
41
+ :param faces_per_pixel: (int) Number of faces to keep track of per pixel.
42
+ We return the nearest faces_per_pixel faces along the z-axis.
43
+ """
44
+ # Define the settings for rasterization and shading
45
+ raster_settings = RasterizationSettings(
46
+ image_size=(H, W),
47
+ blur_radius=blur_radius,
48
+ faces_per_pixel=faces_per_pixel
49
+ )
50
+ rasterizer=MeshRasterizer(
51
+ cameras=cameras,
52
+ raster_settings=raster_settings
53
+ )
54
+ fragments: Fragments = rasterizer(meshes, cameras=cameras)
55
+ return {
56
+ "pix_to_face": fragments.pix_to_face[..., 0],
57
+ }
58
+
59
+ import nvdiffrast.torch as dr
60
+
61
+ def _warmup(glctx, device=None):
62
+ device = 'cuda' if device is None else device
63
+ #windows workaround for https://github.com/NVlabs/nvdiffrast/issues/59
64
+ def tensor(*args, **kwargs):
65
+ return torch.tensor(*args, device=device, **kwargs)
66
+ pos = tensor([[[-0.8, -0.8, 0, 1], [0.8, -0.8, 0, 1], [-0.8, 0.8, 0, 1]]], dtype=torch.float32)
67
+ tri = tensor([[0, 1, 2]], dtype=torch.int32)
68
+ dr.rasterize(glctx, pos, tri, resolution=[256, 256])
69
+
70
+ class Pix2FacesRenderer:
71
+ def __init__(self, device="cuda"):
72
+ # self._glctx = dr.RasterizeGLContext(output_db=False, device=device)
73
+ self._glctx = dr.RasterizeCudaContext(device=device)
74
+ self.device = device
75
+ _warmup(self._glctx, device)
76
+
77
+ def transform_vertices(self, meshes: Meshes, cameras: CamerasBase):
78
+ vertices = cameras.transform_points_ndc(meshes.verts_padded())
79
+
80
+ perspective_correct = cameras.is_perspective()
81
+ znear = cameras.get_znear()
82
+ if isinstance(znear, torch.Tensor):
83
+ znear = znear.min().item()
84
+ z_clip = None #if not perspective_correct or znear is None else znear / 2
85
+
86
+ if z_clip:
87
+ vertices = vertices[vertices[..., 2] >= cameras.get_znear()][None] # clip
88
+ vertices = vertices * torch.tensor([-1, -1, 1]).to(vertices)
89
+ vertices = torch.cat([vertices, torch.ones_like(vertices[..., :1])], dim=-1).to(torch.float32)
90
+ return vertices
91
+
92
+ def render_pix2faces_nvdiff(self, meshes: Meshes, cameras: CamerasBase, H=512, W=512):
93
+ meshes = meshes.to(self.device)
94
+ cameras = cameras.to(self.device)
95
+ vertices = self.transform_vertices(meshes, cameras)
96
+ faces = meshes.faces_packed().to(torch.int32)
97
+ rast_out,_ = dr.rasterize(self._glctx, vertices, faces, resolution=(H, W), grad_db=False) #C,H,W,4
98
+ pix_to_face = rast_out[..., -1].to(torch.int32) - 1
99
+ return pix_to_face
100
+
101
+ pix2faces_renderer = Pix2FacesRenderer()
102
+
103
+ def get_visible_faces(meshes: Meshes, cameras: CamerasBase, resolution=1024):
104
+ # pix_to_face = render_pix2faces_py3d(meshes, cameras, H=resolution, W=resolution)['pix_to_face']
105
+ pix_to_face = pix2faces_renderer.render_pix2faces_nvdiff(meshes, cameras, H=resolution, W=resolution)
106
+
107
+ unique_faces = torch.unique(pix_to_face.flatten())
108
+ unique_faces = unique_faces[unique_faces != -1]
109
+ return unique_faces
110
+
111
+ def project_color(meshes: Meshes, cameras: CamerasBase, pil_image: Image.Image, use_alpha=True, eps=0.05, resolution=1024, device="cuda") -> dict:
112
+ """
113
+ Projects color from a given image onto a 3D mesh.
114
+
115
+ Args:
116
+ meshes (pytorch3d.structures.Meshes): The 3D mesh object.
117
+ cameras (pytorch3d.renderer.cameras.CamerasBase): The camera object.
118
+ pil_image (PIL.Image.Image): The input image.
119
+ use_alpha (bool, optional): Whether to use the alpha channel of the image. Defaults to True.
120
+ eps (float, optional): The threshold for selecting visible faces. Defaults to 0.05.
121
+ resolution (int, optional): The resolution of the projection. Defaults to 1024.
122
+ device (str, optional): The device to use for computation. Defaults to "cuda".
123
+ debug (bool, optional): Whether to save debug images. Defaults to False.
124
+
125
+ Returns:
126
+ dict: A dictionary containing the following keys:
127
+ - "new_texture" (TexturesVertex): The updated texture with interpolated colors.
128
+ - "valid_verts" (Tensor of [M,3]): The indices of the vertices being projected.
129
+ - "valid_colors" (Tensor of [M,3]): The interpolated colors for the valid vertices.
130
+ """
131
+ meshes = meshes.to(device)
132
+ cameras = cameras.to(device)
133
+ image = torch.from_numpy(np.array(pil_image.convert("RGBA")) / 255.).permute((2, 0, 1)).float().to(device) # in CHW format of [0, 1.]
134
+ unique_faces = get_visible_faces(meshes, cameras, resolution=resolution)
135
+
136
+ # visible faces
137
+ faces_normals = meshes.faces_normals_packed()[unique_faces]
138
+ faces_normals = faces_normals / faces_normals.norm(dim=1, keepdim=True)
139
+ world_points = cameras.unproject_points(torch.tensor([[[0., 0., 0.1], [0., 0., 0.2]]]).to(device))[0]
140
+ view_direction = world_points[1] - world_points[0]
141
+ view_direction = view_direction / view_direction.norm(dim=0, keepdim=True)
142
+
143
+
144
+ # find invalid faces
145
+ cos_angles = (faces_normals * view_direction).sum(dim=1)
146
+ # assert cos_angles.mean() < 0, f"The view direction is not correct. cos_angles.mean()={cos_angles.mean()}"
147
+ selected_faces = unique_faces[cos_angles < -eps]
148
+
149
+ # find verts
150
+ faces = meshes.faces_packed()[selected_faces] # [N, 3]
151
+ verts = torch.unique(faces.flatten()) # [N, 1]
152
+ verts_coordinates = meshes.verts_packed()[verts] # [N, 3]
153
+
154
+ # compute color
155
+ pt_tensor = cameras.transform_points(verts_coordinates)[..., :2] # NDC space points
156
+ valid = ~((pt_tensor.isnan()|(pt_tensor<-1)|(1<pt_tensor)).any(dim=1)) # checked, correct
157
+ valid_pt = pt_tensor[valid, :]
158
+ valid_idx = verts[valid]
159
+ valid_color = torch.nn.functional.grid_sample(image[None].flip((-1, -2)), valid_pt[None, :, None, :], align_corners=False, padding_mode="reflection", mode="bilinear")[0, :, :, 0].T.clamp(0, 1) # [N, 4], note that bicubic may give invalid value
160
+ alpha, valid_color = valid_color[:, 3:], valid_color[:, :3]
161
+ if not use_alpha:
162
+ alpha = torch.ones_like(alpha)
163
+
164
+ # modify color
165
+ old_colors = meshes.textures.verts_features_packed()
166
+ old_colors[valid_idx] = valid_color * alpha + old_colors[valid_idx] * (1 - alpha)
167
+ new_texture = TexturesVertex(verts_features=[old_colors])
168
+
169
+ valid_verts_normals = meshes.verts_normals_packed()[valid_idx]
170
+ valid_verts_normals = valid_verts_normals / valid_verts_normals.norm(dim=1, keepdim=True).clamp_min(0.001)
171
+ cos_angles = (valid_verts_normals * view_direction).sum(dim=1)
172
+ return {
173
+ "new_texture": new_texture,
174
+ "valid_verts": valid_idx,
175
+ "valid_colors": valid_color,
176
+ "valid_alpha": alpha,
177
+ "cos_angles": cos_angles,
178
+ }
179
+
180
+ def complete_unseen_vertex_color(meshes: Meshes, valid_index: torch.Tensor) -> dict:
181
+ """
182
+ meshes: the mesh with vertex color to be completed.
183
+ valid_index: the index of the valid vertices, where valid means colors are fixed. [V, 1]
184
+ """
185
+ valid_index = valid_index.to(meshes.device)
186
+ colors = meshes.textures.verts_features_packed() # [V, 3]
187
+ V = colors.shape[0]
188
+
189
+ invalid_index = torch.ones_like(colors[:, 0]).bool() # [V]
190
+ invalid_index[valid_index] = False
191
+ invalid_index = torch.arange(V).to(meshes.device)[invalid_index]
192
+
193
+ L = meshes.laplacian_packed() # connectivity
194
+ E = torch.sparse_coo_tensor(torch.tensor([list(range(V))] * 2), torch.ones((V,)), size=(V, V)).to(meshes.device)
195
+ L = L + E
196
+ # E = torch.eye(V, layout=torch.sparse_coo, device=meshes.device)
197
+ # L = L + E
198
+ colored_count = torch.ones_like(colors[:, 0]) # [V]
199
+ colored_count[invalid_index] = 0
200
+ L_invalid = torch.index_select(L, 0, invalid_index) # sparse [IV, V]
201
+
202
+ total_colored = colored_count.sum()
203
+ coloring_round = 0
204
+ stage = "uncolored"
205
+ from tqdm import tqdm
206
+ pbar = tqdm(miniters=100)
207
+ while stage == "uncolored" or coloring_round > 0:
208
+ new_color = torch.matmul(L_invalid, colors * colored_count[:, None]) # [IV, 3]
209
+ new_count = torch.matmul(L_invalid, colored_count)[:, None] # [IV, 1]
210
+ colors[invalid_index] = torch.where(new_count > 0, new_color / new_count, colors[invalid_index])
211
+ colored_count[invalid_index] = (new_count[:, 0] > 0).float()
212
+
213
+ new_total_colored = colored_count.sum()
214
+ if new_total_colored > total_colored:
215
+ total_colored = new_total_colored
216
+ coloring_round += 1
217
+ else:
218
+ stage = "colored"
219
+ coloring_round -= 1
220
+ pbar.update(1)
221
+ if coloring_round > 10000:
222
+ print("coloring_round > 10000, break")
223
+ break
224
+ assert not torch.isnan(colors).any()
225
+ meshes.textures = TexturesVertex(verts_features=[colors])
226
+ return meshes
227
+
228
+ def load_glb_mesh(glb_path, device="cuda"):
229
+ meshes = load_objs_as_meshes([glb_path], device=device)
230
+ return meshes
231
+
232
+ def get_separated_images_from_img_grid(img_grid_path, image_num):
233
+ img_list = []
234
+ grid = Image.open(img_grid_path)
235
+ w, h = grid.size
236
+ for i in range(0, image_num):
237
+ img_list.append(grid.crop((i*h, 0, i*h + h, h)))
238
+ return img_list
239
+
240
+ def get_fov_camera_(azimuth, elevation, fovy, radius, mesh, auto_center, scale_factor, device='cuda'):
241
+ if auto_center:
242
+ verts = mesh.verts_packed()
243
+ max_bb = (verts - 0).max(0)[0]
244
+ min_bb = (verts - 0).min(0)[0]
245
+ scale = (max_bb - min_bb).max() / 2
246
+ center = (max_bb + min_bb) / 2
247
+ mesh.offset_verts_(-center)
248
+ mesh.scale_verts_((scale_factor / float(scale)))
249
+ else:
250
+ mesh.scale_verts_((scale_factor))
251
+ R, T = look_at_view_transform(radius, azimuth, elevation, device=device)
252
+ cameras = FoVPerspectiveCameras(device=device, R=R, T=T, fov=fovy)
253
+ return cameras
254
+
255
+ def multiview_color_projection(meshes: Meshes, image_list: List[Image.Image], cameras_list: List[CamerasBase], weights=None, eps=0.05, resolution=1024, device="cuda", reweight_with_cosangle="square", use_alpha=True, confidence_threshold=0.1, complete_unseen=False, below_confidence_strategy="smooth") -> Meshes:
256
+ """
257
+ Projects colors from a list of view images onto a 3D mesh.
258
+
259
+ Args:
260
+ meshes (pytorch3d.structures.Meshes): The 3D mesh object, only one mesh.
261
+ image_list (PIL.Image.Image): List of images.
262
+ cameras_list (list): List of cameras.
263
+ weights (list, optional): List of weights for each image, for ['front', 'front_right', 'right', 'back', 'left', 'front_left']. Defaults to None.
264
+ eps (float, optional): The threshold for selecting visible faces. Defaults to 0.05.
265
+ resolution (int, optional): The resolution of the projection. Defaults to 1024.
266
+ device (str, optional): The device to use for computation. Defaults to "cuda".
267
+ reweight_with_cosangle (str, optional): Whether to reweight the color with the angle between the view direction and the vertex normal. Defaults to None.
268
+ use_alpha (bool, optional): Whether to use the alpha channel of the image. Defaults to True.
269
+ confidence_threshold (float, optional): The threshold for the confidence of the projected color, if final projection weight is less than this, we will use the original color. Defaults to 0.1.
270
+ complete_unseen (bool, optional): Whether to complete the unseen vertex color using laplacian. Defaults to False.
271
+
272
+ Returns:
273
+ Meshes: the colored mesh
274
+ """
275
+
276
+ if image_list is None:
277
+ raise ValueError("image_list is None")
278
+
279
+
280
+ meshes = meshes.clone().to(device)
281
+ if weights is None:
282
+ weights = [1. for _ in range(len(cameras_list))]
283
+
284
+ assert len(cameras_list) == len(image_list) == len(weights), f'the following three lengths should be equal: len(cameras_list)({len(cameras_list)}), len(image_list)({len(image_list)}), len(weights)({len(weights)})'
285
+
286
+ original_color = meshes.textures.verts_features_packed()
287
+ assert not torch.isnan(original_color).any()
288
+ texture_counts = torch.zeros_like(original_color[..., :1])
289
+ texture_values = torch.zeros_like(original_color)
290
+ max_texture_counts = torch.zeros_like(original_color[..., :1])
291
+ max_texture_values = torch.zeros_like(original_color)
292
+ for camera, image, weight in zip(cameras_list, image_list, weights):
293
+ ret = project_color(meshes, camera, image, eps=eps, resolution=resolution, device=device, use_alpha=use_alpha)
294
+ if reweight_with_cosangle == "linear":
295
+ weight = (ret['cos_angles'].abs() * weight)[:, None]
296
+ elif reweight_with_cosangle == "square":
297
+ weight = (ret['cos_angles'].abs() ** 2 * weight)[:, None]
298
+ if use_alpha:
299
+ weight = weight * ret['valid_alpha']
300
+
301
+ try:
302
+ assert weight.min() > -0.0001, f'weight.min() is {weight.min()}, but should be > -0.0001'
303
+ except Exception as e:
304
+ raise e
305
+
306
+ texture_counts[ret['valid_verts']] += weight
307
+ texture_values[ret['valid_verts']] += ret['valid_colors'] * weight
308
+ max_texture_values[ret['valid_verts']] = torch.where(weight > max_texture_counts[ret['valid_verts']], ret['valid_colors'], max_texture_values[ret['valid_verts']])
309
+ max_texture_counts[ret['valid_verts']] = torch.max(max_texture_counts[ret['valid_verts']], weight)
310
+
311
+ texture_values = torch.where(texture_counts > confidence_threshold, texture_values / texture_counts, texture_values)
312
+ if below_confidence_strategy == "smooth":
313
+ texture_values = torch.where(texture_counts <= confidence_threshold, (original_color * (confidence_threshold - texture_counts) + texture_values) / confidence_threshold, texture_values)
314
+ elif below_confidence_strategy == "original":
315
+ texture_values = torch.where(texture_counts <= confidence_threshold, original_color, texture_values)
316
+ else:
317
+ raise ValueError(f"below_confidence_strategy={below_confidence_strategy} is not supported")
318
+ assert not torch.isnan(texture_values).any()
319
+ meshes.textures = TexturesVertex(verts_features=[texture_values])
320
+
321
+ if complete_unseen:
322
+ meshes = complete_unseen_vertex_color(meshes, torch.arange(texture_values.shape[0]).to(device)[texture_counts[:, 0] >= confidence_threshold])
323
+ ret_mesh = meshes.detach()
324
+ del meshes
325
+ return ret_mesh
326
+
327
+ def get_cameras_list(azim_list, device, elevation, fov_in_degrees=None, focal=2/1.35, dist=1.1, cam_type='orthographic'):
328
+ ret = []
329
+ for azim in azim_list:
330
+ R, T = look_at_view_transform(dist, elevation, azim)
331
+ w2c = torch.cat([R[0].T, T[0, :, None]], dim=1)
332
+ cameras = get_camera(w2c, fov_in_degrees=fov_in_degrees, focal_length=focal, cam_type=cam_type).to(device)
333
+ ret.append(cameras)
334
+ return ret
335
+
336
+ def get_cameras_list_azi_ele(azim_list, elev_list, device, fov_in_degrees=None, focal=2/1.35, dist=1.1, cam_type='orthographic'):
337
+ ret = []
338
+ for i in range(len(azim_list)):
339
+ R, T = look_at_view_transform(dist, elev_list[i], azim_list[i])
340
+ w2c = torch.cat([R[0].T, T[0, :, None]], dim=1)
341
+ cameras = get_camera(w2c, fov_in_degrees=fov_in_degrees, focal_length=focal, cam_type=cam_type).to(device)
342
+ ret.append(cameras)
343
+ return ret
344
+
345
+ def get_8view_cameras(device, focal=2/1.35):
346
+ return get_cameras_list(azim_list = [180, 225, 270, 315, 0, 45, 90, 135], elevation=0, device=device, focal=focal)
347
+
348
+ def get_6view_cameras(device, focal=2/1.35):
349
+ return get_cameras_list(azim_list = [180, 225, 270, 0, 90, 135], elevation=0, device=device, focal=focal)
350
+
351
+ def get_4view_cameras(device, focal=2/1.35):
352
+ return get_cameras_list(azim_list = [180, 270, 0, 90], elevation=0, device=device, focal=focal)
353
+
354
+ def get_2view_cameras(device, focal=2/1.35):
355
+ return get_cameras_list(azim_list = [180, 0], elevation=0, device=device, focal=focal)
356
+
357
+ def get_multiple_view_cameras(device, focal=2/1.35, offset=180, num_views=8, dist=1.1):
358
+ return get_cameras_list(azim_list = (np.linspace(0, 360, num_views+1)[:-1] + offset) % 360, elevation=0, device=device, focal=focal, dist=dist)
359
+
360
+ def align_with_alpha_bbox(source_img, target_img, final_size=1024):
361
+ # align source_img with target_img using alpha channel
362
+ # source_img and target_img are PIL.Image.Image
363
+ source_img = source_img.convert("RGBA")
364
+ target_img = target_img.convert("RGBA").resize((final_size, final_size))
365
+ source_np = np.array(source_img)
366
+ target_np = np.array(target_img)
367
+ source_alpha = source_np[:, :, 3]
368
+ target_alpha = target_np[:, :, 3]
369
+ bbox_source_min, bbox_source_max = np.argwhere(source_alpha > 0).min(axis=0), np.argwhere(source_alpha > 0).max(axis=0)
370
+ bbox_target_min, bbox_target_max = np.argwhere(target_alpha > 0).min(axis=0), np.argwhere(target_alpha > 0).max(axis=0)
371
+ source_content = source_np[bbox_source_min[0]:bbox_source_max[0]+1, bbox_source_min[1]:bbox_source_max[1]+1, :]
372
+ # resize source_content to fit in the position of target_content
373
+ source_content = Image.fromarray(source_content).resize((bbox_target_max[1]-bbox_target_min[1]+1, bbox_target_max[0]-bbox_target_min[0]+1), resample=Image.BICUBIC)
374
+ target_np[bbox_target_min[0]:bbox_target_max[0]+1, bbox_target_min[1]:bbox_target_max[1]+1, :] = np.array(source_content)
375
+ return Image.fromarray(target_np)
376
+
377
+ def load_image_list_from_mvdiffusion(mvdiffusion_path, front_from_pil_or_path=None):
378
+ import os
379
+ image_list = []
380
+ for dir in ['front', 'front_right', 'right', 'back', 'left', 'front_left']:
381
+ image_path = os.path.join(mvdiffusion_path, f"rgb_000_{dir}.png")
382
+ pil = Image.open(image_path)
383
+ if dir == 'front':
384
+ if front_from_pil_or_path is not None:
385
+ if isinstance(front_from_pil_or_path, str):
386
+ replace_pil = Image.open(front_from_pil_or_path)
387
+ else:
388
+ replace_pil = front_from_pil_or_path
389
+ # align replace_pil with pil using bounding box in alpha channel
390
+ pil = align_with_alpha_bbox(replace_pil, pil, final_size=1024)
391
+ image_list.append(pil)
392
+ return image_list
393
+
394
+ def load_image_list_from_img_grid(img_grid_path, resolution = 1024):
395
+ img_list = []
396
+ grid = Image.open(img_grid_path)
397
+ w, h = grid.size
398
+ for row in range(0, h, resolution):
399
+ for col in range(0, w, resolution):
400
+ img_list.append(grid.crop((col, row, col + resolution, row + resolution)))
401
+ return img_list
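A minimal usage sketch of building cameras and projecting colors with `multiview_color_projection` (the mesh, image files and view angles are assumptions; the mesh must already carry a TexturesVertex texture, since its verts_features are used as the fallback color):

    from PIL import Image
    # `mesh`: hypothetical pytorch3d Meshes with a TexturesVertex texture
    cams = get_cameras_list_azi_ele([0, 90, 180, 270], [0, 0, 0, 0], device="cuda",
                                    dist=1.1, cam_type='orthographic')
    imgs = [Image.open(f"view_{i}.png") for i in range(4)]  # hypothetical RGBA views
    colored = multiview_color_projection(mesh, imgs, cams, resolution=1024, complete_unseen=True)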
models/ISOMER/scripts/refine_lr_to_sr.py ADDED
@@ -0,0 +1,60 @@
1
+ import torch
2
+ import os
3
+
4
+ import numpy as np
5
+ from hashlib import md5
6
+ def hash_img(img):
7
+ return md5(np.array(img).tobytes()).hexdigest()
8
+ def hash_any(obj):
9
+ return md5(str(obj).encode()).hexdigest()
10
+
11
+ def refine_lr_with_sd(pil_image_list, concept_img_list, control_image_list, prompt_list, pipe=None, strength=0.35, neg_prompt_list="", output_size=(512, 512), controlnet_conditioning_scale=1.):
12
+ with torch.no_grad():
13
+ images = pipe(
14
+ image=pil_image_list,
15
+ ip_adapter_image=concept_img_list,
16
+ prompt=prompt_list,
17
+ neg_prompt=neg_prompt_list,
18
+ num_inference_steps=50,
19
+ strength=strength,
20
+ height=output_size[0],
21
+ width=output_size[1],
22
+ control_image=control_image_list,
23
+ guidance_scale=5.0,
24
+ controlnet_conditioning_scale=controlnet_conditioning_scale,
25
+ generator=torch.manual_seed(233),
26
+ ).images
27
+ return images
28
+
29
+ SR_cache = None
30
+
31
+ def run_sr_fast(source_pils, scale=4):
32
+ from PIL import Image
33
+ from scripts.upsampler import RealESRGANer
34
+ import numpy as np
35
+ global SR_cache
36
+ if SR_cache is not None:
37
+ upsampler = SR_cache
38
+ else:
39
+ upsampler = RealESRGANer(
40
+ scale=4,
41
+ onnx_path="ckpt/realesrgan-x4.onnx",
42
+ tile=0,
43
+ tile_pad=10,
44
+ pre_pad=0,
45
+ half=True,
46
+ gpu_id=0,
47
+ )
48
+ ret_pils = []
49
+ for idx, img_pils in enumerate(source_pils):
50
+ np_in = isinstance(img_pils, np.ndarray)
51
+ assert isinstance(img_pils, (Image.Image, np.ndarray))
52
+ img = np.array(img_pils)
53
+ output, _ = upsampler.enhance(img, outscale=scale)
54
+ if np_in:
55
+ ret_pils.append(output)
56
+ else:
57
+ ret_pils.append(Image.fromarray(output))
58
+ if SR_cache is None:
59
+ SR_cache = upsampler
60
+ return ret_pils
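A minimal usage sketch of `run_sr_fast` (the input file name is a placeholder; the ONNX checkpoint path is the one hard-coded above):

    from PIL import Image
    low_res = [Image.open("render_512.png")]   # hypothetical low-resolution render
    high_res = run_sr_fast(low_res, scale=4)   # returns PIL images upscaled 4x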
models/ISOMER/scripts/sd_model_zoo.py ADDED
@@ -0,0 +1,131 @@
1
+ from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, EulerAncestralDiscreteScheduler, StableDiffusionControlNetImg2ImgPipeline, StableDiffusionPipeline
2
+ from transformers import CLIPVisionModelWithProjection
3
+ import torch
4
+ from copy import deepcopy
5
+
6
+ ENABLE_CPU_CACHE = False
7
+ DEFAULT_BASE_MODEL = "runwayml/stable-diffusion-v1-5"
8
+
9
+ cached_models = {} # cache for models to avoid repeated loading, key is model name
10
+ def cache_model(func):
11
+ def wrapper(*args, **kwargs):
12
+ if ENABLE_CPU_CACHE:
13
+ model_name = func.__name__ + str(args) + str(kwargs)
14
+ if model_name not in cached_models:
15
+ cached_models[model_name] = func(*args, **kwargs)
16
+ return cached_models[model_name]
17
+ else:
18
+ return func(*args, **kwargs)
19
+ return wrapper
20
+
21
+ def copied_cache_model(func):
22
+ def wrapper(*args, **kwargs):
23
+ if ENABLE_CPU_CACHE:
24
+ model_name = func.__name__ + str(args) + str(kwargs)
25
+ if model_name not in cached_models:
26
+ cached_models[model_name] = func(*args, **kwargs)
27
+ return deepcopy(cached_models[model_name])
28
+ else:
29
+ return func(*args, **kwargs)
30
+ return wrapper
31
+
32
+ def model_from_ckpt_or_pretrained(ckpt_or_pretrained, model_cls, original_config_file='ckpt/v1-inference.yaml', torch_dtype=torch.float16, **kwargs):
33
+ if ckpt_or_pretrained.endswith(".safetensors"):
34
+ pipe = model_cls.from_single_file(ckpt_or_pretrained, original_config_file=original_config_file, torch_dtype=torch_dtype, **kwargs)
35
+ else:
36
+ pipe = model_cls.from_pretrained(ckpt_or_pretrained, torch_dtype=torch_dtype, **kwargs)
37
+ return pipe
38
+
39
+ @copied_cache_model
40
+ def load_base_model_components(base_model=DEFAULT_BASE_MODEL, torch_dtype=torch.float16):
41
+ model_kwargs = dict(
42
+ torch_dtype=torch_dtype,
43
+ requires_safety_checker=False,
44
+ safety_checker=None,
45
+ )
46
+ pipe: StableDiffusionPipeline = model_from_ckpt_or_pretrained(
47
+ base_model,
48
+ StableDiffusionPipeline,
49
+ **model_kwargs
50
+ )
51
+ pipe.to("cpu")
52
+ return pipe.components
53
+
54
+ @cache_model
55
+ def load_controlnet(controlnet_path, torch_dtype=torch.float16):
56
+ controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch_dtype)
57
+ return controlnet
58
+
59
+ @cache_model
60
+ def load_image_encoder():
61
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
62
+ "h94/IP-Adapter",
63
+ subfolder="models/image_encoder",
64
+ torch_dtype=torch.float16,
65
+ )
66
+ return image_encoder
67
+
68
+ def load_common_sd15_pipe(base_model=DEFAULT_BASE_MODEL, device="auto", controlnet=None, ip_adapter=False, plus_model=True, torch_dtype=torch.float16, model_cpu_offload_seq=None, enable_sequential_cpu_offload=False, vae_slicing=False, pipeline_class=None, **kwargs):
69
+ model_kwargs = dict(
70
+ torch_dtype=torch_dtype,
71
+ device_map=device,
72
+ requires_safety_checker=False,
73
+ safety_checker=None,
74
+ )
75
+ components = load_base_model_components(base_model=base_model, torch_dtype=torch_dtype)
76
+ model_kwargs.update(components)
77
+ model_kwargs.update(kwargs)
78
+
79
+ if controlnet is not None:
80
+ if isinstance(controlnet, list):
81
+ controlnet = [load_controlnet(controlnet_path, torch_dtype=torch_dtype) for controlnet_path in controlnet]
82
+ else:
83
+ controlnet = load_controlnet(controlnet, torch_dtype=torch_dtype)
84
+ model_kwargs.update(controlnet=controlnet)
85
+
86
+ if pipeline_class is None:
87
+ if controlnet is not None:
88
+ pipeline_class = StableDiffusionControlNetPipeline
89
+ else:
90
+ pipeline_class = StableDiffusionPipeline
91
+
92
+ pipe: StableDiffusionPipeline = model_from_ckpt_or_pretrained(
93
+ base_model,
94
+ pipeline_class,
95
+ **model_kwargs
96
+ )
97
+
98
+ if ip_adapter:
99
+ image_encoder = load_image_encoder()
100
+ pipe.image_encoder = image_encoder
101
+ if plus_model:
102
+ pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter-plus_sd15.safetensors")
103
+ else:
104
+ pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.safetensors")
105
+ pipe.set_ip_adapter_scale(1.0)
106
+ else:
107
+ pipe.unload_ip_adapter()
108
+
109
+ pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
110
+
111
+ if model_cpu_offload_seq is None:
112
+ if isinstance(pipe, StableDiffusionControlNetPipeline):
113
+ pipe.model_cpu_offload_seq = "text_encoder->controlnet->unet->vae"
114
+ elif isinstance(pipe, StableDiffusionControlNetImg2ImgPipeline):
115
+ pipe.model_cpu_offload_seq = "text_encoder->controlnet->vae->unet->vae"
116
+ else:
117
+ pipe.model_cpu_offload_seq = model_cpu_offload_seq
118
+
119
+ if enable_sequential_cpu_offload:
120
+ pipe.enable_sequential_cpu_offload()
121
+ else:
122
+ pipe = pipe.to("cuda")
123
+ pass
124
+ # pipe.enable_model_cpu_offload()
125
+ if vae_slicing:
126
+ pipe.enable_vae_slicing()
127
+
128
+ import gc
129
+ gc.collect()
130
+ return pipe
131
+
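A minimal usage sketch of `load_common_sd15_pipe` (the ControlNet repository id is an assumption; any diffusers-compatible ControlNet checkpoint can be passed the same way):

    from diffusers import StableDiffusionControlNetImg2ImgPipeline
    pipe = load_common_sd15_pipe(
        base_model="runwayml/stable-diffusion-v1-5",
        controlnet="lllyasviel/control_v11f1p_sd15_depth",   # assumed depth ControlNet checkpoint
        ip_adapter=True,
        pipeline_class=StableDiffusionControlNetImg2ImgPipeline,
    )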
models/ISOMER/scripts/upsampler.py ADDED
@@ -0,0 +1,260 @@
1
+ import cv2
2
+ import math
3
+ import numpy as np
4
+ import os
5
+ import torch
6
+ from torch.nn import functional as F
7
+ from scripts.load_onnx import load_onnx_caller
8
+ ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
9
+
10
+
11
+ class RealESRGANer():
12
+ """A helper class for upsampling images with RealESRGAN.
13
+
14
+ Args:
15
+ scale (int): Upsampling scale factor used in the networks. It is usually 2 or 4.
16
+ onnx_path (str): The path to the exported ONNX model used for inference.
17
+ device (torch.device, optional): Device used for inference. Default: None (auto-selected from CUDA availability).
18
+ tile (int): Since very large images can run out of GPU memory, this tile option will first crop
19
+ input images into tiles, and then process each of them. Finally, they will be merged into one image.
20
+ 0 means tiling is disabled. Default: 0.
21
+ tile_pad (int): The pad size for each tile, to remove border artifacts. Default: 10.
22
+ pre_pad (int): Pad the input images to avoid border artifacts. Default: 10.
23
+ half (bool): Whether to use half precision during inference. Default: False.
24
+ """
25
+
26
+ def __init__(self,
27
+ scale,
28
+ onnx_path,
29
+ tile=0,
30
+ tile_pad=10,
31
+ pre_pad=10,
32
+ half=False,
33
+ device=None,
34
+ gpu_id=None):
35
+ self.scale = scale
36
+ self.tile_size = tile
37
+ self.tile_pad = tile_pad
38
+ self.pre_pad = pre_pad
39
+ self.mod_scale = None
40
+ self.half = half
41
+
42
+ print('about to initialize model')
43
+ # initialize model
44
+ if gpu_id:
45
+ self.device = torch.device(
46
+ f'cuda:{gpu_id}' if torch.cuda.is_available() else 'cpu') if device is None else device
47
+ else:
48
+ self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if device is None else device
49
+ print('self.device set')
50
+ print(f'about to self.model = load_onnx_caller({onnx_path}, single_output=True)')
51
+ self.model = load_onnx_caller(onnx_path, single_output=True)
52
+ print('self.model loaded')
53
+
54
+ print('about to warm up')
55
+ # warm up
56
+ sample_input = torch.randn(1,3,512,512).cuda().float()
57
+ print(f'sample_input.shape = {sample_input.shape}')
58
+ self.model(sample_input)
59
+ print('finished warming up')
60
+
61
+ def pre_process(self, img):
62
+ """Pre-process, such as pre-pad and mod pad, so that the images can be divisible
63
+ """
64
+ img = torch.from_numpy(np.transpose(img, (2, 0, 1))).float()
65
+ self.img = img.unsqueeze(0).to(self.device)
66
+ if self.half:
67
+ self.img = self.img.half()
68
+
69
+ # pre_pad
70
+ if self.pre_pad != 0:
71
+ self.img = F.pad(self.img, (0, self.pre_pad, 0, self.pre_pad), 'reflect')
72
+ # mod pad for divisible borders
73
+ if self.scale == 2:
74
+ self.mod_scale = 2
75
+ elif self.scale == 1:
76
+ self.mod_scale = 4
77
+ if self.mod_scale is not None:
78
+ self.mod_pad_h, self.mod_pad_w = 0, 0
79
+ _, _, h, w = self.img.size()
80
+ if (h % self.mod_scale != 0):
81
+ self.mod_pad_h = (self.mod_scale - h % self.mod_scale)
82
+ if (w % self.mod_scale != 0):
83
+ self.mod_pad_w = (self.mod_scale - w % self.mod_scale)
84
+ self.img = F.pad(self.img, (0, self.mod_pad_w, 0, self.mod_pad_h), 'reflect')
85
+
86
+ def process(self):
87
+ # model inference
88
+ self.output = self.model(self.img)
89
+
90
+ def tile_process(self):
91
+ """It will first crop input images to tiles, and then process each tile.
92
+ Finally, all the processed tiles are merged into one image.
93
+
94
+ Modified from: https://github.com/ata4/esrgan-launcher
95
+ """
96
+ batch, channel, height, width = self.img.shape
97
+ output_height = height * self.scale
98
+ output_width = width * self.scale
99
+ output_shape = (batch, channel, output_height, output_width)
100
+
101
+ # start with black image
102
+ self.output = self.img.new_zeros(output_shape)
103
+ tiles_x = math.ceil(width / self.tile_size)
104
+ tiles_y = math.ceil(height / self.tile_size)
105
+
106
+ # loop over all tiles
107
+ for y in range(tiles_y):
108
+ for x in range(tiles_x):
109
+ # extract tile from input image
110
+ ofs_x = x * self.tile_size
111
+ ofs_y = y * self.tile_size
112
+ # input tile area on total image
113
+ input_start_x = ofs_x
114
+ input_end_x = min(ofs_x + self.tile_size, width)
115
+ input_start_y = ofs_y
116
+ input_end_y = min(ofs_y + self.tile_size, height)
117
+
118
+ # input tile area on total image with padding
119
+ input_start_x_pad = max(input_start_x - self.tile_pad, 0)
120
+ input_end_x_pad = min(input_end_x + self.tile_pad, width)
121
+ input_start_y_pad = max(input_start_y - self.tile_pad, 0)
122
+ input_end_y_pad = min(input_end_y + self.tile_pad, height)
123
+
124
+ # input tile dimensions
125
+ input_tile_width = input_end_x - input_start_x
126
+ input_tile_height = input_end_y - input_start_y
127
+ tile_idx = y * tiles_x + x + 1
128
+ input_tile = self.img[:, :, input_start_y_pad:input_end_y_pad, input_start_x_pad:input_end_x_pad]
129
+
130
+ # upscale tile
131
+ try:
132
+ with torch.no_grad():
133
+ output_tile = self.model(input_tile)
134
+ except RuntimeError as error:
135
+ print('Error', error)
136
+ print(f'\tTile {tile_idx}/{tiles_x * tiles_y}')
137
+
138
+ # output tile area on total image
139
+ output_start_x = input_start_x * self.scale
140
+ output_end_x = input_end_x * self.scale
141
+ output_start_y = input_start_y * self.scale
142
+ output_end_y = input_end_y * self.scale
143
+
144
+ # output tile area without padding
145
+ output_start_x_tile = (input_start_x - input_start_x_pad) * self.scale
146
+ output_end_x_tile = output_start_x_tile + input_tile_width * self.scale
147
+ output_start_y_tile = (input_start_y - input_start_y_pad) * self.scale
148
+ output_end_y_tile = output_start_y_tile + input_tile_height * self.scale
149
+
150
+ # put tile into output image
151
+ self.output[:, :, output_start_y:output_end_y,
152
+ output_start_x:output_end_x] = output_tile[:, :, output_start_y_tile:output_end_y_tile,
153
+ output_start_x_tile:output_end_x_tile]
154
+
155
+ def post_process(self):
156
+ # remove extra pad
157
+ if self.mod_scale is not None:
158
+ _, _, h, w = self.output.size()
159
+ self.output = self.output[:, :, 0:h - self.mod_pad_h * self.scale, 0:w - self.mod_pad_w * self.scale]
160
+ # remove prepad
161
+ if self.pre_pad != 0:
162
+ _, _, h, w = self.output.size()
163
+ self.output = self.output[:, :, 0:h - self.pre_pad * self.scale, 0:w - self.pre_pad * self.scale]
164
+ return self.output
165
+
166
+ @torch.no_grad()
167
+ def enhance(self, img, outscale=None, alpha_upsampler='realesrgan'):
168
+ print('inside enhance')
169
+ h_input, w_input = img.shape[0:2]
170
+ # img: numpy
171
+ img = img.astype(np.float32)
172
+ if np.max(img) > 256: # 16-bit image
173
+ max_range = 65535
174
+ print('\tInput is a 16-bit image')
175
+ else:
176
+ max_range = 255
177
+ img = img / max_range
178
+ if len(img.shape) == 2: # gray image
179
+ img_mode = 'L'
180
+ img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
181
+ elif img.shape[2] == 4: # RGBA image with alpha channel
182
+ img_mode = 'RGBA'
183
+ alpha = img[:, :, 3]
184
+ img = img[:, :, 0:3]
185
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
186
+ if alpha_upsampler == 'realesrgan':
187
+ alpha = cv2.cvtColor(alpha, cv2.COLOR_GRAY2RGB)
188
+ else:
189
+ img_mode = 'RGB'
190
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
191
+
192
+ # ------------------- process image (without the alpha channel) ------------------- #
193
+ print('about to process image (without the alpha channel)')
194
+ self.pre_process(img)
195
+ if self.tile_size > 0:
196
+ print(f'self.tile_size is {self.tile_size}, thus about to self.tile_process()')
197
+ self.tile_process()
198
+ print('finished self.tile_process()')
199
+ else:
200
+ print('about to self.process()')
201
+ self.process()
202
+ print('finished self.process()')
203
+
204
+ print('about to self.post_process()')
205
+ output_img = self.post_process()
206
+ print('finished self.post_process()')
207
+ output_img = output_img.data.squeeze().float().cpu().clamp_(0, 1).numpy()
208
+ output_img = np.transpose(output_img[[2, 1, 0], :, :], (1, 2, 0))
209
+ if img_mode == 'L':
210
+ output_img = cv2.cvtColor(output_img, cv2.COLOR_BGR2GRAY)
211
+ print('finished process image (without the alpha channel)')
212
+
213
+ # ------------------- process the alpha channel if necessary ------------------- #
214
+ if img_mode == 'RGBA':
215
+ print("img_mode == 'RGBA' thus about to process alpha channel")
216
+ if alpha_upsampler == 'realesrgan':
217
+ print(f"alpha_upsampler == 'realesrgan', about to self.pre_process({alpha})")
218
+ self.pre_process(alpha)
219
+ print('finished self.pre_process')
220
+ if self.tile_size > 0:
221
+ print(f'self.tile_size is {self.tile_size}, thus about to self.tile_process()')
222
+ self.tile_process()
223
+ print('finished self.tile_process()')
224
+ else:
225
+ print('about to self.process()')
226
+ self.process()
227
+ print('finished self.process()')
228
+ print('about to self.post_process()')
229
+ output_alpha = self.post_process()
230
+ print('finished self.post_process()')
231
+ output_alpha = output_alpha.data.squeeze().float().cpu().clamp_(0, 1).numpy()
232
+ output_alpha = np.transpose(output_alpha[[2, 1, 0], :, :], (1, 2, 0))
233
+ output_alpha = cv2.cvtColor(output_alpha, cv2.COLOR_BGR2GRAY)
234
+ else: # use the cv2 resize for alpha channel
235
+ print('about to use the cv2 resize for alpha channel')
236
+ h, w = alpha.shape[0:2]
237
+ output_alpha = cv2.resize(alpha, (w * self.scale, h * self.scale), interpolation=cv2.INTER_LINEAR)
238
+
239
+ print('about to merge the alpha channel')
240
+ # merge the alpha channel
241
+ output_img = cv2.cvtColor(output_img, cv2.COLOR_BGR2BGRA)
242
+ output_img[:, :, 3] = output_alpha
243
+ print('finished process alpha channel')
244
+
245
+ print('about to resize and return')
246
+ # ------------------------------ return ------------------------------ #
247
+ if max_range == 65535: # 16-bit image
248
+ output = (output_img * 65535.0).round().astype(np.uint16)
249
+ else:
250
+ output = (output_img * 255.0).round().astype(np.uint8)
251
+
252
+ if outscale is not None and outscale != float(self.scale):
253
+ output = cv2.resize(
254
+ output, (
255
+ int(w_input * outscale),
256
+ int(h_input * outscale),
257
+ ), interpolation=cv2.INTER_LANCZOS4)
258
+
259
+ return output, img_mode
260
+
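A minimal usage sketch of `RealESRGANer` (the ONNX path mirrors the one used by refine_lr_to_sr.py and is an assumption here; a CUDA device is required because the warm-up call runs on the GPU, and `enhance` expects OpenCV-style BGR input):

    import cv2
    upsampler = RealESRGANer(scale=4, onnx_path="ckpt/realesrgan-x4.onnx",
                             tile=0, tile_pad=10, pre_pad=0, half=True, gpu_id=0)
    bgr = cv2.imread("input.png")                 # OpenCV loads images as BGR
    out_bgr, img_mode = upsampler.enhance(bgr, outscale=4)
    cv2.imwrite("input_x4.png", out_bgr)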
models/ISOMER/scripts/utils.py ADDED
@@ -0,0 +1,611 @@
1
+ import torch
2
+ import numpy as np
3
+ from PIL import Image
4
+ import pymeshlab
5
+ import pymeshlab as ml
6
+ from pymeshlab import PercentageValue
7
+ from pytorch3d.renderer import TexturesVertex
8
+ from pytorch3d.structures import Meshes
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from typing import List, Tuple
12
+ from PIL import Image
13
+ import trimesh
14
+
15
+ EPSILON = 1e-8
16
+
17
+ def load_mesh_with_trimesh(file_name, file_type=None):
18
+ import trimesh
19
+ mesh: trimesh.Trimesh = trimesh.load(file_name, file_type=file_type)
20
+ if isinstance(mesh, trimesh.Scene):
21
+ assert len(mesh.geometry) > 0
22
+ # save to obj first and load again to avoid offset issue
23
+ from io import BytesIO
24
+ with BytesIO() as f:
25
+ mesh.export(f, file_type="obj")
26
+ f.seek(0)
27
+ mesh = trimesh.load(f, file_type="obj")
28
+ if isinstance(mesh, trimesh.Scene):
29
+ # we lose texture information here
30
+ mesh = trimesh.util.concatenate(
31
+ tuple(trimesh.Trimesh(vertices=g.vertices, faces=g.faces)
32
+ for g in mesh.geometry.values()))
33
+ assert isinstance(mesh, trimesh.Trimesh)
34
+
35
+ vertices = torch.from_numpy(mesh.vertices).T
36
+ faces = torch.from_numpy(mesh.faces).T
37
+ colors = None
38
+ if mesh.visual is not None:
39
+ if hasattr(mesh.visual, 'vertex_colors'):
40
+ colors = torch.from_numpy(mesh.visual.vertex_colors)[..., :3].T / 255.
41
+ if colors is None:
42
+ colors = torch.ones_like(vertices) * 0.5
43
+ return vertices, faces, colors
44
+
45
+ def meshlab_mesh_to_py3dmesh(mesh: pymeshlab.Mesh) -> Meshes:
46
+ verts = torch.from_numpy(mesh.vertex_matrix()).float()
47
+ faces = torch.from_numpy(mesh.face_matrix()).long()
48
+ colors = torch.from_numpy(mesh.vertex_color_matrix()[..., :3]).float()
49
+ textures = TexturesVertex(verts_features=[colors])
50
+ return Meshes(verts=[verts], faces=[faces], textures=textures)
51
+
52
+
53
+ def py3dmesh_to_meshlab_mesh(meshes: Meshes) -> pymeshlab.Mesh:
54
+ colors_in = F.pad(meshes.textures.verts_features_packed().cpu().float(), [0,1], value=1).numpy().astype(np.float64)
55
+ m1 = pymeshlab.Mesh(
56
+ vertex_matrix=meshes.verts_packed().cpu().float().numpy().astype(np.float64),
57
+ face_matrix=meshes.faces_packed().cpu().long().numpy().astype(np.int32),
58
+ v_normals_matrix=meshes.verts_normals_packed().cpu().float().numpy().astype(np.float64),
59
+ v_color_matrix=colors_in)
60
+ return m1
61
+
62
+
63
+ def to_pyml_mesh(vertices,faces):
64
+ m1 = pymeshlab.Mesh(
65
+ vertex_matrix=vertices.cpu().float().numpy().astype(np.float64),
66
+ face_matrix=faces.cpu().long().numpy().astype(np.int32),
67
+ )
68
+ return m1
69
+
70
+
71
+ def to_py3d_mesh(vertices, faces, normals=None):
72
+ from pytorch3d.structures import Meshes
73
+ from pytorch3d.renderer.mesh.textures import TexturesVertex
74
+ mesh = Meshes(verts=[vertices], faces=[faces], textures=None)
75
+ if normals is None:
76
+ normals = mesh.verts_normals_packed()
77
+ # set normals as vertex colors
78
+ mesh.textures = TexturesVertex(verts_features=[normals / 2 + 0.5])
79
+ return mesh
80
+
81
+
82
+ def from_py3d_mesh(mesh):
83
+ return mesh.verts_list()[0], mesh.faces_list()[0], mesh.textures.verts_features_packed()
84
+
85
+ def rotate_normalmap_by_angle(normal_map: np.ndarray, angle: float):
86
+ """
87
+ rotate along y-axis
88
+ normal_map: np.array, shape=(H, W, 3) in [-1, 1]
89
+ angle: float, in degree
90
+ """
91
+ angle = angle / 180 * np.pi
92
+ R = np.array([[np.cos(angle), 0, np.sin(angle)], [0, 1, 0], [-np.sin(angle), 0, np.cos(angle)]])
93
+ return np.dot(normal_map.reshape(-1, 3), R.T).reshape(normal_map.shape)
94
+
95
+ # from view coord to front view world coord
96
+ def rotate_normals(normal_pils, return_types='np', rotate_direction=1) -> np.ndarray: # [0, 255]
97
+ n_views = len(normal_pils)
98
+ ret = []
99
+ for idx, rgba_normal in enumerate(normal_pils):
100
+ # rotate normal
101
+ normal_np = np.array(rgba_normal)[:, :, :3] / 255 # in [-1, 1]
102
+ alpha_np = np.array(rgba_normal)[:, :, 3] / 255 # in [0, 1]
103
+ normal_np = normal_np * 2 - 1
104
+ normal_np = rotate_normalmap_by_angle(normal_np, rotate_direction * idx * (360 / n_views))
105
+ normal_np = (normal_np + 1) / 2
106
+ normal_np = normal_np * alpha_np[..., None] # make bg black
107
+ rgba_normal_np = np.concatenate([normal_np * 255, alpha_np[:, :, None] * 255] , axis=-1)
108
+ if return_types == 'np':
109
+ ret.append(rgba_normal_np)
110
+ elif return_types == 'pil':
111
+ ret.append(Image.fromarray(rgba_normal_np.astype(np.uint8)))
112
+ else:
113
+ raise ValueError(f"return_types should be 'np' or 'pil', but got {return_types}")
114
+ return ret
115
+
116
+
117
+ def rotate_normalmap_by_angle_torch(normal_map, angle):
118
+ """
119
+ rotate along y-axis
120
+ normal_map: torch.Tensor, shape=(H, W, 3) in [-1, 1], device='cuda'
121
+ angle: float, in degree
122
+ """
123
+ angle = torch.tensor(angle / 180 * np.pi).to(normal_map)
124
+ R = torch.tensor([[torch.cos(angle), 0, torch.sin(angle)],
125
+ [0, 1, 0],
126
+ [-torch.sin(angle), 0, torch.cos(angle)]]).to(normal_map)
127
+ return torch.matmul(normal_map.view(-1, 3), R.T).view(normal_map.shape)
128
+
129
+ def do_rotate(rgba_normal, angle):
130
+ rgba_normal = torch.from_numpy(rgba_normal).float().cuda() / 255
131
+ rotated_normal_tensor = rotate_normalmap_by_angle_torch(rgba_normal[..., :3] * 2 - 1, angle)
132
+ rotated_normal_tensor = (rotated_normal_tensor + 1) / 2
133
+ rotated_normal_tensor = rotated_normal_tensor * rgba_normal[:, :, [3]] # make bg black
134
+ rgba_normal_np = torch.cat([rotated_normal_tensor * 255, rgba_normal[:, :, [3]] * 255], dim=-1).cpu().numpy()
135
+ return rgba_normal_np
136
+
137
+ def rotate_normals_torch(normal_pils, return_types='np', rotate_direction=1):
138
+ n_views = len(normal_pils)
139
+ ret = []
140
+ for idx, rgba_normal in enumerate(normal_pils):
141
+ # rotate normal
142
+ angle = rotate_direction * idx * (360 / n_views)
143
+ rgba_normal_np = do_rotate(np.array(rgba_normal), angle)
144
+ if return_types == 'np':
145
+ ret.append(rgba_normal_np)
146
+ elif return_types == 'pil':
147
+ ret.append(Image.fromarray(rgba_normal_np.astype(np.uint8)))
148
+ else:
149
+ raise ValueError(f"return_types should be 'np' or 'pil', but got {return_types}")
150
+ return ret
151
+
152
+ def change_bkgd(img_pils, new_bkgd=(0., 0., 0.)):
153
+ ret = []
154
+ new_bkgd = np.array(new_bkgd).reshape(1, 1, 3)
155
+ for rgba_img in img_pils:
156
+ img_np = np.array(rgba_img)[:, :, :3] / 255
157
+ alpha_np = np.array(rgba_img)[:, :, 3] / 255
158
+ ori_bkgd = img_np[:1, :1]
159
+ # color = ori_color * alpha + bkgd * (1-alpha)
160
+ # ori_color = (color - bkgd * (1-alpha)) / alpha
161
+ alpha_np_clamp = np.clip(alpha_np, 1e-6, 1) # avoid divide by zero
162
+ ori_img_np = (img_np - ori_bkgd * (1 - alpha_np[..., None])) / alpha_np_clamp[..., None]
163
+ img_np = np.where(alpha_np[..., None] > 0.05, ori_img_np * alpha_np[..., None] + new_bkgd * (1 - alpha_np[..., None]), new_bkgd)
164
+ rgba_img_np = np.concatenate([img_np * 255, alpha_np[..., None] * 255], axis=-1)
165
+ ret.append(Image.fromarray(rgba_img_np.astype(np.uint8)))
166
+ return ret
167
+
168
+ def change_bkgd_to_normal(normal_pils) -> List[Image.Image]:
169
+ n_views = len(normal_pils)
170
+ ret = []
171
+ for idx, rgba_normal in enumerate(normal_pils):
172
+ # calculate background normal
173
+ target_bkgd = rotate_normalmap_by_angle(np.array([[[0., 0., 1.]]]), idx * (360 / n_views))
174
+ normal_np = np.array(rgba_normal)[:, :, :3] / 255 # in [-1, 1]
175
+ alpha_np = np.array(rgba_normal)[:, :, 3] / 255 # in [0, 1]
176
+ normal_np = normal_np * 2 - 1
177
+ old_bkgd = normal_np[:1,:1]
178
+ normal_np[alpha_np > 0.05] = (normal_np[alpha_np > 0.05] - old_bkgd * (1 - alpha_np[alpha_np > 0.05][..., None])) / alpha_np[alpha_np > 0.05][..., None]
179
+ normal_np = normal_np * alpha_np[..., None] + target_bkgd * (1 - alpha_np[..., None])
180
+ normal_np = (normal_np + 1) / 2
181
+ rgba_normal_np = np.concatenate([normal_np * 255, alpha_np[..., None] * 255] , axis=-1)
182
+ ret.append(Image.fromarray(rgba_normal_np.astype(np.uint8)))
183
+ return ret
184
+
185
+
186
+ def fix_vert_color_glb(mesh_path):
187
+ from pygltflib import GLTF2, Material, PbrMetallicRoughness
188
+ obj1 = GLTF2().load(mesh_path)
189
+ obj1.meshes[0].primitives[0].material = 0
190
+ obj1.materials.append(Material(
191
+ pbrMetallicRoughness = PbrMetallicRoughness(
192
+ baseColorFactor = [1.0, 1.0, 1.0, 1.0],
193
+ metallicFactor = 0.,
194
+ roughnessFactor = 1.0,
195
+ ),
196
+ emissiveFactor = [0.0, 0.0, 0.0],
197
+ doubleSided = True,
198
+ ))
199
+ obj1.save(mesh_path)
200
+
201
+
202
+ def srgb_to_linear(c_srgb):
203
+ c_linear = np.where(c_srgb <= 0.04045, c_srgb / 12.92, ((c_srgb + 0.055) / 1.055) ** 2.4)
204
+ return c_linear.clip(0, 1.)
205
+
206
+
207
+ def save_py3dmesh_with_trimesh_fast(meshes: Meshes, save_glb_path, apply_sRGB_to_LinearRGB=True):
208
+ # convert from pytorch3d meshes to trimesh mesh
209
+ vertices = meshes.verts_packed().cpu().float().numpy()
210
+ triangles = meshes.faces_packed().cpu().long().numpy()
211
+ np_color = meshes.textures.verts_features_packed().cpu().float().numpy()
212
+ if save_glb_path.endswith(".glb"):
213
+ # rotate 180 along +Y
214
+ vertices[:, [0, 2]] = -vertices[:, [0, 2]]
215
+
216
+ if apply_sRGB_to_LinearRGB:
217
+ np_color = srgb_to_linear(np_color)
218
+ assert vertices.shape[0] == np_color.shape[0]
219
+ assert np_color.shape[1] == 3
220
+ assert 0 <= np_color.min() and np_color.max() <= 1, f"min={np_color.min()}, max={np_color.max()}"
221
+ mesh = trimesh.Trimesh(vertices=vertices, faces=triangles, vertex_colors=np_color)
222
+ mesh.remove_unreferenced_vertices()
223
+ # save mesh
224
+ mesh.export(save_glb_path)
225
+ if save_glb_path.endswith(".glb"):
226
+ fix_vert_color_glb(save_glb_path)
227
+
228
+
229
+ def save_glb_and_video(save_mesh_prefix: str, meshes: Meshes, with_timestamp=True, dist=3.5, azim_offset=180, resolution=512, fov_in_degrees=1 / 1.15, cam_type="ortho", view_padding=60, export_video=True) -> Tuple[str, str]:
230
+ import time
231
+ if '.' in save_mesh_prefix:
232
+ save_mesh_prefix = ".".join(save_mesh_prefix.split('.')[:-1])
233
+ if with_timestamp:
234
+ save_mesh_prefix = save_mesh_prefix + f"_{int(time.time())}"
235
+ ret_mesh = save_mesh_prefix + ".glb"
236
+ # optimized version
237
+ save_py3dmesh_with_trimesh_fast(meshes, ret_mesh)
238
+ return ret_mesh, None
239
+
240
+
241
+ def simple_clean_mesh(pyml_mesh: ml.Mesh, apply_smooth=True, stepsmoothnum=1, apply_sub_divide=False, sub_divide_threshold=0.25):
242
+ ms = ml.MeshSet()
243
+ ms.add_mesh(pyml_mesh, "cube_mesh")
244
+
245
+ if apply_smooth:
246
+ ms.apply_filter("apply_coord_laplacian_smoothing", stepsmoothnum=stepsmoothnum, cotangentweight=False)
247
+ if apply_sub_divide: # 5s, slow
248
+ ms.apply_filter("meshing_repair_non_manifold_vertices")
249
+ ms.apply_filter("meshing_repair_non_manifold_edges", method='Remove Faces')
250
+ ms.apply_filter("meshing_surface_subdivision_loop", iterations=2, threshold=PercentageValue(sub_divide_threshold))
251
+ return meshlab_mesh_to_py3dmesh(ms.current_mesh())
252
+
253
+
254
+ def expand2square(pil_img, background_color):
255
+ width, height = pil_img.size
256
+ if width == height:
257
+ return pil_img
258
+ elif width > height:
259
+ result = Image.new(pil_img.mode, (width, width), background_color)
260
+ result.paste(pil_img, (0, (width - height) // 2))
261
+ return result
262
+ else:
263
+ result = Image.new(pil_img.mode, (height, height), background_color)
264
+ result.paste(pil_img, ((height - width) // 2, 0))
265
+ return result
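A quick usage sketch for `expand2square`; the input image here is a made-up in-memory example, not a file from this repo:

```python
from PIL import Image

img = Image.new("RGBA", (300, 200), (255, 0, 0, 255))       # hypothetical non-square input
squared = expand2square(img, background_color=(0, 0, 0, 0))  # pad to a square canvas
print(squared.size)                                          # (300, 300); original pasted vertically centred
```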
266
+
267
+
268
+
269
+ def init_target(img_pils, new_bkgd=(0., 0., 0.), device="cuda"):
270
+ new_bkgd = torch.tensor(new_bkgd, dtype=torch.float32).view(1, 1, 3).to(device)
271
+
272
+ imgs = torch.stack([torch.from_numpy(np.array(img, dtype=np.float32)) for img in img_pils]).to(device) / 255
273
+ img_nps = imgs[..., :3]
274
+ alpha_nps = imgs[..., 3]
275
+ ori_bkgds = img_nps[:, :1, :1]
276
+
277
+ alpha_nps_clamp = torch.clamp(alpha_nps, 1e-6, 1)
278
+ ori_img_nps = (img_nps - ori_bkgds * (1 - alpha_nps.unsqueeze(-1))) / alpha_nps_clamp.unsqueeze(-1)
279
+ ori_img_nps = torch.clamp(ori_img_nps, 0, 1)
280
+ img_nps = torch.where(alpha_nps.unsqueeze(-1) > 0.05, ori_img_nps * alpha_nps.unsqueeze(-1) + new_bkgd * (1 - alpha_nps.unsqueeze(-1)), new_bkgd)
281
+
282
+ rgba_img_np = torch.cat([img_nps, alpha_nps.unsqueeze(-1)], dim=-1)
283
+ return rgba_img_np
284
+
285
+
286
+
287
+ def rotation_matrix_axis_angle(axis, angle, device='cuda'):
288
+ """
289
+ Return the rotation matrix associated with counterclockwise rotation about
290
+ the given axis by angle degrees, using PyTorch.
291
+ """
292
+ if not isinstance(axis, torch.Tensor):
293
+ axis = torch.tensor(axis, device=device)
294
+ axis = axis.float().to(device)
295
+ if not isinstance(angle, torch.Tensor):
296
+ angle = torch.tensor(angle, device=device)
297
+ angle = angle.float().to(device)
298
+
299
+ theta = angle * torch.pi / 180.0
300
+ axis = axis.float()
301
+ if torch.dot(axis, axis) > 0:
302
+ denom = torch.sqrt(torch.dot(axis, axis))
303
+ denom_safe = torch.where(denom == 0, torch.tensor(EPSILON).to(denom.device), denom)
304
+ axis = axis / denom_safe  # normalize the axis (denom is already the vector norm)
305
+ a = torch.cos(theta / 2.0)
306
+ b, c, d = -axis[0] * torch.sin(theta / 2.0), -axis[1] * torch.sin(theta / 2.0), -axis[2] * torch.sin(theta / 2.0)
307
+
308
+ aa, bb, cc, dd = a*a, b*b, c*c, d*d
309
+ bc, ad, ac, ab, bd, cd = b*c, a*d, a*c, a*b, b*d, c*d
310
+ return torch.stack([
311
+ torch.stack([aa+bb-cc-dd, 2*(bc+ad), 2*(bd-ac)]),
312
+ torch.stack([2*(bc-ad), aa+cc-bb-dd, 2*(cd+ab)]),
313
+ torch.stack([2*(bd+ac), 2*(cd-ab), aa+dd-bb-cc])
314
+ ])
315
+ else:
316
+ return torch.eye(3)
317
+
318
+
319
+
320
+ def normal_rotation_img2img_angle_axis(image, angle, axis=None, device='cuda'):
321
+ """
322
+ Rotate an image by a given angle around a given axis using PyTorch.
323
+
324
+ Args:
325
+ image: Input Image to rotate.
326
+ angle: Rotation angle in degrees.
327
+ axis: Rotation axis as an array of 3 floats.
328
+
329
+ Returns:
330
+ Image: Rotated Image.
331
+ """
332
+ if axis is None:
333
+ axis = [0,1,0]
334
+ axis = torch.tensor(axis, device=device)
335
+
336
+
337
+ if type(image) == Image.Image:
338
+ image_array = torch.tensor(np.array(image, dtype='float32'))
339
+ else:
340
+ image_array = image
341
+ image_array = image_array.to(device)
342
+
343
+ if type(angle) != torch.Tensor:
344
+ angle = torch.tensor(angle)
345
+ angle = angle.to(device)
346
+
347
+ if image_array.shape[2] == 4:
348
+ rgb_array, alpha_array = image_array[:, :, :3], image_array[:, :, 3]
349
+ else:
350
+ rgb_array = image_array
351
+ alpha_array = None
352
+
353
+ rgb_array = rgb_array / 255.0 - 0.5
354
+
355
+ rgb_array = rgb_array.permute(2, 0, 1)
356
+
357
+ rotated_tensor = apply_rotation_angle_axis(rgb_array.unsqueeze(0), axis, torch.tensor([angle], device=rgb_array.device))
358
+
359
+
360
+ rotated_array = rotated_tensor.squeeze().permute(1, 2, 0)
361
+
362
+ rotated_array = (rotated_array/2 + 0.5) * 255
363
+
364
+ if alpha_array is not None:
365
+ rotated_array = torch.cat((rotated_array, alpha_array.unsqueeze(2)), dim=2)
366
+
367
+ rotated_array_uint8 = np.array(rotated_array.detach().cpu()).astype('uint8')
368
+
369
+ rotated_normal = Image.fromarray(rotated_array_uint8)
370
+
371
+ return rotated_normal
372
+
373
+ def normal_rotation_img2img_c2w(image, c2w, device='cuda'):
374
+
375
+ if type(image) != torch.Tensor:
376
+ image_array = torch.tensor(np.array(image, dtype='float32'))
377
+ else:
378
+ image_array = image
379
+
380
+
381
+ image_array = image_array.to(device)
382
+
383
+ if image_array.shape[2] == 4:
384
+ rgb_array, alpha_array = image_array[:, :, :3], image_array[:, :, 3]
385
+ else:
386
+ rgb_array = image_array
387
+ alpha_array = None
388
+
389
+ rgb_array = rgb_array / 255.0 - 0.5
390
+
391
+ rotation_matrix = c2w
392
+
393
+ rotated_tensor = transform_normals_R(rgb_array, rotation_matrix)
394
+
395
+ rotated_array = rotated_tensor.squeeze().permute(1, 2, 0)
396
+ rotated_array = (rotated_array/2 + 0.5) * 255
397
+
398
+ if alpha_array is not None:
399
+ rotated_array = torch.cat((rotated_array, alpha_array.unsqueeze(2)), dim=2)
400
+
401
+ rotated_array_uint8 = np.array(rotated_array.detach().cpu()).astype('uint8')
402
+
403
+ rotated_normal = Image.fromarray(rotated_array_uint8)
404
+
405
+ return rotated_normal
406
+
407
+ def normal_rotation_img2img_azi_ele(image, azi, ele, device='cuda'):
408
+ """
409
+ Rotate a normal map by given azimuth and elevation angles using PyTorch.
410
+
411
+ Args:
412
+ image: Input Image to rotate.
413
+
414
+ Returns:
415
+ Image: Rotated Image.
416
+ """
417
+
418
+ if type(image) == Image.Image:
419
+ image_array = torch.tensor(np.array(image, dtype='float32'))
420
+ else:
421
+ image_array = image
422
+ image_array = image_array.to(device)
423
+
424
+ if type(azi) != torch.Tensor:
425
+ azi = torch.tensor(azi)
426
+ azi = azi.to(device)
427
+
428
+ if type(ele) != torch.Tensor:
429
+ ele = torch.tensor(ele)
430
+ ele = ele.to(device)
431
+
432
+ if image_array.shape[2] == 4:
433
+ rgb_array, alpha_array = image_array[:, :, :3], image_array[:, :, 3]
434
+ else:
435
+ rgb_array = image_array
436
+ alpha_array = None
437
+
438
+ rgb_array = rgb_array / 255.0 - 0.5
439
+
440
+ rotation_matrix = get_rotation_matrix_azi_ele(azi, ele)
441
+ rotated_tensor = transform_normals_R(rgb_array, rotation_matrix)
442
+
443
+ rotated_array = rotated_tensor.squeeze().permute(1, 2, 0)
444
+
445
+ rotated_array = (rotated_array/2 + 0.5) * 255
446
+
447
+ if alpha_array is not None:
448
+ rotated_array = torch.cat((rotated_array, alpha_array.unsqueeze(2)), dim=2)
449
+
450
+ rotated_array_uint8 = np.array(rotated_array.detach().cpu()).astype('uint8')
451
+
452
+ rotated_normal = Image.fromarray(rotated_array_uint8)
453
+
454
+ return rotated_normal
455
+
456
+
457
+ def rotate_normal_R(image, rotation_matrix, save_addr="", device="cuda"):
458
+ """
459
+ Rotate a normal map by a given rotation matrix using PyTorch.
460
+
461
+ Args:
462
+ image: Input Image to rotate.
463
+
464
+ Returns:
465
+ Image: Rotated Image.
466
+ """
467
+
468
+ if not isinstance(image, torch.Tensor):
469
+ image_array = torch.tensor(np.array(image, dtype='float32'))
470
+ else:
471
+ image_array = image
472
+ image_array = image_array.to(device)
473
+
474
+ if image_array.shape[2] == 4:
475
+ rgb_array, alpha_array = image_array[:, :, :3], image_array[:, :, 3]
476
+ else:
477
+ rgb_array = image_array
478
+ alpha_array = None
479
+
480
+ rgb_array = rgb_array / 255.0 - 0.5
481
+
482
+ rotated_tensor = transform_normals_R(rgb_array, rotation_matrix.to(device))
483
+
484
+ rotated_array = rotated_tensor.squeeze().permute(1, 2, 0)
485
+
486
+ rotated_array = (rotated_array/2 + 0.5) * 255
487
+
488
+ if alpha_array is not None:
489
+ rotated_array = torch.cat((rotated_array, alpha_array.unsqueeze(2)), dim=2)
490
+
491
+ rotated_array_uint8 = np.array(rotated_array.detach().cpu()).astype('uint8')
492
+
493
+ rotated_normal = Image.fromarray(rotated_array_uint8)
494
+
495
+ if save_addr:
496
+ rotated_normal.save(save_addr)
497
+ return rotated_normal
498
+
499
+
500
+
501
+ def transform_normals_R(local_normals, rotation_matrix):
502
+ assert local_normals.shape[2] == 3, f'local_normals.shape[2]: {local_normals.shape[2]}; only RGB (3-channel) images are supported'
503
+
504
+ h, w = local_normals.shape[:2]
505
+ local_normals_flat = local_normals.view(-1, 3).permute(1, 0)
506
+
507
+ images_flat = local_normals_flat.unsqueeze(0)
508
+ rotation_matrices = rotation_matrix.unsqueeze(0)
509
+ rotated_images_flat = torch.bmm(rotation_matrices, images_flat)
510
+
511
+ rotated_images = rotated_images_flat.view(1, 3, h, w)
512
+
513
+ norms = torch.norm(rotated_images, p=2, dim=1, keepdim=True)
514
+ norms = torch.where(norms == 0, torch.tensor(EPSILON).to(norms.device), norms)
515
+ normalized_images = rotated_images / norms
516
+
517
+ return normalized_images
518
+
519
+
520
+ def manage_elevation_azimuth(ele_list, azi_list):
521
+ """deal with cases when elevation > 90"""
522
+
523
+ for i in range(len(ele_list)):
524
+ elevation = ele_list[i] % 360
525
+ azimuth = azi_list[i] % 360
526
+ if elevation > 90 and elevation<=270:
527
+ # when elevation is too large, the camera ends up on the other side
528
+ # print(f'!!! elevation({elevation}) > 90 and <=270, set to 180-elevation, and add 180 to azimuth')
529
+ elevation = 180 - elevation
530
+ azimuth = azimuth + 180
531
+ # print(f'new elevation: {elevation}, new azimuth: {azimuth}')
532
+
533
+ elif elevation>270:
534
+ # print(f'!!! elevation({elevation}) > 270, set to elevation-360, and use original azimuth')
535
+ elevation = elevation - 360
536
+ azimuth = azimuth
537
+ # print(f'new elevation: {elevation}, new azimuth: {azimuth}')
538
+
539
+ ele_list[i] = elevation
540
+ azi_list[i] = azimuth
541
+
542
+ return ele_list, azi_list
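A small check of the wrap-around rules above (the input values are arbitrary illustrations):

```python
ele, azi = manage_elevation_azimuth([100.0, 300.0], [10.0, 10.0])
# 100 > 90   -> elevation becomes 180 - 100 = 80, azimuth becomes 10 + 180 = 190
# 300 > 270  -> elevation becomes 300 - 360 = -60, azimuth unchanged
print(ele, azi)   # [80.0, -60.0] [190.0, 10.0]
```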
543
+
544
+ def get_rotation_matrix_azi_ele(azimuth, elevation):
545
+
546
+ ele = elevation/180 * torch.pi
547
+ azi = azimuth/180 * torch.pi
548
+
549
+ Rz = torch.tensor([
550
+ [torch.cos(azi), 0, -torch.sin(azi)],
551
+ [0, 1, 0],
552
+ [torch.sin(azi), 0, torch.cos(azi)],
553
+ ]).to(azimuth.device)
554
+
555
+ Re = torch.tensor([
556
+ [1, 0, 0],
557
+ [0, torch.cos(ele), torch.sin(ele)],
558
+ [0, -torch.sin(ele), torch.cos(ele)],
559
+ ]).to(elevation.device)
560
+
561
+ return torch.matmul(Rz,Re).to(azimuth.device)
562
+
563
+
564
+ def rotate_vector(vector, axis, angle, device='cuda'):
565
+ rot_matrix = rotation_matrix_axis_angle(axis, angle)
566
+ return torch.matmul(vector.to(device).float(), rot_matrix.to(device).float())
567
+
568
+ def apply_rotation_angle_axis(image, axis, angle, device='cuda'):
569
+ """Apply rotation to a batch of images with shape [batch_size, 3(rgb), h, w] using PyTorch.
570
+
571
+ Args:
572
+ image (torch.Tensor): Input RGB image tensor of shape [batch_size, 3, h, w]. each pixel's rgb channels refer to direction of normal (can be negative)
573
+ axis (torch.Tensor): Rotation axis of shape [3].
574
+ angle (torch.Tensor): Rotation angles in degrees, of shape [batch_size].
575
+ Returns:
576
+ torch.Tensor: Rotated image tensor of shape [batch_size, 3, h, w]. values between [-1., 1.]
577
+
578
+ """
579
+
580
+ if not isinstance(image, torch.Tensor):
581
+ image_tensor = torch.tensor(image).to(device)
582
+ else:
583
+ image_tensor = image.to(device)
584
+
585
+ if not isinstance(axis, torch.Tensor):
586
+ axis = torch.tensor(axis)
587
+ axis = axis.to(device)
588
+
589
+ if not isinstance(angle, torch.Tensor):
590
+ angle = torch.tensor(angle)
591
+ angle = angle.to(device)
592
+
593
+ batch_size, channels, h, w = image_tensor.shape
594
+ rot_matrix = rotation_matrix_axis_angle(axis, angle)
595
+
596
+ rotation_matrices = rot_matrix.permute(2, 0, 1)
597
+
598
+ batch_size, c, h, w = image_tensor.shape
599
+ images_flat = image_tensor.view(batch_size, c, h * w)
600
+
601
+ rotated_images_flat = torch.bmm(rotation_matrices, images_flat)
602
+
603
+ rotated_images = rotated_images_flat.view(batch_size, c, h, w)
604
+
605
+ norms = torch.norm(rotated_images, p=2, dim=1, keepdim=True)
606
+
607
+ norms = torch.where(norms == 0, torch.tensor(EPSILON).to(norms.device), norms)
608
+
609
+ normalized_images = rotated_images / norms
610
+
611
+ return normalized_images
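A sketch of how the multi-view normal helpers in this file might be chained; the four RGBA file paths and the CUDA device are assumptions for illustration only:

```python
from PIL import Image

# Hypothetical inputs: four RGBA normal maps rendered at 0/90/180/270 degrees around the object.
normal_pils = [Image.open(f"normal_{i:02d}.png").convert("RGBA") for i in range(4)]

# Rotate every view's normals back into the frontal frame (needs a CUDA device, see do_rotate),
# then force the background to black before further processing.
aligned = rotate_normals_torch(normal_pils, return_types='pil', rotate_direction=1)
aligned = change_bkgd(aligned, new_bkgd=(0., 0., 0.))
aligned[0].save("normal_00_aligned.png")
```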
models/lrm/config/PRM_inference.yaml ADDED
@@ -0,0 +1,22 @@
1
+ model_config:
2
+ target: models.lrm.models.lrm_mesh.PRM
3
+ params:
4
+ encoder_feat_dim: 768
5
+ encoder_freeze: false
6
+ encoder_model_name: facebook/dino-vitb16
7
+ transformer_dim: 1024
8
+ transformer_layers: 16
9
+ transformer_heads: 16
10
+ triplane_low_res: 32
11
+ triplane_high_res: 64
12
+ triplane_dim: 80
13
+ rendering_samples_per_ray: 128
14
+ grid_res: 128
15
+ grid_scale: 2.1
16
+
17
+
18
+ infer_config:
19
+ unet_path: ckpts/diffusion_pytorch_model.bin
20
+ model_path: ckpts/final_ckpt.ckpt
21
+ texture_resolution: 2048
22
+ render_resolution: 512
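This config follows the usual `target`/`params` instantiation pattern. A loading sketch, assuming OmegaConf is available (the exact helper function this repo uses for instantiation is not shown in this diff):

```python
import importlib
from omegaconf import OmegaConf

cfg = OmegaConf.load("models/lrm/config/PRM_inference.yaml")
module_path, cls_name = cfg.model_config.target.rsplit(".", 1)    # models.lrm.models.lrm_mesh / PRM
model_cls = getattr(importlib.import_module(module_path), cls_name)
model = model_cls(**cfg.model_config.params)                       # PRM(encoder_feat_dim=768, ...)
print(cfg.infer_config.render_resolution)                          # 512
```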
models/lrm/models/__init__.py ADDED
File without changes
models/lrm/models/decoder/__init__.py ADDED
File without changes
models/lrm/models/decoder/transformer.py ADDED
@@ -0,0 +1,123 @@
1
+ # Copyright (c) 2023, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+
19
+
20
+ class BasicTransformerBlock(nn.Module):
21
+ """
22
+ Transformer block that takes in a cross-attention condition and another modulation vector applied to sub-blocks.
23
+ """
24
+ # use attention from torch.nn.MultiHeadAttention
25
+ # Block contains a cross-attention layer, a self-attention layer, and a MLP
26
+ def __init__(
27
+ self,
28
+ inner_dim: int,
29
+ cond_dim: int,
30
+ num_heads: int,
31
+ eps: float,
32
+ attn_drop: float = 0.,
33
+ attn_bias: bool = False,
34
+ mlp_ratio: float = 4.,
35
+ mlp_drop: float = 0.,
36
+ ):
37
+ super().__init__()
38
+
39
+ self.norm1 = nn.LayerNorm(inner_dim)
40
+ self.cross_attn = nn.MultiheadAttention(
41
+ embed_dim=inner_dim, num_heads=num_heads, kdim=cond_dim, vdim=cond_dim,
42
+ dropout=attn_drop, bias=attn_bias, batch_first=True)
43
+ self.norm2 = nn.LayerNorm(inner_dim)
44
+ self.self_attn = nn.MultiheadAttention(
45
+ embed_dim=inner_dim, num_heads=num_heads,
46
+ dropout=attn_drop, bias=attn_bias, batch_first=True)
47
+ self.norm3 = nn.LayerNorm(inner_dim)
48
+ self.mlp = nn.Sequential(
49
+ nn.Linear(inner_dim, int(inner_dim * mlp_ratio)),
50
+ nn.GELU(),
51
+ nn.Dropout(mlp_drop),
52
+ nn.Linear(int(inner_dim * mlp_ratio), inner_dim),
53
+ nn.Dropout(mlp_drop),
54
+ )
55
+
56
+ def forward(self, x, cond):
57
+ # x: [N, L, D]
58
+ # cond: [N, L_cond, D_cond]
59
+ x = x + self.cross_attn(self.norm1(x), cond, cond)[0]
60
+ before_sa = self.norm2(x)
61
+ x = x + self.self_attn(before_sa, before_sa, before_sa)[0]
62
+ x = x + self.mlp(self.norm3(x))
63
+ return x
64
+
65
+
66
+ class TriplaneTransformer(nn.Module):
67
+ """
68
+ Transformer with condition that generates a triplane representation.
69
+
70
+ Reference:
71
+ Timm: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L486
72
+ """
73
+ def __init__(
74
+ self,
75
+ inner_dim: int,
76
+ image_feat_dim: int,
77
+ triplane_low_res: int,
78
+ triplane_high_res: int,
79
+ triplane_dim: int,
80
+ num_layers: int,
81
+ num_heads: int,
82
+ eps: float = 1e-6,
83
+ ):
84
+ super().__init__()
85
+
86
+ # attributes
87
+ self.triplane_low_res = triplane_low_res
88
+ self.triplane_high_res = triplane_high_res
89
+ self.triplane_dim = triplane_dim
90
+
91
+ # modules
92
+ # initialize pos_embed with 1/sqrt(dim) * N(0, 1)
93
+ self.pos_embed = nn.Parameter(torch.randn(1, 3*triplane_low_res**2, inner_dim) * (1. / inner_dim) ** 0.5)
94
+ self.layers = nn.ModuleList([
95
+ BasicTransformerBlock(
96
+ inner_dim=inner_dim, cond_dim=image_feat_dim, num_heads=num_heads, eps=eps)
97
+ for _ in range(num_layers)
98
+ ])
99
+ self.norm = nn.LayerNorm(inner_dim, eps=eps)
100
+ self.deconv = nn.ConvTranspose2d(inner_dim, triplane_dim, kernel_size=2, stride=2, padding=0)
101
+
102
+ def forward(self, image_feats):
103
+ # image_feats: [N, L_cond, D_cond]
104
+
105
+ N = image_feats.shape[0]
106
+ H = W = self.triplane_low_res
107
+ L = 3 * H * W
108
+
109
+ x = self.pos_embed.repeat(N, 1, 1) # [N, L, D]
110
+ for layer in self.layers:
111
+ x = layer(x, image_feats)
112
+ x = self.norm(x)
113
+
114
+ # separate each plane and apply deconv
115
+ x = x.view(N, 3, H, W, -1)
116
+ x = torch.einsum('nihwd->indhw', x) # [3, N, D, H, W]
117
+ x = x.contiguous().view(3*N, -1, H, W) # [3*N, D, H, W]
118
+ x = self.deconv(x) # [3*N, D', H', W']
119
+ x = x.view(3, N, *x.shape[-3:]) # [3, N, D', H', W']
120
+ x = torch.einsum('indhw->nidhw', x) # [N, 3, D', H', W']
121
+ x = x.contiguous()
122
+
123
+ return x
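A shape walkthrough for `TriplaneTransformer` with the sizes from `PRM_inference.yaml` above; the dummy token count of 197 is an assumption matching a 224x224 DINO-B/16 input:

```python
import torch

model = TriplaneTransformer(
    inner_dim=1024, image_feat_dim=768,
    triplane_low_res=32, triplane_high_res=64, triplane_dim=80,
    num_layers=16, num_heads=16,
)
image_feats = torch.randn(1, 197, 768)   # [N, L_cond, D_cond] dummy image tokens
planes = model(image_feats)
print(planes.shape)                      # torch.Size([1, 3, 80, 64, 64]); 32x32 tokens per plane, deconv x2 -> 64
```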
models/lrm/models/encoder/__init__.py ADDED
File without changes
models/lrm/models/encoder/dino.py ADDED
@@ -0,0 +1,550 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch ViT model."""
16
+
17
+
18
+ import collections.abc
19
+ import math
20
+ from typing import Dict, List, Optional, Set, Tuple, Union
21
+
22
+ import torch
23
+ from torch import nn
24
+
25
+ from transformers.activations import ACT2FN
26
+ from transformers.modeling_outputs import (
27
+ BaseModelOutput,
28
+ BaseModelOutputWithPooling,
29
+ )
30
+ from transformers import PreTrainedModel, ViTConfig
31
+ from transformers.pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
32
+
33
+
34
+ class ViTEmbeddings(nn.Module):
35
+ """
36
+ Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
37
+ """
38
+
39
+ def __init__(self, config: ViTConfig, use_mask_token: bool = False) -> None:
40
+ super().__init__()
41
+
42
+ self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
43
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
44
+ self.patch_embeddings = ViTPatchEmbeddings(config)
45
+ num_patches = self.patch_embeddings.num_patches
46
+ self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
47
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
48
+ self.config = config
49
+
50
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
51
+ """
52
+ This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
53
+ resolution images.
54
+
55
+ Source:
56
+ https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
57
+ """
58
+
59
+ num_patches = embeddings.shape[1] - 1
60
+ num_positions = self.position_embeddings.shape[1] - 1
61
+ if num_patches == num_positions and height == width:
62
+ return self.position_embeddings
63
+ class_pos_embed = self.position_embeddings[:, 0]
64
+ patch_pos_embed = self.position_embeddings[:, 1:]
65
+ dim = embeddings.shape[-1]
66
+ h0 = height // self.config.patch_size
67
+ w0 = width // self.config.patch_size
68
+ # we add a small number to avoid floating point error in the interpolation
69
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
70
+ h0, w0 = h0 + 0.1, w0 + 0.1
71
+ patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim)
72
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
73
+ patch_pos_embed = nn.functional.interpolate(
74
+ patch_pos_embed,
75
+ scale_factor=(h0 / math.sqrt(num_positions), w0 / math.sqrt(num_positions)),
76
+ mode="bicubic",
77
+ align_corners=False,
78
+ )
79
+ assert int(h0) == patch_pos_embed.shape[-2] and int(w0) == patch_pos_embed.shape[-1]
80
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
81
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
82
+
83
+ def forward(
84
+ self,
85
+ pixel_values: torch.Tensor,
86
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
87
+ interpolate_pos_encoding: bool = False,
88
+ ) -> torch.Tensor:
89
+ batch_size, num_channels, height, width = pixel_values.shape
90
+ embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
91
+
92
+ if bool_masked_pos is not None:
93
+ seq_length = embeddings.shape[1]
94
+ mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
95
+ # replace the masked visual tokens by mask_tokens
96
+ mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
97
+ embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
98
+
99
+ # add the [CLS] token to the embedded patch tokens
100
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1)
101
+ embeddings = torch.cat((cls_tokens, embeddings), dim=1)
102
+
103
+ # add positional encoding to each token
104
+ if interpolate_pos_encoding:
105
+ embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
106
+ else:
107
+ embeddings = embeddings + self.position_embeddings
108
+
109
+ embeddings = self.dropout(embeddings)
110
+
111
+ return embeddings
112
+
113
+
114
+ class ViTPatchEmbeddings(nn.Module):
115
+ """
116
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
117
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
118
+ Transformer.
119
+ """
120
+
121
+ def __init__(self, config):
122
+ super().__init__()
123
+ image_size, patch_size = config.image_size, config.patch_size
124
+ num_channels, hidden_size = config.num_channels, config.hidden_size
125
+
126
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
127
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
128
+ num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
129
+ self.image_size = image_size
130
+ self.patch_size = patch_size
131
+ self.num_channels = num_channels
132
+ self.num_patches = num_patches
133
+
134
+ self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
135
+
136
+ def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
137
+ batch_size, num_channels, height, width = pixel_values.shape
138
+ if num_channels != self.num_channels:
139
+ raise ValueError(
140
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
141
+ f" Expected {self.num_channels} but got {num_channels}."
142
+ )
143
+ if not interpolate_pos_encoding:
144
+ if height != self.image_size[0] or width != self.image_size[1]:
145
+ raise ValueError(
146
+ f"Input image size ({height}*{width}) doesn't match model"
147
+ f" ({self.image_size[0]}*{self.image_size[1]})."
148
+ )
149
+ embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
150
+ return embeddings
151
+
152
+
153
+ class ViTSelfAttention(nn.Module):
154
+ def __init__(self, config: ViTConfig) -> None:
155
+ super().__init__()
156
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
157
+ raise ValueError(
158
+ f"The hidden size {config.hidden_size,} is not a multiple of the number of attention "
159
+ f"heads {config.num_attention_heads}."
160
+ )
161
+
162
+ self.num_attention_heads = config.num_attention_heads
163
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
164
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
165
+
166
+ self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
167
+ self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
168
+ self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
169
+
170
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
171
+
172
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
173
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
174
+ x = x.view(new_x_shape)
175
+ return x.permute(0, 2, 1, 3)
176
+
177
+ def forward(
178
+ self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
179
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
180
+ mixed_query_layer = self.query(hidden_states)
181
+
182
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
183
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
184
+ query_layer = self.transpose_for_scores(mixed_query_layer)
185
+
186
+ # Take the dot product between "query" and "key" to get the raw attention scores.
187
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
188
+
189
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
190
+
191
+ # Normalize the attention scores to probabilities.
192
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
193
+
194
+ # This is actually dropping out entire tokens to attend to, which might
195
+ # seem a bit unusual, but is taken from the original Transformer paper.
196
+ attention_probs = self.dropout(attention_probs)
197
+
198
+ # Mask heads if we want to
199
+ if head_mask is not None:
200
+ attention_probs = attention_probs * head_mask
201
+
202
+ context_layer = torch.matmul(attention_probs, value_layer)
203
+
204
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
205
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
206
+ context_layer = context_layer.view(new_context_layer_shape)
207
+
208
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
209
+
210
+ return outputs
211
+
212
+
213
+ class ViTSelfOutput(nn.Module):
214
+ """
215
+ The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
216
+ layernorm applied before each block.
217
+ """
218
+
219
+ def __init__(self, config: ViTConfig) -> None:
220
+ super().__init__()
221
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
222
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
223
+
224
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
225
+ hidden_states = self.dense(hidden_states)
226
+ hidden_states = self.dropout(hidden_states)
227
+
228
+ return hidden_states
229
+
230
+
231
+ class ViTAttention(nn.Module):
232
+ def __init__(self, config: ViTConfig) -> None:
233
+ super().__init__()
234
+ self.attention = ViTSelfAttention(config)
235
+ self.output = ViTSelfOutput(config)
236
+ self.pruned_heads = set()
237
+
238
+ def prune_heads(self, heads: Set[int]) -> None:
239
+ if len(heads) == 0:
240
+ return
241
+ heads, index = find_pruneable_heads_and_indices(
242
+ heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
243
+ )
244
+
245
+ # Prune linear layers
246
+ self.attention.query = prune_linear_layer(self.attention.query, index)
247
+ self.attention.key = prune_linear_layer(self.attention.key, index)
248
+ self.attention.value = prune_linear_layer(self.attention.value, index)
249
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
250
+
251
+ # Update hyper params and store pruned heads
252
+ self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
253
+ self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
254
+ self.pruned_heads = self.pruned_heads.union(heads)
255
+
256
+ def forward(
257
+ self,
258
+ hidden_states: torch.Tensor,
259
+ head_mask: Optional[torch.Tensor] = None,
260
+ output_attentions: bool = False,
261
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
262
+ self_outputs = self.attention(hidden_states, head_mask, output_attentions)
263
+
264
+ attention_output = self.output(self_outputs[0], hidden_states)
265
+
266
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
267
+ return outputs
268
+
269
+
270
+ class ViTIntermediate(nn.Module):
271
+ def __init__(self, config: ViTConfig) -> None:
272
+ super().__init__()
273
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
274
+ if isinstance(config.hidden_act, str):
275
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
276
+ else:
277
+ self.intermediate_act_fn = config.hidden_act
278
+
279
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
280
+ hidden_states = self.dense(hidden_states)
281
+ hidden_states = self.intermediate_act_fn(hidden_states)
282
+
283
+ return hidden_states
284
+
285
+
286
+ class ViTOutput(nn.Module):
287
+ def __init__(self, config: ViTConfig) -> None:
288
+ super().__init__()
289
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
290
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
291
+
292
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
293
+ hidden_states = self.dense(hidden_states)
294
+ hidden_states = self.dropout(hidden_states)
295
+
296
+ hidden_states = hidden_states + input_tensor
297
+
298
+ return hidden_states
299
+
300
+
301
+ def modulate(x, shift, scale):
302
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
303
+
304
+
305
+ class ViTLayer(nn.Module):
306
+ """This corresponds to the Block class in the timm implementation."""
307
+
308
+ def __init__(self, config: ViTConfig) -> None:
309
+ super().__init__()
310
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
311
+ self.seq_len_dim = 1
312
+ self.attention = ViTAttention(config)
313
+ self.intermediate = ViTIntermediate(config)
314
+ self.output = ViTOutput(config)
315
+ self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
316
+ self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
317
+
318
+ self.adaLN_modulation = nn.Sequential(
319
+ nn.SiLU(),
320
+ nn.Linear(config.hidden_size, 4 * config.hidden_size, bias=True)
321
+ )
322
+ nn.init.constant_(self.adaLN_modulation[-1].weight, 0)
323
+ nn.init.constant_(self.adaLN_modulation[-1].bias, 0)
324
+
325
+ def forward(
326
+ self,
327
+ hidden_states: torch.Tensor,
328
+ adaln_input: torch.Tensor = None,
329
+ head_mask: Optional[torch.Tensor] = None,
330
+ output_attentions: bool = False,
331
+ ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
332
+ shift_msa, scale_msa, shift_mlp, scale_mlp = self.adaLN_modulation(adaln_input).chunk(4, dim=1)
333
+
334
+ self_attention_outputs = self.attention(
335
+ modulate(self.layernorm_before(hidden_states), shift_msa, scale_msa), # in ViT, layernorm is applied before self-attention
336
+ head_mask,
337
+ output_attentions=output_attentions,
338
+ )
339
+ attention_output = self_attention_outputs[0]
340
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
341
+
342
+ # first residual connection
343
+ hidden_states = attention_output + hidden_states
344
+
345
+ # in ViT, layernorm is also applied after self-attention
346
+ layer_output = modulate(self.layernorm_after(hidden_states), shift_mlp, scale_mlp)
347
+ layer_output = self.intermediate(layer_output)
348
+
349
+ # second residual connection is done here
350
+ layer_output = self.output(layer_output, hidden_states)
351
+
352
+ outputs = (layer_output,) + outputs
353
+
354
+ return outputs
355
+
356
+
357
+ class ViTEncoder(nn.Module):
358
+ def __init__(self, config: ViTConfig) -> None:
359
+ super().__init__()
360
+ self.config = config
361
+ self.layer = nn.ModuleList([ViTLayer(config) for _ in range(config.num_hidden_layers)])
362
+ self.gradient_checkpointing = False
363
+
364
+ def forward(
365
+ self,
366
+ hidden_states: torch.Tensor,
367
+ adaln_input: torch.Tensor = None,
368
+ head_mask: Optional[torch.Tensor] = None,
369
+ output_attentions: bool = False,
370
+ output_hidden_states: bool = False,
371
+ return_dict: bool = True,
372
+ ) -> Union[tuple, BaseModelOutput]:
373
+ all_hidden_states = () if output_hidden_states else None
374
+ all_self_attentions = () if output_attentions else None
375
+
376
+ for i, layer_module in enumerate(self.layer):
377
+ if output_hidden_states:
378
+ all_hidden_states = all_hidden_states + (hidden_states,)
379
+
380
+ layer_head_mask = head_mask[i] if head_mask is not None else None
381
+
382
+ if self.gradient_checkpointing and self.training:
383
+ layer_outputs = self._gradient_checkpointing_func(
384
+ layer_module.__call__,
385
+ hidden_states,
386
+ adaln_input,
387
+ layer_head_mask,
388
+ output_attentions,
389
+ )
390
+ else:
391
+ layer_outputs = layer_module(hidden_states, adaln_input, layer_head_mask, output_attentions)
392
+
393
+ hidden_states = layer_outputs[0]
394
+
395
+ if output_attentions:
396
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
397
+
398
+ if output_hidden_states:
399
+ all_hidden_states = all_hidden_states + (hidden_states,)
400
+
401
+ if not return_dict:
402
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
403
+ return BaseModelOutput(
404
+ last_hidden_state=hidden_states,
405
+ hidden_states=all_hidden_states,
406
+ attentions=all_self_attentions,
407
+ )
408
+
409
+
410
+ class ViTPreTrainedModel(PreTrainedModel):
411
+ """
412
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
413
+ models.
414
+ """
415
+
416
+ config_class = ViTConfig
417
+ base_model_prefix = "vit"
418
+ main_input_name = "pixel_values"
419
+ supports_gradient_checkpointing = True
420
+ _no_split_modules = ["ViTEmbeddings", "ViTLayer"]
421
+
422
+ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
423
+ """Initialize the weights"""
424
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
425
+ # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
426
+ # `trunc_normal_cpu` not implemented in `half` issues
427
+ module.weight.data = nn.init.trunc_normal_(
428
+ module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
429
+ ).to(module.weight.dtype)
430
+ if module.bias is not None:
431
+ module.bias.data.zero_()
432
+ elif isinstance(module, nn.LayerNorm):
433
+ module.bias.data.zero_()
434
+ module.weight.data.fill_(1.0)
435
+ elif isinstance(module, ViTEmbeddings):
436
+ module.position_embeddings.data = nn.init.trunc_normal_(
437
+ module.position_embeddings.data.to(torch.float32),
438
+ mean=0.0,
439
+ std=self.config.initializer_range,
440
+ ).to(module.position_embeddings.dtype)
441
+
442
+ module.cls_token.data = nn.init.trunc_normal_(
443
+ module.cls_token.data.to(torch.float32),
444
+ mean=0.0,
445
+ std=self.config.initializer_range,
446
+ ).to(module.cls_token.dtype)
447
+
448
+
449
+ class ViTModel(ViTPreTrainedModel):
450
+ def __init__(self, config: ViTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False):
451
+ super().__init__(config)
452
+ self.config = config
453
+
454
+ self.embeddings = ViTEmbeddings(config, use_mask_token=use_mask_token)
455
+ self.encoder = ViTEncoder(config)
456
+
457
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
458
+ self.pooler = ViTPooler(config) if add_pooling_layer else None
459
+
460
+ # Initialize weights and apply final processing
461
+ self.post_init()
462
+
463
+ def get_input_embeddings(self) -> ViTPatchEmbeddings:
464
+ return self.embeddings.patch_embeddings
465
+
466
+ def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
467
+ """
468
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
469
+ class PreTrainedModel
470
+ """
471
+ for layer, heads in heads_to_prune.items():
472
+ self.encoder.layer[layer].attention.prune_heads(heads)
473
+
474
+ def forward(
475
+ self,
476
+ pixel_values: Optional[torch.Tensor] = None,
477
+ adaln_input: Optional[torch.Tensor] = None,
478
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
479
+ head_mask: Optional[torch.Tensor] = None,
480
+ output_attentions: Optional[bool] = None,
481
+ output_hidden_states: Optional[bool] = None,
482
+ interpolate_pos_encoding: Optional[bool] = None,
483
+ return_dict: Optional[bool] = None,
484
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
485
+ r"""
486
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
487
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
488
+ """
489
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
490
+ output_hidden_states = (
491
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
492
+ )
493
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
494
+
495
+ if pixel_values is None:
496
+ raise ValueError("You have to specify pixel_values")
497
+
498
+ # Prepare head mask if needed
499
+ # 1.0 in head_mask indicate we keep the head
500
+ # attention_probs has shape bsz x n_heads x N x N
501
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
502
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
503
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
504
+
505
+ # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?)
506
+ expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
507
+ if pixel_values.dtype != expected_dtype:
508
+ pixel_values = pixel_values.to(expected_dtype)
509
+
510
+ embedding_output = self.embeddings(
511
+ pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
512
+ )
513
+
514
+ encoder_outputs = self.encoder(
515
+ embedding_output,
516
+ adaln_input=adaln_input,
517
+ head_mask=head_mask,
518
+ output_attentions=output_attentions,
519
+ output_hidden_states=output_hidden_states,
520
+ return_dict=return_dict,
521
+ )
522
+ sequence_output = encoder_outputs[0]
523
+ sequence_output = self.layernorm(sequence_output)
524
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
525
+
526
+ if not return_dict:
527
+ head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
528
+ return head_outputs + encoder_outputs[1:]
529
+
530
+ return BaseModelOutputWithPooling(
531
+ last_hidden_state=sequence_output,
532
+ pooler_output=pooled_output,
533
+ hidden_states=encoder_outputs.hidden_states,
534
+ attentions=encoder_outputs.attentions,
535
+ )
536
+
537
+
538
+ class ViTPooler(nn.Module):
539
+ def __init__(self, config: ViTConfig):
540
+ super().__init__()
541
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
542
+ self.activation = nn.Tanh()
543
+
544
+ def forward(self, hidden_states):
545
+ # We "pool" the model by simply taking the hidden state corresponding
546
+ # to the first token.
547
+ first_token_tensor = hidden_states[:, 0]
548
+ pooled_output = self.dense(first_token_tensor)
549
+ pooled_output = self.activation(pooled_output)
550
+ return pooled_output
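The main change relative to the stock Hugging Face ViT is the adaLN conditioning: each `ViTLayer` predicts a per-sample shift/scale from `adaln_input` (camera embeddings in this codebase) and applies it via `modulate` before the attention and MLP sub-blocks. A tiny numeric illustration of `modulate` alone (dummy values):

```python
import torch

x = torch.ones(2, 5, 8)             # [N, L, D] token features
shift = torch.full((2, 8), 0.5)     # per-sample shift predicted by adaLN_modulation
scale = torch.full((2, 8), 0.25)    # per-sample scale predicted by adaLN_modulation
out = modulate(x, shift, scale)     # x * (1 + scale) + shift, broadcast over the token dimension
print(out[0, 0, 0].item())          # 1.75
```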
models/lrm/models/encoder/dino_wrapper.py ADDED
@@ -0,0 +1,80 @@
1
+ # Copyright (c) 2023, Zexin He
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # https://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import torch.nn as nn
17
+ from transformers import ViTImageProcessor
18
+ from einops import rearrange, repeat
19
+ from .dino import ViTModel
20
+
21
+
22
+ class DinoWrapper(nn.Module):
23
+ """
24
+ Dino v1 wrapper using huggingface transformer implementation.
25
+ """
26
+ def __init__(self, model_name: str, freeze: bool = True):
27
+ super().__init__()
28
+ self.model, self.processor = self._build_dino(model_name)
29
+ self.camera_embedder = nn.Sequential(
30
+ nn.Linear(16, self.model.config.hidden_size, bias=True),
31
+ nn.SiLU(),
32
+ nn.Linear(self.model.config.hidden_size, self.model.config.hidden_size, bias=True)
33
+ )
34
+ if freeze:
35
+ self._freeze()
36
+
37
+ def forward(self, image, camera):
38
+ # image: [B, N, C, H, W]
39
+ # camera: [B, N, D]
40
+ # RGB image with [0,1] scale and properly sized
41
+ if image.ndim == 5:
42
+ image = rearrange(image, 'b n c h w -> (b n) c h w')
43
+ dtype = image.dtype
44
+ inputs = self.processor(
45
+ images=image.float(),
46
+ return_tensors="pt",
47
+ do_rescale=False,
48
+ do_resize=False,
49
+ ).to(self.model.device).to(dtype)
50
+ # embed camera
51
+ N = camera.shape[1]
52
+ camera_embeddings = self.camera_embedder(camera)
53
+ camera_embeddings = rearrange(camera_embeddings, 'b n d -> (b n) d')
54
+ embeddings = camera_embeddings
55
+ # This resampling of positional embedding uses bicubic interpolation
56
+ outputs = self.model(**inputs, adaln_input=embeddings, interpolate_pos_encoding=True)
57
+ last_hidden_states = outputs.last_hidden_state
58
+ return last_hidden_states
59
+
60
+ def _freeze(self):
61
+ print(f"======== Freezing DinoWrapper ========")
62
+ self.model.eval()
63
+ for name, param in self.model.named_parameters():
64
+ param.requires_grad = False
65
+
66
+ @staticmethod
67
+ def _build_dino(model_name: str, proxy_error_retries: int = 3, proxy_error_cooldown: int = 5):
68
+ import requests
69
+ try:
70
+ model = ViTModel.from_pretrained(model_name, add_pooling_layer=False)
71
+ processor = ViTImageProcessor.from_pretrained(model_name)
72
+ return model, processor
73
+ except requests.exceptions.ProxyError as err:
74
+ if proxy_error_retries > 0:
75
+ print(f"Huggingface ProxyError: Retrying in {proxy_error_cooldown} seconds...")
76
+ import time
77
+ time.sleep(proxy_error_cooldown)
78
+ return DinoWrapper._build_dino(model_name, proxy_error_retries - 1, proxy_error_cooldown)
79
+ else:
80
+ raise err
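A usage sketch for `DinoWrapper` (it downloads the `facebook/dino-vitb16` weights named in the config; the batch/view sizes and random tensors below are placeholders):

```python
import torch

wrapper = DinoWrapper(model_name="facebook/dino-vitb16", freeze=True)
images = torch.rand(1, 6, 3, 224, 224)   # [B, N, C, H, W], RGB in [0, 1]
cameras = torch.rand(1, 6, 16)           # flattened 4x4 poses; camera_embedder expects 16-D input
tokens = wrapper(images, cameras)
print(tokens.shape)                      # [B*N, num_tokens, hidden] = [6, 197, 768]
```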
models/lrm/models/geometry/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
models/lrm/models/geometry/camera/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
8
+
9
+ import torch
10
+ from torch import nn
11
+
12
+
13
+ class Camera(nn.Module):
14
+ def __init__(self):
15
+ super(Camera, self).__init__()
16
+ pass
models/lrm/models/geometry/camera/perspective_camera.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
8
+
9
+ import torch
10
+ from . import Camera
11
+ import numpy as np
12
+
13
+
14
+ def projection(x=0.1, n=1.0, f=50.0, near_plane=None):
15
+ if near_plane is None:
16
+ near_plane = n
17
+ return np.array(
18
+ [[n / x, 0, 0, 0],
19
+ [0, n / -x, 0, 0],
20
+ [0, 0, -(f + near_plane) / (f - near_plane), -(2 * f * near_plane) / (f - near_plane)],
21
+ [0, 0, -1, 0]]).astype(np.float32)
22
+
23
+
24
+ class PerspectiveCamera(Camera):
25
+ def __init__(self, fovy=49.0, device='cuda'):
26
+ super(PerspectiveCamera, self).__init__()
27
+ self.device = device
28
+ focal = np.tan(fovy / 180.0 * np.pi * 0.5)
29
+ self.proj_mtx = torch.from_numpy(projection(x=focal, f=1000.0, n=1.0, near_plane=0.1)).to(self.device).unsqueeze(dim=0)
30
+
31
+ def project(self, points_bxnx4):
32
+ out = torch.matmul(
33
+ points_bxnx4,
34
+ torch.transpose(self.proj_mtx, 1, 2))
35
+ return out
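A small CPU-only check of the projection (the device is overridden so the sketch runs without CUDA): a camera-space point two units in front of the camera is mapped to clip space, then to NDC by the perspective divide.

```python
import torch

cam = PerspectiveCamera(fovy=49.0, device='cpu')
points = torch.tensor([[[0.0, 0.0, -2.0, 1.0]]])    # [B, N, 4] homogeneous camera-space point
clip = cam.project(points)                          # [1, 1, 4] clip-space coordinates
ndc = clip[..., :3] / clip[..., 3:4]                # perspective divide; on-axis point maps to x = y = 0
print(ndc)
```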
models/lrm/models/geometry/render/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ import torch
2
+
3
+ class Renderer():
4
+ def __init__(self):
5
+ pass
6
+
7
+ def forward(self):
8
+ pass
models/lrm/models/geometry/render/neural_render.py ADDED
@@ -0,0 +1,293 @@
1
+ # Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION & AFFILIATES and its licensors retain all intellectual property
4
+ # and proprietary rights in and to this software, related documentation
5
+ # and any modifications thereto. Any use, reproduction, disclosure or
6
+ # distribution of this software and related documentation without an express
7
+ # license agreement from NVIDIA CORPORATION & AFFILIATES is strictly prohibited.
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ import nvdiffrast.torch as dr
12
+ from . import Renderer
13
+ from . import util
14
+ from . import renderutils as ru
15
+ _FG_LUT = None
16
+
17
+
18
+ def interpolate(attr, rast, attr_idx, rast_db=None):
19
+ return dr.interpolate(
20
+ attr.contiguous(), rast, attr_idx, rast_db=rast_db,
21
+ diff_attrs=None if rast_db is None else 'all')
22
+
23
+
24
+ def xfm_points(points, matrix, use_python=True):
25
+ '''Transform points.
26
+ Args:
27
+ points: Tensor containing 3D points with shape [minibatch_size, num_vertices, 3] or [1, num_vertices, 3]
28
+ matrix: A 4x4 transform matrix with shape [minibatch_size, 4, 4]
29
+ use_python: Use PyTorch's torch.matmul (for validation)
30
+ Returns:
31
+ Transformed points in homogeneous 4D with shape [minibatch_size, num_vertices, 4].
32
+ '''
33
+ out = torch.matmul(torch.nn.functional.pad(points, pad=(0, 1), mode='constant', value=1.0), torch.transpose(matrix, 1, 2))
34
+ if torch.is_anomaly_enabled():
35
+ assert torch.all(torch.isfinite(out)), "Output of xfm_points contains inf or NaN"
36
+ return out
37
+
38
+
39
+ def dot(x, y):
40
+ return torch.sum(x * y, -1, keepdim=True)
41
+
42
+
43
+ def compute_vertex_normal(v_pos, t_pos_idx):
44
+ i0 = t_pos_idx[:, 0]
45
+ i1 = t_pos_idx[:, 1]
46
+ i2 = t_pos_idx[:, 2]
47
+
48
+ v0 = v_pos[i0, :]
49
+ v1 = v_pos[i1, :]
50
+ v2 = v_pos[i2, :]
51
+
52
+ face_normals = torch.cross(v1 - v0, v2 - v0)
53
+
54
+ # Splat face normals to vertices
55
+ v_nrm = torch.zeros_like(v_pos)
56
+ v_nrm.scatter_add_(0, i0[:, None].repeat(1, 3), face_normals)
57
+ v_nrm.scatter_add_(0, i1[:, None].repeat(1, 3), face_normals)
58
+ v_nrm.scatter_add_(0, i2[:, None].repeat(1, 3), face_normals)
59
+
60
+ # Normalize, replace zero (degenerated) normals with some default value
61
+ v_nrm = torch.where(
62
+ dot(v_nrm, v_nrm) > 1e-20, v_nrm, torch.as_tensor([0.0, 0.0, 1.0]).to(v_nrm)
63
+ )
64
+ v_nrm = F.normalize(v_nrm, dim=1)
65
+ assert torch.all(torch.isfinite(v_nrm))
66
+
67
+ return v_nrm
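A sanity check for `compute_vertex_normal` on a single counter-clockwise triangle in the z = 0 plane; every vertex normal should come out as +z:

```python
import torch

v_pos = torch.tensor([[0., 0., 0.], [1., 0., 0.], [0., 1., 0.]])
t_pos_idx = torch.tensor([[0, 1, 2]])
print(compute_vertex_normal(v_pos, t_pos_idx))   # three rows, each approximately [0., 0., 1.]
```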
68
+
69
+
70
+ class NeuralRender(Renderer):
71
+ def __init__(self, device='cuda', camera_model=None):
72
+ super(NeuralRender, self).__init__()
73
+ self.device = device
74
+ self.ctx = dr.RasterizeCudaContext(device=device)
75
+ self.projection_mtx = None
76
+ self.camera = camera_model
77
+
78
+ # ==============================================================================================
79
+ # pixel shader
80
+ # ==============================================================================================
81
+ # def shade(
82
+ # self,
83
+ # gb_pos,
84
+ # gb_geometric_normal,
85
+ # gb_normal,
86
+ # gb_tangent,
87
+ # gb_texc,
88
+ # gb_texc_deriv,
89
+ # view_pos,
90
+ # ):
91
+
92
+ # ################################################################################
93
+ # # Texture lookups
94
+ # ################################################################################
95
+ # breakpoint()
96
+ # # Separate kd into alpha and color, default alpha = 1
97
+ # alpha = kd[..., 3:4] if kd.shape[-1] == 4 else torch.ones_like(kd[..., 0:1])
98
+ # kd = kd[..., 0:3]
99
+
100
+ # ################################################################################
101
+ # # Normal perturbation & normal bend
102
+ # ################################################################################
103
+
104
+ # perturbed_nrm = None
105
+
106
+ # gb_normal = ru.prepare_shading_normal(gb_pos, view_pos, perturbed_nrm, gb_normal, gb_tangent, gb_geometric_normal, two_sided_shading=True, opengl=True)
107
+
108
+ # ################################################################################
109
+ # # Evaluate BSDF
110
+ # ################################################################################
111
+
112
+ # assert 'bsdf' in material or bsdf is not None, "Material must specify a BSDF type"
113
+ # bsdf = material['bsdf'] if bsdf is None else bsdf
114
+ # if bsdf == 'pbr':
115
+ # if isinstance(lgt, light.EnvironmentLight):
116
+ # shaded_col = lgt.shade(gb_pos, gb_normal, kd, ks, view_pos, specular=True)
117
+ # else:
118
+ # assert False, "Invalid light type"
119
+ # elif bsdf == 'diffuse':
120
+ # if isinstance(lgt, light.EnvironmentLight):
121
+ # shaded_col = lgt.shade(gb_pos, gb_normal, kd, ks, view_pos, specular=False)
122
+ # else:
123
+ # assert False, "Invalid light type"
124
+ # elif bsdf == 'normal':
125
+ # shaded_col = (gb_normal + 1.0)*0.5
126
+ # elif bsdf == 'tangent':
127
+ # shaded_col = (gb_tangent + 1.0)*0.5
128
+ # elif bsdf == 'kd':
129
+ # shaded_col = kd
130
+ # elif bsdf == 'ks':
131
+ # shaded_col = ks
132
+ # else:
133
+ # assert False, "Invalid BSDF '%s'" % bsdf
134
+
135
+ # # Return multiple buffers
136
+ # buffers = {
137
+ # 'shaded' : torch.cat((shaded_col, alpha), dim=-1),
138
+ # 'kd_grad' : torch.cat((kd_grad, alpha), dim=-1),
139
+ # 'occlusion' : torch.cat((ks[..., :1], alpha), dim=-1)
140
+ # }
141
+ # return buffers
142
+
143
+ # ==============================================================================================
144
+ # Render a depth slice of the mesh (scene), some limitations:
145
+ # - Single mesh
146
+ # - Single light
147
+ # - Single material
148
+ # ==============================================================================================
149
+ def render_layer(
150
+ self,
151
+ rast,
152
+ rast_deriv,
153
+ mesh,
154
+ view_pos,
155
+ resolution,
156
+ spp,
157
+ msaa
158
+ ):
159
+
160
+ # Scale down to shading resolution when MSAA is enabled, otherwise shade at full resolution
161
+ rast_out_s = rast
162
+ rast_out_deriv_s = rast_deriv
163
+
164
+ ################################################################################
165
+ # Interpolate attributes
166
+ ################################################################################
167
+
168
+ # Interpolate world space position
169
+ gb_pos, _ = interpolate(mesh.v_pos[None, ...], rast_out_s, mesh.t_pos_idx.int())
170
+
171
+ # Compute geometric normals. We need these for the bent-normals trick (used for bump mapping)
172
+ v0 = mesh.v_pos[mesh.t_pos_idx[:, 0], :]
173
+ v1 = mesh.v_pos[mesh.t_pos_idx[:, 1], :]
174
+ v2 = mesh.v_pos[mesh.t_pos_idx[:, 2], :]
175
+ face_normals = util.safe_normalize(torch.cross(v1 - v0, v2 - v0, dim=-1))
176
+ face_normal_indices = (torch.arange(0, face_normals.shape[0], dtype=torch.int64, device='cuda')[:, None]).repeat(1, 3)
177
+ gb_geometric_normal, _ = interpolate(face_normals[None, ...], rast_out_s, face_normal_indices.int())
178
+
179
+ # Compute tangent space
180
+ assert mesh.v_nrm is not None and mesh.v_tng is not None
181
+ gb_normal, _ = interpolate(mesh.v_nrm[None, ...], rast_out_s, mesh.t_nrm_idx.int())
182
+ gb_tangent, _ = interpolate(mesh.v_tng[None, ...], rast_out_s, mesh.t_tng_idx.int()) # Interpolate tangents
183
+
184
+ # Texture coordinate
185
+ # assert mesh.v_tex is not None
186
+ # gb_texc, gb_texc_deriv = interpolate(mesh.v_tex[None, ...], rast_out_s, mesh.t_tex_idx.int(), rast_db=rast_out_deriv_s)
187
+ perturbed_nrm = None
188
+ gb_normal = ru.prepare_shading_normal(gb_pos, view_pos[:,None,None,:], perturbed_nrm, gb_normal, gb_tangent, gb_geometric_normal, two_sided_shading=True, opengl=True)
189
+
190
+ return gb_pos, gb_normal
191
+
192
+ def render_mesh(
193
+ self,
194
+ mesh_v_pos_bxnx3,
195
+ mesh_t_pos_idx_fx3,
196
+ mesh,
197
+ camera_mv_bx4x4,
198
+ camera_pos,
199
+ mesh_v_feat_bxnxd,
200
+ resolution=256,
201
+ spp=1,
202
+ device='cuda',
203
+ hierarchical_mask=False
204
+ ):
205
+ assert not hierarchical_mask
206
+
207
+ mtx_in = torch.tensor(camera_mv_bx4x4, dtype=torch.float32, device=device) if not torch.is_tensor(camera_mv_bx4x4) else camera_mv_bx4x4
208
+ v_pos = xfm_points(mesh_v_pos_bxnx3, mtx_in) # Rotate it to camera coordinates
209
+ v_pos_clip = self.camera.project(v_pos) # Projection in the camera
210
+
211
+ # view_pos = torch.linalg.inv(mtx_in)[:, :3, 3]
212
+ view_pos = camera_pos
213
+ v_nrm = mesh.v_nrm #compute_vertex_normal(mesh_v_pos_bxnx3[0], mesh_t_pos_idx_fx3.long()) # vertex normals in world coordinates
214
+
215
+ # Render the image,
216
+ # Here we only return the feature (3D location) at each pixel, which will be used as the input for neural render
217
+ num_layers = 1
218
+ mask_pyramid = None
219
+ assert mesh_t_pos_idx_fx3.shape[0] > 0 # Make sure we have shapes
220
+
221
+ mesh_v_feat_bxnxd = torch.cat([mesh_v_feat_bxnxd.repeat(v_pos.shape[0], 1, 1), v_pos], dim=-1) # Concatenate the pos [org_pos, clip-space pos for rasterization]
222
+
223
+ layers = []
224
+ with dr.DepthPeeler(self.ctx, v_pos_clip, mesh.t_pos_idx.int(), [resolution * spp, resolution * spp]) as peeler:
225
+ for _ in range(num_layers):
226
+ rast, db = peeler.rasterize_next_layer()
227
+ gb_pos, gb_normal = self.render_layer(rast, db, mesh, view_pos, resolution, spp, msaa=False)
228
+
229
+ with dr.DepthPeeler(self.ctx, v_pos_clip, mesh_t_pos_idx_fx3, [resolution * spp, resolution * spp]) as peeler:
230
+ for _ in range(num_layers):
231
+ rast, db = peeler.rasterize_next_layer()
232
+ gb_feat, _ = interpolate(mesh_v_feat_bxnxd, rast, mesh_t_pos_idx_fx3)
233
+
234
+ hard_mask = torch.clamp(rast[..., -1:], 0, 1)
235
+ antialias_mask = dr.antialias(
236
+ hard_mask.clone().contiguous(), rast, v_pos_clip,
237
+ mesh_t_pos_idx_fx3)
238
+
239
+ depth = gb_feat[..., -2:-1]
240
+ ori_mesh_feature = gb_feat[..., :-4]
241
+
242
+ normal, _ = interpolate(v_nrm[None, ...], rast, mesh_t_pos_idx_fx3)
243
+ normal = dr.antialias(normal.clone().contiguous(), rast, v_pos_clip, mesh_t_pos_idx_fx3)
244
+ # normal = F.normalize(normal, dim=-1)
245
+ # normal = torch.lerp(torch.zeros_like(normal), (normal + 1.0) / 2.0, hard_mask.float()) # black background
246
+ return ori_mesh_feature, antialias_mask, hard_mask, rast, v_pos_clip, mask_pyramid, depth, normal, gb_normal
247
+
248
+ def render_mesh_light(
249
+ self,
250
+ mesh_v_pos_bxnx3,
251
+ mesh_t_pos_idx_fx3,
252
+ mesh,
253
+ camera_mv_bx4x4,
254
+ mesh_v_feat_bxnxd,
255
+ resolution=256,
256
+ spp=1,
257
+ device='cuda',
258
+ hierarchical_mask=False
259
+ ):
260
+ assert not hierarchical_mask
261
+
262
+ mtx_in = torch.tensor(camera_mv_bx4x4, dtype=torch.float32, device=device) if not torch.is_tensor(camera_mv_bx4x4) else camera_mv_bx4x4
263
+ v_pos = xfm_points(mesh_v_pos_bxnx3, mtx_in) # Rotate it to camera coordinates
264
+ v_pos_clip = self.camera.project(v_pos) # Projection in the camera
265
+
266
+ v_nrm = compute_vertex_normal(mesh_v_pos_bxnx3[0], mesh_t_pos_idx_fx3.long()) # vertex normals in world coordinates
267
+
268
+ # Render the image,
269
+ # Here we only return the feature (3D location) at each pixel, which will be used as the input for neural render
270
+ num_layers = 1
271
+ mask_pyramid = None
272
+ assert mesh_t_pos_idx_fx3.shape[0] > 0 # Make sure we have shapes
273
+ mesh_v_feat_bxnxd = torch.cat([mesh_v_feat_bxnxd.repeat(v_pos.shape[0], 1, 1), v_pos], dim=-1) # Concatenate the pos
274
+
275
+ with dr.DepthPeeler(self.ctx, v_pos_clip, mesh_t_pos_idx_fx3, [resolution * spp, resolution * spp]) as peeler:
276
+ for _ in range(num_layers):
277
+ rast, db = peeler.rasterize_next_layer()
278
+ gb_feat, _ = interpolate(mesh_v_feat_bxnxd, rast, mesh_t_pos_idx_fx3)
279
+
280
+ hard_mask = torch.clamp(rast[..., -1:], 0, 1)
281
+ antialias_mask = dr.antialias(
282
+ hard_mask.clone().contiguous(), rast, v_pos_clip,
283
+ mesh_t_pos_idx_fx3)
284
+
285
+ depth = gb_feat[..., -2:-1]
286
+ ori_mesh_feature = gb_feat[..., :-4]
287
+
288
+ normal, _ = interpolate(v_nrm[None, ...], rast, mesh_t_pos_idx_fx3)
289
+ normal = dr.antialias(normal.clone().contiguous(), rast, v_pos_clip, mesh_t_pos_idx_fx3)
290
+ normal = F.normalize(normal, dim=-1)
291
+ normal = torch.lerp(torch.zeros_like(normal), (normal + 1.0) / 2.0, hard_mask.float()) # black background
292
+
293
+ return ori_mesh_feature, antialias_mask, hard_mask, rast, v_pos_clip, mask_pyramid, depth, normal
models/lrm/models/geometry/render/renderutils/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
4
+ # property and proprietary rights in and to this material, related
5
+ # documentation and any modifications thereto. Any use, reproduction,
6
+ # disclosure or distribution of this material and related documentation
7
+ # without an express license agreement from NVIDIA CORPORATION or
8
+ # its affiliates is strictly prohibited.
9
+
10
+ from .ops import xfm_points, xfm_vectors, image_loss, diffuse_cubemap, specular_cubemap, prepare_shading_normal, lambert, frostbite_diffuse, pbr_specular, pbr_bsdf, _fresnel_shlick, _ndf_ggx, _lambda_ggx, _masking_smith
11
+ __all__ = ["xfm_vectors", "xfm_points", "image_loss", "diffuse_cubemap","specular_cubemap", "prepare_shading_normal", "lambert", "frostbite_diffuse", "pbr_specular", "pbr_bsdf", "_fresnel_shlick", "_ndf_ggx", "_lambda_ggx", "_masking_smith", ]
models/lrm/models/geometry/render/renderutils/bsdf.py ADDED
@@ -0,0 +1,151 @@
1
+ # Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ #
3
+ # NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
4
+ # property and proprietary rights in and to this material, related
5
+ # documentation and any modifications thereto. Any use, reproduction,
6
+ # disclosure or distribution of this material and related documentation
7
+ # without an express license agreement from NVIDIA CORPORATION or
8
+ # its affiliates is strictly prohibited.
9
+
10
+ import math
11
+ import torch
12
+
13
+ NORMAL_THRESHOLD = 0.1
14
+
15
+ ################################################################################
16
+ # Vector utility functions
17
+ ################################################################################
18
+
19
+ def _dot(x, y):
20
+ return torch.sum(x*y, -1, keepdim=True)
21
+
22
+ def _reflect(x, n):
23
+ return 2*_dot(x, n)*n - x
24
+
25
+ def _safe_normalize(x):
26
+ return torch.nn.functional.normalize(x, dim = -1)
27
+
28
+ def _bend_normal(view_vec, smooth_nrm, geom_nrm, two_sided_shading):
29
+ # Swap normal direction for backfacing surfaces
30
+ if two_sided_shading:
31
+ smooth_nrm = torch.where(_dot(geom_nrm, view_vec) > 0, smooth_nrm, -smooth_nrm)
32
+ geom_nrm = torch.where(_dot(geom_nrm, view_vec) > 0, geom_nrm, -geom_nrm)
33
+
34
+ t = torch.clamp(_dot(view_vec, smooth_nrm) / NORMAL_THRESHOLD, min=0, max=1)
35
+ return torch.lerp(geom_nrm, smooth_nrm, t)
36
+
37
+
38
+ def _perturb_normal(perturbed_nrm, smooth_nrm, smooth_tng, opengl):
39
+ smooth_bitang = _safe_normalize(torch.cross(smooth_tng, smooth_nrm))
40
+ if opengl:
41
+ shading_nrm = smooth_tng * perturbed_nrm[..., 0:1] - smooth_bitang * perturbed_nrm[..., 1:2] + smooth_nrm * torch.clamp(perturbed_nrm[..., 2:3], min=0.0)
42
+ else:
43
+ shading_nrm = smooth_tng * perturbed_nrm[..., 0:1] + smooth_bitang * perturbed_nrm[..., 1:2] + smooth_nrm * torch.clamp(perturbed_nrm[..., 2:3], min=0.0)
44
+ return _safe_normalize(shading_nrm)
45
+
46
+ def bsdf_prepare_shading_normal(pos, view_pos, perturbed_nrm, smooth_nrm, smooth_tng, geom_nrm, two_sided_shading, opengl):
47
+ smooth_nrm = _safe_normalize(smooth_nrm)
48
+ smooth_tng = _safe_normalize(smooth_tng)
49
+ view_vec = _safe_normalize(view_pos - pos)
50
+ shading_nrm = _perturb_normal(perturbed_nrm, smooth_nrm, smooth_tng, opengl)
51
+ return _bend_normal(view_vec, shading_nrm, geom_nrm, two_sided_shading)
52
+
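A hypothetical single-point example (not part of the patch) of how bsdf_prepare_shading_normal behaves with an identity perturbation, i.e. a tangent-space normal of (0, 0, 1) and aligned smooth/geometric normals:

import torch

pos        = torch.zeros(1, 3)
view_pos   = torch.tensor([[0.0, 0.0, 2.0]])
perturbed  = torch.tensor([[0.0, 0.0, 1.0]])   # identity perturbation (no bump)
smooth_nrm = torch.tensor([[0.0, 0.0, 1.0]])
smooth_tng = torch.tensor([[1.0, 0.0, 0.0]])
geom_nrm   = torch.tensor([[0.0, 0.0, 1.0]])

n = bsdf_prepare_shading_normal(pos, view_pos, perturbed, smooth_nrm,
                                smooth_tng, geom_nrm,
                                two_sided_shading=True, opengl=True)
# The view direction is well above NORMAL_THRESHOLD, so the smooth normal wins the lerp.
assert torch.allclose(n, torch.tensor([[0.0, 0.0, 1.0]]))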
53
+ ################################################################################
54
+ # Simple lambertian diffuse BSDF
55
+ ################################################################################
56
+
57
+ def bsdf_lambert(nrm, wi):
58
+ return torch.clamp(_dot(nrm, wi), min=0.0) / math.pi
59
+
60
+ ################################################################################
61
+ # Frostbite diffuse
62
+ ################################################################################
63
+
64
+ def bsdf_frostbite(nrm, wi, wo, linearRoughness):
65
+ wiDotN = _dot(wi, nrm)
66
+ woDotN = _dot(wo, nrm)
67
+
68
+ h = _safe_normalize(wo + wi)
69
+ wiDotH = _dot(wi, h)
70
+
71
+ energyBias = 0.5 * linearRoughness
72
+ energyFactor = 1.0 - (0.51 / 1.51) * linearRoughness
73
+ f90 = energyBias + 2.0 * wiDotH * wiDotH * linearRoughness
74
+ f0 = 1.0
75
+
76
+ wiScatter = bsdf_fresnel_shlick(f0, f90, wiDotN)
77
+ woScatter = bsdf_fresnel_shlick(f0, f90, woDotN)
78
+ res = wiScatter * woScatter * energyFactor
79
+ return torch.where((wiDotN > 0.0) & (woDotN > 0.0), res, torch.zeros_like(res))
80
+
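For orientation (an illustrative sketch only, assuming the functions above are in scope): bsdf_lambert peaks at 1/pi for aligned normal and light, while the Frostbite term approaches 1 at zero roughness because both Fresnel scatter factors go to 1.

import math
import torch

nrm = torch.tensor([[0.0, 0.0, 1.0]])
wi  = torch.tensor([[0.0, 0.0, 1.0]])
wo  = torch.tensor([[0.0, 0.0, 1.0]])

# Clamped cosine over pi for a head-on light.
assert torch.allclose(bsdf_lambert(nrm, wi), torch.tensor([[1.0 / math.pi]]))

# Frostbite scatter factor at zero linear roughness.
frostbite = bsdf_frostbite(nrm, wi, wo, torch.tensor([[0.0]]))
assert torch.allclose(frostbite, torch.ones(1, 1), atol=1e-4)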
81
+ ################################################################################
82
+ # Phong specular, loosely based on the Mitsuba implementation
83
+ ################################################################################
84
+
85
+ def bsdf_phong(nrm, wo, wi, N):
86
+ dp_r = torch.clamp(_dot(_reflect(wo, nrm), wi), min=0.0, max=1.0)
87
+ dp_l = torch.clamp(_dot(nrm, wi), min=0.0, max=1.0)
88
+ return (dp_r ** N) * dp_l * (N + 2) / (2 * math.pi)
89
+
90
+ ################################################################################
91
+ # PBR's implementation of GGX specular
92
+ ################################################################################
93
+
94
+ specular_epsilon = 1e-4
95
+
96
+ def bsdf_fresnel_shlick(f0, f90, cosTheta):
97
+ _cosTheta = torch.clamp(cosTheta, min=specular_epsilon, max=1.0 - specular_epsilon)
98
+ return f0 + (f90 - f0) * (1.0 - _cosTheta) ** 5.0
99
+
100
+ def bsdf_ndf_ggx(alphaSqr, cosTheta):
101
+ _cosTheta = torch.clamp(cosTheta, min=specular_epsilon, max=1.0 - specular_epsilon)
102
+ d = (_cosTheta * alphaSqr - _cosTheta) * _cosTheta + 1
103
+ return alphaSqr / (d * d * math.pi)
104
+
105
+ def bsdf_lambda_ggx(alphaSqr, cosTheta):
106
+ _cosTheta = torch.clamp(cosTheta, min=specular_epsilon, max=1.0 - specular_epsilon)
107
+ cosThetaSqr = _cosTheta * _cosTheta
108
+ tanThetaSqr = (1.0 - cosThetaSqr) / cosThetaSqr
109
+ res = 0.5 * (torch.sqrt(1 + alphaSqr * tanThetaSqr) - 1.0)
110
+ return res
111
+
112
+ def bsdf_masking_smith_ggx_correlated(alphaSqr, cosThetaI, cosThetaO):
113
+ lambdaI = bsdf_lambda_ggx(alphaSqr, cosThetaI)
114
+ lambdaO = bsdf_lambda_ggx(alphaSqr, cosThetaO)
115
+ return 1 / (1 + lambdaI + lambdaO)
116
+
117
+ def bsdf_pbr_specular(col, nrm, wo, wi, alpha, min_roughness=0.08):
118
+ _alpha = torch.clamp(alpha, min=min_roughness*min_roughness, max=1.0)
119
+ alphaSqr = _alpha * _alpha
120
+
121
+ h = _safe_normalize(wo + wi)
122
+ woDotN = _dot(wo, nrm)
123
+ wiDotN = _dot(wi, nrm)
124
+ woDotH = _dot(wo, h)
125
+ nDotH = _dot(nrm, h)
126
+
127
+ D = bsdf_ndf_ggx(alphaSqr, nDotH)
128
+ G = bsdf_masking_smith_ggx_correlated(alphaSqr, woDotN, wiDotN)
129
+ F = bsdf_fresnel_shlick(col, 1, woDotH)
130
+
131
+ w = F * D * G * 0.25 / torch.clamp(woDotN, min=specular_epsilon)
132
+
133
+ frontfacing = (woDotN > specular_epsilon) & (wiDotN > specular_epsilon)
134
+ return torch.where(frontfacing, w, torch.zeros_like(w))
135
+
136
+ def bsdf_pbr(kd, arm, pos, nrm, view_pos, light_pos, min_roughness, BSDF):
137
+ wo = _safe_normalize(view_pos - pos)
138
+ wi = _safe_normalize(light_pos - pos)
139
+
140
+ spec_str = arm[..., 0:1] # x component
141
+ roughness = arm[..., 1:2] # y component
142
+ metallic = arm[..., 2:3] # z component
143
+ ks = (0.04 * (1.0 - metallic) + kd * metallic) * (1 - spec_str)
144
+ kd = kd * (1.0 - metallic)
145
+
146
+ if BSDF == 0:
147
+ diffuse = kd * bsdf_lambert(nrm, wi)
148
+ else:
149
+ diffuse = kd * bsdf_frostbite(nrm, wi, wo, roughness)
150
+ specular = bsdf_pbr_specular(ks, nrm, wo, wi, roughness*roughness, min_roughness=min_roughness)
151
+ return diffuse + specular
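End to end, bsdf_pbr combines the diffuse and specular lobes defined above. A hypothetical single-point call (shapes and values chosen only for illustration; arm is consumed as (spec_str, roughness, metallic) per the comments above):

import torch

kd        = torch.tensor([[0.8, 0.2, 0.2]])
arm       = torch.tensor([[1.0, 0.5, 0.0]])
pos       = torch.zeros(1, 3)
nrm       = torch.tensor([[0.0, 0.0, 1.0]])
view_pos  = torch.tensor([[0.0, 0.0, 3.0]])
light_pos = torch.tensor([[0.0, 0.0, 3.0]])

rgb = bsdf_pbr(kd, arm, pos, nrm, view_pos, light_pos, min_roughness=0.08, BSDF=0)
print(rgb.shape)   # torch.Size([1, 3])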
models/lrm/models/geometry/render/renderutils/c_src/bsdf.cu ADDED
@@ -0,0 +1,710 @@
1
+ /*
2
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ *
4
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
5
+ * property and proprietary rights in and to this material, related
6
+ * documentation and any modifications thereto. Any use, reproduction,
7
+ * disclosure or distribution of this material and related documentation
8
+ * without an express license agreement from NVIDIA CORPORATION or
9
+ * its affiliates is strictly prohibited.
10
+ */
11
+
12
+ #include "common.h"
13
+ #include "bsdf.h"
14
+
15
+ #define SPECULAR_EPSILON 1e-4f
16
+
17
+ //------------------------------------------------------------------------
18
+ // Lambert functions
19
+
20
+ __device__ inline float fwdLambert(const vec3f nrm, const vec3f wi)
21
+ {
22
+ return max(dot(nrm, wi) / M_PI, 0.0f);
23
+ }
24
+
25
+ __device__ inline void bwdLambert(const vec3f nrm, const vec3f wi, vec3f& d_nrm, vec3f& d_wi, const float d_out)
26
+ {
27
+ if (dot(nrm, wi) > 0.0f)
28
+ bwdDot(nrm, wi, d_nrm, d_wi, d_out / M_PI);
29
+ }
30
+
31
+ //------------------------------------------------------------------------
32
+ // Fresnel Schlick
33
+
34
+ __device__ inline float fwdFresnelSchlick(const float f0, const float f90, const float cosTheta)
35
+ {
36
+ float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
37
+ float scale = powf(1.0f - _cosTheta, 5.0f);
38
+ return f0 * (1.0f - scale) + f90 * scale;
39
+ }
40
+
41
+ __device__ inline void bwdFresnelSchlick(const float f0, const float f90, const float cosTheta, float& d_f0, float& d_f90, float& d_cosTheta, const float d_out)
42
+ {
43
+ float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
44
+ float scale = pow(max(1.0f - _cosTheta, 0.0f), 5.0f);
45
+ d_f0 += d_out * (1.0 - scale);
46
+ d_f90 += d_out * scale;
47
+ if (cosTheta >= SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
48
+ {
49
+ d_cosTheta += d_out * (f90 - f0) * -5.0f * powf(1.0f - cosTheta, 4.0f);
50
+ }
51
+ }
52
+
53
+ __device__ inline vec3f fwdFresnelSchlick(const vec3f f0, const vec3f f90, const float cosTheta)
54
+ {
55
+ float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
56
+ float scale = powf(1.0f - _cosTheta, 5.0f);
57
+ return f0 * (1.0f - scale) + f90 * scale;
58
+ }
59
+
60
+ __device__ inline void bwdFresnelSchlick(const vec3f f0, const vec3f f90, const float cosTheta, vec3f& d_f0, vec3f& d_f90, float& d_cosTheta, const vec3f d_out)
61
+ {
62
+ float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
63
+ float scale = pow(max(1.0f - _cosTheta, 0.0f), 5.0f);
64
+ d_f0 += d_out * (1.0 - scale);
65
+ d_f90 += d_out * scale;
66
+ if (cosTheta >= SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
67
+ {
68
+ d_cosTheta += sum(d_out * (f90 - f0) * -5.0f * powf(1.0f - cosTheta, 4.0f));
69
+ }
70
+ }
71
+
72
+ //------------------------------------------------------------------------
73
+ // Frostbite diffuse
74
+
75
+ __device__ inline float fwdFrostbiteDiffuse(const vec3f nrm, const vec3f wi, const vec3f wo, float linearRoughness)
76
+ {
77
+ float wiDotN = dot(wi, nrm);
78
+ float woDotN = dot(wo, nrm);
79
+ if (wiDotN > 0.0f && woDotN > 0.0f)
80
+ {
81
+ vec3f h = safeNormalize(wo + wi);
82
+ float wiDotH = dot(wi, h);
83
+
84
+ float energyBias = 0.5f * linearRoughness;
85
+ float energyFactor = 1.0f - (0.51f / 1.51f) * linearRoughness;
86
+ float f90 = energyBias + 2.f * wiDotH * wiDotH * linearRoughness;
87
+ float f0 = 1.f;
88
+
89
+ float wiScatter = fwdFresnelSchlick(f0, f90, wiDotN);
90
+ float woScatter = fwdFresnelSchlick(f0, f90, woDotN);
91
+
92
+ return wiScatter * woScatter * energyFactor;
93
+ }
94
+ else return 0.0f;
95
+ }
96
+
97
+ __device__ inline void bwdFrostbiteDiffuse(const vec3f nrm, const vec3f wi, const vec3f wo, float linearRoughness, vec3f& d_nrm, vec3f& d_wi, vec3f& d_wo, float &d_linearRoughness, const float d_out)
98
+ {
99
+ float wiDotN = dot(wi, nrm);
100
+ float woDotN = dot(wo, nrm);
101
+
102
+ if (wiDotN > 0.0f && woDotN > 0.0f)
103
+ {
104
+ vec3f h = safeNormalize(wo + wi);
105
+ float wiDotH = dot(wi, h);
106
+
107
+ float energyBias = 0.5f * linearRoughness;
108
+ float energyFactor = 1.0f - (0.51f / 1.51f) * linearRoughness;
109
+ float f90 = energyBias + 2.f * wiDotH * wiDotH * linearRoughness;
110
+ float f0 = 1.f;
111
+
112
+ float wiScatter = fwdFresnelSchlick(f0, f90, wiDotN);
113
+ float woScatter = fwdFresnelSchlick(f0, f90, woDotN);
114
+
115
+ // -------------- BWD --------------
116
+ // Backprop: return wiScatter * woScatter * energyFactor;
117
+ float d_wiScatter = d_out * woScatter * energyFactor;
118
+ float d_woScatter = d_out * wiScatter * energyFactor;
119
+ float d_energyFactor = d_out * wiScatter * woScatter;
120
+
121
+ // Backprop: float woScatter = fwdFresnelSchlick(f0, f90, woDotN);
122
+ float d_woDotN = 0.0f, d_f0 = 0.0, d_f90 = 0.0f;
123
+ bwdFresnelSchlick(f0, f90, woDotN, d_f0, d_f90, d_woDotN, d_woScatter);
124
+
125
+ // Backprop: float wiScatter = fwdFresnelSchlick(fd0, fd90, wiDotN);
126
+ float d_wiDotN = 0.0f;
127
+ bwdFresnelSchlick(f0, f90, wiDotN, d_f0, d_f90, d_wiDotN, d_wiScatter);
128
+
129
+ // Backprop: float f90 = energyBias + 2.f * wiDotH * wiDotH * linearRoughness;
130
+ float d_energyBias = d_f90;
131
+ float d_wiDotH = d_f90 * 4 * wiDotH * linearRoughness;
132
+ d_linearRoughness += d_f90 * 2 * wiDotH * wiDotH;
133
+
134
+ // Backprop: float energyFactor = 1.0f - (0.51f / 1.51f) * linearRoughness;
135
+ d_linearRoughness -= (0.51f / 1.51f) * d_energyFactor;
136
+
137
+ // Backprop: float energyBias = 0.5f * linearRoughness;
138
+ d_linearRoughness += 0.5 * d_energyBias;
139
+
140
+ // Backprop: float wiDotH = dot(wi, h);
141
+ vec3f d_h(0);
142
+ bwdDot(wi, h, d_wi, d_h, d_wiDotH);
143
+
144
+ // Backprop: vec3f h = safeNormalize(wo + wi);
145
+ vec3f d_wo_wi(0);
146
+ bwdSafeNormalize(wo + wi, d_wo_wi, d_h);
147
+ d_wi += d_wo_wi; d_wo += d_wo_wi;
148
+
149
+ bwdDot(wo, nrm, d_wo, d_nrm, d_woDotN);
150
+ bwdDot(wi, nrm, d_wi, d_nrm, d_wiDotN);
151
+ }
152
+ }
153
+
154
+ //------------------------------------------------------------------------
155
+ // Ndf GGX
156
+
157
+ __device__ inline float fwdNdfGGX(const float alphaSqr, const float cosTheta)
158
+ {
159
+ float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
160
+ float d = (_cosTheta * alphaSqr - _cosTheta) * _cosTheta + 1.0f;
161
+ return alphaSqr / (d * d * M_PI);
162
+ }
163
+
164
+ __device__ inline void bwdNdfGGX(const float alphaSqr, const float cosTheta, float& d_alphaSqr, float& d_cosTheta, const float d_out)
165
+ {
166
+ // Torch only back propagates if clamp doesn't trigger
167
+ float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
168
+ float cosThetaSqr = _cosTheta * _cosTheta;
169
+ d_alphaSqr += d_out * (1.0f - (alphaSqr + 1.0f) * cosThetaSqr) / (M_PI * powf((alphaSqr - 1.0) * cosThetaSqr + 1.0f, 3.0f));
170
+ if (cosTheta > SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
171
+ {
172
+ d_cosTheta += d_out * -(4.0f * (alphaSqr - 1.0f) * alphaSqr * cosTheta) / (M_PI * powf((alphaSqr - 1.0) * cosThetaSqr + 1.0f, 3.0f));
173
+ }
174
+ }
175
+
176
+ //------------------------------------------------------------------------
177
+ // Lambda GGX
178
+
179
+ __device__ inline float fwdLambdaGGX(const float alphaSqr, const float cosTheta)
180
+ {
181
+ float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
182
+ float cosThetaSqr = _cosTheta * _cosTheta;
183
+ float tanThetaSqr = (1.0 - cosThetaSqr) / cosThetaSqr;
184
+ float res = 0.5f * (sqrtf(1.0f + alphaSqr * tanThetaSqr) - 1.0f);
185
+ return res;
186
+ }
187
+
188
+ __device__ inline void bwdLambdaGGX(const float alphaSqr, const float cosTheta, float& d_alphaSqr, float& d_cosTheta, const float d_out)
189
+ {
190
+ float _cosTheta = clamp(cosTheta, SPECULAR_EPSILON, 1.0f - SPECULAR_EPSILON);
191
+ float cosThetaSqr = _cosTheta * _cosTheta;
192
+ float tanThetaSqr = (1.0 - cosThetaSqr) / cosThetaSqr;
193
+ float res = 0.5f * (sqrtf(1.0f + alphaSqr * tanThetaSqr) - 1.0f);
194
+
195
+ d_alphaSqr += d_out * (0.25 * tanThetaSqr) / sqrtf(alphaSqr * tanThetaSqr + 1.0f);
196
+ if (cosTheta > SPECULAR_EPSILON && cosTheta < 1.0f - SPECULAR_EPSILON)
197
+ d_cosTheta += d_out * -(0.5 * alphaSqr) / (powf(_cosTheta, 3.0f) * sqrtf(alphaSqr / cosThetaSqr - alphaSqr + 1.0f));
198
+ }
199
+
200
+ //------------------------------------------------------------------------
201
+ // Masking GGX
202
+
203
+ __device__ inline float fwdMaskingSmithGGXCorrelated(const float alphaSqr, const float cosThetaI, const float cosThetaO)
204
+ {
205
+ float lambdaI = fwdLambdaGGX(alphaSqr, cosThetaI);
206
+ float lambdaO = fwdLambdaGGX(alphaSqr, cosThetaO);
207
+ return 1.0f / (1.0f + lambdaI + lambdaO);
208
+ }
209
+
210
+ __device__ inline void bwdMaskingSmithGGXCorrelated(const float alphaSqr, const float cosThetaI, const float cosThetaO, float& d_alphaSqr, float& d_cosThetaI, float& d_cosThetaO, const float d_out)
211
+ {
212
+ // FWD eval
213
+ float lambdaI = fwdLambdaGGX(alphaSqr, cosThetaI);
214
+ float lambdaO = fwdLambdaGGX(alphaSqr, cosThetaO);
215
+
216
+ // BWD eval
217
+ float d_lambdaIO = -d_out / powf(1.0f + lambdaI + lambdaO, 2.0f);
218
+ bwdLambdaGGX(alphaSqr, cosThetaI, d_alphaSqr, d_cosThetaI, d_lambdaIO);
219
+ bwdLambdaGGX(alphaSqr, cosThetaO, d_alphaSqr, d_cosThetaO, d_lambdaIO);
220
+ }
221
+
222
+ //------------------------------------------------------------------------
223
+ // GGX specular
224
+
225
+ __device__ vec3f fwdPbrSpecular(const vec3f col, const vec3f nrm, const vec3f wo, const vec3f wi, const float alpha, const float min_roughness)
226
+ {
227
+ float _alpha = clamp(alpha, min_roughness * min_roughness, 1.0f);
228
+ float alphaSqr = _alpha * _alpha;
229
+
230
+ vec3f h = safeNormalize(wo + wi);
231
+ float woDotN = dot(wo, nrm);
232
+ float wiDotN = dot(wi, nrm);
233
+ float woDotH = dot(wo, h);
234
+ float nDotH = dot(nrm, h);
235
+
236
+ float D = fwdNdfGGX(alphaSqr, nDotH);
237
+ float G = fwdMaskingSmithGGXCorrelated(alphaSqr, woDotN, wiDotN);
238
+ vec3f F = fwdFresnelSchlick(col, 1.0f, woDotH);
239
+ vec3f w = F * D * G * 0.25 / woDotN;
240
+
241
+ bool frontfacing = (woDotN > SPECULAR_EPSILON) & (wiDotN > SPECULAR_EPSILON);
242
+ return frontfacing ? w : 0.0f;
243
+ }
244
+
245
+ __device__ void bwdPbrSpecular(
246
+ const vec3f col, const vec3f nrm, const vec3f wo, const vec3f wi, const float alpha, const float min_roughness,
247
+ vec3f& d_col, vec3f& d_nrm, vec3f& d_wo, vec3f& d_wi, float& d_alpha, const vec3f d_out)
248
+ {
249
+ ///////////////////////////////////////////////////////////////////////
250
+ // FWD eval
251
+
252
+ float _alpha = clamp(alpha, min_roughness * min_roughness, 1.0f);
253
+ float alphaSqr = _alpha * _alpha;
254
+
255
+ vec3f h = safeNormalize(wo + wi);
256
+ float woDotN = dot(wo, nrm);
257
+ float wiDotN = dot(wi, nrm);
258
+ float woDotH = dot(wo, h);
259
+ float nDotH = dot(nrm, h);
260
+
261
+ float D = fwdNdfGGX(alphaSqr, nDotH);
262
+ float G = fwdMaskingSmithGGXCorrelated(alphaSqr, woDotN, wiDotN);
263
+ vec3f F = fwdFresnelSchlick(col, 1.0f, woDotH);
264
+ vec3f w = F * D * G * 0.25 / woDotN;
265
+ bool frontfacing = (woDotN > SPECULAR_EPSILON) & (wiDotN > SPECULAR_EPSILON);
266
+
267
+ if (frontfacing)
268
+ {
269
+ ///////////////////////////////////////////////////////////////////////
270
+ // BWD eval
271
+
272
+ vec3f d_F = d_out * D * G * 0.25f / woDotN;
273
+ float d_D = sum(d_out * F * G * 0.25f / woDotN);
274
+ float d_G = sum(d_out * F * D * 0.25f / woDotN);
275
+
276
+ float d_woDotN = -sum(d_out * F * D * G * 0.25f / (woDotN * woDotN));
277
+
278
+ vec3f d_f90(0);
279
+ float d_woDotH(0), d_wiDotN(0), d_nDotH(0), d_alphaSqr(0);
280
+ bwdFresnelSchlick(col, 1.0f, woDotH, d_col, d_f90, d_woDotH, d_F);
281
+ bwdMaskingSmithGGXCorrelated(alphaSqr, woDotN, wiDotN, d_alphaSqr, d_woDotN, d_wiDotN, d_G);
282
+ bwdNdfGGX(alphaSqr, nDotH, d_alphaSqr, d_nDotH, d_D);
283
+
284
+ vec3f d_h(0);
285
+ bwdDot(nrm, h, d_nrm, d_h, d_nDotH);
286
+ bwdDot(wo, h, d_wo, d_h, d_woDotH);
287
+ bwdDot(wi, nrm, d_wi, d_nrm, d_wiDotN);
288
+ bwdDot(wo, nrm, d_wo, d_nrm, d_woDotN);
289
+
290
+ vec3f d_h_unnorm(0);
291
+ bwdSafeNormalize(wo + wi, d_h_unnorm, d_h);
292
+ d_wo += d_h_unnorm;
293
+ d_wi += d_h_unnorm;
294
+
295
+ if (alpha > min_roughness * min_roughness)
296
+ d_alpha += d_alphaSqr * 2 * alpha;
297
+ }
298
+ }
299
+
300
+ //------------------------------------------------------------------------
301
+ // Full PBR BSDF
302
+
303
+ __device__ vec3f fwdPbrBSDF(const vec3f kd, const vec3f arm, const vec3f pos, const vec3f nrm, const vec3f view_pos, const vec3f light_pos, const float min_roughness, int BSDF)
304
+ {
305
+ vec3f wo = safeNormalize(view_pos - pos);
306
+ vec3f wi = safeNormalize(light_pos - pos);
307
+
308
+ float alpha = arm.y * arm.y;
309
+ vec3f spec_col = (0.04f * (1.0f - arm.z) + kd * arm.z) * (1.0 - arm.x);
310
+ vec3f diff_col = kd * (1.0f - arm.z);
311
+
312
+ float diff = 0.0f;
313
+ if (BSDF == 0)
314
+ diff = fwdLambert(nrm, wi);
315
+ else
316
+ diff = fwdFrostbiteDiffuse(nrm, wi, wo, arm.y);
317
+ vec3f diffuse = diff_col * diff;
318
+ vec3f specular = fwdPbrSpecular(spec_col, nrm, wo, wi, alpha, min_roughness);
319
+
320
+ return diffuse + specular;
321
+ }
322
+
323
+ __device__ void bwdPbrBSDF(
324
+ const vec3f kd, const vec3f arm, const vec3f pos, const vec3f nrm, const vec3f view_pos, const vec3f light_pos, const float min_roughness, int BSDF,
325
+ vec3f& d_kd, vec3f& d_arm, vec3f& d_pos, vec3f& d_nrm, vec3f& d_view_pos, vec3f& d_light_pos, const vec3f d_out)
326
+ {
327
+ ////////////////////////////////////////////////////////////////////////
328
+ // FWD
329
+ vec3f _wi = light_pos - pos;
330
+ vec3f _wo = view_pos - pos;
331
+ vec3f wi = safeNormalize(_wi);
332
+ vec3f wo = safeNormalize(_wo);
333
+
334
+ float alpha = arm.y * arm.y;
335
+ vec3f spec_col = (0.04f * (1.0f - arm.z) + kd * arm.z) * (1.0 - arm.x);
336
+ vec3f diff_col = kd * (1.0f - arm.z);
337
+ float diff = 0.0f;
338
+ if (BSDF == 0)
339
+ diff = fwdLambert(nrm, wi);
340
+ else
341
+ diff = fwdFrostbiteDiffuse(nrm, wi, wo, arm.y);
342
+
343
+ ////////////////////////////////////////////////////////////////////////
344
+ // BWD
345
+
346
+ float d_alpha(0);
347
+ vec3f d_spec_col(0), d_wi(0), d_wo(0);
348
+ bwdPbrSpecular(spec_col, nrm, wo, wi, alpha, min_roughness, d_spec_col, d_nrm, d_wo, d_wi, d_alpha, d_out);
349
+
350
+ float d_diff = sum(diff_col * d_out);
351
+ if (BSDF == 0)
352
+ bwdLambert(nrm, wi, d_nrm, d_wi, d_diff);
353
+ else
354
+ bwdFrostbiteDiffuse(nrm, wi, wo, arm.y, d_nrm, d_wi, d_wo, d_arm.y, d_diff);
355
+
356
+ // Backprop: diff_col = kd * (1.0f - arm.z)
357
+ vec3f d_diff_col = d_out * diff;
358
+ d_kd += d_diff_col * (1.0f - arm.z);
359
+ d_arm.z -= sum(d_diff_col * kd);
360
+
361
+ // Backprop: spec_col = (0.04f * (1.0f - arm.z) + kd * arm.z) * (1.0 - arm.x)
362
+ d_kd -= d_spec_col * (arm.x - 1.0f) * arm.z;
363
+ d_arm.x += sum(d_spec_col * (arm.z * (0.04f - kd) - 0.04f));
364
+ d_arm.z -= sum(d_spec_col * (kd - 0.04f) * (arm.x - 1.0f));
365
+
366
+ // Backprop: alpha = arm.y * arm.y
367
+ d_arm.y += d_alpha * 2 * arm.y;
368
+
369
+ // Backprop: vec3f wi = safeNormalize(light_pos - pos);
370
+ vec3f d__wi(0);
371
+ bwdSafeNormalize(_wi, d__wi, d_wi);
372
+ d_light_pos += d__wi;
373
+ d_pos -= d__wi;
374
+
375
+ // Backprop: vec3f wo = safeNormalize(view_pos - pos);
376
+ vec3f d__wo(0);
377
+ bwdSafeNormalize(_wo, d__wo, d_wo);
378
+ d_view_pos += d__wo;
379
+ d_pos -= d__wo;
380
+ }
381
+
382
+ //------------------------------------------------------------------------
383
+ // Kernels
384
+
385
+ __global__ void LambertFwdKernel(LambertKernelParams p)
386
+ {
387
+ // Calculate pixel position.
388
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
389
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
390
+ unsigned int pz = blockIdx.z;
391
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
392
+ return;
393
+
394
+ vec3f nrm = p.nrm.fetch3(px, py, pz);
395
+ vec3f wi = p.wi.fetch3(px, py, pz);
396
+
397
+ float res = fwdLambert(nrm, wi);
398
+
399
+ p.out.store(px, py, pz, res);
400
+ }
401
+
402
+ __global__ void LambertBwdKernel(LambertKernelParams p)
403
+ {
404
+ // Calculate pixel position.
405
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
406
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
407
+ unsigned int pz = blockIdx.z;
408
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
409
+ return;
410
+
411
+ vec3f nrm = p.nrm.fetch3(px, py, pz);
412
+ vec3f wi = p.wi.fetch3(px, py, pz);
413
+ float d_out = p.out.fetch1(px, py, pz);
414
+
415
+ vec3f d_nrm(0), d_wi(0);
416
+ bwdLambert(nrm, wi, d_nrm, d_wi, d_out);
417
+
418
+ p.nrm.store_grad(px, py, pz, d_nrm);
419
+ p.wi.store_grad(px, py, pz, d_wi);
420
+ }
421
+
422
+ __global__ void FrostbiteDiffuseFwdKernel(FrostbiteDiffuseKernelParams p)
423
+ {
424
+ // Calculate pixel position.
425
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
426
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
427
+ unsigned int pz = blockIdx.z;
428
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
429
+ return;
430
+
431
+ vec3f nrm = p.nrm.fetch3(px, py, pz);
432
+ vec3f wi = p.wi.fetch3(px, py, pz);
433
+ vec3f wo = p.wo.fetch3(px, py, pz);
434
+ float linearRoughness = p.linearRoughness.fetch1(px, py, pz);
435
+
436
+ float res = fwdFrostbiteDiffuse(nrm, wi, wo, linearRoughness);
437
+
438
+ p.out.store(px, py, pz, res);
439
+ }
440
+
441
+ __global__ void FrostbiteDiffuseBwdKernel(FrostbiteDiffuseKernelParams p)
442
+ {
443
+ // Calculate pixel position.
444
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
445
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
446
+ unsigned int pz = blockIdx.z;
447
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
448
+ return;
449
+
450
+ vec3f nrm = p.nrm.fetch3(px, py, pz);
451
+ vec3f wi = p.wi.fetch3(px, py, pz);
452
+ vec3f wo = p.wo.fetch3(px, py, pz);
453
+ float linearRoughness = p.linearRoughness.fetch1(px, py, pz);
454
+ float d_out = p.out.fetch1(px, py, pz);
455
+
456
+ float d_linearRoughness = 0.0f;
457
+ vec3f d_nrm(0), d_wi(0), d_wo(0);
458
+ bwdFrostbiteDiffuse(nrm, wi, wo, linearRoughness, d_nrm, d_wi, d_wo, d_linearRoughness, d_out);
459
+
460
+ p.nrm.store_grad(px, py, pz, d_nrm);
461
+ p.wi.store_grad(px, py, pz, d_wi);
462
+ p.wo.store_grad(px, py, pz, d_wo);
463
+ p.linearRoughness.store_grad(px, py, pz, d_linearRoughness);
464
+ }
465
+
466
+ __global__ void FresnelShlickFwdKernel(FresnelShlickKernelParams p)
467
+ {
468
+ // Calculate pixel position.
469
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
470
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
471
+ unsigned int pz = blockIdx.z;
472
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
473
+ return;
474
+
475
+ vec3f f0 = p.f0.fetch3(px, py, pz);
476
+ vec3f f90 = p.f90.fetch3(px, py, pz);
477
+ float cosTheta = p.cosTheta.fetch1(px, py, pz);
478
+
479
+ vec3f res = fwdFresnelSchlick(f0, f90, cosTheta);
480
+ p.out.store(px, py, pz, res);
481
+ }
482
+
483
+ __global__ void FresnelShlickBwdKernel(FresnelShlickKernelParams p)
484
+ {
485
+ // Calculate pixel position.
486
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
487
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
488
+ unsigned int pz = blockIdx.z;
489
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
490
+ return;
491
+
492
+ vec3f f0 = p.f0.fetch3(px, py, pz);
493
+ vec3f f90 = p.f90.fetch3(px, py, pz);
494
+ float cosTheta = p.cosTheta.fetch1(px, py, pz);
495
+ vec3f d_out = p.out.fetch3(px, py, pz);
496
+
497
+ vec3f d_f0(0), d_f90(0);
498
+ float d_cosTheta(0);
499
+ bwdFresnelSchlick(f0, f90, cosTheta, d_f0, d_f90, d_cosTheta, d_out);
500
+
501
+ p.f0.store_grad(px, py, pz, d_f0);
502
+ p.f90.store_grad(px, py, pz, d_f90);
503
+ p.cosTheta.store_grad(px, py, pz, d_cosTheta);
504
+ }
505
+
506
+ __global__ void ndfGGXFwdKernel(NdfGGXParams p)
507
+ {
508
+ // Calculate pixel position.
509
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
510
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
511
+ unsigned int pz = blockIdx.z;
512
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
513
+ return;
514
+
515
+ float alphaSqr = p.alphaSqr.fetch1(px, py, pz);
516
+ float cosTheta = p.cosTheta.fetch1(px, py, pz);
517
+ float res = fwdNdfGGX(alphaSqr, cosTheta);
518
+
519
+ p.out.store(px, py, pz, res);
520
+ }
521
+
522
+ __global__ void ndfGGXBwdKernel(NdfGGXParams p)
523
+ {
524
+ // Calculate pixel position.
525
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
526
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
527
+ unsigned int pz = blockIdx.z;
528
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
529
+ return;
530
+
531
+ float alphaSqr = p.alphaSqr.fetch1(px, py, pz);
532
+ float cosTheta = p.cosTheta.fetch1(px, py, pz);
533
+ float d_out = p.out.fetch1(px, py, pz);
534
+
535
+ float d_alphaSqr(0), d_cosTheta(0);
536
+ bwdNdfGGX(alphaSqr, cosTheta, d_alphaSqr, d_cosTheta, d_out);
537
+
538
+ p.alphaSqr.store_grad(px, py, pz, d_alphaSqr);
539
+ p.cosTheta.store_grad(px, py, pz, d_cosTheta);
540
+ }
541
+
542
+ __global__ void lambdaGGXFwdKernel(NdfGGXParams p)
543
+ {
544
+ // Calculate pixel position.
545
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
546
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
547
+ unsigned int pz = blockIdx.z;
548
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
549
+ return;
550
+
551
+ float alphaSqr = p.alphaSqr.fetch1(px, py, pz);
552
+ float cosTheta = p.cosTheta.fetch1(px, py, pz);
553
+ float res = fwdLambdaGGX(alphaSqr, cosTheta);
554
+
555
+ p.out.store(px, py, pz, res);
556
+ }
557
+
558
+ __global__ void lambdaGGXBwdKernel(NdfGGXParams p)
559
+ {
560
+ // Calculate pixel position.
561
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
562
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
563
+ unsigned int pz = blockIdx.z;
564
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
565
+ return;
566
+
567
+ float alphaSqr = p.alphaSqr.fetch1(px, py, pz);
568
+ float cosTheta = p.cosTheta.fetch1(px, py, pz);
569
+ float d_out = p.out.fetch1(px, py, pz);
570
+
571
+ float d_alphaSqr(0), d_cosTheta(0);
572
+ bwdLambdaGGX(alphaSqr, cosTheta, d_alphaSqr, d_cosTheta, d_out);
573
+
574
+ p.alphaSqr.store_grad(px, py, pz, d_alphaSqr);
575
+ p.cosTheta.store_grad(px, py, pz, d_cosTheta);
576
+ }
577
+
578
+ __global__ void maskingSmithFwdKernel(MaskingSmithParams p)
579
+ {
580
+ // Calculate pixel position.
581
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
582
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
583
+ unsigned int pz = blockIdx.z;
584
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
585
+ return;
586
+
587
+ float alphaSqr = p.alphaSqr.fetch1(px, py, pz);
588
+ float cosThetaI = p.cosThetaI.fetch1(px, py, pz);
589
+ float cosThetaO = p.cosThetaO.fetch1(px, py, pz);
590
+ float res = fwdMaskingSmithGGXCorrelated(alphaSqr, cosThetaI, cosThetaO);
591
+
592
+ p.out.store(px, py, pz, res);
593
+ }
594
+
595
+ __global__ void maskingSmithBwdKernel(MaskingSmithParams p)
596
+ {
597
+ // Calculate pixel position.
598
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
599
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
600
+ unsigned int pz = blockIdx.z;
601
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
602
+ return;
603
+
604
+ float alphaSqr = p.alphaSqr.fetch1(px, py, pz);
605
+ float cosThetaI = p.cosThetaI.fetch1(px, py, pz);
606
+ float cosThetaO = p.cosThetaO.fetch1(px, py, pz);
607
+ float d_out = p.out.fetch1(px, py, pz);
608
+
609
+ float d_alphaSqr(0), d_cosThetaI(0), d_cosThetaO(0);
610
+ bwdMaskingSmithGGXCorrelated(alphaSqr, cosThetaI, cosThetaO, d_alphaSqr, d_cosThetaI, d_cosThetaO, d_out);
611
+
612
+ p.alphaSqr.store_grad(px, py, pz, d_alphaSqr);
613
+ p.cosThetaI.store_grad(px, py, pz, d_cosThetaI);
614
+ p.cosThetaO.store_grad(px, py, pz, d_cosThetaO);
615
+ }
616
+
617
+ __global__ void pbrSpecularFwdKernel(PbrSpecular p)
618
+ {
619
+ // Calculate pixel position.
620
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
621
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
622
+ unsigned int pz = blockIdx.z;
623
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
624
+ return;
625
+
626
+ vec3f col = p.col.fetch3(px, py, pz);
627
+ vec3f nrm = p.nrm.fetch3(px, py, pz);
628
+ vec3f wo = p.wo.fetch3(px, py, pz);
629
+ vec3f wi = p.wi.fetch3(px, py, pz);
630
+ float alpha = p.alpha.fetch1(px, py, pz);
631
+
632
+ vec3f res = fwdPbrSpecular(col, nrm, wo, wi, alpha, p.min_roughness);
633
+
634
+ p.out.store(px, py, pz, res);
635
+ }
636
+
637
+ __global__ void pbrSpecularBwdKernel(PbrSpecular p)
638
+ {
639
+ // Calculate pixel position.
640
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
641
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
642
+ unsigned int pz = blockIdx.z;
643
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
644
+ return;
645
+
646
+ vec3f col = p.col.fetch3(px, py, pz);
647
+ vec3f nrm = p.nrm.fetch3(px, py, pz);
648
+ vec3f wo = p.wo.fetch3(px, py, pz);
649
+ vec3f wi = p.wi.fetch3(px, py, pz);
650
+ float alpha = p.alpha.fetch1(px, py, pz);
651
+ vec3f d_out = p.out.fetch3(px, py, pz);
652
+
653
+ float d_alpha(0);
654
+ vec3f d_col(0), d_nrm(0), d_wo(0), d_wi(0);
655
+ bwdPbrSpecular(col, nrm, wo, wi, alpha, p.min_roughness, d_col, d_nrm, d_wo, d_wi, d_alpha, d_out);
656
+
657
+ p.col.store_grad(px, py, pz, d_col);
658
+ p.nrm.store_grad(px, py, pz, d_nrm);
659
+ p.wo.store_grad(px, py, pz, d_wo);
660
+ p.wi.store_grad(px, py, pz, d_wi);
661
+ p.alpha.store_grad(px, py, pz, d_alpha);
662
+ }
663
+
664
+ __global__ void pbrBSDFFwdKernel(PbrBSDF p)
665
+ {
666
+ // Calculate pixel position.
667
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
668
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
669
+ unsigned int pz = blockIdx.z;
670
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
671
+ return;
672
+
673
+ vec3f kd = p.kd.fetch3(px, py, pz);
674
+ vec3f arm = p.arm.fetch3(px, py, pz);
675
+ vec3f pos = p.pos.fetch3(px, py, pz);
676
+ vec3f nrm = p.nrm.fetch3(px, py, pz);
677
+ vec3f view_pos = p.view_pos.fetch3(px, py, pz);
678
+ vec3f light_pos = p.light_pos.fetch3(px, py, pz);
679
+
680
+ vec3f res = fwdPbrBSDF(kd, arm, pos, nrm, view_pos, light_pos, p.min_roughness, p.BSDF);
681
+
682
+ p.out.store(px, py, pz, res);
683
+ }
684
+ __global__ void pbrBSDFBwdKernel(PbrBSDF p)
685
+ {
686
+ // Calculate pixel position.
687
+ unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
688
+ unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
689
+ unsigned int pz = blockIdx.z;
690
+ if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
691
+ return;
692
+
693
+ vec3f kd = p.kd.fetch3(px, py, pz);
694
+ vec3f arm = p.arm.fetch3(px, py, pz);
695
+ vec3f pos = p.pos.fetch3(px, py, pz);
696
+ vec3f nrm = p.nrm.fetch3(px, py, pz);
697
+ vec3f view_pos = p.view_pos.fetch3(px, py, pz);
698
+ vec3f light_pos = p.light_pos.fetch3(px, py, pz);
699
+ vec3f d_out = p.out.fetch3(px, py, pz);
700
+
701
+ vec3f d_kd(0), d_arm(0), d_pos(0), d_nrm(0), d_view_pos(0), d_light_pos(0);
702
+ bwdPbrBSDF(kd, arm, pos, nrm, view_pos, light_pos, p.min_roughness, p.BSDF, d_kd, d_arm, d_pos, d_nrm, d_view_pos, d_light_pos, d_out);
703
+
704
+ p.kd.store_grad(px, py, pz, d_kd);
705
+ p.arm.store_grad(px, py, pz, d_arm);
706
+ p.pos.store_grad(px, py, pz, d_pos);
707
+ p.nrm.store_grad(px, py, pz, d_nrm);
708
+ p.view_pos.store_grad(px, py, pz, d_view_pos);
709
+ p.light_pos.store_grad(px, py, pz, d_light_pos);
710
+ }
models/lrm/models/geometry/render/renderutils/c_src/bsdf.h ADDED
@@ -0,0 +1,84 @@
1
+ /*
2
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ *
4
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
5
+ * property and proprietary rights in and to this material, related
6
+ * documentation and any modifications thereto. Any use, reproduction,
7
+ * disclosure or distribution of this material and related documentation
8
+ * without an express license agreement from NVIDIA CORPORATION or
9
+ * its affiliates is strictly prohibited.
10
+ */
11
+
12
+ #pragma once
13
+
14
+ #include "common.h"
15
+
16
+ struct LambertKernelParams
17
+ {
18
+ Tensor nrm;
19
+ Tensor wi;
20
+ Tensor out;
21
+ dim3 gridSize;
22
+ };
23
+
24
+ struct FrostbiteDiffuseKernelParams
25
+ {
26
+ Tensor nrm;
27
+ Tensor wi;
28
+ Tensor wo;
29
+ Tensor linearRoughness;
30
+ Tensor out;
31
+ dim3 gridSize;
32
+ };
33
+
34
+ struct FresnelShlickKernelParams
35
+ {
36
+ Tensor f0;
37
+ Tensor f90;
38
+ Tensor cosTheta;
39
+ Tensor out;
40
+ dim3 gridSize;
41
+ };
42
+
43
+ struct NdfGGXParams
44
+ {
45
+ Tensor alphaSqr;
46
+ Tensor cosTheta;
47
+ Tensor out;
48
+ dim3 gridSize;
49
+ };
50
+
51
+ struct MaskingSmithParams
52
+ {
53
+ Tensor alphaSqr;
54
+ Tensor cosThetaI;
55
+ Tensor cosThetaO;
56
+ Tensor out;
57
+ dim3 gridSize;
58
+ };
59
+
60
+ struct PbrSpecular
61
+ {
62
+ Tensor col;
63
+ Tensor nrm;
64
+ Tensor wo;
65
+ Tensor wi;
66
+ Tensor alpha;
67
+ Tensor out;
68
+ dim3 gridSize;
69
+ float min_roughness;
70
+ };
71
+
72
+ struct PbrBSDF
73
+ {
74
+ Tensor kd;
75
+ Tensor arm;
76
+ Tensor pos;
77
+ Tensor nrm;
78
+ Tensor view_pos;
79
+ Tensor light_pos;
80
+ Tensor out;
81
+ dim3 gridSize;
82
+ float min_roughness;
83
+ int BSDF;
84
+ };
models/lrm/models/geometry/render/renderutils/c_src/common.cpp ADDED
@@ -0,0 +1,74 @@
1
+ /*
2
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ *
4
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
5
+ * property and proprietary rights in and to this material, related
6
+ * documentation and any modifications thereto. Any use, reproduction,
7
+ * disclosure or distribution of this material and related documentation
8
+ * without an express license agreement from NVIDIA CORPORATION or
9
+ * its affiliates is strictly prohibited.
10
+ */
11
+
12
+ #include <cuda_runtime.h>
13
+ #include <algorithm>
14
+
15
+ //------------------------------------------------------------------------
16
+ // Block and grid size calculators for kernel launches.
17
+
18
+ dim3 getLaunchBlockSize(int maxWidth, int maxHeight, dim3 dims)
19
+ {
20
+ int maxThreads = maxWidth * maxHeight;
21
+ if (maxThreads <= 1 || (dims.x * dims.y) <= 1)
22
+ return dim3(1, 1, 1); // Degenerate.
23
+
24
+ // Start from max size.
25
+ int bw = maxWidth;
26
+ int bh = maxHeight;
27
+
28
+ // Optimizations for weirdly sized buffers.
29
+ if (dims.x < bw)
30
+ {
31
+ // Decrease block width to smallest power of two that covers the buffer width.
32
+ while ((bw >> 1) >= dims.x)
33
+ bw >>= 1;
34
+
35
+ // Maximize height.
36
+ bh = maxThreads / bw;
37
+ if (bh > dims.y)
38
+ bh = dims.y;
39
+ }
40
+ else if (dims.y < bh)
41
+ {
42
+ // Halve height and double width until it fits completely inside the buffer vertically.
43
+ while (bh > dims.y)
44
+ {
45
+ bh >>= 1;
46
+ if (bw < dims.x)
47
+ bw <<= 1;
48
+ }
49
+ }
50
+
51
+ // Done.
52
+ return dim3(bw, bh, 1);
53
+ }
54
+
55
+ // returns the size of a block that can be reduced using horizontal SIMD operations (e.g. __shfl_xor_sync)
56
+ dim3 getWarpSize(dim3 blockSize)
57
+ {
58
+ return dim3(
59
+ std::min(blockSize.x, 32u),
60
+ std::min(std::max(32u / blockSize.x, 1u), std::min(32u, blockSize.y)),
61
+ std::min(std::max(32u / (blockSize.x * blockSize.y), 1u), std::min(32u, blockSize.z))
62
+ );
63
+ }
64
+
65
+ dim3 getLaunchGridSize(dim3 blockSize, dim3 dims)
66
+ {
67
+ dim3 gridSize;
68
+ gridSize.x = (dims.x - 1) / blockSize.x + 1;
69
+ gridSize.y = (dims.y - 1) / blockSize.y + 1;
70
+ gridSize.z = (dims.z - 1) / blockSize.z + 1;
71
+ return gridSize;
72
+ }
73
+
74
+ //------------------------------------------------------------------------
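getLaunchGridSize is a per-axis ceiling division; a minimal Python sketch of the same arithmetic (illustrative only, helper name launch_grid_size is ad hoc):

def launch_grid_size(block, dims):
    # Ceiling division per axis, mirroring getLaunchGridSize above.
    return tuple((d - 1) // b + 1 for b, d in zip(block, dims))

# e.g. a 512x512 cubemap face times 6 sides, shaded with 8x8x1 blocks
assert launch_grid_size((8, 8, 1), (512, 512, 6)) == (64, 64, 6)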
models/lrm/models/geometry/render/renderutils/c_src/common.h ADDED
@@ -0,0 +1,41 @@
1
+ /*
2
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ *
4
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
5
+ * property and proprietary rights in and to this material, related
6
+ * documentation and any modifications thereto. Any use, reproduction,
7
+ * disclosure or distribution of this material and related documentation
8
+ * without an express license agreement from NVIDIA CORPORATION or
9
+ * its affiliates is strictly prohibited.
10
+ */
11
+
12
+ #pragma once
13
+ #include <cuda.h>
14
+ #include <stdint.h>
15
+
16
+ #include "vec3f.h"
17
+ #include "vec4f.h"
18
+ #include "tensor.h"
19
+
20
+ dim3 getLaunchBlockSize(int maxWidth, int maxHeight, dim3 dims);
21
+ dim3 getLaunchGridSize(dim3 blockSize, dim3 dims);
22
+
23
+ #ifdef __CUDACC__
24
+
25
+ #ifdef _MSC_VER
26
+ #define M_PI 3.14159265358979323846f
27
+ #endif
28
+
29
+ __host__ __device__ static inline dim3 getWarpSize(dim3 blockSize)
30
+ {
31
+ return dim3(
32
+ min(blockSize.x, 32u),
33
+ min(max(32u / blockSize.x, 1u), min(32u, blockSize.y)),
34
+ min(max(32u / (blockSize.x * blockSize.y), 1u), min(32u, blockSize.z))
35
+ );
36
+ }
37
+
38
+ __device__ static inline float clamp(float val, float mn, float mx) { return min(max(val, mn), mx); }
39
+ #else
40
+ dim3 getWarpSize(dim3 blockSize);
41
+ #endif
models/lrm/models/geometry/render/renderutils/c_src/cubemap.cu ADDED
@@ -0,0 +1,350 @@
1
+ /*
2
+ * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ *
4
+ * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
5
+ * property and proprietary rights in and to this material, related
6
+ * documentation and any modifications thereto. Any use, reproduction,
7
+ * disclosure or distribution of this material and related documentation
8
+ * without an express license agreement from NVIDIA CORPORATION or
9
+ * its affiliates is strictly prohibited.
10
+ */
11
+
12
+ #include "common.h"
13
+ #include "cubemap.h"
14
+ #include <float.h>
15
+
16
+ // https://cgvr.cs.uni-bremen.de/teaching/cg_literatur/Spherical,%20Cubic,%20and%20Parabolic%20Environment%20Mappings.pdf
17
+ __device__ float pixel_area(int x, int y, int N)
18
+ {
19
+ if (N > 1)
20
+ {
21
+ int H = N / 2;
22
+ x = abs(x - H);
23
+ y = abs(y - H);
24
+ float dx = atan((float)(x + 1) / (float)H) - atan((float)x / (float)H);
25
+ float dy = atan((float)(y + 1) / (float)H) - atan((float)y / (float)H);
26
+ return dx * dy;
27
+ }
28
+ else
29
+ return 1;
30
+ }
31
+
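pixel_area weights each texel by (approximately) the solid angle it subtends on the cube face; a Python transcription for experimentation (illustrative only, hypothetical name pixel_area_py, same conventions as the kernel above):

import math

def pixel_area_py(x, y, N):
    # Product of the texel's angular extents in x and y (N x N face, H = N / 2).
    if N <= 1:
        return 1.0
    H = N // 2
    x, y = abs(x - H), abs(y - H)
    dx = math.atan((x + 1) / H) - math.atan(x / H)
    dy = math.atan((y + 1) / H) - math.atan(y / H)
    return dx * dy

# Texels near the face centre subtend more solid angle than corner texels.
N = 16
assert pixel_area_py(N // 2, N // 2, N) > pixel_area_py(0, 0, N)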
32
+ __device__ vec3f cube_to_dir(int x, int y, int side, int N)
33
+ {
34
+ float fx = 2.0f * (((float)x + 0.5f) / (float)N) - 1.0f;
35
+ float fy = 2.0f * (((float)y + 0.5f) / (float)N) - 1.0f;
36
+ switch (side)
37
+ {
38
+ case 0: return safeNormalize(vec3f(1, -fy, -fx));
39
+ case 1: return safeNormalize(vec3f(-1, -fy, fx));
40
+ case 2: return safeNormalize(vec3f(fx, 1, fy));
41
+ case 3: return safeNormalize(vec3f(fx, -1, -fy));
42
+ case 4: return safeNormalize(vec3f(fx, -fy, 1));
43
+ case 5: return safeNormalize(vec3f(-fx, -fy, -1));
44
+ }
45
+ return vec3f(0,0,0); // Unreachable
46
+ }
47
+
48
+ __device__ vec3f dir_to_side(int side, vec3f v)
49
+ {
50
+ switch (side)
51
+ {
52
+ case 0: return vec3f(-v.z, -v.y, v.x);
53
+ case 1: return vec3f( v.z, -v.y, -v.x);
54
+ case 2: return vec3f( v.x, v.z, v.y);
55
+ case 3: return vec3f( v.x, -v.z, -v.y);
56
+ case 4: return vec3f( v.x, -v.y, v.z);
57
+ case 5: return vec3f(-v.x, -v.y, -v.z);
58
+ }
59
+ return vec3f(0,0,0); // Unreachable
60
+ }
61
+
62
+ __device__ void extents_1d(float x, float z, float theta, float& _min, float& _max)
63
+ {
64
+ float l = sqrtf(x * x + z * z);
65
+ float pxr = x + z * tan(theta) * l, pzr = z - x * tan(theta) * l;
66
+ float pxl = x - z * tan(theta) * l, pzl = z + x * tan(theta) * l;
67
+ if (pzl <= 0.00001f)
68
+ _min = pxl > 0.0f ? FLT_MAX : -FLT_MAX;
69
+ else
70
+ _min = pxl / pzl;
71
+ if (pzr <= 0.00001f)
72
+ _max = pxr > 0.0f ? FLT_MAX : -FLT_MAX;
73
+ else
74
+ _max = pxr / pzr;
75
+ }
76
+
77
+ __device__ void dir_extents(int side, int N, vec3f v, float theta, int &_xmin, int& _xmax, int& _ymin, int& _ymax)
78
+ {
79
+ vec3f c = dir_to_side(side, v); // remap to (x,y,z) where side is at z = 1
80
+
81
+ if (theta < 0.785398f) // PI/4
82
+ {
83
+ float xmin, xmax, ymin, ymax;
84
+ extents_1d(c.x, c.z, theta, xmin, xmax);
85
+ extents_1d(c.y, c.z, theta, ymin, ymax);
86
+
87
+ if (xmin > 1.0f || xmax < -1.0f || ymin > 1.0f || ymax < -1.0f)
88
+ {
89
+ _xmin = -1; _xmax = -1; _ymin = -1; _ymax = -1; // Bad aabb
90
+ }
91
+ else
92
+ {
93
+ _xmin = (int)min(max((xmin + 1.0f) * (0.5f * (float)N), 0.0f), (float)(N - 1));
94
+ _xmax = (int)min(max((xmax + 1.0f) * (0.5f * (float)N), 0.0f), (float)(N - 1));
95
+ _ymin = (int)min(max((ymin + 1.0f) * (0.5f * (float)N), 0.0f), (float)(N - 1));
96
+ _ymax = (int)min(max((ymax + 1.0f) * (0.5f * (float)N), 0.0f), (float)(N - 1));
97
+ }
98
+ }
99
+ else
100
+ {
101
+ _xmin = 0.0f;
102
+ _xmax = (float)(N-1);
103
+ _ymin = 0.0f;
104
+ _ymax = (float)(N-1);
105
+ }
106
+ }
107
+
+ ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // Diffuse kernel
+ __global__ void DiffuseCubemapFwdKernel(DiffuseCubemapKernelParams p)
+ {
+     // Calculate pixel position.
+     int px = blockIdx.x * blockDim.x + threadIdx.x;
+     int py = blockIdx.y * blockDim.y + threadIdx.y;
+     int pz = blockIdx.z;
+     if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
+         return;
+
+     int Npx = p.cubemap.dims[1];
+     vec3f N = cube_to_dir(px, py, pz, Npx);
+
+     vec3f col(0);
+
+     for (int s = 0; s < p.cubemap.dims[0]; ++s)
+     {
+         for (int y = 0; y < Npx; ++y)
+         {
+             for (int x = 0; x < Npx; ++x)
+             {
+                 vec3f L = cube_to_dir(x, y, s, Npx);
+                 float costheta = min(max(dot(N, L), 0.0f), 0.999f);
+                 float w = costheta * pixel_area(x, y, Npx) / 3.141592f; // pi = area of positive hemisphere
+                 col += p.cubemap.fetch3(x, y, s) * w;
+             }
+         }
+     }
+
+     p.out.store(px, py, pz, col);
+ }
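The diffuse forward kernel above is a brute-force quadrature over the whole cubemap: for an output texel with direction n it accumulates every source texel's radiance, weighted by the clamped cosine and that texel's solid angle, then normalizes by pi (the value of the cosine integral over the hemisphere). As a hedged reading of what the loops discretize, in our own notation rather than anything from the source:

E(n) \approx \frac{1}{\pi} \sum_{s=0}^{5} \sum_{y=0}^{N-1} \sum_{x=0}^{N-1} L_{s,y,x}\, \max(n \cdot \omega_{s,y,x},\, 0)\, \Delta\omega_{s,y,x}

where \omega_{s,y,x} is cube_to_dir(x, y, s, N) and \Delta\omega_{s,y,x} is pixel_area(x, y, N); the 1/\pi normalization means a constant environment map is reproduced unchanged. The backward kernel that follows scatters the incoming gradient back to the source texels with the same weights via atomicAdd.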
+
+ __global__ void DiffuseCubemapBwdKernel(DiffuseCubemapKernelParams p)
+ {
+     // Calculate pixel position.
+     int px = blockIdx.x * blockDim.x + threadIdx.x;
+     int py = blockIdx.y * blockDim.y + threadIdx.y;
+     int pz = blockIdx.z;
+     if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
+         return;
+
+     int Npx = p.cubemap.dims[1];
+     vec3f N = cube_to_dir(px, py, pz, Npx);
+     vec3f grad = p.out.fetch3(px, py, pz);
+
+     for (int s = 0; s < p.cubemap.dims[0]; ++s)
+     {
+         for (int y = 0; y < Npx; ++y)
+         {
+             for (int x = 0; x < Npx; ++x)
+             {
+                 vec3f L = cube_to_dir(x, y, s, Npx);
+                 float costheta = min(max(dot(N, L), 0.0f), 0.999f);
+                 float w = costheta * pixel_area(x, y, Npx) / 3.141592f; // pi = area of positive hemisphere
+                 atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 0), grad.x * w);
+                 atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 1), grad.y * w);
+                 atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 2), grad.z * w);
+             }
+         }
+     }
+ }
+
+ ///////////////////////////////////////////////////////////////////////////////////////////////////////////
+ // GGX splitsum kernel
+
+ __device__ inline float ndfGGX(const float alphaSqr, const float cosTheta)
+ {
+     float _cosTheta = clamp(cosTheta, 0.0, 1.0f);
+     float d = (_cosTheta * alphaSqr - _cosTheta) * _cosTheta + 1.0f;
+     return alphaSqr / (d * d * M_PI);
+ }
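ndfGGX is the standard Trowbridge-Reitz (GGX) normal distribution function in a numerically convenient grouping. With alphaSqr = \alpha^2 (and \alpha = roughness^2 in the kernels below), it evaluates

D_{GGX}(\cos\theta_h) = \frac{\alpha^2}{\pi\,\big(\cos^2\theta_h\,(\alpha^2 - 1) + 1\big)^2}

which matches the code exactly: d = \cos^2\theta_h(\alpha^2 - 1) + 1 followed by \alpha^2 / (\pi d^2).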
+
+ __global__ void SpecularBoundsKernel(SpecularBoundsKernelParams p)
+ {
+     int px = blockIdx.x * blockDim.x + threadIdx.x;
+     int py = blockIdx.y * blockDim.y + threadIdx.y;
+     int pz = blockIdx.z;
+     if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
+         return;
+
+     int Npx = p.gridSize.x;
+     vec3f VNR = cube_to_dir(px, py, pz, Npx);
+
+     const int TILE_SIZE = 16;
+
+     // Brute force entire cubemap and compute bounds for the cone
+     for (int s = 0; s < p.gridSize.z; ++s)
+     {
+         // Assume empty BBox
+         int _min_x = p.gridSize.x - 1, _max_x = 0;
+         int _min_y = p.gridSize.y - 1, _max_y = 0;
+
+         // For each (TILE_SIZE x TILE_SIZE) tile
+         for (int tx = 0; tx < (p.gridSize.x + TILE_SIZE - 1) / TILE_SIZE; tx++)
+         {
+             for (int ty = 0; ty < (p.gridSize.y + TILE_SIZE - 1) / TILE_SIZE; ty++)
+             {
+                 // Compute tile extents
+                 int tsx = tx * TILE_SIZE, tsy = ty * TILE_SIZE;
+                 int tex = min((tx + 1) * TILE_SIZE, p.gridSize.x), tey = min((ty + 1) * TILE_SIZE, p.gridSize.y);
+
+                 // Use some blunt interval arithmetic to cull tiles
+                 vec3f L0 = cube_to_dir(tsx, tsy, s, Npx), L1 = cube_to_dir(tex, tsy, s, Npx);
+                 vec3f L2 = cube_to_dir(tsx, tey, s, Npx), L3 = cube_to_dir(tex, tey, s, Npx);
+
+                 float minx = min(min(L0.x, L1.x), min(L2.x, L3.x)), maxx = max(max(L0.x, L1.x), max(L2.x, L3.x));
+                 float miny = min(min(L0.y, L1.y), min(L2.y, L3.y)), maxy = max(max(L0.y, L1.y), max(L2.y, L3.y));
+                 float minz = min(min(L0.z, L1.z), min(L2.z, L3.z)), maxz = max(max(L0.z, L1.z), max(L2.z, L3.z));
+
+                 float maxdp = max(minx * VNR.x, maxx * VNR.x) + max(miny * VNR.y, maxy * VNR.y) + max(minz * VNR.z, maxz * VNR.z);
+                 if (maxdp >= p.costheta_cutoff)
+                 {
+                     // Test all pixels in tile.
+                     for (int y = tsy; y < tey; ++y)
+                     {
+                         for (int x = tsx; x < tex; ++x)
+                         {
+                             vec3f L = cube_to_dir(x, y, s, Npx);
+                             if (dot(L, VNR) >= p.costheta_cutoff)
+                             {
+                                 _min_x = min(_min_x, x);
+                                 _max_x = max(_max_x, x);
+                                 _min_y = min(_min_y, y);
+                                 _max_y = max(_max_y, y);
+                             }
+                         }
+                     }
+                 }
+             }
+         }
+         p.out.store(p.out._nhwcIndex(pz, py, px, s * 4 + 0), _min_x);
+         p.out.store(p.out._nhwcIndex(pz, py, px, s * 4 + 1), _max_x);
+         p.out.store(p.out._nhwcIndex(pz, py, px, s * 4 + 2), _min_y);
+         p.out.store(p.out._nhwcIndex(pz, py, px, s * 4 + 3), _max_y);
+     }
+ }
+
+ __global__ void SpecularCubemapFwdKernel(SpecularCubemapKernelParams p)
+ {
+     // Calculate pixel position.
+     int px = blockIdx.x * blockDim.x + threadIdx.x;
+     int py = blockIdx.y * blockDim.y + threadIdx.y;
+     int pz = blockIdx.z;
+     if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
+         return;
+
+     int Npx = p.cubemap.dims[1];
+     vec3f VNR = cube_to_dir(px, py, pz, Npx);
+
+     float alpha = p.roughness * p.roughness;
+     float alphaSqr = alpha * alpha;
+
+     float wsum = 0.0f;
+     vec3f col(0);
+     for (int s = 0; s < p.cubemap.dims[0]; ++s)
+     {
+         int xmin, xmax, ymin, ymax;
+         xmin = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 0));
+         xmax = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 1));
+         ymin = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 2));
+         ymax = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 3));
+
+         if (xmin <= xmax)
+         {
+             for (int y = ymin; y <= ymax; ++y)
+             {
+                 for (int x = xmin; x <= xmax; ++x)
+                 {
+                     vec3f L = cube_to_dir(x, y, s, Npx);
+                     if (dot(L, VNR) >= p.costheta_cutoff)
+                     {
+                         vec3f H = safeNormalize(L + VNR);
+
+                         float wiDotN = max(dot(L, VNR), 0.0f);
+                         float VNRDotH = max(dot(VNR, H), 0.0f);
+
+                         float w = wiDotN * ndfGGX(alphaSqr, VNRDotH) * pixel_area(x, y, Npx) / 4.0f;
+                         col += p.cubemap.fetch3(x, y, s) * w;
+                         wsum += w;
+                     }
+                 }
+             }
+         }
+     }
+
+     p.out.store(p.out._nhwcIndex(pz, py, px, 0), col.x);
+     p.out.store(p.out._nhwcIndex(pz, py, px, 1), col.y);
+     p.out.store(p.out._nhwcIndex(pz, py, px, 2), col.z);
+     p.out.store(p.out._nhwcIndex(pz, py, px, 3), wsum);
+ }
+
+ __global__ void SpecularCubemapBwdKernel(SpecularCubemapKernelParams p)
+ {
+     // Calculate pixel position.
+     int px = blockIdx.x * blockDim.x + threadIdx.x;
+     int py = blockIdx.y * blockDim.y + threadIdx.y;
+     int pz = blockIdx.z;
+     if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
+         return;
+
+     int Npx = p.cubemap.dims[1];
+     vec3f VNR = cube_to_dir(px, py, pz, Npx);
+
+     vec3f grad = p.out.fetch3(px, py, pz);
+
+     float alpha = p.roughness * p.roughness;
+     float alphaSqr = alpha * alpha;
+
+     vec3f col(0);
+     for (int s = 0; s < p.cubemap.dims[0]; ++s)
+     {
+         int xmin, xmax, ymin, ymax;
+         xmin = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 0));
+         xmax = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 1));
+         ymin = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 2));
+         ymax = (int)p.bounds.fetch(p.bounds._nhwcIndex(pz, py, px, s * 4 + 3));
+
+         if (xmin <= xmax)
+         {
+             for (int y = ymin; y <= ymax; ++y)
+             {
+                 for (int x = xmin; x <= xmax; ++x)
+                 {
+                     vec3f L = cube_to_dir(x, y, s, Npx);
+                     if (dot(L, VNR) >= p.costheta_cutoff)
+                     {
+                         vec3f H = safeNormalize(L + VNR);
+
+                         float wiDotN = max(dot(L, VNR), 0.0f);
+                         float VNRDotH = max(dot(VNR, H), 0.0f);
+
+                         float w = wiDotN * ndfGGX(alphaSqr, VNRDotH) * pixel_area(x, y, Npx) / 4.0f;
+
+                         atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 0), grad.x * w);
+                         atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 1), grad.y * w);
+                         atomicAdd((float*)p.cubemap.d_val + p.cubemap.nhwcIndexContinuous(s, y, x, 2), grad.z * w);
+                     }
+                 }
+             }
+         }
+     }
+ }
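Both specular kernels weight each source texel by pixel_area(x, y, N), an approximation of that texel's solid angle on its cube face. A quick sanity check for any such approximation is that the per-texel solid angles of a single face should sum to 4*pi/6. The host-side C++ sketch below uses the common area-element formula for this check; texel_solid_angle is our own illustrative helper, not the pixel_area defined in this file.

// Standalone CPU sanity check: one cube face should cover 4*pi/6 ~= 2.0944 sr.
#include <cmath>
#include <cstdio>

static const double PI = 3.14159265358979323846;

// Area element of the cube-face parameterization, see the standard cubemap
// texel solid-angle derivation.
static double area_element(double x, double y)
{
    return std::atan2(x * y, std::sqrt(x * x + y * y + 1.0));
}

// Solid angle of texel (x, y) on an N x N face spanning [-1, 1]^2.
static double texel_solid_angle(int x, int y, int N)
{
    double x0 = 2.0 * x / N - 1.0, x1 = 2.0 * (x + 1) / N - 1.0;
    double y0 = 2.0 * y / N - 1.0, y1 = 2.0 * (y + 1) / N - 1.0;
    return area_element(x0, y0) - area_element(x0, y1)
         - area_element(x1, y0) + area_element(x1, y1);
}

int main()
{
    const int N = 64;
    double total = 0.0;
    for (int y = 0; y < N; ++y)
        for (int x = 0; x < N; ++x)
            total += texel_solid_angle(x, y, N);
    std::printf("face solid angle = %f (expected %f)\n", total, 4.0 * PI / 6.0);
    return 0;
}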
models/lrm/models/geometry/render/renderutils/c_src/cubemap.h ADDED
@@ -0,0 +1,38 @@
+ /*
+  * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+  *
+  * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+  * property and proprietary rights in and to this material, related
+  * documentation and any modifications thereto. Any use, reproduction,
+  * disclosure or distribution of this material and related documentation
+  * without an express license agreement from NVIDIA CORPORATION or
+  * its affiliates is strictly prohibited.
+  */
+
+ #pragma once
+
+ #include "common.h"
+
+ struct DiffuseCubemapKernelParams
+ {
+     Tensor cubemap;
+     Tensor out;
+     dim3 gridSize;
+ };
+
+ struct SpecularCubemapKernelParams
+ {
+     Tensor cubemap;
+     Tensor bounds;
+     Tensor out;
+     dim3 gridSize;
+     float costheta_cutoff;
+     float roughness;
+ };
+
+ struct SpecularBoundsKernelParams
+ {
+     float costheta_cutoff;
+     Tensor out;
+     dim3 gridSize;
+ };
models/lrm/models/geometry/render/renderutils/c_src/loss.cu ADDED
@@ -0,0 +1,210 @@
+ /*
+  * Copyright (c) 2020-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+  *
+  * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
+  * property and proprietary rights in and to this material, related
+  * documentation and any modifications thereto. Any use, reproduction,
+  * disclosure or distribution of this material and related documentation
+  * without an express license agreement from NVIDIA CORPORATION or
+  * its affiliates is strictly prohibited.
+  */
+
+ #include <cuda.h>
+
+ #include "common.h"
+ #include "loss.h"
+
+ //------------------------------------------------------------------------
+ // Utils
+
+ __device__ inline float bwdAbs(float x) { return x == 0.0f ? 0.0f : x < 0.0f ? -1.0f : 1.0f; }
+
+ __device__ float warpSum(float val) {
+     for (int i = 1; i < 32; i *= 2)
+         val += __shfl_xor_sync(0xFFFFFFFF, val, i);
+     return val;
+ }
+
+ //------------------------------------------------------------------------
+ // Tonemapping
+
+ __device__ inline float fwdSRGB(float x)
+ {
+     return x > 0.0031308f ? powf(max(x, 0.0031308f), 1.0f / 2.4f) * 1.055f - 0.055f : 12.92f * max(x, 0.0f);
+ }
+
+ __device__ inline void bwdSRGB(float x, float &d_x, float d_out)
+ {
+     if (x > 0.0031308f)
+         d_x += d_out * 0.439583f / powf(x, 0.583333f);
+     else if (x > 0.0f)
+         d_x += d_out * 12.92f;
+ }
+
+ __device__ inline vec3f fwdTonemapLogSRGB(vec3f x)
+ {
+     return vec3f(fwdSRGB(logf(x.x + 1.0f)), fwdSRGB(logf(x.y + 1.0f)), fwdSRGB(logf(x.z + 1.0f)));
+ }
+
+ __device__ inline void bwdTonemapLogSRGB(vec3f x, vec3f& d_x, vec3f d_out)
+ {
+     if (x.x > 0.0f && x.x < 65535.0f)
+     {
+         bwdSRGB(logf(x.x + 1.0f), d_x.x, d_out.x);
+         d_x.x *= 1 / (x.x + 1.0f);
+     }
+     if (x.y > 0.0f && x.y < 65535.0f)
+     {
+         bwdSRGB(logf(x.y + 1.0f), d_x.y, d_out.y);
+         d_x.y *= 1 / (x.y + 1.0f);
+     }
+     if (x.z > 0.0f && x.z < 65535.0f)
+     {
+         bwdSRGB(logf(x.z + 1.0f), d_x.z, d_out.z);
+         d_x.z *= 1 / (x.z + 1.0f);
+     }
+ }
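The constants in bwdSRGB are simply the derivative of the forward transfer curve. For x > 0.0031308, differentiating the sRGB encode gives

\frac{d}{dx}\big(1.055\,x^{1/2.4} - 0.055\big) = \frac{1.055}{2.4}\,x^{1/2.4 - 1} \approx 0.439583\,x^{-0.583333}

and 12.92 on the linear segment, matching the two branches above; bwdTonemapLogSRGB then multiplies by d/dx log(x + 1) = 1/(x + 1) per channel, the chain-rule factor for the log tonemap.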
+
+ __device__ inline float fwdRELMSE(float img, float target, float eps = 0.1f)
+ {
+     return (img - target) * (img - target) / (img * img + target * target + eps);
+ }
+
+ __device__ inline void bwdRELMSE(float img, float target, float &d_img, float &d_target, float d_out, float eps = 0.1f)
+ {
+     float denom = (target * target + img * img + eps);
+     d_img += d_out * 2 * (img - target) * (target * (target + img) + eps) / (denom * denom);
+     d_target -= d_out * 2 * (img - target) * (img * (target + img) + eps) / (denom * denom);
+ }
+
+ __device__ inline float fwdSMAPE(float img, float target, float eps=0.01f)
+ {
+     return abs(img - target) / (img + target + eps);
+ }
+
+ __device__ inline void bwdSMAPE(float img, float target, float& d_img, float& d_target, float d_out, float eps = 0.01f)
+ {
+     float denom = (target + img + eps);
+     d_img += d_out * bwdAbs(img - target) * (2 * target + eps) / (denom * denom);
+     d_target -= d_out * bwdAbs(img - target) * (2 * img + eps) / (denom * denom);
+ }
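The backward helpers encode hand-derived gradients. For SMAPE, writing i for img, t for target and d = i + t + \epsilon (our notation, not the source's):

\frac{\partial}{\partial i}\,\frac{|i-t|}{d} = \operatorname{sign}(i-t)\,\frac{2t+\epsilon}{d^2}, \qquad \frac{\partial}{\partial t}\,\frac{|i-t|}{d} = -\operatorname{sign}(i-t)\,\frac{2i+\epsilon}{d^2}

which is exactly the (2 * target + eps) and (2 * img + eps) pair in bwdSMAPE. The RELMSE gradient follows the same pattern with d = i^2 + t^2 + \epsilon:

\frac{\partial}{\partial i}\,\frac{(i-t)^2}{d} = \frac{2\,(i-t)\,\big(t\,(t+i)+\epsilon\big)}{d^2}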
+
+ //------------------------------------------------------------------------
+ // Kernels
+
+ __global__ void imgLossFwdKernel(LossKernelParams p)
+ {
+     // Calculate pixel position.
+     unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
+     unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
+     unsigned int pz = blockIdx.z;
+
+     float floss = 0.0f;
+     if (px < p.gridSize.x && py < p.gridSize.y && pz < p.gridSize.z)
+     {
+         vec3f img = p.img.fetch3(px, py, pz);
+         vec3f target = p.target.fetch3(px, py, pz);
+
+         img = vec3f(clamp(img.x, 0.0f, 65535.0f), clamp(img.y, 0.0f, 65535.0f), clamp(img.z, 0.0f, 65535.0f));
+         target = vec3f(clamp(target.x, 0.0f, 65535.0f), clamp(target.y, 0.0f, 65535.0f), clamp(target.z, 0.0f, 65535.0f));
+
+         if (p.tonemapper == TONEMAPPER_LOG_SRGB)
+         {
+             img = fwdTonemapLogSRGB(img);
+             target = fwdTonemapLogSRGB(target);
+         }
+
+         vec3f vloss(0);
+         if (p.loss == LOSS_MSE)
+             vloss = (img - target) * (img - target);
+         else if (p.loss == LOSS_RELMSE)
+             vloss = vec3f(fwdRELMSE(img.x, target.x), fwdRELMSE(img.y, target.y), fwdRELMSE(img.z, target.z));
+         else if (p.loss == LOSS_SMAPE)
+             vloss = vec3f(fwdSMAPE(img.x, target.x), fwdSMAPE(img.y, target.y), fwdSMAPE(img.z, target.z));
+         else
+             vloss = vec3f(abs(img.x - target.x), abs(img.y - target.y), abs(img.z - target.z));
+
+         floss = sum(vloss) / 3.0f;
+     }
+
+     floss = warpSum(floss);
+
+     dim3 warpSize = getWarpSize(blockDim);
+     if (px < p.gridSize.x && py < p.gridSize.y && pz < p.gridSize.z && threadIdx.x % warpSize.x == 0 && threadIdx.y % warpSize.y == 0 && threadIdx.z % warpSize.z == 0)
+         p.out.store(px / warpSize.x, py / warpSize.y, pz / warpSize.z, floss);
+ }
+
+ __global__ void imgLossBwdKernel(LossKernelParams p)
+ {
+     // Calculate pixel position.
+     unsigned int px = blockIdx.x * blockDim.x + threadIdx.x;
+     unsigned int py = blockIdx.y * blockDim.y + threadIdx.y;
+     unsigned int pz = blockIdx.z;
+
+     if (px >= p.gridSize.x || py >= p.gridSize.y || pz >= p.gridSize.z)
+         return;
+
+     dim3 warpSize = getWarpSize(blockDim);
+
+     vec3f _img = p.img.fetch3(px, py, pz);
+     vec3f _target = p.target.fetch3(px, py, pz);
+     float d_out = p.out.fetch1(px / warpSize.x, py / warpSize.y, pz / warpSize.z);
+
+     /////////////////////////////////////////////////////////////////////
+     // FWD
+
+     vec3f img = _img, target = _target;
+     if (p.tonemapper == TONEMAPPER_LOG_SRGB)
+     {
+         img = fwdTonemapLogSRGB(img);
+         target = fwdTonemapLogSRGB(target);
+     }
+
+     /////////////////////////////////////////////////////////////////////
+     // BWD
+
+     vec3f d_vloss = vec3f(d_out, d_out, d_out) / 3.0f;
+
+     vec3f d_img(0), d_target(0);
+     if (p.loss == LOSS_MSE)
+     {
+         d_img = vec3f(d_vloss.x * 2 * (img.x - target.x), d_vloss.y * 2 * (img.y - target.y), d_vloss.z * 2 * (img.z - target.z));
+         d_target = -d_img;
+     }
+     else if (p.loss == LOSS_RELMSE)
+     {
+         bwdRELMSE(img.x, target.x, d_img.x, d_target.x, d_vloss.x);
+         bwdRELMSE(img.y, target.y, d_img.y, d_target.y, d_vloss.y);
+         bwdRELMSE(img.z, target.z, d_img.z, d_target.z, d_vloss.z);
+     }
+     else if (p.loss == LOSS_SMAPE)
+     {
+         bwdSMAPE(img.x, target.x, d_img.x, d_target.x, d_vloss.x);
+         bwdSMAPE(img.y, target.y, d_img.y, d_target.y, d_vloss.y);
+         bwdSMAPE(img.z, target.z, d_img.z, d_target.z, d_vloss.z);
+     }
+     else
+     {
+         d_img = d_vloss * vec3f(bwdAbs(img.x - target.x), bwdAbs(img.y - target.y), bwdAbs(img.z - target.z));
+         d_target = -d_img;
+     }
+
+     if (p.tonemapper == TONEMAPPER_LOG_SRGB)
+     {
+         vec3f d__img(0), d__target(0);
+         bwdTonemapLogSRGB(_img, d__img, d_img);
+         bwdTonemapLogSRGB(_target, d__target, d_target);
+         d_img = d__img; d_target = d__target;
+     }
+
+     if (_img.x <= 0.0f || _img.x >= 65535.0f) d_img.x = 0;
+     if (_img.y <= 0.0f || _img.y >= 65535.0f) d_img.y = 0;
+     if (_img.z <= 0.0f || _img.z >= 65535.0f) d_img.z = 0;
+     if (_target.x <= 0.0f || _target.x >= 65535.0f) d_target.x = 0;
+     if (_target.y <= 0.0f || _target.y >= 65535.0f) d_target.y = 0;
+     if (_target.z <= 0.0f || _target.z >= 65535.0f) d_target.z = 0;
+
+     p.img.store_grad(px, py, pz, d_img);
+     p.target.store_grad(px, py, pz, d_target);
+ }
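imgLossFwdKernel does not emit one loss value per pixel: warpSum performs a butterfly reduction with __shfl_xor_sync so every lane ends up holding the warp total, and only the first lane per warp writes it out (hence the px / warpSize.x indexing into a smaller output tensor, with the final mean left to the caller). Below is a minimal, self-contained illustration of that reduction pattern; it is our own toy example, not part of this commit.

// Toy warp-level butterfly reduction, same pattern as warpSum() in loss.cu.
#include <cstdio>
#include <cuda_runtime.h>

__device__ float warp_sum(float val)
{
    // After log2(32) XOR-shuffle steps, every lane holds the sum over its warp.
    for (int i = 1; i < 32; i *= 2)
        val += __shfl_xor_sync(0xFFFFFFFF, val, i);
    return val;
}

__global__ void reduce_demo(const float* in, float* out)
{
    float v = warp_sum(in[threadIdx.x]);
    if (threadIdx.x % 32 == 0)              // one write per warp
        out[threadIdx.x / 32] = v;
}

int main()
{
    float h_in[64], h_out[2];
    for (int i = 0; i < 64; ++i) h_in[i] = 1.0f;    // each warp should sum to 32
    float *d_in, *d_out;
    cudaMalloc(&d_in, sizeof(h_in));
    cudaMalloc(&d_out, sizeof(h_out));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
    reduce_demo<<<1, 64>>>(d_in, d_out);
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    printf("warp sums: %f %f\n", h_out[0], h_out[1]);
    cudaFree(d_in); cudaFree(d_out);
    return 0;
}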