JiantaoLin committed
Commit · 4157d39
Parent(s): a2907bc
new
- image_to_mesh_new.py +436 -0
- pipeline/kiss3d_wrapper.py +429 -0
- pipeline/pipeline_config/default.yaml +25 -0
- pipeline/run_hpc.sh +10 -0
- pipeline/utils.py +198 -0
- run.sh +2 -0
- run_hpc.sh +11 -0
- text_to_mesh_new.py +244 -0
- upload_huggingface.py +57 -0
image_to_mesh_new.py
ADDED
@@ -0,0 +1,436 @@
import os
from einops import rearrange
from omegaconf import OmegaConf
import torch
import numpy as np
import trimesh
import torchvision
import torch.nn.functional as F
from PIL import Image
from torchvision import transforms
from torchvision.transforms import v2
from transformers import AutoProcessor, AutoModelForCausalLM
import rembg
from diffusers import FluxPipeline, FluxControlNetImg2ImgPipeline
from diffusers.models.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel
from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler, HeunDiscreteScheduler
from pytorch_lightning import seed_everything
import os

from models.ISOMER.reconstruction_func import reconstruction
from models.ISOMER.projection_func import projection
from models.lrm.utils.infer_util import remove_background, resize_foreground, save_video
from models.lrm.utils.mesh_util import save_obj, save_obj_with_mtl
from models.lrm.utils.render_utils import rotate_x, rotate_y
from models.lrm.utils.train_util import instantiate_from_config
from models.lrm.utils.camera_util import get_zero123plus_input_cameras, get_custom_zero123plus_input_cameras, get_flux_input_cameras
from utils.tool import NormalTransfer, get_render_cameras_frames, load_mipmap
from utils.tool import get_background, get_render_cameras_video, render_frames, mask_fix

device = "cuda"
resolution = 512
save_dir = "./outputs"
zero123plus_diffusion_steps = 75
normal_transfer = NormalTransfer()
rembg_session = rembg.new_session()
isomer_azimuths = torch.from_numpy(np.array([270, 0, 90, 180])).to(device)
isomer_elevations = torch.from_numpy(np.array([5, 5, 5, 5])).to(device)
isomer_radius = 4.1
isomer_geo_weights = torch.from_numpy(np.array([1, 0.9, 1, 0.9])).float().to(device)
isomer_color_weights = torch.from_numpy(np.array([1, 0.5, 1, 0.5])).float().to(device)
# seed_everything(42)

# model initialization and loading
# flux
print('==> Loading Flux model ...')
flux_base_model_pth = "/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/models--black-forest-labs--FLUX.1-dev"
flux_controlnet = FluxControlNetModel.from_pretrained("/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/flux_controlnets/FLUX.1-dev-ControlNet-Union-Pro")
flux_pipe = FluxControlNetImg2ImgPipeline.from_pretrained(flux_base_model_pth, controlnet=[flux_controlnet], torch_dtype=torch.bfloat16).to(device=device, dtype=torch.bfloat16)

flux_pipe.load_lora_weights('./checkpoint/flux_lora/rgb_normal_large.safetensors')

flux_pipe.to(device=device, dtype=torch.bfloat16)
generator = torch.Generator(device=device).manual_seed(0)

# lrm
print('==> Loading LRM model ...')
config = OmegaConf.load("./models/lrm/config/PRM_inference.yaml")
model_config = config.model_config
infer_config = config.infer_config
model = instantiate_from_config(model_config)
model_ckpt_path = "./checkpoint/lrm/final_ckpt.ckpt"
state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.')}
model.load_state_dict(state_dict, strict=True)

model = model.to(device)
model.init_flexicubes_geometry(device, fovy=50.0)
model = model.eval()

# zero123++
print('==> Loading diffusion model ...')
zero123plus_pipeline = DiffusionPipeline.from_pretrained(
    "sudo-ai/zero123plus-v1.2",
    custom_pipeline="./models/zero123plus",
    torch_dtype=torch.float16,
)
zero123plus_pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
    zero123plus_pipeline.scheduler.config, timestep_spacing='trailing'
)
unet_ckpt_path = "./checkpoint/zero123++/flexgen_19w.ckpt"
state_dict = torch.load(unet_ckpt_path, map_location='cpu')['state_dict']
state_dict = {k[10:]: v for k, v in state_dict.items() if k.startswith('unet.unet.')}
zero123plus_pipeline.unet.load_state_dict(state_dict, strict=True)
zero123plus_pipeline = zero123plus_pipeline.to(device)

# unet_ckpt_path = "checkpoint/zero123++/diffusion_pytorch_model.bin"
# state_dict = torch.load(unet_ckpt_path, map_location='cpu')
# zero123plus_pipeline.unet.load_state_dict(state_dict, strict=True)
# zero123plus_pipeline = zero123plus_pipeline.to(device)

# florence
caption_model = AutoModelForCausalLM.from_pretrained(
    "/hpc2hdd/home/jlin695/.cache/huggingface/hub/models--multimodalart--Florence-2-large-no-flash-attn/snapshots/8db3793cf5b453b2ccfb3a4f613b403b2e6b7ca2", torch_dtype=torch.bfloat16, trust_remote_code=True,
).to(device)
caption_processor = AutoProcessor.from_pretrained("/hpc2hdd/home/jlin695/.cache/huggingface/hub/models--multimodalart--Florence-2-large-no-flash-attn/snapshots/8db3793cf5b453b2ccfb3a4f613b403b2e6b7ca2", trust_remote_code=True)

# Flux multi-view generation
def multi_view_rgb_normal_generation_with_controlnet(prompt, image, strength=1.0,
                                                     control_image=[],
                                                     control_mode=[],
                                                     control_guidance_start=None,
                                                     control_guidance_end=None,
                                                     controlnet_conditioning_scale=None,
                                                     lora_scale=1.0
                                                     ):
    control_mode_dict = {
        'canny': 0,
        'tile': 1,
        'depth': 2,
        'blur': 3,
        'pose': 4,
        'gray': 5,
        'lq': 6,
    }  # for https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Union only

    hparam_dict = {
        'prompt': prompt,
        'image': image,
        'strength': strength,
        'num_inference_steps': 30,
        'guidance_scale': 3.5,
        'num_images_per_prompt': 1,
        'width': resolution*4,
        'height': resolution*2,
        'output_type': 'np',
        'generator': generator,
        'joint_attention_kwargs': {"scale": lora_scale}
    }

    # append controlnet hparams
    if len(control_image) > 0:
        assert len(control_mode) == len(control_image)  # the count of image should be the same as control mode

        ctrl_hparams = {
            'control_mode': [control_mode_dict[mode_] for mode_ in control_mode],
            'control_image': control_image,
            'control_guidance_start': control_guidance_start or [0.0 for i in range(len(control_image))],
            'control_guidance_end': control_guidance_end or [1.0 for i in range(len(control_image))],
            'controlnet_conditioning_scale': controlnet_conditioning_scale or [1.0 for i in range(len(control_image))],
        }

        hparam_dict.update(ctrl_hparams)

    # generate multi-view images
    with torch.no_grad():
        image = flux_pipe(
            **hparam_dict
        ).images
    return image

# captioning
def run_captioning(image):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.bfloat16

    if isinstance(image, str):  # If image is a file path
        image = Image.open(image).convert("RGB")

    prompt = "<MORE_DETAILED_CAPTION>"
    inputs = caption_processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
    # print(f"inputs {inputs}")

    generated_ids = caption_model.generate(
        input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], max_new_tokens=1024, num_beams=3
    )

    generated_text = caption_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = caption_processor.post_process_generation(
        generated_text, task=prompt, image_size=(image.width, image.height)
    )
    # print(f"parsed_answer = {parsed_answer}")
    caption_text = parsed_answer["<MORE_DETAILED_CAPTION>"].replace("The image is ", "")
    return caption_text


# zero123++ multi-view generation
def multi_view_rgb_generation(cond_img):
    # generate multi-view images
    with torch.no_grad():
        output_image = zero123plus_pipeline(
            cond_img,
            num_inference_steps=zero123plus_diffusion_steps,
            width=resolution*2,
            height=resolution*2,
        ).images[0]
    return output_image

# lrm reconstructions
def lrm_reconstructions(image, input_cameras, save_path=None, name="temp", export_texmap=False, if_save_video=False, render_azimuths=None, render_elevations=None, render_radius=None, render_fov=30):
    images = image.unsqueeze(0).to(device)
    images = v2.functional.resize(images, 512, interpolation=3, antialias=True).clamp(0, 1)
    # breakpoint()
    with torch.no_grad():
        # get triplane
        planes = model.forward_planes(images, input_cameras)

        mesh_path_idx = os.path.join(save_path, f'{name}.obj')

        mesh_out = model.extract_mesh(
            planes,
            use_texture_map=export_texmap,
            **infer_config,
        )
        if export_texmap:
            vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out
            save_obj_with_mtl(
                vertices.data.cpu().numpy(),
                uvs.data.cpu().numpy(),
                faces.data.cpu().numpy(),
                mesh_tex_idx.data.cpu().numpy(),
                tex_map.permute(1, 2, 0).data.cpu().numpy(),
                mesh_path_idx,
            )
        else:
            vertices, faces, vertex_colors = mesh_out
            save_obj(vertices, faces, vertex_colors, mesh_path_idx)
        print(f"Mesh saved to {mesh_path_idx}")

        render_size = 512
        if if_save_video:
            video_path_idx = os.path.join(save_path, f'{name}.mp4')
            render_size = infer_config.render_resolution
            ENV = load_mipmap("models/lrm/env_mipmap/6")
            materials = (0.0, 0.9)

            all_mv, all_mvp, all_campos = get_render_cameras_video(
                batch_size=1,
                M=240,
                radius=4.5,
                elevation=(90, 60.0),
                is_flexicubes=True,
                fov=30
            )

            frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
                model,
                planes,
                render_cameras=all_mvp,
                camera_pos=all_campos,
                env=ENV,
                materials=materials,
                render_size=render_size,
                chunk_size=20,
                is_flexicubes=True,
            )
            normals = (torch.nn.functional.normalize(normals) + 1) / 2
            normals = normals * alphas + (1 - alphas)
            all_frames = torch.cat([frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals], dim=3)

            # breakpoint()
            save_video(
                all_frames,
                video_path_idx,
                fps=30,
            )
            print(f"Video saved to {video_path_idx}")

        if render_azimuths is not None and render_elevations is not None and render_radius is not None:
            render_size = infer_config.render_resolution
            ENV = load_mipmap("models/lrm/env_mipmap/6")
            materials = (0.0, 0.9)
            all_mv, all_mvp, all_campos, identity_mv = get_render_cameras_frames(
                batch_size=1,
                radius=render_radius,
                azimuths=render_azimuths,
                elevations=render_elevations,
                fov=30
            )
            frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
                model,
                planes,
                render_cameras=all_mvp,
                camera_pos=all_campos,
                env=ENV,
                materials=materials,
                render_size=render_size,
                render_mv=all_mv,
                local_normal=True,
                identity_mv=identity_mv,
            )
        else:
            normals = None
            frames = None
            albedos = None

    return vertices, faces, normals, frames, albedos


def transform_normal(input_normal, azimuths_deg, elevations_deg, radius=4.5, is_global_to_local=False):
    """
    input_normal: in range [-1, 1], shape (b c h w)
    """

    input_normal = input_normal.permute(0, 2, 3, 1).cpu()

    azimuths_deg = np.array(azimuths_deg)
    elevations_deg = np.array(elevations_deg)

    if is_global_to_local:
        local_normal = normal_transfer.trans_global_2_local(input_normal, azimuths_deg, elevations_deg)
        return local_normal.permute(0, 3, 1, 2)
    else:
        global_normal = normal_transfer.trans_local_2_global(input_normal, azimuths_deg, elevations_deg, radius=radius, for_lotus=False)
        global_normal[..., 0] *= -1
        return global_normal.permute(0, 3, 1, 2)

def local_normal_global_transform(local_normal_images, azimuths_deg, elevations_deg):
    if local_normal_images.min() >= 0:
        local_normal = local_normal_images.float() * 2 - 1
    else:
        local_normal = local_normal_images.float()
    global_normal = normal_transfer.trans_local_2_global(local_normal, azimuths_deg, elevations_deg, radius=4.5, for_lotus=False)
    global_normal[..., 0] *= -1
    global_normal = (global_normal + 1) / 2
    global_normal = global_normal.permute(0, 3, 1, 2)
    return global_normal

def main():
    image_pth = "examples/蓝色小怪物.webp"
    save_dir_path = os.path.join(save_dir, image_pth.split("/")[-1].split(".")[0])
    os.makedirs(save_dir_path, exist_ok=True)
    input_image = Image.open(image_pth)
    # if not args.no_rembg:
    input_image = remove_background(input_image, rembg_session)
    input_image = resize_foreground(input_image, 0.85)

    # generate caption
    image_caption = run_captioning(image_pth)

    # generate multi-view images
    output_image = multi_view_rgb_generation(input_image)

    # lrm reconstructions
    rgb_multi_view = np.asarray(output_image, dtype=np.float32) / 255.0
    rgb_multi_view = torch.from_numpy(rgb_multi_view).squeeze(0).permute(2, 0, 1).contiguous().float()  # (3, 1024, 1024)
    rgb_multi_view = rearrange(rgb_multi_view, 'c (n h) (m w) -> (n m) c h w', n=2, m=2)  # (4, 3, 512, 512)

    input_cameras = get_custom_zero123plus_input_cameras(batch_size=1, radius=3.5, fov=30).to(device)

    vertices, faces, lrm_multi_view_normals, lrm_multi_view_rgb, lrm_multi_view_albedo = \
        lrm_reconstructions(rgb_multi_view, input_cameras, save_path=save_dir_path, name='lrm',
                            export_texmap=False, if_save_video=False, render_azimuths=isomer_azimuths,
                            render_elevations=isomer_elevations, render_radius=isomer_radius, render_fov=30)

    vertices = torch.from_numpy(vertices).to(device)
    faces = torch.from_numpy(faces).to(device)
    vertices = vertices @ rotate_x(np.pi / 2, device=vertices.device)[:3, :3]
    vertices = vertices @ rotate_y(np.pi / 2, device=vertices.device)[:3, :3]


    # lrm_3D_bundle_image = torchvision.utils.make_grid(torch.cat([lrm_multi_view_rgb.cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0).unsqueeze(0)  # range [0, 1]
    lrm_3D_bundle_image = torchvision.utils.make_grid(torch.cat([rgb_multi_view[[3, 0, 1, 2]].cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0).unsqueeze(0)  # range [0, 1]
    # rgb_multi_view[[3,0,1,2]] : (B,3,H,W)
    # lrm_multi_view_normals : (B,3,H,W)
    # combined_images = 0.5 * rgb_multi_view[[3,0,1,2]].cpu() + 0.5 * (lrm_multi_view_normals.cpu() + 1) / 2
    # torchvision.utils.save_image(combined_images, os.path.join("debug_output", 'combined.png'))
    # breakpoint()
    # Use the low-quality controlnet by default, feel free to try the others
    control_image = [lrm_3D_bundle_image * 2 - 1]
    control_mode = ['tile']
    control_guidance_start = [0.0]
    control_guidance_end = [0.3]
    controlnet_conditioning_scale = [0.8]

    flux_pipe.controlnet = FluxMultiControlNetModel([flux_controlnet for _ in control_mode])
    # breakpoint()
    rgb_normal_grid = multi_view_rgb_normal_generation_with_controlnet(
        prompt=' '.join(['A grid of 2x4 multi-view image, elevation 5. White background.', image_caption]),
        image=lrm_3D_bundle_image,
        strength=0.6,
        control_image=control_image,
        control_mode=control_mode,
        control_guidance_start=control_guidance_start,
        control_guidance_end=control_guidance_end,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        lora_scale=1.0
    )  # noted that rgb_normal_grid is a (b, h, w, c) numpy array

    rgb_normal_grid = torch.from_numpy(rgb_normal_grid).contiguous().float()
    rgb_normal_grid = rearrange(rgb_normal_grid.squeeze(0), '(n h) (m w) c -> (n m) c h w', n=2, m=4)  # (8, 3, 512, 512)
    rgb_multi_view = rgb_normal_grid[:4, :3, :, :].cuda()
    normal_multi_view = rgb_normal_grid[4:, :3, :, :].cuda()
    multi_view_mask = get_background(normal_multi_view).cuda()
    rgb_multi_view = rgb_multi_view * multi_view_mask + (1 - multi_view_mask)

    # local normal to global normal
    global_normal = local_normal_global_transform(normal_multi_view.permute(0, 2, 3, 1).cpu(), isomer_azimuths, isomer_elevations).cuda()

    global_normal = global_normal * multi_view_mask + (1 - multi_view_mask)

    global_normal = global_normal.permute(0, 2, 3, 1)
    multi_view_mask = multi_view_mask.squeeze(1)
    rgb_multi_view = rgb_multi_view.permute(0, 2, 3, 1)
    # global_normal: B,H,W,3
    # multi_view_mask: B,H,W
    # rgb_multi_view: B,H,W,3


    meshes = reconstruction(
        normal_pils=global_normal,
        masks=multi_view_mask,
        weights=isomer_geo_weights,
        fov=30,
        radius=isomer_radius,
        camera_angles_azi=isomer_azimuths,
        camera_angles_ele=isomer_elevations,
        expansion_weight_stage1=0.1,
        init_type="file",
        init_verts=vertices,
        init_faces=faces,
        stage1_steps=0,
        stage2_steps=50,
        start_edge_len_stage1=0.1,
        end_edge_len_stage1=0.02,
        start_edge_len_stage2=0.02,
        end_edge_len_stage2=0.005,
    )

    save_glb_addr = projection(
        meshes=meshes,
        masks=multi_view_mask,
        images=rgb_multi_view,
        azimuths=isomer_azimuths,
        elevations=isomer_elevations,
        weights=isomer_color_weights,
        fov=30,
        radius=isomer_radius,
        save_dir=f"{save_dir_path}/ISOMER/",
    )
    print(f'saved to {save_glb_addr}')



if __name__ == '__main__':
    main()
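Note on the layout used throughout this script: the "3D bundle image" assembled with make_grid puts the four RGB views on the top row and the four normal maps on the bottom row, each tile 512x512, giving a (3, 1024, 2048) grid that is later split back into eight views with the same rearrange pattern. A minimal, self-contained sanity check of that round trip (the random tensor is a stand-in, not an output of the script):

import torch
from einops import rearrange

# Stand-in for lrm_3D_bundle_image[0]: 2 rows x 4 columns of 512x512 tiles.
bundle = torch.rand(3, 1024, 2048)
tiles = rearrange(bundle, 'c (n h) (m w) -> (n m) c h w', n=2, m=4)
assert tiles.shape == (8, 3, 512, 512)  # tiles 0-3: RGB views, tiles 4-7: normal maps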
pipeline/kiss3d_wrapper.py
ADDED
@@ -0,0 +1,429 @@
# The kiss3d pipeline wrapper for inference

import os
import numpy as np
import torch
import yaml
import uuid
from typing import Union, Any, Dict
from einops import rearrange
from PIL import Image

from pipeline.utils import logger, TMP_DIR, OUT_DIR
from pipeline.utils import lrm_reconstruct, isomer_reconstruct

import torch
import torchvision

# for reconstruction model
from omegaconf import OmegaConf
from models.lrm.utils.train_util import instantiate_from_config
from models.lrm.utils.render_utils import rotate_x, rotate_y
from utils.tool import get_background

# for florence2
from transformers import AutoProcessor, AutoModelForCausalLM

from diffusers import FluxPipeline, FluxControlNetImg2ImgPipeline, FluxImg2ImgPipeline, DiffusionPipeline, EulerAncestralDiscreteScheduler
from diffusers.models.controlnet_flux import FluxControlNetModel, FluxMultiControlNetModel


def init_wrapper_from_config(config_path):
    with open(config_path, 'r') as config_file:
        config_ = yaml.load(config_file, yaml.FullLoader)

    # init flux_pipeline
    logger.info('==> Loading Flux model ...')
    flux_device = config_['flux'].get('device', 'cpu')
    flux_base_model_pth = config_['flux'].get('base_model', None)
    flux_controlnet_pth = config_['flux'].get('controlnet', None)
    flux_lora_pth = config_['flux'].get('lora', None)

    # load flux model and controlnet
    if flux_controlnet_pth is not None:
        flux_controlnet = FluxControlNetModel.from_pretrained(flux_controlnet_pth)
        flux_pipe = FluxControlNetImg2ImgPipeline.from_pretrained(flux_base_model_pth, controlnet=[flux_controlnet],
                                                                  torch_dtype=torch.bfloat16)
    else:
        flux_pipe = FluxImg2ImgPipeline.from_pretrained(flux_base_model_pth, torch_dtype=torch.bfloat16)

    # load lora weights
    flux_pipe.load_lora_weights(flux_lora_pth)
    flux_pipe.to(device=flux_device, dtype=torch.bfloat16)

    # TODO: load redux model
    # FluxPriorReduxPipeline.from_pretrained()

    # TODO: load pulid model

    # init multiview model
    logger.info('==> Loading multiview diffusion model ...')
    multiview_device = config_['multiview'].get('device', 'cpu')
    multiview_pipeline = DiffusionPipeline.from_pretrained(
        config_['multiview']['base_model'],
        custom_pipeline=config_['multiview']['custom_pipeline'],
        torch_dtype=torch.float16,
    )
    multiview_pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
        multiview_pipeline.scheduler.config, timestep_spacing='trailing'
    )

    unet_ckpt_path = config_['multiview'].get('unet', None)
    if unet_ckpt_path is not None:
        state_dict = torch.load(unet_ckpt_path, map_location='cpu')['state_dict']
        state_dict = {k[10:]: v for k, v in state_dict.items() if k.startswith('unet.unet.')}
        multiview_pipeline.unet.load_state_dict(state_dict, strict=True)

    multiview_pipeline.to(multiview_device)

    # load caption model
    logger.info('==> Loading caption model ...')
    caption_device = config_['caption'].get('device', 'cpu')
    caption_model = AutoModelForCausalLM.from_pretrained(config_['caption']['base_model'],
                                                         torch_dtype=torch.bfloat16, trust_remote_code=True).to(caption_device)
    caption_processor = AutoProcessor.from_pretrained(config_['caption']['base_model'], trust_remote_code=True)

    # load reconstruction model
    logger.info('==> Loading reconstruction model ...')
    recon_device = config_['reconstruction'].get('device', 'cpu')
    recon_model_config = OmegaConf.load(config_['reconstruction']['model_config'])
    recon_model = instantiate_from_config(recon_model_config.model_config)
    # load recon model checkpoint
    state_dict = torch.load(config_['reconstruction']['base_model'], map_location='cpu')['state_dict']
    state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.')}
    recon_model.load_state_dict(state_dict, strict=True)
    recon_model.to(recon_device)
    recon_model.init_flexicubes_geometry(recon_device, fovy=50.0)
    recon_model.eval()

    return kiss3d_wrapper(
        config=config_,
        flux_pipeline=flux_pipe,
        multiview_pipeline=multiview_pipeline,
        caption_processor=caption_processor,
        caption_model=caption_model,
        reconstruction_model_config=recon_model_config,
        reconstruction_model=recon_model,
    )

class kiss3d_wrapper(object):
    def __init__(self,
                 config: Dict,
                 flux_pipeline: Union[FluxPipeline, FluxControlNetImg2ImgPipeline],
                 multiview_pipeline: DiffusionPipeline,
                 caption_processor: AutoProcessor,
                 caption_model: AutoModelForCausalLM,
                 reconstruction_model_config: Any,
                 reconstruction_model: Any,
                 ):
        self.config = config
        self.flux_pipeline = flux_pipeline
        self.multiview_pipeline = multiview_pipeline
        self.caption_model = caption_model
        self.caption_processor = caption_processor
        self.recon_model_config = reconstruction_model_config
        self.recon_model = reconstruction_model

        self.renew_uuid()

    def renew_uuid(self):
        self.uuid = uuid.uuid4()

    def context(self):
        if self.config['use_zero_gpu']:
            import spaces
            return spaces.GPU()
        else:
            return torch.no_grad()

    def get_image_caption(self, image):
        """
        image: PIL image or path of PIL image
        """
        torch_dtype = torch.bfloat16
        caption_device = self.config['caption'].get('device', 'cpu')

        if isinstance(image, str):  # If image is a file path
            image = Image.open(image).convert("RGB")
        elif isinstance(image, Image.Image):
            image = image.convert("RGB")
        else:
            raise NotImplementedError('unexpected image type')

        prompt = "<MORE_DETAILED_CAPTION>"
        inputs = self.caption_processor(text=prompt, images=image, return_tensors="pt").to(caption_device, torch_dtype)

        generated_ids = self.caption_model.generate(
            input_ids=inputs["input_ids"], pixel_values=inputs["pixel_values"], max_new_tokens=1024, num_beams=3
        )

        generated_text = self.caption_processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
        parsed_answer = self.caption_processor.post_process_generation(
            generated_text, task=prompt, image_size=(image.width, image.height)
        )
        caption_text = parsed_answer["<MORE_DETAILED_CAPTION>"].replace("The image is ", "")
        return caption_text

    def generate_multiview(self, image):
        with self.context():
            mv_image = self.multiview_pipeline(image,
                                               num_inference_steps=self.config['multiview']['num_inference_steps'],
                                               width=512*2, height=512*2).images[0]
        return mv_image

    def reconstruct_from_multiview(self, mv_image):
        """
        mv_image: PIL.Image
        """
        recon_device = self.config['reconstruction'].get('device', 'cpu')

        rgb_multi_view = np.asarray(mv_image, dtype=np.float32) / 255.0
        rgb_multi_view = torch.from_numpy(rgb_multi_view).squeeze(0).permute(2, 0, 1).contiguous().float()  # (3, 1024, 1024)
        rgb_multi_view = rearrange(rgb_multi_view, 'c (n h) (m w) -> (n m) c h w', n=2, m=2).unsqueeze(0).to(recon_device)

        with self.context():
            vertices, faces, lrm_multi_view_normals, lrm_multi_view_rgb, lrm_multi_view_albedo = \
                lrm_reconstruct(self.recon_model, self.recon_model_config.infer_config,
                                rgb_multi_view, name=self.uuid)

        return vertices, faces, lrm_multi_view_normals, lrm_multi_view_rgb, lrm_multi_view_albedo

    def generate_reference_3D_bundle_image_zero123(self, image, save_intermediate_results=True):
        """
        input: image, PIL.Image
        return: ref_3D_bundle_image, Tensor of shape (1, 3, 1024, 2048)
        """
        mv_image = self.generate_multiview(image)

        if save_intermediate_results:
            mv_image.save(os.path.join(TMP_DIR, f'{self.uuid}_mv_image.png'))

        vertices, faces, lrm_multi_view_normals, lrm_multi_view_rgb, lrm_multi_view_albedo = self.reconstruct_from_multiview(mv_image)

        ref_3D_bundle_image = torchvision.utils.make_grid(torch.cat([lrm_multi_view_rgb.cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0).unsqueeze(0)  # range [0, 1]

        if save_intermediate_results:
            save_path = os.path.join(TMP_DIR, f'{self.uuid}_ref_3d_bundle_image.png')
            torchvision.utils.save_image(ref_3D_bundle_image, save_path)

            logger.info(f"Save reference 3D bundle image to {save_path}")

            return ref_3D_bundle_image, save_path

        return ref_3D_bundle_image

    def generate_3d_bundle_image_controlnet(self,
                                            prompt,
                                            image=None,
                                            strength=1.0,
                                            control_image=[],
                                            control_mode=[],
                                            control_guidance_start=None,
                                            control_guidance_end=None,
                                            controlnet_conditioning_scale=None,
                                            lora_scale=1.0,
                                            save_intermediate_results=True,
                                            **kwargs):
        control_mode_dict = {
            'canny': 0,
            'tile': 1,
            'depth': 2,
            'blur': 3,
            'pose': 4,
            'gray': 5,
            'lq': 6,
        }  # for https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Union only

        flux_device = self.config['flux'].get('device', 'cpu')
        seed = self.config['flux'].get('seed', 0)

        generator = torch.Generator(device=flux_device).manual_seed(seed)

        hparam_dict = {
            'prompt': ' '.join(['A grid of 2x4 multi-view image, elevation 5. White background.', prompt]),
            'image': image or torch.zeros((1, 3, 1024, 2048), dtype=torch.float32, device=flux_device),
            'strength': strength,
            'num_inference_steps': 30,
            'guidance_scale': 3.5,
            'num_images_per_prompt': 1,
            'width': 2048,
            'height': 1024,
            'output_type': 'np',
            'generator': generator,
            'joint_attention_kwargs': {"scale": lora_scale}
        }
        hparam_dict.update(kwargs)

        # append controlnet hparams
        if len(control_image) > 0:
            assert isinstance(self.flux_pipeline, FluxControlNetImg2ImgPipeline)
            assert len(control_mode) == len(control_image)  # the count of image should be the same as control mode

            flux_ctrl_net = self.flux_pipeline.controlnet.nets[0]
            self.flux_pipeline.controlnet = FluxMultiControlNetModel([flux_ctrl_net for i in range(len(control_image))])

            ctrl_hparams = {
                'control_mode': [control_mode_dict[mode_] for mode_ in control_mode],
                'control_image': control_image,
                'control_guidance_start': control_guidance_start or [0.0 for i in range(len(control_image))],
                'control_guidance_end': control_guidance_end or [1.0 for i in range(len(control_image))],
                'controlnet_conditioning_scale': controlnet_conditioning_scale or [1.0 for i in range(len(control_image))],
            }

            hparam_dict.update(ctrl_hparams)

        with self.context():
            gen_3d_bundle_image = self.flux_pipeline(**hparam_dict).images

        gen_3d_bundle_image_ = torch.from_numpy(gen_3d_bundle_image).squeeze(0).permute(2, 0, 1).contiguous().float()  # (3, 1024, 2048)

        if save_intermediate_results:
            save_path = os.path.join(TMP_DIR, f'{self.uuid}_gen_3d_bundle_image.png')
            torchvision.utils.save_image(gen_3d_bundle_image_, save_path)
            logger.info(f"Save generated 3D bundle image to {save_path}")
            return gen_3d_bundle_image_, save_path

        return gen_3d_bundle_image_


    def generate_3d_bundle_image_text(self,
                                      prompt,
                                      image=None,
                                      strength=1.0,
                                      lora_scale=1.0,
                                      num_inference_steps=30,
                                      save_intermediate_results=True,
                                      **kwargs):

        """
        return: gen_3d_bundle_image, torch.Tensor of shape (3, 1024, 2048), range [0., 1.]
        """

        if isinstance(self.flux_pipeline, FluxControlNetImg2ImgPipeline):
            flux_pipeline = FluxImg2ImgPipeline(
                scheduler=self.flux_pipeline.scheduler,
                vae=self.flux_pipeline.vae,
                text_encoder=self.flux_pipeline.text_encoder,
                tokenizer=self.flux_pipeline.tokenizer,
                text_encoder_2=self.flux_pipeline.text_encoder_2,
                tokenizer_2=self.flux_pipeline.tokenizer_2,
                transformer=self.flux_pipeline.transformer
            )
        else:
            flux_pipeline = self.flux_pipeline

        flux_device = self.config['flux'].get('device', 'cpu')
        seed = self.config['flux'].get('seed', 0)

        generator = torch.Generator(device=flux_device).manual_seed(seed)

        hparam_dict = {
            'prompt': ' '.join(['A grid of 2x4 multi-view image, elevation 5. White background.', prompt]),
            'image': image or torch.zeros((1, 3, 1024, 2048), dtype=torch.float32, device=flux_device),
            'strength': strength,
            'num_inference_steps': num_inference_steps,
            'guidance_scale': 3.5,
            'num_images_per_prompt': 1,
            'width': 2048,
            'height': 1024,
            'output_type': 'np',
            'generator': generator,
            'joint_attention_kwargs': {"scale": lora_scale}
        }
        hparam_dict.update(kwargs)

        with self.context():
            gen_3d_bundle_image = flux_pipeline(**hparam_dict).images

        gen_3d_bundle_image_ = torch.from_numpy(gen_3d_bundle_image).squeeze(0).permute(2, 0, 1).contiguous().float()  # (3, 1024, 2048)

        if save_intermediate_results:
            save_path = os.path.join(TMP_DIR, f'{self.uuid}_gen_3d_bundle_image.png')
            torchvision.utils.save_image(gen_3d_bundle_image_, save_path)
            logger.info(f"Save generated 3D bundle image to {save_path}")
            return gen_3d_bundle_image_, save_path

        return gen_3d_bundle_image_

    def reconstruct_3d_bundle_image(self, image, save_intermediate_results=True):
        """
        image: torch.Tensor, range [0., 1.], (3, 1024, 2048)
        """
        recon_device = self.config['reconstruction'].get('device', 'cpu')

        # split rgb and normal
        images = rearrange(image, 'c (n h) (m w) -> (n m) c h w', n=2, m=4)  # (3, 1024, 2048) -> (8, 3, 512, 512)
        rgb_multi_view, normal_multi_view = images.chunk(2, dim=0)
        multi_view_mask = get_background(normal_multi_view).to(recon_device)
        rgb_multi_view = rgb_multi_view.to(recon_device) * multi_view_mask + (1 - multi_view_mask)

        with self.context():
            vertices, faces, lrm_multi_view_normals, lrm_multi_view_rgb, lrm_multi_view_albedo = \
                lrm_reconstruct(self.recon_model, self.recon_model_config.infer_config,
                                rgb_multi_view.unsqueeze(0).to(recon_device), name=self.uuid,
                                input_camera_type='kiss3d', render_3d_bundle_image=save_intermediate_results,
                                render_azimuths=[0, 90, 180, 270])

        if save_intermediate_results:
            recon_3D_bundle_image = torchvision.utils.make_grid(torch.cat([lrm_multi_view_rgb.cpu(), (lrm_multi_view_normals.cpu() + 1) / 2], dim=0), nrow=4, padding=0).unsqueeze(0)  # range [0, 1]
            torchvision.utils.save_image(recon_3D_bundle_image, os.path.join(TMP_DIR, f'{self.uuid}_lrm_recon_3d_bundle_image.png'))

        recon_mesh_path = os.path.join(TMP_DIR, f"{self.uuid}_isomer_recon_mesh.obj")

        return isomer_reconstruct(rgb_multi_view=rgb_multi_view,
                                  normal_multi_view=normal_multi_view,
                                  multi_view_mask=multi_view_mask,
                                  vertices=vertices,
                                  faces=faces,
                                  save_path=recon_mesh_path)


def run_text_to_3d(k3d_wrapper,
                   prompt,
                   init_image_path=None):
    # ======================================= Example of text to 3D generation ======================================

    # Renew the uuid
    k3d_wrapper.renew_uuid()

    # FOR Text to 3D (also for image to image) with init image
    init_image = None
    if init_image_path is not None:
        init_image = Image.open(init_image_path)

    gen_3d_bundle_image, gen_save_path = k3d_wrapper.generate_3d_bundle_image_text(prompt,
                                                                                   image=init_image,
                                                                                   strength=1.0,
                                                                                   save_intermediate_results=True)

    # recon from 3D bundle image
    recon_mesh_path = k3d_wrapper.reconstruct_3d_bundle_image(gen_3d_bundle_image, save_intermediate_results=False)

    return gen_save_path, recon_mesh_path

def run_image_to_3d(k3d_wrapper, init_image_path):
    # ======================================= Example of image to 3D generation ======================================

    # Renew the uuid
    k3d_wrapper.renew_uuid()

    # FOR IMAGE TO 3D: generate reference 3D bundle image from a single input image
    input_image = Image.open(init_image_path)
    reference_3d_bundle_image, reference_save_path = k3d_wrapper.generate_reference_3D_bundle_image_zero123(input_image)
    caption = k3d_wrapper.get_image_caption(input_image)


    import pdb
    pdb.set_trace()


if __name__ == "__main__":
    k3d_wrapper = init_wrapper_from_config('/hpc2hdd/home/jlin695/code/Kiss3DGen/pipeline/pipeline_config/default.yaml')

    # Example of loading existing 3D bundle image
    # demo_image = Image.open('/hpc2hdd/home/jlin695/code/github/Kiss3DGen/outputs/tmp/ea25bc9b-d775-46bb-9827-660a9a6540c8_gen_3d_bundle_image.png')
    # gen_3d_bundle_image = torchvision.transforms.functional.to_tensor(demo_image)

    run_image_to_3d(k3d_wrapper, '/hpc2hdd/home/jlin695/code/Kiss3DGen/examples/蓝色小怪物.webp')
    # run_text_to_3d(k3d_wrapper, prompt='A doll of a girl in Harry Potter')
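The __main__ block above drives the wrapper directly, but the same entry points can be imported from other code. A minimal sketch, assuming the repository root is on PYTHONPATH and the checkpoint paths in the YAML config resolve on the local machine (the relative config path here is an assumption, not part of this commit):

from pipeline.kiss3d_wrapper import init_wrapper_from_config, run_text_to_3d

# Build the wrapper from a config file (adjust the path to your checkout).
k3d_wrapper = init_wrapper_from_config('./pipeline/pipeline_config/default.yaml')

# Text-to-3D: returns the saved bundle-image path and the reconstructed mesh path.
bundle_image_path, mesh_path = run_text_to_3d(k3d_wrapper, prompt='A doll of a girl in Harry Potter')
print(bundle_image_path, mesh_path)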
pipeline/pipeline_config/default.yaml
ADDED
@@ -0,0 +1,25 @@
flux:
  base_model: "/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/models--black-forest-labs--FLUX.1-dev"
  lora: "./checkpoint/flux_lora/rgb_normal_doll_object.safetensors"
  controlnet: "/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/flux_controlnets/FLUX.1-dev-ControlNet-Union-Pro"
  seed: 0
  device: 'cuda:0'

multiview:
  base_model: "sudo-ai/zero123plus-v1.2"
  custom_pipeline: "./models/zero123plus"
  unet: "./checkpoint/zero123++/flexgen_19w.ckpt"
  num_inference_steps: 75
  device: 'cuda:0'

reconstruction:
  model_config: "./models/lrm/config/PRM_inference.yaml"
  base_model: "./checkpoint/lrm/final_ckpt.ckpt"
  device: 'cuda:0'

caption:
  base_model: "/hpc2hdd/home/jlin695/.cache/huggingface/hub/models--multimodalart--Florence-2-large-no-flash-attn/snapshots/8db3793cf5b453b2ccfb3a4f613b403b2e6b7ca2"
  device: 'cuda:0'

use_zero_gpu: false # for huggingface demo only
3d_bundle_templates: '/hpc2hdd/home/jlin695/code/github/Kiss3DGen/init_3d_Bundle'
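Each top-level block here (flux, multiview, reconstruction, caption) is read verbatim by init_wrapper_from_config in pipeline/kiss3d_wrapper.py; the absolute /hpc2hdd paths are cluster-local and would need to be replaced on other machines. A small sketch of how the file is consumed, using the same loader call as the wrapper (path relative to the repository root is an assumption):

import yaml

with open('pipeline/pipeline_config/default.yaml') as f:
    cfg = yaml.load(f, yaml.FullLoader)

print(cfg['flux']['base_model'])                # FLUX.1-dev checkpoint location
print(cfg['multiview']['num_inference_steps'])  # 75
print(cfg.get('use_zero_gpu', False))           # false unless running as a HF Space demo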
pipeline/run_hpc.sh
ADDED
@@ -0,0 +1,10 @@
source /hpc2ssd/softwares/anaconda3/bin/activate kiss3dgen
module load cuda/12.1 compilers/gcc-11.1.0 compilers/icc-2023.1.0 cmake/3.27.0
export CXX=$(which g++)
export CC=$(which gcc)
export CPLUS_INCLUDE_PATH=/hpc2ssd/softwares/cuda/cuda-12.1/targets/x86_64-linux/include:$CPLUS_INCLUDE_PATH
export CUDA_LAUNCH_BLOCKING=1
export NCCL_TIMEOUT=3600
export CUDA_VISIBLE_DEVICES="0"

python ./pipeline/kiss3d_wrapper.py
pipeline/utils.py
ADDED
@@ -0,0 +1,198 @@
import os
import sys
import logging

__workdir__ = '/'.join(os.path.abspath(__file__).split('/')[:-2])
sys.path.insert(0, __workdir__)

print(__workdir__)

import numpy as np
import torch
from torchvision.transforms import v2

from models.lrm.online_render.render_single import load_mipmap
from models.lrm.utils.camera_util import get_zero123plus_input_cameras, get_custom_zero123plus_input_cameras, get_flux_input_cameras
from models.lrm.utils.render_utils import rotate_x, rotate_y
from models.lrm.utils.mesh_util import save_obj, save_obj_with_mtl

from models.ISOMER.reconstruction_func import reconstruction
from models.ISOMER.projection_func import projection

from utils.tool import NormalTransfer, get_render_cameras_frames, get_background, get_render_cameras_video, render_frames, mask_fix


logging.basicConfig(
    level=logging.INFO
)
logger = logging.getLogger('kiss3d_wrapper')

OUT_DIR = './outputs'
TMP_DIR = './outputs/tmp'

os.makedirs(TMP_DIR, exist_ok=True)

def lrm_reconstruct(model, infer_config, images,
                    name='', export_texmap=False,
                    input_camera_type='zero123',
                    render_3d_bundle_image=True,
                    render_azimuths=[270, 0, 90, 180],
                    render_elevations=[5, 5, 5, 5],
                    render_radius=4.5):
    """
    image: Tensor, shape (1, c, h, w)
    """

    mesh_path_idx = os.path.join(TMP_DIR, f'{name}_recon_from_{input_camera_type}.obj')

    device = images.device
    if input_camera_type == 'zero123':
        input_cameras = get_custom_zero123plus_input_cameras(batch_size=1, radius=3.5, fov=30).to(device)
    elif input_camera_type == 'kiss3d':
        input_cameras = get_flux_input_cameras(batch_size=1, radius=4.2, fov=30).to(device)
    else:
        raise NotImplementedError(f'Unexpected input camera type: {input_camera_type}')

    images = v2.functional.resize(images, 512, interpolation=3, antialias=True).clamp(0, 1)

    logger.info(f"==> Running LRM reconstruction ...")
    planes = model.forward_planes(images, input_cameras)
    mesh_out = model.extract_mesh(
        planes,
        use_texture_map=export_texmap,
        **infer_config,
    )
    if export_texmap:
        vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out
        save_obj_with_mtl(
            vertices.data.cpu().numpy(),
            uvs.data.cpu().numpy(),
            faces.data.cpu().numpy(),
            mesh_tex_idx.data.cpu().numpy(),
            tex_map.permute(1, 2, 0).data.cpu().numpy(),
            mesh_path_idx,
        )
    else:
        vertices, faces, vertex_colors = mesh_out
        save_obj(vertices, faces, vertex_colors, mesh_path_idx)
    logger.info(f"Mesh saved to {mesh_path_idx}")

    if render_3d_bundle_image:
        assert render_azimuths is not None and render_elevations is not None and render_radius is not None
        render_azimuths = torch.Tensor(render_azimuths).to(device)
        render_elevations = torch.Tensor(render_elevations).to(device)

        render_size = infer_config.render_resolution
        ENV = load_mipmap("models/lrm/env_mipmap/6")
        materials = (0.0, 0.9)
        all_mv, all_mvp, all_campos, identity_mv = get_render_cameras_frames(
            batch_size=1,
            radius=render_radius,
            azimuths=render_azimuths,
            elevations=render_elevations,
            fov=30
        )
        frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
            model,
            planes,
            render_cameras=all_mvp,
            camera_pos=all_campos,
            env=ENV,
            materials=materials,
            render_size=render_size,
            render_mv=all_mv,
            local_normal=True,
            identity_mv=identity_mv,
        )
    else:
        normals = None
        frames = None
        albedos = None


    vertices = torch.from_numpy(vertices).to(device)
    faces = torch.from_numpy(faces).to(device)
    vertices = vertices @ rotate_x(np.pi / 2, device=device)[:3, :3]
    vertices = vertices @ rotate_y(np.pi / 2, device=device)[:3, :3]

    return vertices.cpu(), faces.cpu(), normals, frames, albedos

normal_transfer = NormalTransfer()

def local_normal_global_transform(local_normal_images, azimuths_deg, elevations_deg):
    if local_normal_images.min() >= 0:
        local_normal = local_normal_images.float() * 2 - 1
    else:
        local_normal = local_normal_images.float()
    global_normal = normal_transfer.trans_local_2_global(local_normal, azimuths_deg, elevations_deg, radius=4.5, for_lotus=False)
    global_normal[..., 0] *= -1
    global_normal = (global_normal + 1) / 2
    global_normal = global_normal.permute(0, 3, 1, 2)
    return global_normal


def isomer_reconstruct(
        rgb_multi_view,
        normal_multi_view,
        multi_view_mask,
        vertices,
        faces,
        save_path=None,
        azimuths=[0, 90, 180, 270],
        elevations=[5, 5, 5, 5],
        geo_weights=[1, 0.9, 1, 0.9],
        color_weights=[1, 0.5, 1, 0.5],
        reconstruction_stage1_steps=50,
        reconstruction_stage2_steps=50,
        radius=4.1):

    device = rgb_multi_view.device
    to_tensor_ = lambda x: torch.Tensor(x).float().to(device)

    # local normal to global normal
    global_normal = local_normal_global_transform(normal_multi_view.permute(0, 2, 3, 1).cpu(), to_tensor_(azimuths), to_tensor_(elevations)).to(device)
    global_normal = global_normal * multi_view_mask + (1 - multi_view_mask)

    global_normal = global_normal.permute(0, 2, 3, 1)
    multi_view_mask = multi_view_mask.squeeze(1)
    rgb_multi_view = rgb_multi_view.permute(0, 2, 3, 1)

    logger.info(f"==> Running ISOMER reconstruction ...")
    meshes = reconstruction(
        normal_pils=global_normal,
        masks=multi_view_mask,
        weights=to_tensor_(geo_weights),
        fov=30,
        radius=radius,
        camera_angles_azi=to_tensor_(azimuths),
        camera_angles_ele=to_tensor_(elevations),
        expansion_weight_stage1=0.1,
        init_type="file",
        init_verts=vertices,
        init_faces=faces,
        stage1_steps=reconstruction_stage1_steps,
        stage2_steps=reconstruction_stage2_steps,
        start_edge_len_stage1=0.1,
        end_edge_len_stage1=0.02,
        start_edge_len_stage2=0.02,
        end_edge_len_stage2=0.005,
    )

    multi_view_mask_proj = mask_fix(multi_view_mask, erode_dilate=-10, blur=5)

    logger.info(f"==> Running ISOMER projection ...")
    save_glb_addr = projection(
        meshes,
        masks=multi_view_mask_proj.to(device),
        images=rgb_multi_view.to(device),
        azimuths=to_tensor_(azimuths),
        elevations=to_tensor_(elevations),
        weights=to_tensor_(color_weights),
        fov=30,
        radius=radius,
        save_dir=TMP_DIR,
        save_glb_addr=save_path
    )

    logger.info(f"==> Save mesh to {save_glb_addr} ...")
    return save_glb_addr
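For orientation, local_normal_global_transform above expects channel-last normal maps plus per-view camera angles and returns channel-first global normals; a minimal smoke test with dummy data (the random normals are placeholders, and the angles match the defaults used by isomer_reconstruct):

import torch
from pipeline.utils import local_normal_global_transform

normals = torch.rand(4, 512, 512, 3)            # channel-last local normals in [0, 1]
azimuths = torch.tensor([0., 90., 180., 270.])
elevations = torch.tensor([5., 5., 5., 5.])
global_normals = local_normal_global_transform(normals, azimuths, elevations)
print(global_normals.shape)                     # torch.Size([4, 3, 512, 512])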
run.sh
ADDED
@@ -0,0 +1,2 @@
export CUDA_VISIBLE_DEVICES="0"
python text_to_mesh.py
run_hpc.sh
ADDED
@@ -0,0 +1,11 @@
source /hpc2ssd/softwares/anaconda3/bin/activate kiss3dgen
module load cuda/12.1 compilers/gcc-11.1.0 compilers/icc-2023.1.0 cmake/3.27.0
export CXX=$(which g++)
export CC=$(which gcc)
export CPLUS_INCLUDE_PATH=/hpc2ssd/softwares/cuda/cuda-12.1/targets/x86_64-linux/include:$CPLUS_INCLUDE_PATH
export CUDA_LAUNCH_BLOCKING=1
export NCCL_TIMEOUT=3600
export CUDA_VISIBLE_DEVICES="0"
# python app.py
python text_to_mesh.py
# python image_to_mesh.py
text_to_mesh_new.py
ADDED
@@ -0,0 +1,244 @@
+import os
+from einops import rearrange
+from omegaconf import OmegaConf
+import torch
+import numpy as np
+import trimesh
+import torchvision
+import torch.nn.functional as F
+from PIL import Image
+from torchvision import transforms
+from torchvision.transforms import v2
+from diffusers import HeunDiscreteScheduler
+from diffusers import FluxPipeline
+from pytorch_lightning import seed_everything
+import os
+
+import time
+
+from models.lrm.utils.infer_util import save_video
+from models.lrm.utils.mesh_util import save_obj, save_obj_with_mtl
+from models.lrm.utils.render_utils import rotate_x, rotate_y
+from models.lrm.utils.train_util import instantiate_from_config
+from models.lrm.utils.camera_util import get_flux_input_cameras
+from models.ISOMER.reconstruction_func import reconstruction
+from models.ISOMER.projection_func import projection
+from utils.tool import NormalTransfer, load_mipmap
+from utils.tool import get_background, get_render_cameras_video, render_frames, mask_fix
+
+device = "cuda"
+resolution = 512
+save_dir = "./outputs/text2"
+normal_transfer = NormalTransfer()
+isomer_azimuths = torch.from_numpy(np.array([0, 90, 180, 270])).float().to(device)
+isomer_elevations = torch.from_numpy(np.array([5, 5, 5, 5])).float().to(device)
+isomer_radius = 4.5
+isomer_geo_weights = torch.from_numpy(np.array([1, 0.9, 1, 0.9])).float().to(device)
+isomer_color_weights = torch.from_numpy(np.array([1, 0.5, 1, 0.5])).float().to(device)
+
+# model initialization and loading
+# flux
+flux_pipe = FluxPipeline.from_pretrained("/hpc2hdd/JH_DATA/share/yingcongchen/PrivateShareGroup/yingcongchen_datasets/model_checkpoint/models--black-forest-labs--FLUX.1-dev", torch_dtype=torch.bfloat16).to(device=device, dtype=torch.bfloat16)
+flux_pipe.load_lora_weights('./checkpoint/flux_lora/rgb_normal_large.safetensors')
+
+flux_pipe.to(device=device, dtype=torch.bfloat16)
+generator = torch.Generator(device=device).manual_seed(10)
+
+# lrm
+config = OmegaConf.load("./models/lrm/config/PRM_inference.yaml")
+model_config = config.model_config
+infer_config = config.infer_config
+model = instantiate_from_config(model_config)
+model_ckpt_path = "./checkpoint/lrm/final_ckpt.ckpt"
+state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
+state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.')}
+model.load_state_dict(state_dict, strict=True)
+
+model = model.to(device)
+model.init_flexicubes_geometry(device, fovy=50.0)
+model = model.eval()
+
+# Flux multi-view generation
+def multi_view_rgb_normal_generation(prompt, save_path=None):
+    # generate multi-view images
+    with torch.no_grad():
+        image = flux_pipe(
+            prompt=prompt,
+            num_inference_steps=30,
+            guidance_scale=3.5,
+            num_images_per_prompt=1,
+            width=resolution*4,
+            height=resolution*2,
+            output_type='np',
+            generator=generator
+        ).images
+    return image
+
+# lrm reconstructions
+def lrm_reconstructions(image, input_cameras, save_path=None, name="temp", export_texmap=False, if_save_video=False):
+    images = image.unsqueeze(0).to(device)
+    images = v2.functional.resize(images, 512, interpolation=3, antialias=True).clamp(0, 1)
+    # breakpoint()
+    with torch.no_grad():
+        # get triplane
+        planes = model.forward_planes(images, input_cameras)
+
+        mesh_path_idx = os.path.join(save_path, f'{name}.obj')
+
+        mesh_out = model.extract_mesh(
+            planes,
+            use_texture_map=export_texmap,
+            **infer_config,
+        )
+        if export_texmap:
+            vertices, faces, uvs, mesh_tex_idx, tex_map = mesh_out
+            save_obj_with_mtl(
+                vertices.data.cpu().numpy(),
+                uvs.data.cpu().numpy(),
+                faces.data.cpu().numpy(),
+                mesh_tex_idx.data.cpu().numpy(),
+                tex_map.permute(1, 2, 0).data.cpu().numpy(),
+                mesh_path_idx,
+            )
+        else:
+            vertices, faces, vertex_colors = mesh_out
+            save_obj(vertices, faces, vertex_colors, mesh_path_idx)
+        print(f"Mesh saved to {mesh_path_idx}")
+
+        render_size = 512
+        if if_save_video:
+            video_path_idx = os.path.join(save_path, f'{name}.mp4')
+            render_size = infer_config.render_resolution
+            ENV = load_mipmap("models/lrm/env_mipmap/6")
+            materials = (0.0, 0.9)
+
+            all_mv, all_mvp, all_campos = get_render_cameras_video(
+                batch_size=1,
+                M=240,
+                radius=4.5,
+                elevation=(90, 60.0),
+                is_flexicubes=True,
+                fov=30
+            )
+
+            frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals, alphas = render_frames(
+                model,
+                planes,
+                render_cameras=all_mvp,
+                camera_pos=all_campos,
+                env=ENV,
+                materials=materials,
+                render_size=render_size,
+                chunk_size=20,
+                is_flexicubes=True,
+            )
+            normals = (torch.nn.functional.normalize(normals) + 1) / 2
+            normals = normals * alphas + (1 - alphas)
+            all_frames = torch.cat([frames, albedos, pbr_spec_lights, pbr_diffuse_lights, normals], dim=3)
+
+            save_video(
+                all_frames,
+                video_path_idx,
+                fps=30,
+            )
+            print(f"Video saved to {video_path_idx}")
+
+    return vertices, faces
+
+
+def local_normal_global_transform(local_normal_images, azimuths_deg, elevations_deg):
+    if local_normal_images.min() >= 0:
+        local_normal = local_normal_images.float() * 2 - 1
+    else:
+        local_normal = local_normal_images.float()
+    global_normal = normal_transfer.trans_local_2_global(local_normal, azimuths_deg, elevations_deg, radius=4.5, for_lotus=False)
+    global_normal[..., 0] *= -1
+    global_normal = (global_normal + 1) / 2
+    global_normal = global_normal.permute(0, 3, 1, 2)
+    return global_normal
+
+def main(prompt="an owl wearing a hat."):
+    fix_prompt = 'a grid of 2x4 multi-view image. elevation 5. white background.'
+    # user prompt
+
+    save_dir_path = os.path.join(save_dir, prompt.split(".")[0].replace(" ", "_"))
+    os.makedirs(save_dir_path, exist_ok=True)
+    prompt = fix_prompt + " " + prompt
+    # generate multi-view images
+    rgb_normal_grid = multi_view_rgb_normal_generation(prompt)
+    # lrm reconstructions
+    images = torch.from_numpy(rgb_normal_grid).squeeze(0).permute(2, 0, 1).contiguous().float()  # (3, 1024, 2048)
+    images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=2, m=4)  # (8, 3, 512, 512)
+    rgb_multi_view = images[:4, :3, :, :]
+    normal_multi_view = images[4:, :3, :, :]
+    multi_view_mask = get_background(normal_multi_view)
+    rgb_multi_view = rgb_multi_view * multi_view_mask + (1 - multi_view_mask)  # composite the RGB views onto a white background
+    input_cameras = get_flux_input_cameras(batch_size=1, radius=4.2, fov=30).to(device)
+    vertices, faces = lrm_reconstructions(rgb_multi_view, input_cameras, save_path=save_dir_path, name='lrm', export_texmap=False, if_save_video=False)
+    # local normal to global normal
+
+    global_normal = local_normal_global_transform(normal_multi_view.permute(0, 2, 3, 1), isomer_azimuths, isomer_elevations)
+    global_normal = global_normal * multi_view_mask + (1 - multi_view_mask)
+
+    global_normal = global_normal.permute(0, 2, 3, 1)
+    rgb_multi_view = rgb_multi_view.permute(0, 2, 3, 1)
+    multi_view_mask = multi_view_mask.permute(0, 2, 3, 1).squeeze(-1)
+    vertices = torch.from_numpy(vertices).to(device)
+    faces = torch.from_numpy(faces).to(device)
+    vertices = vertices @ rotate_x(np.pi / 2, device=vertices.device)[:3, :3]
+    vertices = vertices @ rotate_y(np.pi / 2, device=vertices.device)[:3, :3]
+
+    # global_normal: B,H,W,3
+    # multi_view_mask: B,H,W
+    # rgb_multi_view: B,H,W,3
+
+    multi_view_mask_proj = mask_fix(multi_view_mask, erode_dilate=-6, blur=5)
+
+    meshes = reconstruction(
+        normal_pils=global_normal,
+        masks=multi_view_mask,
+        weights=isomer_geo_weights,
+        fov=30,
+        radius=isomer_radius,
+        camera_angles_azi=isomer_azimuths,
+        camera_angles_ele=isomer_elevations,
+        expansion_weight_stage1=0.1,
+        init_type="file",
+        init_verts=vertices,
+        init_faces=faces,
+        stage1_steps=0,
+        stage2_steps=50,
+        start_edge_len_stage1=0.1,
+        end_edge_len_stage1=0.02,
+        start_edge_len_stage2=0.02,
+        end_edge_len_stage2=0.005,
+    )
+
+
+    multi_view_mask_proj = mask_fix(multi_view_mask, erode_dilate=-10, blur=5)
+
+    save_glb_addr = projection(
+        meshes,
+        masks=multi_view_mask_proj,
+        images=rgb_multi_view,
+        azimuths=isomer_azimuths,
+        elevations=isomer_elevations,
+        weights=isomer_color_weights,
+        fov=30,
+        radius=isomer_radius,
+        save_dir=f"{save_dir_path}/ISOMER/",
+    )
+    print(f'saved to {save_glb_addr}')
+
+
+
+if __name__ == '__main__':
+    import time
+    start_time = time.time()
+    prompts = ["A red dragon soaring", "A running Chihuahua", "A dancing rabbit", "A girl with blue hair and white dress", "A teacher", "A tiger playing guitar", "A red rose", "A red peony", "A rose in a vase", "A golden retriever sitting", "A golden retriever running"]
+    for prompt in prompts:
+        main(prompt)
+    end_time = time.time()
+    print(f"Time taken: {end_time - start_time:.2f} seconds for {len(prompts)} prompts")
+
+    breakpoint()
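text_to_mesh_new.py expects the LoRA-tuned FLUX pipeline to return a 2x4 grid of width resolution*4 and height resolution*2 (2048x1024 at resolution=512), with the four RGB views in the top row and the four normal maps in the bottom row; the rearrange call then splits it into eight 512x512 tiles. A minimal sketch of that split on dummy data (illustration only, no model weights needed):

import torch
from einops import rearrange

resolution = 512
grid = torch.rand(3, 2 * resolution, 4 * resolution)               # stand-in for the generated 2x4 grid image
tiles = rearrange(grid, 'c (n h) (m w) -> (n m) c h w', n=2, m=4)  # row-major: top row first, then bottom row
rgb_views, normal_views = tiles[:4], tiles[4:]                     # matches the slicing in main()
print(tiles.shape, rgb_views.shape, normal_views.shape)            # (8,3,512,512) (4,3,512,512) (4,3,512,512)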
upload_huggingface.py
ADDED
@@ -0,0 +1,57 @@
+from huggingface_hub import HfApi, HfFolder, Repository, create_repo, upload_file
+import os
+
+# Log in to Hugging Face
+from huggingface_hub import login
+login()
+
+# Create or point to an existing repository
+repo_name = "xxx-ckpt"
+username = "LTT"
+repo_id = f"{username}/{repo_name}"
+
+# Create the repository (if it does not exist yet)
+create_repo(repo_id, exist_ok=True)
+
+# Folders
+# Upload an entire folder
+def upload_folder(folder_path, repo_id):
+    """
+    Recursively upload a folder and its contents to a Hugging Face repository.
+    """
+    for root, _, files in os.walk(folder_path):
+        for file in files:
+            # full path of the file on disk
+            full_file_path = os.path.join(root, file)
+            # path relative to the folder (preserves the folder structure)
+            relative_path = os.path.relpath(full_file_path, folder_path)
+
+            # upload the file to the repository
+            print(f"Uploading {relative_path}...")
+            upload_file(
+                path_or_fileobj=full_file_path,
+                path_in_repo=relative_path,
+                repo_id=repo_id
+            )
+            print(f"Uploaded {relative_path} successfully.")
+
+
+# Upload the model checkpoint
+model_path = "checkpoint/zero123++/flexgen_19w.ckpt"
+upload_file(path_or_fileobj=model_path, path_in_repo="flexgen_19w.ckpt", repo_id=repo_id)
+
+# # Upload a data archive
+# data_path = "/hpc2hdd/home/jlin695/data/env_map/data/env_mipmap_large.tar.gz"
+# upload_file(path_or_fileobj=data_path, path_in_repo="env_mipmap_large.tar.gz", repo_id=repo_id)
+
+# # Upload a data archive
+# data_path = "/hpc2hdd/home/jlin695/data/env_map/data/env_map_light_large.tar.gz"
+# upload_file(path_or_fileobj=data_path, path_in_repo="env_map_light_large.tar.gz", repo_id=repo_id)
+
+# # Folder to upload
+# folder_path = "checkpoint/flux_lora"
+
+# # Upload the whole folder
+# upload_folder(folder_path, repo_id)
+
+# print("Model and data files have been uploaded to Hugging Face.")