GLM / app.py
jorgejungle's picture
Update app.py
29a0098 verified
raw
history blame contribute delete
No virus
12.2 kB
import os
import shlex
import subprocess
import tyro
import imageio
import numpy as np
import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms.functional as TF
from safetensors.torch import load_file
import rembg
import gradio as gr
# download checkpoints
from huggingface_hub import hf_hub_download

# Fetch the fp16 LGM weights; the resulting value is passed to Options(resume=...)
# and loaded further down in this file.
ckpt_path = hf_hub_download(repo_id="ashawkey/LGM", filename="model_fp16.safetensors")

# Install the bundled rasterizer wheel at startup. This runs before the
# project imports below (presumably because they depend on it — verify).
_wheel_install = subprocess.run(shlex.split("pip install wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl"))
if _wheel_install.returncode != 0:
    # Stay best-effort (the package may already be installed), but make the
    # failure visible instead of letting it show up later as an import error.
    print(f'[WARN] pip install of the rasterizer wheel exited with code {_wheel_install.returncode}')
import kiui
from kiui.op import recenter
from kiui.cam import orbit_camera
from core.options import AllConfigs, Options, config_defaults
from core.models import LGM
from convert import Converter
from mvdream.pipeline_mvdream import MVDreamPipeline
# import spaces
# Per-channel mean / std applied to the network input in generate().
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)

# File names of the artifacts generate() writes inside opt.workspace.
GRADIO_VIDEO_PATH = 'gradio_output.mp4'
GRADIO_PLY_PATH = 'gradio_output.ply'
GRADIO_GLB_PATH = 'gradio_output.glb'

# Options are fixed here instead of being parsed from the command line
# (the CLI variant would be: opt = tyro.cli(AllConfigs)).
_OPTION_OVERRIDES = dict(
    input_size=256,
    up_channels=(1024, 1024, 512, 256, 128),  # one more decoder
    up_attention=(True, True, True, False, False),
    splat_size=128,
    output_size=512,  # render & supervise Gaussians at a higher resolution.
    batch_size=8,
    num_views=8,
    gradient_accumulation_steps=1,
    mixed_precision='bf16',
    resume=ckpt_path,
)
opt = Options(**_OPTION_OVERRIDES)
# model
model = LGM(opt)

# resume pretrained checkpoint
if opt.resume is None:
    print('[WARN] model randomly initialized, are you sure?')
else:
    if opt.resume.endswith('safetensors'):
        ckpt = load_file(opt.resume, device='cpu')
    else:
        # NOTE(review): torch.load unpickles the file; only point opt.resume
        # at checkpoints from a trusted source.
        ckpt = torch.load(opt.resume, map_location='cpu')

    # strict=False tolerates key mismatches between checkpoint and model.
    # Report them so a partially loaded model does not go unnoticed.
    _load_result = model.load_state_dict(ckpt, strict=False)
    print(f'[INFO] Loaded checkpoint from {opt.resume}')
    if _load_result.missing_keys or _load_result.unexpected_keys:
        print(
            f'[WARN] checkpoint/model key mismatch: '
            f'{len(_load_result.missing_keys)} missing, '
            f'{len(_load_result.unexpected_keys)} unexpected'
        )

# Run in half precision on the GPU when one is available, otherwise on CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.half().to(device)
model.eval()
# Projection matrix built once from opt.fovy / opt.znear / opt.zfar and
# multiplied onto every camera view matrix in generate().
tan_half_fov = np.tan(0.5 * np.deg2rad(opt.fovy))
_depth_span = opt.zfar - opt.znear

proj_matrix = torch.zeros(4, 4, dtype=torch.float32, device=device)
# Same focal term on both image axes.
proj_matrix[0, 0] = proj_matrix[1, 1] = 1 / tan_half_fov
# Depth mapping terms.
proj_matrix[2, 2] = (opt.zfar + opt.znear) / _depth_span
proj_matrix[3, 2] = -(opt.zfar * opt.znear) / _depth_span
proj_matrix[2, 3] = 1
def _load_mvdream_pipeline(repo_id):
    """Load an MVDreamPipeline from `repo_id` in fp16 and move it to `device`."""
    pipeline = MVDreamPipeline.from_pretrained(
        repo_id,  # remote weights
        torch_dtype=torch.float16,
        trust_remote_code=True,
        # local_files_only=True,
    )
    return pipeline.to(device)


# load dreams
# Text-conditioned multi-view pipeline (used when no input image is given).
pipe_text = _load_mvdream_pipeline('ashawkey/mvdream-sd2.1-diffusers')
# Image-conditioned multi-view pipeline.
pipe_image = _load_mvdream_pipeline("ashawkey/imagedream-ipmv-diffusers")

# load rembg
# One background-removal session, reused for every request.
bg_remover = rembg.new_session()
def _render_orbit_frame(gaussians, azimuth, scale_modifier, elevation=0):
    """Render the Gaussians from a single orbit-camera pose.

    Args:
        gaussians: result of ``model.forward_gaussians``.
        azimuth: azimuth handed to ``orbit_camera`` (presumably degrees — confirm).
        scale_modifier: forwarded unchanged to ``model.gs.render``.
        elevation: elevation handed to ``orbit_camera``.

    Returns:
        The rendered frame(s) as a uint8 numpy array with channels last
        (assumes the renderer returns [1, V, 3, H, W] in [0, 1] — TODO confirm).
    """
    cam_poses = torch.from_numpy(
        orbit_camera(elevation, azimuth, radius=opt.cam_radius, opengl=True)
    ).unsqueeze(0).to(device)
    cam_poses[:, :3, 1:3] *= -1  # invert up & forward direction

    # cameras needed by gaussian rasterizer
    cam_view = torch.inverse(cam_poses).transpose(1, 2)  # [V, 4, 4]
    cam_view_proj = cam_view @ proj_matrix  # [V, 4, 4]
    cam_pos = - cam_poses[:, :3, 3]  # [V, 3]

    image = model.gs.render(
        gaussians,
        cam_view.unsqueeze(0),
        cam_view_proj.unsqueeze(0),
        cam_pos.unsqueeze(0),
        scale_modifier=scale_modifier,
    )['image']
    return (image.squeeze(1).permute(0, 2, 3, 1).contiguous().float().cpu().numpy() * 255).astype(np.uint8)


# process function
# @spaces.GPU
def generate(input_image, prompt, prompt_neg='', input_elevation=0, input_num_steps=30, input_seed=42):
    """Generate multi-view images, 3D Gaussians and a turntable video.

    Args:
        input_image: conditioning image (anything ``np.array`` accepts, e.g. a
            PIL image) or None to run text-only generation.
        prompt: text prompt passed to the diffusion pipeline.
        prompt_neg: negative prompt passed to the diffusion pipeline.
        input_elevation: elevation passed to the pipeline and to
            ``model.prepare_default_rays``.
        input_num_steps: number of diffusion inference steps.
        input_seed: seed passed to ``kiui.seed_everything``.

    Returns:
        Tuple ``(mv_image_grid, output_video_path, output_ply_path)``: a 2x2
        grid of the four generated views, and the paths of the written video
        and PLY files inside ``opt.workspace``.
    """
    # seed
    kiui.seed_everything(input_seed)

    os.makedirs(opt.workspace, exist_ok=True)
    output_video_path = os.path.join(opt.workspace, GRADIO_VIDEO_PATH)
    output_ply_path = os.path.join(opt.workspace, GRADIO_PLY_PATH)

    # text-conditioned
    if input_image is None:
        mv_image_uint8 = pipe_text(prompt, negative_prompt=prompt_neg, num_inference_steps=input_num_steps, guidance_scale=7.5, elevation=input_elevation)
        mv_image_uint8 = (mv_image_uint8 * 255).astype(np.uint8)
        # bg removal
        mv_image = []
        for i in range(4):
            image = rembg.remove(mv_image_uint8[i], session=bg_remover)  # [H, W, 4]
            # to white bg
            image = image.astype(np.float32) / 255
            # Use the alpha channel as the foreground mask, as the image branch
            # does; a colour channel can be zero on valid foreground pixels.
            image = recenter(image, image[..., -1] > 0, border_ratio=0.2)
            image = image[..., :3] * image[..., -1:] + (1 - image[..., -1:])
            mv_image.append(image)
    # image-conditioned (may also input text, but no text usually works too)
    else:
        input_image = np.array(input_image)  # uint8
        # bg removal
        carved_image = rembg.remove(input_image, session=bg_remover)  # [H, W, 4]
        mask = carved_image[..., -1] > 0
        image = recenter(carved_image, mask, border_ratio=0.2)
        image = image.astype(np.float32) / 255.0
        # composite onto a white background
        image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])
        mv_image = pipe_image(prompt, image, negative_prompt=prompt_neg, num_inference_steps=input_num_steps, guidance_scale=5.0, elevation=input_elevation)

    # 2x2 preview grid; the view order matches the stack fed to the model below
    mv_image_grid = np.concatenate([
        np.concatenate([mv_image[1], mv_image[2]], axis=1),
        np.concatenate([mv_image[3], mv_image[0]], axis=1),
    ], axis=0)

    # generate gaussians
    input_image = np.stack([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=0)  # [4, 256, 256, 3], float32
    input_image = torch.from_numpy(input_image).permute(0, 3, 1, 2).float().to(device)  # [4, 3, 256, 256]
    input_image = F.interpolate(input_image, size=(opt.input_size, opt.input_size), mode='bilinear', align_corners=False)
    input_image = TF.normalize(input_image, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)

    rays_embeddings = model.prepare_default_rays(device, elevation=input_elevation)
    input_image = torch.cat([input_image, rays_embeddings], dim=1).unsqueeze(0)  # [1, 4, 9, H, W]

    with torch.no_grad():
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            # generate gaussians
            gaussians = model.forward_gaussians(input_image)

        # save gaussians
        model.gs.save_ply(gaussians, output_ply_path)

        # render 360 video
        fancy = opt.fancy_video
        if fancy:
            # two turns in 4-step increments; the splat scale ramps up to 1
            # over the first turn
            azimuth = np.arange(0, 720, 4, dtype=np.int32)
        else:
            azimuth = np.arange(0, 360, 2, dtype=np.int32)

        images = []
        for azi in tqdm.tqdm(azimuth):
            scale = min(azi / 360, 1) if fancy else 1
            images.append(_render_orbit_frame(gaussians, azi, scale))

        images = np.concatenate(images, axis=0)
        imageio.mimwrite(output_video_path, images, fps=30)

    return mv_image_grid, output_video_path, output_ply_path
def convert(output_ply_path):
    """Convert a saved Gaussian PLY file into a GLB mesh.

    Args:
        output_ply_path: path of the PLY file to convert; may be None or
            point to a missing file when nothing has been generated yet.

    Returns:
        Path of the exported ``.glb`` file (same location and base name as
        the PLY), or None when no PLY file is available.
    """
    if not output_ply_path or not os.path.exists(output_ply_path):
        gr.Warning("PLY file not found please upload or generate 3D model")
        # Nothing to convert: stop here instead of running the converter.
        return None

    # Swap only the extension; this name is local to convert().
    output_glb_path = os.path.splitext(output_ply_path)[0] + '.glb'

    # load a saved ply and convert to mesh
    opt.test_path = output_ply_path
    opt.force_cuda_rast = True  # container compatibility
    converter = Converter(opt).cuda()
    converter.fit_nerf()
    converter.fit_mesh()
    converter.fit_mesh_uv()
    converter.export_mesh(output_glb_path)
    return output_glb_path
# gradio UI
# Page title; also rendered as the top-level heading.
_TITLE = '''LGM: Large Multi-View Gaussian Model for High-Resolution 3D Content Creation'''

# Markdown shown under the title. The blank line after </div> matters: a
# Markdown HTML block runs until the next blank line, so without it the
# bullets would be treated as part of the <div> block instead of a list.
_DESCRIPTION = '''
<div>
<a style="display:inline-block" href="https://me.kiui.moe/lgm/"><img src='https://img.shields.io/badge/public_website-8A2BE2'></a>
<a style="display:inline-block; margin-left: .5em" href="https://github.com/3DTopia/LGM"><img src='https://img.shields.io/github/stars/3DTopia/LGM?style=social'/></a>
</div>

* Input can be only text, only image, or both image and text.
* Output is a `ply` file containing the 3D Gaussians, please check our [repo](https://github.com/3DTopia/LGM/blob/main/readme.md) for visualization and mesh conversion.
* If you find the output unsatisfying, try using different seeds!
'''
# Example inputs offered below the main panel.
_IMAGE_EXAMPLES = [
    "data_test/frog_sweater.jpg",
    "data_test/bird.jpg",
    "data_test/boy.jpg",
    "data_test/cat_statue.jpg",
    "data_test/dragontoy.jpg",
    "data_test/gso_rabbit.jpg",
]
_TEXT_EXAMPLES = [
    "teddy bear",
    "hamburger",
    "oldman's head sculpture",
    "headphone",
    "motorbike",
    "mech suit"
]

block = gr.Blocks(title=_TITLE).queue()
with block:
    # header: title and description
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown('# ' + _TITLE)
    gr.Markdown(_DESCRIPTION)

    with gr.Row(variant='panel'):
        # left column: generation inputs
        with gr.Column(scale=1):
            # input image
            input_image = gr.Image(label="image", type='pil')
            # input prompt
            input_text = gr.Textbox(label="prompt")
            # negative prompt
            input_neg_text = gr.Textbox(label="negative prompt", value='ugly, blurry, pixelated obscure, unnatural colors, poor lighting, dull, unclear, cropped, lowres, low quality, artifacts, duplicate')
            # elevation
            input_elevation = gr.Slider(label="elevation", minimum=-90, maximum=90, step=1, value=0)
            # inference steps
            input_num_steps = gr.Slider(label="inference steps", minimum=1, maximum=100, step=1, value=30)
            # random seed
            input_seed = gr.Slider(label="random seed", minimum=0, maximum=100000, step=1, value=0)
            # gen button
            button_gen = gr.Button("Generate")

        # right column: results, one tab per output kind
        with gr.Column(scale=1):
            with gr.Tab("Video and 3D Model"):
                # final video results
                output_video = gr.Video(label="video")
                # ply file
                output_file = gr.Model3D(label="3D Gaussians (ply format)")
                gr.Markdown("Note: Downloaded object will be flipped in case of .ply export. Export .glb instead or manually flip it before usage.")
            with gr.Tab("Multi-view Image"):
                # multi-view results
                output_image = gr.Image(interactive=False, show_label=False)
            with gr.Tab("Convert Format"):
                # converted mesh
                output_glb = gr.Model3D(label="3D Mesh (glb format)")
                gr.Markdown("Note: The model shown here has a darker appearance. Download to get correct results.")
                button_con = gr.Button("Convert to Mesh")

        # wire the buttons to their handlers
        generation_inputs = [input_image, input_text, input_neg_text, input_elevation, input_num_steps, input_seed]
        button_gen.click(generate, inputs=generation_inputs, outputs=[output_image, output_video, output_file])
        button_con.click(convert, inputs=output_file, outputs=output_glb)

    # image-only examples: run generate() with an empty prompt
    gr.Examples(
        examples=_IMAGE_EXAMPLES,
        inputs=[input_image],
        outputs=[output_image, output_video, output_file],
        fn=lambda x: generate(input_image=x, prompt=''),
        cache_examples=True,
        label='Image-to-3D Examples'
    )

    # text-only examples: run generate() without an image
    gr.Examples(
        examples=_TEXT_EXAMPLES,
        inputs=[input_text],
        outputs=[output_image, output_video, output_file],
        fn=lambda x: generate(input_image=None, prompt=x),
        cache_examples=True,
        label='Text-to-3D Examples'
    )

block.launch()