GLM

Runtime error

App Files Files Community

GLM / app.py

jorgejungle

Update app.py

29a0098 verified 4 months ago

raw

history blame contribute delete

No virus

12.2 kB

	import os
	import shlex
	import subprocess
	import tyro
	import imageio
	import numpy as np
	import tqdm
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torchvision.transforms.functional as TF
	from safetensors.torch import load_file
	import rembg
	import gradio as gr

	# download checkpoints
	from huggingface_hub import hf_hub_download
	ckpt_path = hf_hub_download(repo_id="ashawkey/LGM", filename="model_fp16.safetensors")

	subprocess.run(shlex.split("pip install wheel/diff_gaussian_rasterization-0.0.0-cp310-cp310-linux_x86_64.whl"))

	import kiui
	from kiui.op import recenter
	from kiui.cam import orbit_camera

	from core.options import AllConfigs, Options, config_defaults
	from core.models import LGM
	from convert import Converter
	from mvdream.pipeline_mvdream import MVDreamPipeline

	# import spaces

	IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
	IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
	GRADIO_VIDEO_PATH = 'gradio_output.mp4'
	GRADIO_PLY_PATH = 'gradio_output.ply'
	GRADIO_GLB_PATH = 'gradio_output.glb'

	# opt = tyro.cli(AllConfigs)
	opt = Options(
	input_size=256,
	up_channels=(1024, 1024, 512, 256, 128), # one more decoder
	up_attention=(True, True, True, False, False),
	splat_size=128,
	output_size=512, # render & supervise Gaussians at a higher resolution.
	batch_size=8,
	num_views=8,
	gradient_accumulation_steps=1,
	mixed_precision='bf16',
	resume=ckpt_path,
	)

	# model
	model = LGM(opt)

	# resume pretrained checkpoint
	if opt.resume is not None:
	if opt.resume.endswith('safetensors'):
	ckpt = load_file(opt.resume, device='cpu')
	else:
	ckpt = torch.load(opt.resume, map_location='cpu')
	model.load_state_dict(ckpt, strict=False)
	print(f'[INFO] Loaded checkpoint from {opt.resume}')
	else:
	print(f'[WARN] model randomly initialized, are you sure?')

	device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	model = model.half().to(device)
	model.eval()

	tan_half_fov = np.tan(0.5 * np.deg2rad(opt.fovy))
	proj_matrix = torch.zeros(4, 4, dtype=torch.float32).to(device)
	proj_matrix[0, 0] = 1 / tan_half_fov
	proj_matrix[1, 1] = 1 / tan_half_fov
	proj_matrix[2, 2] = (opt.zfar + opt.znear) / (opt.zfar - opt.znear)
	proj_matrix[3, 2] = - (opt.zfar * opt.znear) / (opt.zfar - opt.znear)
	proj_matrix[2, 3] = 1

	# load dreams
	pipe_text = MVDreamPipeline.from_pretrained(
	'ashawkey/mvdream-sd2.1-diffusers', # remote weights
	torch_dtype=torch.float16,
	trust_remote_code=True,
	# local_files_only=True,
	)
	pipe_text = pipe_text.to(device)

	pipe_image = MVDreamPipeline.from_pretrained(
	"ashawkey/imagedream-ipmv-diffusers", # remote weights
	torch_dtype=torch.float16,
	trust_remote_code=True,
	# local_files_only=True,
	)
	pipe_image = pipe_image.to(device)

	# load rembg
	bg_remover = rembg.new_session()

	# process function
	# @spaces.GPU
	def generate(input_image, prompt, prompt_neg='', input_elevation=0, input_num_steps=30, input_seed=42):

	# seed
	kiui.seed_everything(input_seed)

	os.makedirs(opt.workspace, exist_ok=True)
	output_video_path = os.path.join(opt.workspace, GRADIO_VIDEO_PATH)
	output_ply_path = os.path.join(opt.workspace, GRADIO_PLY_PATH)
	output_glb_path = os.path.join(opt.workspace, GRADIO_GLB_PATH)

	# text-conditioned
	if input_image is None:
	mv_image_uint8 = pipe_text(prompt, negative_prompt=prompt_neg, num_inference_steps=input_num_steps, guidance_scale=7.5, elevation=input_elevation)
	mv_image_uint8 = (mv_image_uint8 * 255).astype(np.uint8)
	# bg removal
	mv_image = []
	for i in range(4):
	image = rembg.remove(mv_image_uint8[i], session=bg_remover) # [H, W, 4]
	# to white bg
	image = image.astype(np.float32) / 255
	image = recenter(image, image[..., 0] > 0, border_ratio=0.2)
	image = image[..., :3] * image[..., -1:] + (1 - image[..., -1:])
	mv_image.append(image)
	# image-conditioned (may also input text, but no text usually works too)
	else:
	input_image = np.array(input_image) # uint8
	# bg removal
	carved_image = rembg.remove(input_image, session=bg_remover) # [H, W, 4]
	mask = carved_image[..., -1] > 0
	image = recenter(carved_image, mask, border_ratio=0.2)
	image = image.astype(np.float32) / 255.0
	image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])
	mv_image = pipe_image(prompt, image, negative_prompt=prompt_neg, num_inference_steps=input_num_steps, guidance_scale=5.0, elevation=input_elevation)

	mv_image_grid = np.concatenate([
	np.concatenate([mv_image[1], mv_image[2]], axis=1),
	np.concatenate([mv_image[3], mv_image[0]], axis=1),
	], axis=0)

	# generate gaussians
	input_image = np.stack([mv_image[1], mv_image[2], mv_image[3], mv_image[0]], axis=0) # [4, 256, 256, 3], float32
	input_image = torch.from_numpy(input_image).permute(0, 3, 1, 2).float().to(device) # [4, 3, 256, 256]
	input_image = F.interpolate(input_image, size=(opt.input_size, opt.input_size), mode='bilinear', align_corners=False)
	input_image = TF.normalize(input_image, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD)

	rays_embeddings = model.prepare_default_rays(device, elevation=input_elevation)
	input_image = torch.cat([input_image, rays_embeddings], dim=1).unsqueeze(0) # [1, 4, 9, H, W]

	with torch.no_grad():
	with torch.autocast(device_type='cuda', dtype=torch.float16):
	# generate gaussians
	gaussians = model.forward_gaussians(input_image)

	# save gaussians
	model.gs.save_ply(gaussians, output_ply_path)

	# render 360 video
	images = []
	elevation = 0
	if opt.fancy_video:
	azimuth = np.arange(0, 720, 4, dtype=np.int32)
	for azi in tqdm.tqdm(azimuth):

	cam_poses = torch.from_numpy(orbit_camera(elevation, azi, radius=opt.cam_radius, opengl=True)).unsqueeze(0).to(device)

	cam_poses[:, :3, 1:3] *= -1 # invert up & forward direction

	# cameras needed by gaussian rasterizer
	cam_view = torch.inverse(cam_poses).transpose(1, 2) # [V, 4, 4]
	cam_view_proj = cam_view @ proj_matrix # [V, 4, 4]
	cam_pos = - cam_poses[:, :3, 3] # [V, 3]

	scale = min(azi / 360, 1)

	image = model.gs.render(gaussians, cam_view.unsqueeze(0), cam_view_proj.unsqueeze(0), cam_pos.unsqueeze(0), scale_modifier=scale)['image']
	images.append((image.squeeze(1).permute(0,2,3,1).contiguous().float().cpu().numpy() * 255).astype(np.uint8))
	else:
	azimuth = np.arange(0, 360, 2, dtype=np.int32)
	for azi in tqdm.tqdm(azimuth):

	cam_poses = torch.from_numpy(orbit_camera(elevation, azi, radius=opt.cam_radius, opengl=True)).unsqueeze(0).to(device)

	cam_poses[:, :3, 1:3] *= -1 # invert up & forward direction

	# cameras needed by gaussian rasterizer
	cam_view = torch.inverse(cam_poses).transpose(1, 2) # [V, 4, 4]
	cam_view_proj = cam_view @ proj_matrix # [V, 4, 4]
	cam_pos = - cam_poses[:, :3, 3] # [V, 3]

	image = model.gs.render(gaussians, cam_view.unsqueeze(0), cam_view_proj.unsqueeze(0), cam_pos.unsqueeze(0), scale_modifier=1)['image']
	images.append((image.squeeze(1).permute(0,2,3,1).contiguous().float().cpu().numpy() * 255).astype(np.uint8))

	images = np.concatenate(images, axis=0)
	imageio.mimwrite(output_video_path, images, fps=30)

	return mv_image_grid, output_video_path, output_ply_path

	def convert(output_ply_path):
	if not os.path.exists(output_ply_path):
	gr.Warning("PLY file not found please upload or generate 3D model")

	# load a saved ply and convert to mesh
	opt.test_path = output_ply_path
	opt.force_cuda_rast = True # container compatability

	converter = Converter(opt).cuda()
	converter.fit_nerf()
	converter.fit_mesh()
	converter.fit_mesh_uv()
	converter.export_mesh(opt.test_path.replace('.ply', '.glb'))
	return output_glb_path

	# gradio UI

	_TITLE = '''LGM: Large Multi-View Gaussian Model for High-Resolution 3D Content Creation'''

	_DESCRIPTION = '''
	<div>
	<a style="display:inline-block" href="https://me.kiui.moe/lgm/"><img src='https://img.shields.io/badge/public_website-8A2BE2'></a>
	<a style="display:inline-block; margin-left: .5em" href="https://github.com/3DTopia/LGM"><img src='https://img.shields.io/github/stars/3DTopia/LGM?style=social'/></a>
	</div>

	* Input can be only text, only image, or both image and text.
	* Output is a `ply` file containing the 3D Gaussians, please check our [repo](https://github.com/3DTopia/LGM/blob/main/readme.md) for visualization and mesh conversion.
	* If you find the output unsatisfying, try using different seeds!
	'''

	block = gr.Blocks(title=_TITLE).queue()
	with block:
	with gr.Row():
	with gr.Column(scale=1):
	gr.Markdown('# ' + _TITLE)
	gr.Markdown(_DESCRIPTION)

	with gr.Row(variant='panel'):
	with gr.Column(scale=1):
	# input image
	input_image = gr.Image(label="image", type='pil')
	# input prompt
	input_text = gr.Textbox(label="prompt")
	# negative prompt
	input_neg_text = gr.Textbox(label="negative prompt", value='ugly, blurry, pixelated obscure, unnatural colors, poor lighting, dull, unclear, cropped, lowres, low quality, artifacts, duplicate')
	# elevation
	input_elevation = gr.Slider(label="elevation", minimum=-90, maximum=90, step=1, value=0)
	# inference steps
	input_num_steps = gr.Slider(label="inference steps", minimum=1, maximum=100, step=1, value=30)
	# random seed
	input_seed = gr.Slider(label="random seed", minimum=0, maximum=100000, step=1, value=0)
	# gen button
	button_gen = gr.Button("Generate")


	with gr.Column(scale=1):
	with gr.Tab("Video and 3D Model"):
	# final video results
	output_video = gr.Video(label="video")
	# ply file
	output_file = gr.Model3D(label="3D Gaussians (ply format)")
	gr.Markdown("Note: Downloaded object will be flipped in case of .ply export. Export .glb instead or manually flip it before usage.")
	with gr.Tab("Multi-view Image"):
	# multi-view results
	output_image = gr.Image(interactive=False, show_label=False)
	with gr.Tab("Convert Format"):
	# multi-view results
	output_glb = gr.Model3D(label="3D Mesh (glb format)")
	gr.Markdown("Note: The model shown here has a darker appearance. Download to get correct results.")
	button_con = gr.Button("Convert to Mesh")

	button_gen.click(generate, inputs=[input_image, input_text, input_neg_text, input_elevation, input_num_steps, input_seed], outputs=[output_image, output_video, output_file])
	button_con.click(convert, inputs=output_file, outputs=output_glb)

	gr.Examples(
	examples=[
	"data_test/frog_sweater.jpg",
	"data_test/bird.jpg",
	"data_test/boy.jpg",
	"data_test/cat_statue.jpg",
	"data_test/dragontoy.jpg",
	"data_test/gso_rabbit.jpg",
	],
	inputs=[input_image],
	outputs=[output_image, output_video, output_file],
	fn=lambda x: generate(input_image=x, prompt=''),
	cache_examples=True,
	label='Image-to-3D Examples'
	)

	gr.Examples(
	examples=[
	"teddy bear",
	"hamburger",
	"oldman's head sculpture",
	"headphone",
	"motorbike",
	"mech suit"
	],
	inputs=[input_text],
	outputs=[output_image, output_video, output_file],
	fn=lambda x: generate(input_image=None, prompt=x),
	cache_examples=True,
	label='Text-to-3D Examples'
	)

	block.launch()