Spaces:

TencentARC
/

FreeSplatter

Running on Zero

App Files Files Community

FreeSplatter / freesplatter /webui /runner.py

bluestyle97

Upload 147 files

184193d verified 11 days ago

raw

history blame

23.5 kB

	import os
	import json
	import uuid
	import time
	import numpy as np
	import torch
	import fpsample
	import fast_simplification
	import matplotlib.pyplot as plt
	cmap = plt.get_cmap("hsv")
	from torchvision.transforms import v2
	from pytorch_lightning import seed_everything
	from PIL import Image
	from omegaconf import OmegaConf
	from einops import rearrange
	from scipy.spatial.transform import Rotation
	from safetensors import safe_open
	from huggingface_hub import hf_hub_download

	from transformers import AutoModelForImageSegmentation
	from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler

	from freesplatter.hunyuan.hunyuan3d_mvd_std_pipeline import HunYuan3D_MVD_Std_Pipeline
	from freesplatter.utils.mesh import Mesh
	from freesplatter.utils.mesh_renderer import MeshRenderer
	from freesplatter.utils.camera_util import *
	from freesplatter.utils.recon_util import *
	from freesplatter.utils.infer_util import *
	from freesplatter.webui.camera_viewer.visualizer import CameraVisualizer


	def inv_sigmoid(x: torch.Tensor) -> torch.Tensor:
	    """Return the element-wise inverse of the sigmoid (the logit) of ``x``.

	    Evaluated as ``log(x / (1 - x))``. No clamping is applied, so inputs of
	    exactly 0 or 1 produce infinities.
	    """
	    odds = x / (1.0 - x)
	    return odds.log()


	def save_gaussian(latent, gs_vis_path, model, opacity_threshold=None, pad_2dgs_scale=True):
	    """Export predicted Gaussians to a PLY file for visualization.

	    Args:
	        latent: Per-Gaussian parameter tensor; a leading batch axis (3-D
	            input) is dropped by taking element 0. The last axis is split
	            into position (3), colour features (``model.sh_dim``),
	            opacity (1), scale (2 for 2DGS, otherwise 3) and rotation (4).
	        gs_vis_path: Destination path handed to ``save_ply_vis``.
	        model: Object providing ``sh_dim``, ``use_2dgs`` and
	            ``gs_renderer.gaussian_model``.
	        opacity_threshold: When not ``None``, only Gaussians whose
	            ``sigmoid(opacity)`` exceeds this value are kept.
	        pad_2dgs_scale: When true and the scale has two channels, append a
	            third scale channel before saving.
	    """
	    if latent.ndim == 3:
	        latent = latent[0]

	    sh_dim = model.sh_dim
	    num_scale_channels = 2 if model.use_2dgs else 3
	    xyz, features, opacity, scaling, rotation = latent.split(
	        [3, sh_dim, 1, num_scale_channels, 4], dim=-1)
	    features = features.reshape(features.shape[0], sh_dim // 3, 3)

	    if opacity_threshold is not None:
	        # Row indices of the Gaussians that pass the opacity test.
	        keep = torch.nonzero(opacity.sigmoid() > opacity_threshold)[:, 0]
	        xyz, features, opacity, scaling, rotation = (
	            attr[keep] for attr in (xyz, features, opacity, scaling, rotation)
	        )

	    # Transform gaussians from the reference view to the world frame.
	    cam2world = create_camera_to_world(torch.tensor([0, -2, 0]), camera_system='opencv').to(latent)
	    rot_c2w = cam2world[:3, :3]
	    trans_c2w = cam2world[:3, 3].reshape(1, 3)
	    xyz = xyz @ rot_c2w.T + trans_c2w

	    # The column shuffles move the first quaternion component to the end for
	    # scipy and back afterwards (presumably w-first storage -- TODO confirm).
	    quat_in = rotation.detach().cpu().numpy()
	    rot_local = Rotation.from_quat(quat_in[:, [1, 2, 3, 0]]).as_matrix()
	    rot_world = rot_c2w.detach().cpu().numpy() @ rot_local
	    quat_out = Rotation.from_matrix(rot_world).as_quat()[:, [3, 0, 1, 2]]
	    rotation = torch.from_numpy(quat_out).to(latent)

	    # Pad 2DGS with an additional z-scale for visualization.
	    if pad_2dgs_scale and scaling.shape[-1] == 2:
	        z_scaling = inv_sigmoid(torch.ones_like(scaling[:, :1]) * 0.001)
	        scaling = torch.cat([scaling, z_scaling], dim=-1)

	    pc_vis = model.gs_renderer.gaussian_model.set_data(
	        xyz.float(), features.float(), scaling.float(), rotation.float(), opacity.float())
	    pc_vis.save_ply_vis(gs_vis_path)


	class FreeSplatterRunner:
	    """Hold the models used by the FreeSplatter web UI and run its pipelines.

	    The constructor loads a background remover, three multi-view diffusion
	    pipelines, three FreeSplatter checkpoints (object, object-2dgs, scene)
	    and a mesh renderer onto ``device``.

	    Every top-level ``run_*`` entry point creates a fresh output directory
	    and stores it in ``self.output_dir``; ``run_freesplatter_object`` and
	    ``run_freesplatter_scene`` write their artifacts there.
	    """

	    def __init__(self, device):
	        """Load all models and move them to ``device``."""
	        self.device = device

	        # background remover
	        self.rembg = AutoModelForImageSegmentation.from_pretrained(
	            "briaai/RMBG-2.0",
	            trust_remote_code=True,
	            cache_dir='ckpts/',
	        ).to(device)
	        self.rembg.eval()

	        # diffusion models
	        self.zero123plus_v11 = self._load_zero123plus("sudo-ai/zero123plus-v1.1", device)
	        self.zero123plus_v12 = self._load_zero123plus("sudo-ai/zero123plus-v1.2", device)

	        pipeline = HunYuan3D_MVD_Std_Pipeline.from_pretrained(
	            './ckpts/Hunyuan3D-1/mvd_std',
	            torch_dtype=torch.float16,
	            use_safetensors=True,
	        )
	        self.hunyuan3d_mvd_std = pipeline.to(device)

	        # freesplatter
	        self.freesplatter = self._load_freesplatter(
	            'configs/freesplatter-object.yaml', 'freesplatter-object.safetensors', device)
	        self.freesplatter_2dgs = self._load_freesplatter(
	            'configs/freesplatter-object-2dgs.yaml', 'freesplatter-object-2dgs.safetensors', device)
	        self.freesplatter_scene = self._load_freesplatter(
	            'configs/freesplatter-scene.yaml', 'freesplatter-scene.safetensors', device)

	        # mesh optimizer
	        self.mesh_renderer = MeshRenderer(
	            near=0.01,
	            far=100,
	            ssaa=1,
	            texture_filter='linear-mipmap-linear').to(device)

	    @staticmethod
	    def _load_zero123plus(repo_id, device):
	        """Load a Zero123++ pipeline in fp16 and swap in the Euler-ancestral scheduler."""
	        pipeline = DiffusionPipeline.from_pretrained(
	            repo_id,
	            custom_pipeline="sudo-ai/zero123plus-pipeline",
	            torch_dtype=torch.float16,
	            cache_dir="ckpts/",
	        )
	        pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
	            pipeline.scheduler.config, timestep_spacing='trailing'
	        )
	        return pipeline.to(device)

	    @staticmethod
	    def _load_freesplatter(config_file, ckpt_name, device):
	        """Download one FreeSplatter checkpoint, build the model and load its weights."""
	        ckpt_path = hf_hub_download(
	            'TencentARC/FreeSplatter',
	            repo_type='model',
	            filename=ckpt_name,
	            local_dir='./ckpts/FreeSplatter',
	        )
	        model = instantiate_from_config(OmegaConf.load(config_file).model)
	        with safe_open(ckpt_path, framework="pt", device="cpu") as f:
	            state_dict = {key: f.get_tensor(key) for key in f.keys()}
	        model.load_state_dict(state_dict, strict=True)
	        return model.eval().to(device)

	    def _make_output_dir(self, cache_dir):
	        """Create a unique ``output_<uuid>`` directory and store it in ``self.output_dir``.

	        ``cache_dir=None`` (the default of the public entry points) falls back
	        to the system temporary directory instead of failing in
	        ``os.path.join``.
	        """
	        import tempfile

	        root = tempfile.gettempdir() if cache_dir is None else cache_dir
	        self.output_dir = os.path.join(root, f'output_{uuid.uuid4()}')
	        os.makedirs(self.output_dir, exist_ok=True)
	        return self.output_dir

	    @staticmethod
	    def _normalized_intrinsics(focals_pred, num_views, device):
	        """Build per-view ``[fx, fy, cx, cy]`` rows from the mean predicted focal.

	        The focal is divided by 512 (the working image size used throughout
	        this class) and the principal point is fixed at 0.5.

	        Returns:
	            ``(fx, fxfycxcy)`` where ``fxfycxcy`` has ``num_views`` identical rows.
	        """
	        fx = fy = focals_pred.mean() / 512.0
	        cx = cy = torch.ones_like(fx) * 0.5
	        fxfycxcy = torch.tensor([fx, fy, cx, cy]).unsqueeze(0).repeat(num_views, 1).to(device)
	        return fx, fxfycxcy

	    @staticmethod
	    def _build_camera_figure(images, c2ws, focal_length, legends, figure_arg):
	        """Create the camera-visualization figure shared by object and scene modes.

	        Args:
	            images: Image tensor, permuted here from channel-first to channel-last.
	            c2ws: NumPy array of camera-to-world matrices, already converted
	                for the visualizer.
	            focal_length: Tensor of focal lengths; only its mean is used.
	            legends: One label per image, or ``None`` for no labels.
	            figure_arg: First positional argument forwarded to
	                ``CameraVisualizer.update_figure``.
	        """
	        images = (images.permute(0, 2, 3, 1).detach().cpu().numpy() * 255).astype(np.uint8)

	        focal_length = focal_length.mean().detach().cpu().numpy()
	        # Field of view for a 512-pixel image (half size 256).
	        fov = np.rad2deg(np.arctan(256.0 / focal_length)) * 2

	        colors = [cmap(i / len(images))[:3] for i in range(len(images))]
	        legends = [None] * len(images) if legends is None else legends

	        viz = CameraVisualizer(c2ws, legends, colors, images=images)
	        return viz.update_figure(
	            figure_arg,
	            height=320,
	            line_width=5,
	            base_radius=1,
	            zoom_scale=1,
	            fov_deg=fov,
	            show_grid=True,
	            show_ticklabels=True,
	            show_background=True,
	            y_up=False,
	        )

	    @torch.inference_mode()
	    def run_segmentation(
	        self,
	        image,
	        do_rembg=True,
	    ):
	        """Return ``image`` with its background removed when ``do_rembg`` is true."""
	        torch.cuda.empty_cache()

	        if do_rembg:
	            image = remove_background(image, self.rembg)

	        return image

	    def run_img_to_3d(
	        self,
	        image_rgba,
	        model='Zero123++ v1.2',
	        diffusion_steps=30,
	        guidance_scale=4.0,
	        seed=42,
	        view_indices=None,
	        gs_type='2DGS',
	        mesh_reduction=0.5,
	        cache_dir=None,
	    ):
	        """Generate multi-view images from one image and reconstruct a 3D object.

	        Args:
	            image_rgba: Input image passed to ``resize_foreground``.
	            model: ``'Zero123++ v1.1'``, ``'Zero123++ v1.2'`` or ``'Hunyuan3D Std'``.
	            diffusion_steps: Number of inference steps for the diffusion pipeline.
	            guidance_scale: Guidance scale for the diffusion pipeline.
	            seed: Seed passed to ``seed_everything`` before sampling.
	            view_indices: Indices into the 7-view stack (0 is the input view,
	                1-6 the generated views). ``None`` or an empty list selects
	                views 1-6.
	            gs_type: ``'2DGS'`` selects the 2DGS model, anything else the other
	                object model.
	            mesh_reduction: Minimum mesh simplification ratio.
	            cache_dir: Parent of the output directory; ``None`` uses the system
	                temporary directory.

	        Returns:
	            ``(images_vis, gs_vis_path, video_path, mesh_fine_path, fig)``.

	        Raises:
	            ValueError: If ``model`` is not one of the supported names.
	        """
	        torch.cuda.empty_cache()

	        self._make_output_dir(cache_dir)

	        # image-to-multiview
	        input_image = resize_foreground(image_rgba, 0.9)
	        seed_everything(seed)
	        if model == 'Zero123++ v1.1':
	            output_image = self.zero123plus_v11(
	                input_image,
	                num_inference_steps=diffusion_steps,
	                guidance_scale=guidance_scale,
	            ).images[0]
	        elif model == 'Zero123++ v1.2':
	            output_image = self.zero123plus_v12(
	                input_image,
	                num_inference_steps=diffusion_steps,
	                guidance_scale=guidance_scale,
	            ).images[0]
	        elif model == 'Hunyuan3D Std':
	            output_image = self.hunyuan3d_mvd_std(
	                input_image,
	                num_inference_steps=diffusion_steps,
	                guidance_scale=guidance_scale,
	                guidance_curve=lambda t: 2.0,
	            ).images[0]
	        else:
	            raise ValueError(f'Unknown model: {model}')

	        # preprocess images
	        image, alpha = rgba_to_white_background(input_image)
	        image = v2.functional.resize(image, 512, interpolation=3, antialias=True).clamp(0, 1)
	        alpha = v2.functional.resize(alpha, 512, interpolation=0, antialias=True).clamp(0, 1)

	        output_image_rgba = remove_background(output_image, self.rembg)
	        if 'Zero123++' in model:
	            images, alphas = rgba_to_white_background(output_image_rgba)
	        else:
	            # For Hunyuan3D keep the generated colours as-is and only take the
	            # alpha from the background-removed image.
	            _, alphas = rgba_to_white_background(output_image_rgba)
	            images = torch.from_numpy(np.asarray(output_image) / 255.0).float()
	            images = rearrange(images, 'h w c -> c h w')

	        # Split the 3x2 grid of generated views into six separate images.
	        images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=3, m=2)
	        alphas = rearrange(alphas, 'c (n h) (m w) -> (n m) c h w', n=3, m=2)
	        if model == 'Hunyuan3D Std':
	            hunyuan_order = [0, 2, 4, 5, 3, 1]
	            images = images[hunyuan_order]
	            alphas = alphas[hunyuan_order]
	        images_vis = v2.functional.to_pil_image(rearrange(images, 'nm c h w -> c h (nm w)'))
	        images = v2.functional.resize(images, 512, interpolation=3, antialias=True).clamp(0, 1)
	        alphas = v2.functional.resize(alphas, 512, interpolation=0, antialias=True).clamp(0, 1)

	        images = torch.cat([image.unsqueeze(0), images], dim=0)  # 7 x 3 x 512 x 512
	        alphas = torch.cat([alpha.unsqueeze(0), alphas], dim=0)  # 7 x 1 x 512 x 512

	        # run reconstruction
	        if view_indices is None or len(view_indices) == 0:
	            view_indices = [1, 2, 3, 4, 5, 6]
	        images, alphas = images[view_indices], alphas[view_indices]
	        legends = [f'V{i}' if i != 0 else 'Input' for i in view_indices]

	        gs_vis_path, video_path, mesh_fine_path, fig = self.run_freesplatter_object(
	            images, alphas, legends=legends, gs_type=gs_type, mesh_reduction=mesh_reduction)

	        return images_vis, gs_vis_path, video_path, mesh_fine_path, fig

	    def run_views_to_3d(
	        self,
	        image_files,
	        do_rembg=False,
	        gs_type='2DGS',
	        mesh_reduction=0.5,
	        cache_dir=None,
	    ):
	        """Reconstruct a 3D object from several user-provided view images.

	        Args:
	            image_files: Iterable of image paths; a tuple entry contributes its
	                first element as the path.
	            do_rembg: Accepted for interface compatibility.
	                NOTE(review): this flag is not forwarded -- segmentation is
	                always invoked with its default. Confirm the intended
	                behaviour with the callers before wiring it through.
	            gs_type: ``'2DGS'`` selects the 2DGS model, anything else the other
	                object model.
	            mesh_reduction: Minimum mesh simplification ratio.
	            cache_dir: Parent of the output directory; ``None`` uses the system
	                temporary directory.

	        Returns:
	            ``(images_vis, gs_vis_path, video_path, mesh_fine_path, fig)``.
	        """
	        torch.cuda.empty_cache()

	        self._make_output_dir(cache_dir)

	        # preprocess images
	        images, alphas = [], []
	        for image_file in image_files:
	            if isinstance(image_file, tuple):
	                image_file = image_file[0]
	            image = Image.open(image_file)
	            w, h = image.size

	            image_rgba = self.run_segmentation(image)
	            if image.mode == 'RGBA':
	                # Input already had an alpha channel: centre-crop to a square.
	                image, alpha = rgba_to_white_background(image_rgba)
	                image = v2.functional.center_crop(image, min(h, w))
	                alpha = v2.functional.center_crop(alpha, min(h, w))
	            else:
	                image_rgba = resize_foreground(image_rgba, 0.9)
	                image, alpha = rgba_to_white_background(image_rgba)

	            image = v2.functional.resize(image, 512, interpolation=3, antialias=True).clamp(0, 1)
	            alpha = v2.functional.resize(alpha, 512, interpolation=0, antialias=True).clamp(0, 1)

	            images.append(image)
	            alphas.append(alpha)

	        images = torch.stack(images, dim=0)
	        alphas = torch.stack(alphas, dim=0)
	        images_vis = v2.functional.to_pil_image(rearrange(images, 'n c h w -> c h (n w)'))

	        # run reconstruction
	        legends = [f'V{i}' for i in range(1, 1 + len(images))]

	        gs_vis_path, video_path, mesh_fine_path, fig = self.run_freesplatter_object(
	            images, alphas, legends=legends, gs_type=gs_type, mesh_reduction=mesh_reduction)

	        return images_vis, gs_vis_path, video_path, mesh_fine_path, fig

	    def run_freesplatter_object(
	        self,
	        images,
	        alphas,
	        legends=None,
	        gs_type='2DGS',
	        mesh_reduction=0.5,
	    ):
	        """Run object reconstruction: Gaussians, poses, video, mesh and baked texture.

	        Reads ``self.output_dir`` (set by the calling ``run_*`` method) and
	        writes ``gs_vis.ply``, ``gs.mp4``, ``mesh.obj`` and ``mesh.glb`` there.

	        Args:
	            images: Stack of view images, moved to ``self.device``.
	            alphas: Matching foreground masks, moved to ``self.device``.
	            legends: Optional per-view labels for the camera figure.
	            gs_type: ``'2DGS'`` selects ``self.freesplatter_2dgs``; any other
	                value selects ``self.freesplatter``.
	            mesh_reduction: Minimum simplification ratio; raised if needed so
	                that the mesh has at most 50000 faces.

	        Returns:
	            ``(gs_vis_path, video_path, mesh_fine_path, fig)``.
	        """
	        torch.cuda.empty_cache()
	        device = self.device

	        freesplatter = self.freesplatter_2dgs if gs_type == '2DGS' else self.freesplatter

	        images, alphas = images.to(device), alphas.to(device)

	        t0 = time.time()
	        with torch.inference_mode():
	            gaussians = freesplatter.forward_gaussians(images.unsqueeze(0))
	        t1 = time.time()

	        # estimate camera parameters and visualize
	        c2ws_pred, focals_pred = freesplatter.estimate_poses(
	            images, gaussians, masks=alphas, use_first_focal=True, pnp_iter=10)
	        fig = self.visualize_cameras_object(images, c2ws_pred, focals_pred, legends=legends)
	        t2 = time.time()

	        # save gaussians
	        gs_vis_path = os.path.join(self.output_dir, 'gs_vis.ply')
	        save_gaussian(gaussians, gs_vis_path, freesplatter, opacity_threshold=5e-3, pad_2dgs_scale=True)
	        print(f'Save gaussian at {gs_vis_path}')

	        # render video
	        with torch.inference_mode():
	            c2ws_video = get_circular_cameras(N=120, elevation=0, radius=2.0, normalize=True).to(device)
	            _, fxfycxcy_video = self._normalized_intrinsics(
	                focals_pred, c2ws_video.shape[0], device)

	            video_frames = freesplatter.forward_renderer(
	                gaussians,
	                c2ws_video.unsqueeze(0),
	                fxfycxcy_video.unsqueeze(0),
	            )['image'][0].clamp(0, 1)

	            video_path = os.path.join(self.output_dir, 'gs.mp4')
	            save_video(video_frames, video_path, fps=30)
	            print(f'Save video at {video_path}')
	        t3 = time.time()

	        # extract mesh
	        with torch.inference_mode():
	            c2ws_fusion = get_fibonacci_cameras(N=120, radius=2.0)
	            c2ws_fusion, _ = normalize_cameras(
	                c2ws_fusion, camera_position=torch.tensor([0., -2., 0.]), camera_system='opencv')
	            c2ws_fusion = c2ws_fusion.to(device)
	            # Express every fusion camera relative to the first one for rendering.
	            c2ws_fusion_reference = torch.linalg.inv(c2ws_fusion[0:1]) @ c2ws_fusion
	            fx, fxfycxcy_fusion = self._normalized_intrinsics(
	                focals_pred, c2ws_fusion.shape[0], device)
	            fov = np.rad2deg(np.arctan(0.5 / fx.item())) * 2

	            fusion_render_results = freesplatter.forward_renderer(
	                gaussians,
	                c2ws_fusion_reference.unsqueeze(0),
	                fxfycxcy_fusion.unsqueeze(0),
	            )
	            images_fusion = fusion_render_results['image'][0].clamp(0, 1).permute(0, 2, 3, 1)
	            alphas_fusion = fusion_render_results['alpha'][0].permute(0, 2, 3, 1)
	            depths_fusion = fusion_render_results['depth'][0].permute(0, 2, 3, 1)

	            fusion_images = (images_fusion.detach().cpu().numpy() * 255).clip(0, 255).astype(np.uint8)
	            fusion_depths = depths_fusion.detach().cpu().numpy()
	            fusion_alphas = alphas_fusion.detach().cpu().numpy()
	            fusion_masks = (fusion_alphas > 1e-2).astype(np.uint8)
	            # Keep depth where the mask is set and write -1 everywhere else.
	            fusion_depths = fusion_depths * fusion_masks - np.ones_like(fusion_depths) * (1 - fusion_masks)

	            fusion_c2ws = c2ws_fusion.detach().cpu().numpy()

	            mesh_path = os.path.join(self.output_dir, 'mesh.obj')
	            rgbd_to_mesh(
	                fusion_images, fusion_depths, fusion_c2ws, fov, mesh_path,
	                cam_elev_thr=-90)  # use all angles for tsdf fusion
	            print(f'Save mesh at {mesh_path}')
	        t4 = time.time()

	        # optimize texture
	        cam_pos = c2ws_fusion[:, :3, 3].cpu().numpy()
	        # Pick 16 of the fusion cameras by sampling their positions.
	        cam_inds = torch.from_numpy(fpsample.fps_sampling(cam_pos, 16).astype(int)).to(device=device)

	        alphas_bake = alphas_fusion[cam_inds]
	        # Undo the white-background compositing to recover foreground colours.
	        images_bake = (images_fusion[cam_inds] - (1 - alphas_bake)) / alphas_bake.clamp(min=1e-6)

	        out_mesh = Mesh.load(str(mesh_path), auto_uv=False, device='cpu')
	        max_faces = 50000
	        mesh_reduction = max(1 - max_faces / out_mesh.f.shape[0], mesh_reduction)
	        mesh_verts_, mesh_faces_ = fast_simplification.simplify(
	            out_mesh.v.numpy(), out_mesh.f.numpy(), target_reduction=mesh_reduction)
	        mesh_verts = out_mesh.v.new_tensor(mesh_verts_, dtype=torch.float32).requires_grad_(False)
	        mesh_faces = out_mesh.f.new_tensor(mesh_faces_).requires_grad_(False)
	        out_mesh = Mesh(v=mesh_verts, f=mesh_faces)
	        out_mesh.auto_normal()
	        out_mesh.auto_uv()
	        out_mesh = out_mesh.to(device)

	        # Scale the normalized intrinsics by the size of the images being baked:
	        # fx, cx by shape[-2] and fy, cy by shape[-3] of the channel-last stack.
	        intrinsics = fxfycxcy_fusion[0:1].clone()
	        intrinsics[..., [0, 2]] *= images_bake.shape[-2]
	        intrinsics[..., [1, 3]] *= images_bake.shape[-3]

	        out_mesh = self.mesh_renderer.bake_multiview(
	            [out_mesh],
	            images_bake.unsqueeze(0),
	            alphas_bake.unsqueeze(0),
	            c2ws_fusion[cam_inds].unsqueeze(0),
	            intrinsics.unsqueeze(0),
	        )[0]
	        mesh_fine_path = os.path.join(self.output_dir, 'mesh.glb')
	        # align mesh orientation: negate x, then swap y and z
	        out_mesh.v = out_mesh.v.clone()
	        out_mesh.vn = out_mesh.vn.clone()
	        out_mesh.v[..., 0] = -out_mesh.v[..., 0]
	        out_mesh.vn[..., 0] = -out_mesh.vn[..., 0]
	        out_mesh.v[..., [1, 2]] = out_mesh.v[..., [2, 1]]
	        out_mesh.vn[..., [1, 2]] = out_mesh.vn[..., [2, 1]]

	        out_mesh.write(mesh_fine_path, flip_yz=False)
	        print(f"Save optimized mesh at {mesh_fine_path}")
	        t5 = time.time()

	        print(f'Generate Gaussians: {t1-t0:.2f} seconds.')
	        print(f'Estimate poses: {t2-t1:.2f} seconds.')
	        print(f'Generate video: {t3-t2:.2f} seconds.')
	        print(f'Generate mesh: {t4-t3:.2f} seconds.')
	        print(f'Optimize mesh: {t5-t4:.2f} seconds.')

	        return gs_vis_path, video_path, mesh_fine_path, fig

	    def visualize_cameras_object(
	        self,
	        images,
	        c2ws,
	        focal_length,
	        legends=None,
	    ):
	        """Build the camera figure for object mode.

	        The poses are first re-anchored so that the first camera coincides with
	        the camera created at position ``[0, -2, 0]``, then converted from the
	        OpenCV to the OpenGL axis convention for the visualizer.
	        """
	        cam2world = create_camera_to_world(torch.tensor([0, -2, 0]), camera_system='opencv').to(c2ws)
	        transform = cam2world @ torch.linalg.inv(c2ws[0:1])
	        # The matmul result is a fresh tensor, so the in-place flip below
	        # cannot reach the caller's poses.
	        c2ws_np = (transform @ c2ws).detach().cpu().numpy()
	        c2ws_np[:, :, 1:3] *= -1  # opencv to opengl

	        return self._build_camera_figure(images, c2ws_np, focal_length, legends, 3)

	    # FreeSplatter-S
	    def run_views_to_scene(
	        self,
	        image1,
	        image2,
	        cache_dir=None,
	    ):
	        """Reconstruct a scene from two images.

	        Each image is converted to a channel-first float tensor, centre-cropped
	        to a square and resized to 512.

	        Args:
	            image1: First view (needs ``.size`` and array conversion).
	            image2: Second view.
	            cache_dir: Parent of the output directory; ``None`` uses the system
	                temporary directory.

	        Returns:
	            ``(images_vis, gs_vis_path, video_path, fig)``.
	        """
	        torch.cuda.empty_cache()

	        self._make_output_dir(cache_dir)

	        # preprocess images
	        images = []
	        for image in [image1, image2]:
	            w, h = image.size
	            image = torch.from_numpy(np.asarray(image) / 255.0).float()
	            image = rearrange(image, 'h w c -> c h w')
	            image = v2.functional.center_crop(image, min(h, w))
	            image = v2.functional.resize(image, 512, interpolation=3, antialias=True).clamp(0, 1)
	            images.append(image)

	        images = torch.stack(images, dim=0)
	        images_vis = v2.functional.to_pil_image(rearrange(images, 'n c h w -> c h (n w)'))

	        # run reconstruction
	        legends = [f'V{i}' for i in range(1, 1 + len(images))]

	        gs_vis_path, video_path, fig = self.run_freesplatter_scene(images, legends=legends)

	        return images_vis, gs_vis_path, video_path, fig

	    def run_freesplatter_scene(
	        self,
	        images,
	        legends=None,
	    ):
	        """Run scene reconstruction: Gaussians, poses, camera figure and video.

	        Reads ``self.output_dir`` (set by the calling ``run_*`` method) and
	        writes ``gs_vis.ply`` and ``gs.mp4`` there.

	        Returns:
	            ``(gs_vis_path, video_path, fig)``.
	        """
	        torch.cuda.empty_cache()

	        freesplatter = self.freesplatter_scene

	        device = self.device
	        images = images.to(device)

	        t0 = time.time()
	        with torch.inference_mode():
	            gaussians = freesplatter.forward_gaussians(images.unsqueeze(0))
	        t1 = time.time()

	        # estimate camera parameters
	        c2ws_pred, focals_pred = freesplatter.estimate_poses(
	            images, gaussians, use_first_focal=True, pnp_iter=10)
	        # rescale cameras to make the baseline equal to 1.0
	        # (the 1e-2 offset keeps the division finite for a zero baseline)
	        baseline_pred = (c2ws_pred[:, :3, 3] - c2ws_pred[:1, :3, 3]).norm() + 1e-2
	        scale_factor = 1.0 / baseline_pred
	        c2ws_pred = c2ws_pred.clone()
	        c2ws_pred[:, :3, 3] *= scale_factor
	        # visualize cameras
	        fig = self.visualize_cameras_scene(images, c2ws_pred, focals_pred, legends=legends)
	        t2 = time.time()

	        # save gaussians
	        gs_vis_path = os.path.join(self.output_dir, 'gs_vis.ply')
	        save_gaussian(gaussians, gs_vis_path, freesplatter, opacity_threshold=5e-3)
	        print(f'Save gaussian at {gs_vis_path}')

	        # render video
	        with torch.inference_mode():
	            c2ws_video = generate_interpolated_path(
	                c2ws_pred.detach().cpu().numpy()[:, :3, :], n_interp=120)
	            # Append the [0, 0, 0, 1] row to turn the 3x4 poses into 4x4 matrices.
	            c2ws_video = torch.cat([
	                torch.from_numpy(c2ws_video),
	                torch.tensor([0, 0, 0, 1]).reshape(1, 1, 4).repeat(c2ws_video.shape[0], 1, 1)
	            ], dim=1).to(gaussians)
	            _, fxfycxcy_video = self._normalized_intrinsics(
	                focals_pred, c2ws_video.shape[0], device)

	            video_frames = freesplatter.forward_renderer(
	                gaussians,
	                c2ws_video.unsqueeze(0),
	                fxfycxcy_video.unsqueeze(0),
	                rescale=scale_factor.reshape(1).to(gaussians)
	            )['image'][0].clamp(0, 1)

	            video_path = os.path.join(self.output_dir, 'gs.mp4')
	            save_video(video_frames, video_path, fps=30)
	            print(f'Save video at {video_path}')
	        t3 = time.time()

	        print(f'Generate Gaussians: {t1-t0:.2f} seconds.')
	        print(f'Estimate poses: {t2-t1:.2f} seconds.')
	        print(f'Generate video: {t3-t2:.2f} seconds.')

	        return gs_vis_path, video_path, fig

	    def visualize_cameras_scene(
	        self,
	        images,
	        c2ws,
	        focal_length,
	        legends=None,
	    ):
	        """Build the camera figure for scene mode.

	        The poses are used as given apart from the sign flip of the second and
	        third columns required by the visualizer.
	        """
	        # ``Tensor.numpy()`` shares memory with a CPU tensor, so flip a copy;
	        # otherwise the caller's poses would be modified in place.
	        c2ws_np = c2ws.detach().cpu().numpy().copy()
	        c2ws_np[:, :, 1:3] *= -1

	        return self._build_camera_figure(images, c2ws_np, focal_length, legends, 2)