# StableRecon / app.py
# Stable-X's picture
# Update app.py
# 549d99a verified
import os
import time
import torch
import numpy as np
import gradio as gr
import urllib.parse
import tempfile
import subprocess
from dust3r.losses import L21
from spann3r.model import Spann3R
from mast3r.model import AsymmetricMASt3R
from spann3r.datasets import Demo
from torch.utils.data import DataLoader
import cv2
import json
import glob
from dust3r.post_process import estimate_focal_knowing_depth
from mast3r.demo import get_reconstructed_scene
from scipy.spatial.transform import Rotation
from transformers import AutoModelForImageSegmentation
from torchvision import transforms
from PIL import Image
import open3d as o3d
from backend_utils import improved_multiway_registration, pts2normal, point2mesh, combine_and_clean_point_clouds
from gs_utils import point2gs
from pose_utils import solve_cemara
from gradio.helpers import Examples as GradioExamples
from gradio.utils import get_cache_folder
from pathlib import Path
import os
import shutil
import math
import zipfile
from pathlib import Path
# Default checkpoint locations and the device every model is placed on.
DEFAULT_CKPT_PATH = 'checkpoints/spann3r.pth'
DEFAULT_DUST3R_PATH = 'https://huggingface.co/camenduru/dust3r/resolve/main/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth'
DEFAULT_MAST3R_PATH = 'https://download.europe.naverlabs.com/ComputerVision/MASt3R/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth'
DEFAULT_DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Homogeneous 4x4 matrix that negates the Y and Z axes; it is combined with a
# rotation in export_geometry() before geometry is written out.
OPENGL = np.diag([1, -1, -1, 1])
class Examples(GradioExamples):
    # Thin subclass of gradio's Examples helper that lets the caller choose the
    # name of the cache sub-directory used for example outputs.
    def __init__(self, *args, directory_name=None, **kwargs):
        # All positional/keyword arguments are forwarded unchanged; the private
        # `_initiated_directly=False` flag is added on top.
        # NOTE(review): presumably this flag stops the parent constructor from
        # treating this as a direct instantiation - confirm against the
        # installed gradio version, since it is a private parameter.
        super().__init__(*args, **kwargs, _initiated_directly=False)
        if directory_name is not None:
            # Redirect the cache folder and its log file to the custom name.
            self.cached_folder = get_cache_folder() / directory_name
            self.cached_file = Path(self.cached_folder) / "log.csv"
        # Called explicitly here, after the cache paths have been overridden.
        self.create()
def export_geometry(geometry, file_format='obj'):
    """
    Export Open3D geometry (triangle mesh or point cloud) to a temporary file.

    The geometry is deep-copied before the export transform is applied, so the
    object passed in by the caller is left untouched.

    Args:
        geometry: Open3D geometry object (TriangleMesh or PointCloud)
        file_format: str, output format. Triangle meshes accept 'obj' or
            'ply'; point clouds accept 'ply' or 'pcd'. Case-insensitive.

    Returns:
        str: Path to the exported file

    Raises:
        ValueError: If the geometry type is not supported, the file format is
            unknown, or the format cannot be used with the geometry type
        RuntimeError: If transforming or writing the geometry fails
    """
    import copy  # local import: only needed here

    # Validate geometry type
    if not isinstance(geometry, (o3d.geometry.TriangleMesh, o3d.geometry.PointCloud)):
        raise ValueError("Geometry must be either TriangleMesh or PointCloud")

    # Validate file format
    supported_formats = {
        'obj': '.obj',
        'ply': '.ply',
        'pcd': '.pcd'
    }
    fmt = file_format.lower()
    if fmt not in supported_formats:
        raise ValueError(f"Unsupported file format. Supported formats: {list(supported_formats.keys())}")

    # Reject geometry/format combinations up front; otherwise a mesh exported
    # as 'pcd' would fall through every branch and return a path to a file
    # that was never written.
    is_mesh = isinstance(geometry, o3d.geometry.TriangleMesh)
    if is_mesh and fmt not in ('obj', 'ply'):
        raise ValueError(f"Format {file_format} not supported for triangle meshes. Use 'obj' or 'ply'")
    if not is_mesh and fmt not in ('ply', 'pcd'):
        raise ValueError(f"Format {file_format} not supported for point clouds. Use 'ply' or 'pcd'")

    # mkstemp creates the file atomically (mktemp only returns a name and is
    # open to races); the descriptor is closed so Open3D can write the path.
    fd, output_path = tempfile.mkstemp(suffix=supported_formats[fmt])
    os.close(fd)

    try:
        # Work on a real copy so the caller's geometry is not transformed.
        geometry_copy = copy.deepcopy(geometry)

        # Inverse of (axis-flip matrix @ 180 degree rotation about Y)
        rot = np.eye(4)
        rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix()
        geometry_copy.transform(np.linalg.inv(OPENGL @ rot))

        if is_mesh:
            written = o3d.io.write_triangle_mesh(output_path, geometry_copy,
                                                 write_ascii=False, compressed=True)
        else:
            written = o3d.io.write_point_cloud(output_path, geometry_copy,
                                               write_ascii=False, compressed=True)
        # The Open3D writers report failure through their boolean return
        # value rather than by raising.
        if not written:
            raise IOError(f"Open3D could not write {output_path}")
        return output_path
    except Exception as e:
        # Clean up temporary file if export fails
        if os.path.exists(output_path):
            os.remove(output_path)
        raise RuntimeError(f"Failed to export geometry: {str(e)}") from e
def extract_frames(video_path: str, duration: float = 20.0, fps: float = 3.0) -> str:
    """
    Dump frames of a video into a fresh temporary directory using ffmpeg.

    The ffmpeg filter keeps only frames whose timestamp is below ``duration``
    seconds and then resamples them to ``fps`` frames per second. Frames are
    written as ``001.jpg``, ``002.jpg``, ... (``%03d.jpg`` pattern).

    Args:
        video_path: Path to the input video.
        duration: Only frames earlier than this many seconds are kept.
        fps: Output sampling rate in frames per second.

    Returns:
        str: Path of the temporary directory holding the extracted frames.
            The caller owns the directory and is responsible for removing it.

    Raises:
        ValueError: If ``video_path`` is empty or None.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        OSError: If the ffmpeg executable cannot be started.
    """
    if not video_path:
        # Fail early with a clear message instead of an opaque error from
        # subprocess when no video was supplied.
        raise ValueError("video_path must be a non-empty path to a video file")

    temp_dir = tempfile.mkdtemp()
    output_path = os.path.join(temp_dir, "%03d.jpg")
    filter_complex = f"select='if(lt(t,{duration}),1,0)',fps={fps}"
    command = [
        "ffmpeg",
        "-i", video_path,
        "-vf", filter_complex,
        "-vsync", "0",
        output_path
    ]
    try:
        subprocess.run(command, check=True)
    except Exception:
        # Do not leave an orphaned directory behind when extraction fails.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise
    return temp_dir
def load_ckpt(model_path_or_url, verbose=True):
    """
    Load a checkpoint onto the CPU from either a local path or an http(s) URL.

    Args:
        model_path_or_url: Filesystem path or http/https URL of the checkpoint.
        verbose: When true, print the source and show download progress.

    Returns:
        The deserialized checkpoint object.
    """
    if verbose:
        print('... loading model from', model_path_or_url)

    scheme = urllib.parse.urlparse(model_path_or_url).scheme
    if scheme not in ('http', 'https'):
        # Anything that is not a web URL is treated as a local file.
        return torch.load(model_path_or_url, map_location='cpu')
    return torch.hub.load_state_dict_from_url(model_path_or_url, map_location='cpu', progress=verbose)
def load_model(ckpt_path, device):
    """
    Build a Spann3R network on ``device`` and load its weights.

    Args:
        ckpt_path: Path or URL of a checkpoint whose ``'model'`` entry holds
            the state dict.
        device: Device the network is moved to.

    Returns:
        The network in evaluation mode.
    """
    network = Spann3R(dus3r_name=DEFAULT_DUST3R_PATH, use_feat=False)
    network = network.to(device)
    checkpoint = load_ckpt(ckpt_path)
    network.load_state_dict(checkpoint['model'])
    network.eval()
    return network
# The three networks below are created once, at import time, and shared by
# every request handled by this module.

# Spann3R network used by reconstruct().
model = load_model(DEFAULT_CKPT_PATH, DEFAULT_DEVICE)
# MASt3R network used by refine() for sparse global alignment.
mast3r_model = AsymmetricMASt3R.from_pretrained(DEFAULT_MAST3R_PATH).to(DEFAULT_DEVICE)
mast3r_model.eval()
# BiRefNet segmentation network used by generate_mask() for background removal.
# NOTE(review): trust_remote_code=True lets the hub repository supply model
# code - confirm the repository/revision is one you trust.
birefnet = AutoModelForImageSegmentation.from_pretrained('zhengpeng7/BiRefNet', trust_remote_code=True)
birefnet.to(DEFAULT_DEVICE)
birefnet.eval()
def extract_object(birefnet, image):
    """
    Run the segmentation network on a PIL image and return a mask image.

    Args:
        birefnet: Segmentation model; the last element of its output is used.
        image: PIL image to segment.

    Returns:
        PIL image holding the sigmoid of the prediction, resized to the size
        of the input image.
    """
    # Preprocessing: fixed 1024x1024 input, tensor conversion, normalization.
    target_hw = (1024, 1024)
    preprocess = transforms.Compose([
        transforms.Resize(target_hw),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    batch = preprocess(image).unsqueeze(0).to(DEFAULT_DEVICE)

    # Inference without gradient tracking.
    with torch.no_grad():
        probabilities = birefnet(batch)[-1].sigmoid().cpu()

    first_map = probabilities[0].squeeze()
    mask_image = transforms.ToPILImage()(first_map)
    return mask_image.resize(image.size)
def generate_mask(image: np.ndarray):
    """
    Produce a foreground mask for an image array using the global BiRefNet.

    Args:
        image: Image array; it is scaled by 255 and cast to uint8 before
            being handed to PIL.

    Returns:
        np.ndarray: The mask image converted to an array and divided by 255.
    """
    as_uint8 = (image * 255).astype(np.uint8)
    foreground = extract_object(birefnet, Image.fromarray(as_uint8))
    return np.array(foreground) / 255.0
def center_pcd(pcd: o3d.geometry.PointCloud, normalize=False) -> o3d.geometry.PointCloud:
    """
    Move a point cloud so that its centroid sits at the origin.

    Args:
        pcd: Point cloud to center.
        normalize: When true, additionally scale the cloud so the farthest
            point lies at distance 1 from the origin.

    Returns:
        o3d.geometry.PointCloud: With ``normalize=False`` the input cloud is
        modified in place and returned. With ``normalize=True`` a new cloud is
        returned that carries the scaled points plus the original colors and
        normals (copied unchanged). An empty input cloud is returned as-is.
    """
    points = np.asarray(pcd.points)
    if points.shape[0] == 0:
        # Nothing to center; avoids NaN centroids and a crash in np.max below.
        return pcd

    centered_points = points - np.mean(points, axis=0)

    if not normalize:
        pcd.points = o3d.utility.Vector3dVector(centered_points)
        return pcd

    # Radius of the cloud around its centroid.
    max_distance = np.max(np.linalg.norm(centered_points, axis=1))
    if max_distance > 0:
        # Skip the division when all points coincide (radius 0).
        centered_points = centered_points / max_distance

    normalized_pcd = o3d.geometry.PointCloud()
    normalized_pcd.points = o3d.utility.Vector3dVector(centered_points)
    # Colors and normals are unaffected by a uniform scale, so they are
    # carried over unchanged.
    if pcd.has_colors():
        normalized_pcd.colors = pcd.colors
    if pcd.has_normals():
        normalized_pcd.normals = pcd.normals
    return normalized_pcd
def center_mesh(mesh: o3d.geometry.TriangleMesh, normalize=False) -> o3d.geometry.TriangleMesh:
    """
    Move a triangle mesh so that the centroid of its vertices is the origin.

    Args:
        mesh: Mesh to center.
        normalize: When true, additionally scale the mesh so the farthest
            vertex lies at distance 1 from the origin.

    Returns:
        o3d.geometry.TriangleMesh: With ``normalize=False`` the input mesh is
        modified in place and returned. With ``normalize=True`` a new mesh is
        returned containing the scaled vertices, the original triangles and
        vertex colors, and unit-length vertex normals. A mesh without
        vertices is returned as-is.
    """
    vertices = np.asarray(mesh.vertices)
    if vertices.shape[0] == 0:
        # Nothing to center; avoids NaN centroids and a crash in np.max below.
        return mesh

    centered_vertices = vertices - np.mean(vertices, axis=0)

    if not normalize:
        # Update the mesh with the centered vertices
        mesh.vertices = o3d.utility.Vector3dVector(centered_vertices)
        return mesh

    # Radius of the mesh around its centroid.
    max_distance = np.max(np.linalg.norm(centered_vertices, axis=1))
    if max_distance > 0:
        # Skip the division when all vertices coincide (radius 0).
        centered_vertices = centered_vertices / max_distance

    normalized_mesh = o3d.geometry.TriangleMesh()
    normalized_mesh.vertices = o3d.utility.Vector3dVector(centered_vertices)
    normalized_mesh.triangles = mesh.triangles
    if mesh.has_vertex_colors():
        normalized_mesh.vertex_colors = mesh.vertex_colors
    if mesh.has_vertex_normals():
        vertex_normals = np.asarray(mesh.vertex_normals)
        lengths = np.linalg.norm(vertex_normals, axis=1, keepdims=True)
        # Leave zero-length normals as zero vectors instead of producing NaN.
        lengths[lengths == 0] = 1.0
        normalized_mesh.vertex_normals = o3d.utility.Vector3dVector(vertex_normals / lengths)
    return normalized_mesh
def get_transform_json(H, W, focal, poses_all):
    """
    Build a transforms dictionary describing shared intrinsics and per-frame poses.

    Each pose has the first three rows of its second and third columns
    negated (the original comment calls this the CV2 -> GL conversion). The
    conversion is applied to a copy, so the arrays in ``poses_all`` are not
    modified.

    Args:
        H: Image height.
        W: Image width.
        focal: Single-element array/tensor; ``focal.item()`` is used for both
            ``fl_x`` and ``fl_y``.
        poses_all: Iterable of 4x4 pose matrices (numpy arrays, or objects
            offering ``clone()`` such as torch tensors).

    Returns:
        dict: Intrinsics (``w``, ``h``, ``fl_x``, ``fl_y``, ``cx``, ``cy``)
        plus a ``frames`` list; every frame repeats the intrinsics and adds
        ``file_path``, ``mask_path`` and ``transform_matrix``.
    """
    focal_value = focal.item()
    intrinsics = {
        'w': W,
        'h': H,
        'fl_x': focal_value,
        'fl_y': focal_value,
        'cx': W/2,
        'cy': H/2,
    }

    frames = []
    for i, pose in enumerate(poses_all):
        # Copy first: flipping the caller's array in place would silently undo
        # the conversion if this function were called twice on the same poses.
        pose_gl = pose.clone() if hasattr(pose, 'clone') else np.array(pose)
        # CV2 GL format
        pose_gl[:3, 1] *= -1
        pose_gl[:3, 2] *= -1
        frames.append({
            **intrinsics,
            'file_path': f"images/{i:04d}.jpg",
            "mask_path": f"masks/{i:04d}.png",
            'transform_matrix': pose_gl.tolist(),
        })

    transform_dict = dict(intrinsics)
    transform_dict['frames'] = frames
    return transform_dict
def organize_and_zip_output(images_all, masks_all, transform_json_path, output_dir=None):
    """
    Package reconstruction outputs into a zip archive.

    The archive contains ``images/NNNN.jpg``, ``masks/NNNN.png`` and
    ``transforms.json``. Files are staged in a private temporary directory,
    so a directory that already exists at ``output_dir`` is neither modified
    nor deleted.

    Args:
        images_all: List of numpy arrays containing images; each is scaled by
            255, cast to uint8 and written with its last axis reversed.
        masks_all: List of numpy arrays containing masks; each is scaled by
            255 and cast to uint8.
        transform_json_path: Path to the transform.json file. The file is
            removed once this function finishes, whether or not it succeeds.
        output_dir: Optional base name for the archive; the zip is written to
            ``f"{output_dir}.zip"``. Defaults to ``reconstruction_<timestamp>``.

    Returns:
        str: Path to the created zip file
    """
    base_dir = output_dir or f"reconstruction_{time.strftime('%Y%m%d_%H%M%S')}"
    zip_path = f"{base_dir}.zip"
    try:
        # Make sure the folder that will hold the archive exists.
        zip_parent = os.path.dirname(zip_path)
        if zip_parent:
            os.makedirs(zip_parent, exist_ok=True)

        # Stage everything in a throw-away directory that is removed
        # automatically, even when writing fails part-way through.
        with tempfile.TemporaryDirectory() as staging_dir:
            images_dir = os.path.join(staging_dir, "images")
            masks_dir = os.path.join(staging_dir, "masks")
            os.makedirs(images_dir, exist_ok=True)
            os.makedirs(masks_dir, exist_ok=True)

            # Save images
            for i, image in enumerate(images_all):
                image_path = os.path.join(images_dir, f"{i:04d}.jpg")
                cv2.imwrite(image_path, (image * 255).astype(np.uint8)[..., ::-1], [int(cv2.IMWRITE_JPEG_QUALITY), 90])

            # Save masks
            for i, mask in enumerate(masks_all):
                mask_path = os.path.join(masks_dir, f"{i:04d}.png")
                cv2.imwrite(mask_path, (mask * 255).astype(np.uint8))

            # Copy transform.json
            shutil.copy2(transform_json_path, os.path.join(staging_dir, "transforms.json"))

            # Create zip file with paths relative to the staging root
            with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
                for root, _, files in os.walk(staging_dir):
                    for file in files:
                        file_path = os.path.join(root, file)
                        zipf.write(file_path, os.path.relpath(file_path, staging_dir))
        return zip_path
    finally:
        # The source transform file is consumed by this function.
        if os.path.exists(transform_json_path):
            os.remove(transform_json_path)
def get_keyframes(temp_dir: str, kf_every: int = 10):
    """
    Pick every ``kf_every``-th frame from a directory of numbered JPEG frames.

    Args:
        temp_dir: Directory containing extracted frames (named as 001.jpg,
            002.jpg, etc.); file stems must parse as integers.
        kf_every: Stride used when sampling the numerically sorted frames.

    Returns:
        List[str]: Paths of the selected keyframes in ascending frame order.
        If the stride yields fewer than two frames, the first and last
        available frames are returned instead.

    Raises:
        ValueError: If fewer than two frames exist in ``temp_dir``.
    """
    pattern = os.path.join(temp_dir, "*.jpg")
    # Order numerically by file stem rather than lexicographically.
    ordered = sorted(glob.glob(pattern), key=lambda frame: int(Path(frame).stem))

    selected = ordered[::kf_every]
    if len(selected) >= 2:
        return selected

    # Reconstruction needs two views: fall back to the two extremes.
    if len(ordered) < 2:
        raise ValueError(f"Not enough frames found in {temp_dir}. Need at least 2 frames for reconstruction.")
    return [ordered[0], ordered[-1]]
from mast3r.cloud_opt.sparse_ga import sparse_global_alignment
from mast3r.cloud_opt.tsdf_optimizer import TSDFPostProcess
from dust3r.utils.image import load_images
from dust3r.image_pairs import make_pairs
from dust3r.utils.device import to_numpy
def invert_matrix(mat):
    """
    Return the inverse of a matrix given as a torch tensor or numpy array.

    Raises:
        ValueError: If ``mat`` is neither a ``torch.Tensor`` nor an
            ``np.ndarray``.
    """
    # Checked in order: torch first, then numpy.
    backends = (
        (torch.Tensor, torch.linalg.inv),
        (np.ndarray, np.linalg.inv),
    )
    for matrix_type, inverse in backends:
        if isinstance(mat, matrix_type):
            return inverse(mat)
    raise ValueError(f'Unsupported matrix type: {type(mat)}')
def refine(
    video_path: str,
    conf_thresh: float = 5.0,
    kf_every: int = 30,
    remove_background: bool = False,
    enable_registration: bool = True,
    output_3d_model: bool = True
) -> tuple:
    """
    Reconstruct a video with MASt3R sparse global alignment and export the result.

    Frames are extracted from the video, every ``kf_every * 3``-th frame is
    used as a keyframe, all keyframe pairs are aligned, and the dense points
    are merged into one cloud from which a mesh is built.

    Args:
        video_path: Path to the input video.
        conf_thresh: Currently unused; dense points are filtered with a fixed
            confidence cut-off of 1.5. Kept so the signature matches the
            inputs wired to the UI button.
        kf_every: Keyframe interval; multiplied by 3 before sampling.
        remove_background: When true, each image is additionally masked with
            ``generate_mask``.
        enable_registration: When true, runs the second optimisation stage
            (200 iterations) with depth optimisation; otherwise that stage is
            skipped.
        output_3d_model: When true, the merged cloud is written through
            ``point2gs``; otherwise it is exported as a PLY point cloud.

    Returns:
        tuple: ``(mesh_path, [result_path])`` where ``mesh_path`` is the
        exported normalized mesh and ``result_path`` is the splat or point
        cloud file.
    """
    # Extract keyframes from video
    temp_dir = extract_frames(video_path)
    # Scratch directory for the alignment cache. It is freshly created, so no
    # stale cache can exist and nothing needs to be wiped beforehand.
    output_dir = tempfile.mkdtemp()
    try:
        keyframe_paths = get_keyframes(temp_dir, kf_every*3)
        image_size = 512
        images = load_images(keyframe_paths, size=image_size)

        # Generate pairs and run inference
        pairs = make_pairs(images, scene_graph='complete', prefilter=None, symmetrize=True)
        cache_dir = os.path.join(output_dir, 'cache')
        scene = sparse_global_alignment(keyframe_paths, pairs, cache_dir,
                                        mast3r_model, lr1=0.07, niter1=500, lr2=0.014,
                                        niter2=200 if enable_registration else 0, device=DEFAULT_DEVICE,
                                        opt_depth=True if enable_registration else False, shared_intrinsics=True,
                                        matching_conf_thr=5.)

        # Extract scene information
        imgs = np.array(scene.imgs)
        tsdf = TSDFPostProcess(scene, TSDF_thresh=0)
        pts3d, _, confs = tsdf.get_dense_pts3d(clean_depth=True)
        masks = np.array(to_numpy([c > 1.5 for c in confs]))

        pcds = []
        for pts, conf_mask, image in zip(pts3d, masks, imgs):
            if remove_background:
                mask = generate_mask(image)
            else:
                mask = np.ones_like(conf_mask)
            # Keep pixels that are both confident and in the foreground.
            combined_mask = conf_mask & (mask > 0.5)

            pts = pts.reshape(combined_mask.shape[0], combined_mask.shape[1], 3)
            pts_normal = pts2normal(pts).cpu().numpy()
            pts = pts.cpu().numpy()

            pcd = o3d.geometry.PointCloud()
            # Points are scaled down by a fixed factor of 5.
            pcd.points = o3d.utility.Vector3dVector(pts[combined_mask] / 5)
            pcd.colors = o3d.utility.Vector3dVector(image[combined_mask])
            pcd.normals = o3d.utility.Vector3dVector(pts_normal[combined_mask])
            pcds.append(pcd)

        pcd_combined = combine_and_clean_point_clouds(pcds, voxel_size=0.001)
        o3d_geometry = point2mesh(pcd_combined, depth=9)
        o3d_geometry_centered = center_mesh(o3d_geometry, normalize=True)

        # Create coarse result
        coarse_output_path = export_geometry(o3d_geometry_centered)

        if output_3d_model:
            # mkstemp reserves the file atomically (mktemp is race-prone).
            fd, gs_output_path = tempfile.mkstemp(suffix='.ply')
            os.close(fd)
            point2gs(gs_output_path, pcd_combined)
            return coarse_output_path, [gs_output_path]
        pcd_output_path = export_geometry(pcd_combined, file_format='ply')
        return coarse_output_path, [pcd_output_path]
    finally:
        # Extracted frames and the alignment cache are no longer needed once
        # the result files have been written (or the run has failed).
        shutil.rmtree(temp_dir, ignore_errors=True)
        shutil.rmtree(output_dir, ignore_errors=True)
@torch.no_grad()
def reconstruct(video_path, conf_thresh=1e-3, kf_every=1,
                remove_background=False, enable_registration=True, output_3d_model=True):
    """
    Reconstruct a video with Spann3R and export a mesh plus a splat/point cloud.

    Frames are extracted from the video and run through the global ``model``.
    For every view a camera pose is recovered with PnP-RANSAC, a colored
    point cloud with normals is built from the confident pixels, and all
    clouds are merged. A normalized mesh is exported from the merged cloud.

    Args:
        video_path: Path to the input video.
        conf_thresh: Threshold applied to ``(conf - 1) / conf``; pixels at or
            below it are dropped. Default matches the UI slider default.
        kf_every: Keyframe interval handed to the ``Demo`` dataset. Default
            matches the UI slider default.
        remove_background: When true, each image is additionally masked with
            ``generate_mask``.
        enable_registration: When true, the final cloud comes from
            ``improved_multiway_registration`` instead of the plain merge.
        output_3d_model: When true, the final cloud is written through
            ``point2gs``; otherwise it is exported as a PLY point cloud.

    Returns:
        tuple: ``(mesh_path, [result_path, transforms_json_path])``.
    """
    # Extract frames from video
    demo_path = extract_frames(video_path)
    try:
        # Load dataset
        dataset = Demo(ROOT=demo_path, resolution=224, full_video=True, kf_every=kf_every)
        dataloader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
        batch = next(iter(dataloader))

        for view in batch:
            view['img'] = view['img'].to(DEFAULT_DEVICE, non_blocking=True)

        demo_name = os.path.basename(video_path)
        print(f'Started reconstruction for {demo_name}')

        start = time.time()
        preds, _ = model.forward(batch)
        end = time.time()
        fps = len(batch) / (end - start)
        print(f'Finished reconstruction for {demo_name}, FPS: {fps:.2f}')

        # Process results
        pcds = []
        poses_all = []
        cameras_all = []
        images_all = []
        masks_all = []
        last_focal = None

        ##### estimate focal length
        _, H, W, _ = preds[0]['pts3d'].shape
        pp = torch.tensor((W/2, H/2))
        focal = estimate_focal_knowing_depth(preds[0]['pts3d'].cpu(), pp, focal_mode='weiszfeld')
        focal_value = focal.item()
        print(f'Estimated focal of first camera: {focal_value} (224x224)')

        intrinsic = np.eye(3)
        intrinsic[0, 0] = focal_value
        intrinsic[1, 1] = focal_value
        intrinsic[:2, 2] = pp

        # PnP inputs that are identical for every view: the pixel grid, the
        # camera matrix and zero distortion.
        u, v = np.meshgrid(np.arange(W), np.arange(H))
        points_2d = np.stack((u, v), axis=-1).reshape(-1, 2).astype(np.float32)
        camera_matrix = intrinsic.astype(np.float32)
        dist_coeffs = np.zeros(4).astype(np.float32)

        for j, view in enumerate(batch):
            image = view['img'].permute(0, 2, 3, 1).cpu().numpy()[0]
            # Map the image from [-1, 1] to [0, 1].
            image = (image + 1) / 2

            # The first view carries its points under 'pts3d', later views
            # under 'pts3d_in_other_view'.
            pts_key = 'pts3d' if j == 0 else 'pts3d_in_other_view'
            pts = preds[j][pts_key].detach().cpu().numpy()[0]
            pts_normal = pts2normal(preds[j][pts_key][0]).cpu().numpy()

            ##### Solve PnP-RANSAC
            _, rotation_vector, translation_vector, _ = cv2.solvePnPRansac(
                pts.reshape(-1, 3).astype(np.float32),
                points_2d,
                camera_matrix,
                dist_coeffs)
            rotation_matrix, _ = cv2.Rodrigues(rotation_vector)

            # Extrinsic parameters (4x4 matrix); its inverse is stored as the pose.
            extrinsic_matrix = np.hstack((rotation_matrix, translation_vector.reshape(-1, 1)))
            extrinsic_matrix = np.vstack((extrinsic_matrix, [0, 0, 0, 1]))
            poses_all.append(np.linalg.inv(extrinsic_matrix))

            conf = preds[j]['conf'][0].cpu().data.numpy()
            conf_sig = (conf - 1) / conf
            if remove_background:
                mask = generate_mask(image)
            else:
                mask = np.ones_like(conf)
            combined_mask = (conf_sig > conf_thresh) & (mask > 0.5)

            # Use the configured device instead of a hard-coded "cuda" so the
            # app also runs on CPU-only hosts.
            # NOTE(review): assumes the third argument of solve_cemara is a
            # device specifier - confirm against pose_utils.
            camera, last_focal = solve_cemara(torch.tensor(pts), torch.tensor(conf_sig) > 0.001,
                                              DEFAULT_DEVICE, focal=last_focal)

            pcd = o3d.geometry.PointCloud()
            pcd.points = o3d.utility.Vector3dVector(pts[combined_mask])
            pcd.colors = o3d.utility.Vector3dVector(image[combined_mask])
            pcd.normals = o3d.utility.Vector3dVector(pts_normal[combined_mask])
            pcds.append(pcd)

            images_all.append(image)
            masks_all.append(mask)
            cameras_all.append(camera)

        transform_dict = get_transform_json(H, W, focal, poses_all)
        # mkstemp reserves the file atomically (mktemp is race-prone).
        fd, temp_json_file = tempfile.mkstemp(suffix='.json')
        with os.fdopen(fd, 'w') as f:
            json.dump(transform_dict, f, indent=4)

        pcd_combined = combine_and_clean_point_clouds(pcds, voxel_size=0.001)
        o3d_geometry = point2mesh(pcd_combined)
        o3d_geometry_centered = center_mesh(o3d_geometry, normalize=True)

        # Create coarse result
        coarse_output_path = export_geometry(o3d_geometry_centered)

        if enable_registration:
            pcd_combined, _, _ = improved_multiway_registration(pcds, voxel_size=0.01)
        pcd_combined = center_pcd(pcd_combined)

        # NOTE: images_all / masks_all are collected for the optional dataset
        # export below, which is currently disabled.
        # zip_path = organize_and_zip_output(images_all, masks_all, temp_json_file)

        if output_3d_model:
            fd, gs_output_path = tempfile.mkstemp(suffix='.ply')
            os.close(fd)
            point2gs(gs_output_path, pcd_combined)
            return coarse_output_path, [gs_output_path, temp_json_file]
        pcd_output_path = export_geometry(pcd_combined, file_format='ply')
        return coarse_output_path, [pcd_output_path, temp_json_file]
    finally:
        # The extracted frames are only needed while the batch is processed.
        shutil.rmtree(demo_path, ignore_errors=True)
# Paths of every .mp4/.webm file in ./examples (the directory must exist).
# NOTE(review): this list is not referenced again in the visible code; the
# Examples widget below builds its own list of .webm files.
example_videos = [os.path.join('./examples', f) for f in os.listdir('./examples') if f.endswith(('.mp4', '.webm'))]
# Gradio interface: inputs and controls in the left column, 3D preview and
# downloadable results in the right column.
with gr.Blocks(
    title="StableRecon: 3D Reconstruction from Video",
    # Custom styling for the download area, slider handle, viewport, tabs and headings.
    css="""
#download {
height: 118px;
}
.slider .inner {
width: 5px;
background: #FFF;
}
.viewport {
aspect-ratio: 4/3;
}
.tabs button.selected {
font-size: 20px !important;
color: crimson !important;
}
h1 {
text-align: center;
display: block;
}
h2 {
text-align: center;
display: block;
}
h3 {
text-align: center;
display: block;
}
.md_feedback li {
margin-bottom: 0px !important;
}
""",
    # Google Analytics (gtag) snippet injected into the page head.
    head="""
<script async src="https://www.googletagmanager.com/gtag/js?id=G-1FWSVCGZTG"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag() {dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-1FWSVCGZTG');
</script>
""",
) as iface:
    # Page header: title, badges and a short project description.
    gr.Markdown(
        """
# StableRecon: Making Video to 3D easy
<p align="center">
<a title="Github" href="https://github.com/Stable-X/StableRecon" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
<img src="https://img.shields.io/github/stars/Stable-X/StableRecon?label=GitHub%20%E2%98%85&logo=github&color=C8C" alt="badge-github-stars">
</a>
<a title="Social" href="https://x.com/ychngji6" target="_blank" rel="noopener noreferrer" style="display: inline-block;">
<img src="https://www.obukhov.ai/img/badges/badge-social.svg" alt="social">
</a>
</p>
<div style="background-color: #f0f0f0; padding: 10px; border-radius: 5px; margin-bottom: 20px;">
<strong>📢 About StableRecon:</strong> This is an experimental open-source project building on <a href="https://github.com/naver/dust3r" target="_blank">dust3r</a> and <a href="https://github.com/HengyiWang/spann3r" target="_blank">spann3r</a>. We're exploring video-to-3D conversion, using spann3r for tracking and implementing our own backend and meshing. While it's a work in progress with plenty of room for improvement, we're excited to share it with the community. We welcome your feedback, especially on failure cases, as we continue to develop and refine this tool.
</div>
"""
    )
    with gr.Row():
        # Left column: video upload and reconstruction options.
        with gr.Column(scale=1):
            video_input = gr.Video(label="Input Video", sources=["upload"])
            with gr.Row():
                conf_thresh = gr.Slider(0, 1, value=1e-3, label="Confidence Threshold")
                kf_every = gr.Slider(1, 30, step=1, value=1, label="Keyframe Interval")
            with gr.Row():
                remove_background = gr.Checkbox(label="Remove Background", value=False)
                enable_registration = gr.Checkbox(
                    label="Enable Refinement",
                    value=False,
                    info="Improves alignment but takes longer"
                )
                output_3d_model = gr.Checkbox(
                    label="Output Splat",
                    value=True,
                    info="Generate Splat (PLY) instead of Point Cloud (PLY)"
                )
            reconstruct_btn = gr.Button("Start Reconstruction")
            refine_btn = gr.Button("Start Refinement")
        # Right column: mesh preview and downloadable result files.
        with gr.Column(scale=2):
            with gr.Tab("3D Models"):
                with gr.Group():
                    initial_model = gr.Model3D(
                        label="Reconstructed Mesh",
                        display_mode="solid",
                        clear_color=[0.0, 0.0, 0.0, 0.0]
                    )
                with gr.Group():
                    output_model = gr.File(
                        label="Reconstructed Results",
                    )
    # Example gallery built from the .webm files in ./examples.
    # NOTE(review): only the video is listed as an input while reconstruct
    # takes further arguments; with cache_examples=False the function is
    # presumably never invoked from here - confirm before enabling caching.
    Examples(
        fn=reconstruct,
        examples=sorted([
            os.path.join("examples", name)
            for name in os.listdir(os.path.join("examples")) if name.endswith('.webm')
        ]),
        inputs=[video_input],
        outputs=[initial_model, output_model],
        directory_name="examples_video",
        cache_examples=False,
    )
    # Both buttons feed the same six controls to their handler and write to
    # the same two outputs (mesh preview, result files).
    reconstruct_btn.click(
        fn=reconstruct,
        inputs=[video_input, conf_thresh, kf_every, remove_background, enable_registration, output_3d_model],
        outputs=[initial_model, output_model]
    )
    # NOTE: refine receives conf_thresh here but does not read it.
    refine_btn.click(
        fn=refine,
        inputs=[video_input, conf_thresh, kf_every, remove_background, enable_registration, output_3d_model],
        outputs=[initial_model, output_model]
    )
# Start the web server on all network interfaces when run as a script.
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0")