hugoycj committed · Commit 270a9a7 · Parent(s): 2caa1bd

feat: Add mast3r for refinement
app.py
CHANGED
@@ -8,9 +8,15 @@ import tempfile
 import subprocess
 from dust3r.losses import L21
 from spann3r.model import Spann3R
+from mast3r.model import AsymmetricMASt3R
+
 from spann3r.datasets import Demo
 from torch.utils.data import DataLoader
-import
+import cv2
+import json
+import glob
+from dust3r.post_process import estimate_focal_knowing_depth
+from mast3r.demo import get_reconstructed_scene
 from scipy.spatial.transform import Rotation
 from transformers import AutoModelForImageSegmentation
 from torchvision import transforms
@@ -22,9 +28,16 @@ from pose_utils import solve_cemara
 from gradio.helpers import Examples as GradioExamples
 from gradio.utils import get_cache_folder
 from pathlib import Path
+import os
+import shutil
+import math
+import zipfile
+from pathlib import Path
+
 # Default values
-DEFAULT_CKPT_PATH = '
+DEFAULT_CKPT_PATH = 'checkpoints/spann3r.pth'
 DEFAULT_DUST3R_PATH = 'https://huggingface.co/camenduru/dust3r/resolve/main/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth'
+DEFAULT_MAST3R_PATH = 'checkpoints/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric.pth'
 DEFAULT_DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
 
 OPENGL = np.array([[1, 0, 0, 0],
@@ -128,17 +141,6 @@ def extract_frames(video_path: str, duration: float = 20.0, fps: float = 3.0) ->
     subprocess.run(command, check=True)
     return temp_dir
 
-def cat_meshes(meshes):
-    vertices, faces, colors = zip(*[(m['vertices'], m['faces'], m['face_colors']) for m in meshes])
-    n_vertices = np.cumsum([0]+[len(v) for v in vertices])
-    for i in range(len(faces)):
-        faces[i][:] += n_vertices[i]
-
-    vertices = np.concatenate(vertices)
-    colors = np.concatenate(colors)
-    faces = np.concatenate(faces)
-    return dict(vertices=vertices, face_colors=colors, faces=faces)
-
 def load_ckpt(model_path_or_url, verbose=True):
     if verbose:
         print('... loading model from', model_path_or_url)
@@ -158,46 +160,10 @@ def load_model(ckpt_path, device):
     model.eval()
     return model
 
-def pts3d_to_trimesh(img, pts3d, valid=None):
-    H, W, THREE = img.shape
-    assert THREE == 3
-    assert img.shape == pts3d.shape
-
-    vertices = pts3d.reshape(-1, 3)
-
-    # make squares: each pixel == 2 triangles
-    idx = np.arange(len(vertices)).reshape(H, W)
-    idx1 = idx[:-1, :-1].ravel()  # top-left corner
-    idx2 = idx[:-1, +1:].ravel()  # right-left corner
-    idx3 = idx[+1:, :-1].ravel()  # bottom-left corner
-    idx4 = idx[+1:, +1:].ravel()  # bottom-right corner
-    faces = np.concatenate((
-        np.c_[idx1, idx2, idx3],
-        np.c_[idx3, idx2, idx1],  # same triangle, but backward (cheap solution to cancel face culling)
-        np.c_[idx2, idx3, idx4],
-        np.c_[idx4, idx3, idx2],  # same triangle, but backward (cheap solution to cancel face culling)
-    ), axis=0)
-
-    # prepare triangle colors
-    face_colors = np.concatenate((
-        img[:-1, :-1].reshape(-1, 3),
-        img[:-1, :-1].reshape(-1, 3),
-        img[+1:, +1:].reshape(-1, 3),
-        img[+1:, +1:].reshape(-1, 3)
-    ), axis=0)
-
-    # remove invalid faces
-    if valid is not None:
-        assert valid.shape == (H, W)
-        valid_idxs = valid.ravel()
-        valid_faces = valid_idxs[faces].all(axis=-1)
-        faces = faces[valid_faces]
-        face_colors = face_colors[valid_faces]
-
-    assert len(faces) == len(face_colors)
-    return dict(vertices=vertices, face_colors=face_colors, faces=faces)
-
 model = load_model(DEFAULT_CKPT_PATH, DEFAULT_DEVICE)
+mast3r_model = AsymmetricMASt3R.from_pretrained(DEFAULT_MAST3R_PATH).to(DEFAULT_DEVICE)
+mast3r_model.eval()
+
 birefnet = AutoModelForImageSegmentation.from_pretrained('zhengpeng7/BiRefNet', trust_remote_code=True)
 birefnet.to(DEFAULT_DEVICE)
 birefnet.eval()
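Note: DEFAULT_MAST3R_PATH points at a local checkpoint, so a fresh clone of this Space needs the weights in place before startup. A minimal guard, sketched under the assumption that the same checkpoint is mirrored on the Hub under the usual naver repo id (not something this commit pins down):

# Sketch only; the Hub repo id below is an assumption, not part of this commit.
if not os.path.exists(DEFAULT_MAST3R_PATH):
    mast3r_model = AsymmetricMASt3R.from_pretrained(
        'naver/MASt3R_ViTLarge_BaseDecoder_512_catmlpdpt_metric')
else:
    mast3r_model = AsymmetricMASt3R.from_pretrained(DEFAULT_MAST3R_PATH)
mast3r_model = mast3r_model.to(DEFAULT_DEVICE).eval()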
@@ -304,6 +270,204 @@ def center_mesh(mesh: o3d.geometry.TriangleMesh, normalize=False) -> o3d.geometr
     mesh.vertices = o3d.utility.Vector3dVector(centered_vertices)
     return mesh
 
+def get_transform_json(H, W, focal, poses_all):
+    transform_dict = {
+        'w': W,
+        'h': H,
+        'fl_x': focal.item(),
+        'fl_y': focal.item(),
+        'cx': W/2,
+        'cy': H/2,
+    }
+    frames = []
+
+    for i, pose in enumerate(poses_all):
+        # CV2 GL format
+        pose[:3, 1] *= -1
+        pose[:3, 2] *= -1
+        frame = {
+            'w': W,
+            'h': H,
+            'fl_x': focal.item(),
+            'fl_y': focal.item(),
+            'cx': W/2,
+            'cy': H/2,
+            'file_path': f"images/{i:04d}.jpg",
+            "mask_path": f"masks/{i:04d}.png",
+            'transform_matrix': pose.tolist()
+        }
+        frames.append(frame)
+    transform_dict['frames'] = frames
+
+    return transform_dict
+
+def organize_and_zip_output(images_all, masks_all, transform_json_path, output_dir=None):
+    """
+    Organizes reconstruction outputs into a specific directory structure and creates a zip file.
+
+    Args:
+        images_all: List of numpy arrays containing images
+        masks_all: List of numpy arrays containing masks
+        transform_json_path: Path to the transform.json file
+        output_dir: Optional custom output directory name
+
+    Returns:
+        str: Path to the created zip file
+    """
+    try:
+        # Create temporary directory with timestamp
+        timestamp = time.strftime("%Y%m%d_%H%M%S")
+        base_dir = output_dir or f"reconstruction_{timestamp}"
+        os.makedirs(base_dir, exist_ok=True)
+
+        # Create subdirectories
+        images_dir = os.path.join(base_dir, "images")
+        masks_dir = os.path.join(base_dir, "masks")
+        os.makedirs(images_dir, exist_ok=True)
+        os.makedirs(masks_dir, exist_ok=True)
+
+        # Save images
+        for i, image in enumerate(images_all):
+            image_path = os.path.join(images_dir, f"{i:04d}.jpg")
+            cv2.imwrite(image_path, (image * 255).astype(np.uint8)[..., ::-1], [int(cv2.IMWRITE_JPEG_QUALITY), 90])
+
+        # Save masks
+        for i, mask in enumerate(masks_all):
+            mask_path = os.path.join(masks_dir, f"{i:04d}.png")
+            cv2.imwrite(mask_path, (mask * 255).astype(np.uint8))
+
+        # Copy transform.json
+        shutil.copy2(transform_json_path, os.path.join(base_dir, "transforms.json"))
+
+        # Create zip file
+        zip_path = f"{base_dir}.zip"
+        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+            for root, _, files in os.walk(base_dir):
+                for file in files:
+                    file_path = os.path.join(root, file)
+                    arcname = os.path.relpath(file_path, base_dir)
+                    zipf.write(file_path, arcname)
+
+        return zip_path
+
+    finally:
+        # Clean up temporary directories and files
+        if os.path.exists(base_dir):
+            shutil.rmtree(base_dir)
+        if os.path.exists(transform_json_path):
+            os.remove(transform_json_path)
+
+def get_keyframes(temp_dir: str, kf_every: int = 10):
+    """
+    Select keyframes from a directory of extracted frames at specified intervals
+
+    Args:
+        temp_dir: Directory containing extracted frames (named as 001.jpg, 002.jpg, etc.)
+        kf_every: Select every Nth frame as a keyframe
+
+    Returns:
+        List[str]: Sorted list of paths to selected keyframe images
+    """
+    # Get all jpg files in the directory
+    frame_paths = glob.glob(os.path.join(temp_dir, "*.jpg"))
+
+    # Sort frames by number to ensure correct order
+    frame_paths.sort(key=lambda x: int(Path(x).stem))
+
+    # Select keyframes at specified interval
+    keyframe_paths = frame_paths[::kf_every]
+
+    # Ensure we have at least 2 frames for reconstruction
+    if len(keyframe_paths) < 2:
+        if len(frame_paths) >= 2:
+            # If we have at least 2 frames, use first and last
+            keyframe_paths = [frame_paths[0], frame_paths[-1]]
+        else:
+            raise ValueError(f"Not enough frames found in {temp_dir}. Need at least 2 frames for reconstruction.")
+
+    return keyframe_paths
+
+from mast3r.cloud_opt.sparse_ga import sparse_global_alignment
+from mast3r.cloud_opt.tsdf_optimizer import TSDFPostProcess
+from dust3r.utils.image import load_images
+from dust3r.image_pairs import make_pairs
+from dust3r.utils.device import to_numpy
+def invert_matrix(mat):
+    """Invert a torch or numpy matrix."""
+    if isinstance(mat, torch.Tensor):
+        return torch.linalg.inv(mat)
+    if isinstance(mat, np.ndarray):
+        return np.linalg.inv(mat)
+    raise ValueError(f'Unsupported matrix type: {type(mat)}')
+
+def refine(
+    video_path: str,
+    conf_thresh: float = 5.0,
+    kf_every: int = 30,
+    remove_background: bool = False,
+    enable_registration: bool = True,
+    output_3d_model: bool = True
+) -> dict:
+    # Extract keyframes from video
+    temp_dir = extract_frames(video_path)
+    keyframe_paths = get_keyframes(temp_dir, kf_every*3)
+
+    image_size = 512
+    images = load_images(keyframe_paths, size=image_size)
+
+    # Create output directory
+    output_dir = tempfile.mkdtemp()
+
+    # Generate pairs and run inference
+    pairs = make_pairs(images, scene_graph='complete', prefilter=None, symmetrize=True)
+    cache_dir = os.path.join(output_dir, 'cache')
+    if os.path.exists(cache_dir):
+        os.system(f'rm -rf {cache_dir}')
+    scene = sparse_global_alignment(keyframe_paths, pairs, cache_dir,
+                                    mast3r_model, lr1=0.07, niter1=500, lr2=0.014,
+                                    niter2=200 if enable_registration else 0, device=DEFAULT_DEVICE,
+                                    opt_depth=True if enable_registration else False, shared_intrinsics=True,
+                                    matching_conf_thr=5.)
+
+    # Extract scene information
+    imgs = np.array(scene.imgs)
+
+    tsdf = TSDFPostProcess(scene, TSDF_thresh=0)
+    pts3d, _, confs = tsdf.get_dense_pts3d(clean_depth=True)
+    masks = np.array(to_numpy([c > 1.5 for c in confs]))
+
+    pcds = []
+    for pts, conf_mask, image in zip(pts3d, masks, imgs):
+        if remove_background:
+            mask = generate_mask(image)
+        else:
+            mask = np.ones_like(conf_mask)
+        combined_mask = conf_mask & (mask > 0.5)
+
+        pts = pts.reshape(combined_mask.shape[0], combined_mask.shape[1], 3)
+        pts_normal = pts2normal(pts).cpu().numpy()
+        pts = pts.cpu().numpy()
+        pcd = o3d.geometry.PointCloud()
+        pcd.points = o3d.utility.Vector3dVector(pts[combined_mask] / 5)
+        pcd.colors = o3d.utility.Vector3dVector(image[combined_mask])
+        pcd.normals = o3d.utility.Vector3dVector(pts_normal[combined_mask])
+        pcds.append(pcd)
+
+    pcd_combined = combine_and_clean_point_clouds(pcds, voxel_size=0.001)
+    o3d_geometry = point2mesh(pcd_combined, depth=9)
+    o3d_geometry_centered = center_mesh(o3d_geometry, normalize=True)
+
+    # Create coarse result
+    coarse_output_path = export_geometry(o3d_geometry_centered)
+
+    if output_3d_model:
+        gs_output_path = tempfile.mktemp(suffix='.ply')
+        point2gs(gs_output_path, pcd_combined)
+        return coarse_output_path, [gs_output_path]
+    else:
+        pcd_output_path = export_geometry(pcd_combined, file_format='ply')
+        return coarse_output_path, [pcd_output_path]
+
 @torch.no_grad()
 def reconstruct(video_path, conf_thresh, kf_every,
                 remove_background=False, enable_registration=True, output_3d_model=True):
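Note: the "# CV2 GL format" flip inside get_transform_json is the standard OpenCV-to-OpenGL camera-convention change expected by transforms.json consumers (instant-ngp / nerfstudio style): OpenCV cameras look down +z with +y down, OpenGL/NeRF cameras look down -z with +y up, so the y and z basis vectors of each camera-to-world pose are negated. (Incidentally, refine's `-> dict` annotation does not match the tuple it actually returns.) A self-contained sketch of the same operation; unlike get_transform_json, which flips the pose arrays in place, this copies first:

import numpy as np

def cv_to_gl(pose_c2w):
    # Negate the y and z columns of the camera-to-world rotation.
    out = pose_c2w.copy()
    out[:3, 1] *= -1
    out[:3, 2] *= -1
    return out

pose = np.eye(4)
assert np.allclose(cv_to_gl(cv_to_gl(pose)), pose)  # the flip is an involution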
@@ -329,13 +493,46 @@ def reconstruct(video_path, conf_thresh, kf_every,
 
     # Process results
     pcds = []
+    poses_all = []
     cameras_all = []
+    images_all = []
+    masks_all = []
     last_focal = None
+
+    ##### estimate focal length
+    _, H, W, _ = preds[0]['pts3d'].shape
+    pp = torch.tensor((W/2, H/2))
+    focal = estimate_focal_knowing_depth(preds[0]['pts3d'].cpu(), pp, focal_mode='weiszfeld')
+    print(f'Estimated focal of first camera: {focal.item()} (224x224)')
+
+    intrinsic = np.eye(3)
+    intrinsic[0, 0] = focal
+    intrinsic[1, 1] = focal
+    intrinsic[:2, 2] = pp
+
     for j, view in enumerate(batch):
         image = view['img'].permute(0, 2, 3, 1).cpu().numpy()[0]
         image = (image + 1) / 2
+        mask = view['valid_mask'].cpu().numpy()[0]
         pts = preds[j]['pts3d' if j==0 else 'pts3d_in_other_view'].detach().cpu().numpy()[0]
         pts_normal = pts2normal(preds[j]['pts3d' if j==0 else 'pts3d_in_other_view'][0]).cpu().numpy()
+
+        ##### Solve PnP-RANSAC
+        u, v = np.meshgrid(np.arange(W), np.arange(H))
+        points_2d = np.stack((u, v), axis=-1)
+        dist_coeffs = np.zeros(4).astype(np.float32)
+        success, rotation_vector, translation_vector, inliers = cv2.solvePnPRansac(
+            pts.reshape(-1, 3).astype(np.float32),
+            points_2d.reshape(-1, 2).astype(np.float32),
+            intrinsic.astype(np.float32),
+            dist_coeffs)
+
+        rotation_matrix, _ = cv2.Rodrigues(rotation_vector)
+        # Extrinsic parameters (4x4 matrix)
+        extrinsic_matrix = np.hstack((rotation_matrix, translation_vector.reshape(-1, 1)))
+        extrinsic_matrix = np.vstack((extrinsic_matrix, [0, 0, 0, 1]))
+        poses_all.append(np.linalg.inv(extrinsic_matrix))
+
         conf = preds[j]['conf'][0].cpu().data.numpy()
         conf_sig = (conf - 1) / conf
         if remove_background:
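Note: the block above recovers each camera as a classic PnP problem: every pixel (u, v) is paired with its predicted 3D point, cv2.solvePnPRansac estimates the world-to-camera pose under the shared intrinsics from estimate_focal_knowing_depth, and inverting the extrinsic matrix gives camera-to-world. A self-contained sanity check of that recipe on synthetic data (all values illustrative):

import cv2
import numpy as np

focal, W, H = 500.0, 640, 480
K = np.array([[focal, 0, W / 2], [0, focal, H / 2], [0, 0, 1]], dtype=np.float32)

rng = np.random.default_rng(0)
pts3d = rng.uniform([-1, -1, 4], [1, 1, 8], size=(500, 3)).astype(np.float32)
rvec_gt = np.array([0.1, -0.2, 0.05], dtype=np.float32)  # ground-truth rotation (Rodrigues)
tvec_gt = np.array([0.3, -0.1, 0.5], dtype=np.float32)   # ground-truth translation

# Project with the known pose, then recover it the same way reconstruct() does.
pts2d, _ = cv2.projectPoints(pts3d, rvec_gt, tvec_gt, K, None)
ok, rvec, tvec, inliers = cv2.solvePnPRansac(
    pts3d, pts2d.reshape(-1, 2), K, np.zeros(4, np.float32))
assert ok and np.allclose(rvec.ravel(), rvec_gt, atol=1e-3)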
@@ -353,9 +550,15 @@ def reconstruct(video_path, conf_thresh, kf_every,
         pcd.colors = o3d.utility.Vector3dVector(image[combined_mask])
         pcd.normals = o3d.utility.Vector3dVector(pts_normal[combined_mask])
         pcds.append(pcd)
+        images_all.append(image)
+        masks_all.append(mask)
         cameras_all.append(camera)
-
 
+    transform_dict = get_transform_json(H, W, focal, poses_all)
+    temp_json_file = tempfile.mktemp(suffix='.json')
+    with open(os.path.join(temp_json_file), 'w') as f:
+        json.dump(transform_dict, f, indent=4)
+
     pcd_combined = combine_and_clean_point_clouds(pcds, voxel_size=0.001)
     o3d_geometry = point2mesh(pcd_combined)
     o3d_geometry_centered = center_mesh(o3d_geometry, normalize=True)
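Note: the dumped dictionary follows the instant-ngp / nerfstudio transforms.json layout (here written to a random tempfile name; organize_and_zip_output renames it to transforms.json inside the archive). A minimal reader sketch for a downstream trainer:

import json
import numpy as np

with open('transforms.json') as f:  # path as it appears inside the zip
    meta = json.load(f)

K = np.array([[meta['fl_x'], 0, meta['cx']],
              [0, meta['fl_y'], meta['cy']],
              [0, 0, 1]])
for frame in meta['frames']:
    c2w = np.array(frame['transform_matrix'])  # 4x4 camera-to-world, GL convention
    print(frame['file_path'], c2w[:3, 3])      # image path and camera center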
@@ -367,17 +570,14 @@ def reconstruct(video_path, conf_thresh, kf_every,
         pcd_combined, _, _ = improved_multiway_registration(pcds, voxel_size=0.01)
         pcd_combined = center_pcd(pcd_combined)
 
+    # zip_path = organize_and_zip_output(images_all, masks_all, temp_json_file)
     if output_3d_model:
         gs_output_path = tempfile.mktemp(suffix='.ply')
         point2gs(gs_output_path, pcd_combined)
-
-        return coarse_output_path, gs_output_path
+        return coarse_output_path, [gs_output_path]
     else:
         pcd_output_path = export_geometry(pcd_combined, file_format='ply')
-        return coarse_output_path, pcd_output_path
-
-    # Clean up temporary directory
-    os.system(f"rm -rf {demo_path}")
 
 example_videos = [os.path.join('./examples', f) for f in os.listdir('./examples') if f.endswith(('.mp4', '.webm'))]
 
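Note: the organize_and_zip_output call stays commented out in this commit, so the images, masks, and transforms.json bundle is assembled but never exported. If it were enabled, usage would be a single call; keep in mind the helper's finally block removes both the staging directory and the JSON even on success, so only the returned zip survives:

# Hypothetical wiring (the commit leaves this line commented out):
zip_path = organize_and_zip_output(images_all, masks_all, temp_json_file)
print('NeRF-style dataset archived at', zip_path)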
@@ -461,6 +661,7 @@ with gr.Blocks(
                 info="Generate Splat (PLY) instead of Point Cloud (PLY)"
             )
             reconstruct_btn = gr.Button("Start Reconstruction")
+            refine_btn = gr.Button("Start Refinement")
 
         with gr.Column(scale=2):
             with gr.Tab("3D Models"):
@@ -472,10 +673,8 @@ with gr.Blocks(
                     )
 
                 with gr.Group():
-                    output_model = gr.
-                        label="Reconstructed
-                        display_mode="solid",
-                        clear_color=[0.0, 0.0, 0.0, 0.0]
+                    output_model = gr.File(
+                        label="Reconstructed Results",
                     )
 
                 Examples(
@@ -495,6 +694,13 @@ with gr.Blocks(
         inputs=[video_input, conf_thresh, kf_every, remove_background, enable_registration, output_3d_model],
         outputs=[initial_model, output_model]
     )
+
+    refine_btn.click(
+        fn=refine,
+        inputs=[video_input, conf_thresh, kf_every, remove_background, enable_registration, output_3d_model],
+        outputs=[initial_model, output_model]
+    )
+
 
 if __name__ == "__main__":
     iface.launch(server_name="0.0.0.0")
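Note: output_model switched from a 3D viewer to gr.File, which is why both reconstruct and refine now return the second output wrapped in a list, and refine is wired to the same inputs and outputs as reconstruct. A minimal sketch of the pattern, with illustrative component arguments (whether the Space sets file_count is not shown in this diff):

import gradio as gr

def handler(video_path):
    # ... run reconstruction, produce a viewer mesh and downloadable artifacts ...
    return 'coarse_mesh.obj', ['result.ply']  # viewer path, file list for gr.File

with gr.Blocks() as demo:
    video = gr.Video()
    viewer = gr.Model3D(label="Coarse Model")
    files = gr.File(label="Reconstructed Results", file_count="multiple")
    gr.Button("Run").click(handler, inputs=[video], outputs=[viewer, files])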