pablovela5620 committed
Commit a8c8616
Parent: 899c526

chore: Update dependencies and remove unused files

Files changed (11):
  1. .gitignore +4 -3
  2. README.md +1 -1
  3. mini_dpvo/api/inference.py +182 -40
  4. mini_dpvo/dpvo.py +1 -0
  5. mini_dpvo/stream.py +31 -32
  6. packages.txt +0 -1
  7. pixi.lock +0 -0
  8. pixi.toml +37 -11
  9. pixi_app.py +0 -14
  10. tools/app.py +202 -101
  11. tools/demo.py +9 -11
.gitignore CHANGED
@@ -164,6 +164,7 @@ cython_debug/
 # pixi environments
 .pixi
 *.egg-info
-thirdparty/*
-data/*
-checkpoints/*
+checkpoints/
+media/
+data/
+build/

README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🔥
 colorFrom: purple
 colorTo: pink
 sdk: gradio
-sdk_version: 4.31.5
+sdk_version: 4.36.1
 app_file: app.py
 pinned: false
 ---

mini_dpvo/api/inference.py CHANGED
@@ -16,6 +16,12 @@ from dataclasses import dataclass
 
 from timeit import default_timer as timer
 
+import cv2
+import mmcv
+from tqdm import tqdm
+from mini_dust3r.api import OptimizedResult, inferece_dust3r
+from mini_dust3r.model import AsymmetricCroCo3DStereo
+
 
 @dataclass
 class DPVOPrediction:
@@ -27,14 +33,20 @@ class DPVOPrediction:
 
 def log_trajectory(
     parent_log_path: Path,
-    poses: Float32[torch.Tensor, "buffer_size 7"],
-    points: Float32[torch.Tensor, "buffer_size*num_patches 3"],
-    colors: UInt8[torch.Tensor, "buffer_size num_patches 3"],
+    poses: Float32[torch.Tensor, "buffer_size 7"],  # noqa: F722
+    points: Float32[torch.Tensor, "buffer_size*num_patches 3"],  # noqa: F722
+    colors: UInt8[torch.Tensor, "buffer_size num_patches 3"],  # noqa: F722
     intri_np: Float64[np.ndarray, "4"],
-    bgr_hw3: UInt8[np.ndarray, "h w 3"],
+    bgr_hw3: UInt8[np.ndarray, "h w 3"],  # noqa: F722
+    path_list: list,
+    jpg_quality: int = 90,
 ):
     cam_log_path = f"{parent_log_path}/camera"
-    rr.log(f"{cam_log_path}/pinhole/image", rr.Image(bgr_hw3[..., ::-1]))
+    rgb_hw3 = mmcv.bgr2rgb(bgr_hw3)
+    rr.log(
+        f"{cam_log_path}/pinhole/image",
+        rr.Image(rgb_hw3).compress(jpeg_quality=jpg_quality),
+    )
     rr.log(
         f"{cam_log_path}/pinhole",
         rr.Pinhole(
@@ -54,18 +66,45 @@ def log_trajectory(
     last_index = nonzero_poses.shape[0] - 1
     # get last non-zero pose, and the index of the last non-zero pose
     quat_pose = nonzero_poses[last_index].numpy(force=True)
-    trans_quat = quat_pose[:3]
+    trans_quat: Float32[np.ndarray, "3"] = quat_pose[:3]
    rotation_quat = Rotation.from_quat(quat_pose[3:])
 
-    mat3x3 = rotation_quat.as_matrix()
+    cam_R_world: Float64[np.ndarray, "3 3"] = rotation_quat.as_matrix()
+
+    cam_T_world = np.eye(4)
+    cam_T_world[:3, :3] = cam_R_world
+    cam_T_world[0:3, 3] = trans_quat
+
+    world_T_cam = np.linalg.inv(cam_T_world)
+
+    path_list.append(world_T_cam[:3, 3].copy().tolist())
+
     rr.log(
         f"{cam_log_path}",
-        rr.Transform3D(translation=trans_quat, mat3x3=mat3x3, from_parent=True),
+        rr.Transform3D(
+            translation=world_T_cam[:3, 3],
+            mat3x3=world_T_cam[:3, :3],
+            from_parent=False,
+        ),
+    )
+
+    # log path using linestrip
+    rr.log(
+        f"{parent_log_path}/path",
+        rr.LineStrips3D(
+            strips=[
+                path_list,
+            ],
+            colors=[255, 0, 0],
+        ),
     )
 
     # outlier removal
     trajectory_center = np.median(nonzero_poses[:, :3].numpy(force=True), axis=0)
-    radii = lambda a: np.linalg.norm(a - trajectory_center, axis=1)
+
+    def radii(a):
+        return np.linalg.norm(a - trajectory_center, axis=1)
+
     points_np = nonzero_points.view(-1, 3).numpy(force=True)
     colors_np = colors.view(-1, 3)[points_mask].numpy(force=True)
     inlier_mask = (
@@ -82,6 +121,7 @@ def log_trajectory(
             colors=colors_filtered,
         ),
     )
+    return path_list
 
 
 def log_final(
@@ -102,7 +142,7 @@ def log_final(
 
 
 def create_reader(
-    imagedir: str, calib: str, stride: int, skip: int, queue: Queue
+    imagedir: str, calib: str | None, stride: int, skip: int, queue: Queue
 ) -> Process:
     if os.path.isdir(imagedir):
         reader = Process(
@@ -116,56 +156,158 @@ def create_reader(
     return reader
 
 
+def calculate_num_frames(video_or_image_dir: str, stride: int, skip: int) -> int:
+    # Determine the total number of frames
+    total_frames = 0
+    if os.path.isdir(video_or_image_dir):
+        total_frames = len(
+            [
+                name
+                for name in os.listdir(video_or_image_dir)
+                if os.path.isfile(os.path.join(video_or_image_dir, name))
+            ]
+        )
+    else:
+        cap = cv2.VideoCapture(video_or_image_dir)
+        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        cap.release()
+
+    total_frames = (total_frames - skip) // stride
+    return total_frames
+
+
+def calib_from_dust3r(
+    bgr_hw3: UInt8[np.ndarray, "height width 3"],
+    model: AsymmetricCroCo3DStereo,
+    device: str,
+) -> Float64[np.ndarray, "3 3"]:
+    """
+    Calculates the calibration matrix from mini-dust3r.
+
+    Args:
+        bgr_hw3: The input image in BGR format with shape (height, width, 3).
+        model: The DUSt3R model used for inference.
+        device: The device to run the inference on.
+
+    Returns:
+        The calibration matrix with shape (3, 3).
+    """
+    tmp_path = Path("/tmp/dpvo/tmp.png")
+    # save image
+    mmcv.imwrite(bgr_hw3, str(tmp_path))
+    optimized_results: OptimizedResult = inferece_dust3r(
+        image_dir_or_list=tmp_path.parent,
+        model=model,
+        device=device,
+        batch_size=1,
+    )
+    # delete tmp file
+    tmp_path.unlink()
+
+    # get predicted intrinsics in original image size
+    downscaled_h, downscaled_w, _ = optimized_results.rgb_hw3_list[0].shape
+    orig_h, orig_w, _ = bgr_hw3.shape
+
+    # Scaling factors
+    scaling_factor_x = orig_w / downscaled_w
+    scaling_factor_y = orig_h / downscaled_h
+
+    # Scale the intrinsic matrix to the original image size
+    K_33_original = optimized_results.K_b33[0].copy()
+    K_33_original[0, 0] *= scaling_factor_x  # fx
+    K_33_original[1, 1] *= scaling_factor_y  # fy
+    K_33_original[0, 2] *= scaling_factor_x  # cx
+    K_33_original[1, 2] *= scaling_factor_y  # cy
+
+    return K_33_original
+
+
 @torch.no_grad()
-def run(
+def inference_dpvo(
     cfg: CfgNode,
     network_path: str,
     imagedir: str,
     calib: str,
     stride: int = 1,
     skip: int = 0,
-    vis_during: bool = True,
     timeit: bool = False,
 ) -> tuple[DPVOPrediction, float]:
     slam = None
     queue = Queue(maxsize=8)
+
     reader: Process = create_reader(imagedir, calib, stride, skip, queue)
     reader.start()
 
-    if vis_during:
-        parent_log_path = Path("world")
-        rr.log(f"{parent_log_path}", rr.ViewCoordinates.RDF, timeless=True)
+    parent_log_path = Path("world")
+    rr.log(f"{parent_log_path}", rr.ViewCoordinates.RDF, timeless=True)
 
     start = timer()
-
-    while True:
-        t: int
-        bgr_hw3: UInt8[np.ndarray, "h w 3"]
-        intri_np: Float64[np.ndarray, "4"]
-        (t, bgr_hw3, intri_np) = queue.get()
-        # queue will have a (-1, image, intrinsics) tuple when the reader is done
-        if t < 0:
-            break
-
-        if vis_during:
-            rr.set_time_sequence(timeline="timestep", sequence=t)
-
-        bgr_3hw: UInt8[torch.Tensor, "h w 3"] = (
-            torch.from_numpy(bgr_hw3).permute(2, 0, 1).cuda()
-        )
-        intri_torch: Float64[torch.Tensor, "4"] = torch.from_numpy(intri_np).cuda()
-
-        if slam is None:
-            slam = DPVO(cfg, network_path, ht=bgr_3hw.shape[1], wd=bgr_3hw.shape[2])
-
-        with Timer("SLAM", enabled=timeit):
-            slam(t, bgr_3hw, intri_torch)
-
-        if slam.is_initialized and vis_during:
-            poses: Float32[torch.Tensor, "buffer_size 7"] = slam.poses_
-            points: Float32[torch.Tensor, "buffer_size*num_patches 3"] = slam.points_
-            colors: UInt8[torch.Tensor, "buffer_size num_patches 3"] = slam.colors_
-            log_trajectory(parent_log_path, poses, points, colors, intri_np, bgr_hw3)
+    total_frames = calculate_num_frames(imagedir, stride, skip)
+
+    # estimate camera intrinsics if not provided
+    if calib is None:
+        dust3r_device = (
+            "mps"
+            if torch.backends.mps.is_available()
+            else "cuda"
+            if torch.cuda.is_available()
+            else "cpu"
+        )
+        dust3r_model = AsymmetricCroCo3DStereo.from_pretrained(
+            "naver/DUSt3R_ViTLarge_BaseDecoder_512_dpt"
+        ).to(dust3r_device)
+        _, bgr_hw3, _ = queue.get()
+        K_33_pred = calib_from_dust3r(bgr_hw3, dust3r_model, dust3r_device)
+        intri_np_dust3r = np.array(
+            [K_33_pred[0, 0], K_33_pred[1, 1], K_33_pred[0, 2], K_33_pred[1, 2]]
+        )
+
+    # path list for visualizing the trajectory
+    path_list = []
 
+    with tqdm(total=total_frames, desc="Processing Frames") as pbar:
+        while True:
+            t: int
+            bgr_hw3: UInt8[np.ndarray, "h w 3"]
+            intri_np: Float64[np.ndarray, "4"]
+            (t, bgr_hw3, intri_np_calib) = queue.get()
+            intri_np = intri_np_calib if calib is not None else intri_np_dust3r
+            # queue will have a (-1, image, intrinsics) tuple when the reader is done
+            if t < 0:
+                break
+
+            rr.set_time_sequence(timeline="timestep", sequence=t)
+
+            bgr_3hw: UInt8[torch.Tensor, "h w 3"] = (
+                torch.from_numpy(bgr_hw3).permute(2, 0, 1).cuda()
+            )
+            intri_torch: Float64[torch.Tensor, "4"] = torch.from_numpy(intri_np).cuda()
+
+            if slam is None:
+                slam = DPVO(cfg, network_path, ht=bgr_3hw.shape[1], wd=bgr_3hw.shape[2])
+
+            with Timer("SLAM", enabled=timeit):
+                slam(t, bgr_3hw, intri_torch)
+
+            if slam.is_initialized:
+                poses: Float32[torch.Tensor, "buffer_size 7"] = slam.poses_
+                points: Float32[torch.Tensor, "buffer_size*num_patches 3"] = (
+                    slam.points_
+                )
+                colors: UInt8[torch.Tensor, "buffer_size num_patches 3"] = slam.colors_
+                path_list = log_trajectory(
+                    parent_log_path=parent_log_path,
+                    poses=poses,
+                    points=points,
+                    colors=colors,
+                    intri_np=intri_np,
+                    bgr_hw3=bgr_hw3,
+                    path_list=path_list,
+                )
+            pbar.update(1)
 
     for _ in range(12):
         slam.update()
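
Taken together, the changes in this file rename `run` to `inference_dpvo`, drop the `vis_during` flag (Rerun logging now always happens), allow `calib=None` (intrinsics are then estimated from the first queued frame via `calib_from_dust3r`), and make `log_trajectory` return the accumulated `path_list` so the trajectory can be drawn as a red line strip. A minimal driver for the new entry point could look like the sketch below; the checkpoint and video paths are borrowed from this commit's pixi tasks and are assumptions about the local checkout, not part of the diff:

    # Sketch only: exercising the renamed inference_dpvo() with dust3r calibration.
    import rerun as rr
    from mini_dpvo.config import cfg as base_cfg
    from mini_dpvo.api.inference import inference_dpvo

    rr.init("mini_dpvo", spawn=True)  # open a viewer to watch the trajectory live

    base_cfg.merge_from_file("config/fast.yaml")

    # calib=None triggers intrinsics estimation on the first frame
    dpvo_pred, elapsed = inference_dpvo(
        cfg=base_cfg,
        network_path="checkpoints/dpvo.pth",  # fetched by the download-model task
        imagedir="data/movies/IMG_0493.MOV",  # fetched by download-dpvo-data
        calib=None,
        stride=4,
    )
    print(f"finished in {elapsed:.2f}s")
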
mini_dpvo/dpvo.py CHANGED
@@ -156,6 +156,7 @@ class DPVO:
         poses = lietorch.stack(poses, dim=0)
         poses = poses.inv().data.cpu().numpy()
         tstamps = np.array(self.tlist, dtype=np.float64)
+        print("Done!")
 
         return poses, tstamps
 

mini_dpvo/stream.py CHANGED
@@ -3,13 +3,10 @@ import numpy as np
 from pathlib import Path
 from itertools import chain
 from multiprocessing import Queue
+import mmcv
 
 
-def image_stream(
-    queue: Queue, imagedir: str, calib: str, stride: int, skip: int = 0
-) -> None:
-    """image generator"""
-
+def load_calib(calib: str) -> np.ndarray:
     calib = np.loadtxt(calib, delimiter=" ")
     fx, fy, cx, cy = calib[:4]
 
@@ -18,6 +15,17 @@ def image_stream(
     K[0, 2] = cx
     K[1, 1] = fy
     K[1, 2] = cy
+    return K, calib
+
+
+def image_stream(
+    queue: Queue, imagedir: str, calib: str | None, stride: int, skip: int = 0
+) -> None:
+    """image generator"""
+
+    if calib is not None:
+        K, calib = load_calib(calib)
+        fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
 
     img_exts = ["*.png", "*.jpeg", "*.jpg"]
     image_list = sorted(chain.from_iterable(Path(imagedir).glob(e) for e in img_exts))[
@@ -26,15 +34,11 @@ def image_stream(
 
     for t, imfile in enumerate(image_list):
         image = cv2.imread(str(imfile))
-        if len(calib) > 4:
-            image = cv2.undistort(image, K, calib[4:])
-
-        if 0:
-            image = cv2.resize(image, None, fx=0.5, fy=0.5)
-            intrinsics = np.array([fx / 2, fy / 2, cx / 2, cy / 2])
 
-        else:
+        if calib is not None:
             intrinsics = np.array([fx, fy, cx, cy])
+        else:
+            intrinsics = None
 
         h, w, _ = image.shape
         image = image[: h - h % 16, : w - w % 16]
@@ -45,48 +49,43 @@ def image_stream(
 
 
 def video_stream(
-    queue: Queue, imagedir: str, calib: str, stride: int, skip: int = 0
+    queue: Queue, imagedir: str, calib: str | None, stride: int, skip: int = 0
 ) -> None:
     """video generator"""
+    if calib is not None:
+        K, calib = load_calib(calib)
+        fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]
 
-    calib = np.loadtxt(calib, delimiter=" ")
-    fx, fy, cx, cy = calib[:4]
-
-    K = np.eye(3)
-    K[0, 0] = fx
-    K[0, 2] = cx
-    K[1, 1] = fy
-    K[1, 2] = cy
-
-    cap = cv2.VideoCapture(imagedir)
+    video_reader = mmcv.VideoReader(imagedir)
 
     t = 0
 
     for _ in range(skip):
-        ret, image = cap.read()
+        image = video_reader.read()
 
     while True:
         # Capture frame-by-frame
         for _ in range(stride):
-            ret, image = cap.read()
-            # if frame is read correctly ret is True
-            if not ret:
+            image = video_reader.read()
+            if image is None:
                 break
 
-        if not ret:
+        if image is None:
             break
 
-        if len(calib) > 4:
-            image = cv2.undistort(image, K, calib[4:])
+        # if len(calib) > 4:
+        #     image = cv2.undistort(image, K, calib[4:])
 
         image = cv2.resize(image, None, fx=0.5, fy=0.5, interpolation=cv2.INTER_AREA)
         h, w, _ = image.shape
         image = image[: h - h % 16, : w - w % 16]
 
-        intrinsics = np.array([fx * 0.5, fy * 0.5, cx * 0.5, cy * 0.5])
+        if calib is not None:
+            intrinsics = np.array([fx * 0.5, fy * 0.5, cx * 0.5, cy * 0.5])
+        else:
+            intrinsics = None
        queue.put((t, image, intrinsics))
 
         t += 1
 
     queue.put((-1, image, intrinsics))
-    cap.release()
packages.txt DELETED
@@ -1 +0,0 @@
-lsof

pixi.lock CHANGED
The diff for this file is too large to render.

pixi.toml CHANGED
@@ -1,6 +1,6 @@
 [project]
 name = "mini-dpvo"
-version = "0.1.1"
+version = "0.1.0"
 description = "Add a short description here"
 authors = ["pablovela5620 <pablovela5620@gmail.com>"]
 channels = ["nvidia/label/cuda-11.8.0", "nvidia", "conda-forge", "pytorch", "pyg"]
@@ -15,16 +15,40 @@ download-model = """
 || (
 wget https://www.dropbox.com/s/nap0u8zslspdwm4/models.zip
 && unzip models.zip -d checkpoints
+&& rm -r models.zip
 )
 """
-post-install = {cmd="python -m pip install -e .", depends_on=["download-model"]}
-old-app = "python pixi_app.py"
-app = {cmd="python tools/app.py", depends_on=["post-install"], outputs=["mini_dpvo.egg-info/PKG-INFO"]}
+download-dpvo-data = """
+test -e data/movies/IMG_0492.MOV
+|| (
+    wget https://www.dropbox.com/s/7030y0mdl6efteg/movies.zip -P data/
+    && unzip data/movies.zip -d data/
+    && rm -r data/movies.zip
+)
+"""
+
+download-iphone-data = """
+test -e data/iphone/pool.MOV
+|| (
+    huggingface-cli download pablovela5620/dpvo-example-data pool.MOV --repo-type dataset --local-dir data/iphone/
+)
+"""
 
+post-install = {cmd="python -m pip install -e .", depends_on=["download-model", "download-dpvo-data", "download-iphone-data"], outputs=["cuda_ba.cpython-311-x86_64-linux-gnu.so"]}
+rr-viewer = "rerun --memory-limit 50% --drop-at-latency 500ms"
+
+demo = """
+python tools/demo.py --imagedir data/movies/IMG_0493.MOV --config config/fast.yaml
+"""
+app = {cmd="python tools/app.py", depends_on=["post-install"]}
+
+# Docker tasks
+docker-build = "docker build --no-cache -t mini-dpvo ."
+docker-run = {cmd="docker run --gpus all -it -p 7860:7860 mini-dpvo", depends_on=["docker-build"]}
 
 [dependencies]
 python = "3.11.*"
+pip = ">=24.0,<25"
 cuda = {version = "*", channel="nvidia/label/cuda-11.8.0"}
 pytorch-cuda = {version = "11.8.*", channel="pytorch"}
 pytorch = {version = ">=2.2.0,<2.3", channel="pytorch"}
@@ -35,14 +59,16 @@ matplotlib = ">=3.8.4,<3.9"
 yacs = ">=0.1.8,<0.2"
 jaxtyping = ">=0.2.28,<0.3"
 icecream = ">=2.1.3,<2.2"
-rerun-sdk = "0.15.*"
-gradio = "4.31.2.*"
 eigen = ">=3.4.0,<3.5"
+rerun-sdk = ">=0.16.1"
+tyro = ">=0.8.4,<0.9"
+unzip = ">=6.0,<7"
 
 [pypi-dependencies]
-mini-dust3r = "*"
-spaces = "==0.28.3"
 opencv-python = ">=4.9.0.80"
 evo = ">=1.28.0"
-gradio-rerun = "*"
-mmcv = "*"
+mini-dust3r = "*"
+gradio-rerun = ">=0.0.3"
+mmcv = "*"
+yt-dlp = "*"
+gradio = ">=4.36.0"

pixi_app.py DELETED
@@ -1,14 +0,0 @@
-import gradio as gr
-import spaces
-import torch
-
-tensor = torch.zeros(3).cuda()
-print(tensor.device)
-
-@spaces.GPU
-def greet(name):
-    print(tensor.device)
-    return "Hello pixi pablo " + name + "!!"
-
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

tools/app.py CHANGED
@@ -1,89 +1,140 @@
 import gradio as gr
 
-# import spaces
 from gradio_rerun import Rerun
 import rerun as rr
 import rerun.blueprint as rrb
-from pathlib import Path
-import uuid
 import mmcv
-import spaces
+from timeit import default_timer as timer
+from typing import Literal
 
-from mini_dpvo.api.inference import run
 from mini_dpvo.config import cfg as base_cfg
+from mini_dpvo.api.inference import (
+    log_trajectory,
+    calib_from_dust3r,
+    create_reader,
+    calculate_num_frames,
+)
+
+import torch
+import numpy as np
+from pathlib import Path
+from multiprocessing import Process, Queue
+from mini_dpvo.dpvo import DPVO
+from jaxtyping import UInt8, Float64, Float32
+from mini_dust3r.model import AsymmetricCroCo3DStereo
+from tqdm import tqdm
+import tyro
+from dataclasses import dataclass
+
+if gr.NO_RELOAD:
+    NETWORK_PATH = "checkpoints/dpvo.pth"
+    DEVICE = (
+        "mps"
+        if torch.backends.mps.is_available()
+        else "cuda"
+        if torch.cuda.is_available()
+        else "cpu"
+    )
+    MODEL = AsymmetricCroCo3DStereo.from_pretrained(
+        "naver/DUSt3R_ViTLarge_BaseDecoder_512_dpt"
+    ).to(DEVICE)
+
+
+@dataclass
+class GradioDPVOConfig:
+    share: bool = False
+    port: int = 7860
+    server_name: str = "0.0.0.0"
+
+
+@rr.thread_local_stream("mini_dpvo")
+@torch.no_grad()
+def run_dpvo(
+    video_file_path: str,
+    jpg_quality: str,
+    stride: int = 1,
+    skip: int = 0,
+    config_type: Literal["accurate", "fast"] = "accurate",
+    progress=gr.Progress(),
+):
+    # create a stream to send data back to the rerun viewer
+    stream = rr.binary_stream()
+    parent_log_path = Path("world")
+    rr.log(f"{parent_log_path}", rr.ViewCoordinates.RDF, timeless=True)
+
+    blueprint = rrb.Blueprint(
+        collapse_panels=True,
+    )
 
-base_cfg.merge_from_file("config/fast.yaml")
-base_cfg.BUFFER_SIZE = 2048
-
+    rr.send_blueprint(blueprint)
 
-def create_blueprint(image_name_list: list[str], log_path: Path) -> rrb.Blueprint:
-    # dont show 2d views if there are more than 4 images as to not clutter the view
-    if len(image_name_list) > 4:
-        blueprint = rrb.Blueprint(
-            rrb.Horizontal(
-                rrb.Spatial3DView(origin=f"{log_path}"),
-            ),
-            collapse_panels=True,
-        )
+    if config_type == "accurate":
+        base_cfg.merge_from_file("config/default.yaml")
+    elif config_type == "fast":
+        base_cfg.merge_from_file("config/fast.yaml")
     else:
-        blueprint = rrb.Blueprint(
-            rrb.Horizontal(
-                contents=[
-                    rrb.Spatial3DView(origin=f"{log_path}"),
-                    rrb.Vertical(
-                        contents=[
-                            rrb.Spatial2DView(
-                                origin=f"{log_path}/camera_{i}/pinhole/",
-                                contents=[
-                                    "+ $origin/**",
-                                ],
-                            )
-                            for i in range(len(image_name_list))
-                        ]
-                    ),
-                ],
-                column_shares=[3, 1],
-            ),
-            collapse_panels=True,
-        )
-    return blueprint
+        raise ValueError("Invalid config type")
+    base_cfg.BUFFER_SIZE = 2048
+
+    slam = None
+    start_time = timer()
+    queue = Queue(maxsize=8)
+
+    reader: Process = create_reader(video_file_path, None, stride, skip, queue)
+    reader.start()
+
+    # get the first frame
+    progress(progress=0.1, desc="Estimating Camera Intrinsics")
+    _, bgr_hw3, _ = queue.get()
+    K_33_pred = calib_from_dust3r(bgr_hw3, MODEL, DEVICE)
+    intri_np: Float64[np.ndarray, "4"] = np.array(
+        [K_33_pred[0, 0], K_33_pred[1, 1], K_33_pred[0, 2], K_33_pred[1, 2]]
+    )
 
+    num_frames = calculate_num_frames(video_file_path, stride, skip)
+    path_list = []
 
-@spaces.GPU
-def predict(video_file_path: str, stride: int) -> tuple[str, str]:
-    # check if is list or string and if not raise error
-    if not isinstance(video_file_path, str):
-        raise gr.Error(
-            f"Something is wrong with your input video, got: {type(video_file_path)}"
-        )
-
-    uuid_str = str(uuid.uuid4())
-    filename = Path(f"/tmp/gradio/{uuid_str}.rrd")
-    if not filename.parent.exists():
-        filename.parent.mkdir(parents=True)
-    rr.init(f"{uuid_str}")
-
-    calib_path = "data/calib/iphone.txt"
-    if not Path(calib_path).exists():
-        gr.Error(f"Calibration file not found at {calib_path}")
-
-    dpvo_pred, time_taken = run(
-        cfg=base_cfg,
-        network_path="checkpoints/dpvo.pth",
-        imagedir=video_file_path,
-        calib="data/calib/iphone.txt",
-        stride=stride,
-        skip=0,
-        vis_during=True,
-    )
-
-    # blueprint: rrb.Blueprint = create_blueprint(image_name_list, log_path)
-    # rr.send_blueprint(blueprint)
-
-    rr.set_time_sequence("sequence", 0)
-    # log_optimized_result(optimized_results, log_path)
-    rr.save(filename.as_posix())
-    return filename.as_posix(), f"Total time: {time_taken:.2f}s"
+    with tqdm(total=num_frames, desc="Processing Frames") as pbar:
+        while True:
+            timestep: int
+            bgr_hw3: UInt8[np.ndarray, "h w 3"]
+            intri_np: Float64[np.ndarray, "4"]
+            (timestep, bgr_hw3, _) = queue.get()
+            # queue will have a (-1, image, intrinsics) tuple when the reader is done
+            if timestep < 0:
+                break
+
+            rr.set_time_sequence(timeline="timestep", sequence=timestep)
+
+            bgr_3hw: UInt8[torch.Tensor, "h w 3"] = (
+                torch.from_numpy(bgr_hw3).permute(2, 0, 1).cuda()
+            )
+            intri_torch: Float64[torch.Tensor, "4"] = torch.from_numpy(intri_np).cuda()
+
+            if slam is None:
+                _, h, w = bgr_3hw.shape
+                slam = DPVO(base_cfg, NETWORK_PATH, ht=h, wd=w)
+
+            slam(timestep, bgr_3hw, intri_torch)
+            pbar.update(1)
+
+            if slam.is_initialized:
+                poses: Float32[torch.Tensor, "buffer_size 7"] = slam.poses_
+                points: Float32[torch.Tensor, "buffer_size*num_patches 3"] = (
+                    slam.points_
+                )
+                colors: UInt8[torch.Tensor, "buffer_size num_patches 3"] = slam.colors_
+                path_list = log_trajectory(
+                    parent_log_path,
+                    poses,
+                    points,
+                    colors,
+                    intri_np,
+                    bgr_hw3,
+                    path_list,
+                    jpg_quality,
+                )
+            yield stream.read(), timer() - start_time
 
 
 def on_file_upload(video_file_path: str) -> None:
@@ -96,26 +147,23 @@ def on_file_upload(video_file_path: str) -> None:
     return video_info
 
 
-with gr.Blocks(
-    css=""".gradio-container {margin: 0 !important; min-width: 100%};""",
-    title="Mini-DPVO Demo",
-) as demo:
-    # scene state is saved so that you can change conf_thr, cam_size... without rerunning the inference
-    gr.HTML('<h2 style="text-align: center;">Mini-DPVO Demo</h2>')
-    gr.HTML(
-        '<p style="text-align: center;">Unofficial DPVO demo using the mini-dpvo pip package</p>'
-    )
-    gr.HTML(
-        '<p style="text-align: center;">Learn more about mini-dpvo <a href="https://github.com/pablovela5620/mini-dpvo">here</a></p>'
-    )
-    with gr.Tab(label="Video Input"):
+def main(gradio_config: GradioDPVOConfig):
+    with gr.Blocks(
+        css=""".gradio-container {margin: 0 !important; min-width: 100%};""",
+        title="Mini-DPVO Demo",
+    ) as demo:
+        # scene state is saved so that you can change conf_thr, cam_size... without rerunning the inference
+        gr.HTML('<h2 style="text-align: center;">Mini-DPVO Demo</h2>')
+        gr.HTML(
+            '<p style="text-align: center;">Unofficial DPVO demo using mini-dpvo. Learn more about mini-dpvo <a href="https://github.com/pablovela5620/mini-dpvo">here</a>.</p>'
+        )
         with gr.Column():
             with gr.Row():
                 video_input = gr.File(
-                    height=300,
+                    height=100,
                     file_count="single",
-                    file_types=[".mp4", ".mov"],
-                    label="Video",
+                    file_types=[".mp4", ".mov", ".MOV", ".webm"],
+                    label="Video File",
                 )
                 with gr.Column():
                     video_info = gr.Markdown(
@@ -123,26 +171,79 @@ with gr.Blocks(
                     **Video Info:**
                     """
                 )
-                time_taken = gr.Textbox(label="Time Taken")
+                time_taken = gr.Number(
+                    label="Time Taken (s)", precision=2, interactive=False
+                )
         with gr.Accordion(label="Advanced", open=False):
-            stride = gr.Slider(
-                label="Stride",
-                minimum=1,
-                maximum=5,
-                step=1,
-                value=2,
-            )
-        run_btn_single = gr.Button("Run")
-        rerun_viewer_single = Rerun(height=900)
-        run_btn_single.click(
-            fn=predict,
-            inputs=[video_input, stride],
-            outputs=[rerun_viewer_single, time_taken],
+            with gr.Row():
+                jpg_quality = gr.Radio(
+                    label="JPEG Quality %: Lower quality means faster streaming",
+                    choices=[10, 50, 90],
+                    value=90,
+                    type="value",
+                )
+            stride = gr.Slider(
+                label="Stride: How many frames to sample between each prediction",
+                minimum=1,
+                maximum=5,
+                step=1,
+                value=5,
+            )
+            skip = gr.Number(
+                label="Skip: How many frames to skip at the beginning",
+                value=0,
+                precision=0,
+            )
+            config_type = gr.Dropdown(
+                label="Config Type: Choose between accurate and fast",
+                value="fast",
+                choices=["accurate", "fast"],
+                max_choices=1,
+            )
+        with gr.Row():
+            start_btn = gr.Button("Run")
+            stop_btn = gr.Button("Stop")
+        rr_viewer = Rerun(height=600, streaming=True)
+
+        # Example videos
+        base_example_params = [50, 4, 0, "fast"]
+        example_dpvo_dir = Path("data/movies")
+        example_iphone_dir = Path("data/iphone")
+        example_video_paths = sorted(example_iphone_dir.glob("*.MOV")) + sorted(
+            example_dpvo_dir.glob("*.MOV")
+        )
+        example_video_paths = [str(path) for path in example_video_paths]
+
+        gr.Examples(
+            examples=[[path, *base_example_params] for path in example_video_paths],
+            inputs=[video_input, jpg_quality, stride, skip, config_type],
+            outputs=[rr_viewer],
+            fn=run_dpvo,
+        )
+
+        click_event = start_btn.click(
+            fn=run_dpvo,
+            inputs=[video_input, jpg_quality, stride, skip, config_type],
+            outputs=[rr_viewer, time_taken],
+        )
+
+        stop_btn.click(
+            fn=None,
+            inputs=[],
+            outputs=[],
+            cancels=[click_event],
         )
 
         video_input.upload(
            fn=on_file_upload, inputs=[video_input], outputs=[video_info]
        )
 
+        demo.launch(
+            share=gradio_config.share,
+            server_name=gradio_config.server_name,
+            server_port=gradio_config.port,
+        )
+
 
-demo.launch(share=False)
+if __name__ == "__main__":
+    main(tyro.cli(GradioDPVOConfig))
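
The structural change in this file is the move from a one-shot `predict` that saved an `.rrd` file to a generator: `@rr.thread_local_stream` gives the worker its own recording, `rr.binary_stream()` exposes it as an in-memory buffer, and each `yield stream.read(), ...` pushes the newly logged data to the `gradio_rerun` viewer while SLAM is still running; this is also what lets the Stop button cancel mid-run via `cancels=[click_event]`. Stripped of the SLAM specifics, the pattern is roughly the following sketch (names and the scalar payload are illustrative, not from the commit):

    import gradio as gr
    import rerun as rr
    from gradio_rerun import Rerun

    @rr.thread_local_stream("streaming_sketch")
    def produce(n_steps: int):
        stream = rr.binary_stream()  # in-memory sink for this thread's recording
        for t in range(int(n_steps)):
            rr.set_time_sequence("timestep", t)
            rr.log("world/value", rr.Scalar(float(t)))
            yield stream.read()  # flush everything logged since the last read

    with gr.Blocks() as demo:
        steps = gr.Number(value=100, precision=0)
        viewer = Rerun(streaming=True)
        gr.Button("Run").click(produce, inputs=[steps], outputs=[viewer])

    demo.launch()
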
tools/demo.py CHANGED
@@ -1,6 +1,6 @@
 from argparse import ArgumentParser
 import rerun as rr
-from mini_dpvo.api.inference import run
+from mini_dpvo.api.inference import inference_dpvo
 from mini_dpvo.config import cfg as base_cfg
 
 
@@ -12,8 +12,7 @@ if __name__ == "__main__":
     parser.add_argument("--stride", type=int, default=2)
     parser.add_argument("--skip", type=int, default=0)
     parser.add_argument("--buffer", type=int, default=2048)
-    parser.add_argument("--config", default="config/default.yaml")
-    parser.add_argument("--vis-during", action="store_true")
+    parser.add_argument("--config", default="config/fast.yaml")
     rr.script_add_args(parser)
     args = parser.parse_args()
     rr.script_setup(args, "mini_dpvo")
@@ -24,13 +23,12 @@ if __name__ == "__main__":
     print("Running with config...")
     print(base_cfg)
 
-    run(
-        base_cfg,
-        args.network_path,
-        args.imagedir,
-        args.calib,
-        args.stride,
-        args.skip,
-        vis_during=args.vis_during,
+    inference_dpvo(
+        cfg=base_cfg,
+        network_path=args.network_path,
+        imagedir=args.imagedir,
+        calib=args.calib,
+        stride=args.stride,
+        skip=args.skip,
     )
     rr.script_teardown(args)