"""Gradio demo that runs a MonoScene checkpoint on a single RGB image."""

import csv
import sys

import gradio as gr
import numpy as np
import torch
from torchvision import transforms

from helpers import *
from monoscene.monoscene import MonoScene

csv.field_size_limit(sys.maxsize)
torch.set_grad_enabled(False)

# Image size the voxel-to-pixel projections are computed for.  The Gradio
# input component below resizes uploads to this same size so that the
# projected pixel coordinates stay inside the image handed to the model.
IMG_W, IMG_H = 640, 480

# Camera intrinsics and pose used for every prediction.
CAM_K = np.array([[518.8579, 0, 320], [0, 518.8579, 240], [0, 0, 1]])
CAM_POSE = np.asarray(
    [
        [9.6699458e-01, 4.2662762e-02, 2.5120059e-01, 0.0000000e+00],
        [-2.5147417e-01, 1.0867463e-03, 9.6786356e-01, 0.0000000e+00],
        [4.1018680e-02, -9.9908894e-01, 1.1779292e-02, 1.1794727e+00],
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.0000000e+00],
    ]
)

# Scene geometry passed to `vox2pix` (presumably metres -- confirm against
# helpers.vox2pix).
VOX_ORIGIN = np.array([-1.54591799, 0.8907361, -0.05])
SCENE_SIZE = (4.8, 4.8, 2.88)
VOXEL_SIZE = 0.08
SCALE_3DS = (1, 2)

# Built once instead of on every request.  `ToTensor` does not rescale
# float32 arrays, which is why `predict` divides by 255 itself.
NORMALIZE_RGB = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        ),
    ]
)

model = MonoScene.load_from_checkpoint(
    "monoscene_nyu.ckpt",
    dataset="NYU",
    feature=200,
    project_scale=1,
    full_scene_size=(60, 36, 60),
)


def get_projections(img_W, img_H):
    """Compute the 3D-to-2D voxel projections for every 3D scale.

    Args:
        img_W: Width in pixels of the image the voxels are projected onto.
        img_H: Height in pixels of the image the voxels are projected onto.

    Returns:
        A dict holding, for each scale ``s`` in ``SCALE_3DS``, the three
        values returned by ``vox2pix`` under the keys ``projected_pix_{s}``,
        ``pix_z_{s}`` and ``fov_mask_{s}``.
    """
    # The inverse does not depend on the scale, so compute it once per call.
    inv_cam_pose = np.linalg.inv(CAM_POSE)

    data = {}
    for scale_3d in SCALE_3DS:
        # compute the 3D-2D mapping
        projected_pix, fov_mask, pix_z = vox2pix(
            inv_cam_pose,
            CAM_K,
            VOX_ORIGIN,
            VOXEL_SIZE * scale_3d,
            img_W,
            img_H,
            SCENE_SIZE,
        )
        data[f"projected_pix_{scale_3d}"] = projected_pix
        data[f"pix_z_{scale_3d}"] = pix_z
        data[f"fov_mask_{scale_3d}"] = fov_mask
    return data


def predict(img):
    """Run the model on one image and return the figure produced by `draw`.

    Args:
        img: Image supplied by the Gradio input component; it is converted
            to a float32 array and scaled by 1/255 before normalisation.

    Returns:
        The value returned by ``draw`` for the softmaxed ``ssc_logit``
        output of the model.
    """
    # `np.asarray` copies only when required.  `np.array(..., copy=False)`
    # raises ValueError on NumPy >= 2.0 whenever a copy is needed, which is
    # always the case for a uint8 -> float32 conversion.
    img = np.asarray(img, dtype=np.float32) / 255.0
    img = NORMALIZE_RGB(img)

    batch = get_projections(IMG_W, IMG_H)
    batch["img"] = img
    # Add a leading batch dimension to every entry.
    batch = {key: value.unsqueeze(0) for key, value in batch.items()}

    # The output is indexed by key below, so it must not be squeezed here:
    # a mapping has no `.squeeze()` and a tensor cannot be indexed by string.
    pred = model(batch)
    y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy()

    # NOTE(review): `draw` receives per-class probabilities here; confirm
    # whether helpers.draw expects these or argmax class labels.
    fig = draw(y_pred.squeeze(), CAM_POSE, VOX_ORIGIN)
    return fig


# NOTE(review): the description and the example images below still refer to
# SemanticKITTI sequence 08 although the loaded checkpoint is the NYU one --
# confirm and update the wording/examples.
description = """ MonoScene Demo on SemanticKITTI Validation Set (Sequence 08), which uses the camera parameters of Sequence 08. Due to the CPU-only inference, it might take up to 20s to predict a scene. \n The output is downsampled by 2 for faster rendering. Darker colors represent the scenery outside the Field of View, i.e. not visible on the image.

"""
title = "MonoScene: Monocular 3D Semantic Scene Completion"
article = """ We also released a smaller MonoScene model (Half resolution - w/o 3D CRP) at: https://huggingface.co/spaces/CVPR/monoscene_lite

"""

examples = [
    'images/08/3-1.jpg',
    'images/08/001385.jpg',
    'images/08/000295.jpg',
    'images/08/002505.jpg',
    'images/08/000085.jpg',
    'images/08/000290.jpg',
    'images/08/000465.jpg',
    'images/08/000790.jpg',
    'images/08/001005.jpg',
    'images/08/001380.jpg',
    'images/08/001530.jpg',
    'images/08/002360.jpg',
    'images/08/004059.jpg',
    'images/08/003149.jpg',
    'images/08/001446.jpg',
    'images/08/000010.jpg',
    'images/08/001122.jpg',
    'images/08/003533.jpg',
    'images/08/003365.jpg',
    'images/08/002944.jpg',
    'images/08/000822.jpg',
    'images/08/000103.jpg',
    'images/08/002716.jpg',
    'images/08/000187.jpg',
    'images/08/002128.jpg',
    'images/08/000511.jpg',
    'images/08/000618.jpg',
    'images/08/002010.jpg',
    'images/08/000234.jpg',
    'images/08/001842.jpg',
    'images/08/001687.jpg',
    'images/08/003929.jpg',
    'images/08/002272.jpg',
]

demo = gr.Interface(
    predict,
    # Resize uploads to the size the projections are computed for.
    gr.Image(shape=(IMG_W, IMG_H)),
    gr.Plot(),
    article=article,
    title=title,
    enable_queue=True,
    cache_examples=False,
    live=False,
    examples=examples,
    description=description,
)

demo.launch(enable_queue=True, debug=False)