# 3D-MOOD / app.py
# Author: RoyYang0714 — commit 8f091f1 ("feat: Fix the description.")
"""Gradio Demo for 3D-MOOD."""
import spaces  # NOTE: imported first — ZeroGPU patching should precede torch.

import gc
import os
import tempfile

import gradio as gr
import numpy as np
import torch
from PIL import Image

from vis4d.common.ckpt import load_model_checkpoint
from vis4d.data.transforms.base import compose
from vis4d.data.transforms.normalize import NormalizeImages
from vis4d.data.transforms.resize import ResizeImages, ResizeIntrinsics
from vis4d.data.transforms.to_tensor import ToTensor
from vis4d.op.fpp.fpn import FPN
from vis4d.vis.image.functional import imshow_bboxes3d

from opendet3d.data.transforms.pad import CenterPadImages, CenterPadIntrinsics
from opendet3d.data.transforms.resize import GenResizeParameters
from opendet3d.model.detect3d.grounding_dino_3d import GroundingDINO3D
from opendet3d.op.base.swin import SwinTransformer
from opendet3d.op.detect3d.grounding_dino_3d import (
    GroundingDINO3DCoder,
    GroundingDINO3DHead,
    RoI2Det3D,
    UniDepthHead,
)
from opendet3d.op.fpp.channel_mapper import ChannelMapper
def get_3d_mood_swin_base(
    max_per_image: int = 100, score_thres: float = 0.1
) -> GroundingDINO3D:
    """Assemble the 3D-MOOD model with a Swin-Base backbone.

    Args:
        max_per_image: Maximum number of detections kept per image.
        score_thres: Minimum score for a detection to be kept.

    Returns:
        The constructed GroundingDINO3D model (checkpoint not loaded).
    """
    # Swin-Base backbone exposing all four stages.
    swin = SwinTransformer(
        convert_weights=True,
        pretrain_img_size=384,
        embed_dims=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        drop_path_rate=0.3,
        out_indices=(0, 1, 2, 3),
    )
    # Detection neck: projects the last three backbone stages to 256 channels.
    det_neck = ChannelMapper(
        in_channels=[256, 512, 1024],
        out_channels=256,
        num_outs=4,
        kernel_size=1,
        norm="GroupNorm",
        num_groups=32,
        activation=None,
        bias=True,
    )
    # Depth branch: FPN over all four stages feeding the UniDepth head.
    fpn = FPN(
        in_channels_list=[128, 256, 512, 1024],
        out_channels=256,
        extra_blocks=None,
        start_index=0,
    )
    uni_depth = UniDepthHead(input_dims=[256, 256, 256, 256])
    # 3D box head and its post-processing stage.
    head_3d = GroundingDINO3DHead(box_coder=GroundingDINO3DCoder())
    postprocess = RoI2Det3D(
        nms=True,
        class_agnostic_nms=True,
        max_per_img=max_per_image,
        score_threshold=score_thres,
    )
    return GroundingDINO3D(
        basemodel=swin,
        neck=det_neck,
        bbox3d_head=head_3d,
        roi2det3d=postprocess,
        fpn=fpn,
        depth_head=uni_depth,
    )
@spaces.GPU
def run_3d_mood(image, text_prompts, score_thres, fx, fy, cx, cy):
    """Run 3D-MOOD on a single image and return the rendered detections.

    Args:
        image: Input RGB image as a HxWx3 numpy array (from gr.Image).
        text_prompts: "."-separated open-set class names, e.g. "chair.table".
        score_thres: Score threshold for keeping detections.
        fx, fy, cx, cy: Pinhole camera intrinsics in pixels.

    Returns:
        A PIL.Image with the predicted 3D boxes drawn onto the input image.
    """
    gc.collect()
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Data: add a batch dimension and build the 3x3 intrinsic matrix.
    images = image.astype(np.float32)[None, ...]
    intrinsics = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]]).astype(np.float32)
    # Strip whitespace and drop empty entries so a trailing "." (e.g.
    # "chair.table.") does not introduce an empty class label.
    input_texts = [txt.strip() for txt in text_prompts.split(".") if txt.strip()]
    class_id_mapping = {i: txt for i, txt in enumerate(input_texts)}
    data_dict = {
        "images": images,
        "original_images": images,
        "input_hw": (images.shape[1], images.shape[2]),
        "original_hw": (images.shape[1], images.shape[2]),
        "intrinsics": intrinsics,
        "original_intrinsics": intrinsics,
    }

    # Preprocess: resize, normalize, then center-pad to a fixed shape while
    # keeping the intrinsics consistent with the transformed image.
    preprocess_transforms = compose(
        transforms=[
            GenResizeParameters(shape=(800, 1333)),
            ResizeImages(),
            ResizeIntrinsics(),
            NormalizeImages(),
            CenterPadImages(stride=1, shape=(800, 1333), update_input_hw=True),
            CenterPadIntrinsics(),
        ]
    )
    data = preprocess_transforms([data_dict])[0]

    # Convert numpy arrays to torch tensors.
    to_tensor = ToTensor()
    data = to_tensor([data])[0]

    # Model: rebuilt and reloaded per request (ZeroGPU workers do not keep
    # persistent state between calls).
    model = get_3d_mood_swin_base(score_thres=score_thres).to(device)
    load_model_checkpoint(
        model,
        weights="https://huggingface.co/RoyYang0714/3D-MOOD/resolve/main/gdino3d_swin-b_120e_omni3d_834c97.pt",
        rev_keys=[(r"^model\.", ""), (r"^module\.", "")],
    )
    model.eval()

    # Inference.
    with torch.no_grad():
        boxes, boxes3d, scores, class_ids, depth_maps, categories = model(
            images=data["images"].to(device),
            input_hw=[data["input_hw"]],
            original_hw=[data["original_hw"]],
            intrinsics=data["intrinsics"].to(device)[None],
            padding=[data["padding"]],
            input_texts=[input_texts],
        )

    # Render predictions to a unique temporary file: a fixed "./output.png"
    # races when two requests run concurrently, and removing the file right
    # after a lazy Image.open can invalidate the returned image.
    with tempfile.TemporaryDirectory() as tmp_dir:
        out_path = os.path.join(tmp_dir, "output.png")
        imshow_bboxes3d(
            image=data["original_images"].cpu(),
            boxes3d=[b.cpu() for b in boxes3d],
            intrinsics=data["original_intrinsics"].cpu().numpy(),
            scores=[s.cpu() for s in scores],
            class_ids=[c.cpu() for c in class_ids],
            class_id_mapping=class_id_mapping,
            file_path=out_path,
            n_colors=len(class_id_mapping),
        )
        output = Image.open(out_path)
        output.load()  # PIL is lazy; force the read before the dir is deleted.
    return output
# Gradio UI: intrinsics + prompt inputs on the left, input image in the
# middle, rendered detections on the right.
demo = gr.Blocks()
with demo:
    # NOTE: the emoji here were previously mojibake ("๐ŸŒŸ"/"๐Ÿš€", i.e.
    # UTF-8 bytes mis-decoded as a single-byte codepage); restored to 🌟/🚀.
    gr.HTML(
        """
        <h1>3D-MOOD: Lifting 2D to 3D for Monocular Open-Set Object Detection</h1>
        <p><a href="https://github.com/cvg/3D-MOOD">🌟 GitHub Repository</a> | <a href="https://royyang0714.github.io/3D-MOOD">🚀 Project Page</a></p>
        <div style="font-size: 16px; line-height: 1.5;">
            <p>Upload one image, camera parameters and language prompts to run the 3D object detection in the wild!</p>
            <p><strong>PLEASE NOTE: </strong>We are using ZeroGPU thanks to HuggingFace community Grant. However, while running on HuggingFace Space, it will take extra time to load the model for each inference. For faster visualization, please consider using a local machine to run our demo from our GitHub repository.</p>
        </div>
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            fx = gr.Number(label="fx")
            fy = gr.Number(label="fy")
            cx = gr.Number(label="cx")
            cy = gr.Number(label="cy")
            text_prompts = gr.Textbox(label="Language Prompt")
            score_thres = gr.Number(label="Score Threshold")
            submit_btn = gr.Button("Run 3D-MOOD", scale=1, variant="primary")
        with gr.Column(scale=2):
            input_image = gr.Image(label="Upload Image")
        with gr.Column(scale=2):
            detection_output = gr.Image(label="Detection Results.")

    # Single source of truth for the input ordering expected by run_3d_mood.
    mood_inputs = [
        input_image,
        text_prompts,
        score_thres,
        fx,
        fy,
        cx,
        cy,
    ]

    gr.Examples(
        examples=[["rgb.png", "chair.table", 0.1, 518.8579, 519.4696, 325.58246, 253.73616]],
        inputs=mood_inputs,
        outputs=[detection_output],
        fn=run_3d_mood,
        cache_examples=False,
        examples_per_page=50,
    )
    submit_btn.click(
        fn=run_3d_mood,
        inputs=mood_inputs,
        outputs=[detection_output],
    )
demo.launch()