Spaces:

mmlab-ntu
/

Segment-Any-RGBD

Runtime error

Segment-Any-RGBD / app.py

Jingkang Yang

update: app

6328d06 about 2 years ago

17.7 kB

	# Copyright (c) Facebook, Inc. and its affiliates.
	# Copyright (c) Meta Platforms, Inc. All Rights Reserved

	import os

	# Runtime dependency bootstrap: the pinned CUDA 11.3 torch build is (re)installed on every
	# start, before any of the model packages below are imported.
	os.system('pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 torchaudio==0.10.1+cu113 -f https://download.pytorch.org/whl/torch_stable.html')

	try:
	    import detectron2
	except ImportError:
	    # detectron2 is used as the marker for "model dependencies are missing": when it cannot
	    # be imported, install the whole set from source. Only ImportError triggers this path, so
	    # unrelated failures (or Ctrl-C) are no longer silently swallowed.
	    os.system('pip install git+https://github.com/Jun-CEN/CLIP.git')
	    os.system('pip install git+https://github.com/facebookresearch/detectron2.git')
	    os.system('pip install git+https://github.com/facebookresearch/pytorch3d.git')
	    os.system('pip install git+https://github.com/facebookresearch/segment-anything.git')

	import argparse
	import glob
	import multiprocessing as mp
	import os
	import time
	import cv2
	import tqdm
	import numpy as np
	import gradio as gr
	from tools.util import *

	from detectron2.config import get_cfg

	from detectron2.projects.deeplab import add_deeplab_config
	from detectron2.data.detection_utils import read_image
	from detectron2.utils.logger import setup_logger
	from open_vocab_seg import add_ovseg_config

	from open_vocab_seg.utils import VisualizationDemo, VisualizationDemoIndoor

	# Title of the OpenCV preview window; only used on the code path where no output
	# location is configured and results are shown with cv2.imshow instead of being saved.
	WINDOW_NAME = "Open vocabulary segmentation"


	def setup_cfg(args):
	    """Build the frozen config used by the segmentation demos.

	    Starts from ``get_cfg()``, registers the DeepLab and OVSeg config keys, then
	    merges the YAML file named by ``args.config_file`` followed by the
	    ``KEY VALUE`` overrides in ``args.opts``.

	    Args:
	        args: parsed command-line namespace providing ``config_file`` and ``opts``.

	    Returns:
	        The config object after ``freeze()`` has been called on it.
	    """
	    config = get_cfg()
	    # DeepLab keys go in first (they carry the poly LR schedule options), then OVSeg's.
	    for register_keys in (add_deeplab_config, add_ovseg_config):
	        register_keys(config)
	    # File values first, command-line overrides second, so the overrides win.
	    config.merge_from_file(args.config_file)
	    config.merge_from_list(args.opts)
	    config.freeze()
	    return config


	def get_parser():
	    """Create the command-line parser for the open-vocabulary segmentation demo.

	    Every option has a default, so ``get_parser().parse_args([])`` yields a usable
	    namespace with ``config_file``, ``input``, ``class_names``, ``output`` and ``opts``.

	    Returns:
	        argparse.ArgumentParser: the configured parser.
	    """
	    parser = argparse.ArgumentParser(description="Detectron2 demo for open vocabulary segmentation")
	    parser.add_argument(
	        "--config-file",
	        default="configs/ovseg_swinB_vitL_demo.yaml",
	        metavar="FILE",
	        help="path to config file",
	    )
	    parser.add_argument(
	        "--input",
	        # NOTE(review): machine-specific default path; the UI callbacks replace it per request.
	        default=["/mnt/lustre/jkyang/PSG4D/sailvos3d/downloads/sailvos3d/trevor_1_int/images/000160.bmp"],
	        nargs="+",
	        help="A list of space separated input images; "
	        "or a single glob pattern such as 'directory/*.jpg'",
	    )
	    parser.add_argument(
	        "--class-names",
	        default=[
	            "person", "car", "motorcycle", "truck", "bird", "dog", "handbag", "suitcase",
	            "bottle", "cup", "bowl", "chair", "potted plant", "bed", "dining table", "tv",
	            "laptop", "cell phone", "bag", "bin", "box", "door", "road barrier", "stick",
	            "lamp", "floor", "wall",
	        ],
	        nargs="+",
	        help="A list of user-defined class_names",
	    )
	    parser.add_argument(
	        "--output",
	        default="./pred",
	        # The old text said "If not given, will show output in an OpenCV window", which could
	        # never happen with a non-empty default; describe the behaviour the code actually has.
	        help="A file or directory to save output visualizations (default: ./pred). "
	        "Pass an empty string to show output in an OpenCV window instead.",
	    )
	    parser.add_argument(
	        "--opts",
	        help="Modify config options using the command-line 'KEY VALUE' pairs",
	        default=["MODEL.WEIGHTS", "ovseg_swinbase_vitL14_ft_mpt.pth"],
	        # REMAINDER: everything after --opts is collected verbatim.
	        nargs=argparse.REMAINDER,
	    )
	    return parser

	# Parsed once, at import time, from the process command line. greet_sailvos3d overwrites
	# `input` and `class_names` on this shared namespace for each request; greet_scannet parses
	# its own local namespace instead.
	args = get_parser().parse_args()

	def greet_sailvos3d(rgb_input, depth_map_input, rage_matrices_input, class_candidates):
	    """Run the SAIL-VOS 3D demo for one request and return the result media for the UI.

	    Args:
	        rgb_input: path (or glob pattern) of the RGB image.
	        depth_map_input: uploaded file object; its ``.name`` is used as the depth-map path.
	        rage_matrices_input: uploaded file object; its ``.name`` is used as the path of the
	            2D-to-3D projection parameters.
	        class_candidates: comma-separated class names, e.g. ``"person, car, wall"``.
	            Whitespace around each name is ignored.

	    Returns:
	        A 5-tuple in the order the UI outputs are wired:
	        ``(rgb_2d_video_path, rgb_3d_video_path, depth_image, depth_2d_video_path,
	        depth_3d_video_path)``, where ``depth_image`` is the array returned by
	        ``read_image('outputs/Depth_rendered.png')`` and the rest are paths under ``outputs/``.

	    Raises:
	        ValueError: if ``class_candidates`` contains no class name.
	        AssertionError: if the RGB path matches no file.
	    """
	    # Accept "a, b", "a,b" and stray spaces alike. The previous debug print indexed the first
	    # four characters of the raw string and crashed with IndexError on short input.
	    class_names = [name.strip() for name in class_candidates.split(',') if name.strip()]
	    if not class_names:
	        raise ValueError("Class_Candidates must contain at least one comma-separated class name")

	    # The shared module-level namespace is updated in place, as before.
	    args.input = [rgb_input]
	    args.class_names = class_names
	    depth_map_path = depth_map_input.name
	    rage_matrices_path = rage_matrices_input.name
	    print(args.input, args.class_names, depth_map_path, rage_matrices_path)

	    mp.set_start_method("spawn", force=True)
	    setup_logger(name="fvcore")
	    logger = setup_logger()
	    logger.info("Arguments: " + str(args))

	    cfg = setup_cfg(args)
	    demo = VisualizationDemo(cfg)

	    # Everything below is written to and read back from this directory.
	    os.makedirs('outputs', exist_ok=True)

	    if not args.input:
	        raise NotImplementedError
	    if len(args.input) == 1:
	        # A single entry may be a glob pattern; expand it to the matching files.
	        args.input = glob.glob(os.path.expanduser(args.input[0]))
	        assert args.input, "The input path(s) was not found"

	    for path in tqdm.tqdm(args.input, disable=not args.output):
	        start_time = time.time()
	        (
	            predictions,
	            visualized_output_rgb,
	            visualized_output_depth,
	            visualized_output_rgb_sam,
	            visualized_output_depth_sam,
	        ) = demo.run_on_image_sam(path, class_names, depth_map_path, rage_matrices_path)
	        logger.info(
	            "{}: {} in {:.2f}s".format(
	                path,
	                "detected {} instances".format(len(predictions["instances"]))
	                if "instances" in predictions
	                else "finished",
	                time.time() - start_time,
	            )
	        )

	        if args.output:
	            if not os.path.isdir(args.output):
	                assert len(args.input) == 1, "Please specify a directory with args.output"
	            # Results always go to fixed names under outputs/; args.output only selects
	            # this branch (the unused per-input output filename was dropped).
	            visualized_output_rgb.save('outputs/RGB_Semantic_SAM.png')
	            visualized_output_depth.save('outputs/Depth_Semantic_SAM.png')
	            visualized_output_rgb_sam.save('outputs/RGB_Semantic_SAM_Mask.png')
	            visualized_output_depth_sam.save('outputs/Depth_Semantic_SAM_Mask.png')
	            rgb_3d_sam = demo.get_xyzrgb('outputs/RGB_Semantic_SAM.png', depth_map_path, rage_matrices_path)
	            depth_3d_sam = demo.get_xyzrgb('outputs/Depth_Semantic_SAM.png', depth_map_path, rage_matrices_path)
	            rgb_3d_sam_mask = demo.get_xyzrgb('outputs/RGB_Semantic_SAM_Mask.png', depth_map_path, rage_matrices_path)
	            depth_3d_sam_mask = demo.get_xyzrgb('outputs/Depth_Semantic_SAM_Mask.png', depth_map_path, rage_matrices_path)
	            np.savez(
	                'outputs/xyzrgb.npz',
	                rgb_3d_sam=rgb_3d_sam,
	                depth_3d_sam=depth_3d_sam,
	                rgb_3d_sam_mask=rgb_3d_sam_mask,
	                depth_3d_sam_mask=depth_3d_sam_mask,
	            )
	            demo.render_3d_video('outputs/xyzrgb.npz', depth_map_path)
	        else:
	            cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
	            cv2.imshow(WINDOW_NAME, visualized_output_rgb.get_image()[:, :, ::-1])
	            if cv2.waitKey(0) == 27:
	                break  # esc to quit

	    # Pair each mask overlay with its semantic overlay for the 2D clips.
	    depth_mask_image = read_image('outputs/Depth_Semantic_SAM_Mask.png')
	    rgb_mask_image = read_image('outputs/RGB_Semantic_SAM_Mask.png')
	    depth_semantic_image = read_image('outputs/Depth_Semantic_SAM.png')
	    rgb_semantic_image = read_image('outputs/RGB_Semantic_SAM.png')
	    # presumably two_image_to_gif writes outputs/<name>.mp4 -- confirm in tools.util
	    two_image_to_gif(depth_mask_image, depth_semantic_image, 'Depth_Semantic_SAM_2D')
	    two_image_to_gif(rgb_mask_image, rgb_semantic_image, 'RGB_Semantic_SAM_2D')

	    # presumably Depth_rendered.png and the *_3D_All.mp4 files come from render_3d_video -- confirm
	    depth_map_image = read_image('outputs/Depth_rendered.png')
	    return (
	        'outputs/RGB_Semantic_SAM_2D.mp4',
	        'outputs/RGB_3D_All.mp4',
	        depth_map_image,
	        'outputs/Depth_Semantic_SAM_2D.mp4',
	        'outputs/Depth_3D_All.mp4',
	    )

	def greet_scannet(rgb_input, depth_map_input, class_candidates):
	    """Run the ScanNet demo for one request and return the result media for the UI.

	    Args:
	        rgb_input: path of the RGB image, passed through to ``run_on_pcd_ui``.
	        depth_map_input: uploaded file object; its ``.name`` is used as the depth-map path.
	        class_candidates: comma-separated class names, e.g. ``"wall, floor, chair"``.
	            Whitespace around each name is ignored.

	    Returns:
	        A 5-tuple in the order the UI outputs are wired:
	        ``(rgb_2d_video_path, rgb_3d_video_path, depth_image, depth_2d_video_path,
	        depth_3d_video_path)``, where ``depth_image`` is the array returned by
	        ``read_image('outputs/Depth_rendered.png')`` and the rest are paths under ``outputs/``.

	    Raises:
	        ValueError: if ``class_candidates`` contains no class name.
	    """
	    depth_map_path = depth_map_input.name
	    # Accept "a, b", "a,b" and stray spaces alike instead of requiring the exact ", " separator.
	    class_names = [name.strip() for name in class_candidates.split(',') if name.strip()]
	    if not class_names:
	        raise ValueError("Class_Candidates must contain at least one comma-separated class name")
	    print(rgb_input, depth_map_path, class_names)

	    mp.set_start_method("spawn", force=True)
	    # A fresh local namespace: this callback does not touch the module-level `args`.
	    args = get_parser().parse_args()
	    setup_logger(name="fvcore")
	    logger = setup_logger()
	    logger.info("Arguments: " + str(args))

	    cfg = setup_cfg(args)
	    demo = VisualizationDemoIndoor(cfg)

	    # Everything below is written to and read back from this directory.
	    os.makedirs('outputs', exist_ok=True)

	    _, output2D, output3D = demo.run_on_pcd_ui(rgb_input, depth_map_path, class_names)

	    output2D['sem_seg_on_rgb'].save('outputs/RGB_Semantic_SAM.png')
	    output2D['sem_seg_on_depth'].save('outputs/Depth_Semantic_SAM.png')
	    output2D['sam_seg_on_rgb'].save('outputs/RGB_Semantic_SAM_Mask.png')
	    output2D['sam_seg_on_depth'].save('outputs/Depth_Semantic_SAM_Mask.png')

	    # The archive keys are kept exactly as before: the "*_sam" keys carry the semantic ("sem")
	    # clouds and the "*_sam_mask" keys carry the SAM clouds.
	    # presumably these are the names render_3d_video reads -- confirm before renaming
	    np.savez(
	        'outputs/xyzrgb.npz',
	        rgb_3d_sam=output3D['rgb_3d_sem'],
	        depth_3d_sam=output3D['depth_3d_sem'],
	        rgb_3d_sam_mask=output3D['rgb_3d_sam'],
	        depth_3d_sam_mask=output3D['depth_3d_sam'],
	    )
	    demo.render_3d_video('outputs/xyzrgb.npz')

	    # Pair each mask overlay with its semantic overlay for the 2D clips.
	    depth_mask_image = read_image('outputs/Depth_Semantic_SAM_Mask.png')
	    rgb_mask_image = read_image('outputs/RGB_Semantic_SAM_Mask.png')
	    depth_semantic_image = read_image('outputs/Depth_Semantic_SAM.png')
	    rgb_semantic_image = read_image('outputs/RGB_Semantic_SAM.png')
	    # presumably two_image_to_gif writes outputs/<name>.mp4 -- confirm in tools.util
	    two_image_to_gif(depth_mask_image, depth_semantic_image, 'Depth_Semantic_SAM_2D')
	    two_image_to_gif(rgb_mask_image, rgb_semantic_image, 'RGB_Semantic_SAM_2D')

	    # presumably Depth_rendered.png and the *_3D_All.mp4 files come from render_3d_video -- confirm
	    depth_map_image = read_image('outputs/Depth_rendered.png')
	    return (
	        'outputs/RGB_Semantic_SAM_2D.mp4',
	        'outputs/RGB_3D_All.mp4',
	        depth_map_image,
	        'outputs/Depth_Semantic_SAM_2D.mp4',
	        'outputs/Depth_3D_All.mp4',
	    )

	# Markdown/HTML banner rendered at the top of the page via gr.Markdown: warns that the shared
	# Space may be slow and links to "duplicate this Space" and to the project's GitHub page.
	# NOTE(review): the f-prefix has no placeholders to interpolate; a plain string would do.
	SHARED_UI_WARNING = f'''### [NOTE] It may be very slow in this shared UI.
	You can duplicate and use it with a paid private GPU.
	<a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/mmlab-ntu/Segment-Any-RGBD?duplicate=true"><img style="margin-top:0;margin-bottom:0" src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-xl-dark.svg" alt="Duplicate Space"></a>
	Alternatively, you can also use the demo on your own computer.
	<a style="display:inline-block" href="https://github.com/Jun-CEN/SegmentAnyRGBD/"><img style="margin-top:0;margin-bottom:0" src="https://img.shields.io/badge/Project%20Page-online-brightgreen"></a>
	'''

	# Gradio UI: one Blocks app with a warning banner and two dataset tabs (Sailvos3D, Scannet).
	# Each tab has input widgets, a "Send" button, a "Result" tab with four videos, and examples.
	# NOTE(review): the nesting of the `with` blocks below is not visible in this copy of the
	# file (all lines sit at the same depth); restore it from the original before running.
	with gr.Blocks(analytics_enabled=False) as segrgbd_iface:
	with gr.Box():
	gr.Markdown(SHARED_UI_WARNING)
	#######t2v#######
	# --- Tab 1: Sailvos3D. Inputs: RGB image (as file path), depth-map file,
	# projection-parameters file and a class-candidates text box.
	with gr.Tab(label="Dataset: Sailvos3D"):
	with gr.Column():
	with gr.Row():
	# with gr.Tab(label='input'):
	with gr.Column():
	with gr.Row():
	Input_RGB_Component = gr.Image(label = 'RGB_Input', type = 'filepath').style(width=320, height=200)
	Depth_Map_Output_Component = gr.Image(label = "Vis_Depth_Map").style(width=320, height=200)
	with gr.Row():
	Depth_Map_Input_Component = gr.File(label = 'input_Depth_map')
	Component_2D_to_3D_Projection_Parameters = gr.File(label = '2D_to_3D_Projection_Parameters')
	with gr.Row():
	Class_Candidates_Component = gr.Text(label = 'Class_Candidates')
	vc_end_btn = gr.Button("Send")
	with gr.Tab(label='Result'):
	with gr.Row():
	RGB_Semantic_SAM_Mask_Component = gr.Video(label = "RGB_Semantic_SAM_Mask").style(width=320, height=200)
	RGB_Semantic_SAM_Mask_3D_Component = gr.Video(label = "Video_3D_RGB_Semantic_SAM_Mask").style(width=320, height=200)
	with gr.Row():
	Depth_Semantic_SAM_Mask_Component = gr.Video(label = "Depth_Semantic_SAM_Mask").style(width=320, height=200)
	Depth_Semantic_SAM_Mask_3D_Component = gr.Video(label = "Video_3D_Depth_Semantic_SAM_Mask").style(width=320, height=200)
	with gr.Row():
	gr.Markdown("<b> It takes around 2 to 5 minutes to get the final results. The framework initialization, SAM segmentation, zero-shot semantic segmentation and 3D results rendering take long time.</b>")
	# Two bundled examples; each row is [rgb, depth, projection parameters, class list],
	# matching the positional parameters of greet_sailvos3d.
	gr.Examples(examples=[
	[
	'UI/sailvos3d/ex1/inputs/rgb_000160.bmp',
	'UI/sailvos3d/ex1/inputs/depth_000160.npy',
	'UI/sailvos3d/ex1/inputs/rage_matrices_000160.npz',
	'person, car, motorcycle, truck, bird, dog, handbag, suitcase, bottle, cup, bowl, chair, potted plant, bed, dining table, tv, laptop, cell phone, bag, bin, box, door, road barrier, stick, lamp, floor, wall',
	],
	[
	'UI/sailvos3d/ex2/inputs/rgb_000540.bmp',
	'UI/sailvos3d/ex2/inputs/depth_000540.npy',
	'UI/sailvos3d/ex2/inputs/rage_matrices_000540.npz',
	'person, car, motorcycle, truck, bird, dog, handbag, suitcase, bottle, cup, bowl, chair, potted plant, bed, dining table, tv, laptop, cell phone, bag, bin, box, door, road barrier, stick, lamp, floor, wall',
	]],
	inputs=[Input_RGB_Component, Depth_Map_Input_Component, Component_2D_to_3D_Projection_Parameters, Class_Candidates_Component],
	outputs=[RGB_Semantic_SAM_Mask_Component, RGB_Semantic_SAM_Mask_3D_Component, Depth_Map_Output_Component, Depth_Semantic_SAM_Mask_Component, Depth_Semantic_SAM_Mask_3D_Component],
	fn=greet_sailvos3d)
	# "Send" runs greet_sailvos3d; the outputs list is in the same order as its return tuple.
	vc_end_btn.click(inputs=[Input_RGB_Component, Depth_Map_Input_Component, Component_2D_to_3D_Projection_Parameters, Class_Candidates_Component],
	outputs=[RGB_Semantic_SAM_Mask_Component, RGB_Semantic_SAM_Mask_3D_Component, Depth_Map_Output_Component, Depth_Semantic_SAM_Mask_Component, Depth_Semantic_SAM_Mask_3D_Component],
	fn=greet_sailvos3d)

	# --- Tab 2: Scannet. Same layout without the projection-parameters file. The component
	# variable names from tab 1 are rebound here; tab 1's handlers were already wired above.
	with gr.Tab(label="Dataset: Scannet"):
	with gr.Column():
	with gr.Row():
	# with gr.Tab(label='input'):
	with gr.Column():
	with gr.Row():
	Input_RGB_Component = gr.Image(label = 'RGB_Input', type = 'filepath').style(width=320, height=200)
	Depth_Map_Output_Component = gr.Image(label = "Vis_Depth_Map").style(width=320, height=200)
	with gr.Row():
	Depth_Map_Input_Component = gr.File(label = "Input_Depth_Map")
	Class_Candidates_Component = gr.Text(label = 'Class_Candidates')
	vc_end_btn = gr.Button("Send")
	with gr.Tab(label='Result'):
	with gr.Row():
	RGB_Semantic_SAM_Mask_Component = gr.Video(label = "RGB_Semantic_SAM_Mask").style(width=320, height=200)
	RGB_Semantic_SAM_Mask_3D_Component = gr.Video(label = "Video_3D_RGB_Semantic_SAM_Mask").style(width=320, height=200)
	with gr.Row():
	Depth_Semantic_SAM_Mask_Component = gr.Video(label = "Depth_Semantic_SAM_Mask").style(width=320, height=200)
	Depth_Semantic_SAM_Mask_3D_Component = gr.Video(label = "Video_3D_Depth_Semantic_SAM_Mask").style(width=320, height=200)
	with gr.Row():
	gr.Markdown("<b> It takes around 2 to 5 minutes to get the final results. The framework initialization, SAM segmentation, zero-shot semantic segmentation and 3D results rendering take long time.</b>")
	# Two bundled examples; each row is [rgb, depth, class list], matching greet_scannet.
	gr.Examples(examples=[
	[
	'UI/scannetv2/examples/scene0000_00/color/1660.jpg',
	'UI/scannetv2/examples/scene0000_00/depth/1660.png',
	'wall, floor, cabinet, bed, chair, sofa, table, door, window, bookshelf, picture, counter, desk, curtain, refrigerator, shower curtain, toilet, sink, bathtub, other furniture',
	],
	[
	'UI/scannetv2/examples/scene0000_00/color/5560.jpg',
	'UI/scannetv2/examples/scene0000_00/depth/5560.png',
	'wall, floor, cabinet, bed, chair, sofa, table, door, window, bookshelf, picture, counter, desk, curtain, refrigerator, shower curtain, toilet, sink, bathtub, other furniture',
	]],
	inputs=[Input_RGB_Component, Depth_Map_Input_Component, Class_Candidates_Component],
	outputs=[RGB_Semantic_SAM_Mask_Component, RGB_Semantic_SAM_Mask_3D_Component, Depth_Map_Output_Component, Depth_Semantic_SAM_Mask_Component, Depth_Semantic_SAM_Mask_3D_Component],
	fn=greet_scannet)
	# "Send" runs greet_scannet; the outputs list is in the same order as its return tuple.
	vc_end_btn.click(inputs=[Input_RGB_Component, Depth_Map_Input_Component, Class_Candidates_Component],
	outputs=[RGB_Semantic_SAM_Mask_Component, RGB_Semantic_SAM_Mask_3D_Component, Depth_Map_Output_Component, Depth_Semantic_SAM_Mask_Component, Depth_Semantic_SAM_Mask_3D_Component],
	fn=greet_scannet)

	# Start the web server at import time (this file is the Space's entry point).
	demo = segrgbd_iface
	demo.launch()