import gradio as gr
import os
import warnings

# Build the MultiScaleDeformableAttention extension on first launch if the
# prebuilt shared object (compiled for CPython 3.9 on x86-64 Linux) is missing.
so_path = "models/GroundingDINO/ops/MultiScaleDeformableAttention.cpython-39-x86_64-linux-gnu.so"
if not os.path.exists(so_path):
    os.system("python models/GroundingDINO/ops/setup.py build_ext develop --user")

import torchvision.transforms as T
from models import build_model
import torch
import misc as utils
import numpy as np
import torch.nn.functional as F
from torchvision.io import read_video
import torchvision.transforms.functional as Func
from ruamel.yaml import YAML
from easydict import EasyDict
from misc import nested_tensor_from_videos_list
from torch.cuda.amp import autocast
from PIL import Image, ImageDraw
import imageio.v3 as iio
import cv2
import tempfile
import argparse
import time
from huggingface_hub import hf_hub_download

os.environ["TOKENIZERS_PARALLELISM"] = "false"

DURATION = 6  # maximum number of seconds read from the uploaded video
CHECKPOINT = "ryt_mevis_swinb.pth"

transform = T.Compose([
    T.Resize(360),
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
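
# For reference: T.Resize(360) scales the shorter side to 360 px while keeping
# the aspect ratio (e.g. a 1280x720 frame is processed at 640x360), and the
# mean/std above are the standard ImageNet normalization statistics.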

color_list = utils.colormap()
color_list = color_list.astype('uint8').tolist()

model = None


def load_model_once(config_path, device='cpu'):
    """Load model once at startup"""
    global model
    if model is None:
        with open(config_path) as f:
            yaml = YAML(typ='safe', pure=True)
            config = yaml.load(f)
            config = {k: v['value'] for k, v in config.items()}

        args = EasyDict(config)
        args.device = device

        model = build_model(args)
        model.to(device)
        cache_file = hf_hub_download(repo_id="liangtm/referdino", filename=CHECKPOINT)

        checkpoint = torch.load(cache_file, map_location='cpu')
        state_dict = checkpoint["model_state_dict"]
        model.load_state_dict(state_dict, strict=False)
        model.eval()
        print("Model loaded successfully!")
    return model
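
# The `v['value']` flattening above assumes a W&B-style config dump where each
# top-level key nests its value under a `value` field, e.g. (hypothetical keys
# for illustration only):
#
#   backbone:
#     value: swin_b_p4w7
#   num_queries:
#     value: 300
#
# which becomes {"backbone": "swin_b_p4w7", "num_queries": 300}.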


def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x[:, 0], x[:, 1], x[:, 2], x[:, 3]
    b = np.stack([
        x_c - 0.5 * w,
        y_c - 0.5 * h,
        x_c + 0.5 * w,
        y_c + 0.5 * h
    ], axis=1)
    return b
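
# Worked example: a normalized center-format box (cx, cy, w, h) = (0.5, 0.5, 0.2, 0.4)
# converts to corner format (xmin, ymin, xmax, ymax) = (0.4, 0.3, 0.6, 0.7):
#   box_cxcywh_to_xyxy(np.array([[0.5, 0.5, 0.2, 0.4]]))  # -> [[0.4, 0.3, 0.6, 0.7]]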


def rescale_bboxes(out_bbox, size):
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    b = b * np.array([img_w, img_h, img_w, img_h], dtype=np.float32)
    return b
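
# Continuing the example: the (0.4, 0.3, 0.6, 0.7) box above, rescaled to a
# 640x360 frame, becomes (256.0, 108.0, 384.0, 252.0) in pixel coordinates.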


def vis_add_mask(img, mask, color, edge_width=3):
    origin_img = np.asarray(img.convert('RGB')).copy()
    color = np.array(color)

    mask = mask.reshape(mask.shape[0], mask.shape[1]).astype('uint8')
    mask = mask > 0.5

    # Dilate the binary mask and subtract the original to get a thin boundary
    # band: the band is painted solid while the interior is alpha-blended.
    kernel = np.ones((edge_width, edge_width), np.uint8)
    mask_dilated = cv2.dilate(mask.astype(np.uint8), kernel, iterations=1).astype(bool)
    edge_mask = mask_dilated & ~mask

    origin_img[mask] = origin_img[mask] * 0.5 + color * 0.5
    origin_img[edge_mask] = color
    origin_img = Image.fromarray(origin_img)
    return origin_img
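
# A minimal usage sketch (synthetic inputs, for illustration only): overlay a
# square mask on a blank 64x64 frame with a 3-pixel crimson edge.
#   img = Image.new('RGB', (64, 64))
#   mask = np.zeros((64, 64), dtype=np.float32); mask[16:48, 16:48] = 1.0
#   out = vis_add_mask(img, mask, [220, 20, 60])  # returns a PIL.Image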


def run_video_inference(input_video, text_prompt, tracking_alpha=0.1, fps=15):
    """Main inference function for Gradio"""
    global model
    model.tracking_alpha = tracking_alpha

    show_box = True
    mask_edge_width = 6

    if input_video is None:
        return None, "Please upload a video file."

    if not text_prompt or text_prompt.strip() == "":
        return None, "Please enter a text prompt."

    # Normalize the expression: lowercase and collapse repeated whitespace.
    exp = " ".join(text_prompt.lower().split())

    # Read at most DURATION seconds of video, then subsample to roughly `fps`.
    video_frames, _, info = read_video(input_video, end_pts=DURATION, pts_unit='sec')

    frame_step = max(round(info['video_fps'] / fps), 1)

    frames = []
    for i in range(0, len(video_frames), frame_step):
        source_frame = Func.to_pil_image(video_frames[i].permute(2, 0, 1))
        frames.append(source_frame)

    video_len = len(frames)
    if video_len == 0:
        return None, "No frames found in the video."

    frames_ids = list(range(video_len))
    imgs = []
    for t in frames_ids:
        img = frames[t]
        origin_w, origin_h = img.size  # every frame shares the original resolution
        imgs.append(transform(img))

    device = next(model.parameters()).device
    imgs = torch.stack(imgs, dim=0).to(device)
    samples = nested_tensor_from_videos_list(imgs[None], size_divisibility=16)
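    # The helper above pads the (1, T, C, H, W) batch so that H and W are
    # divisible by 16 and records the valid region in a mask; the un-padded
    # size is kept in `target` below so predictions can be cropped back.
    # (Description based on the usual DETR-style NestedTensor helpers.)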

    img_h, img_w = imgs.shape[-2:]
    size = torch.as_tensor([int(img_h), int(img_w)]).to(device)
    target = {"size": size}

    start_infer = time.time()
    with torch.no_grad():
        with autocast(True):
            outputs = model(samples, [exp], [target])
    end_infer = time.time()

    pred_logits = outputs["pred_logits"][0]
    pred_masks = outputs["pred_masks"][0]
    pred_boxes = outputs["pred_boxes"][0]

    # Pick the single best object query for the whole clip.
    pred_scores = pred_logits.sigmoid()
    pred_scores = pred_scores.mean(0)
    max_scores, _ = pred_scores.max(-1)
    _, max_ind = max_scores.max(-1)
    max_inds = max_ind.repeat(video_len)
    pred_masks = pred_masks[range(video_len), max_inds, ...]
    pred_masks = pred_masks.unsqueeze(0)
    pred_boxes = pred_boxes[range(video_len), max_inds].cpu().numpy()
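
    # The selection above assumes per-video outputs of shape (T, Q, ...):
    # scores are sigmoided, averaged over the T frames, reduced over the last
    # (class/token) dimension, and the argmax over the Q queries is reused for
    # every frame, so one query tracks the referred object through the clip.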

    # Crop away the divisibility padding, upsample to the original resolution,
    # and binarize at 0.5.
    pred_masks = pred_masks[:, :, :img_h, :img_w].cpu()
    pred_masks = F.interpolate(pred_masks, size=(origin_h, origin_w), mode='bilinear', align_corners=False)
    pred_masks = (pred_masks.sigmoid() > 0.5).squeeze(0).cpu().numpy()

    # Crimson overlay color for both the mask and the box.
    color = np.array([220, 20, 60], dtype=np.uint8)

    start_save = time.time()
    save_imgs = []
    for t, img in enumerate(frames):
        img = vis_add_mask(img, pred_masks[t], color, mask_edge_width)

        draw = ImageDraw.Draw(img)
        draw_boxes = pred_boxes[t][None]
        draw_boxes = rescale_bboxes(draw_boxes, (origin_w, origin_h)).tolist()

        if show_box:
            xmin, ymin, xmax, ymax = draw_boxes[0]
            draw.rectangle(((xmin, ymin), (xmax, ymax)), outline=tuple(color), width=5)

        save_imgs.append(np.asarray(img).copy())

    with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_file:
        iio.imwrite(tmp_file.name, save_imgs, fps=fps)
        result_video_path = tmp_file.name

    end_save = time.time()

    status = (
        f"Inference Time: {(end_infer - start_infer):.1f}s\n"
        f"Saving Time: {(end_save - start_save):.1f}s"
    )
    return result_video_path, status
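
# A quick smoke test outside Gradio (hypothetical local paths, for
# illustration; the model must be loaded first):
#   load_model_once("configs/ytvos_swinb.yaml", "cpu")
#   path, status = run_video_inference("dogs.mp4", "the dog is drinking water")
#   print(status, path)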


def main():
    config_path = "configs/ytvos_swinb.yaml"
    device = "cuda" if torch.cuda.is_available() else "cpu"

    print("Loading model...")
    load_model_once(config_path, device)
    print(f"Model loaded on device: {device}")

    with gr.Blocks(
        title="ReferDINO",
        css="""
        #hero { text-align: center; }
        #hero h1, #hero h2, #hero h3, #hero p {
            text-align: center !important;
            margin: 0.25rem 0;
        }
        """
    ) as demo:
        gr.Markdown(
            """
            <h1>Referring Video Object Segmentation with
            <a href="https://github.com/iSEE-Laboratory/ReferDINO">ReferDINO</a>
            </h1>
            <h3>Note that this demo runs on CPU, so the video will be trimmed to ≤6 seconds.</h3>
            """,
            elem_id="hero",
        )

        with gr.Row():
            with gr.Column(scale=1):
                input_video = gr.Video(
                    label="📹 Upload Video",
                    height=300
                )

                text_prompt = gr.Textbox(
                    label="📝 Text Description",
                    placeholder="Describe the object you want to segment (e.g., 'red car', 'person in blue shirt')",
                    lines=2
                )

                run_button = gr.Button(
                    "🚀 Run Inference",
                    variant="primary",
                    size="lg"
                )

                tracking_alpha = gr.Slider(
                    label="Momentum",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.1,
                    step=0.05,
                    info="Controls the memory update rate (lower = longer memory)."
                )

                target_fps = gr.Slider(
                    label="FPS",
                    minimum=1,
                    maximum=30,
                    value=10,
                    step=1,
                    info="Controls the processing frame rate (lower = faster processing)."
                )

            with gr.Column(scale=1):
                output_video = gr.Video(
                    label="🎯 Segmentation Result",
                    height=400
                )

                status_text = gr.Textbox(
                    label="📊 Status",
                    lines=3,
                    interactive=False
                )

        gr.Examples(
            examples=[
                ["dogs.mp4", "the dog is drinking water", 0.1, 10],
                ["dogs.mp4", "the dog is sleeping", 0.1, 10],
            ],
            inputs=[input_video, text_prompt, tracking_alpha, target_fps],
            # run_video_inference returns (video, status), so list both outputs
            outputs=[output_video, status_text],
            fn=run_video_inference,
            cache_examples=False,
            label="📋 Try these examples:"
        )

        run_button.click(
            fn=run_video_inference,
            inputs=[input_video, text_prompt, tracking_alpha, target_fps],
            outputs=[output_video, status_text],
            show_progress=True
        )

    return demo


if __name__ == "__main__":
    demo = main()
    demo.launch(
        show_api=False,
        show_error=True
    )