# Primate-Detection-GPU / dino_sam.py
import datetime
import cv2
import os
import numpy as np
import torch
# import io
# import cProfile
import csv
# import pstats
import warnings
from memory_profiler import profile
# from pstats import SortKey
from tqdm import tqdm
from torchvision.ops import box_convert
from typing import Tuple
from GroundingDINO.groundingdino.util.inference import load_model, load_image, annotate, preprocess_caption
from GroundingDINO.groundingdino.util.utils import get_phrases_from_posmap
from segment_anything import sam_model_registry
from segment_anything.utils.transforms import ResizeLongestSide
from video_utils import mp4_to_png, frame_to_timestamp, vid_stitcher
warnings.filterwarnings("ignore")
def prepare_image(image, transform, device):
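    """
    Resize an HWC frame with SAM's ResizeLongestSide transform and return a CHW
    tensor on the model's device. Note: `device` is the SAM model itself; only
    its `.device` attribute is used.
    """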
image = transform.apply_image(image)
image = torch.as_tensor(image, device=device.device)
return image.permute(2, 0, 1).contiguous()
# @profile
def sam_dino_vid(
vid_path: str,
text_prompt: str,
box_threshold: float = 0.35,
text_threshold: float = 0.25,
fps_processed: int = 1,
scaling_factor: float = 1.0,
video_options: list[str] = ["Bounding boxes"],
config_path: str = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
weights_path: str = "weights/groundingdino_swint_ogc.pth",
device: str = 'cuda',
batch_size: int = 5
) -> Tuple[str, str]:
""" Args:
Returns:
"""
masks_needed = False
boxes_needed = True
# if masks are selected, load SAM model
if "Bounding boxes" not in video_options:
boxes_needed = False
if "Masks" in video_options:
masks_needed = True
checkpoint = "weights/sam_vit_h_4b8939.pth"
model_type = "vit_h"
sam = sam_model_registry[model_type](checkpoint=checkpoint)
sam.to(device=device)
resize_transform = ResizeLongestSide(sam.image_encoder.img_size)
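        # ResizeLongestSide rescales frames so their longest side matches the
        # SAM image encoder's expected input size (sam.image_encoder.img_size)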
# create new dirs and paths for results
filename = os.path.splitext(os.path.basename(vid_path))[0]
results_dir = "../processed/" + filename + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
os.makedirs(results_dir, exist_ok=True)
frames_dir = os.path.join(results_dir, "frames")
os.makedirs(frames_dir, exist_ok=True)
csv_path = os.path.join(results_dir, "detections.csv")
# load the groundingDINO model
gd_model = load_model(config_path, weights_path, device=device)
# process video and create a directory of video frames
fps = mp4_to_png(vid_path, frames_dir, scaling_factor)
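    # mp4_to_png is expected to write one image per frame into frames_dir
    # (optionally downscaled by scaling_factor) and return the video's native fps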
# get the frame paths for the images to process
    frame_filenames = sorted(os.listdir(frames_dir))  # sort so frames are processed in order
frame_paths = [] # list of frame paths to process based on fps_processed
other_paths = [] # list of every frame path in the dir
for i, frame in enumerate(frame_filenames):
if i % fps_processed == 0:
frame_paths.append(os.path.join(frames_dir, frame))
else:
other_paths.append(os.path.join(frames_dir, frame))
# TODO: rename vars to be more clear
# run dino_predict_batch and sam_predict_batch in batches of frames
# write the results to a csv
with open(csv_path, 'w', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(["Frame", "Timestamp (hh:mm:ss)", "Boxes (cxcywh)", "# Boxes"])
# run groundingDINO in batches
for i in tqdm(range(0, len(frame_paths), batch_size), desc="Running batches"):
batch_paths = frame_paths[i:i+batch_size] # paths for this batch
            # load each frame once: load_image returns (original HWC image, preprocessed tensor)
            loaded = [load_image(img_path) for img_path in batch_paths]
            images_orig = [img for img, _ in loaded]
            image_stack = torch.stack([tensor for _, tensor in loaded])
boxes_i, logits_i, phrases_i = dino_predict_batch(
model=gd_model,
images=image_stack,
caption=text_prompt,
box_threshold=box_threshold,
text_threshold=text_threshold
)
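            # boxes_i: per-frame tensors of normalized cxcywh boxes; logits_i: per-box
            # confidence scores; phrases_i: per-box phrases matched from the prompt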
            # these paths coincide with batch_paths: annotated frames overwrite the
            # extracted frames in place so the stitched output picks them up
            annotated_frame_paths = [os.path.join(frames_dir, os.path.basename(frame_path)) for frame_path in batch_paths]
# convert images_orig to rgb from bgr
images_orig_rgb = [cv2.cvtColor(image, cv2.COLOR_BGR2RGB) for image in images_orig]
if masks_needed:
# run SAM in batches on boxes from dino
batched_input = []
sam_boxes = []
for image, box in zip(images_orig_rgb, boxes_i):
height, width = image.shape[:2]
# convert the boxes from groundingDINO format to SAM format
box = box * torch.Tensor([width, height, width, height])
box = box_convert(box, in_fmt="cxcywh", out_fmt="xyxy").cuda()
sam_boxes.append(box)
batched_input.append({
"image": prepare_image(image, resize_transform, sam),
"boxes": resize_transform.apply_boxes_torch(box, image.shape[:2]),
"original_size": image.shape[:2]
})
batched_output = sam(batched_input, multimask_output=False)
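                # each element of batched_output corresponds to one input frame and
                # holds the predicted "masks" for that frame's boxes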
                for j, prediction in enumerate(batched_output):
                    # write the annotated frame back to frames_dir for stitching
                    mask = prediction["masks"].cpu().numpy()
                    box = sam_boxes[j].cpu().numpy()
                    annotated_frame = plot_sam(images_orig_rgb[j], mask, box, boxes_shown=boxes_needed)
                    cv2.imwrite(annotated_frame_paths[j], annotated_frame)
elif boxes_needed and not masks_needed:
# get groundingDINO annotated frames
                for j, (image, box, logit, phrase) in enumerate(zip(images_orig, boxes_i, logits_i, phrases_i)):
                    annotated_frame = annotate(image_source=image, boxes=box, logits=logit, phrases=phrase)
                    cv2.imwrite(annotated_frame_paths[j], annotated_frame)
# write results to csv
# TODO: convert boxes to SAM format for clearer understanding
frame_names = [os.path.basename(frame_path).split(".")[0] for frame_path in batch_paths]
            for j, frame in enumerate(frame_names):
                writer.writerow([frame, frame_to_timestamp(int(frame[-8:]), fps), boxes_i[j], len(boxes_i[j])])
# stitch the frames
save_path = vid_stitcher(frames_dir, output_path=os.path.join(results_dir, "output.mp4"), fps=fps)
print("Results saved to: " + save_path)
return csv_path, save_path
def dino_predict_batch(
model,
images: torch.Tensor,
caption: str,
box_threshold: float,
text_threshold: float,
device: str = "cuda"
) -> Tuple[list[torch.Tensor], list[torch.Tensor], list[list[str]]]:
    '''
    Batched GroundingDINO inference over a stack of preprocessed images.
    Returns:
        bboxes_batch: list (one entry per image) of (n, 4) tensors of normalized cxcywh boxes
        predicts_batch: list of (n,) tensors of box confidence scores
        phrases_batch: list of lists of the n phrases matched from the caption
    '''
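    # preprocess_caption normalizes the prompt for GroundingDINO (lower-casing it
    # and ensuring it ends with a '.')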
caption = preprocess_caption(caption=caption)
model = model.to(device)
image = images.to(device)
with torch.no_grad():
outputs = model(image, captions=[caption for _ in range(len(images))])
prediction_logits = outputs["pred_logits"].cpu().sigmoid() # prediction_logits.shape = (num_batch, nq, 256)
prediction_boxes = outputs["pred_boxes"].cpu() # prediction_boxes.shape = (num_batch, nq, 4)
    mask = prediction_logits.max(dim=2)[0] > box_threshold  # mask.shape = (num_batch, nq): keep queries whose best token score exceeds box_threshold
bboxes_batch = []
predicts_batch = []
phrases_batch = [] # list of lists
tokenizer = model.tokenizer
tokenized = tokenizer(caption)
for i in range(prediction_logits.shape[0]):
logits = prediction_logits[i][mask[i]] # logits.shape = (n, 256)
        phrases = [
            get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer).replace('.', '')
            for logit in logits  # each logit has shape (256,): per-token scores for one kept query
        ]
boxes = prediction_boxes[i][mask[i]] # boxes.shape = (n, 4)
phrases_batch.append(phrases)
bboxes_batch.append(boxes)
predicts_batch.append(logits.max(dim=1)[0])
return bboxes_batch, predicts_batch, phrases_batch
def plot_sam(
image: np.ndarray,
    masks: np.ndarray,  # (n, 1, H, W) boolean masks from SAM
boxes: np.ndarray,
boxes_shown: bool = True,
masks_shown: bool = True,
) -> np.ndarray:
"""
Plot image with masks and/or boxes.
"""
# Use cv2 to plot the boxes and masks if they exist
if boxes_shown:
for box in boxes:
# red bbox
cv2.rectangle(image, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])), (0, 0, 255), 2)
if masks_shown:
# blue mask
        color = np.array([255, 144, 30], dtype=np.uint8)
for mask in masks:
# turn the mask into a colored mask
h, w = mask.shape[-2:]
mask = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
image = cv2.addWeighted(image, 1, mask, 0.5, 0)
return image
# if __name__ == '__main__':
# def run_sam_dino_vid():
# sam_dino_vid("baboon_15s.mp4", "baboon", box_threshold=0.3, text_threshold=0.3, fps_processed=30, video_options=['Bounding boxes', 'Masks'])
# start_time = datetime.datetime.now()
# stats = run_sam_dino_vid()
# print("elapsed: " + str(datetime.datetime.now() - start_time))