Spaces:

hikerxu
/

Grounded-Segment-Anything

Paused

App Files Files Community

Grounded-Segment-Anything / grounded_sam_whisper_demo.py

hikerxu

Upload folder using huggingface_hub

483de47 verified about 2 months ago

raw history blame contribute delete

No virus

8.91 kB

	import argparse
	import os
	import copy

	import numpy as np
	import json
	import torch
	import torchvision
	from PIL import Image, ImageDraw, ImageFont

	# Grounding DINO
	import GroundingDINO.groundingdino.datasets.transforms as T
	from GroundingDINO.groundingdino.models import build_model
	from GroundingDINO.groundingdino.util import box_ops
	from GroundingDINO.groundingdino.util.slconfig import SLConfig
	from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap

	# segment anything
	from segment_anything import build_sam, SamPredictor
	import cv2
	import numpy as np
	import matplotlib.pyplot as plt

	# whisper
	import whisper


	def load_image(image_path):
	    """Open an image file and prepare it as model input.

	    Args:
	        image_path: Path of the image file to open.

	    Returns:
	        A ``(image_pil, image)`` pair: the image converted to RGB as a PIL
	        image, and the output of passing that image through the
	        resize / to-tensor / normalize pipeline built below.
	    """
	    source_rgb = Image.open(image_path).convert("RGB")

	    # Pipeline: RandomResize with the single size 800 (max_size=1333),
	    # tensor conversion, then normalization with fixed mean/std values.
	    steps = [
	        T.RandomResize([800], max_size=1333),
	        T.ToTensor(),
	        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
	    ]
	    pipeline = T.Compose(steps)

	    # The transform is called with (image, target); no target is used here,
	    # so None is passed and the second return value is discarded.
	    model_input, _ = pipeline(source_rgb, None)  # 3, h, w (per the original note)
	    return source_rgb, model_input


	def load_model(model_config_path, model_checkpoint_path, device):
	    """Build the model described by a config file and load checkpoint weights.

	    Args:
	        model_config_path: Path handed to ``SLConfig.fromfile``.
	        model_checkpoint_path: Path of the checkpoint file; its ``"model"``
	            entry is used as the state dict.
	        device: Value stored on the config as ``device`` before building.

	    Returns:
	        The built model, switched to eval mode. The checkpoint itself is
	        loaded with ``map_location="cpu"``.
	    """
	    config = SLConfig.fromfile(model_config_path)
	    config.device = device
	    net = build_model(config)

	    # NOTE(review): torch.load may unpickle arbitrary objects depending on the
	    # torch version/settings -- only load checkpoints from a trusted source.
	    state = torch.load(model_checkpoint_path, map_location="cpu")
	    weights = clean_state_dict(state["model"])

	    # strict=False: key mismatches are returned (and printed) instead of raised.
	    print(net.load_state_dict(weights, strict=False))
	    net.eval()
	    return net


	def get_grounding_output(model, image, caption, box_threshold, text_threshold, device="cpu"):
	    """Run the grounding model on one image/caption pair and filter its boxes.

	    Args:
	        model: Grounding model; must be callable as
	            ``model(images, captions=[...])`` and expose ``tokenizer``.
	        image: Image tensor for a single image (a batch axis is added here).
	        caption: Text prompt. It is lower-cased, stripped, and terminated
	            with "." if it does not already end with one.
	        box_threshold: A query is kept when its maximum (sigmoid) logit is
	            strictly greater than this value.
	        text_threshold: Per-token cut-off used to build each phrase.
	        device: Device the model and image are moved to before inference.

	    Returns:
	        ``(boxes_filt, scores, pred_phrases)``: the kept boxes (on CPU), a
	        tensor with the maximum logit of each kept query, and one label per
	        kept query formatted as ``"<phrase>(<score>)"``.
	    """
	    caption = caption.lower().strip()
	    if not caption.endswith("."):
	        caption = caption + "."
	    model = model.to(device)
	    image = image.to(device)
	    with torch.no_grad():
	        outputs = model(image[None], captions=[caption])
	    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256) per the original note
	    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4) per the original note

	    # Keep only queries whose best token score clears the box threshold.
	    # Boolean-mask indexing already returns new tensors, so no clone is needed.
	    keep = logits.max(dim=1)[0] > box_threshold
	    logits_filt = logits[keep]  # num_filt, 256
	    boxes_filt = boxes[keep]  # num_filt, 4

	    tokenizer = model.tokenizer
	    tokenized = tokenizer(caption)

	    pred_phrases = []
	    scores = []
	    for logit in logits_filt:
	        score = logit.max().item()
	        phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer)
	        # Format in fixed-point before truncating to four characters:
	        # str(score) switches to scientific notation for tiny values, and
	        # truncating that (e.g. "9.5e-05" -> "9.5e") yields a string that
	        # cannot be parsed back into a float.
	        pred_phrases.append(f"{phrase}({f'{score:.6f}'[:4]})")
	        scores.append(score)

	    return boxes_filt, torch.Tensor(scores), pred_phrases

	def show_mask(mask, ax, random_color=False):
	    """Draw a mask on ``ax`` as a semi-transparent colored overlay.

	    Args:
	        mask: Array whose last two axes are (height, width); it is reshaped
	            to ``(height, width, 1)`` before being colored.
	        ax: Object with an ``imshow`` method that receives the RGBA overlay.
	        random_color: When true, use a random RGB color with alpha 0.6;
	            otherwise use the fixed color (30, 144, 255)/255 with alpha 0.6.
	    """
	    if random_color:
	        rgba = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
	    else:
	        rgba = np.array([30/255, 144/255, 255/255, 0.6])

	    height, width = mask.shape[-2:]
	    # Broadcasting (h, w, 1) * (1, 1, 4) gives an (h, w, 4) image that is
	    # zero wherever the mask is zero/False.
	    overlay = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
	    ax.imshow(overlay)


	def show_box(box, ax, label):
	    """Outline a box on ``ax`` in green and write its label at the box origin.

	    Args:
	        box: Indexable as ``(x0, y0, x1, y1)``; width and height are taken
	            as ``x1 - x0`` and ``y1 - y0``.
	        ax: Object providing ``add_patch`` and ``text``.
	        label: Text placed at ``(x0, y0)``.
	    """
	    left, top = box[0], box[1]
	    width = box[2] - box[0]
	    height = box[3] - box[1]
	    # Fully transparent face color so only the outline is drawn.
	    outline = plt.Rectangle(
	        (left, top), width, height, edgecolor='green', facecolor=(0,0,0,0), lw=2
	    )
	    ax.add_patch(outline)
	    ax.text(left, top, label)


	def save_mask_data(output_dir, mask_list, box_list, label_list):
	    """Write a combined label image and a JSON description of the masks.

	    Two files are written into ``output_dir``:

	    * ``mask.jpg`` -- a rendering of an image in which pixels of the i-th
	      mask hold the value ``i + 1`` and everything else holds 0. Where
	      masks overlap, the later mask overwrites the earlier one.
	    * ``mask.json`` -- a list starting with the background entry (value 0)
	      followed by one entry per label/box pair with its value, label text,
	      score and box coordinates.

	    Args:
	        output_dir: Existing directory the two files are written to.
	        mask_list: Tensor of masks; the last two axes give the image size
	            and ``mask[0]`` of each item is used as the 2-D mask.
	        box_list: Iterable of box tensors, parallel to ``label_list``.
	        label_list: Labels formatted as ``"<name>(<score>)"``.

	    Raises:
	        ValueError: If a label has no "(" or its score is not a float.
	    """
	    value = 0  # 0 for background

	    mask_img = torch.zeros(mask_list.shape[-2:])
	    for idx, mask in enumerate(mask_list):
	        mask_img[mask.cpu().numpy()[0] == True] = value + idx + 1

	    fig = plt.figure(figsize=(10, 10))
	    plt.imshow(mask_img.numpy())
	    plt.axis('off')
	    plt.savefig(os.path.join(output_dir, 'mask.jpg'), bbox_inches="tight", dpi=300, pad_inches=0.0)
	    # pyplot keeps figures alive until they are closed; release this one
	    # now that it has been written to disk.
	    plt.close(fig)

	    json_data = [{
	        'value': value,
	        'label': 'background'
	    }]
	    for value, (label, box) in enumerate(zip(label_list, box_list), start=1):
	        # Split on the LAST "(" so a phrase that itself contains "(" does
	        # not break the two-way unpacking.
	        name, logit = label.rsplit('(', 1)
	        logit = logit[:-1]  # the last is ')'
	        json_data.append({
	            'value': value,
	            'label': name,
	            'logit': float(logit),
	            'box': box.numpy().tolist(),
	        })
	    with open(os.path.join(output_dir, 'mask.json'), 'w') as f:
	        json.dump(json_data, f)


	def speech_recognition(speech_file, model):
	    """Transcribe an audio file with Whisper and report its detected language.

	    Args:
	        speech_file: Path of the audio file to load.
	        model: Loaded Whisper model; it must expose ``device`` and
	            ``detect_language``.

	    Returns:
	        ``(speech_text, speech_language)``: the decoded text and the
	        language key with the highest detection probability.
	    """
	    # load audio and pad/trim it to fit 30 seconds
	    audio = whisper.load_audio(speech_file)
	    audio = whisper.pad_or_trim(audio)

	    # make log-Mel spectrogram and move to the same device as the model
	    mel = whisper.log_mel_spectrogram(audio).to(model.device)

	    # detect the spoken language: pick the most probable entry
	    _, probs = model.detect_language(mel)
	    speech_language = max(probs, key=probs.get)

	    # decode the audio. DecodingOptions defaults to fp16, and unlike
	    # whisper.transcribe, whisper.decode does not fall back to fp32 on CPU,
	    # where half-precision ops may be unsupported. Only CPU behaviour changes.
	    use_fp16 = torch.device(model.device).type != "cpu"
	    options = whisper.DecodingOptions(fp16=use_fp16)
	    result = whisper.decode(model, mel, options)

	    # hand the recognized text back to the caller (nothing is printed here)
	    speech_text = result.text
	    return speech_text, speech_language

	if __name__ == "__main__":
	    # Demo pipeline: transcribe a speech file, use the transcript as the text
	    # prompt for box detection, segment every detected box with SAM, and
	    # write the visualizations plus mask data into the output directory.

	    parser = argparse.ArgumentParser("Grounded-Segment-Anything Demo", add_help=True)
	    parser.add_argument("--config", type=str, required=True, help="path to config file")
	    parser.add_argument(
	        "--grounded_checkpoint", type=str, required=True, help="path to checkpoint file"
	    )
	    parser.add_argument(
	        "--sam_checkpoint", type=str, required=True, help="path to checkpoint file"
	    )
	    parser.add_argument("--input_image", type=str, required=True, help="path to image file")
	    parser.add_argument("--speech_file", type=str, required=True, help="speech file")
	    # Not marked required: a required option never uses its default, so the
	    # "outputs" default was dead. Passing the option explicitly still works.
	    parser.add_argument(
	        "--output_dir", "-o", type=str, default="outputs", help="output directory"
	    )

	    parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
	    parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
	    parser.add_argument("--iou_threshold", type=float, default=0.5, help="iou threshold")

	    parser.add_argument(
	        "--device", type=str, default="cpu", help="device to run the models on, e.g. cpu or cuda (default: cpu)"
	    )
	    args = parser.parse_args()

	    # cfg
	    config_file = args.config  # change the path of the model config file
	    grounded_checkpoint = args.grounded_checkpoint  # change the path of the model
	    sam_checkpoint = args.sam_checkpoint
	    image_path = args.input_image
	    output_dir = args.output_dir
	    box_threshold = args.box_threshold
	    text_threshold = args.text_threshold
	    iou_threshold = args.iou_threshold
	    device = args.device

	    # load speech and turn it into the text prompt
	    whisper_model = whisper.load_model("base")
	    speech_text, speech_language = speech_recognition(args.speech_file, whisper_model)
	    print(f"speech_text: {speech_text}")
	    print(f"speech_language: {speech_language}")

	    # make dir
	    os.makedirs(output_dir, exist_ok=True)
	    # load image
	    image_pil, image = load_image(image_path)
	    # load model
	    model = load_model(config_file, grounded_checkpoint, device=device)

	    # visualize raw image
	    image_pil.save(os.path.join(output_dir, "raw_image.jpg"))

	    # run grounding dino model
	    text_prompt = speech_text
	    boxes_filt, scores, pred_phrases = get_grounding_output(
	        model, image, text_prompt, box_threshold, text_threshold, device=device
	    )

	    # initialize SAM
	    sam = build_sam(checkpoint=sam_checkpoint)
	    sam.to(device=device)
	    predictor = SamPredictor(sam)
	    image = cv2.imread(image_path)
	    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
	    predictor.set_image(image)

	    # Scale the boxes by the image size, then convert each one from
	    # (center_x, center_y, width, height) to (x0, y0, x1, y1) -- all rows at once.
	    W, H = image_pil.size
	    boxes_filt = boxes_filt * torch.Tensor([W, H, W, H])
	    boxes_filt[:, :2] -= boxes_filt[:, 2:] / 2
	    boxes_filt[:, 2:] += boxes_filt[:, :2]

	    boxes_filt = boxes_filt.cpu()
	    # use NMS to handle overlapped boxes
	    print(f"Before NMS: {boxes_filt.shape[0]} boxes")
	    nms_idx = torchvision.ops.nms(boxes_filt, scores, iou_threshold).numpy().tolist()
	    boxes_filt = boxes_filt[nms_idx]
	    pred_phrases = [pred_phrases[idx] for idx in nms_idx]
	    print(f"After NMS: {boxes_filt.shape[0]} boxes")

	    if boxes_filt.shape[0] > 0:
	        transformed_boxes = predictor.transform.apply_boxes_torch(boxes_filt, image.shape[:2]).to(device)

	        masks, _, _ = predictor.predict_torch(
	            point_coords=None,
	            point_labels=None,
	            boxes=transformed_boxes,
	            multimask_output=False,
	        )
	    else:
	        # NOTE(review): predict_torch is presumably not safe to call with an
	        # empty box batch -- confirm. Skip it and use an empty mask stack so
	        # the output files are still produced (image without overlays,
	        # JSON with only the background entry).
	        print("No boxes detected for the given prompt; skipping segmentation.")
	        masks = torch.zeros((0, 1, *image.shape[:2]), dtype=torch.bool)

	    # draw output image
	    plt.figure(figsize=(10, 10))
	    plt.imshow(image)
	    for mask in masks:
	        show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
	    for box, label in zip(boxes_filt, pred_phrases):
	        show_box(box.numpy(), plt.gca(), label)

	    plt.title(speech_text)
	    plt.axis('off')
	    plt.savefig(
	        os.path.join(output_dir, "grounded_sam_whisper_output.jpg"),
	        bbox_inches="tight", dpi=300, pad_inches=0.0
	    )

	    save_mask_data(output_dir, masks, boxes_filt, pred_phrases)