Spaces:

xhk
/

ASAM

Running

App Files Files Community

ASAM / app.py

xhk

Upload 138 files

2fe64fd verified 5 months ago

raw

history blame

19.1 kB

	import os, sys
	import random
	import warnings
	import copy
	# Runtime bootstrap: the dependencies are installed with pip when this module
	# is executed, so these calls must run before the third-party imports below.
	# Editable installs of the local `asam` and `GroundingDINO` directories.
	os.system("python -m pip install -e asam")
	os.system("python -m pip install -e GroundingDINO")
	# os.system("python -m pip uninstall gradio")
	# Gradio is pinned to an exact version.
	os.system("python -m pip install gradio==3.38.0")
	os.system("pip install opencv-python pycocotools matplotlib onnxruntime onnx ipykernel")
	# Make the local GroundingDINO and asam directories (relative to the current
	# working directory) importable.
	sys.path.append(os.path.join(os.getcwd(), "GroundingDINO"))
	sys.path.append(os.path.join(os.getcwd(), "asam"))
	# Silence every warning category for the whole process.
	warnings.filterwarnings("ignore")

	import gradio as gr
	import argparse

	import numpy as np
	import torch
	import torchvision
	from PIL import Image, ImageDraw, ImageFont
	from scipy import ndimage

	# Grounding DINO
	import GroundingDINO.groundingdino.datasets.transforms as T
	from GroundingDINO.groundingdino.models import build_model
	from GroundingDINO.groundingdino.util.slconfig import SLConfig
	from GroundingDINO.groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap

	# segment anything
	from segment_anything import build_sam_vit_b, SamPredictor
	import numpy as np


	# BLIP
	from transformers import BlipProcessor, BlipForConditionalGeneration


	def generate_caption(processor, blip_model, raw_image):
	    """Produce an unconditional caption for ``raw_image``.

	    Args:
	        processor: Callable pre-processor that also provides ``decode``.
	        blip_model: Captioning model exposing ``generate``.
	        raw_image: Image handed to ``processor`` unchanged.

	    Returns:
	        The decoded first generated sequence, with special tokens skipped.
	    """
	    # The encoded inputs are moved to the module-level ``device``.
	    model_inputs = processor(raw_image, return_tensors="pt")
	    model_inputs = model_inputs.to(device)  # fp 16
	    generated = blip_model.generate(**model_inputs)
	    return processor.decode(generated[0], skip_special_tokens=True)


	def transform_image(image_pil):
	    """Convert a PIL image into the normalized tensor fed to Grounding DINO.

	    The pipeline resizes (target size 800, ``max_size`` 1333), converts to a
	    tensor and normalizes with the mean/std constants listed below.

	    Returns:
	        The transformed image tensor (3, h, w per the original comment).
	    """
	    pipeline = T.Compose([
	        T.RandomResize([800], max_size=1333),
	        T.ToTensor(),
	        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
	    ])
	    # The transforms operate on (image, target) pairs; no target is needed.
	    tensor, _unused_target = pipeline(image_pil, None)
	    return tensor


	def load_model(model_config_path, model_checkpoint_path, device):
	    """Build the grounding model from a config file and load its weights.

	    Args:
	        model_config_path: Path of the config file read by ``SLConfig``.
	        model_checkpoint_path: Path of a checkpoint holding a ``"model"`` entry.
	        device: Value stored on the config as ``device`` before building.

	    Returns:
	        The model in evaluation mode.
	    """
	    config = SLConfig.fromfile(model_config_path)
	    config.device = device
	    model = build_model(config)

	    # Weights are deserialized on the CPU and loaded non-strictly; the report
	    # returned by ``load_state_dict`` is printed.
	    state = torch.load(model_checkpoint_path, map_location="cpu")["model"]
	    print(model.load_state_dict(clean_state_dict(state), strict=False))

	    model.eval()
	    return model


	def get_grounding_output(model, image, caption, box_threshold, text_threshold, with_logits=True):
	    """Run the grounding model on one image and keep the confident boxes.

	    Args:
	        model: Grounding model; called as ``model(batch, captions=[...])`` and
	            expected to expose a ``tokenizer`` attribute.
	        image: Single image tensor; a batch dimension is added here.
	        caption: Text prompt. It is lower-cased, stripped and terminated with
	            a "." if it does not already end with one.
	        box_threshold: A query is kept when its highest token score exceeds
	            this value.
	        text_threshold: Token scores above this value select the words that
	            make up a box's phrase.
	        with_logits: When true, the score (first four characters of its string
	            form) is appended to each phrase in parentheses.

	    Returns:
	        ``(boxes, scores, phrases)``: the kept rows of ``pred_boxes``, a tensor
	        with the highest token score of each kept query, and one phrase per
	        kept query.
	    """
	    caption = caption.lower().strip()
	    if not caption.endswith("."):
	        caption = caption + "."

	    with torch.no_grad():
	        outputs = model(image[None], captions=[caption])
	    logits = outputs["pred_logits"].cpu().sigmoid()[0]  # (nq, 256)
	    boxes = outputs["pred_boxes"].cpu()[0]  # (nq, 4)

	    # Boolean-mask indexing already yields new tensors, so no explicit clone
	    # is required before filtering.
	    keep = logits.max(dim=1)[0] > box_threshold
	    logits_filt = logits[keep]  # num_filt, 256
	    boxes_filt = boxes[keep]  # num_filt, 4

	    tokenizer = model.tokenizer
	    tokenized = tokenizer(caption)

	    pred_phrases = []
	    scores = []
	    for logit in logits_filt:
	        score = logit.max().item()
	        phrase = get_phrases_from_posmap(
	            logit > text_threshold, tokenized, tokenizer)
	        if with_logits:
	            phrase = phrase + f"({str(score)[:4]})"
	        pred_phrases.append(phrase)
	        scores.append(score)

	    return boxes_filt, torch.Tensor(scores), pred_phrases


	def draw_mask(mask, draw, random_color=False):
	    """Paint every non-zero pixel of a 2-D mask onto ``draw``.

	    Args:
	        mask: 2-D array; non-zero entries are painted.
	        draw: Drawing object exposing ``point(xy, fill=...)``.
	        random_color: When true a random RGB colour is used, otherwise a fixed
	            blue; the alpha value is 153 in both cases.
	    """
	    if random_color:
	        color = (random.randint(0, 255), random.randint(
	            0, 255), random.randint(0, 255), 153)
	    else:
	        color = (30, 144, 255, 153)

	    # ``np.nonzero`` yields (row, col); the columns are reversed so each pair
	    # reads (x, y), then everything is flattened to [x, y, x, y, ...].
	    nonzero_coords = np.transpose(np.nonzero(mask))
	    flat_xy = nonzero_coords[:, ::-1].flatten().tolist()
	    if not flat_xy:
	        return

	    # One batched call instead of one Python-level call per mask pixel.
	    draw.point(flat_xy, fill=color)


	def draw_box(box, draw, label):
	    """Outline ``box`` in a random colour and optionally tag it with ``label``.

	    Args:
	        box: Indexable as ``(x0, y0, x1, y1)``.
	        draw: Drawing object exposing ``rectangle``, ``text`` and either
	            ``textbbox`` or ``textsize``.
	        label: Text drawn at the box's first corner on a filled background;
	            nothing is written when it is falsy.
	    """
	    # random color
	    color = tuple(np.random.randint(0, 255, size=3).tolist())

	    draw.rectangle(((box[0], box[1]), (box[2], box[3])),
	                   outline=color, width=2)

	    if not label:
	        return

	    text = str(label)
	    origin = (box[0], box[1])
	    font = ImageFont.load_default()
	    if hasattr(font, "getbbox"):
	        bbox = draw.textbbox(origin, text, font)
	    else:
	        # Fallback for fonts without ``getbbox``: measure with ``textsize``.
	        w, h = draw.textsize(text, font)
	        bbox = (box[0], box[1], w + box[0], box[1] + h)
	    draw.rectangle(bbox, fill=color)
	    # The label is drawn once; the original repeated the same text at the same
	    # position a second time, which added nothing.
	    draw.text(origin, text, fill="white")

	def draw_point(point, draw, r=10):
	    """Draw a filled green circle of radius ``r`` around every point.

	    Args:
	        point: Iterable of ``(x, y)`` pairs.
	        draw: Drawing object exposing ``ellipse(xy, fill=...)``.
	        r: Circle radius, in the same units as the coordinates.
	    """
	    for x, y in point:
	        draw.ellipse((x - r, y - r, x + r, y + r), fill='green')


	# Module-level configuration read by run_grounded_sam.
	# Grounding DINO config file and checkpoint, passed to load_model.
	config_file = 'GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
	# NOTE(review): the name is misspelled ("filenmae"); it is referenced under
	# this spelling elsewhere in the file, so rename all uses together.
	ckpt_filenmae = "groundingdino_swint_ogc.pth"
	# Two state dicts that are loaded into the same predictor model in turn.
	sam_checkpoint = 'sam_vit_b_01ec64.pth'
	asam_checkpoint = 'asam_vit_b.pth'
	# Directory created (if missing) at the start of every request.
	output_dir = "outputs"
	# Use CUDA when torch reports it as available, otherwise the CPU.
	device = 'cuda' if torch.cuda.is_available() else 'cpu'
	# Lazily initialized model handles: they start as None and are filled in by
	# run_grounded_sam the first time they are needed.
	blip_processor = None
	blip_model = None
	groundingdino_model = None
	sam_predictor = None


	# Box prompts (x_min, y_min, x_max, y_max) for the bundled example images,
	# keyed by the integer sum of the transformed image tensor.
	# NOTE(review): a float sum is a fragile fingerprint and may differ between
	# torch builds or hardware; confirm the keys in the deployment environment.
	# The "*" marks are carried over from the original comments; their meaning is
	# not recorded here.
	_EXAMPLE_BOXES = {
	    -1683627: (204, 213, 813, 1023),  # example 1 *
	    1137390: (125, 168, 842, 904),  # example 2 *
	    1145309: (0, 486, 992, 899),  # example 3 *
	    1091779: (2, 73, 981, 968),  # example 4 *
	    -1335352: (201, 195, 811, 1023),  # example 5 *
	    -1479645: (428, 0, 992, 799),  # example 6
	    -544197: (106, 419, 312, 783),  # example 7
	    -23873: (250, 25, 774, 803),  # example 8
	    -1572157: (15, 88, 1006, 977),  # example 9 *
	}


	def _scribble_centers(scribble):
	    """Return the centre of mass of every saturated (>= 255) scribble stroke.

	    The array is transposed with axes (2, 1, 0) and its first plane is used,
	    so for an H x W x C sketch mask (assumed - confirm against the Gradio
	    input) each returned row is ordered (x, y).

	    Raises:
	        ValueError: If no stroke is present.
	    """
	    plane = scribble.transpose(2, 1, 0)[0]
	    labeled_array, num_features = ndimage.label(plane >= 255)
	    if num_features == 0:
	        raise ValueError("No scribble found: draw on the image before running.")
	    centers = ndimage.center_of_mass(
	        plane, labeled_array, range(1, num_features + 1))
	    return np.array(centers)


	def _segment_with_checkpoint(checkpoint, image, predict):
	    """Load ``checkpoint`` into the shared predictor and run ``predict``.

	    The image is embedded again after the weights are swapped, because the
	    predictor reuses the embedding computed by its last ``set_image`` call.
	    """
	    sam_predictor.model.load_state_dict(
	        torch.load(checkpoint, map_location='cpu'))
	    sam_predictor.set_image(image)
	    return predict()


	def _compose_overlay(base_image, masks, annotate):
	    """Render one result pair: [annotated RGBA image, RGBA mask layer].

	    ``masks`` is an iterable of 2-D arrays; ``annotate`` receives a drawing
	    object for a copy of ``base_image`` and draws the prompts on it.
	    """
	    mask_layer = Image.new('RGBA', base_image.size, color=(0, 0, 0, 0))
	    mask_draw = ImageDraw.Draw(mask_layer)
	    for mask in masks:
	        draw_mask(mask, mask_draw, random_color=True)

	    canvas = base_image.copy()
	    annotate(ImageDraw.Draw(canvas))
	    canvas = canvas.convert('RGBA')
	    canvas.alpha_composite(mask_layer)
	    return [canvas, mask_layer]


	def run_grounded_sam(input_image, text_prompt, task_type, box_threshold, text_threshold, iou_threshold):
	    """Segment the input image with both checkpoints and render the results.

	    Args:
	        input_image: Mapping with an ``"image"`` entry (PIL image) and a
	            ``"mask"`` entry (the sketch layer).
	        text_prompt: Text prompt; replaced by a generated caption when
	            ``task_type`` is ``'automatic'``.
	        task_type: One of ``'default_box'``, ``'automatic'``,
	            ``'scribble_point'`` or ``'scribble_box'``.
	        box_threshold: Box score threshold for the grounding model.
	        text_threshold: Token score threshold for the grounding model.
	        iou_threshold: IoU threshold used for non-maximum suppression.

	    Returns:
	        ``[[image, mask_layer], [image, mask_layer]]``: the first pair is
	        produced with ``sam_checkpoint``, the second with ``asam_checkpoint``.

	    Raises:
	        ValueError: For an unknown ``task_type`` or an empty scribble.
	        NotImplementedError: For ``'default_box'`` on an image that has no
	            predefined box.
	    """
	    global blip_processor, blip_model, groundingdino_model, sam_predictor
	    print(text_prompt, type(text_prompt))

	    # Reject unknown task types before any model is loaded or run.
	    if task_type not in ('default_box', 'automatic', 'scribble_box', 'scribble_point'):
	        print("task_type:{} error!".format(task_type))
	        raise ValueError("task_type:{} error!".format(task_type))

	    # make dir
	    os.makedirs(output_dir, exist_ok=True)

	    # load image
	    scribble = np.array(input_image["mask"])
	    image_pil = input_image["image"].convert("RGB")
	    size = image_pil.size
	    image = np.array(image_pil)
	    transformed_image = transform_image(image_pil)
	    image_key = torch.sum(transformed_image).to(torch.int).item()
	    print('img sum:', image_key)

	    if sam_predictor is None:
	        # initialize SAM
	        sam = build_sam_vit_b(checkpoint=sam_checkpoint)
	        sam.to(device=device)
	        sam_predictor = SamPredictor(sam)

	    if task_type == 'scribble_point':
	        point_coords = _scribble_centers(scribble)
	        point_labels = np.ones(point_coords.shape[0])

	        def predict():
	            masks, _, _ = sam_predictor.predict(
	                point_coords=point_coords,
	                point_labels=point_labels,
	                box=None,
	                multimask_output=False,
	            )
	            return list(masks)

	        def annotate(draw):
	            draw_point(point_coords, draw)

	    else:
	        if task_type == 'automatic':
	            # Grounding DINO and BLIP are only needed for this task, so they
	            # are loaded and run here rather than on every request.
	            if groundingdino_model is None:
	                groundingdino_model = load_model(
	                    config_file, ckpt_filenmae, device=device)

	            # generate caption and tags
	            # use Tag2Text can generate better captions
	            # https://huggingface.co/spaces/xinyu1205/Tag2Text
	            # but there are some bugs...
	            blip_processor = blip_processor or BlipProcessor.from_pretrained(
	                "Salesforce/blip-image-captioning-large")
	            blip_model = blip_model or BlipForConditionalGeneration.from_pretrained(
	                "Salesforce/blip-image-captioning-large").to(device)  # torch_dtype=torch.float16
	            text_prompt = generate_caption(blip_processor, blip_model, image_pil)
	            print(f"Caption: {text_prompt}")

	            # run grounding dino model
	            prompt_boxes, scores, pred_phrases = get_grounding_output(
	                groundingdino_model, transformed_image, text_prompt, box_threshold, text_threshold
	            )

	            # Scale to pixels, then turn (centre, size) into (top-left,
	            # bottom-right): same arithmetic as the per-row loop it replaces.
	            W, H = size
	            prompt_boxes = prompt_boxes * torch.Tensor([W, H, W, H])
	            prompt_boxes[:, :2] -= prompt_boxes[:, 2:] / 2
	            prompt_boxes[:, 2:] += prompt_boxes[:, :2]
	            prompt_boxes = prompt_boxes.cpu()

	            # nms
	            print(f"Before NMS: {prompt_boxes.shape[0]} boxes")
	            nms_idx = torchvision.ops.nms(
	                prompt_boxes, scores, iou_threshold).numpy().tolist()
	            prompt_boxes = prompt_boxes[nms_idx]
	            pred_phrases = [pred_phrases[idx] for idx in nms_idx]
	            print(f"After NMS: {prompt_boxes.shape[0]} boxes")

	            labelled_boxes = list(zip(prompt_boxes, pred_phrases))

	        else:
	            if task_type == 'default_box':
	                example_box = _EXAMPLE_BOXES.get(image_key)
	                if example_box is None:
	                    print("not defined")
	                    raise NotImplementedError
	                corners = np.array(example_box)
	            else:  # scribble_box
	                centers = _scribble_centers(scribble)
	                ### (x1, y1, x2, y2)
	                corners = np.array([
	                    centers[:, 0].min(),
	                    centers[:, 1].min(),
	                    centers[:, 0].max(),
	                    centers[:, 1].max(),
	                ])
	            prompt_boxes = torch.tensor(corners).unsqueeze(0)
	            labelled_boxes = [(box, None) for box in prompt_boxes]

	        transformed_boxes = sam_predictor.transform.apply_boxes_torch(
	            prompt_boxes, image.shape[:2]).to(device)

	        def predict():
	            masks, _, _ = sam_predictor.predict_torch(
	                point_coords=None,
	                point_labels=None,
	                boxes=transformed_boxes,
	                multimask_output=False,
	            )
	            return [mask[0].cpu().numpy() for mask in masks]

	        def annotate(draw):
	            for box, label in labelled_boxes:
	                draw_box(box, draw, label)
	            if task_type == 'automatic':
	                draw.text((10, 10), text_prompt, fill='black')

	    # sam`s output first, then asam`s output
	    return [
	        _compose_overlay(
	            image_pil, _segment_with_checkpoint(checkpoint, image, predict), annotate)
	        for checkpoint in (sam_checkpoint, asam_checkpoint)
	    ]


	if __name__ == "__main__":
	    # Command-line entry point: parse the flags, build the Gradio UI and
	    # launch it.
	    parser = argparse.ArgumentParser("Grounded SAM demo", add_help=True)
	    parser.add_argument("--debug", action="store_true",
	                        help="using debug mode")
	    parser.add_argument("--share", action="store_true", help="share the app")
	    # The flag disables the request queue (see its use below); the previous
	    # help text described an unrelated checkpoint path.
	    parser.add_argument('--no-gradio-queue', action="store_true",
	                        help='disable the gradio request queue')
	    args = parser.parse_args()

	    print(args)

	    block = gr.Blocks()
	    if not args.no_gradio_queue:
	        block = block.queue()

	    with block:
	        gr.Markdown(
	            """
	            # ASAM

	            Welcome to the ASAM demo <br/>
	            You may select different prompt types to get the output mask of target instance.

	            ## Usage
	            You may check the instruction below, or check our github page about more details.

	            ## Mode
	            You may select an example image or upload your image to start, we support 4 prompt types:

	            default_box: According to the mask label, automaticly generate the default box prompt, only used for examples.

	            automatic: Automaticly generate text prompt and the corresponding box input with BLIP and Grounding-DINO.

	            scribble_point: Click an point on the target instance.

	            scribble_box: Click on two points, the top-left point and the bottom-right point to represent a bounding box of the target instance.

	            """)

	        with gr.Row():
	            # Left column: inputs and controls.
	            with gr.Column():
	                input_image = gr.Image(
	                    source='upload', type="pil", value="example9.jpg", tool="sketch", brush_radius=20)
	                task_type = gr.Dropdown(
	                    ["default_box", "automatic", "scribble_point", "scribble_box"], value="default_box", label="task_type")
	                text_prompt = gr.Textbox(label="Text Prompt", placeholder="bench .", visible=False)
	                run_button = gr.Button(label="Run")
	                with gr.Accordion("Advanced options", open=False):
	                    box_threshold = gr.Slider(
	                        label="Box Threshold", minimum=0.0, maximum=1.0, value=0.4, step=0.001
	                    )
	                    text_threshold = gr.Slider(
	                        label="Text Threshold", minimum=0.0, maximum=1.0, value=0.25, step=0.001
	                    )
	                    iou_threshold = gr.Slider(
	                        label="IOU Threshold", minimum=0.0, maximum=1.0, value=0.8, step=0.001
	                    )

	            # Right column: one gallery per checkpoint.
	            with gr.Column():
	                gr.Markdown(
	                    """
	                    # SAM`s output
	                    """)

	                gallery1 = gr.Gallery(
	                    label="Generated images", show_label=False, elem_id="gallery"
	                ).style(preview=True, grid=2, object_fit="scale-down")

	                gr.Markdown(
	                    """
	                    # ASAM`s output
	                    """)
	                gallery2 = gr.Gallery(
	                    label="Generated images", show_label=False, elem_id="gallery"
	                ).style(preview=True, grid=2, object_fit="scale-down")

	        # One column per bundled example image (example1.jpg .. example9.jpg).
	        with gr.Row():
	            for example_index in range(1, 10):
	                with gr.Column():
	                    gr.Examples([f"example{example_index}.jpg"], inputs=input_image)

	        run_button.click(fn=run_grounded_sam, inputs=[
	            input_image, text_prompt, task_type, box_threshold, text_threshold, iou_threshold], outputs=[gallery1, gallery2])

	    block.launch(debug=args.debug, share=args.share, show_error=True)