GraspAnything / app.py
Plachta's picture
Update app.py
8e9f709 verified
import copy
import numpy as np
import torch
import sys
sys.path.append("./")
from models import sam_model_registry
from models.grasp_mods import modify_forward
from models.utils.transforms import ResizeLongestSide
from gradio_image_prompter import ImagePrompter
from structures.grasp_box import GraspCoder
img_resize = ResizeLongestSide(1024)
import cv2
import gradio as gr
from models.grasp_mods import add_inference_method
device = "cuda" if torch.cuda.is_available() else "cpu"
model_type = "vit_b"
mean = np.array([103.53, 116.28, 123.675])[:, np.newaxis, np.newaxis]
std = np.array([57.375, 57.12, 58.395])[:, np.newaxis, np.newaxis]
sam = sam_model_registry[model_type]()
sam.to(device=device)
sam.forward = modify_forward(sam)
sam.infer = add_inference_method(sam)
pretrained_model_path = "./epoch_39_step_415131.pth"
if pretrained_model_path != "":
sd = torch.load(pretrained_model_path, map_location='cpu')
# strip prefix "module." from keys
new_sd = {}
for k, v in sd.items():
if k.startswith("module."):
k = k[7:]
new_sd[k] = v
sam.load_state_dict(new_sd)
sam.eval()
def predict(input, topk):
np_image = input["image"]
points = input["points"]
orig_size = np_image.shape[:2]
# normalize image
np_image = np_image.transpose(2, 0, 1)
image = (np_image - mean) / std
image = torch.tensor(image).float().to(device)
image = image.unsqueeze(0)
t_image = img_resize.apply_image_torch(image)
t_orig_size = t_image.shape[-2:]
# pad to 1024x1024
pixel_mask = torch.ones(1, t_orig_size[0], t_orig_size[1], device=device)
t_image = torch.nn.functional.pad(t_image, (0, 1024 - t_image.shape[-1], 0, 1024 - t_image.shape[-2]))
pixel_mask = torch.nn.functional.pad(pixel_mask, (0, 1024 - t_orig_size[1], 0, 1024 - t_orig_size[0]))
# get box prompt
valid_boxes = []
for point in points:
x1, y1, type1, x2, y2, type2 = point
if type1 == 2 and type2 == 3:
valid_boxes.append([x1, y1, x2, y2])
if len(valid_boxes) == 0:
return np_image
t_boxes = np.array(valid_boxes)
t_boxes = img_resize.apply_boxes(t_boxes, orig_size)
box_torch = torch.as_tensor(t_boxes, dtype=torch.float, device=device)
batched_inputs = [{"image": t_image[0], "boxes": box_torch, "pixel_mask": pixel_mask}]
with torch.no_grad():
outputs = sam.infer(batched_inputs, multimask_output=False)
# visualize and post on tensorboard
# recover image
recovered_img = batched_inputs[0]['image'].cpu().numpy()
recovered_img = recovered_img * std + mean
recovered_img = recovered_img.transpose(1, 2, 0).astype(np.uint8).clip(0, 255)
for i in range(len(outputs.pred_masks)):
# get predicted mask
pred_mask = outputs.pred_masks[i].detach().sigmoid().cpu().numpy() > 0.5
pred_mask = pred_mask.transpose(1, 2, 0).repeat(3, axis=2)
# get predicted grasp
pred_logits = outputs.logits[i].detach().cpu().numpy()
top_ind = pred_logits[:, 0].argsort()[-topk:][::-1]
pred_grasp = outputs.pred_boxes[i].detach().cpu().numpy()[top_ind]
coded_grasp = GraspCoder(t_orig_size[0], t_orig_size[1], None, grasp_annos_reformat=pred_grasp)
_ = coded_grasp.decode()
decoded_grasp = copy.deepcopy(coded_grasp.grasp_annos)
# draw mask
mask_color = np.array([0, 255, 0])[None, None, :]
recovered_img[pred_mask] = recovered_img[pred_mask] * 0.5 + (pred_mask * mask_color)[pred_mask] * 0.5
# draw grasp
recovered_img = np.ascontiguousarray(recovered_img)
for grasp in decoded_grasp:
grasp = grasp.astype(int)
cv2.line(recovered_img, tuple(grasp[0:2]), tuple(grasp[2:4]), (255, 0, 0), 1)
cv2.line(recovered_img, tuple(grasp[4:6]), tuple(grasp[6:8]), (255, 0, 0), 1)
cv2.line(recovered_img, tuple(grasp[2:4]), tuple(grasp[4:6]), (0, 0, 255), 2)
cv2.line(recovered_img, tuple(grasp[6:8]), tuple(grasp[0:2]), (0, 0, 255), 2)
recovered_img = recovered_img[:t_orig_size[0], :t_orig_size[1]]
# resize to original size
recovered_img = cv2.resize(recovered_img, (orig_size[1], orig_size[0]))
return recovered_img
if __name__ == "__main__":
app = gr.Blocks(title="GraspAnything")
with app:
gr.Markdown("""
# GraspAnything <br>
Upload an image and draw a box around the object you want to grasp. Set top k to be the number of grasps you want to predict for each object.
""")
with gr.Column():
prompter = ImagePrompter(show_label=False)
top_k = gr.Slider(minimum=1, maximum=20, step=1, value=3, label="Top K Grasps")
with gr.Column():
image_output = gr.Image()
btn = gr.Button("Generate!")
btn.click(predict,
inputs=[prompter, top_k],
outputs=[image_output])
app.launch()