|
|
|
|
|
|
|
import os |
|
import json |
|
from typing import Any |
|
import numpy as np |
|
import random |
|
import torch |
|
import torchvision |
|
import torchvision.transforms as transforms |
|
from PIL import Image |
|
import cv2 |
|
import matplotlib.pyplot as plt |
|
from cog import BasePredictor, Input, Path, BaseModel |
|
|
|
from subprocess import call |
|
|
|
# Install the two vendored source trees with pip, each from inside its own
# directory, returning to the starting directory after every install.
# NOTE(review): presumably required so the GroundingDINO / segment_anything
# imports further down resolve -- confirm against the build image.
HOME = os.getcwd()
for _vendored_dir in ("GroundingDINO", "segment_anything"):
    os.chdir(_vendored_dir)
    call("pip install -q .", shell=True)
    os.chdir(HOME)
del _vendored_dir  # keep the module namespace limited to HOME
|
|
|
|
|
import GroundingDINO.groundingdino.datasets.transforms as T |
|
from GroundingDINO.groundingdino.models import build_model |
|
from GroundingDINO.groundingdino.util.slconfig import SLConfig |
|
from GroundingDINO.groundingdino.util.utils import ( |
|
clean_state_dict, |
|
get_phrases_from_posmap, |
|
) |
|
|
|
|
|
from segment_anything import build_sam, build_sam_hq, SamPredictor |
|
|
|
from ram.models import ram |
|
|
|
|
|
class ModelOutput(BaseModel):
    # Structured result returned by Predictor.predict.

    # Comma-separated tag string derived from the tagging model's output.
    tags: str
    # Path to the saved figure containing the mask overlays and labelled boxes.
    rounding_box_img: Path
    # Path to the saved figure of the integer label map (0 = background).
    masked_img: Path
    # Declared as Any; predict() fills it with the Path of the JSON label file.
    json_data: Any
|
|
|
|
|
class Predictor(BasePredictor):
    """Cog predictor that tags an image, grounds the tags as boxes and segments them.

    The pipeline is: RAM produces a tag string, GroundingDINO turns the tags
    into boxes with phrases, and SAM (or SAM-HQ) predicts one mask per box.
    """

    def setup(self):
        """Load the model into memory to make running multiple predictions efficient"""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Preprocessing for the tagging model: fixed square resize, tensor
        # conversion and channel-wise normalisation.
        normalize = transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        )
        self.image_size = 384
        self.transform = transforms.Compose(
            [
                transforms.Resize((self.image_size, self.image_size)),
                transforms.ToTensor(),
                normalize,
            ]
        )

        # Tagging model (RAM), switched to eval mode and moved to the device.
        self.ram_model = ram(
            pretrained="pretrained/ram_swin_large_14m.pth",
            image_size=self.image_size,
            vit="swin_l",
        )
        self.ram_model.eval()
        self.ram_model = self.ram_model.to(self.device)

        # Grounding model (GroundingDINO); moved to the device on first use
        # inside get_grounding_output.
        self.model = load_model(
            "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
            "pretrained/groundingdino_swint_ogc.pth",
            device=self.device,
        )

        # Both segmenters are loaded up front so predict() can pick either one
        # per request.
        self.sam = SamPredictor(
            build_sam(checkpoint="pretrained/sam_vit_h_4b8939.pth").to(self.device)
        )
        self.sam_hq = SamPredictor(
            build_sam_hq(checkpoint="pretrained/sam_hq_vit_h.pth").to(self.device)
        )

    def predict(
        self,
        input_image: Path = Input(description="Input image"),
        use_sam_hq: bool = Input(
            description="Use sam_hq instead of SAM for prediction", default=False
        ),
    ) -> ModelOutput:
        """Run a single prediction on the model

        Args:
            input_image: Path of the image to label.
            use_sam_hq: When true, segment with the SAM-HQ predictor instead
                of the plain SAM predictor.

        Returns:
            A ModelOutput holding the tag string and the paths of the overlay
            figure, the label-map figure and the JSON label file, all written
            under /tmp.
        """
        box_threshold = 0.25
        text_threshold = 0.2
        iou_threshold = 0.5

        image_pil, image = load_image(str(input_image))

        # --- Tagging -------------------------------------------------------
        raw_image = image_pil.resize((self.image_size, self.image_size))
        raw_image = self.transform(raw_image).unsqueeze(0).to(self.device)
        with torch.no_grad():
            tags, _ = self.ram_model.generate_tag(raw_image)
        # The tagger separates tags with " |"; the grounding prompt uses commas.
        tags = tags[0].replace(" |", ",")

        # --- Grounding -----------------------------------------------------
        boxes_filt, scores, pred_phrases = get_grounding_output(
            self.model, image, tags, box_threshold, text_threshold, device=self.device
        )

        # Scale the boxes by the image size, then convert (cx, cy, w, h) into
        # (x0, y0, x1, y1) for all boxes at once.
        W, H = image_pil.size
        boxes_filt = boxes_filt * torch.tensor([W, H, W, H], dtype=boxes_filt.dtype)
        boxes_filt[:, :2] -= boxes_filt[:, 2:] / 2
        boxes_filt[:, 2:] += boxes_filt[:, :2]
        boxes_filt = boxes_filt.cpu()

        print(f"Before NMS: {boxes_filt.shape[0]} boxes")
        keep = torchvision.ops.nms(boxes_filt, scores, iou_threshold)
        boxes_filt = boxes_filt[keep]
        pred_phrases = [pred_phrases[idx] for idx in keep.tolist()]
        print(f"After NMS: {boxes_filt.shape[0]} boxes")

        # --- Segmentation --------------------------------------------------
        predictor = self.sam_hq if use_sam_hq else self.sam

        if boxes_filt.shape[0] == 0:
            # Nothing was detected, so skip the segmenter rather than hand it
            # an empty batch, and use an empty mask stack of the image size.
            masks = torch.zeros((0, 1, H, W), dtype=torch.bool)
        else:
            # Feed the segmenter the very pixels the boxes were computed on
            # instead of decoding the file a second time with another library,
            # which could disagree on orientation or fail to read the format.
            image = np.array(image_pil)
            predictor.set_image(image)
            transformed_boxes = predictor.transform.apply_boxes_torch(
                boxes_filt, image.shape[:2]
            ).to(self.device)
            masks, _, _ = predictor.predict_torch(
                point_coords=None,
                point_labels=None,
                boxes=transformed_boxes,
                multimask_output=False,
            )

        # --- Overlay figure: masks plus labelled boxes ------------------------
        plt.figure(figsize=(10, 10))
        for mask in masks:
            show_mask(mask.cpu().numpy(), plt.gca(), random_color=True)
        for box, label in zip(boxes_filt, pred_phrases):
            show_box(box.numpy(), plt.gca(), label)

        rounding_box_path = "/tmp/automatic_label_output.png"
        plt.axis("off")
        plt.savefig(
            Path(rounding_box_path), bbox_inches="tight", dpi=300, pad_inches=0.0
        )
        plt.close()

        # --- Label-map figure: 0 is background, mask i gets value i + 1 -------
        mask_img = torch.zeros(masks.shape[-2:])
        for idx, mask in enumerate(masks):
            mask_img[mask[0].cpu().bool()] = idx + 1
        plt.figure(figsize=(10, 10))
        plt.imshow(mask_img.numpy())
        plt.axis("off")
        masks_path = "/tmp/mask.png"
        plt.savefig(masks_path, bbox_inches="tight", dpi=300, pad_inches=0.0)
        plt.close()

        # --- JSON description, values aligned with the label map --------------
        json_data = {
            "tags": tags,
            "mask": [{"value": 0, "label": "background"}],
        }
        for value, (label, box) in enumerate(zip(pred_phrases, boxes_filt), start=1):
            # Labels look like "<phrase>(<score>)"; split on the LAST "(" so a
            # phrase that itself contains "(" cannot break the unpacking.
            name, _, logit = label.rpartition("(")
            json_data["mask"].append(
                {
                    "value": value,
                    "label": name,
                    "logit": float(logit[:-1]),  # strip the closing ")"
                    "box": box.numpy().tolist(),
                }
            )

        json_path = "/tmp/label.json"
        with open(json_path, "w") as f:
            json.dump(json_data, f)

        return ModelOutput(
            tags=tags,
            masked_img=Path(masks_path),
            rounding_box_img=Path(rounding_box_path),
            json_data=Path(json_path),
        )
|
|
|
|
|
def get_grounding_output(
    model, image, caption, box_threshold, text_threshold, device="cpu"
):
    """Run the grounding model on one image and keep the confident boxes.

    Args:
        model: Grounding model; it is moved to ``device`` and must expose a
            ``tokenizer`` attribute.
        image: Image tensor for a single image; a leading batch dimension is
            added before the forward pass.
        caption: Text prompt. It is lower-cased, stripped and terminated with
            a "." if it does not already end with one.
        box_threshold: A query is kept when its largest sigmoid logit exceeds
            this value.
        text_threshold: Per-token cutoff applied to a kept query's logits when
            extracting its phrase.
        device: Device used for the forward pass.

    Returns:
        ``(boxes, scores, phrases)``: the kept rows of ``pred_boxes``, a tensor
        with each kept query's largest logit, and one
        ``"<phrase>(<score>)"`` string per kept query, where the score is the
        first four characters of the float's string form.
    """
    prompt = caption.lower().strip()
    if not prompt.endswith("."):
        prompt += "."

    model = model.to(device)
    image = image.to(device)
    with torch.no_grad():
        outputs = model(image[None], captions=[prompt])

    # Take the single batch element and work on CPU from here on.
    logits = outputs["pred_logits"].cpu().sigmoid()[0]
    boxes = outputs["pred_boxes"].cpu()[0]

    # Boolean-mask indexing already yields fresh tensors.
    confident = logits.max(dim=1)[0] > box_threshold
    logits_filt = logits[confident]
    boxes_filt = boxes[confident]

    tokenizer = model.tokenizer
    tokenized = tokenizer(prompt)

    pred_phrases = []
    scores = []
    for query_logits in logits_filt:
        best = query_logits.max().item()
        phrase = get_phrases_from_posmap(
            query_logits > text_threshold, tokenized, tokenizer
        )
        pred_phrases.append(phrase + f"({str(best)[:4]})")
        scores.append(best)

    return boxes_filt, torch.Tensor(scores), pred_phrases
|
|
|
|
|
def load_image(image_path):
    """Load an image as RGB and build the tensor fed to the grounding model.

    Args:
        image_path: Path of the image file to open.

    Returns:
        ``(image_pil, image)``: the RGB PIL image, and the result of applying
        the resize / to-tensor / normalise transform pipeline to it.
    """
    # convert() returns a separate, fully loaded image, so the source file can
    # be closed right away instead of staying open until garbage collection.
    with Image.open(image_path) as source:
        image_pil = source.convert("RGB")

    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    # The transforms take (image, target); there is no target here.
    image, _ = transform(image_pil, None)
    return image_pil, image
|
|
|
|
|
def load_model(model_config_path, model_checkpoint_path, device):
    """Build the grounding model from a config file and load its weights.

    Args:
        model_config_path: Path of the config file read by ``SLConfig``.
        model_checkpoint_path: Path of the checkpoint; its ``"model"`` entry
            is used as the state dict.
        device: Stored on the config as ``device`` before the model is built.

    Returns:
        The model in eval mode. The checkpoint is loaded onto the CPU and the
        model is not moved to ``device`` here.
    """
    config = SLConfig.fromfile(model_config_path)
    config.device = device
    model = build_model(config)

    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    state_dict = clean_state_dict(checkpoint["model"])
    # Non-strict load; the printed result reports key mismatches.
    print(model.load_state_dict(state_dict, strict=False))

    model.eval()
    return model
|
|
|
|
|
def show_mask(mask, ax, random_color=False):
    """Draw a mask on ``ax`` as a translucent single-colour RGBA image.

    Args:
        mask: Array whose last two dimensions are (height, width); it is
            reshaped to (height, width, 1), so it must hold exactly
            height * width elements.
        ax: Object with an ``imshow`` method that receives the RGBA image.
        random_color: When true, draw the RGB part from ``np.random.random``;
            otherwise use a fixed blue. Alpha is 0.6 in both cases.
    """
    if random_color:
        rgba = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        rgba = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])

    height, width = mask.shape[-2:]
    # Broadcasting (h, w, 1) * (1, 1, 4) paints the colour where the mask is set.
    overlay = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
    ax.imshow(overlay)
|
|
|
|
|
def show_box(box, ax, label):
    """Draw a green box outline on ``ax`` and write ``label`` at its first corner.

    Args:
        box: Indexable of four numbers read as (x0, y0, x1, y1).
        ax: Axes-like object providing ``add_patch`` and ``text``.
        label: Text placed at (x0, y0).
    """
    left, top, right, bottom = box[0], box[1], box[2], box[3]
    outline = plt.Rectangle(
        (left, top),
        right - left,
        bottom - top,
        edgecolor="green",
        facecolor=(0, 0, 0, 0),  # fully transparent fill
        lw=1.5,
    )
    ax.add_patch(outline)
    ax.text(left, top, label)
|
|