Spaces:
Sleeping
Sleeping
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. | |
import cv2 | |
import torch | |
import numpy as np | |
from torchvision import transforms as T | |
from maskrcnn_benchmark.modeling.detector import build_detection_model | |
from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer | |
from maskrcnn_benchmark.structures.image_list import to_image_list | |
from maskrcnn_benchmark.structures.boxlist_ops import boxlist_iou | |
from maskrcnn_benchmark.structures.bounding_box import BoxList | |
from maskrcnn_benchmark.modeling.roi_heads.mask_head.inference import Masker | |
from maskrcnn_benchmark import layers as L | |
from maskrcnn_benchmark.utils import cv2_util | |
import timeit | |
class COCODemo(object): | |
# COCO categories for pretty print | |
CATEGORIES = [ | |
"__background", | |
"person", | |
"bicycle", | |
"car", | |
"motorcycle", | |
"airplane", | |
"bus", | |
"train", | |
"truck", | |
"boat", | |
"traffic light", | |
"fire hydrant", | |
"stop sign", | |
"parking meter", | |
"bench", | |
"bird", | |
"cat", | |
"dog", | |
"horse", | |
"sheep", | |
"cow", | |
"elephant", | |
"bear", | |
"zebra", | |
"giraffe", | |
"backpack", | |
"umbrella", | |
"handbag", | |
"tie", | |
"suitcase", | |
"frisbee", | |
"skis", | |
"snowboard", | |
"sports ball", | |
"kite", | |
"baseball bat", | |
"baseball glove", | |
"skateboard", | |
"surfboard", | |
"tennis racket", | |
"bottle", | |
"wine glass", | |
"cup", | |
"fork", | |
"knife", | |
"spoon", | |
"bowl", | |
"banana", | |
"apple", | |
"sandwich", | |
"orange", | |
"broccoli", | |
"carrot", | |
"hot dog", | |
"pizza", | |
"donut", | |
"cake", | |
"chair", | |
"couch", | |
"potted plant", | |
"bed", | |
"dining table", | |
"toilet", | |
"tv", | |
"laptop", | |
"mouse", | |
"remote", | |
"keyboard", | |
"cell phone", | |
"microwave", | |
"oven", | |
"toaster", | |
"sink", | |
"refrigerator", | |
"book", | |
"clock", | |
"vase", | |
"scissors", | |
"teddy bear", | |
"hair drier", | |
"toothbrush", | |
] | |
def __init__( | |
self, | |
cfg, | |
confidence_threshold=0.7, | |
show_mask_heatmaps=False, | |
masks_per_dim=2, | |
min_image_size=None, | |
exclude_region=None, | |
): | |
self.cfg = cfg.clone() | |
self.model = build_detection_model(cfg) | |
self.model.eval() | |
self.device = torch.device(cfg.MODEL.DEVICE) | |
self.model.to(self.device) | |
self.min_image_size = min_image_size | |
save_dir = cfg.OUTPUT_DIR | |
checkpointer = DetectronCheckpointer(cfg, self.model, save_dir=save_dir) | |
_ = checkpointer.load(cfg.MODEL.WEIGHT) | |
self.transforms = self.build_transform() | |
mask_threshold = -1 if show_mask_heatmaps else 0.5 | |
self.masker = Masker(threshold=mask_threshold, padding=1) | |
# used to make colors for each class | |
self.palette = torch.tensor([2**25 - 1, 2**15 - 1, 2**21 - 1]) | |
self.cpu_device = torch.device("cpu") | |
self.confidence_threshold = confidence_threshold | |
self.show_mask_heatmaps = show_mask_heatmaps | |
self.masks_per_dim = masks_per_dim | |
self.exclude_region = exclude_region | |
def build_transform(self): | |
""" | |
Creates a basic transformation that was used to train the models | |
""" | |
cfg = self.cfg | |
# we are loading images with OpenCV, so we don't need to convert them | |
# to BGR, they are already! So all we need to do is to normalize | |
# by 255 if we want to convert to BGR255 format, or flip the channels | |
# if we want it to be in RGB in [0-1] range. | |
if cfg.INPUT.TO_BGR255: | |
to_bgr_transform = T.Lambda(lambda x: x * 255) | |
else: | |
to_bgr_transform = T.Lambda(lambda x: x[[2, 1, 0]]) | |
normalize_transform = T.Normalize(mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD) | |
transform = T.Compose( | |
[ | |
T.ToPILImage(), | |
T.Resize(self.min_image_size) if self.min_image_size is not None else lambda x: x, | |
T.ToTensor(), | |
to_bgr_transform, | |
normalize_transform, | |
] | |
) | |
return transform | |
def inference(self, image, debug=False): | |
""" | |
Arguments: | |
image (np.ndarray): an image as returned by OpenCV | |
Returns: | |
prediction (BoxList): the detected objects. Additional information | |
of the detection properties can be found in the fields of | |
the BoxList via `prediction.fields()` | |
""" | |
predictions, debug_info = self.compute_prediction(image) | |
top_predictions = self.select_top_predictions(predictions) | |
if debug: | |
return top_predictions, debug_info | |
else: | |
return top_predictions | |
def run_on_opencv_image(self, image): | |
""" | |
Arguments: | |
image (np.ndarray): an image as returned by OpenCV | |
Returns: | |
prediction (BoxList): the detected objects. Additional information | |
of the detection properties can be found in the fields of | |
the BoxList via `prediction.fields()` | |
""" | |
predictions, debug_info = self.compute_prediction(image) | |
top_predictions = self.select_top_predictions(predictions) | |
result = image.copy() | |
if self.show_mask_heatmaps: | |
return self.create_mask_montage(result, top_predictions) | |
result = self.overlay_boxes(result, top_predictions) | |
if self.cfg.MODEL.MASK_ON: | |
result = self.overlay_mask(result, top_predictions) | |
if self.cfg.MODEL.KEYPOINT_ON: | |
result = self.overlay_keypoints(result, top_predictions) | |
result = self.overlay_class_names(result, top_predictions) | |
return result, debug_info, top_predictions | |
def compute_prediction(self, original_image): | |
""" | |
Arguments: | |
original_image (np.ndarray): an image as returned by OpenCV | |
Returns: | |
prediction (BoxList): the detected objects. Additional information | |
of the detection properties can be found in the fields of | |
the BoxList via `prediction.fields()` | |
""" | |
# apply pre-processing to image | |
# if self.exclude_region: | |
# for region in self.exclude_region: | |
# original_image[region[1]:region[3], region[0]:region[2], :] = 255 | |
image = self.transforms(original_image) | |
# convert to an ImageList, padded so that it is divisible by | |
# cfg.DATALOADER.SIZE_DIVISIBILITY | |
image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY) | |
image_list = image_list.to(self.device) | |
tic = timeit.time.perf_counter() | |
# compute predictions | |
with torch.no_grad(): | |
predictions, debug_info = self.model(image_list) | |
predictions = [o.to(self.cpu_device) for o in predictions] | |
debug_info["total_time"] = timeit.time.perf_counter() - tic | |
# always single image is passed at a time | |
prediction = predictions[0] | |
# reshape prediction (a BoxList) into the original image size | |
height, width = original_image.shape[:-1] | |
prediction = prediction.resize((width, height)) | |
if prediction.has_field("mask"): | |
# if we have masks, paste the masks in the right position | |
# in the image, as defined by the bounding boxes | |
masks = prediction.get_field("mask") | |
# always single image is passed at a time | |
masks = self.masker([masks], [prediction])[0] | |
prediction.add_field("mask", masks) | |
return prediction, debug_info | |
def select_top_predictions(self, predictions): | |
""" | |
Select only predictions which have a `score` > self.confidence_threshold, | |
and returns the predictions in descending order of score | |
Arguments: | |
predictions (BoxList): the result of the computation by the model. | |
It should contain the field `scores`. | |
Returns: | |
prediction (BoxList): the detected objects. Additional information | |
of the detection properties can be found in the fields of | |
the BoxList via `prediction.fields()` | |
""" | |
scores = predictions.get_field("scores") | |
labels = predictions.get_field("labels").tolist() | |
thresh = scores.clone() | |
for i, lb in enumerate(labels): | |
if isinstance(self.confidence_threshold, float): | |
thresh[i] = self.confidence_threshold | |
elif len(self.confidence_threshold) == 1: | |
thresh[i] = self.confidence_threshold[0] | |
else: | |
thresh[i] = self.confidence_threshold[lb - 1] | |
keep = torch.nonzero(scores > thresh).squeeze(1) | |
predictions = predictions[keep] | |
if self.exclude_region: | |
exlude = BoxList(self.exclude_region, predictions.size) | |
iou = boxlist_iou(exlude, predictions) | |
keep = torch.nonzero(torch.sum(iou > 0.5, dim=0) == 0).squeeze(1) | |
if len(keep) > 0: | |
predictions = predictions[keep] | |
scores = predictions.get_field("scores") | |
_, idx = scores.sort(0, descending=True) | |
return predictions[idx] | |
def compute_colors_for_labels(self, labels): | |
""" | |
Simple function that adds fixed colors depending on the class | |
""" | |
colors = (30 * (labels[:, None] - 1) + 1) * self.palette | |
colors = (colors % 255).numpy().astype("uint8") | |
return colors | |
def overlay_boxes(self, image, predictions): | |
""" | |
Adds the predicted boxes on top of the image | |
Arguments: | |
image (np.ndarray): an image as returned by OpenCV | |
predictions (BoxList): the result of the computation by the model. | |
It should contain the field `labels`. | |
""" | |
labels = predictions.get_field("labels") | |
boxes = predictions.bbox | |
colors = self.compute_colors_for_labels(labels).tolist() | |
for box, color in zip(boxes, colors): | |
box = box.to(torch.int64) | |
top_left, bottom_right = box[:2].tolist(), box[2:].tolist() | |
image = cv2.rectangle(image, tuple(top_left), tuple(bottom_right), tuple(color), 2) | |
return image | |
def overlay_scores(self, image, predictions): | |
""" | |
Adds the predicted boxes on top of the image | |
Arguments: | |
image (np.ndarray): an image as returned by OpenCV | |
predictions (BoxList): the result of the computation by the model. | |
It should contain the field `labels`. | |
""" | |
scores = predictions.get_field("scores") | |
boxes = predictions.bbox | |
for box, score in zip(boxes, scores): | |
box = box.to(torch.int64) | |
image = cv2.putText( | |
image, | |
"%.3f" % score, | |
(box[0], (box[1] + box[3]) / 2), | |
cv2.FONT_HERSHEY_SIMPLEX, | |
0.5, | |
(255, 255, 255), | |
1, | |
) | |
return image | |
def overlay_cboxes(self, image, predictions): | |
""" | |
Adds the predicted boxes on top of the image | |
Arguments: | |
image (np.ndarray): an image as returned by OpenCV | |
predictions (BoxList): the result of the computation by the model. | |
It should contain the field `labels`. | |
""" | |
scores = predictions.get_field("scores") | |
boxes = predictions.bbox | |
for box, score in zip(boxes, scores): | |
box = box.to(torch.int64) | |
top_left, bottom_right = box[:2].tolist(), box[2:].tolist() | |
image = cv2.rectangle(image, tuple(top_left), tuple(bottom_right), (255, 0, 0), 2) | |
image = cv2.putText( | |
image, "%.3f" % score, (box[0], (box[1] + box[3]) / 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1 | |
) | |
return image | |
def overlay_centers(self, image, predictions): | |
""" | |
Adds the predicted boxes on top of the image | |
Arguments: | |
image (np.ndarray): an image as returned by OpenCV | |
predictions (BoxList): the result of the computation by the model. | |
It should contain the field `labels`. | |
""" | |
centers = predictions.get_field("centers") | |
for cord in centers: | |
cord = cord.to(torch.int64) | |
image = cv2.circle(image, (cord[0].item(), cord[1].item()), 2, (255, 0, 0), 20) | |
return image | |
def overlay_count(self, image, predictions): | |
""" | |
Adds the predicted boxes on top of the image | |
Arguments: | |
image (np.ndarray): an image as returned by OpenCV | |
predictions (BoxList): the result of the computation by the model. | |
It should contain the field `labels`. | |
""" | |
if isinstance(predictions, int): | |
count = predictions | |
else: | |
count = len(predictions) | |
image = cv2.putText(image, "Count: %d" % count, (0, 100), cv2.FONT_HERSHEY_SIMPLEX, 3, (255, 0, 0), 3) | |
return image | |
def overlay_mask(self, image, predictions): | |
""" | |
Adds the instances contours for each predicted object. | |
Each label has a different color. | |
Arguments: | |
image (np.ndarray): an image as returned by OpenCV | |
predictions (BoxList): the result of the computation by the model. | |
It should contain the field `mask` and `labels`. | |
""" | |
masks = predictions.get_field("mask").numpy() | |
labels = predictions.get_field("labels") | |
colors = self.compute_colors_for_labels(labels).tolist() | |
for mask, color in zip(masks, colors): | |
thresh = mask[0, :, :, None].astype(np.uint8) | |
contours, hierarchy = cv2_util.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) | |
image = cv2.drawContours(image, contours, -1, color, 3) | |
composite = image | |
return composite | |
def overlay_keypoints(self, image, predictions): | |
keypoints = predictions.get_field("keypoints") | |
kps = keypoints.keypoints | |
scores = keypoints.get_field("logits") | |
kps = torch.cat((kps[:, :, 0:2], scores[:, :, None]), dim=2).numpy() | |
for region in kps: | |
image = vis_keypoints( | |
image, region.transpose((1, 0)), names=keypoints.NAMES, connections=keypoints.CONNECTIONS | |
) | |
return image | |
def create_mask_montage(self, image, predictions): | |
""" | |
Create a montage showing the probability heatmaps for each one one of the | |
detected objects | |
Arguments: | |
image (np.ndarray): an image as returned by OpenCV | |
predictions (BoxList): the result of the computation by the model. | |
It should contain the field `mask`. | |
""" | |
masks = predictions.get_field("mask") | |
masks_per_dim = self.masks_per_dim | |
masks = L.interpolate(masks.float(), scale_factor=1 / masks_per_dim).byte() | |
height, width = masks.shape[-2:] | |
max_masks = masks_per_dim**2 | |
masks = masks[:max_masks] | |
# handle case where we have less detections than max_masks | |
if len(masks) < max_masks: | |
masks_padded = torch.zeros(max_masks, 1, height, width, dtype=torch.uint8) | |
masks_padded[: len(masks)] = masks | |
masks = masks_padded | |
masks = masks.reshape(masks_per_dim, masks_per_dim, height, width) | |
result = torch.zeros((masks_per_dim * height, masks_per_dim * width), dtype=torch.uint8) | |
for y in range(masks_per_dim): | |
start_y = y * height | |
end_y = (y + 1) * height | |
for x in range(masks_per_dim): | |
start_x = x * width | |
end_x = (x + 1) * width | |
result[start_y:end_y, start_x:end_x] = masks[y, x] | |
return cv2.applyColorMap(result.numpy(), cv2.COLORMAP_JET) | |
def overlay_class_names(self, image, predictions, names=None): | |
""" | |
Adds detected class names and scores in the positions defined by the | |
top-left corner of the predicted bounding box | |
Arguments: | |
image (np.ndarray): an image as returned by OpenCV | |
predictions (BoxList): the result of the computation by the model. | |
It should contain the field `scores` and `labels`. | |
""" | |
scores = predictions.get_field("scores").tolist() | |
labels = predictions.get_field("labels").tolist() | |
if names: | |
labels = [names[i - 1] for i in labels] | |
else: | |
labels = [self.CATEGORIES[i] for i in labels] | |
boxes = predictions.bbox | |
template = "{}: {:.2f}" | |
for box, score, label in zip(boxes, scores, labels): | |
x, y = box[:2] | |
s = template.format(label, score) | |
cv2.putText(image, s, (x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) | |
return image | |
def vis_keypoints(img, kps, kp_thresh=0, alpha=0.7, names=None, connections=None): | |
"""Visualizes keypoints (adapted from vis_one_image). | |
kps has shape (4, #keypoints) where 4 rows are (x, y, logit, prob). | |
""" | |
dataset_keypoints = names | |
kp_lines = connections | |
# simple rainbow color map implementation | |
blue_red_ratio = 0.8 | |
gx = lambda x: (6 - 2 * blue_red_ratio) * x + blue_red_ratio | |
colors = [ | |
[ | |
256 * max(0, (3 - abs(gx(i) - 4) - abs(gx(i) - 5)) / 2), | |
256 * max(0, (3 - abs(gx(i) - 2) - abs(gx(i) - 4)) / 2), | |
256 * max(0, (3 - abs(gx(i) - 1) - abs(gx(i) - 2)) / 2), | |
] | |
for i in np.linspace(0, 1, len(kp_lines) + 2) | |
] | |
# Perform the drawing on a copy of the image, to allow for blending. | |
kp_mask = np.copy(img) | |
# Draw mid shoulder / mid hip first for better visualization. | |
mid_shoulder = ( | |
kps[:2, dataset_keypoints.index("right_shoulder")] + kps[:2, dataset_keypoints.index("left_shoulder")] | |
) / 2.0 | |
sc_mid_shoulder = np.minimum( | |
kps[2, dataset_keypoints.index("right_shoulder")], kps[2, dataset_keypoints.index("left_shoulder")] | |
) | |
nose_idx = dataset_keypoints.index("nose") | |
if sc_mid_shoulder > kp_thresh and kps[2, nose_idx] > kp_thresh: | |
cv2.line( | |
kp_mask, | |
tuple(mid_shoulder), | |
tuple(kps[:2, nose_idx]), | |
color=colors[len(kp_lines)], | |
thickness=2, | |
lineType=cv2.LINE_AA, | |
) | |
if "right_hip" in names and "left_hip" in names: | |
mid_hip = (kps[:2, dataset_keypoints.index("right_hip")] + kps[:2, dataset_keypoints.index("left_hip")]) / 2.0 | |
sc_mid_hip = np.minimum( | |
kps[2, dataset_keypoints.index("right_hip")], kps[2, dataset_keypoints.index("left_hip")] | |
) | |
if sc_mid_shoulder > kp_thresh and sc_mid_hip > kp_thresh: | |
cv2.line( | |
kp_mask, | |
tuple(mid_shoulder), | |
tuple(mid_hip), | |
color=colors[len(kp_lines) + 1], | |
thickness=2, | |
lineType=cv2.LINE_AA, | |
) | |
# Draw the keypoints. | |
for l in range(len(kp_lines)): | |
i1 = kp_lines[l][0] | |
i2 = kp_lines[l][1] | |
p1 = kps[0, i1], kps[1, i1] | |
p2 = kps[0, i2], kps[1, i2] | |
if kps[2, i1] > kp_thresh and kps[2, i2] > kp_thresh: | |
cv2.line(kp_mask, p1, p2, color=colors[l], thickness=2, lineType=cv2.LINE_AA) | |
if kps[2, i1] > kp_thresh: | |
cv2.circle(kp_mask, p1, radius=3, color=colors[l], thickness=-1, lineType=cv2.LINE_AA) | |
if kps[2, i2] > kp_thresh: | |
cv2.circle(kp_mask, p2, radius=3, color=colors[l], thickness=-1, lineType=cv2.LINE_AA) | |
# Blend the keypoints. | |
return cv2.addWeighted(img, 1.0 - alpha, kp_mask, alpha, 0) | |