# OMG/inference/models/yolact/yolact_instance_segmentation.py
from time import perf_counter
from typing import Any, List, Optional, Tuple
import cv2
import numpy as np
from inference.core.entities.responses.inference import (
InferenceResponseImage,
InstanceSegmentationInferenceResponse,
InstanceSegmentationPrediction,
)
from inference.core.models.roboflow import OnnxRoboflowInferenceModel
from inference.core.models.types import PreprocessReturnMetadata
from inference.core.nms import w_np_non_max_suppression
from inference.core.utils.postprocess import (
crop_mask,
masks2poly,
post_process_bboxes,
post_process_polygons,
)
class YOLACT(OnnxRoboflowInferenceModel):
"""Roboflow ONNX Object detection model (Implements an object detection specific infer method)"""
task_type = "instance-segmentation"
@property
def weights_file(self) -> str:
"""Gets the weights file.
Returns:
str: Path to the weights file.
"""
return "weights.onnx"
def infer(
self,
image: Any,
class_agnostic_nms: bool = False,
confidence: float = 0.5,
iou_threshold: float = 0.5,
max_candidates: int = 3000,
max_detections: int = 300,
return_image_dims: bool = False,
**kwargs,
) -> List[List[dict]]:
"""
Performs instance segmentation inference on a given image, post-processes the results,
and returns the segmented instances as dictionaries containing their properties.
Args:
image (Any): The image or list of images to segment. Can be in various formats (e.g., raw array, PIL image).
class_agnostic_nms (bool, optional): Whether to perform class-agnostic non-max suppression. Defaults to False.
confidence (float, optional): Confidence threshold for filtering weak detections. Defaults to 0.5.
iou_threshold (float, optional): Intersection-over-union threshold for non-max suppression. Defaults to 0.5.
max_candidates (int, optional): Maximum number of candidate detections to consider. Defaults to 3000.
max_detections (int, optional): Maximum number of detections to return after non-max suppression. Defaults to 300.
return_image_dims (bool, optional): Whether to return the dimensions of the input image(s). Defaults to False.
**kwargs: Additional keyword arguments.
Returns:
List[List[dict]]: Each list contains dictionaries of segmented instances for a given image. Each dictionary contains:
- x, y: Center coordinates of the instance.
- width, height: Width and height of the bounding box around the instance.
- class: Name of the detected class.
- confidence: Confidence score of the detection.
- points: List of points describing the segmented mask's boundary.
- class_id: ID corresponding to the detected class.
If `return_image_dims` is True, the function returns a tuple where the first element is the list of detections and the
second element is the list of image dimensions.
Notes:
- The function supports processing multiple images in a batch.
- If an input list of images is provided, the function returns a list of lists,
where each inner list corresponds to the detections for a specific image.
- The function internally uses an ONNX model for inference.
"""
return super().infer(
image,
class_agnostic_nms=class_agnostic_nms,
confidence=confidence,
iou_threshold=iou_threshold,
max_candidates=max_candidates,
max_detections=max_detections,
return_image_dims=return_image_dims,
**kwargs,
)
def preprocess(
self, image: Any, **kwargs
) -> Tuple[np.ndarray, PreprocessReturnMetadata]:
if isinstance(image, list):
imgs_with_dims = [self.preproc_image(i) for i in image]
imgs, img_dims = zip(*imgs_with_dims)
img_in = np.concatenate(imgs, axis=0)
unwrap = False
else:
img_in, img_dims = self.preproc_image(image)
img_dims = [img_dims]
unwrap = True
        # Normalization constants are given in BGR order (the ImageNet statistics YOLACT was trained with)
mean = (103.94, 116.78, 123.68)
std = (57.38, 57.12, 58.40)
img_in = img_in.astype(np.float32)
        # The input tensor is RGB, so index the BGR constants in reverse
img_in[:, 0, :, :] = (img_in[:, 0, :, :] - mean[2]) / std[2]
img_in[:, 1, :, :] = (img_in[:, 1, :, :] - mean[1]) / std[1]
img_in[:, 2, :, :] = (img_in[:, 2, :, :] - mean[0]) / std[0]
return img_in, PreprocessReturnMetadata(
{
"img_dims": img_dims,
"im_shape": img_in.shape,
}
)
def predict(
self, img_in: np.ndarray, **kwargs
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
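        # Run the exported ONNX graph; YOLACT produces five output tensors that are
        # unpacked and decoded in postprocess.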
return self.onnx_session.run(None, {self.input_name: img_in})
def postprocess(
self,
predictions: Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray],
preprocess_return_metadata: PreprocessReturnMetadata,
**kwargs,
) -> List[InstanceSegmentationInferenceResponse]:
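        # The five raw model outputs: box regressions (loc), class scores (conf),
        # per-detection mask coefficients, prior boxes, and mask prototypes.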
loc_data = np.float32(predictions[0])
conf_data = np.float32(predictions[1])
mask_data = np.float32(predictions[2])
prior_data = np.float32(predictions[3])
proto_data = np.float32(predictions[4])
batch_size = loc_data.shape[0]
num_priors = prior_data.shape[0]
boxes = np.zeros((batch_size, num_priors, 4))
for batch_idx in range(batch_size):
boxes[batch_idx, :, :] = self.decode_predicted_bboxes(
loc_data[batch_idx], prior_data
)
conf_preds = np.reshape(
conf_data, (batch_size, num_priors, self.num_classes + 1)
)
class_confs = conf_preds[:, :, 1:] # remove background class
box_confs = np.expand_dims(
np.max(class_confs, axis=2), 2
) # get max conf for each box
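        # Build per-prior detection rows of [x1, y1, x2, y2, max_conf, per-class confidences...,
        # mask coefficients...] in the layout expected by w_np_non_max_suppression.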
predictions = np.concatenate((boxes, box_confs, class_confs, mask_data), axis=2)
img_in_shape = preprocess_return_metadata["im_shape"]
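        # Scale the normalized box coordinates up to the network input resolution
        # (standard YOLACT uses a square input, so indices 2 and 3 of the NCHW shape match).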
predictions[:, :, 0] *= img_in_shape[2]
predictions[:, :, 1] *= img_in_shape[3]
predictions[:, :, 2] *= img_in_shape[2]
predictions[:, :, 3] *= img_in_shape[3]
predictions = w_np_non_max_suppression(
predictions,
conf_thresh=kwargs["confidence"],
iou_thresh=kwargs["iou_threshold"],
class_agnostic=kwargs["class_agnostic_nms"],
max_detections=kwargs["max_detections"],
max_candidate_detections=kwargs["max_candidates"],
num_masks=32,
box_format="xyxy",
)
predictions = np.array(predictions)
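        # Each surviving row holds the box (columns 0-3), the detection confidence (column 4),
        # the class id (column 6), and the mask coefficients (columns 7+); an empty result
        # comes back with shape (1, 0).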
batch_preds = []
if predictions.shape != (1, 0):
for batch_idx, img_dim in enumerate(preprocess_return_metadata["img_dims"]):
boxes = predictions[batch_idx, :, :4]
scores = predictions[batch_idx, :, 4]
classes = predictions[batch_idx, :, 6]
masks = predictions[batch_idx, :, 7:]
proto = proto_data[batch_idx]
decoded_masks = self.decode_masks(boxes, masks, proto, img_in_shape[2:])
polys = masks2poly(decoded_masks)
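                # Rescale boxes and polygons from network-input coordinates back to the
                # original image size, undoing the preprocessing resize.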
infer_shape = (self.img_size_w, self.img_size_h)
boxes = post_process_bboxes(
[boxes], infer_shape, [img_dim], self.preproc, self.resize_method
)[0]
polys = post_process_polygons(
img_in_shape[2:],
polys,
img_dim,
self.preproc,
resize_method=self.resize_method,
)
preds = []
for box, poly, score, cls in zip(boxes, polys, scores, classes):
confidence = float(score)
class_name = self.class_names[int(cls)]
points = [{"x": round(x, 1), "y": round(y, 1)} for (x, y) in poly]
pred = {
"x": round((box[2] + box[0]) / 2, 1),
"y": round((box[3] + box[1]) / 2, 1),
"width": int(box[2] - box[0]),
"height": int(box[3] - box[1]),
"class": class_name,
"confidence": round(confidence, 3),
"points": points,
"class_id": int(cls),
}
preds.append(pred)
batch_preds.append(preds)
else:
batch_preds.append([])
img_dims = preprocess_return_metadata["img_dims"]
responses = self.make_response(batch_preds, img_dims, **kwargs)
if kwargs["return_image_dims"]:
return responses, preprocess_return_metadata["img_dims"]
else:
return responses
def make_response(
self,
predictions: List[List[dict]],
img_dims: List[Tuple[int, int]],
        class_filter: Optional[List[str]] = None,
**kwargs,
) -> List[InstanceSegmentationInferenceResponse]:
"""
Constructs a list of InstanceSegmentationInferenceResponse objects based on the provided predictions
and image dimensions, optionally filtering by class name.
Args:
predictions (List[List[dict]]): A list containing batch predictions, where each inner list contains
dictionaries of segmented instances for a given image.
img_dims (List[Tuple[int, int]]): List of tuples specifying the dimensions of each image in the format
(height, width).
class_filter (List[str], optional): A list of class names to filter the predictions by. If not provided,
all predictions are included.
Returns:
List[InstanceSegmentationInferenceResponse]: A list of response objects, each containing the filtered
predictions and corresponding image dimensions for a given image.
Examples:
            >>> predictions = [[{"class": "cat", ...}, {"class": "dog", ...}], ...]
>>> img_dims = [(300, 400), ...]
>>> responses = make_response(predictions, img_dims, class_filter=["cat"])
>>> len(responses[0].predictions) # Only predictions with "cat" class are included
1
"""
responses = [
InstanceSegmentationInferenceResponse(
predictions=[
InstanceSegmentationPrediction(**p)
for p in batch_pred
                    if not class_filter or p["class"] in class_filter
],
image=InferenceResponseImage(
width=img_dims[i][1], height=img_dims[i][0]
),
)
for i, batch_pred in enumerate(predictions)
]
return responses
def decode_masks(self, boxes, masks, proto, img_dim):
"""Decodes the masks from the given parameters.
Args:
boxes (np.array): Bounding boxes.
masks (np.array): Masks.
proto (np.array): Proto data.
img_dim (tuple): Image dimensions.
Returns:
np.array: Decoded masks.
"""
ret_mask = np.matmul(proto, np.transpose(masks))
ret_mask = 1 / (1 + np.exp(-ret_mask))
w, h, _ = ret_mask.shape
gain = min(h / img_dim[0], w / img_dim[1]) # gain = old / new
pad = (w - img_dim[1] * gain) / 2, (h - img_dim[0] * gain) / 2 # wh padding
top, left = int(pad[1]), int(pad[0]) # y, x
bottom, right = int(h - pad[1]), int(w - pad[0])
ret_mask = np.transpose(ret_mask, (2, 0, 1))
ret_mask = ret_mask[:, top:bottom, left:right]
if len(ret_mask.shape) == 2:
ret_mask = np.expand_dims(ret_mask, axis=0)
ret_mask = ret_mask.transpose((1, 2, 0))
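        # Upsample the cropped prototype grid to the target image size (HWC layout for cv2.resize).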
ret_mask = cv2.resize(ret_mask, img_dim, interpolation=cv2.INTER_LINEAR)
if len(ret_mask.shape) == 2:
ret_mask = np.expand_dims(ret_mask, axis=2)
ret_mask = ret_mask.transpose((2, 0, 1))
ret_mask = crop_mask(ret_mask, boxes) # CHW
        ret_mask[ret_mask < 0.5] = 0  # suppress mask pixels below the 0.5 threshold
return ret_mask
def decode_predicted_bboxes(self, loc, priors):
"""Decode predicted bounding box coordinates using the scheme employed by Yolov2.
Args:
loc (np.array): The predicted bounding boxes of size [num_priors, 4].
priors (np.array): The prior box coordinates with size [num_priors, 4].
Returns:
np.array: A tensor of decoded relative coordinates in point form with size [num_priors, 4].
"""
        variances = [0.1, 0.2]  # SSD-style variances used to scale the regression offsets
boxes = np.concatenate(
[
priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
priors[:, 2:] * np.exp(loc[:, 2:] * variances[1]),
],
1,
)
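        # Convert from (center_x, center_y, width, height) to point form (x1, y1, x2, y2).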
boxes[:, :2] -= boxes[:, 2:] / 2
boxes[:, 2:] += boxes[:, :2]
return boxes
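
# Example usage (a minimal sketch; the model id, API key, and image path below are
# hypothetical, and weight download is assumed to be handled by the Roboflow base class):
#
#   model = YOLACT(model_id="my-project/1", api_key="YOUR_API_KEY")
#   responses = model.infer("example.jpg", confidence=0.5, iou_threshold=0.5)
#   for prediction in responses[0].predictions:
#       print(prediction.class_name, prediction.confidence, len(prediction.points))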