|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Preprocess images and bounding boxes for detection. |
|
|
|
We perform two sets of operations in the preprocessing stage:
|
(a) operations that are applied to both training and testing data, |
|
(b) operations that are applied only to training data for the purpose of |
|
data augmentation. |
|
|
|
A preprocessing function receives a set of inputs, |
|
e.g. an image and bounding boxes, |
|
performs an operation on them, and returns them. |
|
Some examples are: randomly cropping the image, randomly mirroring the image, |
|
randomly changing the brightness, contrast, and hue, and

randomly jittering the bounding boxes.
|
|
|
The preprocess function receives a tensor_dict which is a dictionary that maps |
|
different field names to their tensors. For example, |
|
tensor_dict[fields.InputDataFields.image] holds the image tensor. |
|
The image is a rank 4 tensor: [1, height, width, channels] with |
|
dtype=tf.float32. The groundtruth_boxes is a rank 2 tensor: [N, 4] where |
|
each row contains a box in the form [ymin, xmin, ymax, xmax].

Boxes are in normalized coordinates, meaning

their coordinate values range in [0, 1].
|
|
|
To preprocess multiple images with the same operations in cases where |
|
nondeterministic operations are used, a preprocessor_cache.PreprocessorCache |
|
object can be passed into the preprocess function or individual operations. |
|
All nondeterministic operations except random_jitter_boxes support caching. |
|
E.g. |
|
Let tensor_dict{1,2,3,4,5} be copies of the same inputs. |
|
Let preprocess_options contain nondeterministic operation(s) excluding |
|
random_jitter_boxes. |
|
|
|
cache1 = preprocessor_cache.PreprocessorCache() |
|
cache2 = preprocessor_cache.PreprocessorCache() |
|
a = preprocess(tensor_dict1, preprocess_options, preprocess_vars_cache=cache1) |
|
b = preprocess(tensor_dict2, preprocess_options, preprocess_vars_cache=cache1) |
|
c = preprocess(tensor_dict3, preprocess_options, preprocess_vars_cache=cache2) |
|
d = preprocess(tensor_dict4, preprocess_options, preprocess_vars_cache=cache2) |
|
e = preprocess(tensor_dict5, preprocess_options) |
|
|
|
Then the corresponding tensors of object pairs (a,b) and (c,d)
|
are guaranteed to be equal element-wise, but the equality of any other object |
|
pair cannot be determined. |
|
|
|
Important Note: In tensor_dict, images is a rank 4 tensor, but preprocessing |
|
functions receive a rank 3 tensor for processing the image. Thus, inside the |
|
preprocess function we squeeze the image to become a rank 3 tensor and then |
|
we pass it to the functions. At the end of the preprocess we expand the image |
|
back to rank 4. |
|
""" |
|
|
|
import functools |
|
import inspect |
|
import sys |
|
import tensorflow as tf |
|
|
|
from tensorflow.python.ops import control_flow_ops |
|
|
|
from object_detection.core import box_list |
|
from object_detection.core import box_list_ops |
|
from object_detection.core import keypoint_ops |
|
from object_detection.core import preprocessor_cache |
|
from object_detection.core import standard_fields as fields |
|
from object_detection.utils import shape_utils |
|
|
|
|
|
def _apply_with_random_selector(x, |
|
func, |
|
num_cases, |
|
preprocess_vars_cache=None, |
|
key=''): |
|
"""Computes func(x, sel), with sel sampled from [0...num_cases-1]. |
|
|
|
If both preprocess_vars_cache AND key are the same between two calls, sel will |
|
be the same value in both calls. |
|
|
|
Args: |
|
x: input Tensor. |
|
func: Python function to apply. |
|
num_cases: Python int32, number of cases to sample sel from. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
key: variable identifier for preprocess_vars_cache. |
|
|
|
Returns: |
|
The result of func(x, sel), where func receives the value of the |
|
selector as a python integer, but sel is sampled dynamically. |
|
""" |
|
generator_func = functools.partial( |
|
tf.random_uniform, [], maxval=num_cases, dtype=tf.int32) |
|
rand_sel = _get_or_create_preprocess_rand_vars( |
|
generator_func, preprocessor_cache.PreprocessorCache.SELECTOR, |
|
preprocess_vars_cache, key) |
|
|
|
|
|
return control_flow_ops.merge([func( |
|
control_flow_ops.switch(x, tf.equal(rand_sel, case))[1], case) |
|
for case in range(num_cases)])[0] |
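
# Example (illustrative sketch, not executed at import time): using
# _apply_with_random_selector to apply one of several rotations chosen at
# random. The image tensor below is a hypothetical placeholder.
#
#   image = tf.zeros([200, 200, 3], dtype=tf.float32)
#   rotated = _apply_with_random_selector(
#       image,
#       lambda x, k: tf.image.rot90(x, k=k),  # func receives the case index
#       num_cases=4)
#   # `rotated` is the input rotated by 0, 90, 180 or 270 degrees, with the
#   # case drawn uniformly at random when the graph runs.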
|
|
|
|
|
def _apply_with_random_selector_tuples(x, |
|
func, |
|
num_cases, |
|
preprocess_vars_cache=None, |
|
key=''): |
|
"""Computes func(x, sel), with sel sampled from [0...num_cases-1]. |
|
|
|
If both preprocess_vars_cache AND key are the same between two calls, sel will |
|
be the same value in both calls. |
|
|
|
Args: |
|
x: A tuple of input tensors. |
|
func: Python function to apply. |
|
num_cases: Python int32, number of cases to sample sel from. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
key: variable identifier for preprocess_vars_cache. |
|
|
|
Returns: |
|
The result of func(x, sel), where func receives the value of the |
|
selector as a python integer, but sel is sampled dynamically. |
|
""" |
|
num_inputs = len(x) |
|
generator_func = functools.partial( |
|
tf.random_uniform, [], maxval=num_cases, dtype=tf.int32) |
|
rand_sel = _get_or_create_preprocess_rand_vars( |
|
generator_func, preprocessor_cache.PreprocessorCache.SELECTOR_TUPLES, |
|
preprocess_vars_cache, key) |
|
|
|
|
|
tuples = [list() for t in x] |
|
for case in range(num_cases): |
|
new_x = [control_flow_ops.switch(t, tf.equal(rand_sel, case))[1] for t in x] |
|
output = func(tuple(new_x), case) |
|
for j in range(num_inputs): |
|
tuples[j].append(output[j]) |
|
|
|
for i in range(num_inputs): |
|
tuples[i] = control_flow_ops.merge(tuples[i])[0] |
|
return tuple(tuples) |
|
|
|
|
|
def _get_or_create_preprocess_rand_vars(generator_func, |
|
function_id, |
|
preprocess_vars_cache, |
|
key=''): |
|
"""Returns a tensor stored in preprocess_vars_cache or using generator_func. |
|
|
|
If the tensor was previously generated and appears in the PreprocessorCache, |
|
the previously generated tensor will be returned. Otherwise, a new tensor |
|
is generated using generator_func and stored in the cache. |
|
|
|
Args: |
|
generator_func: A 0-argument function that generates a tensor. |
|
function_id: identifier for the preprocessing function used. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
key: identifier for the variable stored. |
|
Returns: |
|
The generated tensor. |
|
""" |
|
if preprocess_vars_cache is not None: |
|
var = preprocess_vars_cache.get(function_id, key) |
|
if var is None: |
|
var = generator_func() |
|
preprocess_vars_cache.update(function_id, key, var) |
|
else: |
|
var = generator_func() |
|
return var |
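
# Example (illustrative sketch): two calls that share the same cache,
# function_id and key reuse the same random tensor, which is what makes the
# augmentations replayable. The generator below is a hypothetical example.
#
#   cache = preprocessor_cache.PreprocessorCache()
#   gen = functools.partial(tf.random_uniform, [], maxval=10, dtype=tf.int32)
#   first = _get_or_create_preprocess_rand_vars(
#       gen, preprocessor_cache.PreprocessorCache.SELECTOR, cache, key='demo')
#   second = _get_or_create_preprocess_rand_vars(
#       gen, preprocessor_cache.PreprocessorCache.SELECTOR, cache, key='demo')
#   # `first` and `second` are the same tensor; without a cache each call
#   # would draw an independent random value.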
|
|
|
|
|
def _random_integer(minval, maxval, seed): |
|
"""Returns a random 0-D tensor between minval and maxval. |
|
|
|
Args: |
|
minval: minimum value of the random tensor. |
|
maxval: maximum value of the random tensor. |
|
seed: random seed. |
|
|
|
Returns: |
|
    A random 0-D int32 tensor in the range [minval, maxval).
|
""" |
|
return tf.random_uniform( |
|
[], minval=minval, maxval=maxval, dtype=tf.int32, seed=seed) |
|
|
|
|
|
|
|
|
|
|
|
def _rgb_to_grayscale(images, name=None): |
|
"""Converts one or more images from RGB to Grayscale. |
|
|
|
Outputs a tensor of the same `DType` and rank as `images`. The size of the |
|
last dimension of the output is 1, containing the Grayscale value of the |
|
pixels. |
|
|
|
Args: |
|
images: The RGB tensor to convert. Last dimension must have size 3 and |
|
should contain RGB values. |
|
name: A name for the operation (optional). |
|
|
|
Returns: |
|
The converted grayscale image(s). |
|
""" |
|
with tf.name_scope(name, 'rgb_to_grayscale', [images]) as name: |
|
images = tf.convert_to_tensor(images, name='images') |
|
|
|
orig_dtype = images.dtype |
|
flt_image = tf.image.convert_image_dtype(images, tf.float32) |
|
|
|
|
|
|
|
rgb_weights = [0.2989, 0.5870, 0.1140] |
|
rank_1 = tf.expand_dims(tf.rank(images) - 1, 0) |
|
gray_float = tf.reduce_sum( |
|
flt_image * rgb_weights, rank_1, keep_dims=True) |
|
gray_float.set_shape(images.get_shape()[:-1].concatenate([1])) |
|
return tf.image.convert_image_dtype(gray_float, orig_dtype, name=name) |
|
|
|
|
|
def normalize_image(image, original_minval, original_maxval, target_minval, |
|
target_maxval): |
|
"""Normalizes pixel values in the image. |
|
|
|
Moves the pixel values from the current [original_minval, original_maxval] |
|
  range to the [target_minval, target_maxval] range.
|
|
|
Args: |
|
image: rank 3 float32 tensor containing 1 |
|
image -> [height, width, channels]. |
|
original_minval: current image minimum value. |
|
original_maxval: current image maximum value. |
|
target_minval: target image minimum value. |
|
target_maxval: target image maximum value. |
|
|
|
Returns: |
|
image: image which is the same shape as input image. |
|
""" |
|
with tf.name_scope('NormalizeImage', values=[image]): |
|
original_minval = float(original_minval) |
|
original_maxval = float(original_maxval) |
|
target_minval = float(target_minval) |
|
target_maxval = float(target_maxval) |
|
image = tf.to_float(image) |
|
image = tf.subtract(image, original_minval) |
|
image = tf.multiply(image, (target_maxval - target_minval) / |
|
(original_maxval - original_minval)) |
|
image = tf.add(image, target_minval) |
|
return image |
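
# Example (illustrative sketch): mapping a [0, 255] image into the [-1, 1]
# range, a common normalization for detection backbones. The image tensor is
# a hypothetical placeholder.
#
#   image = tf.random_uniform([300, 300, 3], maxval=255.0)
#   normalized = normalize_image(image,
#                                original_minval=0, original_maxval=255,
#                                target_minval=-1, target_maxval=1)
#   # A pixel value of 255 maps to 1.0, 127.5 maps to 0.0, and 0 maps to -1.0.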
|
|
|
|
|
def retain_boxes_above_threshold(boxes, |
|
labels, |
|
label_weights, |
|
label_confidences=None, |
|
multiclass_scores=None, |
|
masks=None, |
|
keypoints=None, |
|
threshold=0.0): |
|
"""Retains boxes whose label weight is above a given threshold. |
|
|
|
If the label weight for a box is missing (represented by NaN), the box is |
|
retained. The boxes that don't pass the threshold will not appear in the |
|
returned tensor. |
|
|
|
Args: |
|
boxes: float32 tensor of shape [num_instance, 4] representing boxes |
|
location in normalized coordinates. |
|
labels: rank 1 int32 tensor of shape [num_instance] containing the object |
|
classes. |
|
label_weights: float32 tensor of shape [num_instance] representing the |
|
weight for each box. |
|
label_confidences: float32 tensor of shape [num_instance] representing the |
|
confidence for each box. |
|
multiclass_scores: (optional) float32 tensor of shape |
|
[num_instances, num_classes] representing the score for each box for each |
|
class. |
|
masks: (optional) rank 3 float32 tensor with shape |
|
[num_instances, height, width] containing instance masks. The masks are of |
|
the same height, width as the input `image`. |
|
keypoints: (optional) rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2]. The keypoints are in y-x normalized |
|
coordinates. |
|
threshold: scalar python float. |
|
|
|
Returns: |
|
retained_boxes: [num_retained_instance, 4] |
|
    retained_labels: [num_retained_instance]
|
retained_label_weights: [num_retained_instance] |
|
|
|
If multiclass_scores, masks, or keypoints are not None, the function also |
|
returns: |
|
|
|
retained_multiclass_scores: [num_retained_instance, num_classes] |
|
retained_masks: [num_retained_instance, height, width] |
|
retained_keypoints: [num_retained_instance, num_keypoints, 2] |
|
""" |
|
with tf.name_scope('RetainBoxesAboveThreshold', |
|
values=[boxes, labels, label_weights]): |
|
indices = tf.where( |
|
tf.logical_or(label_weights > threshold, tf.is_nan(label_weights))) |
|
indices = tf.squeeze(indices, axis=1) |
|
retained_boxes = tf.gather(boxes, indices) |
|
retained_labels = tf.gather(labels, indices) |
|
retained_label_weights = tf.gather(label_weights, indices) |
|
result = [retained_boxes, retained_labels, retained_label_weights] |
|
|
|
if label_confidences is not None: |
|
retained_label_confidences = tf.gather(label_confidences, indices) |
|
result.append(retained_label_confidences) |
|
|
|
if multiclass_scores is not None: |
|
retained_multiclass_scores = tf.gather(multiclass_scores, indices) |
|
result.append(retained_multiclass_scores) |
|
|
|
if masks is not None: |
|
retained_masks = tf.gather(masks, indices) |
|
result.append(retained_masks) |
|
|
|
if keypoints is not None: |
|
retained_keypoints = tf.gather(keypoints, indices) |
|
result.append(retained_keypoints) |
|
|
|
return result |
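
# Example (illustrative sketch): dropping boxes whose label weight does not
# exceed the threshold. The tensors below are hypothetical placeholders.
#
#   boxes = tf.constant([[0.0, 0.0, 0.5, 0.5],
#                        [0.2, 0.2, 0.9, 0.9]], dtype=tf.float32)
#   labels = tf.constant([1, 2], dtype=tf.int32)
#   label_weights = tf.constant([0.0, 1.0], dtype=tf.float32)
#   retained = retain_boxes_above_threshold(
#       boxes, labels, label_weights, threshold=0.5)
#   # Only the second box (weight 1.0 > 0.5) survives, so `retained` holds
#   # tensors of shape [1, 4], [1] and [1].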
|
|
|
|
|
def _flip_boxes_left_right(boxes): |
|
"""Left-right flip the boxes. |
|
|
|
Args: |
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
|
|
Returns: |
|
Flipped boxes. |
|
""" |
|
ymin, xmin, ymax, xmax = tf.split(value=boxes, num_or_size_splits=4, axis=1) |
|
flipped_xmin = tf.subtract(1.0, xmax) |
|
flipped_xmax = tf.subtract(1.0, xmin) |
|
flipped_boxes = tf.concat([ymin, flipped_xmin, ymax, flipped_xmax], 1) |
|
return flipped_boxes |
|
|
|
|
|
def _flip_boxes_up_down(boxes): |
|
"""Up-down flip the boxes. |
|
|
|
Args: |
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
|
|
Returns: |
|
Flipped boxes. |
|
""" |
|
ymin, xmin, ymax, xmax = tf.split(value=boxes, num_or_size_splits=4, axis=1) |
|
flipped_ymin = tf.subtract(1.0, ymax) |
|
flipped_ymax = tf.subtract(1.0, ymin) |
|
flipped_boxes = tf.concat([flipped_ymin, xmin, flipped_ymax, xmax], 1) |
|
return flipped_boxes |
|
|
|
|
|
def _rot90_boxes(boxes): |
|
"""Rotate boxes counter-clockwise by 90 degrees. |
|
|
|
Args: |
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
|
|
Returns: |
|
Rotated boxes. |
|
""" |
|
ymin, xmin, ymax, xmax = tf.split(value=boxes, num_or_size_splits=4, axis=1) |
|
rotated_ymin = tf.subtract(1.0, xmax) |
|
rotated_ymax = tf.subtract(1.0, xmin) |
|
rotated_xmin = ymin |
|
rotated_xmax = ymax |
|
rotated_boxes = tf.concat( |
|
[rotated_ymin, rotated_xmin, rotated_ymax, rotated_xmax], 1) |
|
return rotated_boxes |
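
# Example (illustrative sketch): a 90-degree counter-clockwise rotation maps a
# box [ymin, xmin, ymax, xmax] to [1 - xmax, ymin, 1 - xmin, ymax]. The box
# below is a hypothetical placeholder.
#
#   box = tf.constant([[0.1, 0.2, 0.4, 0.6]], dtype=tf.float32)
#   rotated = _rot90_boxes(box)
#   # `rotated` evaluates to [[1.0 - 0.6, 0.1, 1.0 - 0.2, 0.4]]
#   #                      = [[0.4, 0.1, 0.8, 0.4]].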
|
|
|
|
|
def _flip_masks_left_right(masks): |
|
"""Left-right flip masks. |
|
|
|
Args: |
|
masks: rank 3 float32 tensor with shape |
|
[num_instances, height, width] representing instance masks. |
|
|
|
Returns: |
|
flipped masks: rank 3 float32 tensor with shape |
|
[num_instances, height, width] representing instance masks. |
|
""" |
|
return masks[:, :, ::-1] |
|
|
|
|
|
def _flip_masks_up_down(masks): |
|
"""Up-down flip masks. |
|
|
|
Args: |
|
masks: rank 3 float32 tensor with shape |
|
[num_instances, height, width] representing instance masks. |
|
|
|
Returns: |
|
flipped masks: rank 3 float32 tensor with shape |
|
[num_instances, height, width] representing instance masks. |
|
""" |
|
return masks[:, ::-1, :] |
|
|
|
|
|
def _rot90_masks(masks): |
|
"""Rotate masks counter-clockwise by 90 degrees. |
|
|
|
Args: |
|
masks: rank 3 float32 tensor with shape |
|
[num_instances, height, width] representing instance masks. |
|
|
|
Returns: |
|
rotated masks: rank 3 float32 tensor with shape |
|
[num_instances, height, width] representing instance masks. |
|
""" |
|
masks = tf.transpose(masks, [0, 2, 1]) |
|
return masks[:, ::-1, :] |
|
|
|
|
|
def random_horizontal_flip(image, |
|
boxes=None, |
|
masks=None, |
|
keypoints=None, |
|
keypoint_flip_permutation=None, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly flips the image and detections horizontally. |
|
|
|
The probability of flipping the image is 50%. |
|
|
|
Args: |
|
image: rank 3 float32 tensor with shape [height, width, channels]. |
|
boxes: (optional) rank 2 float32 tensor with shape [N, 4] |
|
containing the bounding boxes. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
masks: (optional) rank 3 float32 tensor with shape |
|
[num_instances, height, width] containing instance masks. The masks |
|
are of the same height, width as the input `image`. |
|
keypoints: (optional) rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2]. The keypoints are in y-x |
|
normalized coordinates. |
|
keypoint_flip_permutation: rank 1 int32 tensor containing the keypoint flip |
|
permutation. |
|
seed: random seed |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same shape as input image. |
|
|
|
If boxes, masks, keypoints, and keypoint_flip_permutation are not None, |
|
the function also returns the following tensors. |
|
|
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
masks: rank 3 float32 tensor with shape [num_instances, height, width] |
|
containing instance masks. |
|
keypoints: rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2] |
|
|
|
Raises: |
|
ValueError: if keypoints are provided but keypoint_flip_permutation is not. |
|
""" |
|
|
|
def _flip_image(image): |
|
|
|
image_flipped = tf.image.flip_left_right(image) |
|
return image_flipped |
|
|
|
if keypoints is not None and keypoint_flip_permutation is None: |
|
raise ValueError( |
|
        'keypoints are provided but keypoint_flip_permutation is not provided')
|
|
|
with tf.name_scope('RandomHorizontalFlip', values=[image, boxes]): |
|
result = [] |
|
|
|
generator_func = functools.partial(tf.random_uniform, [], seed=seed) |
|
do_a_flip_random = _get_or_create_preprocess_rand_vars( |
|
generator_func, |
|
preprocessor_cache.PreprocessorCache.HORIZONTAL_FLIP, |
|
preprocess_vars_cache) |
|
do_a_flip_random = tf.greater(do_a_flip_random, 0.5) |
|
|
|
|
|
image = tf.cond(do_a_flip_random, lambda: _flip_image(image), lambda: image) |
|
result.append(image) |
|
|
|
|
|
if boxes is not None: |
|
boxes = tf.cond(do_a_flip_random, lambda: _flip_boxes_left_right(boxes), |
|
lambda: boxes) |
|
result.append(boxes) |
|
|
|
|
|
if masks is not None: |
|
masks = tf.cond(do_a_flip_random, lambda: _flip_masks_left_right(masks), |
|
lambda: masks) |
|
result.append(masks) |
|
|
|
|
|
if keypoints is not None and keypoint_flip_permutation is not None: |
|
permutation = keypoint_flip_permutation |
|
keypoints = tf.cond( |
|
do_a_flip_random, |
|
lambda: keypoint_ops.flip_horizontal(keypoints, 0.5, permutation), |
|
lambda: keypoints) |
|
result.append(keypoints) |
|
|
|
return tuple(result) |
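
# Example (illustrative sketch): flipping an image and its boxes with a shared
# cache so that a second call on a copy of the inputs makes the same flip
# decision. The tensors below are hypothetical placeholders.
#
#   cache = preprocessor_cache.PreprocessorCache()
#   image = tf.zeros([300, 300, 3], dtype=tf.float32)
#   boxes = tf.constant([[0.1, 0.1, 0.5, 0.5]], dtype=tf.float32)
#   image_a, boxes_a = random_horizontal_flip(
#       image, boxes, preprocess_vars_cache=cache)
#   image_b, boxes_b = random_horizontal_flip(
#       image, boxes, preprocess_vars_cache=cache)
#   # (image_a, boxes_a) and (image_b, boxes_b) are flipped (or left alone)
#   # identically because both calls reuse the cached random draw.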
|
|
|
|
|
def random_vertical_flip(image, |
|
boxes=None, |
|
masks=None, |
|
keypoints=None, |
|
keypoint_flip_permutation=None, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly flips the image and detections vertically. |
|
|
|
The probability of flipping the image is 50%. |
|
|
|
Args: |
|
image: rank 3 float32 tensor with shape [height, width, channels]. |
|
boxes: (optional) rank 2 float32 tensor with shape [N, 4] |
|
containing the bounding boxes. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
masks: (optional) rank 3 float32 tensor with shape |
|
[num_instances, height, width] containing instance masks. The masks |
|
are of the same height, width as the input `image`. |
|
keypoints: (optional) rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2]. The keypoints are in y-x |
|
normalized coordinates. |
|
keypoint_flip_permutation: rank 1 int32 tensor containing the keypoint flip |
|
permutation. |
|
seed: random seed |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same shape as input image. |
|
|
|
If boxes, masks, keypoints, and keypoint_flip_permutation are not None, |
|
the function also returns the following tensors. |
|
|
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
masks: rank 3 float32 tensor with shape [num_instances, height, width] |
|
containing instance masks. |
|
keypoints: rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2] |
|
|
|
Raises: |
|
ValueError: if keypoints are provided but keypoint_flip_permutation is not. |
|
""" |
|
|
|
def _flip_image(image): |
|
|
|
image_flipped = tf.image.flip_up_down(image) |
|
return image_flipped |
|
|
|
if keypoints is not None and keypoint_flip_permutation is None: |
|
raise ValueError( |
|
        'keypoints are provided but keypoint_flip_permutation is not provided')
|
|
|
with tf.name_scope('RandomVerticalFlip', values=[image, boxes]): |
|
result = [] |
|
|
|
generator_func = functools.partial(tf.random_uniform, [], seed=seed) |
|
do_a_flip_random = _get_or_create_preprocess_rand_vars( |
|
generator_func, preprocessor_cache.PreprocessorCache.VERTICAL_FLIP, |
|
preprocess_vars_cache) |
|
do_a_flip_random = tf.greater(do_a_flip_random, 0.5) |
|
|
|
|
|
image = tf.cond(do_a_flip_random, lambda: _flip_image(image), lambda: image) |
|
result.append(image) |
|
|
|
|
|
if boxes is not None: |
|
boxes = tf.cond(do_a_flip_random, lambda: _flip_boxes_up_down(boxes), |
|
lambda: boxes) |
|
result.append(boxes) |
|
|
|
|
|
if masks is not None: |
|
masks = tf.cond(do_a_flip_random, lambda: _flip_masks_up_down(masks), |
|
lambda: masks) |
|
result.append(masks) |
|
|
|
|
|
if keypoints is not None and keypoint_flip_permutation is not None: |
|
permutation = keypoint_flip_permutation |
|
keypoints = tf.cond( |
|
do_a_flip_random, |
|
lambda: keypoint_ops.flip_vertical(keypoints, 0.5, permutation), |
|
lambda: keypoints) |
|
result.append(keypoints) |
|
|
|
return tuple(result) |
|
|
|
|
|
def random_rotation90(image, |
|
boxes=None, |
|
masks=None, |
|
keypoints=None, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly rotates the image and detections 90 degrees counter-clockwise. |
|
|
|
The probability of rotating the image is 50%. This can be combined with |
|
random_horizontal_flip and random_vertical_flip to produce an output with a |
|
uniform distribution of the eight possible 90 degree rotation / reflection |
|
combinations. |
|
|
|
Args: |
|
image: rank 3 float32 tensor with shape [height, width, channels]. |
|
boxes: (optional) rank 2 float32 tensor with shape [N, 4] |
|
containing the bounding boxes. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
masks: (optional) rank 3 float32 tensor with shape |
|
[num_instances, height, width] containing instance masks. The masks |
|
are of the same height, width as the input `image`. |
|
keypoints: (optional) rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2]. The keypoints are in y-x |
|
normalized coordinates. |
|
seed: random seed |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same shape as input image. |
|
|
|
If boxes, masks, and keypoints, are not None, |
|
the function also returns the following tensors. |
|
|
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
masks: rank 3 float32 tensor with shape [num_instances, height, width] |
|
containing instance masks. |
|
keypoints: rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2] |
|
""" |
|
|
|
def _rot90_image(image): |
|
|
|
image_rotated = tf.image.rot90(image) |
|
return image_rotated |
|
|
|
with tf.name_scope('RandomRotation90', values=[image, boxes]): |
|
result = [] |
|
|
|
|
|
generator_func = functools.partial(tf.random_uniform, [], seed=seed) |
|
do_a_rot90_random = _get_or_create_preprocess_rand_vars( |
|
generator_func, preprocessor_cache.PreprocessorCache.ROTATION90, |
|
preprocess_vars_cache) |
|
do_a_rot90_random = tf.greater(do_a_rot90_random, 0.5) |
|
|
|
|
|
image = tf.cond(do_a_rot90_random, lambda: _rot90_image(image), |
|
lambda: image) |
|
result.append(image) |
|
|
|
|
|
if boxes is not None: |
|
boxes = tf.cond(do_a_rot90_random, lambda: _rot90_boxes(boxes), |
|
lambda: boxes) |
|
result.append(boxes) |
|
|
|
|
|
if masks is not None: |
|
masks = tf.cond(do_a_rot90_random, lambda: _rot90_masks(masks), |
|
lambda: masks) |
|
result.append(masks) |
|
|
|
|
|
if keypoints is not None: |
|
keypoints = tf.cond( |
|
do_a_rot90_random, |
|
lambda: keypoint_ops.rot90(keypoints), |
|
lambda: keypoints) |
|
result.append(keypoints) |
|
|
|
return tuple(result) |
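
# Example (illustrative sketch): chaining the flip and rotation ops so that
# the eight 90-degree rotation / reflection combinations are equally likely,
# as noted in the docstring above. The tensors are hypothetical placeholders.
#
#   image = tf.zeros([300, 300, 3], dtype=tf.float32)
#   boxes = tf.constant([[0.1, 0.1, 0.5, 0.5]], dtype=tf.float32)
#   image, boxes = random_horizontal_flip(image, boxes)
#   image, boxes = random_vertical_flip(image, boxes)
#   image, boxes = random_rotation90(image, boxes)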
|
|
|
|
|
def random_pixel_value_scale(image, |
|
minval=0.9, |
|
maxval=1.1, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Scales each value in the pixels of the image. |
|
|
|
  This function scales each pixel independently of the others.

  For each value in the image tensor, it draws a random number between

  minval and maxval and multiplies the value by it.
|
|
|
Args: |
|
image: rank 3 float32 tensor contains 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 255]. |
|
minval: lower ratio of scaling pixel values. |
|
maxval: upper ratio of scaling pixel values. |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same shape as input image. |
|
""" |
|
with tf.name_scope('RandomPixelValueScale', values=[image]): |
|
generator_func = functools.partial( |
|
tf.random_uniform, tf.shape(image), |
|
minval=minval, maxval=maxval, |
|
dtype=tf.float32, seed=seed) |
|
color_coef = _get_or_create_preprocess_rand_vars( |
|
generator_func, |
|
preprocessor_cache.PreprocessorCache.PIXEL_VALUE_SCALE, |
|
preprocess_vars_cache) |
|
|
|
image = tf.multiply(image, color_coef) |
|
image = tf.clip_by_value(image, 0.0, 255.0) |
|
|
|
return image |
|
|
|
|
|
def random_image_scale(image, |
|
masks=None, |
|
min_scale_ratio=0.5, |
|
max_scale_ratio=2.0, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Scales the image size. |
|
|
|
Args: |
|
image: rank 3 float32 tensor contains 1 image -> [height, width, channels]. |
|
masks: (optional) rank 3 float32 tensor containing masks with |
|
size [height, width, num_masks]. The value is set to None if there are no |
|
masks. |
|
min_scale_ratio: minimum scaling ratio. |
|
max_scale_ratio: maximum scaling ratio. |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same rank as input image. |
|
    masks: If masks is not None, resized masks, which are the same rank as the

      input masks, will be returned.
|
""" |
|
with tf.name_scope('RandomImageScale', values=[image]): |
|
result = [] |
|
image_shape = tf.shape(image) |
|
image_height = image_shape[0] |
|
image_width = image_shape[1] |
|
generator_func = functools.partial( |
|
tf.random_uniform, [], |
|
minval=min_scale_ratio, maxval=max_scale_ratio, |
|
dtype=tf.float32, seed=seed) |
|
size_coef = _get_or_create_preprocess_rand_vars( |
|
generator_func, preprocessor_cache.PreprocessorCache.IMAGE_SCALE, |
|
preprocess_vars_cache) |
|
|
|
image_newysize = tf.to_int32( |
|
tf.multiply(tf.to_float(image_height), size_coef)) |
|
image_newxsize = tf.to_int32( |
|
tf.multiply(tf.to_float(image_width), size_coef)) |
|
image = tf.image.resize_images( |
|
image, [image_newysize, image_newxsize], align_corners=True) |
|
result.append(image) |
|
if masks is not None: |
|
masks = tf.image.resize_images( |
|
masks, [image_newysize, image_newxsize], |
|
method=tf.image.ResizeMethod.NEAREST_NEIGHBOR, |
|
align_corners=True) |
|
result.append(masks) |
|
return tuple(result) |
|
|
|
|
|
def _augment_only_rgb_channels(image, augment_function): |
|
"""Augments only the RGB slice of an image with additional channels.""" |
|
rgb_slice = image[:, :, :3] |
|
augmented_rgb_slice = augment_function(rgb_slice) |
|
image = tf.concat([augmented_rgb_slice, image[:, :, 3:]], -1) |
|
return image |
|
|
|
|
|
def random_rgb_to_gray(image, |
|
probability=0.1, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Changes the image from RGB to Grayscale with the given probability. |
|
|
|
Args: |
|
image: rank 3 float32 tensor contains 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 255]. |
|
probability: the probability of returning a grayscale image. |
|
The probability should be a number between [0, 1]. |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same shape as input image. |
|
""" |
|
def _image_to_gray(image): |
|
image_gray1 = _rgb_to_grayscale(image) |
|
image_gray3 = tf.image.grayscale_to_rgb(image_gray1) |
|
return image_gray3 |
|
|
|
with tf.name_scope('RandomRGBtoGray', values=[image]): |
|
|
|
generator_func = functools.partial(tf.random_uniform, [], seed=seed) |
|
do_gray_random = _get_or_create_preprocess_rand_vars( |
|
generator_func, preprocessor_cache.PreprocessorCache.RGB_TO_GRAY, |
|
preprocess_vars_cache) |
|
|
|
image = tf.cond( |
|
tf.greater(do_gray_random, probability), lambda: image, |
|
lambda: _augment_only_rgb_channels(image, _image_to_gray)) |
|
|
|
return image |
|
|
|
|
|
def random_adjust_brightness(image, |
|
max_delta=0.2, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly adjusts brightness. |
|
|
|
Makes sure the output image is still between 0 and 255. |
|
|
|
Args: |
|
image: rank 3 float32 tensor contains 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 255]. |
|
max_delta: how much to change the brightness. A value between [0, 1). |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same shape as input image. |
|
|
""" |
|
with tf.name_scope('RandomAdjustBrightness', values=[image]): |
|
generator_func = functools.partial(tf.random_uniform, [], |
|
-max_delta, max_delta, seed=seed) |
|
delta = _get_or_create_preprocess_rand_vars( |
|
generator_func, |
|
preprocessor_cache.PreprocessorCache.ADJUST_BRIGHTNESS, |
|
preprocess_vars_cache) |
|
|
|
def _adjust_brightness(image): |
|
image = tf.image.adjust_brightness(image / 255, delta) * 255 |
|
image = tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=255.0) |
|
return image |
|
|
|
image = _augment_only_rgb_channels(image, _adjust_brightness) |
|
return image |
|
|
|
|
|
def random_adjust_contrast(image, |
|
min_delta=0.8, |
|
max_delta=1.25, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly adjusts contrast. |
|
|
|
Makes sure the output image is still between 0 and 255. |
|
|
|
Args: |
|
image: rank 3 float32 tensor contains 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 255]. |
|
min_delta: see max_delta. |
|
max_delta: how much to change the contrast. Contrast will change with a |
|
value between min_delta and max_delta. This value will be |
|
               multiplied by the current contrast of the image.
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same shape as input image. |
|
""" |
|
with tf.name_scope('RandomAdjustContrast', values=[image]): |
|
generator_func = functools.partial(tf.random_uniform, [], |
|
min_delta, max_delta, seed=seed) |
|
contrast_factor = _get_or_create_preprocess_rand_vars( |
|
generator_func, |
|
preprocessor_cache.PreprocessorCache.ADJUST_CONTRAST, |
|
preprocess_vars_cache) |
|
|
|
def _adjust_contrast(image): |
|
image = tf.image.adjust_contrast(image / 255, contrast_factor) * 255 |
|
image = tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=255.0) |
|
return image |
|
image = _augment_only_rgb_channels(image, _adjust_contrast) |
|
return image |
|
|
|
|
|
def random_adjust_hue(image, |
|
max_delta=0.02, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly adjusts hue. |
|
|
|
Makes sure the output image is still between 0 and 255. |
|
|
|
Args: |
|
image: rank 3 float32 tensor contains 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 255]. |
|
    max_delta: change hue randomly by a value in [-max_delta, max_delta].
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same shape as input image. |
|
""" |
|
with tf.name_scope('RandomAdjustHue', values=[image]): |
|
generator_func = functools.partial(tf.random_uniform, [], |
|
-max_delta, max_delta, seed=seed) |
|
delta = _get_or_create_preprocess_rand_vars( |
|
generator_func, preprocessor_cache.PreprocessorCache.ADJUST_HUE, |
|
preprocess_vars_cache) |
|
def _adjust_hue(image): |
|
image = tf.image.adjust_hue(image / 255, delta) * 255 |
|
image = tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=255.0) |
|
return image |
|
image = _augment_only_rgb_channels(image, _adjust_hue) |
|
return image |
|
|
|
|
|
def random_adjust_saturation(image, |
|
min_delta=0.8, |
|
max_delta=1.25, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly adjusts saturation. |
|
|
|
Makes sure the output image is still between 0 and 255. |
|
|
|
Args: |
|
image: rank 3 float32 tensor contains 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 255]. |
|
min_delta: see max_delta. |
|
max_delta: how much to change the saturation. Saturation will change with a |
|
value between min_delta and max_delta. This value will be |
|
               multiplied by the current saturation of the image.
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same shape as input image. |
|
""" |
|
with tf.name_scope('RandomAdjustSaturation', values=[image]): |
|
generator_func = functools.partial(tf.random_uniform, [], |
|
min_delta, max_delta, seed=seed) |
|
saturation_factor = _get_or_create_preprocess_rand_vars( |
|
generator_func, |
|
preprocessor_cache.PreprocessorCache.ADJUST_SATURATION, |
|
preprocess_vars_cache) |
|
def _adjust_saturation(image): |
|
image = tf.image.adjust_saturation(image / 255, saturation_factor) * 255 |
|
image = tf.clip_by_value(image, clip_value_min=0.0, clip_value_max=255.0) |
|
return image |
|
image = _augment_only_rgb_channels(image, _adjust_saturation) |
|
return image |
|
|
|
|
|
def random_distort_color(image, color_ordering=0, preprocess_vars_cache=None): |
|
"""Randomly distorts color. |
|
|
|
Randomly distorts color using a combination of brightness, hue, contrast and |
|
saturation changes. Makes sure the output image is still between 0 and 255. |
|
|
|
Args: |
|
image: rank 3 float32 tensor contains 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 255]. |
|
color_ordering: Python int, a type of distortion (valid values: 0, 1). |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same shape as input image. |
|
|
|
Raises: |
|
ValueError: if color_ordering is not in {0, 1}. |
|
""" |
|
with tf.name_scope('RandomDistortColor', values=[image]): |
|
if color_ordering == 0: |
|
image = random_adjust_brightness( |
|
image, max_delta=32. / 255., |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
image = random_adjust_saturation( |
|
image, min_delta=0.5, max_delta=1.5, |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
image = random_adjust_hue( |
|
image, max_delta=0.2, |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
image = random_adjust_contrast( |
|
image, min_delta=0.5, max_delta=1.5, |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
|
|
elif color_ordering == 1: |
|
image = random_adjust_brightness( |
|
image, max_delta=32. / 255., |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
image = random_adjust_contrast( |
|
image, min_delta=0.5, max_delta=1.5, |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
image = random_adjust_saturation( |
|
image, min_delta=0.5, max_delta=1.5, |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
image = random_adjust_hue( |
|
image, max_delta=0.2, |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
else: |
|
raise ValueError('color_ordering must be in {0, 1}') |
|
return image |
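
# Example (illustrative sketch): choosing one of the two color orderings at
# random per example via _apply_with_random_selector, so different training
# examples see different distortion orders. The image is a hypothetical
# placeholder.
#
#   image = tf.random_uniform([300, 300, 3], maxval=255.0)
#   distorted = _apply_with_random_selector(
#       image,
#       lambda x, ordering: random_distort_color(x, color_ordering=ordering),
#       num_cases=2)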
|
|
|
|
|
def random_jitter_boxes(boxes, ratio=0.05, seed=None): |
|
"""Randomly jitter boxes in image. |
|
|
|
Args: |
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
ratio: The ratio of the box width and height that the corners can jitter. |
|
For example if the width is 100 pixels and ratio is 0.05, |
|
the corners can jitter up to 5 pixels in the x direction. |
|
seed: random seed. |
|
|
|
Returns: |
|
boxes: boxes which is the same shape as input boxes. |
|
""" |
|
def random_jitter_box(box, ratio, seed): |
|
"""Randomly jitter box. |
|
|
|
Args: |
|
box: bounding box [1, 1, 4]. |
|
ratio: max ratio between jittered box and original box, |
|
a number between [0, 0.5]. |
|
seed: random seed. |
|
|
|
Returns: |
|
jittered_box: jittered box. |
|
""" |
|
rand_numbers = tf.random_uniform( |
|
[1, 1, 4], minval=-ratio, maxval=ratio, dtype=tf.float32, seed=seed) |
|
box_width = tf.subtract(box[0, 0, 3], box[0, 0, 1]) |
|
box_height = tf.subtract(box[0, 0, 2], box[0, 0, 0]) |
|
hw_coefs = tf.stack([box_height, box_width, box_height, box_width]) |
|
hw_rand_coefs = tf.multiply(hw_coefs, rand_numbers) |
|
jittered_box = tf.add(box, hw_rand_coefs) |
|
jittered_box = tf.clip_by_value(jittered_box, 0.0, 1.0) |
|
return jittered_box |
|
|
|
with tf.name_scope('RandomJitterBoxes', values=[boxes]): |
|
|
|
boxes_shape = tf.shape(boxes) |
|
boxes = tf.expand_dims(boxes, 1) |
|
boxes = tf.expand_dims(boxes, 2) |
|
|
|
distorted_boxes = tf.map_fn( |
|
lambda x: random_jitter_box(x, ratio, seed), boxes, dtype=tf.float32) |
|
|
|
distorted_boxes = tf.reshape(distorted_boxes, boxes_shape) |
|
|
|
return distorted_boxes |
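
# Example (illustrative sketch): with ratio=0.05, each corner of a box whose
# normalized width is 0.4 and height is 0.2 can move by at most
# 0.05 * 0.4 = 0.02 in x and 0.05 * 0.2 = 0.01 in y. The box is hypothetical.
#
#   boxes = tf.constant([[0.2, 0.3, 0.4, 0.7]], dtype=tf.float32)
#   jittered = random_jitter_boxes(boxes, ratio=0.05)
#   # `jittered` has the same shape as `boxes`, with every coordinate clipped
#   # back into [0, 1].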
|
|
|
|
|
def _strict_random_crop_image(image, |
|
boxes, |
|
labels, |
|
label_weights, |
|
label_confidences=None, |
|
multiclass_scores=None, |
|
masks=None, |
|
keypoints=None, |
|
min_object_covered=1.0, |
|
aspect_ratio_range=(0.75, 1.33), |
|
area_range=(0.1, 1.0), |
|
overlap_thresh=0.3, |
|
clip_boxes=True, |
|
preprocess_vars_cache=None): |
|
"""Performs random crop. |
|
|
|
Note: Keypoint coordinates that are outside the crop will be set to NaN, which |
|
is consistent with the original keypoint encoding for non-existing keypoints. |
|
This function always crops the image and is supposed to be used by |
|
`random_crop_image` function which sometimes returns the image unchanged. |
|
|
|
Args: |
|
image: rank 3 float32 tensor containing 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 1]. |
|
boxes: rank 2 float32 tensor containing the bounding boxes with shape |
|
[num_instances, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
labels: rank 1 int32 tensor containing the object classes. |
|
label_weights: float32 tensor of shape [num_instances] representing the |
|
weight for each box. |
|
label_confidences: (optional) float32 tensor of shape [num_instances] |
|
representing the confidence for each box. |
|
multiclass_scores: (optional) float32 tensor of shape |
|
[num_instances, num_classes] representing the score for each box for each |
|
class. |
|
masks: (optional) rank 3 float32 tensor with shape |
|
[num_instances, height, width] containing instance masks. The masks |
|
are of the same height, width as the input `image`. |
|
keypoints: (optional) rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2]. The keypoints are in y-x |
|
normalized coordinates. |
|
min_object_covered: the cropped image must cover at least this fraction of |
|
at least one of the input bounding boxes. |
|
aspect_ratio_range: allowed range for aspect ratio of cropped image. |
|
area_range: allowed range for area ratio between cropped image and the |
|
original image. |
|
overlap_thresh: minimum overlap thresh with new cropped |
|
image to keep the box. |
|
clip_boxes: whether to clip the boxes to the cropped image. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same rank as input image. |
|
boxes: boxes which is the same rank as input boxes. |
|
Boxes are in normalized form. |
|
labels: new labels. |
|
|
|
If label_weights, multiclass_scores, masks, or keypoints is not None, the |
|
function also returns: |
|
label_weights: rank 1 float32 tensor with shape [num_instances]. |
|
multiclass_scores: rank 2 float32 tensor with shape |
|
[num_instances, num_classes] |
|
masks: rank 3 float32 tensor with shape [num_instances, height, width] |
|
containing instance masks. |
|
keypoints: rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2] |
|
""" |
|
with tf.name_scope('RandomCropImage', values=[image, boxes]): |
|
image_shape = tf.shape(image) |
|
|
|
|
|
boxes_expanded = tf.expand_dims( |
|
tf.clip_by_value( |
|
boxes, clip_value_min=0.0, clip_value_max=1.0), 1) |
|
|
|
generator_func = functools.partial( |
|
tf.image.sample_distorted_bounding_box, |
|
image_shape, |
|
bounding_boxes=boxes_expanded, |
|
min_object_covered=min_object_covered, |
|
aspect_ratio_range=aspect_ratio_range, |
|
area_range=area_range, |
|
max_attempts=100, |
|
use_image_if_no_bounding_boxes=True) |
|
|
|
|
|
|
|
sample_distorted_bounding_box = _get_or_create_preprocess_rand_vars( |
|
generator_func, |
|
preprocessor_cache.PreprocessorCache.STRICT_CROP_IMAGE, |
|
preprocess_vars_cache, key=min_object_covered) |
|
|
|
im_box_begin, im_box_size, im_box = sample_distorted_bounding_box |
|
|
|
new_image = tf.slice(image, im_box_begin, im_box_size) |
|
new_image.set_shape([None, None, image.get_shape()[2]]) |
|
|
|
|
|
im_box_rank2 = tf.squeeze(im_box, squeeze_dims=[0]) |
|
|
|
im_box_rank1 = tf.squeeze(im_box) |
|
|
|
boxlist = box_list.BoxList(boxes) |
|
boxlist.add_field('labels', labels) |
|
|
|
if label_weights is not None: |
|
boxlist.add_field('label_weights', label_weights) |
|
|
|
if label_confidences is not None: |
|
boxlist.add_field('label_confidences', label_confidences) |
|
|
|
if multiclass_scores is not None: |
|
boxlist.add_field('multiclass_scores', multiclass_scores) |
|
|
|
im_boxlist = box_list.BoxList(im_box_rank2) |
|
|
|
|
|
boxlist, inside_window_ids = box_list_ops.prune_completely_outside_window( |
|
boxlist, im_box_rank1) |
|
|
|
|
|
overlapping_boxlist, keep_ids = box_list_ops.prune_non_overlapping_boxes( |
|
boxlist, im_boxlist, overlap_thresh) |
|
|
|
|
|
new_labels = overlapping_boxlist.get_field('labels') |
|
new_boxlist = box_list_ops.change_coordinate_frame(overlapping_boxlist, |
|
im_box_rank1) |
|
new_boxes = new_boxlist.get() |
|
if clip_boxes: |
|
new_boxes = tf.clip_by_value( |
|
new_boxes, clip_value_min=0.0, clip_value_max=1.0) |
|
|
|
result = [new_image, new_boxes, new_labels] |
|
|
|
if label_weights is not None: |
|
new_label_weights = overlapping_boxlist.get_field('label_weights') |
|
result.append(new_label_weights) |
|
|
|
if label_confidences is not None: |
|
new_label_confidences = overlapping_boxlist.get_field('label_confidences') |
|
result.append(new_label_confidences) |
|
|
|
if multiclass_scores is not None: |
|
new_multiclass_scores = overlapping_boxlist.get_field('multiclass_scores') |
|
result.append(new_multiclass_scores) |
|
|
|
if masks is not None: |
|
masks_of_boxes_inside_window = tf.gather(masks, inside_window_ids) |
|
masks_of_boxes_completely_inside_window = tf.gather( |
|
masks_of_boxes_inside_window, keep_ids) |
|
masks_box_begin = [0, im_box_begin[0], im_box_begin[1]] |
|
masks_box_size = [-1, im_box_size[0], im_box_size[1]] |
|
new_masks = tf.slice( |
|
masks_of_boxes_completely_inside_window, |
|
masks_box_begin, masks_box_size) |
|
result.append(new_masks) |
|
|
|
if keypoints is not None: |
|
keypoints_of_boxes_inside_window = tf.gather(keypoints, inside_window_ids) |
|
keypoints_of_boxes_completely_inside_window = tf.gather( |
|
keypoints_of_boxes_inside_window, keep_ids) |
|
new_keypoints = keypoint_ops.change_coordinate_frame( |
|
keypoints_of_boxes_completely_inside_window, im_box_rank1) |
|
if clip_boxes: |
|
new_keypoints = keypoint_ops.prune_outside_window(new_keypoints, |
|
[0.0, 0.0, 1.0, 1.0]) |
|
result.append(new_keypoints) |
|
|
|
return tuple(result) |
|
|
|
|
|
def random_crop_image(image, |
|
boxes, |
|
labels, |
|
label_weights, |
|
label_confidences=None, |
|
multiclass_scores=None, |
|
masks=None, |
|
keypoints=None, |
|
min_object_covered=1.0, |
|
aspect_ratio_range=(0.75, 1.33), |
|
area_range=(0.1, 1.0), |
|
overlap_thresh=0.3, |
|
clip_boxes=True, |
|
random_coef=0.0, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly crops the image. |
|
|
|
Given the input image and its bounding boxes, this op randomly |
|
crops a subimage. Given a user-provided set of input constraints, |
|
the crop window is resampled until it satisfies these constraints. |
|
If within 100 trials it is unable to find a valid crop, the original |
|
image is returned. See the Args section for a description of the input |
|
  constraints. Both input boxes and returned boxes are in normalized

  form (i.e., they lie in the unit square [0, 1]).
|
This function will return the original image with probability random_coef. |
|
|
|
Note: Keypoint coordinates that are outside the crop will be set to NaN, which |
|
is consistent with the original keypoint encoding for non-existing keypoints. |
|
|
|
Args: |
|
image: rank 3 float32 tensor contains 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 1]. |
|
boxes: rank 2 float32 tensor containing the bounding boxes with shape |
|
[num_instances, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
labels: rank 1 int32 tensor containing the object classes. |
|
label_weights: float32 tensor of shape [num_instances] representing the |
|
weight for each box. |
|
label_confidences: (optional) float32 tensor of shape [num_instances]. |
|
representing the confidence for each box. |
|
multiclass_scores: (optional) float32 tensor of shape |
|
[num_instances, num_classes] representing the score for each box for each |
|
class. |
|
masks: (optional) rank 3 float32 tensor with shape |
|
[num_instances, height, width] containing instance masks. The masks |
|
are of the same height, width as the input `image`. |
|
keypoints: (optional) rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2]. The keypoints are in y-x |
|
normalized coordinates. |
|
min_object_covered: the cropped image must cover at least this fraction of |
|
at least one of the input bounding boxes. |
|
aspect_ratio_range: allowed range for aspect ratio of cropped image. |
|
area_range: allowed range for area ratio between cropped image and the |
|
original image. |
|
overlap_thresh: minimum overlap thresh with new cropped |
|
image to keep the box. |
|
clip_boxes: whether to clip the boxes to the cropped image. |
|
random_coef: a random coefficient that defines the chance of getting the |
|
original image. If random_coef is 0, we will always get the |
|
cropped image, and if it is 1.0, we will always get the |
|
original image. |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: Image shape will be [new_height, new_width, channels]. |
|
boxes: boxes which is the same rank as input boxes. Boxes are in normalized |
|
form. |
|
labels: new labels. |
|
|
|
If label_weights, multiclass_scores, masks, or keypoints is not None, the |
|
function also returns: |
|
label_weights: rank 1 float32 tensor with shape [num_instances]. |
|
multiclass_scores: rank 2 float32 tensor with shape |
|
[num_instances, num_classes] |
|
masks: rank 3 float32 tensor with shape [num_instances, height, width] |
|
containing instance masks. |
|
keypoints: rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2] |
|
""" |
|
|
|
def strict_random_crop_image_fn(): |
|
return _strict_random_crop_image( |
|
image, |
|
boxes, |
|
labels, |
|
label_weights, |
|
label_confidences=label_confidences, |
|
multiclass_scores=multiclass_scores, |
|
masks=masks, |
|
keypoints=keypoints, |
|
min_object_covered=min_object_covered, |
|
aspect_ratio_range=aspect_ratio_range, |
|
area_range=area_range, |
|
overlap_thresh=overlap_thresh, |
|
clip_boxes=clip_boxes, |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
|
|
|
|
if random_coef < sys.float_info.min: |
|
result = strict_random_crop_image_fn() |
|
else: |
|
generator_func = functools.partial(tf.random_uniform, [], seed=seed) |
|
do_a_crop_random = _get_or_create_preprocess_rand_vars( |
|
generator_func, preprocessor_cache.PreprocessorCache.CROP_IMAGE, |
|
preprocess_vars_cache) |
|
do_a_crop_random = tf.greater(do_a_crop_random, random_coef) |
|
|
|
outputs = [image, boxes, labels] |
|
|
|
if label_weights is not None: |
|
outputs.append(label_weights) |
|
if label_confidences is not None: |
|
outputs.append(label_confidences) |
|
if multiclass_scores is not None: |
|
outputs.append(multiclass_scores) |
|
if masks is not None: |
|
outputs.append(masks) |
|
if keypoints is not None: |
|
outputs.append(keypoints) |
|
|
|
result = tf.cond(do_a_crop_random, strict_random_crop_image_fn, |
|
lambda: tuple(outputs)) |
|
return result |
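
# Example (illustrative sketch): cropping with a 30% chance of keeping the
# original image (random_coef=0.3) and requiring the crop to cover at least
# half of some ground-truth box. The tensors below are hypothetical.
#
#   image = tf.random_uniform([480, 640, 3])  # pixel values in [0, 1]
#   boxes = tf.constant([[0.1, 0.1, 0.6, 0.6]], dtype=tf.float32)
#   labels = tf.constant([1], dtype=tf.int32)
#   label_weights = tf.constant([1.0], dtype=tf.float32)
#   new_image, new_boxes, new_labels, new_weights = random_crop_image(
#       image, boxes, labels, label_weights,
#       min_object_covered=0.5, random_coef=0.3)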
|
|
|
|
|
def random_pad_image(image, |
|
boxes, |
|
keypoints=None, |
|
min_image_size=None, |
|
max_image_size=None, |
|
pad_color=None, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly pads the image. |
|
|
|
  This function randomly pads the image with the given pad_color (the average

  color of the input image when pad_color is None). The final size of the

  padded image will be between min_image_size and max_image_size.

  If min_image_size is smaller than the input image size, min_image_size will
|
be set to the input image size. The same for max_image_size. The input image |
|
will be located at a uniformly random location inside the padded image. |
|
The relative location of the boxes to the original image will remain the same. |
|
|
|
Args: |
|
image: rank 3 float32 tensor containing 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 1]. |
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
keypoints: (optional) rank 3 float32 tensor with shape |
|
[N, num_keypoints, 2]. The keypoints are in y-x normalized |
|
coordinates. |
|
min_image_size: a tensor of size [min_height, min_width], type tf.int32. |
|
If passed as None, will be set to image size |
|
[height, width]. |
|
max_image_size: a tensor of size [max_height, max_width], type tf.int32. |
|
If passed as None, will be set to twice the |
|
image [height * 2, width * 2]. |
|
pad_color: padding color. A rank 1 tensor of [channels] with dtype= |
|
tf.float32. if set as None, it will be set to average color of |
|
the input image. |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: Image shape will be [new_height, new_width, channels]. |
|
boxes: boxes which is the same rank as input boxes. Boxes are in normalized |
|
form. |
|
|
|
if keypoints is not None, the function also returns: |
|
keypoints: rank 3 float32 tensor with shape [N, num_keypoints, 2] |
|
""" |
|
if pad_color is None: |
|
pad_color = tf.reduce_mean(image, axis=[0, 1]) |
|
|
|
image_shape = tf.shape(image) |
|
image_height = image_shape[0] |
|
image_width = image_shape[1] |
|
|
|
if max_image_size is None: |
|
max_image_size = tf.stack([image_height * 2, image_width * 2]) |
|
max_image_size = tf.maximum(max_image_size, |
|
tf.stack([image_height, image_width])) |
|
|
|
if min_image_size is None: |
|
min_image_size = tf.stack([image_height, image_width]) |
|
min_image_size = tf.maximum(min_image_size, |
|
tf.stack([image_height, image_width])) |
|
|
|
target_height = tf.cond( |
|
max_image_size[0] > min_image_size[0], |
|
lambda: _random_integer(min_image_size[0], max_image_size[0], seed), |
|
lambda: max_image_size[0]) |
|
|
|
target_width = tf.cond( |
|
max_image_size[1] > min_image_size[1], |
|
lambda: _random_integer(min_image_size[1], max_image_size[1], seed), |
|
lambda: max_image_size[1]) |
|
|
|
offset_height = tf.cond( |
|
target_height > image_height, |
|
lambda: _random_integer(0, target_height - image_height, seed), |
|
lambda: tf.constant(0, dtype=tf.int32)) |
|
|
|
offset_width = tf.cond( |
|
target_width > image_width, |
|
lambda: _random_integer(0, target_width - image_width, seed), |
|
lambda: tf.constant(0, dtype=tf.int32)) |
|
|
|
gen_func = lambda: (target_height, target_width, offset_height, offset_width) |
|
params = _get_or_create_preprocess_rand_vars( |
|
gen_func, preprocessor_cache.PreprocessorCache.PAD_IMAGE, |
|
preprocess_vars_cache) |
|
target_height, target_width, offset_height, offset_width = params |
|
|
|
new_image = tf.image.pad_to_bounding_box( |
|
image, |
|
offset_height=offset_height, |
|
offset_width=offset_width, |
|
target_height=target_height, |
|
target_width=target_width) |
|
|
|
|
|
image_ones = tf.ones_like(image) |
|
image_ones_padded = tf.image.pad_to_bounding_box( |
|
image_ones, |
|
offset_height=offset_height, |
|
offset_width=offset_width, |
|
target_height=target_height, |
|
target_width=target_width) |
|
image_color_padded = (1.0 - image_ones_padded) * pad_color |
|
new_image += image_color_padded |
|
|
|
|
|
new_window = tf.to_float( |
|
tf.stack([ |
|
-offset_height, -offset_width, target_height - offset_height, |
|
target_width - offset_width |
|
])) |
|
new_window /= tf.to_float( |
|
tf.stack([image_height, image_width, image_height, image_width])) |
|
boxlist = box_list.BoxList(boxes) |
|
new_boxlist = box_list_ops.change_coordinate_frame(boxlist, new_window) |
|
new_boxes = new_boxlist.get() |
|
|
|
result = [new_image, new_boxes] |
|
|
|
if keypoints is not None: |
|
new_keypoints = keypoint_ops.change_coordinate_frame(keypoints, new_window) |
|
result.append(new_keypoints) |
|
|
|
return tuple(result) |
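

# Illustrative usage sketch, not part of the library: pads a dummy image and a
# single normalized box with random_pad_image. The image shape, box values and
# the explicit max_image_size below are arbitrary assumptions for demonstration.
def _example_random_pad_image():
  image = tf.random_uniform([480, 640, 3])  # dummy image with values in [0, 1)
  boxes = tf.constant([[0.1, 0.2, 0.6, 0.8]], dtype=tf.float32)
  # Pads to a random target size between the original 480x640 and 800x1000,
  # filling the border with the mean image color (pad_color=None).
  return random_pad_image(
      image, boxes, max_image_size=tf.constant([800, 1000], dtype=tf.int32))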
|
|
|
|
|
def random_absolute_pad_image(image, |
|
boxes, |
|
max_height_padding, |
|
max_width_padding, |
|
pad_color=None, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly pads the image by small absolute amounts. |
|
|
|
As in random_pad_image above, but the amount of height and width padding is
drawn uniformly at random from [0, max_height_padding) and
[0, max_width_padding) respectively, rather than padding every image to the
same fixed target size.
|
|
|
Args: |
|
image: rank 3 float32 tensor containing 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 1]. |
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
max_height_padding: a scalar tf.int32 tensor denoting the maximum amount of |
|
height padding. The padding will be chosen uniformly at |
|
random from [0, max_height_padding). |
|
max_width_padding: a scalar tf.int32 tensor denoting the maximum amount of |
|
width padding. The padding will be chosen uniformly at |
|
random from [0, max_width_padding). |
|
pad_color: padding color. A rank 1 tensor of [3] with dtype=tf.float32. |
|
if set as None, it will be set to average color of the input |
|
image. |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: Image shape will be [new_height, new_width, channels]. |
|
boxes: boxes which is the same rank as input boxes. Boxes are in normalized |
|
form. |
|
""" |
|
min_image_size = tf.shape(image)[:2] |
|
max_image_size = min_image_size + tf.to_int32( |
|
[max_height_padding, max_width_padding]) |
|
return random_pad_image(image, boxes, min_image_size=min_image_size, |
|
max_image_size=max_image_size, pad_color=pad_color, |
|
seed=seed, |
|
preprocess_vars_cache=preprocess_vars_cache) |
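

# Illustrative usage sketch, not part of the library: the 64-pixel padding
# bounds and the dummy tensors below are arbitrary assumptions.
def _example_random_absolute_pad_image():
  image = tf.random_uniform([300, 300, 3])
  boxes = tf.constant([[0.0, 0.0, 0.5, 0.5]], dtype=tf.float32)
  # Pads by up to 63 pixels of height and width, each drawn uniformly at
  # random; returns (padded_image, adjusted_boxes).
  return random_absolute_pad_image(
      image, boxes, max_height_padding=64, max_width_padding=64)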
|
|
|
|
|
def random_crop_pad_image(image, |
|
boxes, |
|
labels, |
|
label_weights, |
|
label_confidences=None, |
|
multiclass_scores=None, |
|
min_object_covered=1.0, |
|
aspect_ratio_range=(0.75, 1.33), |
|
area_range=(0.1, 1.0), |
|
overlap_thresh=0.3, |
|
clip_boxes=True, |
|
random_coef=0.0, |
|
min_padded_size_ratio=(1.0, 1.0), |
|
max_padded_size_ratio=(2.0, 2.0), |
|
pad_color=None, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly crops and pads the image. |
|
|
|
Given an input image and its bounding boxes, this op first randomly crops |
|
the image and then randomly pads the image with background values. Parameters |
|
min_padded_size_ratio and max_padded_size_ratio determine the range of the
final output image size. Specifically, the final image size will lie between
min_padded_size_ratio * tf.shape(image) and
max_padded_size_ratio * tf.shape(image). Note that these ratios are with
|
respect to the size of the original image, so we can't capture the same |
|
effect easily by independently applying RandomCropImage |
|
followed by RandomPadImage. |
|
|
|
Args: |
|
image: rank 3 float32 tensor containing 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 1]. |
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
labels: rank 1 int32 tensor containing the object classes. |
|
label_weights: rank 1 float32 containing the label weights. |
|
label_confidences: rank 1 float32 containing the label confidences. |
|
multiclass_scores: (optional) float32 tensor of shape |
|
[num_instances, num_classes] representing the score for each box for each |
|
class. |
|
min_object_covered: the cropped image must cover at least this fraction of |
|
at least one of the input bounding boxes. |
|
aspect_ratio_range: allowed range for aspect ratio of cropped image. |
|
area_range: allowed range for area ratio between cropped image and the |
|
original image. |
|
overlap_thresh: minimum overlap thresh with new cropped |
|
image to keep the box. |
|
clip_boxes: whether to clip the boxes to the cropped image. |
|
random_coef: a random coefficient that defines the chance of getting the |
|
original image. If random_coef is 0, we will always get the |
|
cropped image, and if it is 1.0, we will always get the |
|
original image. |
|
min_padded_size_ratio: min ratio of padded image height and width to the |
|
input image's height and width. |
|
max_padded_size_ratio: max ratio of padded image height and width to the |
|
input image's height and width. |
|
pad_color: padding color. A rank 1 tensor of [3] with dtype=tf.float32. |
|
if set as None, it will be set to average color of the randomly |
|
cropped image. |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
padded_image: padded image. |
|
padded_boxes: boxes which is the same rank as input boxes. Boxes are in |
|
normalized form. |
|
cropped_labels: cropped labels. |
|
if label_weights is not None also returns:
cropped_label_weights: cropped label weights.
if label_confidences is not None also returns:
cropped_label_confidences: cropped label confidences.
if multiclass_scores is not None also returns:
cropped_multiclass_scores: cropped multiclass scores.
|
|
|
""" |
|
image_size = tf.shape(image) |
|
image_height = image_size[0] |
|
image_width = image_size[1] |
|
result = random_crop_image( |
|
image=image, |
|
boxes=boxes, |
|
labels=labels, |
|
label_weights=label_weights, |
|
label_confidences=label_confidences, |
|
multiclass_scores=multiclass_scores, |
|
min_object_covered=min_object_covered, |
|
aspect_ratio_range=aspect_ratio_range, |
|
area_range=area_range, |
|
overlap_thresh=overlap_thresh, |
|
clip_boxes=clip_boxes, |
|
random_coef=random_coef, |
|
seed=seed, |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
|
|
cropped_image, cropped_boxes, cropped_labels = result[:3] |
|
|
|
min_image_size = tf.to_int32( |
|
tf.to_float(tf.stack([image_height, image_width])) * |
|
min_padded_size_ratio) |
|
max_image_size = tf.to_int32( |
|
tf.to_float(tf.stack([image_height, image_width])) * |
|
max_padded_size_ratio) |
|
|
|
padded_image, padded_boxes = random_pad_image( |
|
cropped_image, |
|
cropped_boxes, |
|
min_image_size=min_image_size, |
|
max_image_size=max_image_size, |
|
pad_color=pad_color, |
|
seed=seed, |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
|
|
cropped_padded_output = (padded_image, padded_boxes, cropped_labels) |
|
|
|
index = 3 |
|
if label_weights is not None: |
|
cropped_label_weights = result[index] |
|
cropped_padded_output += (cropped_label_weights,) |
|
index += 1 |
|
|
|
if label_confidences is not None: |
|
cropped_label_confidences = result[index] |
|
cropped_padded_output += (cropped_label_confidences,) |
|
index += 1 |
|
|
|
if multiclass_scores is not None: |
|
cropped_multiclass_scores = result[index] |
|
cropped_padded_output += (cropped_multiclass_scores,) |
|
|
|
return cropped_padded_output |
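

# Illustrative usage sketch, not part of the library: the dummy image, box,
# label and weight tensors below are arbitrary assumptions.
def _example_random_crop_pad_image():
  image = tf.random_uniform([400, 600, 3])
  boxes = tf.constant([[0.2, 0.2, 0.7, 0.9]], dtype=tf.float32)
  labels = tf.constant([1], dtype=tf.int32)
  label_weights = tf.constant([1.0], dtype=tf.float32)
  # With only the required inputs, returns (padded_image, padded_boxes,
  # cropped_labels, cropped_label_weights).
  return random_crop_pad_image(image, boxes, labels, label_weights)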
|
|
|
|
|
def random_crop_to_aspect_ratio(image, |
|
boxes, |
|
labels, |
|
label_weights, |
|
label_confidences=None, |
|
multiclass_scores=None, |
|
masks=None, |
|
keypoints=None, |
|
aspect_ratio=1.0, |
|
overlap_thresh=0.3, |
|
clip_boxes=True, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly crops an image to the specified aspect ratio. |
|
|
|
Randomly crops a portion of the image such that the crop is of the
|
specified aspect ratio, and the crop is as large as possible. If the specified |
|
aspect ratio is larger than the aspect ratio of the image, this op will |
|
randomly remove rows from the top and bottom of the image. If the specified |
|
aspect ratio is less than the aspect ratio of the image, this op will randomly |
|
remove cols from the left and right of the image. If the specified aspect |
|
ratio is the same as the aspect ratio of the image, this op will return the |
|
image. |
|
|
|
Args: |
|
image: rank 3 float32 tensor contains 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 1]. |
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
labels: rank 1 int32 tensor containing the object classes. |
|
label_weights: float32 tensor of shape [num_instances] representing the |
|
weight for each box. |
|
label_confidences: (optional) float32 tensor of shape [num_instances] |
|
representing the confidence for each box. |
|
multiclass_scores: (optional) float32 tensor of shape |
|
[num_instances, num_classes] representing the score for each box for each |
|
class. |
|
masks: (optional) rank 3 float32 tensor with shape |
|
[num_instances, height, width] containing instance masks. The masks |
|
are of the same height, width as the input `image`. |
|
keypoints: (optional) rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2]. The keypoints are in y-x |
|
normalized coordinates. |
|
aspect_ratio: the aspect ratio of cropped image. |
|
overlap_thresh: minimum overlap thresh with new cropped |
|
image to keep the box. |
|
clip_boxes: whether to clip the boxes to the cropped image. |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same rank as input image. |
|
boxes: boxes which is the same rank as input boxes. |
|
Boxes are in normalized form. |
|
labels: new labels. |
|
|
|
If label_weights, masks, keypoints, or multiclass_scores is not None, the |
|
function also returns: |
|
label_weights: rank 1 float32 tensor with shape [num_instances]. |
|
masks: rank 3 float32 tensor with shape [num_instances, height, width] |
|
containing instance masks. |
|
keypoints: rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2] |
|
multiclass_scores: rank 2 float32 tensor with shape |
|
[num_instances, num_classes] |
|
|
|
Raises: |
|
ValueError: If image is not a 3D tensor. |
|
""" |
|
if len(image.get_shape()) != 3: |
|
raise ValueError('Image should be 3D tensor') |
|
|
|
with tf.name_scope('RandomCropToAspectRatio', values=[image]): |
|
image_shape = tf.shape(image) |
|
orig_height = image_shape[0] |
|
orig_width = image_shape[1] |
|
orig_aspect_ratio = tf.to_float(orig_width) / tf.to_float(orig_height) |
|
new_aspect_ratio = tf.constant(aspect_ratio, dtype=tf.float32) |
|
def target_height_fn(): |
|
return tf.to_int32(tf.round(tf.to_float(orig_width) / new_aspect_ratio)) |
|
|
|
target_height = tf.cond(orig_aspect_ratio >= new_aspect_ratio, |
|
lambda: orig_height, target_height_fn) |
|
|
|
def target_width_fn(): |
|
return tf.to_int32(tf.round(tf.to_float(orig_height) * new_aspect_ratio)) |
|
|
|
target_width = tf.cond(orig_aspect_ratio <= new_aspect_ratio, |
|
lambda: orig_width, target_width_fn) |
|
|
|
|
|
|
|
|
|
offset_height = _random_integer(0, orig_height - target_height + 1, seed) |
|
offset_width = _random_integer(0, orig_width - target_width + 1, seed) |
|
|
|
generator_func = lambda: (offset_height, offset_width) |
|
offset_height, offset_width = _get_or_create_preprocess_rand_vars( |
|
generator_func, |
|
preprocessor_cache.PreprocessorCache.CROP_TO_ASPECT_RATIO, |
|
preprocess_vars_cache) |
|
|
|
new_image = tf.image.crop_to_bounding_box( |
|
image, offset_height, offset_width, target_height, target_width) |
|
|
|
im_box = tf.stack([ |
|
tf.to_float(offset_height) / tf.to_float(orig_height), |
|
tf.to_float(offset_width) / tf.to_float(orig_width), |
|
tf.to_float(offset_height + target_height) / tf.to_float(orig_height), |
|
tf.to_float(offset_width + target_width) / tf.to_float(orig_width) |
|
]) |
|
|
|
boxlist = box_list.BoxList(boxes) |
|
boxlist.add_field('labels', labels) |
|
|
|
boxlist.add_field('label_weights', label_weights) |
|
|
|
if label_confidences is not None: |
|
boxlist.add_field('label_confidences', label_confidences) |
|
|
|
if multiclass_scores is not None: |
|
boxlist.add_field('multiclass_scores', multiclass_scores) |
|
|
|
im_boxlist = box_list.BoxList(tf.expand_dims(im_box, 0)) |
|
|
|
|
|
overlapping_boxlist, keep_ids = box_list_ops.prune_non_overlapping_boxes( |
|
boxlist, im_boxlist, overlap_thresh) |
|
|
|
|
|
new_labels = overlapping_boxlist.get_field('labels') |
|
new_boxlist = box_list_ops.change_coordinate_frame(overlapping_boxlist, |
|
im_box) |
|
if clip_boxes: |
|
new_boxlist = box_list_ops.clip_to_window( |
|
new_boxlist, tf.constant([0.0, 0.0, 1.0, 1.0], tf.float32)) |
|
new_boxes = new_boxlist.get() |
|
|
|
result = [new_image, new_boxes, new_labels] |
|
|
|
new_label_weights = overlapping_boxlist.get_field('label_weights') |
|
result.append(new_label_weights) |
|
|
|
if label_confidences is not None: |
|
new_label_confidences = ( |
|
overlapping_boxlist.get_field('label_confidences')) |
|
result.append(new_label_confidences) |
|
|
|
if multiclass_scores is not None: |
|
new_multiclass_scores = overlapping_boxlist.get_field('multiclass_scores') |
|
result.append(new_multiclass_scores) |
|
|
|
if masks is not None: |
|
masks_inside_window = tf.gather(masks, keep_ids) |
|
masks_box_begin = tf.stack([0, offset_height, offset_width]) |
|
masks_box_size = tf.stack([-1, target_height, target_width]) |
|
new_masks = tf.slice(masks_inside_window, masks_box_begin, masks_box_size) |
|
result.append(new_masks) |
|
|
|
if keypoints is not None: |
|
keypoints_inside_window = tf.gather(keypoints, keep_ids) |
|
new_keypoints = keypoint_ops.change_coordinate_frame( |
|
keypoints_inside_window, im_box) |
|
if clip_boxes: |
|
new_keypoints = keypoint_ops.prune_outside_window(new_keypoints, |
|
[0.0, 0.0, 1.0, 1.0]) |
|
result.append(new_keypoints) |
|
|
|
return tuple(result) |
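

# Illustrative usage sketch, not part of the library: crops a dummy image to a
# 2:1 (width:height) aspect ratio. All tensor values are arbitrary assumptions.
def _example_random_crop_to_aspect_ratio():
  image = tf.random_uniform([480, 640, 3])
  boxes = tf.constant([[0.1, 0.1, 0.9, 0.9]], dtype=tf.float32)
  labels = tf.constant([3], dtype=tf.int32)
  label_weights = tf.constant([1.0], dtype=tf.float32)
  # Returns (new_image, new_boxes, new_labels, new_label_weights); boxes whose
  # overlap with the crop window falls below overlap_thresh are dropped.
  return random_crop_to_aspect_ratio(
      image, boxes, labels, label_weights, aspect_ratio=2.0)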
|
|
|
|
|
def random_pad_to_aspect_ratio(image, |
|
boxes, |
|
masks=None, |
|
keypoints=None, |
|
aspect_ratio=1.0, |
|
min_padded_size_ratio=(1.0, 1.0), |
|
max_padded_size_ratio=(2.0, 2.0), |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly zero pads an image to the specified aspect ratio. |
|
|
|
Pads the image so that the resulting image will have the specified aspect |
|
ratio without scaling less than the min_padded_size_ratio or more than the |
|
max_padded_size_ratio. If the min_padded_size_ratio or max_padded_size_ratio |
|
is lower than what is possible to maintain the aspect ratio, then this method |
|
will use the least padding to achieve the specified aspect ratio. |
|
|
|
Args: |
|
image: rank 3 float32 tensor contains 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 1]. |
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
masks: (optional) rank 3 float32 tensor with shape |
|
[num_instances, height, width] containing instance masks. The masks |
|
are of the same height, width as the input `image`. |
|
keypoints: (optional) rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2]. The keypoints are in y-x |
|
normalized coordinates. |
|
aspect_ratio: aspect ratio of the final image. |
|
min_padded_size_ratio: min ratio of padded image height and width to the |
|
input image's height and width. |
|
max_padded_size_ratio: max ratio of padded image height and width to the |
|
input image's height and width. |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same rank as input image. |
|
boxes: boxes which is the same rank as input boxes. |
|
Boxes are in normalized form. |
|
If masks or keypoints is not None, the function also returns:
|
masks: rank 3 float32 tensor with shape [num_instances, height, width] |
|
containing instance masks. |
|
keypoints: rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2] |
|
|
|
Raises: |
|
ValueError: If image is not a 3D tensor. |
|
""" |
|
if len(image.get_shape()) != 3: |
|
raise ValueError('Image should be 3D tensor') |
|
|
|
with tf.name_scope('RandomPadToAspectRatio', values=[image]): |
|
image_shape = tf.shape(image) |
|
image_height = tf.to_float(image_shape[0]) |
|
image_width = tf.to_float(image_shape[1]) |
|
image_aspect_ratio = image_width / image_height |
|
new_aspect_ratio = tf.constant(aspect_ratio, dtype=tf.float32) |
|
target_height = tf.cond( |
|
image_aspect_ratio <= new_aspect_ratio, |
|
lambda: image_height, |
|
lambda: image_width / new_aspect_ratio) |
|
target_width = tf.cond( |
|
image_aspect_ratio >= new_aspect_ratio, |
|
lambda: image_width, |
|
lambda: image_height * new_aspect_ratio) |
|
|
|
min_height = tf.maximum( |
|
min_padded_size_ratio[0] * image_height, target_height) |
|
min_width = tf.maximum( |
|
min_padded_size_ratio[1] * image_width, target_width) |
|
max_height = tf.maximum( |
|
max_padded_size_ratio[0] * image_height, target_height) |
|
max_width = tf.maximum( |
|
max_padded_size_ratio[1] * image_width, target_width) |
|
|
|
max_scale = tf.minimum(max_height / target_height, max_width / target_width) |
|
min_scale = tf.minimum( |
|
max_scale, |
|
tf.maximum(min_height / target_height, min_width / target_width)) |
|
|
|
generator_func = functools.partial(tf.random_uniform, [], |
|
min_scale, max_scale, seed=seed) |
|
scale = _get_or_create_preprocess_rand_vars( |
|
generator_func, |
|
preprocessor_cache.PreprocessorCache.PAD_TO_ASPECT_RATIO, |
|
preprocess_vars_cache) |
|
|
|
target_height = tf.round(scale * target_height) |
|
target_width = tf.round(scale * target_width) |
|
|
|
new_image = tf.image.pad_to_bounding_box( |
|
image, 0, 0, tf.to_int32(target_height), tf.to_int32(target_width)) |
|
|
|
im_box = tf.stack([ |
|
0.0, |
|
0.0, |
|
target_height / image_height, |
|
target_width / image_width |
|
]) |
|
boxlist = box_list.BoxList(boxes) |
|
new_boxlist = box_list_ops.change_coordinate_frame(boxlist, im_box) |
|
new_boxes = new_boxlist.get() |
|
|
|
result = [new_image, new_boxes] |
|
|
|
if masks is not None: |
|
new_masks = tf.expand_dims(masks, -1) |
|
new_masks = tf.image.pad_to_bounding_box(new_masks, 0, 0, |
|
tf.to_int32(target_height), |
|
tf.to_int32(target_width)) |
|
new_masks = tf.squeeze(new_masks, [-1]) |
|
result.append(new_masks) |
|
|
|
if keypoints is not None: |
|
new_keypoints = keypoint_ops.change_coordinate_frame(keypoints, im_box) |
|
result.append(new_keypoints) |
|
|
|
return tuple(result) |
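

# Illustrative usage sketch, not part of the library: zero-pads a dummy
# landscape image towards a square (1:1) aspect ratio. Inputs are assumptions.
def _example_random_pad_to_aspect_ratio():
  image = tf.random_uniform([300, 500, 3])
  boxes = tf.constant([[0.0, 0.0, 1.0, 1.0]], dtype=tf.float32)
  # With no masks or keypoints, returns (new_image, new_boxes).
  return random_pad_to_aspect_ratio(image, boxes, aspect_ratio=1.0)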
|
|
|
|
|
def random_black_patches(image, |
|
max_black_patches=10, |
|
probability=0.5, |
|
size_to_image_ratio=0.1, |
|
random_seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly adds some black patches to the image. |
|
|
|
This op adds up to max_black_patches square black patches of a fixed size |
|
to the image where size is specified via the size_to_image_ratio parameter. |
|
|
|
Args: |
|
image: rank 3 float32 tensor containing 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 1]. |
|
max_black_patches: number of times that the function tries to add a |
|
black box to the image. |
|
probability: at each try, what is the chance of adding a box. |
|
size_to_image_ratio: Determines the ratio of the size of the black patches |
|
to the size of the image. |
|
box_size = size_to_image_ratio * |
|
min(image_width, image_height) |
|
random_seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image with some black patches possibly added.
|
""" |
|
def add_black_patch_to_image(image, idx): |
|
"""Function for adding one patch to the image. |
|
|
|
Args: |
|
image: image |
|
idx: counter for number of patches that could have been added |
|
|
|
Returns: |
|
image with a randomly added black box |
|
""" |
|
image_shape = tf.shape(image) |
|
image_height = image_shape[0] |
|
image_width = image_shape[1] |
|
box_size = tf.to_int32( |
|
tf.multiply( |
|
tf.minimum(tf.to_float(image_height), tf.to_float(image_width)), |
|
size_to_image_ratio)) |
|
|
|
generator_func = functools.partial(tf.random_uniform, [], minval=0.0, |
|
maxval=(1.0 - size_to_image_ratio), |
|
seed=random_seed) |
|
normalized_y_min = _get_or_create_preprocess_rand_vars( |
|
generator_func, |
|
preprocessor_cache.PreprocessorCache.ADD_BLACK_PATCH, |
|
preprocess_vars_cache, key=str(idx) + 'y') |
|
normalized_x_min = _get_or_create_preprocess_rand_vars( |
|
generator_func, |
|
preprocessor_cache.PreprocessorCache.ADD_BLACK_PATCH, |
|
preprocess_vars_cache, key=str(idx) + 'x') |
|
|
|
y_min = tf.to_int32(normalized_y_min * tf.to_float(image_height)) |
|
x_min = tf.to_int32(normalized_x_min * tf.to_float(image_width)) |
|
black_box = tf.ones([box_size, box_size, 3], dtype=tf.float32) |
|
mask = 1.0 - tf.image.pad_to_bounding_box(black_box, y_min, x_min, |
|
image_height, image_width) |
|
image = tf.multiply(image, mask) |
|
return image |
|
|
|
with tf.name_scope('RandomBlackPatchInImage', values=[image]): |
|
for idx in range(max_black_patches): |
|
generator_func = functools.partial(tf.random_uniform, [], |
|
minval=0.0, maxval=1.0, |
|
dtype=tf.float32, seed=random_seed) |
|
random_prob = _get_or_create_preprocess_rand_vars( |
|
generator_func, |
|
preprocessor_cache.PreprocessorCache.BLACK_PATCHES, |
|
preprocess_vars_cache, key=idx) |
|
image = tf.cond( |
|
tf.greater(random_prob, probability), lambda: image, |
|
functools.partial(add_black_patch_to_image, image=image, idx=idx)) |
|
return image |
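

# Illustrative usage sketch, not part of the library: cuts out up to 5 black
# patches, each sized at 20% of the smaller image dimension. The parameter
# values and the dummy image are arbitrary assumptions.
def _example_random_black_patches():
  image = tf.random_uniform([256, 256, 3])
  return random_black_patches(
      image, max_black_patches=5, probability=0.5, size_to_image_ratio=0.2)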
|
|
|
|
|
def image_to_float(image): |
|
"""Used in Faster R-CNN. Casts image pixel values to float. |
|
|
|
Args: |
|
image: input image, which might be in tf.uint8 or some other format
|
|
|
Returns: |
|
image: image in tf.float32 format. |
|
""" |
|
with tf.name_scope('ImageToFloat', values=[image]): |
|
image = tf.to_float(image) |
|
return image |
|
|
|
|
|
def random_resize_method(image, target_size, preprocess_vars_cache=None): |
|
"""Uses a random resize method to resize the image to target size. |
|
|
|
Args: |
|
image: a rank 3 tensor. |
|
target_size: a list of [target_height, target_width] |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
resized image. |
|
""" |
|
|
|
resized_image = _apply_with_random_selector( |
|
image, |
|
lambda x, method: tf.image.resize_images(x, target_size, method), |
|
num_cases=4, |
|
preprocess_vars_cache=preprocess_vars_cache, |
|
key=preprocessor_cache.PreprocessorCache.RESIZE_METHOD) |
|
|
|
return resized_image |
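

# Illustrative usage sketch, not part of the library: resizes a dummy image to
# an assumed 320x320 target with a randomly chosen interpolation method.
def _example_random_resize_method():
  image = tf.random_uniform([240, 320, 3])
  return random_resize_method(image, target_size=[320, 320])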
|
|
|
|
|
def resize_to_range(image, |
|
masks=None, |
|
min_dimension=None, |
|
max_dimension=None, |
|
method=tf.image.ResizeMethod.BILINEAR, |
|
align_corners=False, |
|
pad_to_max_dimension=False, |
|
per_channel_pad_value=(0, 0, 0)): |
|
"""Resizes an image so its dimensions are within the provided value. |
|
|
|
The output size can be described by two cases: |
|
1. If the image can be rescaled so its minimum dimension is equal to the |
|
provided value without the other dimension exceeding max_dimension, |
|
then do so. |
|
2. Otherwise, resize so the largest dimension is equal to max_dimension. |
|
|
|
Args: |
|
image: A 3D tensor of shape [height, width, channels] |
|
masks: (optional) rank 3 float32 tensor with shape |
|
[num_instances, height, width] containing instance masks. |
|
min_dimension: (optional) (scalar) desired size of the smaller image |
|
dimension. |
|
max_dimension: (optional) (scalar) maximum allowed size |
|
of the larger image dimension. |
|
method: (optional) interpolation method used in resizing. Defaults to |
|
BILINEAR. |
|
align_corners: bool. If true, exactly align all 4 corners of the input |
|
and output. Defaults to False. |
|
pad_to_max_dimension: Whether to resize the image and pad it with zeros |
|
so the resulting image is of the spatial size |
|
[max_dimension, max_dimension]. If masks are included they are padded |
|
similarly. |
|
per_channel_pad_value: A tuple of per-channel scalar value to use for |
|
padding. By default pads zeros. |
|
|
|
Returns: |
|
Note that the position of the resized_image_shape changes based on whether |
|
masks are present. |
|
resized_image: A 3D tensor of shape [new_height, new_width, channels], |
|
where the image has been resized (with bilinear interpolation) so that |
|
min(new_height, new_width) == min_dimension or |
|
max(new_height, new_width) == max_dimension. |
|
resized_masks: If masks is not None, also outputs masks. A 3D tensor of |
|
shape [num_instances, new_height, new_width]. |
|
resized_image_shape: A 1D tensor of shape [3] containing shape of the |
|
resized image. |
|
|
|
Raises: |
|
ValueError: if the image is not a 3D tensor. |
|
""" |
|
if len(image.get_shape()) != 3: |
|
raise ValueError('Image should be 3D tensor') |
|
|
|
def _resize_landscape_image(image): |
|
|
|
return tf.image.resize_images( |
|
image, tf.stack([min_dimension, max_dimension]), method=method, |
|
align_corners=align_corners, preserve_aspect_ratio=True) |
|
|
|
def _resize_portrait_image(image): |
|
|
|
return tf.image.resize_images( |
|
image, tf.stack([max_dimension, min_dimension]), method=method, |
|
align_corners=align_corners, preserve_aspect_ratio=True) |
|
|
|
with tf.name_scope('ResizeToRange', values=[image, min_dimension]): |
|
if image.get_shape().is_fully_defined(): |
|
if image.get_shape()[0] < image.get_shape()[1]: |
|
new_image = _resize_landscape_image(image) |
|
else: |
|
new_image = _resize_portrait_image(image) |
|
new_size = tf.constant(new_image.get_shape().as_list()) |
|
else: |
|
new_image = tf.cond( |
|
tf.less(tf.shape(image)[0], tf.shape(image)[1]), |
|
lambda: _resize_landscape_image(image), |
|
lambda: _resize_portrait_image(image)) |
|
new_size = tf.shape(new_image) |
|
|
|
if pad_to_max_dimension: |
|
channels = tf.unstack(new_image, axis=2) |
|
if len(channels) != len(per_channel_pad_value): |
|
raise ValueError('Number of channels must be equal to the length of ' |
|
'per-channel pad value.') |
|
new_image = tf.stack( |
|
[ |
|
tf.pad( |
|
channels[i], [[0, max_dimension - new_size[0]], |
|
[0, max_dimension - new_size[1]]], |
|
constant_values=per_channel_pad_value[i]) |
|
for i in range(len(channels)) |
|
], |
|
axis=2) |
|
new_image.set_shape([max_dimension, max_dimension, 3]) |
|
|
|
result = [new_image] |
|
if masks is not None: |
|
new_masks = tf.expand_dims(masks, 3) |
|
new_masks = tf.image.resize_images( |
|
new_masks, |
|
new_size[:-1], |
|
method=tf.image.ResizeMethod.NEAREST_NEIGHBOR, |
|
align_corners=align_corners) |
|
if pad_to_max_dimension: |
|
new_masks = tf.image.pad_to_bounding_box( |
|
new_masks, 0, 0, max_dimension, max_dimension) |
|
new_masks = tf.squeeze(new_masks, 3) |
|
result.append(new_masks) |
|
|
|
result.append(new_size) |
|
return result |
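

# Illustrative usage sketch, not part of the library: a "shorter side to 600,
# longer side capped at 1024" resize, applied to a placeholder image of unknown
# spatial size, as is typical for detection inputs. Values are assumptions.
def _example_resize_to_range():
  image = tf.placeholder(tf.float32, shape=[None, None, 3])
  resized_image, resized_image_shape = resize_to_range(
      image, min_dimension=600, max_dimension=1024)
  return resized_image, resized_image_shape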
|
|
|
|
|
|
|
def resize_to_min_dimension(image, masks=None, min_dimension=600): |
|
"""Resizes image and masks given the min size maintaining the aspect ratio. |
|
|
|
If one of the image dimensions is smaller than min_dimension, it will scale
the image such that its smallest dimension is equal to min_dimension.
Otherwise, the image size is kept as is.
|
|
|
Args: |
|
image: a tensor of size [height, width, channels]. |
|
masks: (optional) a tensors of size [num_instances, height, width]. |
|
min_dimension: minimum image dimension. |
|
|
|
Returns: |
|
Note that the position of the resized_image_shape changes based on whether |
|
masks are present. |
|
resized_image: A tensor of size [new_height, new_width, channels]. |
|
resized_masks: If masks is not None, also outputs masks. A 3D tensor of |
|
shape [num_instances, new_height, new_width] |
|
resized_image_shape: A 1D tensor of shape [3] containing the shape of the |
|
resized image. |
|
|
|
Raises: |
|
ValueError: if the image is not a 3D tensor. |
|
""" |
|
if len(image.get_shape()) != 3: |
|
raise ValueError('Image should be 3D tensor') |
|
|
|
with tf.name_scope('ResizeGivenMinDimension', values=[image, min_dimension]): |
|
image_height = tf.shape(image)[0] |
|
image_width = tf.shape(image)[1] |
|
num_channels = tf.shape(image)[2] |
|
min_image_dimension = tf.minimum(image_height, image_width) |
|
min_target_dimension = tf.maximum(min_image_dimension, min_dimension) |
|
target_ratio = tf.to_float(min_target_dimension) / tf.to_float( |
|
min_image_dimension) |
|
target_height = tf.to_int32(tf.to_float(image_height) * target_ratio) |
|
target_width = tf.to_int32(tf.to_float(image_width) * target_ratio) |
|
image = tf.image.resize_bilinear( |
|
tf.expand_dims(image, axis=0), |
|
size=[target_height, target_width], |
|
align_corners=True) |
|
result = [tf.squeeze(image, axis=0)] |
|
|
|
if masks is not None: |
|
masks = tf.image.resize_nearest_neighbor( |
|
tf.expand_dims(masks, axis=3), |
|
size=[target_height, target_width], |
|
align_corners=True) |
|
result.append(tf.squeeze(masks, axis=3)) |
|
|
|
result.append(tf.stack([target_height, target_width, num_channels])) |
|
return result |
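

# Illustrative usage sketch, not part of the library: upscales a small dummy
# image so its shorter side is at least 600 pixels; larger images would pass
# through unchanged. The input shape is an arbitrary assumption.
def _example_resize_to_min_dimension():
  image = tf.random_uniform([200, 300, 3])
  resized_image, resized_image_shape = resize_to_min_dimension(
      image, min_dimension=600)
  return resized_image, resized_image_shape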
|
|
|
|
|
def scale_boxes_to_pixel_coordinates(image, boxes, keypoints=None): |
|
"""Scales boxes from normalized to pixel coordinates. |
|
|
|
Args: |
|
image: A 3D float32 tensor of shape [height, width, channels]. |
|
boxes: A 2D float32 tensor of shape [num_boxes, 4] containing the bounding |
|
boxes in normalized coordinates. Each row is of the form |
|
[ymin, xmin, ymax, xmax]. |
|
keypoints: (optional) rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2]. The keypoints are in y-x normalized |
|
coordinates. |
|
|
|
Returns: |
|
image: unchanged input image. |
|
scaled_boxes: a 2D float32 tensor of shape [num_boxes, 4] containing the |
|
bounding boxes in pixel coordinates. |
|
scaled_keypoints: a 3D float32 tensor with shape |
|
[num_instances, num_keypoints, 2] containing the keypoints in pixel |
|
coordinates. |
|
""" |
|
boxlist = box_list.BoxList(boxes) |
|
image_height = tf.shape(image)[0] |
|
image_width = tf.shape(image)[1] |
|
scaled_boxes = box_list_ops.scale(boxlist, image_height, image_width).get() |
|
result = [image, scaled_boxes] |
|
if keypoints is not None: |
|
scaled_keypoints = keypoint_ops.scale(keypoints, image_height, image_width) |
|
result.append(scaled_keypoints) |
|
return tuple(result) |
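

# Illustrative usage sketch, not part of the library: converts one normalized
# box to pixel coordinates for a dummy 100x200 image. Values are assumptions.
def _example_scale_boxes_to_pixel_coordinates():
  image = tf.zeros([100, 200, 3], dtype=tf.float32)
  boxes = tf.constant([[0.1, 0.25, 0.5, 0.75]], dtype=tf.float32)
  # The box above becomes [10., 50., 50., 150.] in pixel coordinates.
  return scale_boxes_to_pixel_coordinates(image, boxes)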
|
|
|
|
|
|
|
|
|
|
|
def resize_image(image, |
|
masks=None, |
|
new_height=600, |
|
new_width=1024, |
|
method=tf.image.ResizeMethod.BILINEAR, |
|
align_corners=False): |
|
"""Resizes images to the given height and width. |
|
|
|
Args: |
|
image: A 3D tensor of shape [height, width, channels] |
|
masks: (optional) rank 3 float32 tensor with shape |
|
[num_instances, height, width] containing instance masks. |
|
new_height: (optional) (scalar) desired height of the image. |
|
new_width: (optional) (scalar) desired width of the image. |
|
method: (optional) interpolation method used in resizing. Defaults to |
|
BILINEAR. |
|
align_corners: bool. If true, exactly align all 4 corners of the input |
|
and output. Defaults to False. |
|
|
|
Returns: |
|
Note that the position of the resized_image_shape changes based on whether |
|
masks are present. |
|
resized_image: A tensor of size [new_height, new_width, channels]. |
|
resized_masks: If masks is not None, also outputs masks. A 3D tensor of |
|
shape [num_instances, new_height, new_width] |
|
resized_image_shape: A 1D tensor of shape [3] containing the shape of the |
|
resized image. |
|
""" |
|
with tf.name_scope( |
|
'ResizeImage', |
|
values=[image, new_height, new_width, method, align_corners]): |
|
new_image = tf.image.resize_images( |
|
image, tf.stack([new_height, new_width]), |
|
method=method, |
|
align_corners=align_corners) |
|
image_shape = shape_utils.combined_static_and_dynamic_shape(image) |
|
result = [new_image] |
|
if masks is not None: |
|
num_instances = tf.shape(masks)[0] |
|
new_size = tf.stack([new_height, new_width]) |
|
def resize_masks_branch(): |
|
new_masks = tf.expand_dims(masks, 3) |
|
new_masks = tf.image.resize_nearest_neighbor( |
|
new_masks, new_size, align_corners=align_corners) |
|
new_masks = tf.squeeze(new_masks, axis=3) |
|
return new_masks |
|
|
|
def reshape_masks_branch():
  # The shape function is evaluated for both branches of the tf.cond below,
  # regardless of which branch is taken. When there are no instances, `masks`
  # is empty, so this reshape yields an empty tensor of the expected rank
  # without triggering a shape assertion.
  new_masks = tf.reshape(masks, [-1, new_size[0], new_size[1]])
|
return new_masks |
|
|
|
masks = tf.cond(num_instances > 0, resize_masks_branch, |
|
reshape_masks_branch) |
|
result.append(masks) |
|
|
|
result.append(tf.stack([new_height, new_width, image_shape[2]])) |
|
return result |
|
|
|
|
|
def subtract_channel_mean(image, means=None): |
|
"""Normalizes an image by subtracting a mean from each channel. |
|
|
|
Args: |
|
image: A 3D tensor of shape [height, width, channels] |
|
means: float list containing a mean for each channel |
|
Returns: |
|
normalized_images: a tensor of shape [height, width, channels] |
|
Raises: |
|
ValueError: if image is not a 3D tensor or if the number of means is not
|
equal to the number of channels. |
|
""" |
|
with tf.name_scope('SubtractChannelMean', values=[image, means]): |
|
if len(image.get_shape()) != 3: |
|
raise ValueError('Input must be of size [height, width, channels]') |
|
if len(means) != image.get_shape()[-1]: |
|
raise ValueError('len(means) must match the number of channels') |
|
return image - [[means]] |
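

# Illustrative usage sketch, not part of the library: the mean values below are
# arbitrary assumptions, not the statistics of any particular dataset.
def _example_subtract_channel_mean():
  image = tf.random_uniform([224, 224, 3])
  return subtract_channel_mean(image, means=[0.5, 0.5, 0.5])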
|
|
|
|
|
def one_hot_encoding(labels, num_classes=None): |
|
"""One-hot encodes the multiclass labels. |
|
|
|
Example usage: |
|
labels = tf.constant([1, 4], dtype=tf.int32) |
|
one_hot = one_hot_encoding(labels, num_classes=5)
|
one_hot.eval() # evaluates to [0, 1, 0, 0, 1] |
|
|
|
Args: |
|
labels: A tensor of shape [None] corresponding to the labels. |
|
num_classes: Number of classes in the dataset. |
|
Returns: |
|
onehot_labels: a tensor of shape [num_classes] corresponding to the one hot |
|
encoding of the labels. |
|
Raises: |
|
ValueError: if num_classes is not specified. |
|
""" |
|
with tf.name_scope('OneHotEncoding', values=[labels]): |
|
if num_classes is None: |
|
raise ValueError('num_classes must be specified') |
|
|
|
labels = tf.one_hot(labels, num_classes, 1, 0) |
|
return tf.reduce_max(labels, 0) |
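

# Illustrative usage sketch, not part of the library: reproduces the docstring
# example above as runnable code.
def _example_one_hot_encoding():
  labels = tf.constant([1, 4], dtype=tf.int32)
  # Evaluates to [0, 1, 0, 0, 1]: a single multi-hot vector over 5 classes.
  return one_hot_encoding(labels, num_classes=5)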
|
|
|
|
|
def rgb_to_gray(image): |
|
"""Converts a 3 channel RGB image to a 1 channel grayscale image. |
|
|
|
Args: |
|
image: Rank 3 float32 tensor containing 1 image -> [height, width, 3] |
|
with pixel values varying between [0, 1]. |
|
|
|
Returns: |
|
image: A single channel grayscale image -> [height, width, 1].
|
""" |
|
return _rgb_to_grayscale(image) |
|
|
|
|
|
def random_self_concat_image( |
|
image, boxes, labels, label_weights, label_confidences=None, |
|
multiclass_scores=None, concat_vertical_probability=0.1, |
|
concat_horizontal_probability=0.1, seed=None, |
|
preprocess_vars_cache=None): |
|
"""Randomly concatenates the image with itself. |
|
|
|
This function randomly concatenates the image with itself; the random |
|
variables for vertical and horizontal concatenation are independent. |
|
Afterwards, we adjust the old bounding boxes, and add new bounding boxes |
|
for the new objects. |
|
|
|
Args: |
|
image: rank 3 float32 tensor containing 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 1]. |
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
labels: rank 1 int32 tensor containing the object classes. |
|
label_weights: rank 1 float32 containing the label weights. |
|
label_confidences: (optional) rank 1 float32 containing the label |
|
confidences. |
|
multiclass_scores: (optional) float32 tensor of shape |
|
[num_instances, num_classes] representing the score for |
|
each box for each class. |
|
concat_vertical_probability: (optional) a tf.float32 scalar denoting the |
|
probability of a vertical concatenation. |
|
concat_horizontal_probability: (optional) a tf.float32 scalar denoting the |
|
probability of a horizontal concatenation. |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: Image shape will be [new_height, new_width, channels]. |
|
boxes: boxes which is the same rank as input boxes. Boxes are in normalized |
|
form. |
|
labels: concatenated labels.
label_weights: concatenated label weights.
if label_confidences is not None also returns:
maybe_concat_label_confidences: concatenated label confidences.
if multiclass_scores is not None also returns:
maybe_concat_multiclass_scores: concatenated multiclass scores.
|
""" |
|
|
|
concat_vertical = (tf.random_uniform([], seed=seed) < |
|
concat_vertical_probability) |
|
|
|
|
|
concat_horizontal = (tf.random_uniform([], seed=seed + 1 if seed else None) |
|
< concat_horizontal_probability) |
|
|
|
gen_func = lambda: (concat_vertical, concat_horizontal) |
|
params = _get_or_create_preprocess_rand_vars( |
|
gen_func, preprocessor_cache.PreprocessorCache.SELF_CONCAT_IMAGE, |
|
preprocess_vars_cache) |
|
concat_vertical, concat_horizontal = params |
|
|
|
def _concat_image(image, boxes, labels, label_weights, axis): |
|
"""Concats the image to itself on `axis`.""" |
|
output_images = tf.concat([image, image], axis=axis) |
|
|
|
if axis == 0: |
|
|
|
old_scaling = tf.to_float([0.5, 1.0, 0.5, 1.0]) |
|
new_translation = tf.to_float([0.5, 0.0, 0.5, 0.0]) |
|
elif axis == 1: |
|
old_scaling = tf.to_float([1.0, 0.5, 1.0, 0.5]) |
|
new_translation = tf.to_float([0.0, 0.5, 0.0, 0.5]) |
|
|
|
old_boxes = old_scaling * boxes |
|
new_boxes = old_boxes + new_translation |
|
all_boxes = tf.concat([old_boxes, new_boxes], axis=0) |
|
|
|
return [output_images, all_boxes, tf.tile(labels, [2]), tf.tile( |
|
label_weights, [2])] |
|
|
|
image, boxes, labels, label_weights = tf.cond( |
|
concat_vertical, |
|
lambda: _concat_image(image, boxes, labels, label_weights, axis=0), |
|
lambda: [image, boxes, labels, label_weights], |
|
strict=True) |
|
|
|
outputs = tf.cond( |
|
concat_horizontal, |
|
lambda: _concat_image(image, boxes, labels, label_weights, axis=1), |
|
lambda: [image, boxes, labels, label_weights], |
|
strict=True) |
|
|
|
if label_confidences is not None: |
|
label_confidences = tf.cond(concat_vertical, |
|
lambda: tf.tile(label_confidences, [2]), |
|
lambda: label_confidences) |
|
outputs.append(tf.cond(concat_horizontal, |
|
lambda: tf.tile(label_confidences, [2]), |
|
lambda: label_confidences)) |
|
|
|
if multiclass_scores is not None: |
|
multiclass_scores = tf.cond(concat_vertical, |
|
lambda: tf.tile(multiclass_scores, [2, 1]), |
|
lambda: multiclass_scores) |
|
outputs.append(tf.cond(concat_horizontal, |
|
lambda: tf.tile(multiclass_scores, [2, 1]), |
|
lambda: multiclass_scores)) |
|
|
|
return outputs |
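

# Illustrative usage sketch, not part of the library: uses high concatenation
# probabilities so the effect is easy to observe. Dummy inputs are assumptions.
def _example_random_self_concat_image():
  image = tf.random_uniform([200, 200, 3])
  boxes = tf.constant([[0.1, 0.1, 0.4, 0.4]], dtype=tf.float32)
  labels = tf.constant([2], dtype=tf.int32)
  label_weights = tf.constant([1.0], dtype=tf.float32)
  # Returns [image, boxes, labels, label_weights]; when a concatenation fires,
  # the boxes, labels and weights are duplicated for the repeated objects.
  return random_self_concat_image(
      image, boxes, labels, label_weights,
      concat_vertical_probability=0.9, concat_horizontal_probability=0.9)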
|
|
|
|
|
def ssd_random_crop(image, |
|
boxes, |
|
labels, |
|
label_weights, |
|
label_confidences=None, |
|
multiclass_scores=None, |
|
masks=None, |
|
keypoints=None, |
|
min_object_covered=(0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0), |
|
aspect_ratio_range=((0.5, 2.0),) * 7, |
|
area_range=((0.1, 1.0),) * 7, |
|
overlap_thresh=(0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0), |
|
clip_boxes=(True,) * 7, |
|
random_coef=(0.15,) * 7, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Random crop preprocessing with default parameters as in SSD paper. |
|
|
|
Liu et al., SSD: Single shot multibox detector. |
|
For further information on random crop preprocessing refer to RandomCrop |
|
function above. |
|
|
|
Args: |
|
image: rank 3 float32 tensor contains 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 1]. |
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
labels: rank 1 int32 tensor containing the object classes. |
|
label_weights: rank 1 float32 tensor containing the weights. |
|
label_confidences: rank 1 float32 tensor containing the confidences. |
|
multiclass_scores: (optional) float32 tensor of shape |
|
[num_instances, num_classes] representing the score for each box for each |
|
class. |
|
masks: (optional) rank 3 float32 tensor with shape |
|
[num_instances, height, width] containing instance masks. The masks |
|
are of the same height, width as the input `image`. |
|
keypoints: (optional) rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2]. The keypoints are in y-x |
|
normalized coordinates. |
|
min_object_covered: the cropped image must cover at least this fraction of |
|
at least one of the input bounding boxes. |
|
aspect_ratio_range: allowed range for aspect ratio of cropped image. |
|
area_range: allowed range for area ratio between cropped image and the |
|
original image. |
|
overlap_thresh: minimum overlap thresh with new cropped |
|
image to keep the box. |
|
clip_boxes: whether to clip the boxes to the cropped image. |
|
random_coef: a random coefficient that defines the chance of getting the |
|
original image. If random_coef is 0, we will always get the |
|
cropped image, and if it is 1.0, we will always get the |
|
original image. |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same rank as input image. |
|
boxes: boxes which is the same rank as input boxes. |
|
Boxes are in normalized form. |
|
labels: new labels. |
|
|
|
If label_weights, multiclass_scores, masks, or keypoints is not None, the |
|
function also returns: |
|
label_weights: rank 1 float32 tensor with shape [num_instances]. |
|
multiclass_scores: rank 2 float32 tensor with shape |
|
[num_instances, num_classes] |
|
masks: rank 3 float32 tensor with shape [num_instances, height, width] |
|
containing instance masks. |
|
keypoints: rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2] |
|
""" |
|
|
|
def random_crop_selector(selected_result, index): |
|
"""Applies random_crop_image to selected result. |
|
|
|
Args: |
|
selected_result: A tuple containing image, boxes, labels, keypoints (if |
|
not None), and masks (if not None). |
|
index: The index that was randomly selected. |
|
|
|
Returns: A tuple containing image, boxes, labels, keypoints (if not None), |
|
and masks (if not None). |
|
""" |
|
|
|
i = 3 |
|
image, boxes, labels = selected_result[:i] |
|
selected_label_weights = None |
|
selected_label_confidences = None |
|
selected_multiclass_scores = None |
|
selected_masks = None |
|
selected_keypoints = None |
|
if label_weights is not None: |
|
selected_label_weights = selected_result[i] |
|
i += 1 |
|
if label_confidences is not None: |
|
selected_label_confidences = selected_result[i] |
|
i += 1 |
|
if multiclass_scores is not None: |
|
selected_multiclass_scores = selected_result[i] |
|
i += 1 |
|
if masks is not None: |
|
selected_masks = selected_result[i] |
|
i += 1 |
|
if keypoints is not None: |
|
selected_keypoints = selected_result[i] |
|
|
|
return random_crop_image( |
|
image=image, |
|
boxes=boxes, |
|
labels=labels, |
|
label_weights=selected_label_weights, |
|
label_confidences=selected_label_confidences, |
|
multiclass_scores=selected_multiclass_scores, |
|
masks=selected_masks, |
|
keypoints=selected_keypoints, |
|
min_object_covered=min_object_covered[index], |
|
aspect_ratio_range=aspect_ratio_range[index], |
|
area_range=area_range[index], |
|
overlap_thresh=overlap_thresh[index], |
|
clip_boxes=clip_boxes[index], |
|
random_coef=random_coef[index], |
|
seed=seed, |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
|
|
result = _apply_with_random_selector_tuples( |
|
tuple( |
|
t for t in (image, boxes, labels, label_weights, label_confidences, |
|
multiclass_scores, masks, keypoints) if t is not None), |
|
random_crop_selector, |
|
num_cases=len(min_object_covered), |
|
preprocess_vars_cache=preprocess_vars_cache, |
|
key=preprocessor_cache.PreprocessorCache.SSD_CROP_SELECTOR_ID) |
|
return result |
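

# Illustrative usage sketch, not part of the library: SSD-style random crop
# with the default parameter tuples. The dummy inputs are arbitrary assumptions.
def _example_ssd_random_crop():
  image = tf.random_uniform([300, 300, 3])
  boxes = tf.constant([[0.2, 0.2, 0.8, 0.8]], dtype=tf.float32)
  labels = tf.constant([5], dtype=tf.int32)
  label_weights = tf.constant([1.0], dtype=tf.float32)
  # One of the seven (min_object_covered, aspect_ratio_range, ...) settings is
  # selected at random; returns (image, boxes, labels, label_weights).
  return ssd_random_crop(image, boxes, labels, label_weights)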
|
|
|
|
|
def ssd_random_crop_pad(image, |
|
boxes, |
|
labels, |
|
label_weights, |
|
label_confidences=None, |
|
multiclass_scores=None, |
|
min_object_covered=(0.1, 0.3, 0.5, 0.7, 0.9, 1.0), |
|
aspect_ratio_range=((0.5, 2.0),) * 6, |
|
area_range=((0.1, 1.0),) * 6, |
|
overlap_thresh=(0.1, 0.3, 0.5, 0.7, 0.9, 1.0), |
|
clip_boxes=(True,) * 6, |
|
random_coef=(0.15,) * 6, |
|
min_padded_size_ratio=((1.0, 1.0),) * 6, |
|
max_padded_size_ratio=((2.0, 2.0),) * 6, |
|
pad_color=(None,) * 6, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Random crop preprocessing with default parameters as in SSD paper. |
|
|
|
Liu et al., SSD: Single shot multibox detector. |
|
For further information on random crop preprocessing refer to RandomCrop |
|
function above. |
|
|
|
Args: |
|
image: rank 3 float32 tensor containing 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 1]. |
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
labels: rank 1 int32 tensor containing the object classes. |
|
label_weights: float32 tensor of shape [num_instances] representing the |
|
weight for each box. |
|
label_confidences: float32 tensor of shape [num_instances] representing the |
|
confidences for each box. |
|
multiclass_scores: (optional) float32 tensor of shape |
|
[num_instances, num_classes] representing the score for each box for each |
|
class. |
|
min_object_covered: the cropped image must cover at least this fraction of |
|
at least one of the input bounding boxes. |
|
aspect_ratio_range: allowed range for aspect ratio of cropped image. |
|
area_range: allowed range for area ratio between cropped image and the |
|
original image. |
|
overlap_thresh: minimum overlap thresh with new cropped |
|
image to keep the box. |
|
clip_boxes: whether to clip the boxes to the cropped image. |
|
random_coef: a random coefficient that defines the chance of getting the |
|
original image. If random_coef is 0, we will always get the |
|
cropped image, and if it is 1.0, we will always get the |
|
original image. |
|
min_padded_size_ratio: min ratio of padded image height and width to the |
|
input image's height and width. |
|
max_padded_size_ratio: max ratio of padded image height and width to the |
|
input image's height and width. |
|
pad_color: padding color. A rank 1 tensor of [3] with dtype=tf.float32. |
|
if set as None, it will be set to average color of the randomly |
|
cropped image. |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: Image shape will be [new_height, new_width, channels]. |
|
boxes: boxes which is the same rank as input boxes. Boxes are in normalized |
|
form. |
|
new_labels: new labels. |
|
new_label_weights: new label weights. |
|
""" |
|
|
|
def random_crop_pad_selector(image_boxes_labels, index): |
|
"""Random crop preprocessing helper.""" |
|
i = 3 |
|
image, boxes, labels = image_boxes_labels[:i] |
|
selected_label_weights = None |
|
selected_label_confidences = None |
|
selected_multiclass_scores = None |
|
if label_weights is not None: |
|
selected_label_weights = image_boxes_labels[i] |
|
i += 1 |
|
if label_confidences is not None: |
|
selected_label_confidences = image_boxes_labels[i] |
|
i += 1 |
|
if multiclass_scores is not None: |
|
selected_multiclass_scores = image_boxes_labels[i] |
|
|
|
return random_crop_pad_image( |
|
image, |
|
boxes, |
|
labels, |
|
label_weights=selected_label_weights, |
|
label_confidences=selected_label_confidences, |
|
multiclass_scores=selected_multiclass_scores, |
|
min_object_covered=min_object_covered[index], |
|
aspect_ratio_range=aspect_ratio_range[index], |
|
area_range=area_range[index], |
|
overlap_thresh=overlap_thresh[index], |
|
clip_boxes=clip_boxes[index], |
|
random_coef=random_coef[index], |
|
min_padded_size_ratio=min_padded_size_ratio[index], |
|
max_padded_size_ratio=max_padded_size_ratio[index], |
|
pad_color=pad_color[index], |
|
seed=seed, |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
|
|
return _apply_with_random_selector_tuples( |
|
tuple(t for t in (image, boxes, labels, label_weights, label_confidences, |
|
multiclass_scores) if t is not None), |
|
random_crop_pad_selector, |
|
num_cases=len(min_object_covered), |
|
preprocess_vars_cache=preprocess_vars_cache, |
|
key=preprocessor_cache.PreprocessorCache.SSD_CROP_PAD_SELECTOR_ID) |
|
|
|
|
|
def ssd_random_crop_fixed_aspect_ratio( |
|
image, |
|
boxes, |
|
labels, |
|
label_weights, |
|
label_confidences=None, |
|
multiclass_scores=None, |
|
masks=None, |
|
keypoints=None, |
|
min_object_covered=(0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0), |
|
aspect_ratio=1.0, |
|
area_range=((0.1, 1.0),) * 7, |
|
overlap_thresh=(0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0), |
|
clip_boxes=(True,) * 7, |
|
random_coef=(0.15,) * 7, |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Random crop preprocessing with default parameters as in SSD paper. |
|
|
|
Liu et al., SSD: Single shot multibox detector. |
|
For further information on random crop preprocessing refer to RandomCrop |
|
function above. |
|
|
|
The only difference is that the aspect ratio of the crops is fixed.
|
|
|
Args: |
|
image: rank 3 float32 tensor contains 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 1]. |
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
labels: rank 1 int32 tensor containing the object classes. |
|
label_weights: float32 tensor of shape [num_instances] representing the |
|
weight for each box. |
|
label_confidences: (optional) float32 tensor of shape [num_instances] |
|
representing the confidences for each box. |
|
multiclass_scores: (optional) float32 tensor of shape |
|
[num_instances, num_classes] representing the score for each box for each |
|
class. |
|
masks: (optional) rank 3 float32 tensor with shape |
|
[num_instances, height, width] containing instance masks. The masks |
|
are of the same height, width as the input `image`. |
|
keypoints: (optional) rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2]. The keypoints are in y-x |
|
normalized coordinates. |
|
min_object_covered: the cropped image must cover at least this fraction of |
|
at least one of the input bounding boxes. |
|
aspect_ratio: aspect ratio of the cropped image. |
|
area_range: allowed range for area ratio between cropped image and the |
|
original image. |
|
overlap_thresh: minimum overlap thresh with new cropped |
|
image to keep the box. |
|
clip_boxes: whether to clip the boxes to the cropped image. |
|
random_coef: a random coefficient that defines the chance of getting the |
|
original image. If random_coef is 0, we will always get the |
|
cropped image, and if it is 1.0, we will always get the |
|
original image. |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
image: image which is the same rank as input image. |
|
boxes: boxes which is the same rank as input boxes. |
|
Boxes are in normalized form. |
|
labels: new labels. |
|
|
|
If multiclass_scores, masks, or keypoints is not None, the function also |
|
returns: |
|
|
|
multiclass_scores: rank 2 float32 tensor with shape |
|
[num_instances, num_classes] |
|
masks: rank 3 float32 tensor with shape [num_instances, height, width] |
|
containing instance masks. |
|
keypoints: rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2] |
|
""" |
|
aspect_ratio_range = ((aspect_ratio, aspect_ratio),) * len(area_range) |
|
|
|
crop_result = ssd_random_crop( |
|
image, |
|
boxes, |
|
labels, |
|
label_weights=label_weights, |
|
label_confidences=label_confidences, |
|
multiclass_scores=multiclass_scores, |
|
masks=masks, |
|
keypoints=keypoints, |
|
min_object_covered=min_object_covered, |
|
aspect_ratio_range=aspect_ratio_range, |
|
area_range=area_range, |
|
overlap_thresh=overlap_thresh, |
|
clip_boxes=clip_boxes, |
|
random_coef=random_coef, |
|
seed=seed, |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
i = 3 |
|
new_image, new_boxes, new_labels = crop_result[:i] |
|
new_label_weights = None |
|
new_label_confidences = None |
|
new_multiclass_scores = None |
|
new_masks = None |
|
new_keypoints = None |
|
if label_weights is not None: |
|
new_label_weights = crop_result[i] |
|
i += 1 |
|
if label_confidences is not None: |
|
new_label_confidences = crop_result[i] |
|
i += 1 |
|
if multiclass_scores is not None: |
|
new_multiclass_scores = crop_result[i] |
|
i += 1 |
|
if masks is not None: |
|
new_masks = crop_result[i] |
|
i += 1 |
|
if keypoints is not None: |
|
new_keypoints = crop_result[i] |
|
|
|
result = random_crop_to_aspect_ratio( |
|
new_image, |
|
new_boxes, |
|
new_labels, |
|
label_weights=new_label_weights, |
|
label_confidences=new_label_confidences, |
|
multiclass_scores=new_multiclass_scores, |
|
masks=new_masks, |
|
keypoints=new_keypoints, |
|
aspect_ratio=aspect_ratio, |
|
clip_boxes=clip_boxes, |
|
seed=seed, |
|
preprocess_vars_cache=preprocess_vars_cache) |
|
|
|
return result |
|
|
|
|
|
def ssd_random_crop_pad_fixed_aspect_ratio( |
|
image, |
|
boxes, |
|
labels, |
|
label_weights, |
|
label_confidences=None, |
|
multiclass_scores=None, |
|
masks=None, |
|
keypoints=None, |
|
min_object_covered=(0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0), |
|
aspect_ratio=1.0, |
|
aspect_ratio_range=((0.5, 2.0),) * 7, |
|
area_range=((0.1, 1.0),) * 7, |
|
overlap_thresh=(0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0), |
|
clip_boxes=(True,) * 7, |
|
random_coef=(0.15,) * 7, |
|
min_padded_size_ratio=(1.0, 1.0), |
|
max_padded_size_ratio=(2.0, 2.0), |
|
seed=None, |
|
preprocess_vars_cache=None): |
|
"""Random crop and pad preprocessing with default parameters as in SSD paper. |
|
|
|
Liu et al., SSD: Single shot multibox detector. |
|
For further information on random crop preprocessing refer to RandomCrop |
|
function above. |
|
|
|
The only difference is that after the initial crop, images are zero-padded |
|
to a fixed aspect ratio instead of being resized to that aspect ratio. |
|
|
|
Args: |
|
image: rank 3 float32 tensor contains 1 image -> [height, width, channels] |
|
with pixel values varying between [0, 1]. |
|
boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4]. |
|
Boxes are in normalized form meaning their coordinates vary |
|
between [0, 1]. |
|
Each row is in the form of [ymin, xmin, ymax, xmax]. |
|
labels: rank 1 int32 tensor containing the object classes. |
|
label_weights: float32 tensor of shape [num_instances] representing the |
|
weight for each box. |
|
label_confidences: (optional) float32 tensor of shape [num_instances] |
|
representing the confidence for each box. |
|
multiclass_scores: (optional) float32 tensor of shape |
|
[num_instances, num_classes] representing the score for each box for each |
|
class. |
|
masks: (optional) rank 3 float32 tensor with shape |
|
[num_instances, height, width] containing instance masks. The masks |
|
are of the same height, width as the input `image`. |
|
keypoints: (optional) rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2]. The keypoints are in y-x |
|
normalized coordinates. |
|
min_object_covered: the cropped image must cover at least this fraction of |
|
at least one of the input bounding boxes. |
|
aspect_ratio: the final aspect ratio to pad to. |
|
aspect_ratio_range: allowed range for aspect ratio of cropped image. |
|
area_range: allowed range for area ratio between cropped image and the |
|
original image. |
|
    overlap_thresh: minimum overlap threshold with the new cropped
                    image required to keep a box.
|
clip_boxes: whether to clip the boxes to the cropped image. |
|
random_coef: a random coefficient that defines the chance of getting the |
|
original image. If random_coef is 0, we will always get the |
|
cropped image, and if it is 1.0, we will always get the |
|
original image. |
|
min_padded_size_ratio: min ratio of padded image height and width to the |
|
input image's height and width. |
|
max_padded_size_ratio: max ratio of padded image height and width to the |
|
input image's height and width. |
|
seed: random seed. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
    image: image of the same rank as the input image.
    boxes: boxes of the same rank as the input boxes.
           Boxes are in normalized form.
|
labels: new labels. |
|
|
|
If multiclass_scores, masks, or keypoints is not None, the function also |
|
returns: |
|
|
|
    multiclass_scores: rank 2 float32 tensor with shape
                       [num_instances, num_classes]
|
masks: rank 3 float32 tensor with shape [num_instances, height, width] |
|
containing instance masks. |
|
keypoints: rank 3 float32 tensor with shape |
|
[num_instances, num_keypoints, 2] |
|
""" |
|
crop_result = ssd_random_crop( |
|
image, |
|
boxes, |
|
labels, |
|
label_weights=label_weights, |
|
label_confidences=label_confidences, |
|
multiclass_scores=multiclass_scores, |
|
masks=masks, |
|
keypoints=keypoints, |
|
min_object_covered=min_object_covered, |
|
aspect_ratio_range=aspect_ratio_range, |
|
area_range=area_range, |
|
overlap_thresh=overlap_thresh, |
|
clip_boxes=clip_boxes, |
|
random_coef=random_coef, |
|
seed=seed, |
|
preprocess_vars_cache=preprocess_vars_cache) |
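  # Unpack the optional outputs of ssd_random_crop as above; `i` tracks the
  # position of the next optional tensor in the result tuple.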
|
i = 3 |
|
new_image, new_boxes, new_labels = crop_result[:i] |
|
new_label_weights = None |
|
new_label_confidences = None |
|
new_multiclass_scores = None |
|
new_masks = None |
|
new_keypoints = None |
|
if label_weights is not None: |
|
new_label_weights = crop_result[i] |
|
i += 1 |
|
if label_confidences is not None: |
|
new_label_confidences = crop_result[i] |
|
i += 1 |
|
if multiclass_scores is not None: |
|
new_multiclass_scores = crop_result[i] |
|
i += 1 |
|
if masks is not None: |
|
new_masks = crop_result[i] |
|
i += 1 |
|
if keypoints is not None: |
|
new_keypoints = crop_result[i] |
|
|
|
result = random_pad_to_aspect_ratio( |
|
new_image, |
|
new_boxes, |
|
masks=new_masks, |
|
keypoints=new_keypoints, |
|
aspect_ratio=aspect_ratio, |
|
min_padded_size_ratio=min_padded_size_ratio, |
|
max_padded_size_ratio=max_padded_size_ratio, |
|
seed=seed, |
|
preprocess_vars_cache=preprocess_vars_cache) |
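  # random_pad_to_aspect_ratio only receives the image, boxes, masks and
  # keypoints, so the labels, weights, confidences and multiclass scores are
  # spliced back into the result tuple at their conventional positions below.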
|
|
|
result = list(result) |
|
i = 3 |
|
result.insert(2, new_labels) |
|
if new_label_weights is not None: |
|
result.insert(i, new_label_weights) |
|
i += 1 |
|
if new_label_confidences is not None: |
|
result.insert(i, new_label_confidences) |
|
i += 1 |
|
  if new_multiclass_scores is not None:
|
result.insert(i, new_multiclass_scores) |
|
result = tuple(result) |
|
|
|
return result |
|
|
|
|
|
def convert_class_logits_to_softmax(multiclass_scores, temperature=1.0): |
|
"""Converts multiclass logits to softmax scores after applying temperature. |
|
|
|
Args: |
|
multiclass_scores: float32 tensor of shape |
|
[num_instances, num_classes] representing the score for each box for each |
|
class. |
|
    temperature: scale factor the logits are divided by prior to applying
      softmax. Larger temperatures give more uniform distributions after
      softmax.
|
|
|
Returns: |
|
multiclass_scores: float32 tensor of shape |
|
[num_instances, num_classes] with scaling and softmax applied. |
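
  Example (illustrative sketch; `logits` stands for any
  [num_instances, num_classes] float32 tensor of class logits):

    logits = tf.constant([[2.0, 1.0, 0.1]])
    scores = convert_class_logits_to_softmax(logits, temperature=2.0)
    # Each row of `scores` sums to 1; raising the temperature flattens the
    # distribution, lowering it sharpens it.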
|
""" |
|
|
|
|
|
multiclass_scores_scaled = tf.divide( |
|
multiclass_scores, temperature, name='scale_logits') |
|
multiclass_scores = tf.nn.softmax(multiclass_scores_scaled, name='softmax') |
|
|
|
return multiclass_scores |
|
|
|
|
|
def get_default_func_arg_map(include_label_weights=True, |
|
include_label_confidences=False, |
|
include_multiclass_scores=False, |
|
include_instance_masks=False, |
|
include_keypoints=False): |
|
"""Returns the default mapping from a preprocessor function to its args. |
|
|
|
Args: |
|
include_label_weights: If True, preprocessing functions will modify the |
|
label weights, too. |
|
include_label_confidences: If True, preprocessing functions will modify the |
|
label confidences, too. |
|
include_multiclass_scores: If True, preprocessing functions will modify the |
|
multiclass scores, too. |
|
include_instance_masks: If True, preprocessing functions will modify the |
|
instance masks, too. |
|
include_keypoints: If True, preprocessing functions will modify the |
|
keypoints, too. |
|
|
|
Returns: |
|
A map from preprocessing functions to the arguments they receive. |
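
  Example (illustrative sketch; assumes the input tensor_dict carries
  instance masks under the standard field name):

    func_arg_map = get_default_func_arg_map(include_instance_masks=True)
    tensor_dict = preprocess(tensor_dict,
                             [(random_horizontal_flip, {})],
                             func_arg_map=func_arg_map)
    # Each augmentation now also receives and returns the instance masks.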
|
""" |
|
groundtruth_label_weights = None |
|
if include_label_weights: |
|
groundtruth_label_weights = ( |
|
fields.InputDataFields.groundtruth_weights) |
|
|
|
groundtruth_label_confidences = None |
|
if include_label_confidences: |
|
groundtruth_label_confidences = ( |
|
fields.InputDataFields.groundtruth_confidences) |
|
|
|
multiclass_scores = None |
|
if include_multiclass_scores: |
|
multiclass_scores = (fields.InputDataFields.multiclass_scores) |
|
|
|
groundtruth_instance_masks = None |
|
if include_instance_masks: |
|
groundtruth_instance_masks = ( |
|
fields.InputDataFields.groundtruth_instance_masks) |
|
|
|
groundtruth_keypoints = None |
|
if include_keypoints: |
|
groundtruth_keypoints = fields.InputDataFields.groundtruth_keypoints |
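
  # Fields that remain None are forwarded to the preprocessing functions as
  # None placeholders; `preprocess` only writes results back for the non-None
  # argument names.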
|
|
|
prep_func_arg_map = { |
|
normalize_image: (fields.InputDataFields.image,), |
|
random_horizontal_flip: ( |
|
fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes, |
|
groundtruth_instance_masks, |
|
groundtruth_keypoints, |
|
), |
|
random_vertical_flip: ( |
|
fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes, |
|
groundtruth_instance_masks, |
|
groundtruth_keypoints, |
|
), |
|
random_rotation90: ( |
|
fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes, |
|
groundtruth_instance_masks, |
|
groundtruth_keypoints, |
|
), |
|
random_pixel_value_scale: (fields.InputDataFields.image,), |
|
random_image_scale: ( |
|
fields.InputDataFields.image, |
|
groundtruth_instance_masks, |
|
), |
|
random_rgb_to_gray: (fields.InputDataFields.image,), |
|
random_adjust_brightness: (fields.InputDataFields.image,), |
|
random_adjust_contrast: (fields.InputDataFields.image,), |
|
random_adjust_hue: (fields.InputDataFields.image,), |
|
random_adjust_saturation: (fields.InputDataFields.image,), |
|
random_distort_color: (fields.InputDataFields.image,), |
|
random_jitter_boxes: (fields.InputDataFields.groundtruth_boxes,), |
|
random_crop_image: (fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes, |
|
fields.InputDataFields.groundtruth_classes, |
|
groundtruth_label_weights, |
|
groundtruth_label_confidences, |
|
multiclass_scores, |
|
groundtruth_instance_masks, |
|
groundtruth_keypoints), |
|
random_pad_image: (fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes, |
|
groundtruth_keypoints), |
|
random_absolute_pad_image: (fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes), |
|
random_crop_pad_image: (fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes, |
|
fields.InputDataFields.groundtruth_classes, |
|
groundtruth_label_weights, |
|
groundtruth_label_confidences, |
|
multiclass_scores), |
|
random_crop_to_aspect_ratio: ( |
|
fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes, |
|
fields.InputDataFields.groundtruth_classes, |
|
groundtruth_label_weights, |
|
groundtruth_label_confidences, |
|
multiclass_scores, |
|
groundtruth_instance_masks, |
|
groundtruth_keypoints, |
|
), |
|
random_pad_to_aspect_ratio: ( |
|
fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes, |
|
groundtruth_instance_masks, |
|
groundtruth_keypoints, |
|
), |
|
random_black_patches: (fields.InputDataFields.image,), |
|
retain_boxes_above_threshold: ( |
|
fields.InputDataFields.groundtruth_boxes, |
|
fields.InputDataFields.groundtruth_classes, |
|
groundtruth_label_weights, |
|
groundtruth_label_confidences, |
|
multiclass_scores, |
|
groundtruth_instance_masks, |
|
groundtruth_keypoints, |
|
), |
|
image_to_float: (fields.InputDataFields.image,), |
|
random_resize_method: (fields.InputDataFields.image,), |
|
resize_to_range: ( |
|
fields.InputDataFields.image, |
|
groundtruth_instance_masks, |
|
), |
|
resize_to_min_dimension: ( |
|
fields.InputDataFields.image, |
|
groundtruth_instance_masks, |
|
), |
|
scale_boxes_to_pixel_coordinates: ( |
|
fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes, |
|
groundtruth_keypoints, |
|
), |
|
resize_image: ( |
|
fields.InputDataFields.image, |
|
groundtruth_instance_masks, |
|
), |
|
subtract_channel_mean: (fields.InputDataFields.image,), |
|
one_hot_encoding: (fields.InputDataFields.groundtruth_image_classes,), |
|
rgb_to_gray: (fields.InputDataFields.image,), |
|
random_self_concat_image: (fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes, |
|
fields.InputDataFields.groundtruth_classes, |
|
groundtruth_label_weights, |
|
groundtruth_label_confidences, |
|
multiclass_scores), |
|
ssd_random_crop: (fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes, |
|
fields.InputDataFields.groundtruth_classes, |
|
groundtruth_label_weights, |
|
groundtruth_label_confidences, |
|
multiclass_scores, |
|
groundtruth_instance_masks, |
|
groundtruth_keypoints), |
|
ssd_random_crop_pad: (fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes, |
|
fields.InputDataFields.groundtruth_classes, |
|
groundtruth_label_weights, |
|
groundtruth_label_confidences, |
|
multiclass_scores), |
|
ssd_random_crop_fixed_aspect_ratio: ( |
|
fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes, |
|
fields.InputDataFields.groundtruth_classes, |
|
groundtruth_label_weights, |
|
groundtruth_label_confidences, |
|
multiclass_scores, |
|
groundtruth_instance_masks, |
|
groundtruth_keypoints), |
|
ssd_random_crop_pad_fixed_aspect_ratio: ( |
|
fields.InputDataFields.image, |
|
fields.InputDataFields.groundtruth_boxes, |
|
fields.InputDataFields.groundtruth_classes, |
|
groundtruth_label_weights, |
|
groundtruth_label_confidences, |
|
multiclass_scores, |
|
groundtruth_instance_masks, |
|
groundtruth_keypoints, |
|
), |
|
convert_class_logits_to_softmax: (multiclass_scores,), |
|
} |
|
|
|
return prep_func_arg_map |
|
|
|
|
|
def preprocess(tensor_dict, |
|
preprocess_options, |
|
func_arg_map=None, |
|
preprocess_vars_cache=None): |
|
"""Preprocess images and bounding boxes. |
|
|
|
  Various types of preprocessing are applied based on the preprocess_options
  list, e.g. "crop image" (affects the image and possibly the boxes), "white
  balance image" (affects only the image), etc. If preprocess_options is
  empty, no preprocessing is done.
|
|
|
Args: |
|
tensor_dict: dictionary that contains images, boxes, and can contain other |
|
things as well. |
|
                 images -> rank 4 float32 tensor containing a single image:
                           [1, height, width, 3], with pixel values varying
                           between [0, 1].
                 boxes -> rank 2 float32 tensor containing the bounding
                          boxes: [N, 4]. Boxes are in normalized form,
                          meaning their coordinates vary between [0, 1].
                          Each row is in the form [ymin, xmin, ymax, xmax].
|
    preprocess_options: a list of tuples, where each tuple contains a
                        preprocessing function and a dictionary mapping its
                        argument names to their values.
|
func_arg_map: mapping from preprocessing functions to arguments that they |
|
expect to receive and return. |
|
preprocess_vars_cache: PreprocessorCache object that records previously |
|
performed augmentations. Updated in-place. If this |
|
function is called multiple times with the same |
|
non-null cache, it will perform deterministically. |
|
|
|
Returns: |
|
tensor_dict: which contains the preprocessed images, bounding boxes, etc. |
|
|
|
Raises: |
|
ValueError: (a) If the functions passed to Preprocess |
|
are not in func_arg_map. |
|
(b) If the arguments that a function needs |
|
do not exist in tensor_dict. |
|
(c) If image in tensor_dict is not rank 4 |
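
  Example (illustrative sketch; `tensor_dict` is assumed to already hold a
  rank 4 image and normalized groundtruth boxes under the standard field
  names):

    preprocess_options = [(random_horizontal_flip, {}),
                          (random_adjust_brightness, {'max_delta': 0.2})]
    cache = preprocessor_cache.PreprocessorCache()
    tensor_dict = preprocess(tensor_dict, preprocess_options,
                             preprocess_vars_cache=cache)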
|
""" |
|
if func_arg_map is None: |
|
func_arg_map = get_default_func_arg_map() |
|
|
|
|
|
|
|
if fields.InputDataFields.image in tensor_dict: |
|
images = tensor_dict[fields.InputDataFields.image] |
|
if len(images.get_shape()) != 4: |
|
raise ValueError('images in tensor_dict should be rank 4') |
|
image = tf.squeeze(images, axis=0) |
|
tensor_dict[fields.InputDataFields.image] = image |
|
|
|
|
|
for option in preprocess_options: |
|
func, params = option |
|
if func not in func_arg_map: |
|
raise ValueError('The function %s does not exist in func_arg_map' % |
|
(func.__name__)) |
|
arg_names = func_arg_map[func] |
|
for a in arg_names: |
|
if a is not None and a not in tensor_dict: |
|
raise ValueError('The function %s requires argument %s' % |
|
(func.__name__, a)) |
|
|
|
def get_arg(key): |
|
return tensor_dict[key] if key is not None else None |
|
|
|
args = [get_arg(a) for a in arg_names] |
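    # Only inject the cache into functions whose signature accepts it; other
    # augmentations are left untouched.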
|
if (preprocess_vars_cache is not None and |
|
        'preprocess_vars_cache' in inspect.getfullargspec(func).args):
|
params['preprocess_vars_cache'] = preprocess_vars_cache |
|
|
|
results = func(*args, **params) |
|
if not isinstance(results, (list, tuple)): |
|
results = (results,) |
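    # Each function returns its outputs in the same order as its (non-None)
    # inputs, so the results can be zipped straight back into tensor_dict.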
|
|
|
arg_names = [arg_name for arg_name in arg_names if arg_name is not None] |
|
for res, arg_name in zip(results, arg_names): |
|
tensor_dict[arg_name] = res |
|
|
|
|
|
|
|
if fields.InputDataFields.image in tensor_dict: |
|
image = tensor_dict[fields.InputDataFields.image] |
|
images = tf.expand_dims(image, 0) |
|
tensor_dict[fields.InputDataFields.image] = images |
|
|
|
return tensor_dict |
|
|