"""Gym environment for the ActiveVision Dataset. |
|
|
|
The dataset is captured by a robot that moves around and takes pictures in
multiple directions. The actions are moving in four directions and rotating
clockwise or counterclockwise. The observations are the outputs of vision
pipelines such as object detectors. The goal is to find objects of interest
in each environment. For more details, see:
http://cs.unc.edu/~ammirato/active_vision_dataset_website/.
|
""" |
|
import collections
import copy
import json
import os
from StringIO import StringIO
import time

from absl import logging
import cv2
import gin
import gym
from gym.envs.registration import register
import gym.spaces
import networkx as nx
import numpy as np
import scipy.io as sio
import tensorflow as tf

import label_map_util
import visualization_utils as vis_util
from envs import task_env
|
|
|
|
|
register( |
|
id='active-vision-env-v0', |
|
entry_point= |
|
'cognitive_planning.envs.active_vision_dataset_env:ActiveVisionDatasetEnv', |
|
) |
|
|
|
_MAX_DEPTH_VALUE = 12102 |
|
|
|
SUPPORTED_ACTIONS = [ |
|
'right', 'rotate_cw', 'rotate_ccw', 'forward', 'left', 'backward', 'stop' |
|
] |
|
SUPPORTED_MODALITIES = [ |
|
task_env.ModalityTypes.SEMANTIC_SEGMENTATION, |
|
task_env.ModalityTypes.DEPTH, |
|
task_env.ModalityTypes.OBJECT_DETECTION, |
|
task_env.ModalityTypes.IMAGE, |
|
task_env.ModalityTypes.GOAL, |
|
task_env.ModalityTypes.PREV_ACTION, |
|
task_env.ModalityTypes.DISTANCE, |
|
] |
|
|
|
|
|
_Graph = collections.namedtuple('_Graph', [ |
|
'graph', 'id_to_index', 'index_to_id', 'target_indexes', 'distance_to_goal' |
|
]) |
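
# _Graph bundles a world's connectivity graph with its lookup tables:
#   graph: nx.DiGraph whose edges are labeled with the action between views.
#   id_to_index / index_to_id: mappings between image ids and vertex indexes.
#   target_indexes: goal name -> index of the synthetic super-source vertex.
#   distance_to_goal: goal name -> {vertex index: steps to reach the goal}.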
|
|
|
|
|
def _init_category_index(label_map_path): |
|
"""Creates category index from class indexes to name of the classes. |
|
|
|
Args: |
|
label_map_path: path to the mapping. |
|
Returns: |
|
A map for mapping int keys to string categories. |
|
""" |
|
|
|
label_map = label_map_util.load_labelmap(label_map_path) |
|
  num_classes = max(x.id for x in label_map.item)
|
categories = label_map_util.convert_label_map_to_categories( |
|
label_map, max_num_classes=num_classes, use_display_name=True) |
|
category_index = label_map_util.create_category_index(categories) |
|
return category_index |
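
# For illustration only: with a labelmap containing items named 'fridge' and
# 'tv' (hypothetical labels), the returned category index would look roughly
# like {1: {'id': 1, 'name': 'fridge'}, 2: {'id': 2, 'name': 'tv'}}.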
|
|
|
|
|
def _draw_detections(image_np, detections, category_index): |
|
"""Draws detections on to the image. |
|
|
|
Args: |
|
image_np: Image in the form of uint8 numpy array. |
|
detections: a dictionary that contains the detection outputs. |
|
category_index: contains the mapping between indexes and the category names. |
|
|
|
Returns: |
|
    Does not return anything; the boxes are drawn on image_np in place.
|
""" |
|
vis_util.visualize_boxes_and_labels_on_image_array( |
|
image_np, |
|
detections['detection_boxes'], |
|
detections['detection_classes'], |
|
detections['detection_scores'], |
|
category_index, |
|
use_normalized_coordinates=True, |
|
max_boxes_to_draw=1000, |
|
min_score_thresh=.0, |
|
agnostic_mode=False) |
|
|
|
|
|
def generate_detection_image(detections, |
|
image_size, |
|
category_map, |
|
num_classes, |
|
is_binary=True): |
|
"""Generates one_hot vector of the image using the detection boxes. |
|
|
|
Args: |
|
detections: 2D object detections from the image. It's a dictionary that |
|
contains detection_boxes, detection_classes, and detection_scores with |
|
dimensions of nx4, nx1, nx1 where n is the number of detections. |
|
image_size: The resolution of the output image. |
|
category_map: dictionary that maps label names to index. |
|
num_classes: Number of classes. |
|
is_binary: If true, it sets the corresponding channels to 0 and 1. |
|
Otherwise, sets the score in the corresponding channel. |
|
Returns: |
|
Returns image_size x image_size x num_classes image for the detection boxes. |
|
""" |
|
res = np.zeros((image_size, image_size, num_classes), dtype=np.float32) |
|
boxes = detections['detection_boxes'] |
|
labels = detections['detection_classes'] |
|
scores = detections['detection_scores'] |
|
for box, label, score in zip(boxes, labels, scores): |
|
transformed_boxes = [int(round(t)) for t in box * image_size] |
|
y1, x1, y2, x2 = transformed_boxes |
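    # Skip boxes that collapse to zero area after rounding to pixel
    # coordinates.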
|
|
|
|
|
|
|
if (y2 - y1) * (x2 - x1) == 0: |
|
continue |
|
assert category_map[label] < num_classes, 'label = {}'.format(label) |
|
value = score |
|
if is_binary: |
|
value = 1 |
|
res[y1:y2, x1:x2, category_map[label]] = value |
|
return res |
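

# A minimal sketch (not used by the environment) of how a detection image is
# built for a single detection. All values below are made-up assumptions for
# illustration.
def _example_detection_image():
  detections = {
      'detection_boxes': np.array([[0.1, 0.2, 0.5, 0.6]]),  # ymin,xmin,ymax,xmax
      'detection_classes': np.array([1]),
      'detection_scores': np.array([0.9]),
  }
  category_map = {1: 0}  # Maps detector label 1 to output channel 0.
  # Returns a 64x64x4 float image whose channel 0 is 1 inside the box.
  return generate_detection_image(
      detections, image_size=64, category_map=category_map, num_classes=4)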
|
|
|
|
|
def _get_detection_path(root, detection_folder_name, world): |
|
return os.path.join(root, 'Meta', detection_folder_name, world + '.npy') |
|
|
|
|
|
def _get_image_folder(root, world): |
|
return os.path.join(root, world, 'jpg_rgb') |
|
|
|
|
|
def _get_json_path(root, world): |
|
return os.path.join(root, world, 'annotations.json') |
|
|
|
|
|
def _get_image_path(root, world, image_id): |
|
return os.path.join(_get_image_folder(root, world), image_id + '.jpg') |
|
|
|
|
|
def _get_image_list(path, worlds): |
|
"""Builds a dictionary for all the worlds. |
|
|
|
Args: |
|
path: the path to the dataset on cns. |
|
worlds: list of the worlds. |
|
|
|
Returns: |
|
    dictionary where the keys are the world names and the values
    are the image ids of that world.
|
""" |
|
world_id_dict = {} |
|
for loc in worlds: |
|
    files = [
        t[:-4] for t in tf.gfile.ListDirectory(_get_image_folder(path, loc))
    ]
|
world_id_dict[loc] = files |
|
return world_id_dict |
|
|
|
|
|
def read_all_poses(dataset_root, world): |
|
"""Reads all the poses for each world. |
|
|
|
Args: |
|
dataset_root: the path to the root of the dataset. |
|
world: string, name of the world. |
|
|
|
Returns: |
|
    Dictionary of poses for all the images in the world. The keys are the
    image ids of each view and the values are tuples of (x, z, R, scale),
    where x and z are the first and third coordinates of the translation, R
    is the 3x3 rotation matrix, and scale is a float scalar by which x and z
    must be multiplied to obtain the real-world coordinates.
|
|
|
Raises: |
|
    ValueError: if the number of images does not match the number of poses.
|
""" |
|
path = os.path.join(dataset_root, world, 'image_structs.mat') |
|
with tf.gfile.Open(path) as f: |
|
data = sio.loadmat(f) |
|
xyz = data['image_structs']['world_pos'] |
|
image_names = data['image_structs']['image_name'][0] |
|
rot = data['image_structs']['R'][0] |
|
scale = data['scale'][0][0] |
|
n = xyz.shape[1] |
|
x = [xyz[0][i][0][0] for i in range(n)] |
|
z = [xyz[0][i][2][0] for i in range(n)] |
|
names = [name[0][:-4] for name in image_names] |
|
if len(names) != len(x): |
|
    raise ValueError('number of image names is not equal to the number of '
                     'poses: {} != {}'.format(len(names), len(x)))
|
output = {} |
|
for i in range(n): |
|
if rot[i].shape[0] != 0: |
|
assert rot[i].shape[0] == 3 |
|
assert rot[i].shape[1] == 3 |
|
output[names[i]] = (x[i], z[i], rot[i], scale) |
|
else: |
|
output[names[i]] = (x[i], z[i], None, scale) |
|
|
|
return output |
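

# A minimal sketch (not used elsewhere) of converting a stored pose to
# real-world coordinates; the world name below is a made-up assumption.
def _example_world_coordinates(dataset_root):
  poses = read_all_poses(dataset_root, 'Home_001_1')
  x, z, unused_rot, scale = poses[next(iter(poses))]
  # x and z are stored unscaled; multiply by scale for world coordinates.
  return x * scale, z * scale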
|
|
|
|
|
def read_cached_data(should_load_images, dataset_root, segmentation_file_name, |
|
targets_file_name, output_size): |
|
"""Reads all the necessary cached data. |
|
|
|
Args: |
|
should_load_images: whether to load the images or not. |
|
dataset_root: path to the root of the dataset. |
|
segmentation_file_name: The name of the file that contains semantic |
|
segmentation annotations. |
|
    targets_file_name: The name of the file that contains targets annotated for
|
each world. |
|
output_size: Size of the output images. This is used for pre-processing the |
|
loaded images. |
|
Returns: |
|
Dictionary of all the cached data. |
|
""" |
|
|
|
load_start = time.time() |
|
result_data = {} |
|
|
|
annotated_target_path = os.path.join(dataset_root, 'Meta', |
|
targets_file_name + '.npy') |
|
|
|
logging.info('loading targets: %s', annotated_target_path) |
|
with tf.gfile.Open(annotated_target_path) as f: |
|
result_data['targets'] = np.load(f).item() |
|
|
|
depth_image_path = os.path.join(dataset_root, 'Meta/depth_imgs.npy') |
|
logging.info('loading depth: %s', depth_image_path) |
|
with tf.gfile.Open(depth_image_path) as f: |
|
depth_data = np.load(f).item() |
|
|
|
logging.info('processing depth') |
|
for home_id in depth_data: |
|
images = depth_data[home_id] |
|
for image_id in images: |
|
depth = images[image_id] |
|
depth = cv2.resize( |
|
depth / _MAX_DEPTH_VALUE, (output_size, output_size), |
|
interpolation=cv2.INTER_NEAREST) |
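      # Pixels with zero depth carry no reading; stack a validity mask as a
      # second channel next to the normalized depth.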
|
depth_mask = (depth > 0).astype(np.float32) |
|
depth = np.dstack((depth, depth_mask)) |
|
images[image_id] = depth |
|
result_data[task_env.ModalityTypes.DEPTH] = depth_data |
|
|
|
sseg_path = os.path.join(dataset_root, 'Meta', |
|
segmentation_file_name + '.npy') |
|
logging.info('loading sseg: %s', sseg_path) |
|
with tf.gfile.Open(sseg_path) as f: |
|
sseg_data = np.load(f).item() |
|
|
|
logging.info('processing sseg') |
|
for home_id in sseg_data: |
|
images = sseg_data[home_id] |
|
for image_id in images: |
|
sseg = images[image_id] |
|
sseg = cv2.resize( |
|
sseg, (output_size, output_size), interpolation=cv2.INTER_NEAREST) |
|
images[image_id] = np.expand_dims(sseg, axis=-1).astype(np.float32) |
|
result_data[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = sseg_data |
|
|
|
if should_load_images: |
|
image_path = os.path.join(dataset_root, 'Meta/imgs.npy') |
|
logging.info('loading imgs: %s', image_path) |
|
with tf.gfile.Open(image_path) as f: |
|
image_data = np.load(f).item() |
|
|
|
result_data[task_env.ModalityTypes.IMAGE] = image_data |
|
|
|
with tf.gfile.Open(os.path.join(dataset_root, 'Meta/world_id_dict.npy')) as f: |
|
result_data['world_id_dict'] = np.load(f).item() |
|
|
|
  logging.info('loading done in %f seconds', time.time() - load_start)
|
return result_data |
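

# For reference, the dictionary returned by read_cached_data has roughly the
# following layout (keyed by modality constants plus two plain string keys):
#   result_data['targets'][goal][world]          -> annotated goal image ids
#   result_data['world_id_dict'][world]          -> list of image ids
#   result_data[task_env.ModalityTypes.DEPTH][world][image_id]
#       -> output_size x output_size x 2 normalized depth plus validity mask
#   result_data[task_env.ModalityTypes.SEMANTIC_SEGMENTATION][world][image_id]
#       -> output_size x output_size x 1 label image
#   result_data[task_env.ModalityTypes.IMAGE][world][image_id] -> RGB image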
|
|
|
|
|
@gin.configurable |
|
def get_spec_dtype_map(): |
|
return {gym.spaces.Box: np.float32} |
|
|
|
|
|
@gin.configurable |
|
class ActiveVisionDatasetEnv(task_env.TaskEnv): |
|
"""Simulates the environment from ActiveVisionDataset.""" |
|
cached_data = None |
|
|
|
def __init__( |
|
self, |
|
episode_length, |
|
modality_types, |
|
confidence_threshold, |
|
output_size, |
|
worlds, |
|
targets, |
|
compute_distance, |
|
should_draw_detections, |
|
dataset_root, |
|
labelmap_path, |
|
reward_collision, |
|
reward_goal_range, |
|
num_detection_classes, |
|
segmentation_file_name, |
|
detection_folder_name, |
|
actions, |
|
targets_file_name, |
|
eval_init_points_file_name=None, |
|
shaped_reward=False, |
|
): |
|
"""Instantiates the environment for ActiveVision Dataset. |
|
|
|
Args: |
|
episode_length: the length of each episode. |
|
      modality_types: a list of strings where each entry indicates the name of
        the modality to be loaded. Valid entries are "sseg", "det", "depth",
        "image", "goal", "distance", and "prev_action". "distance" should be
        used for computing metrics in tf agents.
|
confidence_threshold: Consider detections more than confidence_threshold |
|
for potential targets. |
|
output_size: Resolution of the output image. |
|
worlds: List of the name of the worlds. |
|
targets: List of the target names. Each entry is a string label of the |
|
target category (e.g. 'fridge', 'microwave', so on). |
|
compute_distance: If True, outputs the distance of the view to the goal. |
|
should_draw_detections (bool): If True, the image returned for the |
|
        observation will contain the bounding boxes.
|
dataset_root: the path to the root folder of the dataset. |
|
labelmap_path: path to the dictionary that converts label strings to |
|
indexes. |
|
      reward_collision: the reward the agent gets after hitting an obstacle.
        It should be a non-positive number.
      reward_goal_range: the number of steps from the goal within which the
        agent is considered to have reached it. If the agent's distance to the
        goal is less than the specified range, the episode also finishes by
        setting done = True.
|
num_detection_classes: number of classes that detector outputs. |
|
segmentation_file_name: the name of the file that contains the semantic |
|
information. The file should be in the dataset_root/Meta/ folder. |
|
detection_folder_name: Name of the folder that contains the detections |
|
for each world. The folder should be under dataset_root/Meta/ folder. |
|
actions: The list of the action names. Valid entries are listed in |
|
SUPPORTED_ACTIONS. |
|
targets_file_name: the name of the file that contains the annotated |
|
        targets. The file should be in the dataset_root/Meta/ folder.
|
eval_init_points_file_name: The name of the file that contains the initial |
|
points for evaluating the performance of the agent. If set to None, |
|
        episodes start at random locations. Should only be set for evaluation.
|
shaped_reward: Whether to add delta goal distance to the reward each step. |
|
|
|
Raises: |
|
      ValueError: If one of the targets is not available in the annotated
|
targets or the modality names are not from the domain specified above. |
|
ValueError: If one of the actions is not in SUPPORTED_ACTIONS. |
|
ValueError: If the reward_collision is a positive number. |
|
ValueError: If there is no action other than stop provided. |
|
""" |
|
if reward_collision > 0: |
|
      raise ValueError('"reward" for collision should be non-positive')
|
|
|
if reward_goal_range < 0: |
|
      logging.warning('environment will not terminate the episode when the '
                      'agent gets close to the goal')
|
|
|
if not modality_types: |
|
raise ValueError('modality names can not be empty') |
|
|
|
for name in modality_types: |
|
if name not in SUPPORTED_MODALITIES: |
|
raise ValueError('invalid modality type: {}'.format(name)) |
|
|
|
actions_other_than_stop_found = False |
|
for a in actions: |
|
if a != 'stop': |
|
actions_other_than_stop_found = True |
|
if a not in SUPPORTED_ACTIONS: |
|
        raise ValueError('invalid action: {}'.format(a))
|
|
|
if not actions_other_than_stop_found: |
|
raise ValueError('environment needs to have actions other than stop.') |
|
|
|
super(ActiveVisionDatasetEnv, self).__init__() |
|
|
|
self._episode_length = episode_length |
|
self._modality_types = set(modality_types) |
|
self._confidence_threshold = confidence_threshold |
|
self._output_size = output_size |
|
self._dataset_root = dataset_root |
|
self._worlds = worlds |
|
self._targets = targets |
|
self._all_graph = {} |
|
for world in self._worlds: |
|
with tf.gfile.Open(_get_json_path(self._dataset_root, world), 'r') as f: |
|
file_content = f.read() |
|
file_content = file_content.replace('.jpg', '') |
|
io = StringIO(file_content) |
|
self._all_graph[world] = json.load(io) |
|
|
|
self._cur_world = '' |
|
self._cur_image_id = '' |
|
self._cur_graph = None |
|
self._steps_taken = 0 |
|
self._last_action_success = True |
|
self._category_index = _init_category_index(labelmap_path) |
|
self._category_map = dict( |
|
[(c, i) for i, c in enumerate(self._category_index)]) |
|
self._detection_cache = {} |
|
if not ActiveVisionDatasetEnv.cached_data: |
|
ActiveVisionDatasetEnv.cached_data = read_cached_data( |
|
True, self._dataset_root, segmentation_file_name, targets_file_name, |
|
self._output_size) |
|
cached_data = ActiveVisionDatasetEnv.cached_data |
|
|
|
self._world_id_dict = cached_data['world_id_dict'] |
|
self._depth_images = cached_data[task_env.ModalityTypes.DEPTH] |
|
self._semantic_segmentations = cached_data[ |
|
task_env.ModalityTypes.SEMANTIC_SEGMENTATION] |
|
self._annotated_targets = cached_data['targets'] |
|
self._cached_imgs = cached_data[task_env.ModalityTypes.IMAGE] |
|
self._graph_cache = {} |
|
self._compute_distance = compute_distance |
|
self._should_draw_detections = should_draw_detections |
|
self._reward_collision = reward_collision |
|
self._reward_goal_range = reward_goal_range |
|
self._num_detection_classes = num_detection_classes |
|
self._actions = actions |
|
self._detection_folder_name = detection_folder_name |
|
self._shaped_reward = shaped_reward |
|
|
|
self._eval_init_points = None |
|
if eval_init_points_file_name is not None: |
|
self._eval_init_index = 0 |
|
init_points_path = os.path.join(self._dataset_root, 'Meta', |
|
eval_init_points_file_name + '.npy') |
|
with tf.gfile.Open(init_points_path) as points_file: |
|
data = np.load(points_file).item() |
|
self._eval_init_points = [] |
|
for world in self._worlds: |
|
for goal in self._targets: |
|
if world in self._annotated_targets[goal]: |
|
for image_id in data[world]: |
|
self._eval_init_points.append((world, image_id[0], goal)) |
|
logging.info('loaded %d eval init points', len(self._eval_init_points)) |
|
|
|
self.action_space = gym.spaces.Discrete(len(self._actions)) |
|
|
|
obs_shapes = {} |
|
if task_env.ModalityTypes.SEMANTIC_SEGMENTATION in self._modality_types: |
|
obs_shapes[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = gym.spaces.Box( |
|
low=0, high=255, shape=(self._output_size, self._output_size, 1)) |
|
if task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types: |
|
obs_shapes[task_env.ModalityTypes.OBJECT_DETECTION] = gym.spaces.Box( |
|
low=0, |
|
high=255, |
|
shape=(self._output_size, self._output_size, |
|
self._num_detection_classes)) |
|
if task_env.ModalityTypes.DEPTH in self._modality_types: |
|
obs_shapes[task_env.ModalityTypes.DEPTH] = gym.spaces.Box( |
|
low=0, |
|
high=_MAX_DEPTH_VALUE, |
|
shape=(self._output_size, self._output_size, 2)) |
|
if task_env.ModalityTypes.IMAGE in self._modality_types: |
|
obs_shapes[task_env.ModalityTypes.IMAGE] = gym.spaces.Box( |
|
low=0, high=255, shape=(self._output_size, self._output_size, 3)) |
|
if task_env.ModalityTypes.GOAL in self._modality_types: |
|
obs_shapes[task_env.ModalityTypes.GOAL] = gym.spaces.Box( |
|
low=0, high=1., shape=(len(self._targets),)) |
|
if task_env.ModalityTypes.PREV_ACTION in self._modality_types: |
|
obs_shapes[task_env.ModalityTypes.PREV_ACTION] = gym.spaces.Box( |
|
low=0, high=1., shape=(len(self._actions) + 1,)) |
|
if task_env.ModalityTypes.DISTANCE in self._modality_types: |
|
obs_shapes[task_env.ModalityTypes.DISTANCE] = gym.spaces.Box( |
|
low=0, high=255, shape=(1,)) |
|
self.observation_space = gym.spaces.Dict(obs_shapes) |
|
|
|
self._prev_action = np.zeros((len(self._actions) + 1), dtype=np.float32) |
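    # The extra trailing slot of _prev_action encodes whether the previous
    # action succeeded (see _step_no_reward).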
|
|
|
|
|
all_poses = {} |
|
for world in self._worlds: |
|
all_poses[world] = read_all_poses(self._dataset_root, world) |
|
self._cached_poses = all_poses |
|
self._vertex_to_pose = {} |
|
self._pose_to_vertex = {} |
|
|
|
@property |
|
def actions(self): |
|
"""Returns list of actions for the env.""" |
|
return self._actions |
|
|
|
def _next_image(self, image_id, action): |
|
"""Given the action, returns the name of the image that agent ends up in. |
|
|
|
Args: |
|
image_id: The image id of the current view. |
|
      action: one of the actions in self._actions, e.g. 'right', 'rotate_cw',
        'rotate_ccw', 'forward', 'left' or 'backward'. Each rotation is 30
        degrees.
|
|
|
Returns: |
|
The image name for the next location of the agent. If the action results |
|
in collision or it is not possible for the agent to execute that action, |
|
returns empty string. |
|
""" |
|
assert action in self._actions, 'invalid action : {}'.format(action) |
|
assert self._cur_world in self._all_graph, 'invalid world {}'.format( |
|
self._cur_world) |
|
assert image_id in self._all_graph[ |
|
self._cur_world], 'image_id {} is not in {}'.format( |
|
image_id, self._cur_world) |
|
return self._all_graph[self._cur_world][image_id][action] |
|
|
|
def _largest_detection_for_image(self, image_id, detections_dict): |
|
"""Assigns area of the largest box for the view with given image id. |
|
|
|
Args: |
|
image_id: Image id of the view. |
|
detections_dict: Detections for the view. |
|
""" |
|
for cls, box, score in zip(detections_dict['detection_classes'], |
|
detections_dict['detection_boxes'], |
|
detections_dict['detection_scores']): |
|
if cls not in self._targets: |
|
continue |
|
if score < self._confidence_threshold: |
|
continue |
|
ymin, xmin, ymax, xmax = box |
|
area = (ymax - ymin) * (xmax - xmin) |
|
if abs(area) < 1e-5: |
|
continue |
|
if image_id not in self._detection_area: |
|
self._detection_area[image_id] = area |
|
else: |
|
self._detection_area[image_id] = max(self._detection_area[image_id], |
|
area) |
|
|
|
def _compute_goal_indexes(self): |
|
"""Computes the goal indexes for the environment. |
|
|
|
Returns: |
|
      The indexes of the goals that are closest to the target categories. A
      vertex is a goal vertex if the desired objects are detected in its image
      and the target categories are no longer seen after moving forward from
      that vertex.
|
""" |
|
for image_id in self._world_id_dict[self._cur_world]: |
|
detections_dict = self._detection_table[image_id] |
|
self._largest_detection_for_image(image_id, detections_dict) |
|
goal_indexes = [] |
|
for image_id in self._world_id_dict[self._cur_world]: |
|
if image_id not in self._detection_area: |
|
continue |
|
|
|
if self._detection_area[image_id] < 0.01: |
|
continue |
|
ok = True |
|
next_image_id = self._next_image(image_id, 'forward') |
|
if next_image_id: |
|
if next_image_id in self._detection_area: |
|
ok = False |
|
if ok: |
|
goal_indexes.append(self._cur_graph.id_to_index[image_id]) |
|
return goal_indexes |
|
|
|
def to_image_id(self, vid): |
|
"""Converts vertex id to the image id. |
|
|
|
Args: |
|
vid: vertex id of the view. |
|
Returns: |
|
image id of the input vertex id. |
|
""" |
|
return self._cur_graph.index_to_id[vid] |
|
|
|
def to_vertex(self, image_id): |
|
return self._cur_graph.id_to_index[image_id] |
|
|
|
def observation(self, view_pose): |
|
"""Returns the observation at the given the vertex. |
|
|
|
Args: |
|
view_pose: pose of the view of interest. |
|
|
|
Returns: |
|
Observation at the given view point. |
|
|
|
Raises: |
|
ValueError: if the given view pose is not similar to any of the poses in |
|
the current world. |
|
""" |
|
vertex = self.pose_to_vertex(view_pose) |
|
if vertex is None: |
|
      raise ValueError('The given pose is not close enough to any of the '
                       'poses in the environment.')
|
image_id = self._cur_graph.index_to_id[vertex] |
|
output = collections.OrderedDict() |
|
|
|
if task_env.ModalityTypes.SEMANTIC_SEGMENTATION in self._modality_types: |
|
output[task_env.ModalityTypes. |
|
SEMANTIC_SEGMENTATION] = self._semantic_segmentations[ |
|
self._cur_world][image_id] |
|
|
|
detection = None |
|
need_det = ( |
|
task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types or |
|
(task_env.ModalityTypes.IMAGE in self._modality_types and |
|
self._should_draw_detections)) |
|
if need_det: |
|
detection = self._detection_table[image_id] |
|
detection_image = generate_detection_image( |
|
detection, |
|
self._output_size, |
|
self._category_map, |
|
num_classes=self._num_detection_classes) |
|
|
|
if task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types: |
|
output[task_env.ModalityTypes.OBJECT_DETECTION] = detection_image |
|
|
|
if task_env.ModalityTypes.DEPTH in self._modality_types: |
|
output[task_env.ModalityTypes.DEPTH] = self._depth_images[ |
|
self._cur_world][image_id] |
|
|
|
if task_env.ModalityTypes.IMAGE in self._modality_types: |
|
output_img = self._cached_imgs[self._cur_world][image_id] |
|
if self._should_draw_detections: |
|
output_img = output_img.copy() |
|
_draw_detections(output_img, detection, self._category_index) |
|
output[task_env.ModalityTypes.IMAGE] = output_img |
|
|
|
if task_env.ModalityTypes.GOAL in self._modality_types: |
|
goal = np.zeros((len(self._targets),), dtype=np.float32) |
|
goal[self._targets.index(self._cur_goal)] = 1. |
|
output[task_env.ModalityTypes.GOAL] = goal |
|
|
|
if task_env.ModalityTypes.PREV_ACTION in self._modality_types: |
|
output[task_env.ModalityTypes.PREV_ACTION] = self._prev_action |
|
|
|
if task_env.ModalityTypes.DISTANCE in self._modality_types: |
|
output[task_env.ModalityTypes.DISTANCE] = np.asarray( |
|
[self.gt_value(self._cur_goal, vertex)], dtype=np.float32) |
|
|
|
return output |
|
|
|
def _step_no_reward(self, action): |
|
"""Performs a step in the environment with given action. |
|
|
|
Args: |
|
action: Action that is used to step in the environment. Action can be |
|
string or integer. If the type is integer then it uses the ith element |
|
from self._actions list. Otherwise, uses the string value as the action. |
|
|
|
Returns: |
|
observation, done, info |
|
      observation: dictionary that contains all the observations specified in
|
modality_types. |
|
observation[task_env.ModalityTypes.OBJECT_DETECTION]: contains the |
|
detection of the current view. |
|
observation[task_env.ModalityTypes.IMAGE]: contains the |
|
        image of the current view. Note that if the images are not used for
        training, should_load_images should be set to False.
|
observation[task_env.ModalityTypes.SEMANTIC_SEGMENTATION]: contains the |
|
semantic segmentation of the current view. |
|
observation[task_env.ModalityTypes.DEPTH]: If selected, returns the |
|
depth map for the current view. |
|
observation[task_env.ModalityTypes.PREV_ACTION]: If selected, returns |
|
        a numpy array of shape (action_size + 1,). The first action_size
        elements indicate the action and the last element indicates whether
        the previous action was successful or not.
|
done: True after episode_length steps have been taken, False otherwise. |
|
info: Empty dictionary. |
|
|
|
Raises: |
|
ValueError: for invalid actions. |
|
""" |
|
|
|
if not isinstance(action, str): |
|
if not self.action_space.contains(action): |
|
        raise ValueError('Not a valid action: {}'.format(action))
|
|
|
action = self._actions[action] |
|
|
|
if action not in self._actions: |
|
      raise ValueError('Not a valid action: {}'.format(action))
|
|
|
action_index = self._actions.index(action) |
|
|
|
if action == 'stop': |
|
next_image_id = self._cur_image_id |
|
done = True |
|
success = True |
|
else: |
|
next_image_id = self._next_image(self._cur_image_id, action) |
|
self._steps_taken += 1 |
|
done = False |
|
success = True |
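      # An empty next_image_id means the action is not available from this
      # view (e.g. it would collide with an obstacle); the agent stays in
      # place and the step is marked as unsuccessful.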
|
if not next_image_id: |
|
success = False |
|
else: |
|
self._cur_image_id = next_image_id |
|
|
|
if self._steps_taken >= self._episode_length: |
|
done = True |
|
|
|
cur_vertex = self._cur_graph.id_to_index[self._cur_image_id] |
|
observation = self.observation(self.vertex_to_pose(cur_vertex)) |
|
|
|
|
|
|
|
self._prev_action = np.zeros((len(self._actions) + 1,), dtype=np.float32) |
|
self._prev_action[action_index] = 1. |
|
self._prev_action[-1] = float(success) |
|
|
|
distance_to_goal = self.gt_value(self._cur_goal, cur_vertex) |
|
if success: |
|
if distance_to_goal <= self._reward_goal_range: |
|
done = True |
|
|
|
return observation, done, {'success': success} |
|
|
|
@property |
|
def graph(self): |
|
return self._cur_graph.graph |
|
|
|
def state(self): |
|
return self.vertex_to_pose(self.to_vertex(self._cur_image_id)) |
|
|
|
def gt_value(self, goal, v): |
|
"""Computes the distance to the goal from vertex v. |
|
|
|
Args: |
|
goal: name of the goal. |
|
v: vertex id. |
|
|
|
Returns: |
|
      Minimum number of steps to the given goal.
|
""" |
|
assert goal in self._cur_graph.distance_to_goal, 'goal: {}'.format(goal) |
|
assert v in self._cur_graph.distance_to_goal[goal] |
|
res = self._cur_graph.distance_to_goal[goal][v] |
|
return res |
|
|
|
def _update_graph(self): |
|
"""Creates the graph for each environment and updates the _cur_graph.""" |
|
if self._cur_world not in self._graph_cache: |
|
graph = nx.DiGraph() |
|
id_to_index = {} |
|
index_to_id = {} |
|
image_list = self._world_id_dict[self._cur_world] |
|
for i, image_id in enumerate(image_list): |
|
id_to_index[image_id] = i |
|
index_to_id[i] = image_id |
|
graph.add_node(i) |
|
|
|
for image_id in image_list: |
|
for action in self._actions: |
|
if action == 'stop': |
|
continue |
|
next_image = self._all_graph[self._cur_world][image_id][action] |
|
if next_image: |
|
graph.add_edge( |
|
id_to_index[image_id], id_to_index[next_image], action=action) |
|
target_indexes = {} |
|
number_of_nodes_without_targets = graph.number_of_nodes() |
|
distance_to_goal = {} |
|
for goal in self._targets: |
|
if self._cur_world not in self._annotated_targets[goal]: |
|
continue |
|
goal_indexes = [ |
|
id_to_index[i] |
|
for i in self._annotated_targets[goal][self._cur_world] |
|
if i |
|
] |
|
super_source_index = graph.number_of_nodes() |
|
target_indexes[goal] = super_source_index |
|
graph.add_node(super_source_index) |
|
index_to_id[super_source_index] = goal |
|
id_to_index[goal] = super_source_index |
|
for v in goal_indexes: |
|
graph.add_edge(v, super_source_index, action='stop') |
|
graph.add_edge(super_source_index, v, action='stop') |
|
distance_to_goal[goal] = {} |
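        # nx.shortest_path returns the list of vertices including both
        # endpoints, and its last hop is the synthetic 'stop' edge into the
        # super source; subtracting 2 therefore gives the number of actions
        # needed to reach the goal.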
|
for v in range(number_of_nodes_without_targets): |
|
distance_to_goal[goal][v] = len( |
|
nx.shortest_path(graph, v, super_source_index)) - 2 |
|
|
|
self._graph_cache[self._cur_world] = _Graph( |
|
graph, id_to_index, index_to_id, target_indexes, distance_to_goal) |
|
self._cur_graph = self._graph_cache[self._cur_world] |
|
|
|
def reset_for_eval(self, new_world, new_goal, new_image_id): |
|
"""Resets to the given goal and image_id.""" |
|
    return self._reset_env(
        new_world=new_world, new_goal=new_goal, new_image_id=new_image_id)
|
|
|
def get_init_config(self, path): |
|
"""Exposes the initial state of the agent for the given path. |
|
|
|
Args: |
|
path: sequences of the vertexes that the agent moves. |
|
|
|
Returns: |
|
image_id of the first view, world, and the goal. |
|
""" |
|
return self._cur_graph.index_to_id[path[0]], self._cur_world, self._cur_goal |
|
|
|
def _reset_env( |
|
self, |
|
new_world=None, |
|
new_goal=None, |
|
new_image_id=None, |
|
): |
|
"""Resets the agent in a random world and random id. |
|
|
|
Args: |
|
new_world: If not None, sets the new world to new_world. |
|
new_goal: If not None, sets the new goal to new_goal. |
|
new_image_id: If not None, sets the first image id to new_image_id. |
|
|
|
Returns: |
|
observation: dictionary of the observations. Content of the observation |
|
is similar to that of the step function. |
|
Raises: |
|
ValueError: if it can't find a world and annotated goal. |
|
""" |
|
self._steps_taken = 0 |
|
|
|
self._prev_action = np.zeros((len(self._actions) + 1,), dtype=np.float32) |
|
self._prev_action[len(self._actions)] = 1. |
|
if self._eval_init_points is not None: |
|
if self._eval_init_index >= len(self._eval_init_points): |
|
self._eval_init_index = 0 |
|
a = self._eval_init_points[self._eval_init_index] |
|
self._cur_world, self._cur_image_id, self._cur_goal = a |
|
self._eval_init_index += 1 |
|
elif not new_world: |
|
attempts = 100 |
|
found = False |
|
while attempts >= 0: |
|
attempts -= 1 |
|
self._cur_goal = np.random.choice(self._targets) |
|
available_worlds = list( |
|
set(self._annotated_targets[self._cur_goal].keys()).intersection( |
|
set(self._worlds))) |
|
if available_worlds: |
|
found = True |
|
break |
|
if not found: |
|
raise ValueError('could not find a world that has a target annotated') |
|
self._cur_world = np.random.choice(available_worlds) |
|
else: |
|
self._cur_world = new_world |
|
self._cur_goal = new_goal |
|
if new_world not in self._annotated_targets[new_goal]: |
|
return None |
|
|
|
self._cur_goal_index = self._targets.index(self._cur_goal) |
|
if new_image_id: |
|
self._cur_image_id = new_image_id |
|
else: |
|
self._cur_image_id = np.random.choice( |
|
self._world_id_dict[self._cur_world]) |
|
if self._cur_world not in self._detection_cache: |
|
with tf.gfile.Open( |
|
_get_detection_path(self._dataset_root, self._detection_folder_name, |
|
self._cur_world)) as f: |
|
|
|
|
|
self._detection_cache[self._cur_world] = np.load(f).item() |
|
self._detection_table = self._detection_cache[self._cur_world] |
|
self._detection_area = {} |
|
self._update_graph() |
|
if self._cur_world not in self._vertex_to_pose: |
|
|
|
self._vertex_to_pose[self._cur_world] = { |
|
index: (-index,) for index in self._cur_graph.target_indexes.values() |
|
} |
|
|
|
|
|
for image_id in self._world_id_dict[self._cur_world]: |
|
self.vertex_to_pose(self.to_vertex(image_id)) |
|
|
|
|
|
self._pose_to_vertex[self._cur_world] = { |
|
tuple(v): k |
|
for k, v in self._vertex_to_pose[self._cur_world].iteritems() |
|
} |
|
|
|
cur_vertex = self._cur_graph.id_to_index[self._cur_image_id] |
|
observation = self.observation(self.vertex_to_pose(cur_vertex)) |
|
return observation |
|
|
|
def cur_vertex(self): |
|
return self._cur_graph.id_to_index[self._cur_image_id] |
|
|
|
def cur_image_id(self): |
|
return self._cur_image_id |
|
|
|
def path_to_goal(self, image_id=None): |
|
"""Returns the path from image_id to the self._cur_goal. |
|
|
|
Args: |
|
image_id: If set to None, computes the path from the current view. |
|
Otherwise, sets the current view to the given image_id. |
|
Returns: |
|
The path to the goal. |
|
Raises: |
|
Exception if there's no path from the view to the goal. |
|
""" |
|
if image_id is None: |
|
image_id = self._cur_image_id |
|
super_source = self._cur_graph.target_indexes[self._cur_goal] |
|
try: |
|
path = nx.shortest_path(self._cur_graph.graph, |
|
self._cur_graph.id_to_index[image_id], |
|
super_source) |
|
except: |
|
      print 'path not found, world = {}, image_id = {}'.format(
          self._cur_world, self._cur_image_id)
|
raise |
|
return path[:-1] |
|
|
|
def targets(self): |
|
return [self.vertex_to_pose(self._cur_graph.target_indexes[self._cur_goal])] |
|
|
|
def vertex_to_pose(self, v): |
|
"""Returns pose of the view for a given vertex. |
|
|
|
Args: |
|
v: integer, vertex index. |
|
|
|
Returns: |
|
(x, z, dir_x, dir_z) where x and z are the tranlation and dir_x, dir_z are |
|
a vector giving direction of the view. |
|
""" |
|
if v in self._vertex_to_pose[self._cur_world]: |
|
return np.copy(self._vertex_to_pose[self._cur_world][v]) |
|
|
|
x, z, rot, scale = self._cached_poses[self._cur_world][self.to_image_id( |
|
v)] |
|
if rot is None: |
|
self._vertex_to_pose[self._cur_world][v] = np.asarray( |
|
[x * scale, z * scale, v]) |
|
return np.copy(self._vertex_to_pose[self._cur_world][v]) |
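
    # Rotate the unit z vector by R^T to get the viewing direction in world
    # coordinates; only its x and z components are kept in the pose.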
|
|
|
|
|
direction = np.zeros((3, 1), dtype=np.float32) |
|
direction[2][0] = 1 |
|
direction = np.matmul(np.transpose(rot), direction) |
|
direction = [direction[0][0], direction[2][0]] |
|
self._vertex_to_pose[self._cur_world][v] = np.asarray( |
|
[x * scale, z * scale, direction[0], direction[1]]) |
|
return np.copy(self._vertex_to_pose[self._cur_world][v]) |
|
|
|
def pose_to_vertex(self, pose): |
|
"""Returns the vertex id for the given pose.""" |
|
if tuple(pose) not in self._pose_to_vertex[self._cur_world]: |
|
raise ValueError( |
|
'The given pose is not present in the dictionary: {}'.format( |
|
tuple(pose))) |
|
|
|
return self._pose_to_vertex[self._cur_world][tuple(pose)] |
|
|
|
def check_scene_graph(self, world, goal): |
|
"""Checks the connectivity of the scene graph. |
|
|
|
Goes over all the views. computes the shortest path to the goal. If it |
|
crashes it means that it's not connected. Otherwise, the env graph is fine. |
|
|
|
Args: |
|
world: the string name of the world. |
|
goal: the string label for the goal. |
|
Returns: |
|
      True if the goal is not annotated in the given world; None otherwise.
|
""" |
|
obs = self._reset_env(new_world=world, new_goal=goal) |
|
if not obs: |
|
      print '{} is not available in {}'.format(goal, world)
|
return True |
|
for image_id in self._world_id_dict[self._cur_world]: |
|
print 'check image_id = {}'.format(image_id) |
|
self._cur_image_id = image_id |
|
path = self.path_to_goal() |
|
actions = [] |
|
      # Convert each consecutive pair of vertices on the path to poses and
      # recover the action between them; finish with an explicit 'stop'.
      for i in range(len(path) - 1):
        actions.append(
            self.action(
                self.vertex_to_pose(path[i]),
                self.vertex_to_pose(path[i + 1])))
|
actions.append('stop') |
|
|
|
@property |
|
def goal_one_hot(self): |
|
res = np.zeros((len(self._targets),), dtype=np.float32) |
|
res[self._cur_goal_index] = 1. |
|
return res |
|
|
|
@property |
|
def goal_index(self): |
|
return self._cur_goal_index |
|
|
|
@property |
|
def goal_string(self): |
|
return self._cur_goal |
|
|
|
@property |
|
def worlds(self): |
|
return self._worlds |
|
|
|
@property |
|
def possible_targets(self): |
|
return self._targets |
|
|
|
def action(self, from_pose, to_pose): |
|
"""Returns the action that takes source vertex to destination vertex. |
|
|
|
Args: |
|
from_pose: pose of the source. |
|
to_pose: pose of the destination. |
|
Returns: |
|
Returns the index of the action. |
|
Raises: |
|
      ValueError: If it is not possible to go from the first vertex to the
        second vertex with one action.
|
""" |
|
from_index = self.pose_to_vertex(from_pose) |
|
to_index = self.pose_to_vertex(to_pose) |
|
if to_index not in self.graph[from_index]: |
|
from_image_id = self.to_image_id(from_index) |
|
to_image_id = self.to_image_id(to_index) |
|
raise ValueError('{},{} is not connected to {},{}'.format( |
|
from_index, from_image_id, to_index, to_image_id)) |
|
return self._actions.index(self.graph[from_index][to_index]['action']) |
|
|
|
def random_step_sequence(self, min_len=None, max_len=None): |
|
"""Generates random step sequence that takes agent to the goal. |
|
|
|
Args: |
|
min_len: integer, minimum length of a step sequence. Not yet implemented. |
|
      max_len: integer, the maximum length of the generated path and the
        maximum number of observations.
|
Returns: |
|
Tuple of (path, actions, states, step_outputs). |
|
path: a random path from a random starting point and random environment. |
|
actions: actions of the returned path. |
|
states: viewpoints of all the states in between. |
|
step_outputs: list of step() return tuples. |
|
Raises: |
|
      ValueError: if max_len is None or less than 1; if min_len is not None.
|
""" |
|
if max_len is None: |
|
      raise ValueError('max_len cannot be None')
|
if max_len < 1: |
|
      raise ValueError('max_len must be greater than or equal to 1.')
|
if min_len is not None: |
|
raise ValueError('min_len is not yet implemented.') |
|
|
|
path = [] |
|
actions = [] |
|
states = [] |
|
step_outputs = [] |
|
obs = self.reset() |
|
last_obs_tuple = [obs, 0, False, {}] |
|
for _ in xrange(max_len): |
|
action = np.random.choice(self._actions) |
|
|
|
|
|
while action == 'stop': |
|
action = np.random.choice(self._actions) |
|
path.append(self.to_vertex(self._cur_image_id)) |
|
onehot = np.zeros((len(self._actions),), dtype=np.float32) |
|
onehot[self._actions.index(action)] = 1. |
|
actions.append(onehot) |
|
states.append(self.vertex_to_pose(path[-1])) |
|
step_outputs.append(copy.deepcopy(last_obs_tuple)) |
|
last_obs_tuple = self.step(action) |
|
|
|
return path, actions, states, step_outputs |
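

# Minimal usage sketch (illustrative; every argument value below is an
# assumption -- in practice the constructor is configured through gin):
#
#   env = ActiveVisionDatasetEnv(
#       episode_length=100,
#       modality_types=[task_env.ModalityTypes.IMAGE,
#                       task_env.ModalityTypes.GOAL],
#       confidence_threshold=0.5,
#       output_size=64,
#       worlds=['Home_001_1'],
#       targets=['fridge'],
#       compute_distance=False,
#       should_draw_detections=False,
#       dataset_root='/path/to/AVD',
#       labelmap_path='/path/to/labelmap.pbtxt',
#       reward_collision=-0.1,
#       reward_goal_range=2,
#       num_detection_classes=90,
#       segmentation_file_name='sseg',
#       detection_folder_name='Detections',
#       actions=['right', 'rotate_cw', 'rotate_ccw', 'forward', 'left',
#                'backward', 'stop'],
#       targets_file_name='annotated_targets')
#   obs = env.reset()
#   obs, reward, done, info = env.step('forward')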
|
|