# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Gym environment for the ActiveVision Dataset.

   The dataset is captured with a robot moving around and taking pictures in
   multiple directions. The actions are moving in four directions and rotating
   clockwise or counterclockwise. The observations are the output of vision
   pipelines such as object detectors. The goal is to find objects of interest
   in each environment. For more details, refer to:
   http://cs.unc.edu/~ammirato/active_vision_dataset_website/.
"""
import tensorflow as tf
import collections
import copy
import json
import os
from StringIO import StringIO
import time
import gym
from gym.envs.registration import register
import gym.spaces
import networkx as nx
import numpy as np
import scipy.io as sio
from absl import logging
import gin
import cv2
import label_map_util
import visualization_utils as vis_util
from envs import task_env


register(
    id='active-vision-env-v0',
    entry_point=
    'cognitive_planning.envs.active_vision_dataset_env:ActiveVisionDatasetEnv',  # pylint: disable=line-too-long
)

_MAX_DEPTH_VALUE = 12102

SUPPORTED_ACTIONS = [
    'right', 'rotate_cw', 'rotate_ccw', 'forward', 'left', 'backward', 'stop'
]
SUPPORTED_MODALITIES = [
    task_env.ModalityTypes.SEMANTIC_SEGMENTATION,
    task_env.ModalityTypes.DEPTH,
    task_env.ModalityTypes.OBJECT_DETECTION,
    task_env.ModalityTypes.IMAGE,
    task_env.ModalityTypes.GOAL,
    task_env.ModalityTypes.PREV_ACTION,
    task_env.ModalityTypes.DISTANCE,
]

# Data structure for storing the information related to the graph of the world.
_Graph = collections.namedtuple('_Graph', [
    'graph', 'id_to_index', 'index_to_id', 'target_indexes', 'distance_to_goal'
])


def _init_category_index(label_map_path):
  """Creates category index from class indexes to name of the classes.

  Args:
    label_map_path: path to the mapping.
  Returns:
    A map for mapping int keys to string categories.
  """

  label_map = label_map_util.load_labelmap(label_map_path)
  num_classes = np.max([x.id for x in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes=num_classes, use_display_name=True)
  category_index = label_map_util.create_category_index(categories)
  return category_index
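

# For reference, the category_index built above follows the object_detection
# convention of mapping integer class ids to category dicts, e.g. (made-up
# entries): {1: {'id': 1, 'name': 'fridge'}, 2: {'id': 2, 'name': 'microwave'}}.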


def _draw_detections(image_np, detections, category_index):
  """Draws detections on to the image.

  Args:
    image_np: Image in the form of uint8 numpy array.
    detections: a dictionary that contains the detection outputs.
    category_index: contains the mapping between indexes and the category names.

  Returns:
    Does not return anything; the boxes are drawn on image_np in place.
  """
  vis_util.visualize_boxes_and_labels_on_image_array(
      image_np,
      detections['detection_boxes'],
      detections['detection_classes'],
      detections['detection_scores'],
      category_index,
      use_normalized_coordinates=True,
      max_boxes_to_draw=1000,
      min_score_thresh=.0,
      agnostic_mode=False)


def generate_detection_image(detections,
                             image_size,
                             category_map,
                             num_classes,
                             is_binary=True):
  """Generates one_hot vector of the image using the detection boxes.

  Args:
    detections: 2D object detections from the image. It's a dictionary that
      contains detection_boxes, detection_classes, and detection_scores with
      dimensions of nx4, nx1, nx1 where n is the number of detections.
    image_size: The resolution of the output image.
    category_map: dictionary that maps label names to index.
    num_classes: Number of classes.
    is_binary: If true, it sets the corresponding channels to 0 and 1.
      Otherwise, sets the score in the corresponding channel.
  Returns:
    Returns image_size x image_size x num_classes image for the detection boxes.
  """
  res = np.zeros((image_size, image_size, num_classes), dtype=np.float32)
  boxes = detections['detection_boxes']
  labels = detections['detection_classes']
  scores = detections['detection_scores']
  for box, label, score in zip(boxes, labels, scores):
    transformed_boxes = [int(round(t)) for t in box * image_size]
    y1, x1, y2, x2 = transformed_boxes
    # The detector returns a fixed number of detections. Boxes with an area of
    # zero do not correspond to any real detection, so we skip boxes with
    # area 0.
    if (y2 - y1) * (x2 - x1) == 0:
      continue
    assert category_map[label] < num_classes, 'label = {}'.format(label)
    value = score
    if is_binary:
      value = 1
    res[y1:y2, x1:x2, category_map[label]] = value
  return res
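

# A minimal illustrative sketch (not used by the environment) showing the
# expected layout of a `detections` dict and the detection image produced by
# generate_detection_image(). The class label, box, and category_map below are
# made up for demonstration purposes only.
def _example_detection_image():
  detections = {
      # Boxes are in normalized [ymin, xmin, ymax, xmax] coordinates.
      'detection_boxes': np.array([[0.1, 0.1, 0.5, 0.6]]),
      'detection_classes': np.array([2]),
      'detection_scores': np.array([0.9]),
  }
  det_img = generate_detection_image(
      detections, image_size=64, category_map={2: 0}, num_classes=5)
  # det_img has shape (64, 64, 5); channel 0 is set to 1 inside the scaled box.
  return det_img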


def _get_detection_path(root, detection_folder_name, world):
  return os.path.join(root, 'Meta', detection_folder_name, world + '.npy')


def _get_image_folder(root, world):
  return os.path.join(root, world, 'jpg_rgb')


def _get_json_path(root, world):
  return os.path.join(root, world, 'annotations.json')


def _get_image_path(root, world, image_id):
  return os.path.join(_get_image_folder(root, world), image_id + '.jpg')


def _get_image_list(path, worlds):
  """Builds a dictionary for all the worlds.

  Args:
    path: the path to the dataset on cns.
    worlds: list of the worlds.

  Returns:
    dictionary where the keys are the world names and the values
    are the image ids of that world.
  """
  world_id_dict = {}
  for loc in worlds:
    files = [t[:-4] for t in tf.gfile.ListDir(_get_image_folder(path, loc))]
    world_id_dict[loc] = files
  return world_id_dict


def read_all_poses(dataset_root, world):
  """Reads all the poses for each world.

  Args:
    dataset_root: the path to the root of the dataset.
    world: string, name of the world.

  Returns:
    Dictionary of poses for all the images in each world. The keys are the
    image ids of each view and the values are tuples of (x, z, R, scale), where
    x and z are the first and third coordinates of the translation, R is the
    3x3 rotation matrix, and scale is a float scalar that x and z need to be
    multiplied by in order to get the real world coordinates.

  Raises:
    ValueError: if the number of images does not match the number of poses
      read.
  """
  path = os.path.join(dataset_root, world, 'image_structs.mat')
  with tf.gfile.Open(path) as f:
    data = sio.loadmat(f)
  xyz = data['image_structs']['world_pos']
  image_names = data['image_structs']['image_name'][0]
  rot = data['image_structs']['R'][0]
  scale = data['scale'][0][0]
  n = xyz.shape[1]
  x = [xyz[0][i][0][0] for i in range(n)]
  z = [xyz[0][i][2][0] for i in range(n)]
  names = [name[0][:-4] for name in image_names]
  if len(names) != len(x):
    raise ValueError('number of image names is not equal to the number of '
                     'poses {} != {}'.format(len(names), len(x)))
  output = {}
  for i in range(n):
    if rot[i].shape[0] != 0:
      assert rot[i].shape[0] == 3
      assert rot[i].shape[1] == 3
      output[names[i]] = (x[i], z[i], rot[i], scale)
    else:
      output[names[i]] = (x[i], z[i], None, scale)

  return output
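

# Illustrative sketch of the structure returned by read_all_poses(). The
# dataset path and image id below are hypothetical placeholders.
#
#   poses = read_all_poses('/path/to/AVD', 'Home_001_1')
#   x, z, rot, scale = poses['000110000010101']
#   # World-frame position is (x * scale, z * scale); rot is a 3x3 rotation
#   # matrix, or None when no rotation is annotated for that view.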


def read_cached_data(should_load_images, dataset_root, segmentation_file_name,
                     targets_file_name, output_size):
  """Reads all the necessary cached data.

  Args:
    should_load_images: whether to load the images or not.
    dataset_root: path to the root of the dataset.
    segmentation_file_name: The name of the file that contains semantic
      segmentation annotations.
    targets_file_name: The name of the file that contains targets annotated
      for each world.
    output_size: Size of the output images. This is used for pre-processing the
      loaded images.
  Returns:
    Dictionary of all the cached data.
  """

  load_start = time.time()
  result_data = {}

  annotated_target_path = os.path.join(dataset_root, 'Meta',
                                       targets_file_name + '.npy')

  logging.info('loading targets: %s', annotated_target_path)
  with tf.gfile.Open(annotated_target_path) as f:
    result_data['targets'] = np.load(f).item()

  depth_image_path = os.path.join(dataset_root, 'Meta/depth_imgs.npy')
  logging.info('loading depth: %s', depth_image_path)
  with tf.gfile.Open(depth_image_path) as f:
    depth_data = np.load(f).item()

  logging.info('processing depth')
  for home_id in depth_data:
    images = depth_data[home_id]
    for image_id in images:
      depth = images[image_id]
      depth = cv2.resize(
          depth / _MAX_DEPTH_VALUE, (output_size, output_size),
          interpolation=cv2.INTER_NEAREST)
      depth_mask = (depth > 0).astype(np.float32)
      depth = np.dstack((depth, depth_mask))
      images[image_id] = depth
  result_data[task_env.ModalityTypes.DEPTH] = depth_data

  sseg_path = os.path.join(dataset_root, 'Meta',
                           segmentation_file_name + '.npy')
  logging.info('loading sseg: %s', sseg_path)
  with tf.gfile.Open(sseg_path) as f:
    sseg_data = np.load(f).item()

  logging.info('processing sseg')
  for home_id in sseg_data:
    images = sseg_data[home_id]
    for image_id in images:
      sseg = images[image_id]
      sseg = cv2.resize(
          sseg, (output_size, output_size), interpolation=cv2.INTER_NEAREST)
      images[image_id] = np.expand_dims(sseg, axis=-1).astype(np.float32)
  result_data[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = sseg_data

  if should_load_images:
    image_path = os.path.join(dataset_root, 'Meta/imgs.npy')
    logging.info('loading imgs: %s', image_path)
    with tf.gfile.Open(image_path) as f:
      image_data = np.load(f).item()

    result_data[task_env.ModalityTypes.IMAGE] = image_data

  with tf.gfile.Open(os.path.join(dataset_root, 'Meta/world_id_dict.npy')) as f:
    result_data['world_id_dict'] = np.load(f).item()

  logging.info('loading done in %f seconds', time.time() - load_start)
  return result_data
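

# Illustrative sketch of the dictionary returned by read_cached_data(). The
# path and file names below are hypothetical placeholders.
#
#   cached = read_cached_data(True, '/path/to/AVD', 'sseg', 'targets', 64)
#   cached['targets']              # annotated goal views per target category
#   cached['world_id_dict']        # world name -> list of image ids
#   cached[task_env.ModalityTypes.DEPTH]       # world -> image id -> 64x64x2
#   cached[task_env.ModalityTypes.SEMANTIC_SEGMENTATION]   # id -> 64x64x1
#   cached[task_env.ModalityTypes.IMAGE]       # present only if images loaded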


@gin.configurable
def get_spec_dtype_map():
  return {gym.spaces.Box: np.float32}


@gin.configurable
class ActiveVisionDatasetEnv(task_env.TaskEnv):
  """Simulates the environment from ActiveVisionDataset."""
  cached_data = None

  def __init__(
      self,
      episode_length,
      modality_types,
      confidence_threshold,
      output_size,
      worlds,
      targets,
      compute_distance,
      should_draw_detections,
      dataset_root,
      labelmap_path,
      reward_collision,
      reward_goal_range,
      num_detection_classes,
      segmentation_file_name,
      detection_folder_name,
      actions,
      targets_file_name,
      eval_init_points_file_name=None,
      shaped_reward=False,
  ):
    """Instantiates the environment for ActiveVision Dataset.

    Args:
      episode_length: the length of each episode.
      modality_types: a list of the strings where each entry indicates the name
        of the modalities to be loaded. Valid entries are "sseg", "det",
        "depth", "image", "distance", and "prev_action". "distance" should be
        used for computing metrics in tf agents.
      confidence_threshold: Consider detections more than confidence_threshold
        for potential targets.
      output_size: Resolution of the output image.
      worlds: List of the name of the worlds.
      targets: List of the target names. Each entry is a string label of the
        target category (e.g. 'fridge', 'microwave', so on).
      compute_distance: If True, outputs the distance of the view to the goal.
      should_draw_detections (bool): If True, the image returned for the
        observation will contain the bounding boxes.
      dataset_root: the path to the root folder of the dataset.
      labelmap_path: path to the dictionary that converts label strings to
        indexes.
      reward_collision: the reward the agents get after hitting an obstacle.
        It should be a non-positive number.
      reward_goal_range: the number of steps from the goal within which the
        agent is considered to have reached the goal. If the agent's distance
        to the goal is within this range, the episode also finishes by setting
        done = True.
      num_detection_classes: number of classes that detector outputs.
      segmentation_file_name: the name of the file that contains the semantic
        information. The file should be in the dataset_root/Meta/ folder.
      detection_folder_name: Name of the folder that contains the detections
        for each world. The folder should be under dataset_root/Meta/ folder.
      actions: The list of the action names. Valid entries are listed in
        SUPPORTED_ACTIONS.
      targets_file_name: the name of the file that contains the annotated
        targets. The file should be in the dataset_root/Meta/ folder.
      eval_init_points_file_name: The name of the file that contains the initial
        points for evaluating the performance of the agent. If set to None,
        episodes start at random locations. Should be only set for evaluation.
      shaped_reward: Whether to add delta goal distance to the reward each step.

    Raises:
      ValueError: If one of the targets is not available in the annotated
        targets or the modality names are not from the domain specified above.
      ValueError: If one of the actions is not in SUPPORTED_ACTIONS.
      ValueError: If the reward_collision is a positive number.
      ValueError: If there is no action other than stop provided.
    """
    if reward_collision > 0:
      raise ValueError('"reward" for collision should be non-positive')

    if reward_goal_range < 0:
      logging.warning('environment does not terminate the episode when the '
                      'agent gets close to the goal')

    if not modality_types:
      raise ValueError('modality names can not be empty')

    for name in modality_types:
      if name not in SUPPORTED_MODALITIES:
        raise ValueError('invalid modality type: {}'.format(name))

    actions_other_than_stop_found = False
    for a in actions:
      if a != 'stop':
        actions_other_than_stop_found = True
      if a not in SUPPORTED_ACTIONS:
        raise ValueError('invalid action: {}'.format(a))

    if not actions_other_than_stop_found:
      raise ValueError('environment needs to have actions other than stop.')

    super(ActiveVisionDatasetEnv, self).__init__()

    self._episode_length = episode_length
    self._modality_types = set(modality_types)
    self._confidence_threshold = confidence_threshold
    self._output_size = output_size
    self._dataset_root = dataset_root
    self._worlds = worlds
    self._targets = targets
    self._all_graph = {}
    for world in self._worlds:
      with tf.gfile.Open(_get_json_path(self._dataset_root, world), 'r') as f:
        file_content = f.read()
        file_content = file_content.replace('.jpg', '')
        io = StringIO(file_content)
        self._all_graph[world] = json.load(io)

    self._cur_world = ''
    self._cur_image_id = ''
    self._cur_graph = None  # Loaded by _update_graph
    self._steps_taken = 0
    self._last_action_success = True
    self._category_index = _init_category_index(labelmap_path)
    self._category_map = dict(
        [(c, i) for i, c in enumerate(self._category_index)])
    self._detection_cache = {}
    if not ActiveVisionDatasetEnv.cached_data:
      ActiveVisionDatasetEnv.cached_data = read_cached_data(
          True, self._dataset_root, segmentation_file_name, targets_file_name,
          self._output_size)
    cached_data = ActiveVisionDatasetEnv.cached_data

    self._world_id_dict = cached_data['world_id_dict']
    self._depth_images = cached_data[task_env.ModalityTypes.DEPTH]
    self._semantic_segmentations = cached_data[
        task_env.ModalityTypes.SEMANTIC_SEGMENTATION]
    self._annotated_targets = cached_data['targets']
    self._cached_imgs = cached_data[task_env.ModalityTypes.IMAGE]
    self._graph_cache = {}
    self._compute_distance = compute_distance
    self._should_draw_detections = should_draw_detections
    self._reward_collision = reward_collision
    self._reward_goal_range = reward_goal_range
    self._num_detection_classes = num_detection_classes
    self._actions = actions
    self._detection_folder_name = detection_folder_name
    self._shaped_reward = shaped_reward

    self._eval_init_points = None
    if eval_init_points_file_name is not None:
      self._eval_init_index = 0
      init_points_path = os.path.join(self._dataset_root, 'Meta',
                                      eval_init_points_file_name + '.npy')
      with tf.gfile.Open(init_points_path) as points_file:
        data = np.load(points_file).item()
      self._eval_init_points = []
      for world in self._worlds:
        for goal in self._targets:
          if world in self._annotated_targets[goal]:
            for image_id in data[world]:
              self._eval_init_points.append((world, image_id[0], goal))
        logging.info('loaded %d eval init points', len(self._eval_init_points))

    self.action_space = gym.spaces.Discrete(len(self._actions))

    obs_shapes = {}
    if task_env.ModalityTypes.SEMANTIC_SEGMENTATION in self._modality_types:
      obs_shapes[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = gym.spaces.Box(
          low=0, high=255, shape=(self._output_size, self._output_size, 1))
    if task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types:
      obs_shapes[task_env.ModalityTypes.OBJECT_DETECTION] = gym.spaces.Box(
          low=0,
          high=255,
          shape=(self._output_size, self._output_size,
                 self._num_detection_classes))
    if task_env.ModalityTypes.DEPTH in self._modality_types:
      obs_shapes[task_env.ModalityTypes.DEPTH] = gym.spaces.Box(
          low=0,
          high=_MAX_DEPTH_VALUE,
          shape=(self._output_size, self._output_size, 2))
    if task_env.ModalityTypes.IMAGE in self._modality_types:
      obs_shapes[task_env.ModalityTypes.IMAGE] = gym.spaces.Box(
          low=0, high=255, shape=(self._output_size, self._output_size, 3))
    if task_env.ModalityTypes.GOAL in self._modality_types:
      obs_shapes[task_env.ModalityTypes.GOAL] = gym.spaces.Box(
          low=0, high=1., shape=(len(self._targets),))
    if task_env.ModalityTypes.PREV_ACTION in self._modality_types:
      obs_shapes[task_env.ModalityTypes.PREV_ACTION] = gym.spaces.Box(
          low=0, high=1., shape=(len(self._actions) + 1,))
    if task_env.ModalityTypes.DISTANCE in self._modality_types:
      obs_shapes[task_env.ModalityTypes.DISTANCE] = gym.spaces.Box(
          low=0, high=255, shape=(1,))
    self.observation_space = gym.spaces.Dict(obs_shapes)

    self._prev_action = np.zeros((len(self._actions) + 1), dtype=np.float32)

    # Loading all the poses.
    all_poses = {}
    for world in self._worlds:
      all_poses[world] = read_all_poses(self._dataset_root, world)
    self._cached_poses = all_poses
    self._vertex_to_pose = {}
    self._pose_to_vertex = {}

  @property
  def actions(self):
    """Returns list of actions for the env."""
    return self._actions

  def _next_image(self, image_id, action):
    """Given the action, returns the name of the image that agent ends up in.

    Args:
      image_id: The image id of the current view.
      action: valid actions are ['right', 'rotate_cw', 'rotate_ccw',
      'forward', 'left']. Each rotation is 30 degrees.

    Returns:
      The image name for the next location of the agent. If the action results
      in collision or it is not possible for the agent to execute that action,
      returns empty string.
    """
    assert action in self._actions, 'invalid action : {}'.format(action)
    assert self._cur_world in self._all_graph, 'invalid world {}'.format(
        self._cur_world)
    assert image_id in self._all_graph[
        self._cur_world], 'image_id {} is not in {}'.format(
            image_id, self._cur_world)
    return self._all_graph[self._cur_world][image_id][action]
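
  # Illustrative sketch of the adjacency structure loaded from annotations.json
  # and used above (the image ids are made-up placeholders):
  #
  #   self._all_graph[world][image_id] == {
  #       'forward': '000110000020101',    # image id reached by the action
  #       'rotate_cw': '000110000010102',
  #       'left': '',                      # empty string: action not possible
  #       ...
  #   }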

  def _largest_detection_for_image(self, image_id, detections_dict):
    """Assigns area of the largest box for the view with given image id.

    Args:
      image_id: Image id of the view.
      detections_dict: Detections for the view.
    """
    for cls, box, score in zip(detections_dict['detection_classes'],
                               detections_dict['detection_boxes'],
                               detections_dict['detection_scores']):
      if cls not in self._targets:
        continue
      if score < self._confidence_threshold:
        continue
      ymin, xmin, ymax, xmax = box
      area = (ymax - ymin) * (xmax - xmin)
      if abs(area) < 1e-5:
        continue
      if image_id not in self._detection_area:
        self._detection_area[image_id] = area
      else:
        self._detection_area[image_id] = max(self._detection_area[image_id],
                                             area)

  def _compute_goal_indexes(self):
    """Computes the goal indexes for the environment.

    Returns:
      The indexes of the goals that are closest to target categories. A vertex
      is goal vertice if the desired objects are detected in the image and the
      target categories are not seen by moving forward from that vertice.
    """
    for image_id in self._world_id_dict[self._cur_world]:
      detections_dict = self._detection_table[image_id]
      self._largest_detection_for_image(image_id, detections_dict)
    goal_indexes = []
    for image_id in self._world_id_dict[self._cur_world]:
      if image_id not in self._detection_area:
        continue
      # Detection box is large enough.
      if self._detection_area[image_id] < 0.01:
        continue
      ok = True
      next_image_id = self._next_image(image_id, 'forward')
      if next_image_id:
        if next_image_id in self._detection_area:
          ok = False
      if ok:
        goal_indexes.append(self._cur_graph.id_to_index[image_id])
    return goal_indexes

  def to_image_id(self, vid):
    """Converts vertex id to the image id.

    Args:
      vid: vertex id of the view.
    Returns:
      image id of the input vertex id.
    """
    return self._cur_graph.index_to_id[vid]

  def to_vertex(self, image_id):
    return self._cur_graph.id_to_index[image_id]

  def observation(self, view_pose):
    """Returns the observation at the given view pose.

    Args:
      view_pose: pose of the view of interest.

    Returns:
      Observation at the given view point.

    Raises:
      ValueError: if the given view pose is not similar to any of the poses in
        the current world.
    """
    vertex = self.pose_to_vertex(view_pose)
    if vertex is None:
      raise ValueError('The given pose is not close enough to any of the poses'
                       ' in the environment.')
    image_id = self._cur_graph.index_to_id[vertex]
    output = collections.OrderedDict()

    if task_env.ModalityTypes.SEMANTIC_SEGMENTATION in self._modality_types:
      output[task_env.ModalityTypes.SEMANTIC_SEGMENTATION] = (
          self._semantic_segmentations[self._cur_world][image_id])

    detection = None
    need_det = (
        task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types or
        (task_env.ModalityTypes.IMAGE in self._modality_types and
         self._should_draw_detections))
    if need_det:
      detection = self._detection_table[image_id]
      detection_image = generate_detection_image(
          detection,
          self._output_size,
          self._category_map,
          num_classes=self._num_detection_classes)

    if task_env.ModalityTypes.OBJECT_DETECTION in self._modality_types:
      output[task_env.ModalityTypes.OBJECT_DETECTION] = detection_image

    if task_env.ModalityTypes.DEPTH in self._modality_types:
      output[task_env.ModalityTypes.DEPTH] = self._depth_images[
          self._cur_world][image_id]

    if task_env.ModalityTypes.IMAGE in self._modality_types:
      output_img = self._cached_imgs[self._cur_world][image_id]
      if self._should_draw_detections:
        output_img = output_img.copy()
        _draw_detections(output_img, detection, self._category_index)
      output[task_env.ModalityTypes.IMAGE] = output_img

    if task_env.ModalityTypes.GOAL in self._modality_types:
      goal = np.zeros((len(self._targets),), dtype=np.float32)
      goal[self._targets.index(self._cur_goal)] = 1.
      output[task_env.ModalityTypes.GOAL] = goal

    if task_env.ModalityTypes.PREV_ACTION in self._modality_types:
      output[task_env.ModalityTypes.PREV_ACTION] = self._prev_action

    if task_env.ModalityTypes.DISTANCE in self._modality_types:
      output[task_env.ModalityTypes.DISTANCE] = np.asarray(
          [self.gt_value(self._cur_goal, vertex)], dtype=np.float32)

    return output
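
  # Illustrative sketch of consuming the returned OrderedDict, assuming the
  # corresponding modalities were requested at construction time:
  #
  #   obs = env.observation(env.vertex_to_pose(v))
  #   img = obs[task_env.ModalityTypes.IMAGE]  # output_size x output_size x 3
  #   depth = obs[task_env.ModalityTypes.DEPTH]  # depth + valid mask (HxWx2)
  #   goal = obs[task_env.ModalityTypes.GOAL]  # one-hot over self._targets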

  def _step_no_reward(self, action):
    """Performs a step in the environment with given action.

    Args:
      action: Action that is used to step in the environment. Action can be
        string or integer. If the type is integer then it uses the ith element
        from self._actions list. Otherwise, uses the string value as the action.

    Returns:
      observation, done, info
      observation: dictionary that contains all the observations specified in
        modality_types.
        observation[task_env.ModalityTypes.OBJECT_DETECTION]: contains the
          detection of the current view.
        observation[task_env.ModalityTypes.IMAGE]: contains the
          image of the current view. Note that if using the images for training,
          should_load_images should be set to false.
        observation[task_env.ModalityTypes.SEMANTIC_SEGMENTATION]: contains the
          semantic segmentation of the current view.
        observation[task_env.ModalityTypes.DEPTH]: If selected, returns the
          depth map for the current view.
        observation[task_env.ModalityTypes.PREV_ACTION]: If selected, returns
          a numpy of (action_size + 1,). The first action_size elements indicate
          the action and the last element indicates whether the previous action
          was successful or not.
      done: True if the agent stops, gets within reward_goal_range of the
        goal, or episode_length steps have been taken; False otherwise.
      info: Dictionary with key 'success' indicating whether the last action
        was executed successfully.

    Raises:
      ValueError: for invalid actions.
    """
    # Primarily used for gym interface.
    if not isinstance(action, str):
      if not self.action_space.contains(action):
        raise ValueError('Not a valid action: {}'.format(action))

      action = self._actions[action]

    if action not in self._actions:
      raise ValueError('Not a valid action: {}'.format(action))

    action_index = self._actions.index(action)

    if action == 'stop':
      next_image_id = self._cur_image_id
      done = True
      success = True
    else:
      next_image_id = self._next_image(self._cur_image_id, action)
      self._steps_taken += 1
      done = False
      success = True
    if not next_image_id:
      success = False
    else:
      self._cur_image_id = next_image_id

    if self._steps_taken >= self._episode_length:
      done = True

    cur_vertex = self._cur_graph.id_to_index[self._cur_image_id]
    observation = self.observation(self.vertex_to_pose(cur_vertex))

    # Concatenation of one-hot prev action + a binary number for success of
    # previous actions.
    self._prev_action = np.zeros((len(self._actions) + 1,), dtype=np.float32)
    self._prev_action[action_index] = 1.
    self._prev_action[-1] = float(success)

    distance_to_goal = self.gt_value(self._cur_goal, cur_vertex)
    if success:
      if distance_to_goal <= self._reward_goal_range:
        done = True

    return observation, done, {'success': success}
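
  # Worked example of the prev_action encoding above: with
  # actions = ['right', 'rotate_cw', 'rotate_ccw', 'forward', 'left', 'stop'],
  # a successful 'forward' step sets self._prev_action to
  #   [0., 0., 0., 1., 0., 0., 1.]
  # i.e. a one-hot vector over the actions followed by a trailing success bit.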

  @property
  def graph(self):
    return self._cur_graph.graph

  def state(self):
    return self.vertex_to_pose(self.to_vertex(self._cur_image_id))

  def gt_value(self, goal, v):
    """Computes the distance to the goal from vertex v.

    Args:
      goal: name of the goal.
      v: vertex id.

    Returns:
      Minimum number of steps to the given goal.
    """
    assert goal in self._cur_graph.distance_to_goal, 'goal: {}'.format(goal)
    assert v in self._cur_graph.distance_to_goal[goal]
    res = self._cur_graph.distance_to_goal[goal][v]
    return res

  def _update_graph(self):
    """Creates the graph for each environment and updates the _cur_graph."""
    if self._cur_world not in self._graph_cache:
      graph = nx.DiGraph()
      id_to_index = {}
      index_to_id = {}
      image_list = self._world_id_dict[self._cur_world]
      for i, image_id in enumerate(image_list):
        id_to_index[image_id] = i
        index_to_id[i] = image_id
        graph.add_node(i)

      for image_id in image_list:
        for action in self._actions:
          if action == 'stop':
            continue
          next_image = self._all_graph[self._cur_world][image_id][action]
          if next_image:
            graph.add_edge(
                id_to_index[image_id], id_to_index[next_image], action=action)
      target_indexes = {}
      number_of_nodes_without_targets = graph.number_of_nodes()
      distance_to_goal = {}
      for goal in self._targets:
        if self._cur_world not in self._annotated_targets[goal]:
          continue
        goal_indexes = [
            id_to_index[i]
            for i in self._annotated_targets[goal][self._cur_world]
            if i
        ]
        super_source_index = graph.number_of_nodes()
        target_indexes[goal] = super_source_index
        graph.add_node(super_source_index)
        index_to_id[super_source_index] = goal
        id_to_index[goal] = super_source_index
        for v in goal_indexes:
          graph.add_edge(v, super_source_index, action='stop')
          graph.add_edge(super_source_index, v, action='stop')
        distance_to_goal[goal] = {}
        for v in range(number_of_nodes_without_targets):
          distance_to_goal[goal][v] = len(
              nx.shortest_path(graph, v, super_source_index)) - 2

      self._graph_cache[self._cur_world] = _Graph(
          graph, id_to_index, index_to_id, target_indexes, distance_to_goal)
    self._cur_graph = self._graph_cache[self._cur_world]
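
  # Worked example of the super-source construction above: each goal category
  # gets a super node connected to every annotated goal view, so
  # len(nx.shortest_path(graph, v, super_source)) - 2 counts the actions from
  # v to the nearest goal view. A path [v, a, goal_view, super_source] has
  # length 4, giving a distance of 2; a goal view itself gives
  # [goal_view, super_source], i.e. distance 0.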

  def reset_for_eval(self, new_world, new_goal, new_image_id):
    """Resets to the given goal and image_id."""
    return self._reset_env(
        new_world=new_world, new_goal=new_goal, new_image_id=new_image_id)

  def get_init_config(self, path):
    """Exposes the initial state of the agent for the given path.

    Args:
      path: sequences of the vertexes that the agent moves.

    Returns:
      image_id of the first view, world, and the goal.
    """
    return self._cur_graph.index_to_id[path[0]], self._cur_world, self._cur_goal

  def _reset_env(
      self,
      new_world=None,
      new_goal=None,
      new_image_id=None,
  ):
    """Resets the agent in a random world and random id.

    Args:
      new_world: If not None, sets the new world to new_world.
      new_goal: If not None, sets the new goal to new_goal.
      new_image_id: If not None, sets the first image id to new_image_id.

    Returns:
      observation: dictionary of the observations. Content of the observation
      is similar to that of the step function.
    Raises:
      ValueError: if it can't find a world and annotated goal.
    """
    self._steps_taken = 0
    # The first prev_action is a special all-zero vector with success = 1.
    self._prev_action = np.zeros((len(self._actions) + 1,), dtype=np.float32)
    self._prev_action[len(self._actions)] = 1.
    if self._eval_init_points is not None:
      if self._eval_init_index >= len(self._eval_init_points):
        self._eval_init_index = 0
      a = self._eval_init_points[self._eval_init_index]
      self._cur_world, self._cur_image_id, self._cur_goal = a
      self._eval_init_index += 1
    elif not new_world:
      attempts = 100
      found = False
      while attempts >= 0:
        attempts -= 1
        self._cur_goal = np.random.choice(self._targets)
        available_worlds = list(
            set(self._annotated_targets[self._cur_goal].keys()).intersection(
                set(self._worlds)))
        if available_worlds:
          found = True
          break
      if not found:
        raise ValueError('could not find a world that has a target annotated')
      self._cur_world = np.random.choice(available_worlds)
    else:
      self._cur_world = new_world
      self._cur_goal = new_goal
      if new_world not in self._annotated_targets[new_goal]:
        return None

    self._cur_goal_index = self._targets.index(self._cur_goal)
    if new_image_id:
      self._cur_image_id = new_image_id
    else:
      self._cur_image_id = np.random.choice(
          self._world_id_dict[self._cur_world])
    if self._cur_world not in self._detection_cache:
      with tf.gfile.Open(
          _get_detection_path(self._dataset_root, self._detection_folder_name,
                              self._cur_world)) as f:
        # Each file contains a dictionary with image ids as keys and detection
        # dicts as values.
        self._detection_cache[self._cur_world] = np.load(f).item()
    self._detection_table = self._detection_cache[self._cur_world]
    self._detection_area = {}
    self._update_graph()
    if self._cur_world not in self._vertex_to_pose:
      # Add a fake pose for the super node of each target category.
      self._vertex_to_pose[self._cur_world] = {
          index: (-index,) for index in self._cur_graph.target_indexes.values()
      }
      # Calling vertex_to_pose for each vertex fills out the dictionaries that
      # contain pose related data.
      for image_id in self._world_id_dict[self._cur_world]:
        self.vertex_to_pose(self.to_vertex(image_id))

      # Filling out pose_to_vertex from vertex_to_pose.
      self._pose_to_vertex[self._cur_world] = {
          tuple(v): k
          for k, v in self._vertex_to_pose[self._cur_world].items()
      }

    cur_vertex = self._cur_graph.id_to_index[self._cur_image_id]
    observation = self.observation(self.vertex_to_pose(cur_vertex))
    return observation

  def cur_vertex(self):
    return self._cur_graph.id_to_index[self._cur_image_id]

  def cur_image_id(self):
    return self._cur_image_id

  def path_to_goal(self, image_id=None):
    """Returns the path from image_id to the self._cur_goal.

    Args:
      image_id: If set to None, computes the path from the current view.
        Otherwise, computes the path from the given image_id.
    Returns:
      The path to the goal.
    Raises:
      Exception if there's no path from the view to the goal.
    """
    if image_id is None:
      image_id = self._cur_image_id
    super_source = self._cur_graph.target_indexes[self._cur_goal]
    try:
      path = nx.shortest_path(self._cur_graph.graph,
                              self._cur_graph.id_to_index[image_id],
                              super_source)
    except:
      print('path not found, world = {}, image_id = {}'.format(
          self._cur_world, self._cur_image_id))
      raise
    return path[:-1]

  def targets(self):
    return [self.vertex_to_pose(self._cur_graph.target_indexes[self._cur_goal])]

  def vertex_to_pose(self, v):
    """Returns pose of the view for a given vertex.

    Args:
      v: integer, vertex index.

    Returns:
      (x, z, dir_x, dir_z) where x and z are the translation and dir_x, dir_z
        are a vector giving the direction of the view.
    """
    if v in self._vertex_to_pose[self._cur_world]:
      return np.copy(self._vertex_to_pose[self._cur_world][v])

    x, z, rot, scale = self._cached_poses[self._cur_world][self.to_image_id(
        v)]
    if rot is None:  # if rotation is not provided for the given vertex.
      self._vertex_to_pose[self._cur_world][v] = np.asarray(
          [x * scale, z * scale, v])
      return np.copy(self._vertex_to_pose[self._cur_world][v])
    # Multiply rotation matrix by [0,0,1] to get a vector of length 1 in the
    # direction of the ray.
    direction = np.zeros((3, 1), dtype=np.float32)
    direction[2][0] = 1
    direction = np.matmul(np.transpose(rot), direction)
    direction = [direction[0][0], direction[2][0]]
    self._vertex_to_pose[self._cur_world][v] = np.asarray(
        [x * scale, z * scale, direction[0], direction[1]])
    return np.copy(self._vertex_to_pose[self._cur_world][v])
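
  # Illustrative sketch of the three pose formats stored in _vertex_to_pose
  # (the numbers are made up):
  #
  #   np.array([1.2, -0.4, 0.0, 1.0])  # regular view: (x, z, dir_x, dir_z)
  #   np.array([1.2, -0.4, 42])        # view without rotation: (x, z, vertex)
  #   (-7,)                            # super node of a target category
  #
  # pose_to_vertex() below relies on these being unique dictionary keys.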

  def pose_to_vertex(self, pose):
    """Returns the vertex id for the given pose."""
    if tuple(pose) not in self._pose_to_vertex[self._cur_world]:
      raise ValueError(
          'The given pose is not present in the dictionary: {}'.format(
              tuple(pose)))

    return self._pose_to_vertex[self._cur_world][tuple(pose)]

  def check_scene_graph(self, world, goal):
    """Checks the connectivity of the scene graph.

    Goes over all the views and computes the shortest path to the goal. If it
    crashes, the graph is not connected; otherwise, the env graph is fine.

    Args:
      world: the string name of the world.
      goal: the string label for the goal.
    Returns:
      True if the goal is not annotated in the given world, None otherwise.
    """
    obs = self._reset_env(new_world=world, new_goal=goal)
    if not obs:
      print('{} is not available in {}'.format(goal, world))
      return True
    for image_id in self._world_id_dict[self._cur_world]:
      print('check image_id = {}'.format(image_id))
      self._cur_image_id = image_id
      path = self.path_to_goal()
      actions = []
      for i in range(len(path) - 2):
        actions.append(self.action(
            self.vertex_to_pose(path[i]), self.vertex_to_pose(path[i + 1])))
      actions.append('stop')

  @property
  def goal_one_hot(self):
    res = np.zeros((len(self._targets),), dtype=np.float32)
    res[self._cur_goal_index] = 1.
    return res

  @property
  def goal_index(self):
    return self._cur_goal_index

  @property
  def goal_string(self):
    return self._cur_goal

  @property
  def worlds(self):
    return self._worlds

  @property
  def possible_targets(self):
    return self._targets

  def action(self, from_pose, to_pose):
    """Returns the action that takes source vertex to destination vertex.

    Args:
      from_pose: pose of the source.
      to_pose: pose of the destination.
    Returns:
      Returns the index of the action.
    Raises:
      ValueError: If it is not possible to go from the first vertex to the
        second vertex with one action.
    """
    from_index = self.pose_to_vertex(from_pose)
    to_index = self.pose_to_vertex(to_pose)
    if to_index not in self.graph[from_index]:
      from_image_id = self.to_image_id(from_index)
      to_image_id = self.to_image_id(to_index)
      raise ValueError('{},{} is not connected to {},{}'.format(
          from_index, from_image_id, to_index, to_image_id))
    return self._actions.index(self.graph[from_index][to_index]['action'])

  def random_step_sequence(self, min_len=None, max_len=None):
    """Generates random step sequence that takes agent to the goal.

    Args:
      min_len: integer, minimum length of a step sequence. Not yet implemented.
      max_len: integer, the maximum number of steps, which bounds the length
        of the returned path and observations. Must not be None.
    Returns:
      Tuple of (path, actions, states, step_outputs).
        path: a random path from a random starting point and random environment.
        actions: actions of the returned path.
        states: viewpoints of all the states in between.
        step_outputs: list of step() return tuples.
    Raises:
      ValueError: if max_len is None or less than 1, or if min_len is not
        None.
    """
    if max_len is None:
      raise ValueError('max_len cannot be None')
    if max_len < 1:
      raise ValueError('max_len must be greater than or equal to 1.')
    if min_len is not None:
      raise ValueError('min_len is not yet implemented.')

    path = []
    actions = []
    states = []
    step_outputs = []
    obs = self.reset()
    last_obs_tuple = [obs, 0, False, {}]
    for _ in range(max_len):
      action = np.random.choice(self._actions)
      # We don't want to sample stop action because stop does not add new
      # information.
      while action == 'stop':
        action = np.random.choice(self._actions)
      path.append(self.to_vertex(self._cur_image_id))
      onehot = np.zeros((len(self._actions),), dtype=np.float32)
      onehot[self._actions.index(action)] = 1.
      actions.append(onehot)
      states.append(self.vertex_to_pose(path[-1]))
      step_outputs.append(copy.deepcopy(last_obs_tuple))
      last_obs_tuple = self.step(action)

    return path, actions, states, step_outputs
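

# A minimal usage sketch. The paths, world name, target label, and file names
# below are hypothetical placeholders; in practice the constructor arguments
# are supplied through gin bindings. It also assumes that the task_env.TaskEnv
# base class exposes the standard gym reset()/step() interface, as used by
# random_step_sequence() above.
def _example_usage():
  env = ActiveVisionDatasetEnv(
      episode_length=100,
      modality_types=[
          task_env.ModalityTypes.IMAGE,
          task_env.ModalityTypes.GOAL,
          task_env.ModalityTypes.PREV_ACTION,
      ],
      confidence_threshold=0.5,
      output_size=64,
      worlds=['Home_001_1'],  # hypothetical world name
      targets=['fridge'],  # hypothetical target label
      compute_distance=False,
      should_draw_detections=False,
      dataset_root='/path/to/AVD',  # hypothetical dataset location
      labelmap_path='/path/to/label_map.pbtxt',  # hypothetical label map
      reward_collision=-0.1,
      reward_goal_range=2,
      num_detection_classes=90,
      segmentation_file_name='sseg',  # hypothetical file name under Meta/
      detection_folder_name='Detections',  # hypothetical folder under Meta/
      actions=['right', 'rotate_cw', 'rotate_ccw', 'forward', 'left', 'stop'],
      targets_file_name='annotated_targets',  # hypothetical file name
  )
  obs = env.reset()
  obs, reward, done, info = env.step('forward')
  return obs, reward, done, info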