dianecy committed on
Commit 5c8ef86 · verified · 1 Parent(s): 8d82201

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. .history/datasets/__init___20250113130205.py +40 -0
  2. .history/datasets/ytvos_20241227174300.py +246 -0
  3. .history/datasets/ytvos_20250113131154.py +246 -0
  4. .history/datasets/ytvos_20250113131303.py +246 -0
  5. .history/datasets/ytvos_ref_20250113130047.py +237 -0
  6. .history/datasets/ytvos_ref_20250113131359.py +239 -0
  7. .history/datasets/ytvos_ref_20250113162825.py +244 -0
  8. .history/datasets/ytvos_ref_20250113163406.py +250 -0
  9. .history/datasets/ytvos_ref_20250113163605.py +250 -0
  10. .history/datasets/ytvos_ref_20250113180729.py +250 -0
  11. .history/datasets/ytvos_ref_20250114201918.py +253 -0
  12. .history/datasets/ytvos_ref_20250114202502.py +250 -0
  13. .history/datasets/ytvos_ref_20250114205233.py +252 -0
  14. .history/datasets/ytvos_ref_20250114210537.py +250 -0
  15. .history/make_ref-ytvos/annotate_ref_ytvos_20241227174304.py +288 -0
  16. .history/make_ref-ytvos/annotate_ref_ytvos_20250113111315.py +288 -0
  17. davis2017/utils.py +174 -0
  18. inference_davis.py +330 -0
  19. main.py +243 -0
  20. main_joint.py +198 -0
  21. main_pretrain.py +304 -0
  22. make_refcoco/refcocog_google/motion_split_generation_grefg_val.ipynb +0 -0
  23. make_refcoco/refcocog_google/part4_ref_id.txt +130 -0
  24. make_refcoco/refcocog_google/revised_refid_part4.json +506 -0
  25. make_refcoco/refcocog_umd/motion_split_generation.ipynb +0 -0
  26. make_refcoco/refcocog_umd/part4_ref_id.txt +126 -0
  27. make_refcoco/refcocog_umd/revised_refid_part4.json +498 -0
  28. mbench/__init__.py +0 -0
  29. mbench/__pycache__/transforms_video.cpython-39.pyc +0 -0
  30. mbench/__pycache__/ytvos_ref.cpython-39.pyc +0 -0
  31. mbench/check_image.ipynb +0 -0
  32. mbench/check_image_numbered.ipynb +0 -0
  33. mbench/check_image_revised.ipynb +164 -0
  34. mbench/gpt_ref-ytvos-revised.py +428 -0
  35. mbench/gpt_ref-ytvos.ipynb +0 -0
  36. mbench/gpt_ref-ytvos.py +302 -0
  37. mbench/gpt_ref-ytvos_numbered_cy.py +460 -0
  38. mbench/gpt_ref-ytvos_numbered_cy_sanity.py +643 -0
  39. mbench/gpt_ref-ytvos_numbered_cy_sanity_2.py +676 -0
  40. mbench/gpt_test.ipynb +0 -0
  41. mbench/make_ref-ytvos_json.py +108 -0
  42. mbench/numbered_captions_gpt-4o_final.json +0 -0
  43. mbench/numbered_captions_gpt-4o_no_mask_color.json +0 -0
  44. mbench/numbered_captions_gpt-4o_nomask_randcap.json +0 -0
  45. mbench/numbered_captions_gpt-4o_randcap.json +0 -0
  46. mbench/numbered_valid_obj_ids.json +2153 -0
  47. mbench/numbered_valid_obj_ids_gpt-4o.json +2153 -0
  48. mbench/numbered_valid_obj_ids_gpt-4o_no_mask_color.json +2153 -0
  49. mbench/numbered_valid_obj_ids_gpt-4o_nomask_randcap.json +2153 -0
  50. mbench/numbered_valid_obj_ids_gpt-4o_randcap.json +2153 -0
.history/datasets/__init___20250113130205.py ADDED
@@ -0,0 +1,40 @@
import torch.utils.data
import torchvision

from .ytvos import build as build_ytvos
from .ytvos_ref import build as build_ytvos_ref
from .davis import build as build_davis
from .a2d import build as build_a2d
from .jhmdb import build as build_jhmdb
from .refexp import build as build_refexp
from .concat_dataset import build as build_joint


def get_coco_api_from_dataset(dataset):
    for _ in range(10):
        # if isinstance(dataset, torchvision.datasets.CocoDetection):
        #     break
        if isinstance(dataset, torch.utils.data.Subset):
            dataset = dataset.dataset
    if isinstance(dataset, torchvision.datasets.CocoDetection):
        return dataset.coco


def build_dataset(dataset_file: str, image_set: str, args):
    if dataset_file == 'ytvos':
        return build_ytvos(image_set, args)
    if dataset_file == 'ytvos_ref':
        return build_ytvos_ref(image_set, args)
    if dataset_file == 'davis':
        return build_davis(image_set, args)
    if dataset_file == 'a2d':
        return build_a2d(image_set, args)
    if dataset_file == 'jhmdb':
        return build_jhmdb(image_set, args)
    # for pretraining
    if dataset_file == "refcoco" or dataset_file == "refcoco+" or dataset_file == "refcocog":
        return build_refexp(dataset_file, image_set, args)
    # for joint training of refcoco and ytvos
    if dataset_file == 'joint':
        return build_joint(image_set, args)
    raise ValueError(f'dataset {dataset_file} not supported')
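A minimal usage sketch of the `build_dataset` dispatcher above; the `Namespace` fields and their values are illustrative assumptions, chosen only to match what the `build` functions below read from `args`:

from argparse import Namespace
from datasets import build_dataset

# hypothetical args object; field names mirror what build() reads in datasets/ytvos.py
args = Namespace(ytvos_path='data/ref-youtube-vos', masks=True,
                 num_frames=5, max_skip=3)
train_set = build_dataset('ytvos', image_set='train', args=args)
imgs, target = train_set[0]  # imgs: [T, 3, H, W]; target: per-clip dict of boxes/masks/caption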
.history/datasets/ytvos_20241227174300.py ADDED
@@ -0,0 +1,246 @@
"""
Ref-YoutubeVOS data loader
"""
from pathlib import Path

import torch
from torch.autograd.grad_mode import F
from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

from datasets.categories import ytvos_category_dict as category_dict


class YTVOSDataset(Dataset):
    """
    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
    through the Youtube-VOS referring video object segmentation competition page at:
    https://competitions.codalab.org/competitions/29139
    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
    currently only be done on the competition 'validation' subset using the competition's server, as
    annotations were publicly released only for the 'train' subset of the competition.

    """
    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int):
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        # create video meta data
        self.prepare_metas()

        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
        print('\n')

    def prepare_metas(self):
        # read object information
        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
            subset_metas_by_video = json.load(f)['videos']

        # read expression data
        with open(str(self.ann_file), 'r') as f:
            subset_expressions_by_video = json.load(f)['videos']
        self.videos = list(subset_expressions_by_video.keys())

        self.metas = []
        for vid in self.videos:
            vid_meta = subset_metas_by_video[vid]
            vid_data = subset_expressions_by_video[vid]
            vid_frames = sorted(vid_data['frames'])
            vid_len = len(vid_frames)
            for exp_id, exp_dict in vid_data['expressions'].items():
                for frame_id in range(0, vid_len, self.num_frames):
                    meta = {}
                    meta['video'] = vid
                    meta['exp'] = exp_dict['exp']
                    meta['obj_id'] = int(exp_dict['obj_id'])
                    meta['frames'] = vid_frames
                    meta['frame_id'] = frame_id
                    # get object category
                    obj_id = exp_dict['obj_id']
                    meta['category'] = vid_meta['objects'][obj_id]['category']
                    self.metas.append(meta)

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            meta = self.metas[idx]  # dict

            video, exp, obj_id, category, frames, frame_id = \
                meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id']
            # clean up the caption
            exp = " ".join(exp.lower().split())
            category_id = category_dict[category]
            vid_len = len(frames)

            num_frames = self.num_frames
            # random sparse sample
            sample_indx = [frame_id]
            if self.num_frames != 1:
                # local sample
                sample_id_before = random.randint(1, 3)
                sample_id_after = random.randint(1, 3)
                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
                sample_indx.extend(local_indx)
                sample_indx = list(set(sample_indx))

                # global sampling
                if num_frames > 3:
                    all_inds = list(range(vid_len))
                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
                    global_n = num_frames - len(sample_indx)
                    if len(global_inds) > global_n:
                        select_id = random.sample(range(len(global_inds)), global_n)
                        for s_id in select_id:
                            sample_indx.append(global_inds[s_id])
                    elif vid_len >= global_n:  # sample long range global frames
                        select_id = random.sample(range(vid_len), global_n)
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
                    else:
                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
            sample_indx.sort()

            # read frames and masks
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for j in range(self.num_frames):
                frame_indx = sample_indx[j]
                frame_name = frames[frame_indx]
                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
                img = Image.open(img_path).convert('RGB')
                mask = Image.open(mask_path).convert('P')

                # create the target
                label = torch.tensor(category_id)
                mask = np.array(mask)
                mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
                if (mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(mask)
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                else:  # some frame didn't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                mask = torch.from_numpy(mask)

                # append
                imgs.append(img)
                labels.append(label)
                masks.append(mask)
                boxes.append(box)

            # transform
            w, h = img.size
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            target = {
                'frames_idx': torch.tensor(sample_indx),  # [T,]
                'labels': labels,                         # [T,]
                'boxes': boxes,                           # [T, 4], xyxy
                'masks': masks,                           # [T, H, W]
                'valid': torch.tensor(valid),             # [T,]
                'caption': exp,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)])
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
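The clip construction in `__getitem__` above combines a local window around the anchor frame with global fill-in frames; the helper below restates that sampling rule in isolation as a standalone reading aid (names and the example values are illustrative, not part of this commit):

import random

def sparse_sample(vid_len, frame_id, num_frames):
    # anchor frame plus one random neighbour on each side (local window)
    sample_indx = [frame_id]
    if num_frames != 1:
        before, after = random.randint(1, 3), random.randint(1, 3)
        sample_indx.extend([max(0, frame_id - before), min(vid_len - 1, frame_id + after)])
        sample_indx = list(set(sample_indx))
        if num_frames > 3:
            # fill remaining slots from outside the local window; fall back to
            # repeated indices when the video is shorter than num_frames
            all_inds = list(range(vid_len))
            global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
            global_n = num_frames - len(sample_indx)
            if len(global_inds) > global_n:
                sample_indx += random.sample(global_inds, global_n)
            elif vid_len >= global_n:
                sample_indx += random.sample(all_inds, global_n)
            else:
                sample_indx += random.sample(all_inds, global_n - vid_len) + all_inds
    sample_indx.sort()
    return sample_indx

print(sparse_sample(vid_len=30, frame_id=10, num_frames=5))  # e.g. [3, 8, 10, 13, 27]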
.history/datasets/ytvos_20250113131154.py ADDED
@@ -0,0 +1,246 @@
"""
Ref-YoutubeVOS data loader
"""
from pathlib import Path

import torch
from torch.autograd.grad_mode import F
from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

from datasets.categories import ytvos_category_dict as category_dict


class YTVOSDataset(Dataset):
    """
    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
    through the Youtube-VOS referring video object segmentation competition page at:
    https://competitions.codalab.org/competitions/29139
    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
    currently only be done on the competition 'validation' subset using the competition's server, as
    annotations were publicly released only for the 'train' subset of the competition.

    """
    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int):
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        # create video meta data
        self.prepare_metas()

        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
        print('\n')

    def prepare_metas(self):
        # read object information
        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
            subset_metas_by_video = json.load(f)['videos']

        # read expression data
        with open(str(self.ann_file), 'r') as f:
            subset_expressions_by_video = json.load(f)['videos']
        self.videos = list(subset_expressions_by_video.keys())

        self.metas = []
        for vid in self.videos:
            vid_meta = subset_metas_by_video[vid]
            vid_data = subset_expressions_by_video[vid]
            vid_frames = sorted(vid_data['frames'])
            vid_len = len(vid_frames)
            for exp_id, exp_dict in vid_data['expressions'].items():
                for frame_id in range(0, vid_len, self.num_frames):
                    meta = {}
                    meta['video'] = vid
                    meta['exp'] = exp_dict['exp']
                    meta['obj_id'] = int(exp_dict['obj_id'])
                    meta['frames'] = vid_frames
                    meta['frame_id'] = frame_id
                    # get object category
                    obj_id = exp_dict['obj_id']
                    meta['category'] = vid_meta['objects'][obj_id]['category']
                    self.metas.append(meta)

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            meta = self.metas[idx]  # dict

            video, exp, obj_id, category, frames, frame_id = \
                meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id']
            # clean up the caption
            exp = " ".join(exp.lower().split())
            category_id = category_dict[category]
            vid_len = len(frames)

            num_frames = self.num_frames
            # random sparse sample
            sample_indx = [frame_id]
            if self.num_frames != 1:
                # local sample
                sample_id_before = random.randint(1, 3)
                sample_id_after = random.randint(1, 3)
                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
                sample_indx.extend(local_indx)
                sample_indx = list(set(sample_indx))

                # global sampling
                if num_frames > 3:
                    all_inds = list(range(vid_len))
                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
                    global_n = num_frames - len(sample_indx)
                    if len(global_inds) > global_n:
                        select_id = random.sample(range(len(global_inds)), global_n)
                        for s_id in select_id:
                            sample_indx.append(global_inds[s_id])
                    elif vid_len >= global_n:  # sample long range global frames
                        select_id = random.sample(range(vid_len), global_n)
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
                    else:
                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
            sample_indx.sort()

            # read frames and masks
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for j in range(self.num_frames):
                frame_indx = sample_indx[j]
                frame_name = frames[frame_indx]
                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
                img = Image.open(img_path).convert('RGB')
                mask = Image.open(mask_path).convert('P')

                # create the target
                label = torch.tensor(category_id)
                mask = np.array(mask)
                mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
                if (mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(mask)
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                else:  # some frame didn't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                mask = torch.from_numpy(mask)

                # append
                imgs.append(img)
                labels.append(label)
                masks.append(mask)
                boxes.append(box)

            # transform
            w, h = img.size
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            target = {
                'frames_idx': torch.tensor(sample_indx),  # [T,]
                'labels': labels,                         # [T,]
                'boxes': boxes,                           # [T, 4], xyxy
                'masks': masks,                           # [T, H, W]
                'valid': torch.tensor(valid),             # [T,]
                'caption': exp,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)])
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
.history/datasets/ytvos_20250113131303.py ADDED
@@ -0,0 +1,246 @@
"""
Ref-YoutubeVOS data loader
"""
from pathlib import Path

import torch
from torch.autograd.grad_mode import F
from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

from datasets.categories import ytvos_category_dict as category_dict


class YTVOSDataset(Dataset):
    """
    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
    through the Youtube-VOS referring video object segmentation competition page at:
    https://competitions.codalab.org/competitions/29139
    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
    currently only be done on the competition 'validation' subset using the competition's server, as
    annotations were publicly released only for the 'train' subset of the competition.

    """
    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int):
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        # create video meta data
        self.prepare_metas()

        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
        print('\n')

    def prepare_metas(self):
        # read object information
        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
            subset_metas_by_video = json.load(f)['videos']

        # read expression data
        with open(str(self.ann_file), 'r') as f:
            subset_expressions_by_video = json.load(f)['videos']
        self.videos = list(subset_expressions_by_video.keys())

        self.metas = []
        for vid in self.videos:
            vid_meta = subset_metas_by_video[vid]
            vid_data = subset_expressions_by_video[vid]
            vid_frames = sorted(vid_data['frames'])
            vid_len = len(vid_frames)
            for exp_id, exp_dict in vid_data['expressions'].items():
                for frame_id in range(0, vid_len, self.num_frames):
                    meta = {}
                    meta['video'] = vid
                    meta['exp'] = exp_dict['exp']
                    meta['obj_id'] = int(exp_dict['obj_id'])
                    meta['frames'] = vid_frames
                    meta['frame_id'] = frame_id
                    # get object category
                    obj_id = exp_dict['obj_id']
                    meta['category'] = vid_meta['objects'][obj_id]['category']
                    self.metas.append(meta)

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            meta = self.metas[idx]  # dict

            video, exp, obj_id, category, frames, frame_id = \
                meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id']
            # clean up the caption
            exp = " ".join(exp.lower().split())
            category_id = category_dict[category]
            vid_len = len(frames)

            num_frames = self.num_frames
            # random sparse sample
            sample_indx = [frame_id]
            if self.num_frames != 1:
                # local sample
                sample_id_before = random.randint(1, 3)
                sample_id_after = random.randint(1, 3)
                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
                sample_indx.extend(local_indx)
                sample_indx = list(set(sample_indx))

                # global sampling
                if num_frames > 3:
                    all_inds = list(range(vid_len))
                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
                    global_n = num_frames - len(sample_indx)
                    if len(global_inds) > global_n:
                        select_id = random.sample(range(len(global_inds)), global_n)
                        for s_id in select_id:
                            sample_indx.append(global_inds[s_id])
                    elif vid_len >= global_n:  # sample long range global frames
                        select_id = random.sample(range(vid_len), global_n)
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
                    else:
                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
            sample_indx.sort()

            # read frames and masks
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for j in range(self.num_frames):
                frame_indx = sample_indx[j]
                frame_name = frames[frame_indx]
                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
                img = Image.open(img_path).convert('RGB')
                mask = Image.open(mask_path).convert('P')

                # create the target
                label = torch.tensor(category_id)
                mask = np.array(mask)
                mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
                if (mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(mask)
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                else:  # some frame didn't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                mask = torch.from_numpy(mask)

                # append
                imgs.append(img)
                labels.append(label)
                masks.append(mask)
                boxes.append(box)

            # transform
            w, h = img.size
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            target = {
                'frames_idx': torch.tensor(sample_indx),  # [T,]
                'labels': labels,                         # [T,]
                'boxes': boxes,                           # [T, 4], xyxy
                'masks': masks,                           # [T, H, W]
                'valid': torch.tensor(valid),             # [T,]
                'caption': exp,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)])
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
.history/datasets/ytvos_ref_20250113130047.py ADDED
@@ -0,0 +1,237 @@
"""
Ref-YoutubeVOS data loader
"""
from pathlib import Path

import torch
from torch.autograd.grad_mode import F
from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

from datasets.categories import ytvos_category_dict as category_dict


class YTVOSDataset(Dataset):
    """
    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
    through the Youtube-VOS referring video object segmentation competition page at:
    https://competitions.codalab.org/competitions/29139
    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
    currently only be done on the competition 'validation' subset using the competition's server, as
    annotations were publicly released only for the 'train' subset of the competition.

    """
    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int):
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        # create video meta data
        self.prepare_metas()

        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
        print('\n')

    def prepare_metas(self):
        # read object information
        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
            subset_metas_by_video = json.load(f)['videos']

        # read expression data
        with open(str(self.ann_file), 'r') as f:
            subset_expressions_by_video = json.load(f)['videos']
        self.videos = list(subset_expressions_by_video.keys())

        self.metas = []
        for vid in self.videos:
            vid_meta = subset_metas_by_video[vid]
            vid_data = subset_expressions_by_video[vid]
            vid_frames = sorted(vid_data['frames'])
            vid_len = len(vid_frames)

            for exp_id, exp_dict in vid_data['expressions'].items():
                # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
                start_idx, end_idx = 2, vid_len - 2
                bin_size = (end_idx - start_idx) // 4

                bins = []
                for i in range(4):
                    bin_start = start_idx + i * bin_size
                    bin_end = bin_start + bin_size if i < 3 else end_idx

                    bins.append((bin_start, bin_end))

                meta = {
                    'video': vid,
                    'exp': exp_dict['exp'],
                    'obj_id': int(exp_dict['obj_id']),
                    'frames': vid_frames,
                    'bins': bins,
                    'category': vid_meta['objects'][int(exp_dict['obj_id'])]['category']
                }
                self.metas.append(meta)

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            meta = self.metas[idx]  # dict

            video, exp, obj_id, category, frames, bins = \
                meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins']

            # clean up the caption
            exp = " ".join(exp.lower().split())
            category_id = category_dict[category]
            vid_len = len(frames)

            # num_frames = self.num_frames

            # Random sample one frame from each bin
            sample_indx = []
            for start_idx, end_idx in bins:
                sample_indx.append(random.randint(start_idx, end_idx - 1))
            sample_indx.sort()  # Ensure indices are in order

            # read frames and masks
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for frame_indx in sample_indx:
                frame_name = frames[frame_indx]
                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
                img = Image.open(img_path).convert('RGB')
                mask = Image.open(mask_path).convert('P')

                # create the target
                label = torch.tensor(category_id)
                mask = np.array(mask)
                mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
                if (mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(mask)
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                else:  # some frame didn't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                mask = torch.from_numpy(mask)

                # append
                imgs.append(img)
                labels.append(label)
                masks.append(mask)
                boxes.append(box)

            # transform
            w, h = img.size
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            target = {
                'frames_idx': torch.tensor(sample_indx),  # [T,]
                'labels': labels,                         # [T,]
                'boxes': boxes,                           # [T, 4], xyxy
                'masks': masks,                           # [T, H, W]
                'valid': torch.tensor(valid),             # [T,]
                'caption': exp,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)])
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
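`prepare_metas` above splits the usable frame range (dropping the first and last two frames) into four equal bins, and `__getitem__` then draws one random frame per bin; below is a standalone sketch of that rule (the helper names, keyword parameters, and the example `vid_len` are illustrative assumptions, with values matching the constants in the file):

import random

def make_bins(vid_len, num_bins=4, margin=2):
    # usable range excludes the first and last `margin` frames
    start_idx, end_idx = margin, vid_len - margin
    bin_size = (end_idx - start_idx) // num_bins
    bins = []
    for i in range(num_bins):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < num_bins - 1 else end_idx
        bins.append((bin_start, bin_end))
    return bins

def sample_one_per_bin(bins):
    # one random frame index per bin, kept in temporal order
    return sorted(random.randint(start, end - 1) for start, end in bins)

bins = make_bins(vid_len=36)     # [(2, 10), (10, 18), (18, 26), (26, 34)]
print(sample_one_per_bin(bins))  # e.g. [5, 13, 21, 30]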
.history/datasets/ytvos_ref_20250113131359.py ADDED
@@ -0,0 +1,239 @@
"""
Ref-YoutubeVOS data loader
"""
from pathlib import Path

import torch
from torch.autograd.grad_mode import F
from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

from datasets.categories import ytvos_category_dict as category_dict


class YTVOSDataset(Dataset):
    """
    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
    through the Youtube-VOS referring video object segmentation competition page at:
    https://competitions.codalab.org/competitions/29139
    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
    currently only be done on the competition 'validation' subset using the competition's server, as
    annotations were publicly released only for the 'train' subset of the competition.

    """
    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int):
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        # create video meta data
        self.prepare_metas()

        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
        print('\n')

    def prepare_metas(self):
        # read object information
        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
            subset_metas_by_video = json.load(f)['videos']

        # read expression data
        with open(str(self.ann_file), 'r') as f:
            subset_expressions_by_video = json.load(f)['videos']
        self.videos = list(subset_expressions_by_video.keys())

        self.metas = []
        for vid in self.videos:
            vid_meta = subset_metas_by_video[vid]
            vid_data = subset_expressions_by_video[vid]
            vid_frames = sorted(vid_data['frames'])
            vid_len = len(vid_frames)

            for exp_id, exp_dict in vid_data['expressions'].items():
                # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
                start_idx, end_idx = 2, vid_len - 2
                bin_size = (end_idx - start_idx) // 4

                bins = []
                for i in range(4):
                    bin_start = start_idx + i * bin_size
                    bin_end = bin_start + bin_size if i < 3 else end_idx

                    bins.append((bin_start, bin_end))

                meta = {
                    'video': vid,
                    'exp': exp_dict['exp'],
                    'obj_id': int(exp_dict['obj_id']),
                    'frames': vid_frames,
                    'bins': bins,
                    'category': vid_meta['objects'][exp_dict['obj_id']]['category']
                }
                self.metas.append(meta)

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            meta = self.metas[idx]  # dict

            video, exp, obj_id, category, frames, bins = \
                meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins']

            # clean up the caption
            exp = " ".join(exp.lower().split())
            category_id = category_dict[category]
            vid_len = len(frames)

            # num_frames = self.num_frames

            # Random sample one frame from each bin
            sample_indx = []
            for start_idx, end_idx in bins:
                sample_indx.append(random.randint(start_idx, end_idx - 1))
            sample_indx.sort()  # Ensure indices are in order

            # read frames and masks
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for frame_indx in sample_indx:
                frame_name = frames[frame_indx]
                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
                img = Image.open(img_path).convert('RGB')
                mask = Image.open(mask_path).convert('P')

                # create the target
                label = torch.tensor(category_id)
                mask = np.array(mask)
                mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
                if (mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(mask)
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                else:  # some frame didn't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                mask = torch.from_numpy(mask)

                # append
                imgs.append(img)
                labels.append(label)
                masks.append(mask)
                boxes.append(box)

            # transform
            w, h = img.size
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            target = {
                'frames_idx': torch.tensor(sample_indx),  # [T,]
                'labels': labels,                         # [T,]
                'boxes': boxes,                           # [T, 4], xyxy
                'masks': masks,                           # [T, H, W]
                'valid': torch.tensor(valid),             # [T,]
                'caption': exp,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)])
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
.history/datasets/ytvos_ref_20250113162825.py ADDED
@@ -0,0 +1,244 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ for vid in self.videos:
61
+ vid_meta = subset_metas_by_video[vid]
62
+ vid_data = subset_expressions_by_video[vid]
63
+ vid_frames = sorted(vid_data['frames'])
64
+ vid_len = len(vid_frames)
65
+
66
+ for exp_id, exp_dict in vid_data['expressions'].items():
67
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
68
+ start_idx , end_idx = 2, vid_len-2
69
+ bin_size = (end_idx - start_idx) // 4
70
+
71
+ bins = []
72
+ for i in range(4):
73
+ bin_start = start_idx + i * bin_size
74
+ bin_end = bin_start + bin_size if i < 3 else end_idx
75
+
76
+ bins.append((bin_start, bin_end))
77
+
78
+ # Random sample one frame from each bin
79
+ sample_indx = []
80
+ for start_idx, end_idx in bins:
81
+ try:
82
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
83
+ except ValueError:
84
+ print(bins)
85
+ sample_indx.sort() # Ensure indices are in order
86
+
87
+
88
+ for frame_id in sample_indx:
89
+ meta = {
90
+ 'video': vid,
91
+ 'exp': exp_dict['exp'],
92
+ 'obj_id': int(exp_dict['obj_id']),
93
+ 'frames': vid_frames,
94
+ 'frame_id' : frame_id,
95
+ 'sample_frames_id' : sample_indx,
96
+ 'bins': bins,
97
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
98
+ }
99
+ self.metas.append(meta)
100
+
101
+
102
+ @staticmethod
103
+ def bounding_box(img):
104
+ rows = np.any(img, axis=1)
105
+ cols = np.any(img, axis=0)
106
+ rmin, rmax = np.where(rows)[0][[0, -1]]
107
+ cmin, cmax = np.where(cols)[0][[0, -1]]
108
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
109
+
110
+ def __len__(self):
111
+ return len(self.metas)
112
+
113
+ def __getitem__(self, idx):
114
+ instance_check = False
115
+ while not instance_check:
116
+ meta = self.metas[idx] # dict
117
+
118
+
119
+ video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
120
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], metas['frame_id'], metas['sample_frames_id'], meta['bins']
121
+
122
+
123
+ # clean up the caption
124
+ exp = " ".join(exp.lower().split())
125
+ category_id = category_dict[category]
126
+ vid_len = len(frames)
127
+
128
+ # num_frames = self.num_frames
129
+
130
+ # read frames and masks
131
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
132
+ for frame_indx in sample_frames_id:
133
+ frame_name = frames[frame_indx]
134
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
135
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
136
+ img = Image.open(img_path).convert('RGB')
137
+ mask = Image.open(mask_path).convert('P')
138
+
139
+ # create the target
140
+ label = torch.tensor(category_id)
141
+ mask = np.array(mask)
142
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
143
+ if (mask > 0).any():
144
+ y1, y2, x1, x2 = self.bounding_box(mask)
145
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
146
+ valid.append(1)
147
+ else: # some frame didn't contain the instance
148
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
149
+ valid.append(0)
150
+ mask = torch.from_numpy(mask)
151
+
152
+ # append
153
+ imgs.append(img)
154
+ labels.append(label)
155
+ masks.append(mask)
156
+ boxes.append(box)
157
+
158
+ # transform
159
+ w, h = img.size
160
+ labels = torch.stack(labels, dim=0)
161
+ boxes = torch.stack(boxes, dim=0)
162
+ boxes[:, 0::2].clamp_(min=0, max=w)
163
+ boxes[:, 1::2].clamp_(min=0, max=h)
164
+ masks = torch.stack(masks, dim=0)
165
+ target = {
166
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
167
+ 'labels': labels, # [T,]
168
+ 'boxes': boxes, # [T, 4], xyxy
169
+ 'masks': masks, # [T, H, W]
170
+ 'valid': torch.tensor(valid), # [T,]
171
+ 'caption': exp,
172
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
173
+ 'size': torch.as_tensor([int(h), int(w)])
174
+ }
175
+
176
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
177
+ if self._transforms:
178
+ imgs, target = self._transforms(imgs, target)
179
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
180
+ else:
181
+ imgs = np.array(imgs)
182
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
183
+
184
+
185
+ # FIXME: handle "valid", since some box may be removed due to random crop
186
+ if torch.any(target['valid'] == 1): # at least one instance
187
+ instance_check = True
188
+ else:
189
+ idx = random.randint(0, self.__len__() - 1)
190
+
191
+ return imgs, target
192
+
193
+
194
+ def make_coco_transforms(image_set, max_size=640):
195
+ normalize = T.Compose([
196
+ T.ToTensor(),
197
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
198
+ ])
199
+
200
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
201
+
202
+ if image_set == 'train':
203
+ return T.Compose([
204
+ T.RandomHorizontalFlip(),
205
+ T.PhotometricDistort(),
206
+ T.RandomSelect(
207
+ T.Compose([
208
+ T.RandomResize(scales, max_size=max_size),
209
+ T.Check(),
210
+ ]),
211
+ T.Compose([
212
+ T.RandomResize([400, 500, 600]),
213
+ T.RandomSizeCrop(384, 600),
214
+ T.RandomResize(scales, max_size=max_size),
215
+ T.Check(),
216
+ ])
217
+ ),
218
+ normalize,
219
+ ])
220
+
221
+ # we do not use the 'val' set since the annotations are inaccessible
222
+ if image_set == 'val':
223
+ return T.Compose([
224
+ T.RandomResize([360], max_size=640),
225
+ normalize,
226
+ ])
227
+
228
+ raise ValueError(f'unknown {image_set}')
229
+
230
+
231
+ def build(image_set, args):
232
+ root = Path(args.ytvos_path)
233
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
234
+ PATHS = {
235
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
236
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
237
+ }
238
+ img_folder, ann_file = PATHS[image_set]
239
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
240
+ # num_frames=args.num_frames, max_skip=args.max_skip)
241
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
242
+ num_frames=args.num_frames, max_skip=args.max_skip)
243
+ return dataset
244
+
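A minimal standalone sketch of the four-bin frame sampling used in prepare_metas above (the helper name sample_four_frames is ours, not part of the repository). It reproduces only the index arithmetic, which also shows where the ValueError guarded by the try/except comes from: bin_size drops to 0 once vid_len < 8, and the later snapshots below skip anything shorter than 11 frames instead.

import random

def sample_four_frames(vid_len: int):
    # Same index arithmetic as prepare_metas: drop the first two and last two frames.
    start_idx, end_idx = 2, vid_len - 2
    bin_size = (end_idx - start_idx) // 4

    bins = []
    for i in range(4):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < 3 else end_idx
        bins.append((bin_start, bin_end))

    # randint(s, e - 1) needs e - 1 >= s, i.e. bin_size >= 1; with bin_size == 0
    # it raises ValueError, the case the except branch above swallows.
    sample_indx = sorted(random.randint(s, e - 1) for s, e in bins)
    return bins, sample_indx

# Example: a 40-frame video is binned over indices [2, 38) and yields four sorted indices.
print(sample_four_frames(40))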
.history/datasets/ytvos_ref_20250113163406.py ADDED
@@ -0,0 +1,250 @@
 
 
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+ for exp_id, exp_dict in vid_data['expressions'].items():
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ for frame_id in sample_indx:
93
+ meta = {
94
+ 'video': vid,
95
+ 'exp': exp_dict['exp'],
96
+ 'obj_id': int(exp_dict['obj_id']),
97
+ 'frames': vid_frames,
98
+ 'frame_id' : frame_id,
99
+ 'sample_frames_id' : sample_indx,
100
+ 'bins': bins,
101
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
102
+ }
103
+ self.metas.append(meta)
104
+
105
+ print(f"skipped {skip_vid_count} short videos")
106
+
107
+
108
+ @staticmethod
109
+ def bounding_box(img):
110
+ rows = np.any(img, axis=1)
111
+ cols = np.any(img, axis=0)
112
+ rmin, rmax = np.where(rows)[0][[0, -1]]
113
+ cmin, cmax = np.where(cols)[0][[0, -1]]
114
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
115
+
116
+ def __len__(self):
117
+ return len(self.metas)
118
+
119
+ def __getitem__(self, idx):
120
+ instance_check = False
121
+ while not instance_check:
122
+ meta = self.metas[idx] # dict
123
+
124
+
125
+ video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
126
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id'], meta['sample_frames_id'], meta['bins']
127
+
128
+
129
+ # clean up the caption
130
+ exp = " ".join(exp.lower().split())
131
+ category_id = category_dict[category]
132
+ vid_len = len(frames)
133
+
134
+ # num_frames = self.num_frames
135
+
136
+ # read frames and masks
137
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
138
+ for frame_indx in sample_frames_id:
139
+ frame_name = frames[frame_indx]
140
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
141
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
142
+ img = Image.open(img_path).convert('RGB')
143
+ mask = Image.open(mask_path).convert('P')
144
+
145
+ # create the target
146
+ label = torch.tensor(category_id)
147
+ mask = np.array(mask)
148
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
149
+ if (mask > 0).any():
150
+ y1, y2, x1, x2 = self.bounding_box(mask)
151
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
152
+ valid.append(1)
153
+ else: # some frame didn't contain the instance
154
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
155
+ valid.append(0)
156
+ mask = torch.from_numpy(mask)
157
+
158
+ # append
159
+ imgs.append(img)
160
+ labels.append(label)
161
+ masks.append(mask)
162
+ boxes.append(box)
163
+
164
+ # transform
165
+ w, h = img.size
166
+ labels = torch.stack(labels, dim=0)
167
+ boxes = torch.stack(boxes, dim=0)
168
+ boxes[:, 0::2].clamp_(min=0, max=w)
169
+ boxes[:, 1::2].clamp_(min=0, max=h)
170
+ masks = torch.stack(masks, dim=0)
171
+ target = {
172
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
173
+ 'labels': labels, # [T,]
174
+ 'boxes': boxes, # [T, 4], xyxy
175
+ 'masks': masks, # [T, H, W]
176
+ 'valid': torch.tensor(valid), # [T,]
177
+ 'caption': exp,
178
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
179
+ 'size': torch.as_tensor([int(h), int(w)])
180
+ }
181
+
182
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
183
+ if self._transforms:
184
+ imgs, target = self._transforms(imgs, target)
185
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
186
+ else:
187
+ imgs = np.array(imgs)
188
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
189
+
190
+
191
+ # FIXME: handle "valid", since some box may be removed due to random crop
192
+ if torch.any(target['valid'] == 1): # at least one instance
193
+ instance_check = True
194
+ else:
195
+ idx = random.randint(0, self.__len__() - 1)
196
+
197
+ return imgs, target
198
+
199
+
200
+ def make_coco_transforms(image_set, max_size=640):
201
+ normalize = T.Compose([
202
+ T.ToTensor(),
203
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
204
+ ])
205
+
206
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
207
+
208
+ if image_set == 'train':
209
+ return T.Compose([
210
+ T.RandomHorizontalFlip(),
211
+ T.PhotometricDistort(),
212
+ T.RandomSelect(
213
+ T.Compose([
214
+ T.RandomResize(scales, max_size=max_size),
215
+ T.Check(),
216
+ ]),
217
+ T.Compose([
218
+ T.RandomResize([400, 500, 600]),
219
+ T.RandomSizeCrop(384, 600),
220
+ T.RandomResize(scales, max_size=max_size),
221
+ T.Check(),
222
+ ])
223
+ ),
224
+ normalize,
225
+ ])
226
+
227
+ # we do not use the 'val' set since the annotations are inaccessible
228
+ if image_set == 'val':
229
+ return T.Compose([
230
+ T.RandomResize([360], max_size=640),
231
+ normalize,
232
+ ])
233
+
234
+ raise ValueError(f'unknown {image_set}')
235
+
236
+
237
+ def build(image_set, args):
238
+ root = Path(args.ytvos_path)
239
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
240
+ PATHS = {
241
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
242
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
243
+ }
244
+ img_folder, ann_file = PATHS[image_set]
245
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
246
+ # num_frames=args.num_frames, max_skip=args.max_skip)
247
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
248
+ num_frames=args.num_frames, max_skip=args.max_skip)
249
+ return dataset
250
+
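A quick standalone check of the bounding_box helper defined in the class above; the toy mask is made up for illustration. It also shows why __getitem__ tests (mask > 0).any() first: on an all-zero mask, np.where returns empty arrays and the [0, -1] indexing would fail.

import numpy as np

def bounding_box(img):
    # verbatim logic from YTVOSDataset.bounding_box
    rows = np.any(img, axis=1)
    cols = np.any(img, axis=0)
    rmin, rmax = np.where(rows)[0][[0, -1]]
    cmin, cmax = np.where(cols)[0][[0, -1]]
    return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

mask = np.zeros((6, 8), dtype=np.float32)
mask[2:5, 3:7] = 1.0                  # a 3x4 blob standing in for one object
y1, y2, x1, x2 = bounding_box(mask)
print(y1, y2, x1, x2)                 # -> 2 4 3 6 (inclusive pixel extents)
# __getitem__ then packs these as an xyxy box: [x1, y1, x2, y2].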
.history/datasets/ytvos_ref_20250113163605.py ADDED
@@ -0,0 +1,250 @@
 
 
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+ for exp_id, exp_dict in vid_data['expressions'].items():
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ for frame_id in sample_indx:
93
+ meta = {
94
+ 'video': vid,
95
+ 'exp': exp_dict['exp'],
96
+ 'obj_id': int(exp_dict['obj_id']),
97
+ 'frames': vid_frames,
98
+ 'frame_id' : frame_id,
99
+ 'sample_frames_id' : sample_indx,
100
+ 'bins': bins,
101
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
102
+ }
103
+ self.metas.append(meta)
104
+
105
+ print(f"skipped {skip_vid_count} short videos")
106
+
107
+
108
+ @staticmethod
109
+ def bounding_box(img):
110
+ rows = np.any(img, axis=1)
111
+ cols = np.any(img, axis=0)
112
+ rmin, rmax = np.where(rows)[0][[0, -1]]
113
+ cmin, cmax = np.where(cols)[0][[0, -1]]
114
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
115
+
116
+ def __len__(self):
117
+ return len(self.metas)
118
+
119
+ def __getitem__(self, idx):
120
+ instance_check = False
121
+ while not instance_check:
122
+ meta = self.metas[idx] # dict
123
+
124
+
125
+ video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
126
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id'], meta['sample_frames_id'], meta['bins']
127
+
128
+
129
+ # clean up the caption
130
+ exp = " ".join(exp.lower().split())
131
+ category_id = category_dict[category]
132
+ vid_len = len(frames)
133
+
134
+ # num_frames = self.num_frames
135
+
136
+ # read frames and masks
137
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
138
+ for frame_indx in sample_frames_id:
139
+ frame_name = frames[frame_indx]
140
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
141
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
142
+ img = Image.open(img_path).convert('RGB')
143
+ mask = Image.open(mask_path).convert('P')
144
+
145
+ # create the target
146
+ label = torch.tensor(category_id)
147
+ mask = np.array(mask)
148
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
149
+ if (mask > 0).any():
150
+ y1, y2, x1, x2 = self.bounding_box(mask)
151
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
152
+ valid.append(1)
153
+ else: # some frame didn't contain the instance
154
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
155
+ valid.append(0)
156
+ mask = torch.from_numpy(mask)
157
+
158
+ # append
159
+ imgs.append(img)
160
+ labels.append(label)
161
+ masks.append(mask)
162
+ boxes.append(box)
163
+
164
+ # transform
165
+ w, h = img.size
166
+ labels = torch.stack(labels, dim=0)
167
+ boxes = torch.stack(boxes, dim=0)
168
+ boxes[:, 0::2].clamp_(min=0, max=w)
169
+ boxes[:, 1::2].clamp_(min=0, max=h)
170
+ masks = torch.stack(masks, dim=0)
171
+ target = {
172
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
173
+ 'labels': labels, # [T,]
174
+ 'boxes': boxes, # [T, 4], xyxy
175
+ 'masks': masks, # [T, H, W]
176
+ 'valid': torch.tensor(valid), # [T,]
177
+ 'caption': exp,
178
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
179
+ 'size': torch.as_tensor([int(h), int(w)])
180
+ }
181
+
182
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
183
+ if self._transforms:
184
+ imgs, target = self._transforms(imgs, target)
185
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
186
+ else:
187
+ imgs = np.array(imgs)
188
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
189
+
190
+
191
+ # FIXME: handle "valid", since some box may be removed due to random crop
192
+ if torch.any(target['valid'] == 1): # at least one instance
193
+ instance_check = True
194
+ else:
195
+ idx = random.randint(0, self.__len__() - 1)
196
+
197
+ return imgs, target
198
+
199
+
200
+ def make_coco_transforms(image_set, max_size=640):
201
+ normalize = T.Compose([
202
+ T.ToTensor(),
203
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
204
+ ])
205
+
206
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
207
+
208
+ if image_set == 'train':
209
+ return T.Compose([
210
+ T.RandomHorizontalFlip(),
211
+ T.PhotometricDistort(),
212
+ T.RandomSelect(
213
+ T.Compose([
214
+ T.RandomResize(scales, max_size=max_size),
215
+ T.Check(),
216
+ ]),
217
+ T.Compose([
218
+ T.RandomResize([400, 500, 600]),
219
+ T.RandomSizeCrop(384, 600),
220
+ T.RandomResize(scales, max_size=max_size),
221
+ T.Check(),
222
+ ])
223
+ ),
224
+ normalize,
225
+ ])
226
+
227
+ # we do not use the 'val' set since the annotations are inaccessible
228
+ if image_set == 'val':
229
+ return T.Compose([
230
+ T.RandomResize([360], max_size=640),
231
+ normalize,
232
+ ])
233
+
234
+ raise ValueError(f'unknown {image_set}')
235
+
236
+
237
+ def build(image_set, args):
238
+ root = Path(args.ytvos_path)
239
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
240
+ PATHS = {
241
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
242
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
243
+ }
244
+ img_folder, ann_file = PATHS[image_set]
245
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
246
+ # num_frames=args.num_frames, max_skip=args.max_skip)
247
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
248
+ num_frames=args.num_frames, max_skip=args.max_skip)
249
+ return dataset
250
+
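Assuming a dataset has already been built from this snapshot with transforms=None (see the build() sketch after the next snapshot), a small shape sanity check for one returned sample; the helper below is illustrative and not part of the repository. T is always 4 here because one frame is drawn from each bin.

def check_one_item(dataset, idx=0):
    imgs, target = dataset[idx]                            # as returned by __getitem__
    assert imgs.shape[0] == 4 and imgs.shape[1] == 3       # [T, 3, H, W] uint8 frames
    assert tuple(target['boxes'].shape) == (4, 4)          # per-frame xyxy boxes in pixels
    assert target['masks'].shape[0] == 4                   # [T, H, W] binary masks
    assert target['valid'].numel() == 4 and target['valid'].any()
    return imgs.shape, target['caption']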
.history/datasets/ytvos_ref_20250113180729.py ADDED
@@ -0,0 +1,250 @@
 
 
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+ for exp_id, exp_dict in vid_data['expressions'].items():
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ for sample_id in sample_indx:
93
+ meta = {
94
+ 'video': vid,
95
+ 'exp': exp_dict['exp'],
96
+ 'obj_id': int(exp_dict['obj_id']),
97
+ 'frames': vid_frames,
98
+ 'sample_id' : sample_id,
99
+ 'sample_frames_id' : sample_indx,
100
+ 'bins': bins,
101
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
102
+ }
103
+ self.metas.append(meta)
104
+
105
+ print(f"skipped {skip_vid_count} short videos")
106
+
107
+
108
+ @staticmethod
109
+ def bounding_box(img):
110
+ rows = np.any(img, axis=1)
111
+ cols = np.any(img, axis=0)
112
+ rmin, rmax = np.where(rows)[0][[0, -1]]
113
+ cmin, cmax = np.where(cols)[0][[0, -1]]
114
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
115
+
116
+ def __len__(self):
117
+ return len(self.metas)
118
+
119
+ def __getitem__(self, idx):
120
+ instance_check = False
121
+ while not instance_check:
122
+ meta = self.metas[idx] # dict
123
+
124
+
125
+ video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
126
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
127
+
128
+
129
+ # clean up the caption
130
+ exp = " ".join(exp.lower().split())
131
+ category_id = category_dict[category]
132
+ vid_len = len(frames)
133
+
134
+ # num_frames = self.num_frames
135
+
136
+ # read frames and masks
137
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
138
+ for frame_indx in sample_frames_id:
139
+ frame_name = frames[frame_indx]
140
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
141
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
142
+ img = Image.open(img_path).convert('RGB')
143
+ mask = Image.open(mask_path).convert('P')
144
+
145
+ # create the target
146
+ label = torch.tensor(category_id)
147
+ mask = np.array(mask)
148
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
149
+ if (mask > 0).any():
150
+ y1, y2, x1, x2 = self.bounding_box(mask)
151
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
152
+ valid.append(1)
153
+ else: # some frame didn't contain the instance
154
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
155
+ valid.append(0)
156
+ mask = torch.from_numpy(mask)
157
+
158
+ # append
159
+ imgs.append(img)
160
+ labels.append(label)
161
+ masks.append(mask)
162
+ boxes.append(box)
163
+
164
+ # transform
165
+ w, h = img.size
166
+ labels = torch.stack(labels, dim=0)
167
+ boxes = torch.stack(boxes, dim=0)
168
+ boxes[:, 0::2].clamp_(min=0, max=w)
169
+ boxes[:, 1::2].clamp_(min=0, max=h)
170
+ masks = torch.stack(masks, dim=0)
171
+ target = {
172
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
173
+ 'labels': labels, # [T,]
174
+ 'boxes': boxes, # [T, 4], xyxy
175
+ 'masks': masks, # [T, H, W]
176
+ 'valid': torch.tensor(valid), # [T,]
177
+ 'caption': exp,
178
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
179
+ 'size': torch.as_tensor([int(h), int(w)])
180
+ }
181
+
182
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
183
+ if self._transforms:
184
+ imgs, target = self._transforms(imgs, target)
185
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
186
+ else:
187
+ imgs = np.array(imgs)
188
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
189
+
190
+
191
+ # FIXME: handle "valid", since some box may be removed due to random crop
192
+ if torch.any(target['valid'] == 1): # at least one instance
193
+ instance_check = True
194
+ else:
195
+ idx = random.randint(0, self.__len__() - 1)
196
+
197
+ return imgs, target
198
+
199
+
200
+ def make_coco_transforms(image_set, max_size=640):
201
+ normalize = T.Compose([
202
+ T.ToTensor(),
203
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
204
+ ])
205
+
206
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
207
+
208
+ if image_set == 'train':
209
+ return T.Compose([
210
+ T.RandomHorizontalFlip(),
211
+ T.PhotometricDistort(),
212
+ T.RandomSelect(
213
+ T.Compose([
214
+ T.RandomResize(scales, max_size=max_size),
215
+ T.Check(),
216
+ ]),
217
+ T.Compose([
218
+ T.RandomResize([400, 500, 600]),
219
+ T.RandomSizeCrop(384, 600),
220
+ T.RandomResize(scales, max_size=max_size),
221
+ T.Check(),
222
+ ])
223
+ ),
224
+ normalize,
225
+ ])
226
+
227
+ # we do not use the 'val' set since the annotations are inaccessible
228
+ if image_set == 'val':
229
+ return T.Compose([
230
+ T.RandomResize([360], max_size=640),
231
+ normalize,
232
+ ])
233
+
234
+ raise ValueError(f'unknown {image_set}')
235
+
236
+
237
+ def build(image_set, args):
238
+ root = Path(args.ytvos_path)
239
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
240
+ PATHS = {
241
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
242
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
243
+ }
244
+ img_folder, ann_file = PATHS[image_set]
245
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
246
+ # num_frames=args.num_frames, max_skip=args.max_skip)
247
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
248
+ num_frames=args.num_frames, max_skip=args.max_skip)
249
+ return dataset
250
+
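A hedged sketch of how the build() function above is typically driven; the argument values are placeholders and the real training script defines these flags itself (only ytvos_path, masks, num_frames and max_skip are read here). The import targets the live datasets/ytvos_ref.py module that these .history snapshots track.

from argparse import Namespace

from datasets.ytvos_ref import build

args = Namespace(
    ytvos_path='/path/to/ref-youtube-vos',   # must contain train/ and meta_expressions/
    masks=True,
    num_frames=4,
    max_skip=3,
)
dataset = build('train', args)   # asserts the path exists, then builds the per-frame metas
print(len(dataset))              # one entry per (video, expression, sampled frame)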
.history/datasets/ytvos_ref_20250114201918.py ADDED
@@ -0,0 +1,253 @@
 
 
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+ print(f"vid_data: {vid_data}")
74
+ print(f"vid_meta: {vid_meta}")
75
+ return
76
+ for exp_id, exp_dict in vid_data['expressions'].items():
77
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
78
+ start_idx , end_idx = 2, vid_len-2
79
+ bin_size = (end_idx - start_idx) // 4
80
+
81
+ bins = []
82
+ for i in range(4):
83
+ bin_start = start_idx + i * bin_size
84
+ bin_end = bin_start + bin_size if i < 3 else end_idx
85
+
86
+ bins.append((bin_start, bin_end))
87
+
88
+ # Random sample one frame from each bin
89
+ sample_indx = []
90
+ for start_idx, end_idx in bins:
91
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
92
+ sample_indx.sort() # Ensure indices are in order
93
+
94
+
95
+ for sample_id in sample_indx:
96
+ meta = {
97
+ 'video': vid,
98
+ 'exp': exp_dict['exp'],
99
+ 'obj_id': int(exp_dict['obj_id']),
100
+ 'frames': vid_frames,
101
+ 'sample_id' : sample_id,
102
+ 'sample_frames_id' : sample_indx,
103
+ 'bins': bins,
104
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
105
+ }
106
+ self.metas.append(meta)
107
+
108
+ print(f"skipped {skip_vid_count} short videos")
109
+
110
+
111
+ @staticmethod
112
+ def bounding_box(img):
113
+ rows = np.any(img, axis=1)
114
+ cols = np.any(img, axis=0)
115
+ rmin, rmax = np.where(rows)[0][[0, -1]]
116
+ cmin, cmax = np.where(cols)[0][[0, -1]]
117
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
118
+
119
+ def __len__(self):
120
+ return len(self.metas)
121
+
122
+ def __getitem__(self, idx):
123
+ instance_check = False
124
+ while not instance_check:
125
+ meta = self.metas[idx] # dict
126
+
127
+
128
+ video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
129
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
130
+
131
+
132
+ # clean up the caption
133
+ exp = " ".join(exp.lower().split())
134
+ category_id = category_dict[category]
135
+ vid_len = len(frames)
136
+
137
+ # num_frames = self.num_frames
138
+
139
+ # read frames and masks
140
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
141
+ for frame_indx in sample_frames_id:
142
+ frame_name = frames[frame_indx]
143
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
144
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
145
+ img = Image.open(img_path).convert('RGB')
146
+ mask = Image.open(mask_path).convert('P')
147
+
148
+ # create the target
149
+ label = torch.tensor(category_id)
150
+ mask = np.array(mask)
151
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
152
+ if (mask > 0).any():
153
+ y1, y2, x1, x2 = self.bounding_box(mask)
154
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
155
+ valid.append(1)
156
+ else: # some frame didn't contain the instance
157
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
158
+ valid.append(0)
159
+ mask = torch.from_numpy(mask)
160
+
161
+ # append
162
+ imgs.append(img)
163
+ labels.append(label)
164
+ masks.append(mask)
165
+ boxes.append(box)
166
+
167
+ # transform
168
+ w, h = img.size
169
+ labels = torch.stack(labels, dim=0)
170
+ boxes = torch.stack(boxes, dim=0)
171
+ boxes[:, 0::2].clamp_(min=0, max=w)
172
+ boxes[:, 1::2].clamp_(min=0, max=h)
173
+ masks = torch.stack(masks, dim=0)
174
+ target = {
175
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
176
+ 'labels': labels, # [T,]
177
+ 'boxes': boxes, # [T, 4], xyxy
178
+ 'masks': masks, # [T, H, W]
179
+ 'valid': torch.tensor(valid), # [T,]
180
+ 'caption': exp,
181
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
182
+ 'size': torch.as_tensor([int(h), int(w)])
183
+ }
184
+
185
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
186
+ if self._transforms:
187
+ imgs, target = self._transforms(imgs, target)
188
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
189
+ else:
190
+ imgs = np.array(imgs)
191
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
192
+
193
+
194
+ # FIXME: handle "valid", since some box may be removed due to random crop
195
+ if torch.any(target['valid'] == 1): # at least one instance
196
+ instance_check = True
197
+ else:
198
+ idx = random.randint(0, self.__len__() - 1)
199
+
200
+ return imgs, target
201
+
202
+
203
+ def make_coco_transforms(image_set, max_size=640):
204
+ normalize = T.Compose([
205
+ T.ToTensor(),
206
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
207
+ ])
208
+
209
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
210
+
211
+ if image_set == 'train':
212
+ return T.Compose([
213
+ T.RandomHorizontalFlip(),
214
+ T.PhotometricDistort(),
215
+ T.RandomSelect(
216
+ T.Compose([
217
+ T.RandomResize(scales, max_size=max_size),
218
+ T.Check(),
219
+ ]),
220
+ T.Compose([
221
+ T.RandomResize([400, 500, 600]),
222
+ T.RandomSizeCrop(384, 600),
223
+ T.RandomResize(scales, max_size=max_size),
224
+ T.Check(),
225
+ ])
226
+ ),
227
+ normalize,
228
+ ])
229
+
230
+ # we do not use the 'val' set since the annotations are inaccessible
231
+ if image_set == 'val':
232
+ return T.Compose([
233
+ T.RandomResize([360], max_size=640),
234
+ normalize,
235
+ ])
236
+
237
+ raise ValueError(f'unknown {image_set}')
238
+
239
+
240
+ def build(image_set, args):
241
+ root = Path(args.ytvos_path)
242
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
243
+ PATHS = {
244
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
245
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
246
+ }
247
+ img_folder, ann_file = PATHS[image_set]
248
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
249
+ # num_frames=args.num_frames, max_skip=args.max_skip)
250
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
251
+ num_frames=args.num_frames, max_skip=args.max_skip)
252
+ return dataset
253
+
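For reference, the commented-out constructor call above is where make_coco_transforms would be wired in. A minimal sketch of applying it directly, following the (imgs, target) calling convention used in __getitem__; pil_frames and target are placeholders for a list of PIL frames and the target dict built there, and the function name is ours.

def augment_clip(pil_frames, target):
    transforms = make_coco_transforms('train', max_size=640)
    imgs, target = transforms(pil_frames, target)    # video-level ops act on the whole clip
    # boxes come back normalized to [0, 1] and in cxcywh, per the comment in __getitem__
    return torch.stack(imgs, dim=0), target          # [T, 3, H, W]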
.history/datasets/ytvos_ref_20250114202502.py ADDED
@@ -0,0 +1,250 @@
 
 
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+ for exp_id, exp_dict in vid_data['expressions'].items():
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ for sample_id in sample_indx:
93
+ meta = {
94
+ 'video': vid,
95
+ 'exp': exp_dict['exp'],
96
+ 'obj_id': int(exp_dict['obj_id']),
97
+ 'frames': vid_frames,
98
+ 'sample_id' : sample_id,
99
+ 'sample_frames_id' : sample_indx,
100
+ 'bins': bins,
101
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
102
+ }
103
+ self.metas.append(meta)
104
+
105
+ print(f"skipped {skip_vid_count} short videos")
106
+
107
+
108
+ @staticmethod
109
+ def bounding_box(img):
110
+ rows = np.any(img, axis=1)
111
+ cols = np.any(img, axis=0)
112
+ rmin, rmax = np.where(rows)[0][[0, -1]]
113
+ cmin, cmax = np.where(cols)[0][[0, -1]]
114
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
115
+
116
+ def __len__(self):
117
+ return len(self.metas)
118
+
119
+ def __getitem__(self, idx):
120
+ instance_check = False
121
+ while not instance_check:
122
+ meta = self.metas[idx] # dict
123
+
124
+
125
+ video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
126
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
127
+
128
+
129
+ # clean up the caption
130
+ exp = " ".join(exp.lower().split())
131
+ category_id = category_dict[category]
132
+ vid_len = len(frames)
133
+
134
+ # num_frames = self.num_frames
135
+
136
+ # read frames and masks
137
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
138
+ for frame_indx in sample_frames_id:
139
+ frame_name = frames[frame_indx]
140
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
141
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
142
+ img = Image.open(img_path).convert('RGB')
143
+ mask = Image.open(mask_path).convert('P')
144
+
145
+ # create the target
146
+ label = torch.tensor(category_id)
147
+ mask = np.array(mask)
148
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
149
+ if (mask > 0).any():
150
+ y1, y2, x1, x2 = self.bounding_box(mask)
151
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
152
+ valid.append(1)
153
+ else: # some frame didn't contain the instance
154
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
155
+ valid.append(0)
156
+ mask = torch.from_numpy(mask)
157
+
158
+ # append
159
+ imgs.append(img)
160
+ labels.append(label)
161
+ masks.append(mask)
162
+ boxes.append(box)
163
+
164
+ # transform
165
+ w, h = img.size
166
+ labels = torch.stack(labels, dim=0)
167
+ boxes = torch.stack(boxes, dim=0)
168
+ boxes[:, 0::2].clamp_(min=0, max=w)
169
+ boxes[:, 1::2].clamp_(min=0, max=h)
170
+ masks = torch.stack(masks, dim=0)
171
+ target = {
172
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
173
+ 'labels': labels, # [T,]
174
+ 'boxes': boxes, # [T, 4], xyxy
175
+ 'masks': masks, # [T, H, W]
176
+ 'valid': torch.tensor(valid), # [T,]
177
+ 'caption': exp,
178
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
179
+ 'size': torch.as_tensor([int(h), int(w)])
180
+ }
181
+
182
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
183
+ if self._transforms:
184
+ imgs, target = self._transforms(imgs, target)
185
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
186
+ else:
187
+ imgs = np.array(imgs)
188
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
189
+
190
+
191
+ # FIXME: handle "valid", since some box may be removed due to random crop
192
+ if torch.any(target['valid'] == 1): # at least one instance
193
+ instance_check = True
194
+ else:
195
+ idx = random.randint(0, self.__len__() - 1)
196
+
197
+ return imgs, target
198
+
199
+
200
+ def make_coco_transforms(image_set, max_size=640):
201
+ normalize = T.Compose([
202
+ T.ToTensor(),
203
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
204
+ ])
205
+
206
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
207
+
208
+ if image_set == 'train':
209
+ return T.Compose([
210
+ T.RandomHorizontalFlip(),
211
+ T.PhotometricDistort(),
212
+ T.RandomSelect(
213
+ T.Compose([
214
+ T.RandomResize(scales, max_size=max_size),
215
+ T.Check(),
216
+ ]),
217
+ T.Compose([
218
+ T.RandomResize([400, 500, 600]),
219
+ T.RandomSizeCrop(384, 600),
220
+ T.RandomResize(scales, max_size=max_size),
221
+ T.Check(),
222
+ ])
223
+ ),
224
+ normalize,
225
+ ])
226
+
227
+ # we do not use the 'val' set since the annotations are inaccessible
228
+ if image_set == 'val':
229
+ return T.Compose([
230
+ T.RandomResize([360], max_size=640),
231
+ normalize,
232
+ ])
233
+
234
+ raise ValueError(f'unknown {image_set}')
235
+
236
+
237
+ def build(image_set, args):
238
+ root = Path(args.ytvos_path)
239
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
240
+ PATHS = {
241
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
242
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
243
+ }
244
+ img_folder, ann_file = PATHS[image_set]
245
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
246
+ # num_frames=args.num_frames, max_skip=args.max_skip)
247
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
248
+ num_frames=args.num_frames, max_skip=args.max_skip)
249
+ return dataset
250
+
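The per-object mask extraction in __getitem__ relies on the Annotations PNGs being palette ('P' mode) images whose pixel values are object ids, with 0 as background; a small self-contained illustration with a synthetic annotation image (the array values are made up):

import numpy as np
from PIL import Image

ids = np.zeros((4, 6), dtype=np.uint8)
ids[1:3, 2:5] = 2                        # object id 2 occupies a small patch
ann = Image.fromarray(ids, mode='P')     # stands in for Annotations/<video>/<frame>.png

obj_id = 2                               # the 'obj_id' stored in the meta dict
mask = (np.array(ann) == obj_id).astype(np.float32)
print(mask.sum())                        # 6.0 pixels of object 2 in this frame
# A frame where the object is absent yields an all-zero mask, which is why
# __getitem__ records valid = 0 and a [0, 0, 0, 0] box for that frame.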
.history/datasets/ytvos_ref_20250114205233.py ADDED
@@ -0,0 +1,252 @@
 
 
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.vid_meta, self.vid_data = self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ return vid_meta, vid_data
69
+
70
+ if vid_len < 11:
71
+ #print(f"Too short video: {vid} with frame length {vid_len}")
72
+ skip_vid_count += 1
73
+ continue
74
+
75
+ for exp_id, exp_dict in vid_data['expressions'].items():
76
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
77
+ start_idx , end_idx = 2, vid_len-2
78
+ bin_size = (end_idx - start_idx) // 4
79
+
80
+ bins = []
81
+ for i in range(4):
82
+ bin_start = start_idx + i * bin_size
83
+ bin_end = bin_start + bin_size if i < 3 else end_idx
84
+
85
+ bins.append((bin_start, bin_end))
86
+
87
+ # Random sample one frame from each bin
88
+ sample_indx = []
89
+ for start_idx, end_idx in bins:
90
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
91
+ sample_indx.sort() # Ensure indices are in order
92
+
93
+
94
+ for sample_id in sample_indx:
95
+ meta = {
96
+ 'video': vid,
97
+ 'exp': exp_dict['exp'],
98
+ 'obj_id': int(exp_dict['obj_id']),
99
+ 'frames': vid_frames,
100
+ 'sample_id' : sample_id,
101
+ 'sample_frames_id' : sample_indx,
102
+ 'bins': bins,
103
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
104
+ }
105
+ self.metas.append(meta)
106
+
107
+ print(f"skipped {skip_vid_count} short videos")
108
+
109
+
110
+ @staticmethod
111
+ def bounding_box(img):
112
+ rows = np.any(img, axis=1)
113
+ cols = np.any(img, axis=0)
114
+ rmin, rmax = np.where(rows)[0][[0, -1]]
115
+ cmin, cmax = np.where(cols)[0][[0, -1]]
116
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
117
+
118
+ def __len__(self):
119
+ return len(self.metas)
120
+
121
+ def __getitem__(self, idx):
122
+ instance_check = False
123
+ while not instance_check:
124
+ meta = self.metas[idx] # dict
125
+
126
+
127
+ video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
128
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
129
+
130
+
131
+ # clean up the caption
132
+ exp = " ".join(exp.lower().split())
133
+ category_id = category_dict[category]
134
+ vid_len = len(frames)
135
+
136
+ # num_frames = self.num_frames
137
+
138
+ # read frames and masks
139
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
140
+ for frame_indx in sample_frames_id:
141
+ frame_name = frames[frame_indx]
142
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
143
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
144
+ img = Image.open(img_path).convert('RGB')
145
+ mask = Image.open(mask_path).convert('P')
146
+
147
+ # create the target
148
+ label = torch.tensor(category_id)
149
+ mask = np.array(mask)
150
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
151
+ if (mask > 0).any():
152
+ y1, y2, x1, x2 = self.bounding_box(mask)
153
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
154
+ valid.append(1)
155
+ else: # some frame didn't contain the instance
156
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
157
+ valid.append(0)
158
+ mask = torch.from_numpy(mask)
159
+
160
+ # append
161
+ imgs.append(img)
162
+ labels.append(label)
163
+ masks.append(mask)
164
+ boxes.append(box)
165
+
166
+ # transform
167
+ w, h = img.size
168
+ labels = torch.stack(labels, dim=0)
169
+ boxes = torch.stack(boxes, dim=0)
170
+ boxes[:, 0::2].clamp_(min=0, max=w)
171
+ boxes[:, 1::2].clamp_(min=0, max=h)
172
+ masks = torch.stack(masks, dim=0)
173
+ target = {
174
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
175
+ 'labels': labels, # [T,]
176
+ 'boxes': boxes, # [T, 4], xyxy
177
+ 'masks': masks, # [T, H, W]
178
+ 'valid': torch.tensor(valid), # [T,]
179
+ 'caption': exp,
180
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
181
+ 'size': torch.as_tensor([int(h), int(w)])
182
+ }
183
+
184
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
185
+ if self._transforms:
186
+ imgs, target = self._transforms(imgs, target)
187
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
188
+ else:
189
+ imgs = np.array(imgs)
190
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
191
+
192
+
193
+ # FIXME: handle "valid", since some box may be removed due to random crop
194
+ if torch.any(target['valid'] == 1): # at least one instance
195
+ instance_check = True
196
+ else:
197
+ idx = random.randint(0, self.__len__() - 1)
198
+
199
+ return imgs, target
200
+
201
+
202
+ def make_coco_transforms(image_set, max_size=640):
203
+ normalize = T.Compose([
204
+ T.ToTensor(),
205
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
206
+ ])
207
+
208
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
209
+
210
+ if image_set == 'train':
211
+ return T.Compose([
212
+ T.RandomHorizontalFlip(),
213
+ T.PhotometricDistort(),
214
+ T.RandomSelect(
215
+ T.Compose([
216
+ T.RandomResize(scales, max_size=max_size),
217
+ T.Check(),
218
+ ]),
219
+ T.Compose([
220
+ T.RandomResize([400, 500, 600]),
221
+ T.RandomSizeCrop(384, 600),
222
+ T.RandomResize(scales, max_size=max_size),
223
+ T.Check(),
224
+ ])
225
+ ),
226
+ normalize,
227
+ ])
228
+
229
+ # we do not use the 'val' set since the annotations are inaccessible
230
+ if image_set == 'val':
231
+ return T.Compose([
232
+ T.RandomResize([360], max_size=640),
233
+ normalize,
234
+ ])
235
+
236
+ raise ValueError(f'unknown {image_set}')
237
+
238
+
239
+ def build(image_set, args):
240
+ root = Path(args.ytvos_path)
241
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
242
+ PATHS = {
243
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
244
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
245
+ }
246
+ img_folder, ann_file = PATHS[image_set]
247
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
248
+ # num_frames=args.num_frames, max_skip=args.max_skip)
249
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
250
+ num_frames=args.num_frames, max_skip=args.max_skip)
251
+ return dataset
252
+
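Note: the prepare_metas logic above drops the first and last two frames, splits the remaining range into four equal bins, and samples one random frame index per bin. A minimal standalone sketch of that rule (hypothetical helper name, illustrative only, not part of the repo):

import random

def sample_one_frame_per_bin(vid_len, num_bins=4, margin=2, seed=None):
    # Exclude the first/last `margin` frames, split the rest into `num_bins`
    # equal bins, and draw one random frame index from each bin.
    rng = random.Random(seed)
    start_idx, end_idx = margin, vid_len - margin
    bin_size = (end_idx - start_idx) // num_bins
    bins = []
    for i in range(num_bins):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < num_bins - 1 else end_idx
        bins.append((bin_start, bin_end))
    sample_indx = sorted(rng.randint(lo, hi - 1) for lo, hi in bins)
    return bins, sample_indx

# e.g. a 30-frame clip -> bins (2, 8), (8, 14), (14, 20), (20, 28) and four sorted indices
print(sample_one_frame_per_bin(30, seed=0))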
.history/datasets/ytvos_ref_20250114210537.py ADDED
@@ -0,0 +1,250 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins
96
+ }
97
+ obj_id_cat = {}
98
+ for exp_id, exp_dict in vid_data['expressions'].items():
99
+ obj_id = exp_dict['obj_id']
100
+ if obj_id not in obj_id_cat:
101
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
102
+ meta['obj_id_cat'] = obj_id_cat
103
+ self.metas.append(meta)
104
+
105
+ print(f"skipped {skip_vid_count} short videos")
106
+
107
+
108
+ @staticmethod
109
+ def bounding_box(img):
110
+ rows = np.any(img, axis=1)
111
+ cols = np.any(img, axis=0)
112
+ rmin, rmax = np.where(rows)[0][[0, -1]]
113
+ cmin, cmax = np.where(cols)[0][[0, -1]]
114
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
115
+
116
+ def __len__(self):
117
+ return len(self.metas)
118
+
119
+ def __getitem__(self, idx):
120
+ instance_check = False
121
+ while not instance_check:
122
+ meta = self.metas[idx] # dict
123
+
124
+
125
+ video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
126
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
127
+
128
+
129
+ # clean up the caption
130
+ exp = " ".join(exp.lower().split())
131
+ category_id = category_dict[category]
132
+ vid_len = len(frames)
133
+
134
+ # num_frames = self.num_frames
135
+
136
+ # read frames and masks
137
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
138
+ for frame_indx in sample_frames_id:
139
+ frame_name = frames[frame_indx]
140
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
141
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
142
+ img = Image.open(img_path).convert('RGB')
143
+ mask = Image.open(mask_path).convert('P')
144
+
145
+ # create the target
146
+ label = torch.tensor(category_id)
147
+ mask = np.array(mask)
148
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
149
+ if (mask > 0).any():
150
+ y1, y2, x1, x2 = self.bounding_box(mask)
151
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
152
+ valid.append(1)
153
+ else: # some frame didn't contain the instance
154
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
155
+ valid.append(0)
156
+ mask = torch.from_numpy(mask)
157
+
158
+ # append
159
+ imgs.append(img)
160
+ labels.append(label)
161
+ masks.append(mask)
162
+ boxes.append(box)
163
+
164
+ # transform
165
+ w, h = img.size
166
+ labels = torch.stack(labels, dim=0)
167
+ boxes = torch.stack(boxes, dim=0)
168
+ boxes[:, 0::2].clamp_(min=0, max=w)
169
+ boxes[:, 1::2].clamp_(min=0, max=h)
170
+ masks = torch.stack(masks, dim=0)
171
+ target = {
172
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
173
+ 'labels': labels, # [T,]
174
+ 'boxes': boxes, # [T, 4], xyxy
175
+ 'masks': masks, # [T, H, W]
176
+ 'valid': torch.tensor(valid), # [T,]
177
+ 'caption': exp,
178
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
179
+ 'size': torch.as_tensor([int(h), int(w)])
180
+ }
181
+
182
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
183
+ if self._transforms:
184
+ imgs, target = self._transforms(imgs, target)
185
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
186
+ else:
187
+ imgs = np.array(imgs)
188
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
189
+
190
+
191
+ # FIXME: handle "valid", since some box may be removed due to random crop
192
+ if torch.any(target['valid'] == 1): # at least one instance
193
+ instance_check = True
194
+ else:
195
+ idx = random.randint(0, self.__len__() - 1)
196
+
197
+ return imgs, target
198
+
199
+
200
+ def make_coco_transforms(image_set, max_size=640):
201
+ normalize = T.Compose([
202
+ T.ToTensor(),
203
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
204
+ ])
205
+
206
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
207
+
208
+ if image_set == 'train':
209
+ return T.Compose([
210
+ T.RandomHorizontalFlip(),
211
+ T.PhotometricDistort(),
212
+ T.RandomSelect(
213
+ T.Compose([
214
+ T.RandomResize(scales, max_size=max_size),
215
+ T.Check(),
216
+ ]),
217
+ T.Compose([
218
+ T.RandomResize([400, 500, 600]),
219
+ T.RandomSizeCrop(384, 600),
220
+ T.RandomResize(scales, max_size=max_size),
221
+ T.Check(),
222
+ ])
223
+ ),
224
+ normalize,
225
+ ])
226
+
227
+ # we do not use the 'val' set since the annotations are inaccessible
228
+ if image_set == 'val':
229
+ return T.Compose([
230
+ T.RandomResize([360], max_size=640),
231
+ normalize,
232
+ ])
233
+
234
+ raise ValueError(f'unknown {image_set}')
235
+
236
+
237
+ def build(image_set, args):
238
+ root = Path(args.ytvos_path)
239
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
240
+ PATHS = {
241
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
242
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
243
+ }
244
+ img_folder, ann_file = PATHS[image_set]
245
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
246
+ # num_frames=args.num_frames, max_skip=args.max_skip)
247
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
248
+ num_frames=args.num_frames, max_skip=args.max_skip)
249
+ return dataset
250
+
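The bounding_box helper in both snapshots derives a tight box from a binary mask via row/column projections; a quick self-contained check of that idea on a toy mask (illustrative values only):

import numpy as np

mask = np.zeros((6, 8), dtype=np.float32)
mask[2:5, 3:7] = 1.0  # object covers rows 2-4 and cols 3-6

rows = np.any(mask, axis=1)
cols = np.any(mask, axis=0)
rmin, rmax = np.where(rows)[0][[0, -1]]
cmin, cmax = np.where(cols)[0][[0, -1]]
print(rmin, rmax, cmin, cmax)  # 2 4 3 6, i.e. (y1, y2, x1, x2)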
.history/make_ref-ytvos/annotate_ref_ytvos_20241227174304.py ADDED
@@ -0,0 +1,288 @@
1
+ from datasets import build_dataset
2
+ import argparse
3
+ import opts
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ from os import path as osp
8
+ import io
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ import regex as re
13
+ import json
14
+
15
+ import cv2
16
+ from PIL import Image
17
+ import torch
18
+ from torchvision.transforms import functional as F
19
+
20
+ from skimage import measure # (pip install scikit-image)
21
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
22
+
23
+ import matplotlib.pyplot as plt
24
+ from matplotlib.collections import PatchCollection
25
+ from matplotlib.patches import Rectangle
26
+
27
+ import ipywidgets as widgets
28
+ from IPython.display import display, clear_output
29
+
30
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
31
+ args = parser.parse_args()
32
+
33
+ #================== Load data ===================
34
+ # Entire dataset
35
+ train_dataset = build_dataset('ytvos', image_set = 'train', args = args)
36
+
37
+ # Metadata for the entire dataset
38
+ metas = train_dataset.metas
39
+
40
+ # Filtered frames
41
+ selected_frames_df = pd.read_json("selected_frames4.jsonl", lines = True)
42
+
43
+ #================== Mask creation functions ===================
44
+ def prepare_mask_for_pil(mask_tensor):
45
+ mask_array = mask_tensor.squeeze(0).cpu().numpy()
46
+ mask_array = (mask_array * 255).astype(np.uint8)
47
+ mask_image = Image.fromarray(mask_array)
48
+ return mask_image
49
+
50
+ def create_sub_masks(mask_image):
51
+ width, height = mask_image.size
52
+
53
+ sub_masks = {}
54
+ for x in range(width):
55
+ for y in range(height):
56
+ # Get the RGB values of the pixel
57
+ pixel = mask_image.getpixel((x, y))
58
+
59
+ # If the pixel is not black...
60
+ if pixel != 0 :
61
+ # Check to see if we've created a sub-mask...
62
+ pixel_str = str(pixel)
63
+ sub_mask = sub_masks.get(pixel_str)
64
+ if sub_mask is None:
65
+ # Create a sub-mask (one bit per pixel) and add to the dictionary
66
+ # Note: we add 1 pixel of padding in each direction
67
+ # because the contours module doesn't handle cases
68
+ # where pixels bleed to the edge of the image
69
+ sub_masks[pixel_str] = Image.new('1', (width+2, height+2))
70
+
71
+ # Set the pixel value to 1 (default is 0), accounting for padding
72
+ sub_masks[pixel_str].putpixel((x+1, y+1), 1)
73
+ return sub_masks
74
+
75
+ #================== Mask annotation function ===================
76
+ def create_sub_mask_annotation(sub_mask, image_id, annotation_id, is_crowd):
77
+ # Find contours (boundary lines) around each sub-mask
78
+ # Note: there could be multiple contours if the object
79
+ # is partially occluded. (E.g. an elephant behind a tree)
80
+ contours = measure.find_contours(sub_mask, 0.5, positive_orientation='low')
81
+
82
+ segmentations = []
83
+ polygons = []
84
+ for contour in contours:
85
+ # Flip from (row, col) representation to (x, y)
86
+ # and subtract the padding pixel
87
+ for i in range(len(contour)):
88
+ row, col = contour[i]
89
+ contour[i] = (col - 1, row - 1)
90
+
91
+ # Make a polygon and simplify it
92
+ poly = Polygon(contour)
93
+ poly = poly.simplify(1.0, preserve_topology=False)
94
+ polygons.append(poly)
95
+ segmentation = np.array(poly.exterior.coords).ravel().tolist()
96
+ segmentations.append(segmentation)
97
+
98
+ # Combine the polygons to calculate the bounding box and area
99
+ multi_poly = MultiPolygon(polygons)
100
+ x, y, max_x, max_y = multi_poly.bounds
101
+ width = max_x - x
102
+ height = max_y - y
103
+ bbox = (x, y, width, height)
104
+ area = multi_poly.area
105
+
106
+ annotation = {
107
+ 'segmentation': segmentations,
108
+ 'iscrowd': is_crowd,
109
+ 'image_id': image_id,
110
+ 'id': annotation_id,
111
+ 'bbox': bbox,
112
+ 'area': area
113
+ }
114
+ return annotation
115
+
116
+ #================== Visualization function ===================
117
+ # annotation dictionary as input
118
+ def showRef(annotation, image_dir, seg_box='seg'):
119
+ ax = plt.gca()
120
+ I = io.imread(osp.join(image_dir, annotation['file_name']))
121
+ ax.imshow(I)
122
+
123
+
124
+ for sid, sent in enumerate(annotation['sentences']):
125
+ print('%s. %s' % (sid + 1, sent))
126
+
127
+ if seg_box == 'seg':
128
+ polygons = []
129
+ color = []
130
+ c = (np.random.random((1, 3)) * 0.6 + 0.4).tolist()[0]
131
+
132
+ if type(annotation['segmentation'][0]) == list:
133
+ # polygon used for refcoco*
134
+ for seg in annotation['segmentation']:
135
+ poly = np.array(seg).reshape((int(len(seg) / 2), 2))
136
+ polygons.append(Polygon(poly))
137
+ color.append(c)
138
+
139
+ p = PatchCollection(polygons,
140
+ facecolors=(221/255, 160/255, 221/255), # light purple
141
+ linewidths=0,
142
+ alpha=0.4)
143
+ ax.add_collection(p)
144
+
145
+ p = PatchCollection(polygons,
146
+ facecolors='none',
147
+ edgecolors=color,
148
+ linewidths=2)
149
+ ax.add_collection(p)
150
+ # else:
151
+ # # mask used for refclef
152
+ # rle = annotation['segmentation']
153
+ # m = mask.decode(rle)
154
+ # img = np.ones((m.shape[0], m.shape[1], 3))
155
+ # color_mask = np.array([2.0, 166.0, 101.0]) / 255
156
+ # for i in range(3):
157
+ # img[:, :, i] = color_mask[i]
158
+ # ax.imshow(np.dstack((img, m * 0.5)))
159
+
160
+ # bounding box
161
+ elif seg_box == 'box':
162
+ bbox = annotation['bbox']
163
+ box_plot = Rectangle((bbox[0], bbox[1]),
164
+ bbox[2],
165
+ bbox[3],
166
+ fill=False,
167
+ edgecolor='green',
168
+ linewidth=3)
169
+ ax.add_patch(box_plot)
170
+
171
+ #================== Function that ties everything together ===================
172
+ def create_dict_from_selected_images(selected_frames_df):
173
+
174
+ image_id = 0
175
+ anno_id = 0
176
+ train_idx = 0
177
+
178
+ with open("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_instances2.jsonl", "w") as f:
179
+
180
+ for selected_idx in range(len(selected_frames_df)):
181
+ selected = selected_frames_df.loc[selected_idx]
182
+ selected_vid_id = selected['video']
183
+ selected_frame_id = selected['frame_id']
184
+
185
+ for obj_id in selected['objects'].keys():
186
+
187
+ selected_exp = selected['objects'][obj_id][0] # caption
188
+ selected_verb = selected['objects'][obj_id][1] # verb
189
+
190
+ train_idx = next(
191
+ idx for idx, meta in enumerate(metas)
192
+ if meta['video'] == selected_vid_id
193
+ and meta['frame_id'] == selected_frame_id
194
+ and meta['obj_id'] == int(obj_id)
195
+ and meta['exp'] == selected_exp
196
+ )
197
+
198
+ train_frames, train_info = train_dataset[train_idx]
199
+
200
+ try:
201
+ valid_frame_loc = train_info['frames_idx'].tolist().index(selected_frame_id) # index of the valid frame
202
+ except ValueError:
203
+ print(f"selected vid id: {selected_vid_id}, metas['frame_id']: {metas[train_idx]['frame_id']}, selected frame id: {selected_frame_id}, train_info['frames_idx']: {train_info['frames_idx'].tolist()}")
204
+
205
+
206
+ frame = train_frames[valid_frame_loc] # the corresponding frame
207
+ frame = F.to_pil_image(frame)
208
+
209
+ image_file_name = f"{selected_vid_id}_{str(selected_frame_id).rjust(5, '0')}"
210
+
211
+ # Save the original frame
212
+ save_dir = Path("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_frames")
213
+ #save_dir.mkdir(exist_ok=True)
214
+ save_path = save_dir / f"{image_file_name}.png"
215
+ #frame.save(save_path)
216
+
217
+ # Category
218
+ label = train_info['labels'][valid_frame_loc].item() #category id
219
+ category_name = metas[train_idx]['category'] #category name
220
+
221
+ # Box info
222
+ box = train_info['boxes'][valid_frame_loc]
223
+
224
+ # Annotation tools ########################################################################
225
+ mask = train_info['masks'][valid_frame_loc]
226
+ # print(mask.shape)
227
+
228
+ # Only to check that the frame and mask match
229
+ # plt.imshow(frame.permute(1, 2, 0))
230
+ # mask_color = np.zeros((*mask.shape, 3), dtype = np.uint8)
231
+ # mask_color[mask == 1] = [255, 0, 0]
232
+ # plt.imshow(mask_color, alpha = 0.5)
233
+ # plt.show()
234
+
235
+
236
+ mask_image = prepare_mask_for_pil(mask)
237
+ sub_masks = create_sub_masks(mask_image)
238
+
239
+ for color, sub_mask in sub_masks.items():
240
+ # print(f"Color: {color}, Sub-mask size: {sub_mask.size}")
241
+ sub_mask_array = np.array(sub_mask, dtype=np.uint8)
242
+ annotation = create_sub_mask_annotation(sub_mask_array, image_id, anno_id, is_crowd = 0)
243
+ anno_id += 1
244
+ image_id += 1
245
+
246
+ # Add the file path
247
+ annotation['file_name'] = f"{image_file_name}.png"
248
+
249
+ # Remove unnecessary fields
250
+ annotation.pop('iscrowd', None)
251
+ annotation.pop('image_id', None)
252
+ annotation.pop('id', None)
253
+
254
+ valid = train_info['valid'][valid_frame_loc]
255
+ orig_size = train_info['orig_size']
256
+ size = train_info['size']
257
+ caption = metas[train_idx]['exp']
258
+
259
+ # Add filename, height, width
260
+ #annotation['file_name'] = save_path
261
+ annotation['height'] = orig_size[0].item()
262
+ annotation['width'] = orig_size[1].item()
263
+
264
+ # Add category id, name, and sentence dictionary
265
+ annotation['label'] = label
266
+ annotation['category_name'] = category_name
267
+ sentence_dict = {
268
+ "tokens" : caption.split(' '),
269
+ "raw" : caption,
270
+ "sent" : re.sub(r'[^A-Za-z0-9\s]+', '', caption.lower())
271
+ }
272
+ annotation['sentences'] = sentence_dict
273
+ ############################################################################################
274
+ # double check for segmentation annotation
275
+ # orig_img_np = draw_polygon_on_image(frame, annotation['segmentation'])
276
+ # plt.imshow(orig_img_np)
277
+ # plt.axis('off')
278
+ # plt.show()
279
+
280
+ # showRef(annotation, save_dir)
281
+ ############################################################################################
282
+
283
+ # Final write
284
+ f.write(json.dumps(annotation) + "\n")
285
+ f.flush()
286
+
287
+ if __name__ == '__main__':
288
+ create_dict_from_selected_images(selected_frames_df)
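The create_sub_mask_annotation function above converts a binary sub-mask into COCO-style polygon segmentations with skimage contours and shapely; a small sanity-check sketch on a synthetic blob (toy data; the 1-pixel padding subtraction of the real pipeline is omitted here):

import numpy as np
from skimage import measure
from shapely.geometry import Polygon, MultiPolygon

sub_mask = np.zeros((12, 12), dtype=np.uint8)
sub_mask[3:9, 4:10] = 1  # a 6x6 square blob surrounded by background

contours = measure.find_contours(sub_mask, 0.5, positive_orientation='low')
# flip each contour from (row, col) to (x, y) and simplify it
polygons = [Polygon(c[:, ::-1]).simplify(1.0, preserve_topology=False) for c in contours]
multi = MultiPolygon(polygons)
x, y, max_x, max_y = multi.bounds
print('bbox:', (x, y, max_x - x, max_y - y), 'area:', multi.area)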
.history/make_ref-ytvos/annotate_ref_ytvos_20250113111315.py ADDED
@@ -0,0 +1,288 @@
1
+ from datasets import build_dataset
2
+ import argparse
3
+ import opts
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ from os import path as osp
8
+ import io
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ import regex as re
13
+ import json
14
+
15
+ import cv2
16
+ from PIL import Image
17
+ import torch
18
+ from torchvision.transforms import functional as F
19
+
20
+ from skimage import measure # (pip install scikit-image)
21
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
22
+
23
+ import matplotlib.pyplot as plt
24
+ from matplotlib.collections import PatchCollection
25
+ from matplotlib.patches import Rectangle
26
+
27
+ import ipywidgets as widgets
28
+ from IPython.display import display, clear_output
29
+
30
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
31
+ args = parser.parse_args()
32
+
33
+ #================== Load data ===================
34
+ # Entire dataset
35
+ train_dataset = build_dataset('ytvos', image_set = 'train', args = args)
36
+
37
+ # Metadata for the entire dataset
38
+ metas = train_dataset.metas
39
+
40
+ # Filtered frames
41
+ selected_frames_df = pd.read_json("selected_frames4.jsonl", lines = True)
42
+
43
+ #================== Mask creation functions ===================
44
+ def prepare_mask_for_pil(mask_tensor):
45
+ mask_array = mask_tensor.squeeze(0).cpu().numpy()
46
+ mask_array = (mask_array * 255).astype(np.uint8)
47
+ mask_image = Image.fromarray(mask_array)
48
+ return mask_image
49
+
50
+ def create_sub_masks(mask_image):
51
+ width, height = mask_image.size
52
+
53
+ sub_masks = {}
54
+ for x in range(width):
55
+ for y in range(height):
56
+ # Get the RGB values of the pixel
57
+ pixel = mask_image.getpixel((x, y))
58
+
59
+ # If the pixel is not black...
60
+ if pixel != 0 :
61
+ # Check to see if we've created a sub-mask...
62
+ pixel_str = str(pixel)
63
+ sub_mask = sub_masks.get(pixel_str)
64
+ if sub_mask is None:
65
+ # Create a sub-mask (one bit per pixel) and add to the dictionary
66
+ # Note: we add 1 pixel of padding in each direction
67
+ # because the contours module doesn't handle cases
68
+ # where pixels bleed to the edge of the image
69
+ sub_masks[pixel_str] = Image.new('1', (width+2, height+2))
70
+
71
+ # Set the pixel value to 1 (default is 0), accounting for padding
72
+ sub_masks[pixel_str].putpixel((x+1, y+1), 1)
73
+ return sub_masks
74
+
75
+ #================== Mask annotation function ===================
76
+ def create_sub_mask_annotation(sub_mask, image_id, annotation_id, is_crowd):
77
+ # Find contours (boundary lines) around each sub-mask
78
+ # Note: there could be multiple contours if the object
79
+ # is partially occluded. (E.g. an elephant behind a tree)
80
+ contours = measure.find_contours(sub_mask, 0.5, positive_orientation='low')
81
+
82
+ segmentations = []
83
+ polygons = []
84
+ for contour in contours:
85
+ # Flip from (row, col) representation to (x, y)
86
+ # and subtract the padding pixel
87
+ for i in range(len(contour)):
88
+ row, col = contour[i]
89
+ contour[i] = (col - 1, row - 1)
90
+
91
+ # Make a polygon and simplify it
92
+ poly = Polygon(contour)
93
+ poly = poly.simplify(1.0, preserve_topology=False)
94
+ polygons.append(poly)
95
+ segmentation = np.array(poly.exterior.coords).ravel().tolist()
96
+ segmentations.append(segmentation)
97
+
98
+ # Combine the polygons to calculate the bounding box and area
99
+ multi_poly = MultiPolygon(polygons)
100
+ x, y, max_x, max_y = multi_poly.bounds
101
+ width = max_x - x
102
+ height = max_y - y
103
+ bbox = (x, y, width, height)
104
+ area = multi_poly.area
105
+
106
+ annotation = {
107
+ 'segmentation': segmentations,
108
+ 'iscrowd': is_crowd,
109
+ 'image_id': image_id,
110
+ 'id': annotation_id,
111
+ 'bbox': bbox,
112
+ 'area': area
113
+ }
114
+ return annotation
115
+
116
+ #================== Visualization function ===================
117
+ # annotation dictionary as input
118
+ def showRef(annotation, image_dir, seg_box='seg'):
119
+ ax = plt.gca()
120
+ I = io.imread(osp.join(image_dir, annotation['file_name']))
121
+ ax.imshow(I)
122
+
123
+
124
+ for sid, sent in enumerate(annotation['sentences']):
125
+ print('%s. %s' % (sid + 1, sent))
126
+
127
+ if seg_box == 'seg':
128
+ polygons = []
129
+ color = []
130
+ c = (np.random.random((1, 3)) * 0.6 + 0.4).tolist()[0]
131
+
132
+ if type(annotation['segmentation'][0]) == list:
133
+ # polygon used for refcoco*
134
+ for seg in annotation['segmentation']:
135
+ poly = np.array(seg).reshape((int(len(seg) / 2), 2))
136
+ polygons.append(Polygon(poly))
137
+ color.append(c)
138
+
139
+ p = PatchCollection(polygons,
140
+ facecolors=(221/255, 160/255, 221/255), # light purple
141
+ linewidths=0,
142
+ alpha=0.4)
143
+ ax.add_collection(p)
144
+
145
+ p = PatchCollection(polygons,
146
+ facecolors='none',
147
+ edgecolors=color,
148
+ linewidths=2)
149
+ ax.add_collection(p)
150
+ # else:
151
+ # # mask used for refclef
152
+ # rle = annotation['segmentation']
153
+ # m = mask.decode(rle)
154
+ # img = np.ones((m.shape[0], m.shape[1], 3))
155
+ # color_mask = np.array([2.0, 166.0, 101.0]) / 255
156
+ # for i in range(3):
157
+ # img[:, :, i] = color_mask[i]
158
+ # ax.imshow(np.dstack((img, m * 0.5)))
159
+
160
+ # bounding box
161
+ elif seg_box == 'box':
162
+ bbox = annotation['bbox']
163
+ box_plot = Rectangle((bbox[0], bbox[1]),
164
+ bbox[2],
165
+ bbox[3],
166
+ fill=False,
167
+ edgecolor='green',
168
+ linewidth=3)
169
+ ax.add_patch(box_plot)
170
+
171
+ #================== Function that ties everything together ===================
172
+ def create_dict_from_selected_images(selected_frames_df):
173
+
174
+ image_id = 0
175
+ anno_id = 0
176
+ train_idx = 0
177
+
178
+ with open("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_instances2.jsonl", "w") as f:
179
+
180
+ for selected_idx in range(len(selected_frames_df)):
181
+ selected = selected_frames_df.loc[selected_idx]
182
+ selected_vid_id = selected['video']
183
+ selected_frame_id = selected['frame_id']
184
+
185
+ for obj_id in selected['objects'].keys():
186
+
187
+ selected_exp = selected['objects'][obj_id][0] # caption
188
+ selected_verb = selected['objects'][obj_id][1] # verb
189
+
190
+ train_idx = next(
191
+ idx for idx, meta in enumerate(metas)
192
+ if meta['video'] == selected_vid_id
193
+ and meta['frame_id'] == selected_frame_id
194
+ and meta['obj_id'] == int(obj_id)
195
+ and meta['exp'] == selected_exp
196
+ )
197
+
198
+ train_frames, train_info = train_dataset[train_idx]
199
+
200
+ try:
201
+ valid_frame_loc = train_info['frames_idx'].tolist().index(selected_frame_id) # index of the valid frame
202
+ except ValueError:
203
+ print(f"selected vid id: {selected_vid_id}, metas['frame_id']: {metas[train_idx]['frame_id']}, selected frame id: {selected_frame_id}, train_info['frames_idx']: {train_info['frames_idx'].tolist()}")
204
+
205
+
206
+ frame = train_frames[valid_frame_loc] # the corresponding frame
207
+ frame = F.to_pil_image(frame)
208
+
209
+ image_file_name = f"{selected_vid_id}_{str(selected_frame_id).rjust(5, '0')}"
210
+
211
+ # Save the original frame
212
+ save_dir = Path("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_frames")
213
+ #save_dir.mkdir(exist_ok=True)
214
+ save_path = save_dir / f"{image_file_name}.png"
215
+ #frame.save(save_path)
216
+
217
+ # Category
218
+ label = train_info['labels'][valid_frame_loc].item() #category id
219
+ category_name = metas[train_idx]['category'] #category name
220
+
221
+ # Box info
222
+ box = train_info['boxes'][valid_frame_loc]
223
+
224
+ # Annotation tools ########################################################################
225
+ mask = train_info['masks'][valid_frame_loc]
226
+ # print(mask.shape)
227
+
228
+ # Only to check that the frame and mask match
229
+ # plt.imshow(frame.permute(1, 2, 0))
230
+ # mask_color = np.zeros((*mask.shape, 3), dtype = np.uint8)
231
+ # mask_color[mask == 1] = [255, 0, 0]
232
+ # plt.imshow(mask_color, alpha = 0.5)
233
+ # plt.show()
234
+
235
+
236
+ mask_image = prepare_mask_for_pil(mask)
237
+ sub_masks = create_sub_masks(mask_image)
238
+
239
+ for color, sub_mask in sub_masks.items():
240
+ # print(f"Color: {color}, Sub-mask size: {sub_mask.size}")
241
+ sub_mask_array = np.array(sub_mask, dtype=np.uint8)
242
+ annotation = create_sub_mask_annotation(sub_mask_array, image_id, anno_id, is_crowd = 0)
243
+ anno_id += 1
244
+ image_id += 1
245
+
246
+ # Add the file path
247
+ annotation['file_name'] = f"{image_file_name}.png"
248
+
249
+ # Remove unnecessary fields
250
+ annotation.pop('iscrowd', None)
251
+ annotation.pop('image_id', None)
252
+ annotation.pop('id', None)
253
+
254
+ valid = train_info['valid'][valid_frame_loc]
255
+ orig_size = train_info['orig_size']
256
+ size = train_info['size']
257
+ caption = metas[train_idx]['exp']
258
+
259
+ # Add filename, height, width
260
+ #annotation['file_name'] = save_path
261
+ annotation['height'] = orig_size[0].item()
262
+ annotation['width'] = orig_size[1].item()
263
+
264
+ # Add category id, name, and sentence dictionary
265
+ annotation['label'] = label
266
+ annotation['category_name'] = category_name
267
+ sentence_dict = {
268
+ "tokens" : caption.split(' '),
269
+ "raw" : caption,
270
+ "sent" : re.sub(r'[^A-Za-z0-9\s]+', '', caption.lower())
271
+ }
272
+ annotation['sentences'] = sentence_dict
273
+ ############################################################################################
274
+ # double check for segmentation annotation
275
+ # orig_img_np = draw_polygon_on_image(frame, annotation['segmentation'])
276
+ # plt.imshow(orig_img_np)
277
+ # plt.axis('off')
278
+ # plt.show()
279
+
280
+ # showRef(annotation, save_dir)
281
+ ############################################################################################
282
+
283
+ # Final write
284
+ f.write(json.dumps(annotation) + "\n")
285
+ f.flush()
286
+
287
+ # if __name__ == '__main__':
288
+ # create_dict_from_selected_images(selected_frames_df)
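The sentences entry written by both annotate scripts keeps the raw caption, its whitespace tokens, and a lower-cased copy stripped of punctuation; a tiny illustration of that normalization on a made-up caption (stdlib re behaves the same as the regex package here):

import re

caption = "A man, riding a bike!"
sentence_dict = {
    "tokens": caption.split(' '),
    "raw": caption,
    "sent": re.sub(r'[^A-Za-z0-9\s]+', '', caption.lower()),
}
print(sentence_dict["sent"])  # a man riding a bike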
davis2017/utils.py ADDED
@@ -0,0 +1,174 @@
1
+ import os
2
+ import errno
3
+ import numpy as np
4
+ from PIL import Image
5
+ import warnings
6
+ from davis2017.davis import DAVIS
7
+
8
+
9
+ def _pascal_color_map(N=256, normalized=False):
10
+ """
11
+ Python implementation of the color map function for the PASCAL VOC data set.
12
+ Official Matlab version can be found in the PASCAL VOC devkit
13
+ http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html#devkit
14
+ """
15
+
16
+ def bitget(byteval, idx):
17
+ return (byteval & (1 << idx)) != 0
18
+
19
+ dtype = 'float32' if normalized else 'uint8'
20
+ cmap = np.zeros((N, 3), dtype=dtype)
21
+ for i in range(N):
22
+ r = g = b = 0
23
+ c = i
24
+ for j in range(8):
25
+ r = r | (bitget(c, 0) << 7 - j)
26
+ g = g | (bitget(c, 1) << 7 - j)
27
+ b = b | (bitget(c, 2) << 7 - j)
28
+ c = c >> 3
29
+
30
+ cmap[i] = np.array([r, g, b])
31
+
32
+ cmap = cmap / 255 if normalized else cmap
33
+ return cmap
34
+
35
+
36
+ def overlay_semantic_mask(im, ann, alpha=0.5, colors=None, contour_thickness=None):
37
+ im, ann = np.asarray(im, dtype=np.uint8), np.asarray(ann, dtype=int)  # np.int was removed in NumPy 1.24+
38
+ if im.shape[:-1] != ann.shape:
39
+ raise ValueError('First two dimensions of `im` and `ann` must match')
40
+ if im.shape[-1] != 3:
41
+ raise ValueError('im must have three channels at the 3rd dimension')
42
+
43
+ colors = colors or _pascal_color_map()
44
+ colors = np.asarray(colors, dtype=np.uint8)
45
+
46
+ mask = colors[ann]
47
+ fg = im * alpha + (1 - alpha) * mask
48
+
49
+ img = im.copy()
50
+ img[ann > 0] = fg[ann > 0]
51
+
52
+ if contour_thickness: # pragma: no cover
53
+ import cv2
54
+ for obj_id in np.unique(ann[ann > 0]):
55
+ contours = cv2.findContours((ann == obj_id).astype(
56
+ np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2:]
57
+ cv2.drawContours(img, contours[0], -1, colors[obj_id].tolist(),
58
+ contour_thickness)
59
+ return img
60
+
61
+
62
+ def generate_obj_proposals(davis_root, subset, num_proposals, save_path):
63
+ dataset = DAVIS(davis_root, subset=subset, codalab=True)
64
+ for seq in dataset.get_sequences():
65
+ save_dir = os.path.join(save_path, seq)
66
+ if os.path.exists(save_dir):
67
+ continue
68
+ all_gt_masks, all_masks_id = dataset.get_all_masks(seq, True)
69
+ img_size = all_gt_masks.shape[2:]
70
+ num_rows = int(np.ceil(np.sqrt(num_proposals)))
71
+ proposals = np.zeros((num_proposals, len(all_masks_id), *img_size))
72
+ height_slices = np.floor(np.arange(0, img_size[0] + 1, img_size[0]/num_rows)).astype(np.uint).tolist()
73
+ width_slices = np.floor(np.arange(0, img_size[1] + 1, img_size[1]/num_rows)).astype(np.uint).tolist()
74
+ ii = 0
75
+ prev_h, prev_w = 0, 0
76
+ for h in height_slices[1:]:
77
+ for w in width_slices[1:]:
78
+ proposals[ii, :, prev_h:h, prev_w:w] = 1
79
+ prev_w = w
80
+ ii += 1
81
+ if ii == num_proposals:
82
+ break
83
+ prev_h, prev_w = h, 0
84
+ if ii == num_proposals:
85
+ break
86
+
87
+ os.makedirs(save_dir, exist_ok=True)
88
+ for i, mask_id in enumerate(all_masks_id):
89
+ mask = np.sum(proposals[:, i, ...] * np.arange(1, proposals.shape[0] + 1)[:, None, None], axis=0)
90
+ save_mask(mask, os.path.join(save_dir, f'{mask_id}.png'))
91
+
92
+
93
+ def generate_random_permutation_gt_obj_proposals(davis_root, subset, save_path):
94
+ dataset = DAVIS(davis_root, subset=subset, codalab=True)
95
+ for seq in dataset.get_sequences():
96
+ gt_masks, all_masks_id = dataset.get_all_masks(seq, True)
97
+ obj_swap = np.random.permutation(np.arange(gt_masks.shape[0]))
98
+ gt_masks = gt_masks[obj_swap, ...]
99
+ save_dir = os.path.join(save_path, seq)
100
+ os.makedirs(save_dir, exist_ok=True)
101
+ for i, mask_id in enumerate(all_masks_id):
102
+ mask = np.sum(gt_masks[:, i, ...] * np.arange(1, gt_masks.shape[0] + 1)[:, None, None], axis=0)
103
+ save_mask(mask, os.path.join(save_dir, f'{mask_id}.png'))
104
+
105
+
106
+ def color_map(N=256, normalized=False):
107
+ def bitget(byteval, idx):
108
+ return ((byteval & (1 << idx)) != 0)
109
+
110
+ dtype = 'float32' if normalized else 'uint8'
111
+ cmap = np.zeros((N, 3), dtype=dtype)
112
+ for i in range(N):
113
+ r = g = b = 0
114
+ c = i
115
+ for j in range(8):
116
+ r = r | (bitget(c, 0) << 7-j)
117
+ g = g | (bitget(c, 1) << 7-j)
118
+ b = b | (bitget(c, 2) << 7-j)
119
+ c = c >> 3
120
+
121
+ cmap[i] = np.array([r, g, b])
122
+
123
+ cmap = cmap/255 if normalized else cmap
124
+ return cmap
125
+
126
+
127
+ def save_mask(mask, img_path):
128
+ if np.max(mask) > 255:
129
+ raise ValueError('Maximum id pixel value is 255')
130
+ mask_img = Image.fromarray(mask.astype(np.uint8))
131
+ mask_img.putpalette(color_map().flatten().tolist())
132
+ mask_img.save(img_path)
133
+
134
+
135
+ def db_statistics(per_frame_values):
136
+ """ Compute mean,recall and decay from per-frame evaluation.
137
+ Arguments:
138
+ per_frame_values (ndarray): per-frame evaluation
139
+
140
+ Returns:
141
+ M,O,D (float,float,float):
142
+ return evaluation statistics: mean,recall,decay.
143
+ """
144
+
145
+ # strip off nan values
146
+ with warnings.catch_warnings():
147
+ warnings.simplefilter("ignore", category=RuntimeWarning)
148
+ M = np.nanmean(per_frame_values)
149
+ O = np.nanmean(per_frame_values > 0.5)
150
+
151
+ N_bins = 4
152
+ ids = np.round(np.linspace(1, len(per_frame_values), N_bins + 1) + 1e-10) - 1
153
+ ids = ids.astype(np.uint8)
154
+
155
+ D_bins = [per_frame_values[ids[i]:ids[i + 1] + 1] for i in range(0, 4)]
156
+
157
+ with warnings.catch_warnings():
158
+ warnings.simplefilter("ignore", category=RuntimeWarning)
159
+ D = np.nanmean(D_bins[0]) - np.nanmean(D_bins[3])
160
+
161
+ return M, O, D
162
+
163
+
164
+ def list_files(dir, extension=".png"):
165
+ return [os.path.splitext(file_)[0] for file_ in os.listdir(dir) if file_.endswith(extension)]
166
+
167
+
168
+ def force_symlink(file1, file2):
169
+ try:
170
+ os.symlink(file1, file2)
171
+ except OSError as e:
172
+ if e.errno == errno.EEXIST:
173
+ os.remove(file2)
174
+ os.symlink(file1, file2)
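A small usage sketch of db_statistics above on a synthetic per-frame J curve (toy numbers; assumes the davis2017 package defined here is importable from the working directory):

import numpy as np
from davis2017.utils import db_statistics

per_frame_j = np.linspace(0.9, 0.4, num=20)  # quality decaying over 20 frames
M, O, D = db_statistics(per_frame_j)         # mean, recall (fraction > 0.5), decay
print(f"mean={M:.3f} recall={O:.3f} decay={D:.3f}")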
inference_davis.py ADDED
@@ -0,0 +1,330 @@
1
+ '''
2
+ Inference code for ReferFormer, on Ref-DAVIS 2017
3
+ Modified from DETR (https://github.com/facebookresearch/detr)
4
+ Ref-DAVIS17 does not support visualization
5
+ '''
6
+ import argparse
7
+ import json
8
+ import random
9
+ import time
10
+ from pathlib import Path
11
+
12
+ import numpy as np
13
+ import torch
14
+
15
+
16
+ import util.misc as utils
17
+ from models import build_model
18
+ import torchvision.transforms as T
19
+ import matplotlib.pyplot as plt
20
+ import os
21
+ import cv2
22
+ from PIL import Image, ImageDraw
23
+ import math
24
+ import torch.nn.functional as F
25
+ import json
26
+
27
+
28
+ import opts
29
+ from tqdm import tqdm
30
+
31
+ import multiprocessing as mp
32
+ import threading
33
+
34
+ from tools.colormap import colormap
35
+
36
+
37
+ # colormap
38
+ color_list = colormap()
39
+ color_list = color_list.astype('uint8').tolist()
40
+
41
+ # build transform
42
+ transform = T.Compose([
43
+ T.Resize(360),
44
+ T.ToTensor(),
45
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
46
+ ])
47
+
48
+
49
+ def main(args):
50
+ args.dataset_file = "davis"
51
+ args.masks = True
52
+ args.batch_size = 1
53
+ print("Inference only supports batch size = 1")
54
+ print(args)
55
+
56
+ # fix the seed for reproducibility
57
+ seed = args.seed + utils.get_rank()
58
+ torch.manual_seed(seed)
59
+ np.random.seed(seed)
60
+ random.seed(seed)
61
+
62
+ split = args.split
63
+ # save path
64
+ output_dir = args.output_dir
65
+ save_path_prefix = os.path.join(output_dir, split)
66
+ if not os.path.exists(save_path_prefix):
67
+ os.makedirs(save_path_prefix)
68
+
69
+ save_visualize_path_prefix = os.path.join(output_dir, split + '_images')
70
+ if args.visualize:
71
+ if not os.path.exists(save_visualize_path_prefix):
72
+ os.makedirs(save_visualize_path_prefix)
73
+
74
+ # load data
75
+ root = Path(args.davis_path) # data/ref-davis
76
+ img_folder = os.path.join(root, split, "JPEGImages")
77
+ meta_file = os.path.join(root, "meta_expressions", split, "meta_expressions.json")
78
+ with open(meta_file, "r") as f:
79
+ data = json.load(f)["videos"]
80
+ video_list = list(data.keys())
81
+
82
+ # create subprocess
83
+ thread_num = args.ngpu
84
+ global result_dict
85
+ result_dict = mp.Manager().dict()
86
+
87
+ processes = []
88
+ lock = threading.Lock()
89
+
90
+ video_num = len(video_list)
91
+ per_thread_video_num = math.ceil(float(video_num) / float(thread_num))
92
+
93
+ start_time = time.time()
94
+ print('Start inference')
95
+ for i in range(thread_num):
96
+ if i == thread_num - 1:
97
+ sub_video_list = video_list[i * per_thread_video_num:]
98
+ else:
99
+ sub_video_list = video_list[i * per_thread_video_num: (i + 1) * per_thread_video_num]
100
+ p = mp.Process(target=sub_processor, args=(lock, i, args, data,
101
+ save_path_prefix, save_visualize_path_prefix,
102
+ img_folder, sub_video_list))
103
+ p.start()
104
+ processes.append(p)
105
+
106
+ for p in processes:
107
+ p.join()
108
+
109
+ end_time = time.time()
110
+ total_time = end_time - start_time
111
+
112
+ result_dict = dict(result_dict)
113
+ num_all_frames_gpus = 0
114
+ for pid, num_all_frames in result_dict.items():
115
+ num_all_frames_gpus += num_all_frames
116
+
117
+ print("Total inference time: %.4f s" %(total_time))
118
+
119
+
120
+ def sub_processor(lock, pid, args, data, save_path_prefix, save_visualize_path_prefix, img_folder, video_list):
121
+ text = 'processor %d' % pid
122
+ with lock:
123
+ progress = tqdm(
124
+ total=len(video_list),
125
+ position=pid,
126
+ desc=text,
127
+ ncols=0
128
+ )
129
+ torch.cuda.set_device(pid)
130
+
131
+ # model
132
+ model, criterion, _ = build_model(args)
133
+ device = args.device
134
+ model.to(device)
135
+
136
+ model_without_ddp = model
137
+ n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
138
+
139
+ if pid == 0:
140
+ print('number of params:', n_parameters)
141
+
142
+ if args.resume:
143
+ checkpoint = torch.load(args.resume, map_location='cpu')
144
+ missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
145
+ unexpected_keys = [k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops'))]
146
+ if len(missing_keys) > 0:
147
+ print('Missing Keys: {}'.format(missing_keys))
148
+ if len(unexpected_keys) > 0:
149
+ print('Unexpected Keys: {}'.format(unexpected_keys))
150
+ else:
151
+ raise ValueError('Please specify the checkpoint for inference.')
152
+
153
+ # get palette
154
+ palette_img = os.path.join(args.davis_path, "valid/Annotations/blackswan/00000.png")
155
+ palette = Image.open(palette_img).getpalette()
156
+
157
+ # start inference
158
+ num_all_frames = 0
159
+ model.eval()
160
+
161
+ # 1. for each video
162
+ for video in video_list:
163
+ metas = []
164
+
165
+ expressions = data[video]["expressions"]
166
+ expression_list = list(expressions.keys())
167
+ num_expressions = len(expression_list)
168
+ video_len = len(data[video]["frames"])
169
+
170
+ # read all the anno meta
171
+ for i in range(num_expressions):
172
+ meta = {}
173
+ meta["video"] = video
174
+ meta["exp"] = expressions[expression_list[i]]["exp"]
175
+ meta["exp_id"] = expression_list[i] # start from 0
176
+ meta["frames"] = data[video]["frames"]
177
+ metas.append(meta)
178
+ meta = metas
179
+
180
+ # since there are 4 annotations
181
+ num_obj = num_expressions // 4
182
+
183
+ # 2. for each annotator
184
+ for anno_id in range(4): # 4 annotators
185
+ anno_logits = []
186
+ anno_masks = [] # [num_obj+1, video_len, h, w], +1 for background
187
+
188
+ for obj_id in range(num_obj):
189
+ i = obj_id * 4 + anno_id
190
+ video_name = meta[i]["video"]
191
+ exp = meta[i]["exp"]
192
+ exp_id = meta[i]["exp_id"]
193
+ frames = meta[i]["frames"]
194
+
195
+ video_len = len(frames)
196
+ # NOTE: the im2col_step for MSDeformAttention is set as 64
197
+ # so the max length for a clip is 64
198
+ # store the video pred results
199
+ all_pred_logits = []
200
+ all_pred_masks = []
201
+
202
+ # 3. for each clip
203
+ for clip_id in range(0, video_len, 36):
204
+ frames_ids = [x for x in range(video_len)]
205
+ clip_frames_ids = frames_ids[clip_id : clip_id + 36]
206
+ clip_len = len(clip_frames_ids)
207
+
208
+ # load the clip images
209
+ imgs = []
210
+ for t in clip_frames_ids:
211
+ frame = frames[t]
212
+ img_path = os.path.join(img_folder, video_name, frame + ".jpg")
213
+ img = Image.open(img_path).convert('RGB')
214
+ origin_w, origin_h = img.size
215
+ imgs.append(transform(img)) # list[Img]
216
+
217
+ imgs = torch.stack(imgs, dim=0).to(args.device) # [video_len, 3, H, W]
218
+ img_h, img_w = imgs.shape[-2:]
219
+ size = torch.as_tensor([int(img_h), int(img_w)]).to(args.device)
220
+ target = {"size": size}
221
+
222
+ with torch.no_grad():
223
+ outputs = model([imgs], [exp], [target])
224
+
225
+ pred_logits = outputs["pred_logits"][0] # [t, q, k]
226
+ pred_masks = outputs["pred_masks"][0] # [t, q, h, w]
227
+
228
+ # according to pred_logits, select the query index
229
+ pred_scores = pred_logits.sigmoid() # [t, q, k]
230
+ pred_scores = pred_scores.mean(0) # [q, K]
231
+ max_scores, _ = pred_scores.max(-1) # [q,]
232
+ _, max_ind = max_scores.max(-1) # [1,]
233
+ max_inds = max_ind.repeat(clip_len)
234
+ pred_masks = pred_masks[range(clip_len), max_inds, ...] # [t, h, w]
235
+ pred_masks = pred_masks.unsqueeze(0)
236
+
237
+ pred_masks = F.interpolate(pred_masks, size=(origin_h, origin_w), mode='bilinear', align_corners=False)
238
+ pred_masks = pred_masks.sigmoid()[0] # [t, h, w], NOTE: here mask is score
239
+
240
+ # store the clip results
241
+ pred_logits = pred_logits[range(clip_len), max_inds] # [t, k]
242
+ all_pred_logits.append(pred_logits)
243
+ all_pred_masks.append(pred_masks)
244
+
245
+ all_pred_logits = torch.cat(all_pred_logits, dim=0) # (video_len, K)
246
+ all_pred_masks = torch.cat(all_pred_masks, dim=0) # (video_len, h, w)
247
+ anno_logits.append(all_pred_logits)
248
+ anno_masks.append(all_pred_masks)
249
+
250
+ # handle a complete image (all objects of an annotator)
251
+ anno_logits = torch.stack(anno_logits) # [num_obj, video_len, k]
252
+ anno_masks = torch.stack(anno_masks) # [num_obj, video_len, h, w]
253
+ t, h, w = anno_masks.shape[-3:]
254
+ anno_masks[anno_masks < 0.5] = 0.0
255
+ background = 0.1 * torch.ones(1, t, h, w).to(args.device)
256
+ anno_masks = torch.cat([background, anno_masks], dim=0) # [num_obj+1, video_len, h, w]
257
+ out_masks = torch.argmax(anno_masks, dim=0) # int, the value indicate which object, [video_len, h, w]
258
+
259
+ out_masks = out_masks.detach().cpu().numpy().astype(np.uint8) # [video_len, h, w]
260
+
261
+ # save results
262
+ anno_save_path = os.path.join(save_path_prefix, f"anno_{anno_id}", video)
263
+ if not os.path.exists(anno_save_path):
264
+ os.makedirs(anno_save_path)
265
+ for f in range(out_masks.shape[0]):
266
+ img_E = Image.fromarray(out_masks[f])
267
+ img_E.putpalette(palette)
268
+ img_E.save(os.path.join(anno_save_path, '{:05d}.png'.format(f)))
269
+
270
+
271
+ with lock:
272
+ progress.update(1)
273
+ result_dict[str(pid)] = num_all_frames
274
+ with lock:
275
+ progress.close()
276
+
277
+
278
+
279
+ # Post-process functions
280
+ def box_cxcywh_to_xyxy(x):
281
+ x_c, y_c, w, h = x.unbind(1)
282
+ b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
283
+ (x_c + 0.5 * w), (y_c + 0.5 * h)]
284
+ return torch.stack(b, dim=1)
285
+
286
+ def rescale_bboxes(out_bbox, size):
287
+ img_w, img_h = size
288
+ b = box_cxcywh_to_xyxy(out_bbox)
289
+ b = b.cpu() * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
290
+ return b
291
+
292
+
293
+ # Visualization functions
294
+ def draw_reference_points(draw, reference_points, img_size, color):
295
+ W, H = img_size
296
+ for i, ref_point in enumerate(reference_points):
297
+ init_x, init_y = ref_point
298
+ x, y = W * init_x, H * init_y
299
+ cur_color = color
300
+ draw.line((x-10, y, x+10, y), tuple(cur_color), width=4)
301
+ draw.line((x, y-10, x, y+10), tuple(cur_color), width=4)
302
+
303
+ def draw_sample_points(draw, sample_points, img_size, color_list):
304
+ alpha = 255
305
+ for i, samples in enumerate(sample_points):
306
+ for sample in samples:
307
+ x, y = sample
308
+ cur_color = color_list[i % len(color_list)][::-1]
309
+ cur_color += [alpha]
310
+ draw.ellipse((x-2, y-2, x+2, y+2),
311
+ fill=tuple(cur_color), outline=tuple(cur_color), width=1)
312
+
313
+ def vis_add_mask(img, mask, color):
314
+ origin_img = np.asarray(img.convert('RGB')).copy()
315
+ color = np.array(color)
316
+
317
+ mask = mask.reshape(mask.shape[0], mask.shape[1]).astype('uint8') # np
318
+ mask = mask > 0.5
319
+
320
+ origin_img[mask] = origin_img[mask] * 0.5 + color * 0.5
321
+ origin_img = Image.fromarray(origin_img)
322
+ return origin_img
323
+
324
+
325
+
326
+ if __name__ == '__main__':
327
+ parser = argparse.ArgumentParser('ReferFormer inference script', parents=[opts.get_args_parser()])
328
+ args = parser.parse_args()
329
+ main(args)
330
+
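The per-annotator merging step in sub_processor above thresholds each object's score map at 0.5, prepends a constant 0.1 background channel, and takes an argmax over objects to get a single index map; a toy sketch of just that step (random scores stand in for model outputs):

import torch

num_obj, t, h, w = 3, 2, 4, 4
obj_scores = torch.rand(num_obj, t, h, w)   # per-object mask scores in [0, 1]
obj_scores[obj_scores < 0.5] = 0.0          # suppress low-confidence pixels
background = 0.1 * torch.ones(1, t, h, w)   # background wins wherever every object was suppressed
merged = torch.argmax(torch.cat([background, obj_scores], dim=0), dim=0)
print(merged.shape, merged.unique())        # torch.Size([2, 4, 4]); values in 0..num_obj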
main.py ADDED
@@ -0,0 +1,243 @@
1
+ """
2
+ Training script of ReferFormer
3
+ Modified from DETR (https://github.com/facebookresearch/detr)
4
+ """
5
+ import argparse
6
+ import datetime
7
+ import json
8
+ import random
9
+ import time
10
+ from pathlib import Path
11
+
12
+ import numpy as np
13
+ import torch
14
+ from torch.utils.data import DataLoader, DistributedSampler
15
+
16
+ import util.misc as utils
17
+ import datasets.samplers as samplers
18
+ from datasets import build_dataset, get_coco_api_from_dataset
19
+ from engine import train_one_epoch, evaluate, evaluate_a2d
20
+ from models import build_model
21
+
22
+ from tools.load_pretrained_weights import pre_trained_model_to_finetune
23
+
24
+ import opts
25
+
26
+
27
+
28
+ def main(args):
29
+ args.masks = True
30
+
31
+ utils.init_distributed_mode(args)
32
+ print("git:\n {}\n".format(utils.get_sha()))
33
+ print(args)
34
+
35
+ print(f'\n Run on {args.dataset_file} dataset.')
36
+ print('\n')
37
+
38
+ device = torch.device(args.device)
39
+
40
+ # fix the seed for reproducibility
41
+ seed = args.seed + utils.get_rank()
42
+ torch.manual_seed(seed)
43
+ np.random.seed(seed)
44
+ random.seed(seed)
45
+
46
+ model, criterion, postprocessor = build_model(args)
47
+ model.to(device)
48
+
49
+ model_without_ddp = model
50
+ if args.distributed:
51
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
52
+ model_without_ddp = model.module
53
+
54
+ # for n, p in model_without_ddp.named_parameters():
55
+ # print(n)
56
+
57
+ n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
58
+ print('number of params:', n_parameters)
59
+
60
+ def match_name_keywords(n, name_keywords):
61
+ out = False
62
+ for b in name_keywords:
63
+ if b in n:
64
+ out = True
65
+ break
66
+ return out
67
+
68
+
69
+ param_dicts = [
70
+ {
71
+ "params":
72
+ [p for n, p in model_without_ddp.named_parameters()
73
+ if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_text_encoder_names)
74
+ and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
75
+ "lr": args.lr,
76
+ },
77
+ {
78
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
79
+ "lr": args.lr_backbone,
80
+ },
81
+ {
82
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_text_encoder_names) and p.requires_grad],
83
+ "lr": args.lr_text_encoder,
84
+ },
85
+ {
86
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
87
+ "lr": args.lr * args.lr_linear_proj_mult,
88
+ }
89
+ ]
90
+ optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
91
+ weight_decay=args.weight_decay)
92
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_drop)
93
+
94
+ # no validation ground truth for ytvos dataset
95
+ dataset_train = build_dataset(args.dataset_file, image_set='train', args=args)
96
+
97
+ if args.distributed:
98
+ if args.cache_mode:
99
+ sampler_train = samplers.NodeDistributedSampler(dataset_train)
100
+ else:
101
+ sampler_train = samplers.DistributedSampler(dataset_train)
102
+ else:
103
+ sampler_train = torch.utils.data.RandomSampler(dataset_train)
104
+
105
+ batch_sampler_train = torch.utils.data.BatchSampler(
106
+ sampler_train, args.batch_size, drop_last=True)
107
+
108
+ data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
109
+ collate_fn=utils.collate_fn, num_workers=args.num_workers)
110
+
111
+ # A2D-Sentences
112
+ if args.dataset_file == 'a2d' or args.dataset_file == 'jhmdb':
113
+ dataset_val = build_dataset(args.dataset_file, image_set='val', args=args)
114
+ if args.distributed:
115
+ if args.cache_mode:
116
+ sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
117
+ else:
118
+ sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
119
+ else:
120
+ sampler_val = torch.utils.data.SequentialSampler(dataset_val)
121
+ data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
122
+ drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers,
123
+ pin_memory=True)
124
+
125
+
126
+ if args.dataset_file == "davis":
127
+ assert args.pretrained_weights is not None, "Please provide the pretrained weight to finetune for Ref-DAVIS17"
128
+ print("============================================>")
129
+ print("Ref-DAVIS17 are finetuned using the checkpoint trained on Ref-Youtube-VOS")
130
+ print("Load checkpoint weights from {} ...".format(args.pretrained_weights))
131
+ checkpoint = torch.load(args.pretrained_weights, map_location="cpu")
132
+ checkpoint_dict = pre_trained_model_to_finetune(checkpoint, args)
133
+ model_without_ddp.load_state_dict(checkpoint_dict, strict=False)
134
+ print("============================================>")
135
+
136
+ if args.dataset_file == "jhmdb":
137
+ assert args.resume is not None, "Please provide the checkpoint to resume for JHMDB-Sentences"
138
+ print("============================================>")
139
+ print("JHMDB-Sentences are directly evaluated using the checkpoint trained on A2D-Sentences")
140
+ print("Load checkpoint weights from {} ...".format(args.pretrained_weights))
141
+ # load checkpoint in the args.resume
142
+ print("============================================>")
143
+
144
+ # for Ref-Youtube-VOS and A2D-Sentences
145
+ # finetune using the pretrained weights on Ref-COCO
146
+ if args.dataset_file != "davis" and args.dataset_file != "jhmdb" and args.pretrained_weights is not None:
147
+ print("============================================>")
148
+ print("Load pretrained weights from {} ...".format(args.pretrained_weights))
149
+ checkpoint = torch.load(args.pretrained_weights, map_location="cpu")
150
+ checkpoint_dict = pre_trained_model_to_finetune(checkpoint, args)
151
+ model_without_ddp.load_state_dict(checkpoint_dict, strict=False)
152
+ print("============================================>")
153
+
154
+
155
+ output_dir = Path(args.output_dir)
156
+ if args.resume:
157
+ if args.resume.startswith('https'):
158
+ checkpoint = torch.hub.load_state_dict_from_url(
159
+ args.resume, map_location='cpu', check_hash=True)
160
+ else:
161
+ checkpoint = torch.load(args.resume, map_location='cpu')
162
+ missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
163
+ unexpected_keys = [k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops'))]
164
+ if len(missing_keys) > 0:
165
+ print('Missing Keys: {}'.format(missing_keys))
166
+ if len(unexpected_keys) > 0:
167
+ print('Unexpected Keys: {}'.format(unexpected_keys))
168
+ if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
169
+ import copy
170
+ p_groups = copy.deepcopy(optimizer.param_groups)
171
+ optimizer.load_state_dict(checkpoint['optimizer'])
172
+ for pg, pg_old in zip(optimizer.param_groups, p_groups):
173
+ pg['lr'] = pg_old['lr']
174
+ pg['initial_lr'] = pg_old['initial_lr']
175
+ print(optimizer.param_groups)
176
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
177
+ # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
178
+ args.override_resumed_lr_drop = True
179
+ if args.override_resumed_lr_drop:
180
+ print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
181
+ lr_scheduler.step_size = args.lr_drop
182
+ lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
183
+ lr_scheduler.step(lr_scheduler.last_epoch)
184
+ args.start_epoch = checkpoint['epoch'] + 1
185
+
186
+ if args.eval:
187
+ assert args.dataset_file == 'a2d' or args.dataset_file == 'jhmdb', \
188
+ 'Only A2D-Sentences and JHMDB-Sentences datasets support evaluation'
189
+ test_stats = evaluate_a2d(model, data_loader_val, postprocessor, device, args)
190
+ return
191
+
192
+
193
+ print("Start training")
194
+ start_time = time.time()
195
+ for epoch in range(args.start_epoch, args.epochs):
196
+ if args.distributed:
197
+ sampler_train.set_epoch(epoch)
198
+ train_stats = train_one_epoch(
199
+ model, criterion, data_loader_train, optimizer, device, epoch,
200
+ args.clip_max_norm)
201
+ lr_scheduler.step()
202
+ if args.output_dir:
203
+ checkpoint_paths = [output_dir / 'checkpoint.pth']
204
+ # extra checkpoint before LR drop and every epochs
205
+ # if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 1 == 0:
206
+ if (epoch + 1) % 1 == 0:
207
+ checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
208
+ for checkpoint_path in checkpoint_paths:
209
+ utils.save_on_master({
210
+ 'model': model_without_ddp.state_dict(),
211
+ 'optimizer': optimizer.state_dict(),
212
+ 'lr_scheduler': lr_scheduler.state_dict(),
213
+ 'epoch': epoch,
214
+ 'args': args,
215
+ }, checkpoint_path)
216
+
217
+ log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
218
+ 'epoch': epoch,
219
+ 'n_parameters': n_parameters}
220
+
221
+ if args.dataset_file == 'a2d':
222
+ test_stats = evaluate_a2d(model, data_loader_val, postprocessor, device, args)
223
+ log_stats.update({**{f'{k}': v for k, v in test_stats.items()}})
224
+
225
+ if args.output_dir and utils.is_main_process():
226
+ with (output_dir / "log.txt").open("a") as f:
227
+ f.write(json.dumps(log_stats) + "\n")
228
+
229
+
230
+ total_time = time.time() - start_time
231
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
232
+ print('Training time {}'.format(total_time_str))
233
+
234
+
235
+ if __name__ == '__main__':
236
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
237
+ args = parser.parse_args()
238
+ if args.output_dir:
239
+ Path(args.output_dir).mkdir(parents=True, exist_ok=True)
240
+ main(args)
241
+
242
+
243
+
main_joint.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training script of ReferFormer
3
+ Modified from DETR (https://github.com/facebookresearch/detr)
4
+ """
5
+ import argparse
6
+ import datetime
7
+ import json
8
+ import random
9
+ import time
10
+ from pathlib import Path
11
+
12
+ import numpy as np
13
+ import torch
14
+ from torch.utils.data import DataLoader, DistributedSampler
15
+
16
+ import util.misc as utils
17
+ import datasets.samplers as samplers
18
+ from datasets import build_dataset, get_coco_api_from_dataset
19
+ from engine import train_one_epoch, evaluate, evaluate_a2d
20
+ from models import build_model
21
+
22
+ from tools.load_pretrained_weights import pre_trained_model_to_finetune
23
+
24
+ import opts
25
+
26
+
27
+
28
+ def main(args):
29
+ args.masks = True
30
+ args.dataset_file = 'joint' # joint training of ytvos and refcoco
31
+ args.binary = 1 # only run on binary referred
32
+
33
+ utils.init_distributed_mode(args)
34
+ print("git:\n {}\n".format(utils.get_sha()))
35
+ print(args)
36
+
37
+ print(f'\n Run on {args.dataset_file} dataset.')
38
+ print('\n')
39
+
40
+ device = torch.device(args.device)
41
+
42
+ # fix the seed for reproducibility
43
+ seed = args.seed + utils.get_rank()
44
+ torch.manual_seed(seed)
45
+ np.random.seed(seed)
46
+ random.seed(seed)
47
+
48
+ model, criterion, postprocessor = build_model(args)
49
+ model.to(device)
50
+
51
+ model_without_ddp = model
52
+ if args.distributed:
53
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
54
+ model_without_ddp = model.module
55
+
56
+ # for n, p in model_without_ddp.named_parameters():
57
+ # print(n)
58
+
59
+ n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
60
+ print('number of params:', n_parameters)
61
+
62
+ def match_name_keywords(n, name_keywords):
63
+ out = False
64
+ for b in name_keywords:
65
+ if b in n:
66
+ out = True
67
+ break
68
+ return out
69
+
70
+
71
+ param_dicts = [
72
+ {
73
+ "params":
74
+ [p for n, p in model_without_ddp.named_parameters()
75
+ if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_text_encoder_names)
76
+ and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
77
+ "lr": args.lr,
78
+ },
79
+ {
80
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
81
+ "lr": args.lr_backbone,
82
+ },
83
+ {
84
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_text_encoder_names) and p.requires_grad],
85
+ "lr": args.lr_text_encoder,
86
+ },
87
+ {
88
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
89
+ "lr": args.lr * args.lr_linear_proj_mult,
90
+ }
91
+ ]
92
+ optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
93
+ weight_decay=args.weight_decay)
94
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_drop)
95
+
96
+ # no validation ground truth for ytvos dataset
97
+ dataset_train = build_dataset(args.dataset_file, image_set='train', args=args)
98
+
99
+ if args.distributed:
100
+ if args.cache_mode:
101
+ sampler_train = samplers.NodeDistributedSampler(dataset_train)
102
+ else:
103
+ sampler_train = samplers.DistributedSampler(dataset_train)
104
+ else:
105
+ sampler_train = torch.utils.data.RandomSampler(dataset_train)
106
+
107
+ batch_sampler_train = torch.utils.data.BatchSampler(
108
+ sampler_train, args.batch_size, drop_last=True)
109
+
110
+ data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
111
+ collate_fn=utils.collate_fn, num_workers=args.num_workers)
112
+
113
+
114
+ output_dir = Path(args.output_dir)
115
+ if args.resume:
116
+ if args.resume.startswith('https'):
117
+ checkpoint = torch.hub.load_state_dict_from_url(
118
+ args.resume, map_location='cpu', check_hash=True)
119
+ else:
120
+ checkpoint = torch.load(args.resume, map_location='cpu')
121
+ missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
122
+ unexpected_keys = [k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops'))]
123
+ if len(missing_keys) > 0:
124
+ print('Missing Keys: {}'.format(missing_keys))
125
+ if len(unexpected_keys) > 0:
126
+ print('Unexpected Keys: {}'.format(unexpected_keys))
127
+ if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
128
+ import copy
129
+ p_groups = copy.deepcopy(optimizer.param_groups)
130
+ optimizer.load_state_dict(checkpoint['optimizer'])
131
+ for pg, pg_old in zip(optimizer.param_groups, p_groups):
132
+ pg['lr'] = pg_old['lr']
133
+ pg['initial_lr'] = pg_old['initial_lr']
134
+ print(optimizer.param_groups)
135
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
136
+ # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
137
+ args.override_resumed_lr_drop = True
138
+ if args.override_resumed_lr_drop:
139
+ print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
140
+ lr_scheduler.step_size = args.lr_drop
141
+ lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
142
+ lr_scheduler.step(lr_scheduler.last_epoch)
143
+ args.start_epoch = checkpoint['epoch'] + 1
144
+
145
+ if args.eval:
146
+ assert args.dataset_file == 'a2d' or args.dataset_file == 'jhmdb', \
147
+ 'Only A2D-Sentences and JHMDB-Sentences datasets support evaluation'
148
+ test_stats = evaluate_a2d(model, data_loader_val, postprocessor, device, args)
149
+ return
150
+
151
+
152
+ print("Start training")
153
+ start_time = time.time()
154
+ for epoch in range(args.start_epoch, args.epochs):
155
+ if args.distributed:
156
+ sampler_train.set_epoch(epoch)
157
+ train_stats = train_one_epoch(
158
+ model, criterion, data_loader_train, optimizer, device, epoch,
159
+ args.clip_max_norm)
160
+ lr_scheduler.step()
161
+ if args.output_dir:
162
+ checkpoint_paths = [output_dir / 'checkpoint.pth']
163
+ # extra checkpoint before LR drop and every epochs
164
+ # if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 1 == 0:
165
+ if (epoch + 1) % 1 == 0:
166
+ checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
167
+ for checkpoint_path in checkpoint_paths:
168
+ utils.save_on_master({
169
+ 'model': model_without_ddp.state_dict(),
170
+ 'optimizer': optimizer.state_dict(),
171
+ 'lr_scheduler': lr_scheduler.state_dict(),
172
+ 'epoch': epoch,
173
+ 'args': args,
174
+ }, checkpoint_path)
175
+
176
+ log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
177
+ 'epoch': epoch,
178
+ 'n_parameters': n_parameters}
179
+
180
+
181
+ if args.output_dir and utils.is_main_process():
182
+ with (output_dir / "log.txt").open("a") as f:
183
+ f.write(json.dumps(log_stats) + "\n")
184
+
185
+
186
+ total_time = time.time() - start_time
187
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
188
+ print('Training time {}'.format(total_time_str))
189
+
190
+
191
+ if __name__ == '__main__':
192
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
193
+ args = parser.parse_args()
194
+ if args.output_dir:
195
+ Path(args.output_dir).mkdir(parents=True, exist_ok=True)
196
+ main(args)
197
+
198
+
main_pretrain.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import datetime
3
+ import json
4
+ import random
5
+ import time
6
+ from pathlib import Path
7
+ from collections import namedtuple
8
+ from functools import partial
9
+
10
+ import os
11
+ import numpy as np
12
+ import torch
13
+ from torch.utils.data import DataLoader, DistributedSampler
14
+
15
+ import util.misc as utils
16
+ import datasets.samplers as samplers
17
+ from datasets.coco_eval import CocoEvaluator
18
+ from datasets import build_dataset, get_coco_api_from_dataset
19
+ from engine import evaluate, train_one_epoch
20
+ from models import build_model
21
+ from models.postprocessors import build_postprocessors
22
+
23
+ import opts
24
+
25
+
26
+
27
+ def main(args):
28
+ # set environ
29
+ os.environ["MDETR_CPU_REDUCE"] = "1"
30
+
31
+ args.masks = True
32
+ assert args.dataset_file in ["refcoco", "refcoco+", "refcocog", "all"]
33
+
34
+ utils.init_distributed_mode(args)
35
+ print("git:\n {}\n".format(utils.get_sha()))
36
+ print(args)
37
+
38
+ device = torch.device(args.device)
39
+
40
+ # fix the seed for reproducibility
41
+ seed = args.seed + utils.get_rank()
42
+ torch.manual_seed(seed)
43
+ np.random.seed(seed)
44
+ random.seed(seed)
45
+
46
+ model, criterion, postprocessors = build_model(args)
47
+ model.to(device)
48
+
49
+ model_without_ddp = model
50
+ if args.distributed:
51
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
52
+ model_without_ddp = model.module
53
+ n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
54
+ print('number of params:', n_parameters)
55
+
56
+ # lr_backbone_names = ["backbone.0", "text_encoder"]
57
+ def match_name_keywords(n, name_keywords):
58
+ out = False
59
+ for b in name_keywords:
60
+ if b in n:
61
+ out = True
62
+ break
63
+ return out
64
+
65
+ # for n, p in model_without_ddp.named_parameters():
66
+ # print(n)
67
+
68
+ param_dicts = [
69
+ {
70
+ "params":
71
+ [p for n, p in model_without_ddp.named_parameters()
72
+ if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_text_encoder_names)
73
+ and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
74
+ "lr": args.lr,
75
+ },
76
+ {
77
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
78
+ "lr": args.lr_backbone,
79
+ },
80
+ {
81
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_text_encoder_names) and p.requires_grad],
82
+ "lr": args.lr_text_encoder,
83
+ },
84
+ {
85
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
86
+ "lr": args.lr * args.lr_linear_proj_mult,
87
+ }
88
+ ]
89
+ optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
90
+ weight_decay=args.weight_decay)
91
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_drop)
92
+
93
+ # build train dataset
94
+ if args.dataset_file != "all":
95
+ dataset_train = build_dataset(args.dataset_file, image_set='train', args=args)
96
+ else:
97
+ dataset_names = ["refcoco", "refcoco+", "refcocog"]
98
+ dataset_train = torch.utils.data.ConcatDataset(
99
+ [build_dataset(name, image_set="train", args=args) for name in dataset_names]
100
+ )
101
+
102
+ print("\nTrain dataset sample number: ", len(dataset_train))
103
+ print("\n")
104
+
105
+ if args.distributed:
106
+ if args.cache_mode:
107
+ sampler_train = samplers.NodeDistributedSampler(dataset_train)
108
+ else:
109
+ sampler_train = samplers.DistributedSampler(dataset_train)
110
+ else:
111
+ sampler_train = torch.utils.data.RandomSampler(dataset_train)
112
+
113
+ batch_sampler_train = torch.utils.data.BatchSampler(
114
+ sampler_train, args.batch_size, drop_last=True)
115
+
116
+ data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
117
+ collate_fn=utils.collate_fn, num_workers=args.num_workers,
118
+ pin_memory=True)
119
+
120
+ # build val datasets
121
+ Val_all = namedtuple(typename="val_data", field_names=["dataset_name", "dataloader", "base_ds", "evaluator_list"])
122
+ if args.dataset_file != "all":
123
+ dataset_names = [args.dataset_file]
124
+ else:
125
+ dataset_names = ["refcoco", "refcoco+", "refcocog"]
126
+
127
+ val_tuples = []
128
+ for name in dataset_names:
129
+ dataset_val = build_dataset(name, image_set="val", args=args)
130
+ sampler_val = (
131
+ samplers.DistributedSampler(dataset_val, shuffle=False) if args.distributed else torch.utils.data.SequentialSampler(dataset_val)
132
+ )
133
+ data_loader_val = DataLoader(
134
+ dataset_val,
135
+ args.batch_size,
136
+ sampler=sampler_val,
137
+ drop_last=False,
138
+ collate_fn=utils.collate_fn,
139
+ num_workers=args.num_workers,
140
+ )
141
+ base_ds = get_coco_api_from_dataset(dataset_val)
142
+ val_tuples.append(Val_all(dataset_name=name, dataloader=data_loader_val, base_ds=base_ds, evaluator_list=None))
143
+
144
+ # build evaluator list for dataset_val
145
+ def build_evaluator_list(base_ds, dataset_name):
146
+ """Helper function to build the list of evaluators for a given dataset"""
147
+ evaluator_list = []
148
+ iou_types = ["bbox"]
149
+ if args.masks:
150
+ iou_types.append("segm")
151
+
152
+ evaluator_list.append(CocoEvaluator(base_ds, tuple(iou_types), useCats=False))
153
+ # TODO: currently ont support RefExpEvaluator (memory error)
154
+ return evaluator_list
155
+
156
+
157
+
158
+ output_dir = Path(args.output_dir)
159
+ if args.resume:
160
+ print("Resume from {}".format(args.resume))
161
+ if args.resume.startswith('https'):
162
+ checkpoint = torch.hub.load_state_dict_from_url(
163
+ args.resume, map_location='cpu', check_hash=True)
164
+ else:
165
+ checkpoint = torch.load(args.resume, map_location='cpu')
166
+ missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
167
+ unexpected_keys = [k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops'))]
168
+ if len(missing_keys) > 0:
169
+ print('Missing Keys: {}'.format(missing_keys))
170
+ if len(unexpected_keys) > 0:
171
+ print('Unexpected Keys: {}'.format(unexpected_keys))
172
+ if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
173
+ import copy
174
+ p_groups = copy.deepcopy(optimizer.param_groups)
175
+ optimizer.load_state_dict(checkpoint['optimizer'])
176
+ for pg, pg_old in zip(optimizer.param_groups, p_groups):
177
+ pg['lr'] = pg_old['lr']
178
+ pg['initial_lr'] = pg_old['initial_lr']
179
+ print(optimizer.param_groups)
180
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
181
+ # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
182
+ args.override_resumed_lr_drop = True
183
+ if args.override_resumed_lr_drop:
184
+ print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
185
+ lr_scheduler.step_size = args.lr_drop
186
+ lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
187
+ lr_scheduler.step(lr_scheduler.last_epoch)
188
+ args.start_epoch = checkpoint['epoch'] + 1
189
+ if not args.eval:
190
+ test_stats = {}
191
+ for i, item in enumerate(val_tuples):
192
+ evaluator_list = build_evaluator_list(item.base_ds, item.dataset_name)
193
+ postprocessors = build_postprocessors(args, item.dataset_name)
194
+ item = item._replace(evaluator_list=evaluator_list)
195
+ print(f"Evaluating {item.dataset_name}")
196
+ curr_test_stats = evaluate(
197
+ model=model,
198
+ criterion=criterion,
199
+ postprocessors=postprocessors,
200
+ data_loader=item.dataloader,
201
+ evaluator_list=item.evaluator_list,
202
+ device=device,
203
+ args=args,
204
+ )
205
+ test_stats.update({item.dataset_name + "_" + k: v for k, v in curr_test_stats.items()})
206
+
207
+ log_stats = {
208
+ **{f"test_{k}": v for k, v in test_stats.items()},
209
+ "n_parameters": n_parameters,
210
+ }
211
+ print(log_stats)
212
+
213
+
214
+ if args.eval:
215
+ print("Evaluating......")
216
+ test_stats = {}
217
+ for i, item in enumerate(val_tuples):
218
+ evaluator_list = build_evaluator_list(item.base_ds, item.dataset_name)
219
+ postprocessors = build_postprocessors(args, item.dataset_name)
220
+ item = item._replace(evaluator_list=evaluator_list)
221
+ print(f"Evaluating {item.dataset_name}")
222
+ curr_test_stats = evaluate(
223
+ model=model,
224
+ criterion=criterion,
225
+ postprocessors=postprocessors,
226
+ data_loader=item.dataloader,
227
+ evaluator_list=item.evaluator_list,
228
+ device=device,
229
+ args=args,
230
+ )
231
+ test_stats.update({item.dataset_name + "_" + k: v for k, v in curr_test_stats.items()})
232
+
233
+ log_stats = {
234
+ **{f"test_{k}": v for k, v in test_stats.items()},
235
+ "n_parameters": n_parameters,
236
+ }
237
+ print(log_stats)
238
+
239
+ return
240
+
241
+
242
+ print("Start training")
243
+ start_time = time.time()
244
+ for epoch in range(args.start_epoch, args.epochs):
245
+ if args.distributed:
246
+ sampler_train.set_epoch(epoch)
247
+ train_stats = train_one_epoch(
248
+ model, criterion, data_loader_train, optimizer, device, epoch,
249
+ args.clip_max_norm)
250
+ lr_scheduler.step()
251
+ if args.output_dir:
252
+ checkpoint_paths = [output_dir / 'checkpoint.pth']
253
+ # extra checkpoint before LR drop and every epochs
254
+ # if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 1 == 0:
255
+ if (epoch + 1) % 1 == 0:
256
+ checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
257
+ for checkpoint_path in checkpoint_paths:
258
+ utils.save_on_master({
259
+ 'model': model_without_ddp.state_dict(),
260
+ 'optimizer': optimizer.state_dict(),
261
+ 'lr_scheduler': lr_scheduler.state_dict(),
262
+ 'epoch': epoch,
263
+ 'args': args,
264
+ }, checkpoint_path)
265
+
266
+ test_stats = {}
267
+ for i, item in enumerate(val_tuples):
268
+ evaluator_list = build_evaluator_list(item.base_ds, item.dataset_name)
269
+ postprocessors = build_postprocessors(args, item.dataset_name)
270
+ item = item._replace(evaluator_list=evaluator_list)
271
+ print(f"Evaluating {item.dataset_name}")
272
+ curr_test_stats = evaluate(
273
+ model=model,
274
+ criterion=criterion,
275
+ postprocessors=postprocessors,
276
+ data_loader=item.dataloader,
277
+ evaluator_list=item.evaluator_list,
278
+ device=device,
279
+ args=args,
280
+ )
281
+ test_stats.update({item.dataset_name + "_" + k: v for k, v in curr_test_stats.items()})
282
+
283
+ log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
284
+ **{f'test_{k}': v for k, v in test_stats.items()},
285
+ 'epoch': epoch,
286
+ 'n_parameters': n_parameters}
287
+
288
+ if args.output_dir and utils.is_main_process():
289
+ with (output_dir / "log.txt").open("a") as f:
290
+ f.write(json.dumps(log_stats) + "\n")
291
+
292
+
293
+ total_time = time.time() - start_time
294
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
295
+ print('Training time {}'.format(total_time_str))
296
+
297
+
298
+ if __name__ == '__main__':
299
+ parser = argparse.ArgumentParser('ReferFormer pretrain training and evaluation script', parents=[opts.get_args_parser()])
300
+ args = parser.parse_args()
301
+ if args.output_dir:
302
+ Path(args.output_dir).mkdir(parents=True, exist_ok=True)
303
+ main(args)
304
+
make_refcoco/refcocog_google/motion_split_generation_grefg_val.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
make_refcoco/refcocog_google/part4_ref_id.txt ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 4859
2
+ 678
3
+ 162
4
+ 3052
5
+ 2355
6
+ 3408
7
+ 834
8
+ 328
9
+ 1646
10
+ 4400
11
+ 3683
12
+ 3788
13
+ 4701
14
+ 1211
15
+ 2138
16
+ 3510
17
+ 899
18
+ 293
19
+ 3196
20
+ 1939
21
+ 2659
22
+ 2849
23
+ 756
24
+ 4573
25
+ 4514
26
+ 304
27
+ 3465
28
+ 1092
29
+ 2025
30
+ 1701
31
+ 2958
32
+ 4793
33
+ 1247
34
+ 1841
35
+ 4404
36
+ 4536
37
+ 2787
38
+ 3377
39
+ 3889
40
+ 2194
41
+ 2969
42
+ 1951
43
+ 508
44
+ 2312
45
+ 3948
46
+ 1388
47
+ 2690
48
+ 1109
49
+ 1374
50
+ 3475
51
+ 1333
52
+ 2068
53
+ 2824
54
+ 2294
55
+ 2446
56
+ 4771
57
+ 2686
58
+ 4558
59
+ 1499
60
+ 4303
61
+ 1376
62
+ 3544
63
+ 1858
64
+ 434
65
+ 3024
66
+ 513
67
+ 693
68
+ 2523
69
+ 4987
70
+ 3133
71
+ 4041
72
+ 2105
73
+ 135
74
+ 3613
75
+ 1722
76
+ 1607
77
+ 2761
78
+ 2454
79
+ 1603
80
+ 4794
81
+ 2485
82
+ 3280
83
+ 3336
84
+ 3118
85
+ 4494
86
+ 3004
87
+ 127
88
+ 3389
89
+ 2568
90
+ 2283
91
+ 1530
92
+ 4251
93
+ 2540
94
+ 2870
95
+ 4946
96
+ 113
97
+ 711
98
+ 3209
99
+ 3620
100
+ 4382
101
+ 2861
102
+ 3954
103
+ 1984
104
+ 2069
105
+ 2016
106
+ 1153
107
+ 3614
108
+ 198
109
+ 3012
110
+ 4247
111
+ 2205
112
+ 4831
113
+ 4534
114
+ 638
115
+ 1419
116
+ 1992
117
+ 542
118
+ 2223
119
+ 4865
120
+ 751
121
+ 3540
122
+ 3765
123
+ 2879
124
+ 4529
125
+ 2131
126
+ 1306
127
+ 3508
128
+ 4165
129
+ 4126
130
+ 388
make_refcoco/refcocog_google/revised_refid_part4.json ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "4859": {
3
+ "(motion, 101105)": "man sitting on the ground playing wii",
4
+ "(static, 101106)": "man in white and light blue t - shirt"
5
+ },
6
+ "678": {
7
+ "(motion, 14720)": "the man crouching inside the plane",
8
+ "(static, 14721)": "the man wearing white hat"
9
+ },
10
+ "162": {
11
+ "(motion, 2908)": "the man resting his face on his hands",
12
+ "(static, 2909)": "the man with a plastic bag between his feet"
13
+ },
14
+ "3052": {
15
+ "(motion, 63901)": "person looking at a book",
16
+ "(static, 63902)": "person wearing a hat and backpack"
17
+ },
18
+ "2355": {
19
+ "(motion, 49522)": "the cat sitting in the chair",
20
+ "(static, 49523)": "cat on left side"
21
+ },
22
+ "3408": {
23
+ "(motion, 71397)": "a man bending and judging a tennis match",
24
+ "(static, 71398)": "a man wearing a red shirt and black pants"
25
+ },
26
+ "834": {
27
+ "(motion, 17983)": "a giraffe who is eating hay out of a feeder",
28
+ "(static, 17984)": "the giraffe on the right side of the pole"
29
+ },
30
+ "328": {
31
+ "(motion, 6730)": "person bending over",
32
+ "(static, 6731)": "big person in blue cap"
33
+ },
34
+ "1646": {
35
+ "(motion, 35169)": "person about to hit a ball",
36
+ "(static, 35170)": "person wearing shirt and pants"
37
+ },
38
+ "4400": {
39
+ "(motion, 91825)": "boy sitting on his skateboard and looking at another boy",
40
+ "(static, 91826)": "boy wearing dark t - shirt and jeans"
41
+ },
42
+ "3683": {
43
+ "(motion, 77184)": "a man dishing up food",
44
+ "(static, 77185)": "a man in military camo and a black hat on the right"
45
+ },
46
+ "3788": {
47
+ "(motion, 79367)": "a black cat sitting and starring",
48
+ "(static, 79368)": "a cat with a heart shaped tag"
49
+ },
50
+ "4701": {
51
+ "(motion, 97795)": "person whose tie is being pulled by another person",
52
+ "(static, 97796)": "person in blue shirt with a red undone tie"
53
+ },
54
+ "1211": {
55
+ "(motion, 26003)": "person putting arm around another person",
56
+ "(static, 26004)": "person with backpack"
57
+ },
58
+ "2138": {
59
+ "(motion, 45446)": "a person sleeping on the top bunk",
60
+ "(static, 45447)": "a person in a green shirt and brown shorts"
61
+ },
62
+ "3510": {
63
+ "(motion, 73478)": "personn sitting in a train compartment and reading book",
64
+ "(static, 73479)": "person in striped shirt"
65
+ },
66
+ "899": {
67
+ "(motion, 19308)": "a man serving soup",
68
+ "(static, 19309)": "a man with tattoo on his arm"
69
+ },
70
+ "293": {
71
+ "(motion, 5939)": "a lady laughing and looking at another lady",
72
+ "(static, 5940)": "a lady with dark hair and a dark shirt"
73
+ },
74
+ "3196": {
75
+ "(motion, 67017)": "person holding a pen",
76
+ "(static, 67018)": "person in a brown suit"
77
+ },
78
+ "1939": {
79
+ "(motion, 41076)": "a person sitting cross legged on the beach",
80
+ "(static, 41077)": "person in khakis and a white shirt with yellow flowers"
81
+ },
82
+ "2659": {
83
+ "(motion, 56121)": "person helping another cross a stream",
84
+ "(static, 56122)": "person in white dress"
85
+ },
86
+ "2849": {
87
+ "(motion, 59798)": "person looking down drinking a glass of wine",
88
+ "(static, 59799)": "person on the right side not wearing glasses"
89
+ },
90
+ "756": {
91
+ "(motion, 16375)": "the woman about to pick up a slice of pizza",
92
+ "(static, 16376)": "a woman with a flower shirt"
93
+ },
94
+ "4573": {
95
+ "(motion, 95258)": "person reaching for another person with the frisbee",
96
+ "(static, 95259)": "person with blue and white striped shirt on"
97
+ },
98
+ "4514": {
99
+ "(motion, 94061)": "person running behind",
100
+ "(static, 94062)": "person in dark brown top and jeans"
101
+ },
102
+ "304": {
103
+ "(motion, 6165)": "person resting her head in hand and crossing one's legs",
104
+ "(static, 6166)": "the person in pink jacket"
105
+ },
106
+ "3465": {
107
+ "(motion, 72753)": "person sitting on a love seat and watching others play wii",
108
+ "(static, 72754)": "person in a black shirt and white shorts"
109
+ },
110
+ "1092": {
111
+ "(motion, 23796)": "a bear standing up with its mouth open",
112
+ "(static, 23797)": "a bear on the right"
113
+ },
114
+ "2025": {
115
+ "(motion, 42838)": "the person leading the horse",
116
+ "(static, 42839)": "the person in gray top and jeans"
117
+ },
118
+ "1701": {
119
+ "(motion, 36094)": "giraffe biting off of a tree",
120
+ "(static, 36095)": "tall giraffe on the right"
121
+ },
122
+ "2958": {
123
+ "(motion, 62137)": "person playing with dog",
124
+ "(static, 62138)": "balding person wearing brown hoodie"
125
+ },
126
+ "4793": {
127
+ "(motion, 99824)": "the girl eating and looking at her plate",
128
+ "(static, 99825)": "the girl wearing a pink shirt"
129
+ },
130
+ "1247": {
131
+ "(motion, 26624)": "the person holding the bat",
132
+ "(static, 26625)": "the person in white t - shirt and grey pants"
133
+ },
134
+ "1841": {
135
+ "(motion, 38888)": "person resting hands on other people's shoulders",
136
+ "(static, 38889)": "tallest person wearing bright suit"
137
+ },
138
+ "4404": {
139
+ "(motion, 91907)": "a elephant whose trunk pointing to the floor , may be touching",
140
+ "(static, 91908)": "elephant more on the right side of the picture"
141
+ },
142
+ "4536": {
143
+ "(motion, 94448)": "a person reaching for the microwave looking at the camera",
144
+ "(static, 94449)": "person in black t shirt"
145
+ },
146
+ "2787": {
147
+ "(motion, 58740)": "a giraffe snacking on the tree",
148
+ "(static, 58741)": "a giraffe on the right"
149
+ },
150
+ "3377": {
151
+ "(motion, 70765)": "a zebra resting its head on another zebra ' s back",
152
+ "(static, 70766)": "a zebra on the left"
153
+ },
154
+ "3889": {
155
+ "(motion, 81051)": "a man holding a basket of pastries",
156
+ "(static, 81052)": "a man wearing grey hoodie"
157
+ },
158
+ "2194": {
159
+ "(motion, 46507)": "standing dog",
160
+ "(static, 46508)": "a black and white dog with a blue collar tag"
161
+ },
162
+ "508": {
163
+ "(motion, 11146)": "person being held by another person",
164
+ "(static, 11147)": "person dressed in a red suit and blue cap"
165
+ },
166
+ "2312": {
167
+ "(motion, 48847)": "a bird standing on a table",
168
+ "(static, 48848)": "a bird on the left"
169
+ },
170
+ "3948": {
171
+ "(motion, 82190)": "the woman who is squinting in one eye",
172
+ "(static, 82191)": "a blue eyed brown haired woman not wearing glasses"
173
+ },
174
+ "1388": {
175
+ "(motion, 29353)": "person holding another person while watching giraffe drink water",
176
+ "(static, 29354)": "person in brown shirt with bag"
177
+ },
178
+ "2690": {
179
+ "(motion, 56849)": "a man about to kick a ball",
180
+ "(static, 56850)": "a man in all white with number 23 on his chest"
181
+ },
182
+ "1109": {
183
+ "(motion, 24043)": "man holding the ktie",
184
+ "(static, 24044)": "man on the right"
185
+ },
186
+ "1374": {
187
+ "(motion, 29120)": "person arranging pansts of another person",
188
+ "(static, 29121)": "the person with in the black tuxedo and glasses in his head"
189
+ },
190
+ "3475": {
191
+ "(motion, 72951)": "woman holding the horse",
192
+ "(static, 72952)": "a woman wearing spectacles with violet shirt and flourecent colour waist vest"
193
+ },
194
+ "1333": {
195
+ "(motion, 28225)": "a person holding another person",
196
+ "(static, 28226)": "a person in a pink and orange flannel shirt"
197
+ },
198
+ "2068": {
199
+ "(motion, 43909)": "person standing and playing wii",
200
+ "(static, 43910)": "person wearing black t - shirt"
201
+ },
202
+ "2824": {
203
+ "(motion, 59394)": "person standing besides a table crossing arms",
204
+ "(static, 59395)": "person with glasses and long hair"
205
+ },
206
+ "2294": {
207
+ "(motion, 48483)": "a person sitting on bike holding another person",
208
+ "(static, 48484)": "a person with a helmet on the head"
209
+ },
210
+ "2446": {
211
+ "(motion, 51355)": "an elephant that has it ' s trunk pointing towards the water",
212
+ "(static, 51356)": "elephant on the left"
213
+ },
214
+ "2686": {
215
+ "(motion, 56783)": "a man staring at another man",
216
+ "(static, 56784)": "a man in an orange tie"
217
+ },
218
+ "4558": {
219
+ "(motion, 94950)": "a zebra facing the camera",
220
+ "(static, 94951)": "a small zebra beside a larger zebra"
221
+ },
222
+ "1499": {
223
+ "(motion, 32051)": "a man resting on a metal fence",
224
+ "(static, 32052)": "a man in white shirt and polka dot tie"
225
+ },
226
+ "4303": {
227
+ "(motion, 89833)": "a man throwing a banana",
228
+ "(static, 89834)": "a man in bike gear on the right of the picture"
229
+ },
230
+ "1376": {
231
+ "(motion, 29146)": "a man sitting down with his hands together",
232
+ "(static, 29147)": "a man with a purple shirt and khaki pants "
233
+ },
234
+ "3544": {
235
+ "(motion, 74100)": "the man holding a riding crop",
236
+ "(static, 74101)": "man in black shirt and slacks on the left"
237
+ },
238
+ "1858": {
239
+ "(motion, 39103)": "a bull standing",
240
+ "(static, 39104)": "a white and brown bull on the left of the picture"
241
+ },
242
+ "434": {
243
+ "(motion, 9561)": "the man looking down",
244
+ "(static, 9562)": "the man on the left"
245
+ },
246
+ "3024": {
247
+ "(motion, 63345)": "a baseball player sliding into a base",
248
+ "(static, 63346)": "baseball player wearing the number 12"
249
+ },
250
+ "513": {
251
+ "(motion, 11239)": "a man riding on a skateboard as his picture is being taken",
252
+ "(static, 11240)": "a man in a purple t - shirt and ripped jeans"
253
+ },
254
+ "693": {
255
+ "(motion, 14989)": "a person standing",
256
+ "(static, 14990)": "a small person"
257
+ },
258
+ "2523": {
259
+ "(motion, 53103)": "a baseball player sliding into home plate and getting tagged by the catcher",
260
+ "(static, 53104)": "a la dodgers player on the right of the picture"
261
+ },
262
+ "4987": {
263
+ "(motion, 104145)": "a girl punching out her arm while playing an interactive video game",
264
+ "(static, 104146)": "girl wearing grey and white stripes and sweatpants"
265
+ },
266
+ "4041": {
267
+ "(motion, 84159)": "soccer player about to kick soccer ball",
268
+ "(static, 84160)": "soccer player wearing black t - shirt and black gloves"
269
+ },
270
+ "2105": {
271
+ "(motion, 44674)": "a baseball player holding his arm up to catch a ball",
272
+ "(static, 44675)": "a baseball player wearing helmet and vest"
273
+ },
274
+ "135": {
275
+ "(motion, 2353)": "dog resting it ' s head on a table",
276
+ "(static, 2354)": "golden dog"
277
+ },
278
+ "3613": {
279
+ "(motion, 75580)": "person talking to another person while crossing legs",
280
+ "(static, 75581)": "person with long sleeve shirt, jeans and cap"
281
+ },
282
+ "1722": {
283
+ "(motion, 36451)": "person pulling another person's tie",
284
+ "(static, 36452)": "blonde person in black dress"
285
+ },
286
+ "1607": {
287
+ "(motion, 34281)": "a person reading a book to another person he ' s holding",
288
+ "(static, 34282)": "a bald person wearing a beige t - shirt and gray jeans"
289
+ },
290
+ "2761": {
291
+ "(motion, 58225)": "girl propping her chin on her hand",
292
+ "(static, 58226)": "girl in a pink shirt near window"
293
+ },
294
+ "2454": {
295
+ "(motion, 51492)": "a man looking at laptop",
296
+ "(static, 51493)": "the man with glasses and painted fingernails"
297
+ },
298
+ "1603": {
299
+ "(motion, 34234)": "person eating a donut",
300
+ "(static, 34235)": "person with the black beanie"
301
+ },
302
+ "4794": {
303
+ "(motion, 99868)": "a duck that is looking straight ahead",
304
+ "(static, 99869)": "the duck on the right side"
305
+ },
306
+ "2485": {
307
+ "(motion, 52246)": "a person reaching across the net",
308
+ "(static, 52247)": "tallest person in a grey shirt and shorts"
309
+ },
310
+ "3280": {
311
+ "(motion, 68799)": "a boy walking towards his skate board",
312
+ "(static, 68800)": "a boy in a striped shirt"
313
+ },
314
+ "3336": {
315
+ "(motion, 69882)": "person holding a piece of chocolate cake",
316
+ "(static, 69883)": "person wearing a purple dress"
317
+ },
318
+ "3118": {
319
+ "(motion, 65349)": "giraffe stretching its neck straight up",
320
+ "(static, 65350)": "taller giraffe"
321
+ },
322
+ "4494": {
323
+ "(motion, 93729)": "man touching the frisbee",
324
+ "(static, 93730)": "a man in a white shirt"
325
+ },
326
+ "3004": {
327
+ "(motion, 62940)": "person crouching to catch a ball",
328
+ "(static, 62941)": "person in a red uniform and helmet"
329
+ },
330
+ "127": {
331
+ "(motion, 2256)": "a person holding a plate",
332
+ "(static, 2257)": "the person in the purple coat"
333
+ },
334
+ "3389": {
335
+ "(motion, 70905)": "person waving",
336
+ "(static, 70906)": "person in black sneakers"
337
+ },
338
+ "2568": {
339
+ "(motion, 54256)": "person looking at phone",
340
+ "(static, 54257)": "blonde person on the right"
341
+ },
342
+ "2283": {
343
+ "(motion, 48251)": "the cook holding a plate",
344
+ "(static, 48252)": "middle cook of three cooks"
345
+ },
346
+ "1530": {
347
+ "(motion, 32639)": "person petting the cat",
348
+ "(static, 32640)": "person with sleeves rolled up"
349
+ },
350
+ "4251": {
351
+ "(motion, 88833)": "a person reading a book",
352
+ "(static, 88834)": "person in a striped jacket "
353
+ },
354
+ "2540": {
355
+ "(motion, 53539)": "a man reaching out his right arm holding a controller",
356
+ "(static, 53540)": "a man in red shirt and black jeans"
357
+ },
358
+ "2870": {
359
+ "(motion, 60169)": "a person watching horse riding",
360
+ "(static, 60170)": "a person in a white jacket and beige pants"
361
+ },
362
+ "4946": {
363
+ "(motion, 103092)": "a man about to hit a ball",
364
+ "(static, 103093)": "a man in red shirt and blue vest"
365
+ },
366
+ "113": {
367
+ "(motion, 1973)": "person holding phone",
368
+ "(static, 1974)": "person with a black shirt and brown coat"
369
+ },
370
+ "711": {
371
+ "(motion, 15398)": "girl crouching and holding an umbrella",
372
+ "(static, 15399)": "girl wearing light green socks on the left"
373
+ },
374
+ "3209": {
375
+ "(motion, 67236)": "the person that is sliding into home , getting tagged out by the catcher",
376
+ "(static, 67237)": "the person in the white vest over the blue shirt"
377
+ },
378
+ "3620": {
379
+ "(motion, 75711)": "person petting a horse",
380
+ "(static, 75712)": "a person in white t - shirt"
381
+ },
382
+ "4382": {
383
+ "(motion, 91559)": "horse being hugged by a person",
384
+ "(static, 91560)": "white and brown horse"
385
+ },
386
+ "2861": {
387
+ "(motion, 60004)": "a man playing tennis",
388
+ "(static, 60005)": "a man wearing a blue shirt and white shorts"
389
+ },
390
+ "3954": {
391
+ "(motion, 82306)": "a person putting gloves on",
392
+ "(static, 82307)": "person with dark blue jumper"
393
+ },
394
+ "1984": {
395
+ "(motion, 42076)": "a person being held by another person",
396
+ "(static, 42077)": "little person on pink skiis with yellow parka on"
397
+ },
398
+ "2069": {
399
+ "(motion, 43945)": "a person helping another person ski",
400
+ "(static, 43946)": "a big person in white jumper and backpack"
401
+ },
402
+ "2016": {
403
+ "(motion, 42686)": "person putting food in the oven",
404
+ "(static, 42687)": "person in green t - shirt"
405
+ },
406
+ "1153": {
407
+ "(motion, 25076)": "a giraffe , with head lowered , crosses in front of another giraffe",
408
+ "(static, 25077)": "giraffe in the middle"
409
+ },
410
+ "3614": {
411
+ "(motion, 75583)": "a man in explaining something on a tablet",
412
+ "(static, 75584)": "a man with a blue cap and striped shirt"
413
+ },
414
+ "198": {
415
+ "(motion, 3830)": "a giraffe bending down to eat grass",
416
+ "(static, 3831)": "giraffe in front"
417
+ },
418
+ "3012": {
419
+ "(motion, 63097)": "person standing with hands on hips",
420
+ "(static, 63098)": "person in a white collared shirt and jeans"
421
+ },
422
+ "4247": {
423
+ "(motion, 88808)": "man pointing toward another man",
424
+ "(static, 88809)": "man in plaid shirt"
425
+ },
426
+ "2205": {
427
+ "(motion, 46674)": "person bending over",
428
+ "(static, 46675)": "person in red shirt and cap"
429
+ },
430
+ "4831": {
431
+ "(motion, 100694)": "person holding bat in hands",
432
+ "(static, 100695)": "person wearing light blue shirt and glass"
433
+ },
434
+ "4534": {
435
+ "(motion, 94419)": "the bird not drinking",
436
+ "(static, 94420)": "the bird on the left"
437
+ },
438
+ "638": {
439
+ "(motion, 13717)": "person sitting on another person's lap and holding the remote controller",
440
+ "(static, 13718)": "small person in red shirt"
441
+ },
442
+ "1419": {
443
+ "(motion, 30082)": "person squatting on the ground to catch a ball",
444
+ "(static, 30083)": "person in red and white wearing glove"
445
+ },
446
+ "1992": {
447
+ "(motion, 42197)": "a person reaching for a cupcake",
448
+ "(static, 42198)": "a person in a blue vest"
449
+ },
450
+ "542": {
451
+ "(motion, 11877)": "man receiving food",
452
+ "(static, 11878)": "a black man in a black shirt"
453
+ },
454
+ "2223": {
455
+ "(motion, 47051)": "person sitting a chair holding a protest sign",
456
+ "(static, 47052)": "old person in grey t - shirt and blue jeans"
457
+ },
458
+ "4865": {
459
+ "(motion, 101219)": "person being held by another person",
460
+ "(static, 101220)": "a young person wearing a yellow shirt"
461
+ },
462
+ "751": {
463
+ "(motion, 16247)": "person holding a painting brush",
464
+ "(static, 16248)": "person wearing white top and cap"
465
+ },
466
+ "3540": {
467
+ "(motion, 74039)": "a man swinging a bat",
468
+ "(static, 74040)": "a man in a blue baseball shirt and white pants"
469
+ },
470
+ "3765": {
471
+ "(motion, 78908)": "person sitting",
472
+ "(static, 78909)": "person wearing white shirt and red shoes"
473
+ },
474
+ "2879": {
475
+ "(motion, 60471)": "bear standing against the fence",
476
+ "(static, 60472)": "a small bear on the right"
477
+ },
478
+ "4529": {
479
+ "(motion, 94312)": "kid holding out left arm playing wii",
480
+ "(static, 94313)": "kid in a green and red sweatshirt"
481
+ },
482
+ "2131": {
483
+ "(motion, 45308)": "man putting both hands behind his head",
484
+ "(static, 45309)": "a man with the pool noodle"
485
+ },
486
+ "1306": {
487
+ "(motion, 27841)": "a cow eating grass",
488
+ "(static, 27842)": "the cow on the right"
489
+ },
490
+ "3508": {
491
+ "(motion, 73469)": "a person standing and playing a video game",
492
+ "(static, 73470)": "a little person dressed in brown"
493
+ },
494
+ "4165": {
495
+ "(motion, 87036)": "a child holding feathers",
496
+ "(static, 87037)": "a child wearing green t - shirt"
497
+ },
498
+ "4126": {
499
+ "(motion, 86073)": "a person standing and reading a book",
500
+ "(static, 86074)": "a person in a suit"
501
+ },
502
+ "388": {
503
+ "(motion, 8339)": "a man holding up an umbrella in the rain for a man who is fixing a tire",
504
+ "(static, 8340)": "a man wearing glasses in a red jacket"
505
+ }
506
+ }
make_refcoco/refcocog_umd/motion_split_generation.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
make_refcoco/refcocog_umd/part4_ref_id.txt ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 1679
2
+ 4048
3
+ 2530
4
+ 4385
5
+ 5018
6
+ 2290
7
+ 2347
8
+ 3143
9
+ 4745
10
+ 1688
11
+ 944
12
+ 3477
13
+ 2497
14
+ 4110
15
+ 2011
16
+ 2884
17
+ 1076
18
+ 4803
19
+ 3508
20
+ 169
21
+ 258
22
+ 3661
23
+ 4831
24
+ 2214
25
+ 2266
26
+ 2477
27
+ 5005
28
+ 2919
29
+ 1850
30
+ 3757
31
+ 524
32
+ 4363
33
+ 2976
34
+ 838
35
+ 3044
36
+ 2426
37
+ 2113
38
+ 2327
39
+ 4727
40
+ 859
41
+ 935
42
+ 1105
43
+ 395
44
+ 771
45
+ 2942
46
+ 41
47
+ 885
48
+ 4862
49
+ 1246
50
+ 3346
51
+ 3657
52
+ 540
53
+ 3364
54
+ 1880
55
+ 1949
56
+ 1620
57
+ 2902
58
+ 397
59
+ 732
60
+ 1173
61
+ 2920
62
+ 1643
63
+ 1454
64
+ 1725
65
+ 2338
66
+ 4249
67
+ 3917
68
+ 1156
69
+ 1998
70
+ 3571
71
+ 292
72
+ 3367
73
+ 2069
74
+ 4050
75
+ 2953
76
+ 4280
77
+ 1743
78
+ 4598
79
+ 3380
80
+ 3439
81
+ 3355
82
+ 3409
83
+ 711
84
+ 3764
85
+ 113
86
+ 518
87
+ 3158
88
+ 3223
89
+ 914
90
+ 3568
91
+ 592
92
+ 2856
93
+ 4879
94
+ 157
95
+ 1774
96
+ 2354
97
+ 174
98
+ 2369
99
+ 4247
100
+ 1014
101
+ 1080
102
+ 2272
103
+ 2495
104
+ 3511
105
+ 3955
106
+ 2409
107
+ 2775
108
+ 996
109
+ 4789
110
+ 1028
111
+ 244
112
+ 3538
113
+ 557
114
+ 1810
115
+ 4982
116
+ 4570
117
+ 1698
118
+ 3182
119
+ 846
120
+ 671
121
+ 3254
122
+ 3318
123
+ 1424
124
+ 3926
125
+ 862
126
+ 2932
make_refcoco/refcocog_umd/revised_refid_part4.json ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "1679": {
3
+ "(motion, 37582)": "player holding a baseball glove",
4
+ "(static, 37583)": "a blurred player"
5
+ },
6
+ "4048": {
7
+ "(motion, 92810)": "player hitting a ball with a baseball bat",
8
+ "(static, 92811)": "player with number 18 on his back"
9
+ },
10
+ "2530": {
11
+ "(motion, 57782)": "man crouching ready to catch a ball",
12
+ "(static, 57783)": "man with 55 on his back"
13
+ },
14
+ "4385": {
15
+ "(motion, 101410)": "man leaning on one leg watching the players",
16
+ "(static, 101411)": "man in gray pants"
17
+ },
18
+ "5018": {
19
+ "(motion, 102413)": "man standing ready to swing his bat",
20
+ "(static, 102414)": "man in front of the other two men"
21
+ },
22
+ "2290": {
23
+ "(motion, 52302)": "sheep standing in the pasture next to a sitting sheep",
24
+ "(static, 52303)": "the front most sheep"
25
+ },
26
+ "2347": {
27
+ "(motion, 53861)": "a sheep sitting down in the grass",
28
+ "(static, 53862)": "a sheep in the background"
29
+ },
30
+ "3143": {
31
+ "(motion, 71854)": "a horse being led by it ' s trainer",
32
+ "(static, 71855)": "a horse in front of the picture"
33
+ },
34
+ "1688": {
35
+ "(motion, 37818)": "zebra eating grass",
36
+ "(static, 37819)": "the zebra in the middle with its face near the ground"
37
+ },
38
+ "944": {
39
+ "(motion, 21007)": "a bird touching its neck with its right feet",
40
+ "(static, 21008)": "a bird in the back"
41
+ },
42
+ "3477": {
43
+ "(motion, 79163)": "the bird standing and looking to the left",
44
+ "(static, 79164)": "bird with both feet in the water"
45
+ },
46
+ "2497": {
47
+ "(motion, 56845)": "person holding a baseball bat",
48
+ "(static, 56846)": "person in blue baseball cap"
49
+ },
50
+ "4110": {
51
+ "(motion, 94298)": "person sitting and watching children play a ballgame",
52
+ "(static, 94299)": "person wearing a white shirt and black leggings"
53
+ },
54
+ "2011": {
55
+ "(motion, 45909)": "a woman talking on her cell phone",
56
+ "(static, 45910)": "a blonde woman wearing a blue shirt and white shorts"
57
+ },
58
+ "2884": {
59
+ "(motion, 65819)": "a woman looking at her phone",
60
+ "(static, 65820)": "a woman with black hair wearing jeans, a striped gray shirt and flip flops"
61
+ },
62
+ "1076": {
63
+ "(motion, 24000)": "person crossing a stream of water",
64
+ "(static, 24001)": "person wearing jeans and a green vest"
65
+ },
66
+ "4803": {
67
+ "(motion, 56121)": "person helping the other cross a stream",
68
+ "(static, 56122)": "person in white dress"
69
+ },
70
+ "3508": {
71
+ "(motion, 80112)": "baseball player placing his hands on his hips",
72
+ "(static, 80113)": "a baseball player named datz"
73
+ },
74
+ "169": {
75
+ "(motion, 4002)": "person feeding a giraffe",
76
+ "(static, 4003)": "a small person in light blue shirt"
77
+ },
78
+ "258": {
79
+ "(motion, 5988)": "person holding a child",
80
+ "(static, 5989)": "person wearing glasses and navy shirt"
81
+ },
82
+ "3661": {
83
+ "(motion, 83542)": "person sitting on the floor",
84
+ "(static, 83543)": "person in a grey shirt and dark pants"
85
+ },
86
+ "4831": {
87
+ "(motion, 62137)": "person sitting on couch and playing with a dog",
88
+ "(static, 62138)": "bald person wearing jeans and brown hoodie"
89
+ },
90
+ "2214": {
91
+ "(motion, 50208)": "a woman eating a donut",
92
+ "(static, 50209)": "a brown hair woman in gray sweater"
93
+ },
94
+ "2266": {
95
+ "(motion, 51661)": "a woman holding a purse",
96
+ "(static, 51662)": "a woman with blonde hair and a black shirt"
97
+ },
98
+ "2477": {
99
+ "(motion, 56429)": "girl talking and looking at another girl",
100
+ "(static, 56430)": "girl in black"
101
+ },
102
+ "5005": {
103
+ "(motion, 99824)": "girl eating and looking at her plate",
104
+ "(static, 99825)": "girl wearing a pink shirt"
105
+ },
106
+ "2919": {
107
+ "(motion, 66832)": "person riding a bike",
108
+ "(static, 66833)": "asian person wearing black jacket"
109
+ },
110
+ "1850": {
111
+ "(motion, 42078)": "man placing his hand on another man's shoulder",
112
+ "(static, 42079)": "a man who is wearing a red color tie"
113
+ },
114
+ "3757": {
115
+ "(motion, 85761)": "boy holding a cell phone",
116
+ "(static, 85762)": "boy in a blue hoodie"
117
+ },
118
+ "524": {
119
+ "(motion, 12089)": "a zebra that is not eating grass",
120
+ "(static, 12090)": "a zebra on the far right"
121
+ },
122
+ "4363": {
123
+ "(motion, 100914)": "elephant holding up its trunk",
124
+ "(static, 100915)": "an elephant in front of another"
125
+ },
126
+ "2976": {
127
+ "(motion, 68306)": "girl eating food from her right hand",
128
+ "(static, 68307)": "a girl in a black flowered top"
129
+ },
130
+ "838": {
131
+ "(motion, 18887)": "man leaning on bike on boat",
132
+ "(static, 18888)": "a man not wearing a hat"
133
+ },
134
+ "3044": {
135
+ "(motion, 69755)": "man rowing boat",
136
+ "(static, 69756)": "a man on the left side of the picture"
137
+ },
138
+ "2426": {
139
+ "(motion, 55424)": "the baseball player facing towards the right not doing a high five",
140
+ "(static, 55425)": "baseball player in catcher ' s uniform"
141
+ },
142
+ "2113": {
143
+ "(motion, 47984)": "person that is dancing",
144
+ "(static, 47985)": "person with the thick beard, glasses and a hat"
145
+ },
146
+ "2327": {
147
+ "(motion, 53376)": "person bathing another person",
148
+ "(static, 53377)": "person in a floral print dress and hat"
149
+ },
150
+ "4727": {
151
+ "(motion, 39103)": "a bull laying down",
152
+ "(static, 39104)": "a white and brown bull on the right"
153
+ },
154
+ "859": {
155
+ "(motion, 19350)": "cat sitting on a luggage and staring at the camera",
156
+ "(static, 19351)": "cat infront of another cat"
157
+ },
158
+ "935": {
159
+ "(motion, 20809)": "cat laying down on a bag",
160
+ "(static, 20810)": "cat behind another cat"
161
+ },
162
+ "1105": {
163
+ "(motion, 24654)": "an elephant stepping on a large log",
164
+ "(static, 24655)": "elephant on far right"
165
+ },
166
+ "395": {
167
+ "(motion, 8819)": "person placing her hands on one's hips",
168
+ "(static, 8820)": "person on the far left"
169
+ },
170
+ "771": {
171
+ "(motion, 17614)": "person holding a child on one's shoulders",
172
+ "(static, 17615)": "tall person on the right"
173
+ },
174
+ "2942": {
175
+ "(motion, 67334)": "person sitting on another person's shoulders",
176
+ "(static, 67335)": "small person on the right"
177
+ },
178
+ "41": {
179
+ "(motion, 961)": "a lady pouring wine in a glass",
180
+ "(static, 962)": "a lady in black tank top"
181
+ },
182
+ "885": {
183
+ "(motion, 19926)": "person feeding another person with a bottle",
184
+ "(static, 19927)": "person in black blouse"
185
+ },
186
+ "4862": {
187
+ "(motion, 69276)": "person drinking from a bottle",
188
+ "(static, 69277)": "small person in white pajamas"
189
+ },
190
+ "1246": {
191
+ "(motion, 27831)": "person holding a laptop",
192
+ "(static, 27832)": "person with curly brown hair wearing jeans"
193
+ },
194
+ "3346": {
195
+ "(motion, 76051)": "person filing her nails",
196
+ "(static, 76052)": "person wearing a red robe and has a towel on her head"
197
+ },
198
+ "3657": {
199
+ "(motion, 83493)": "person holding a bottle and listening to music",
200
+ "(static, 83494)": "person wearing black in headphones"
201
+ },
202
+ "540": {
203
+ "(motion, 12381)": "the woman is swinging the controller",
204
+ "(static, 12382)": "woman in brown top on the right"
205
+ },
206
+ "3364": {
207
+ "(motion, 76757)": "the woman looking at the camera and opening her mouth",
208
+ "(static, 76758)": "a woman wearing a brown hooded sweatshirt on the left"
209
+ },
210
+ "1880": {
211
+ "(motion, 42973)": "man looking ahead at the tv",
212
+ "(static, 42974)": "a man in a white shirt"
213
+ },
214
+ "1949": {
215
+ "(motion, 44400)": "a man looking at his phone",
216
+ "(static, 44401)": "man in black t - shirt and cap"
217
+ },
218
+ "1620": {
219
+ "(motion, 36248)": "person playing tennis",
220
+ "(static, 36249)": "person in red tank top and black shorts"
221
+ },
222
+ "2902": {
223
+ "(motion, 66297)": "person sitting and watching a tennis game",
224
+ "(static, 66298)": "person in blue top"
225
+ },
226
+ "397": {
227
+ "(motion, 8843)": "giraffe bending its head down",
228
+ "(static, 8844)": "giraffe on the far right"
229
+ },
230
+ "732": {
231
+ "(motion, 16725)": "baseball player squatting and watching closely to judge a play",
232
+ "(static, 16726)": "baseball player in black top and gray pants"
233
+ },
234
+ "1173": {
235
+ "(motion, 26074)": "a man swinging a bat",
236
+ "(static, 26075)": "a man in blue and grey"
237
+ },
238
+ "2920": {
239
+ "(motion, 66854)": "a man reaching out his left arm to catch a ball",
240
+ "(static, 66855)": "a man in red uniform and helmet"
241
+ },
242
+ "1643": {
243
+ "(motion, 36762)": "a man smiling looking down at other people",
244
+ "(static, 36763)": "a man in a grey suite wearing a pink tie"
245
+ },
246
+ "1454": {
247
+ "(motion, 32177)": "person in putting hands in one's pockets",
248
+ "(static, 32178)": "person in gray shirt and jeans"
249
+ },
250
+ "1725": {
251
+ "(motion, 38835)": "person crossing her arms walking with another person",
252
+ "(static, 38836)": "person in a black shirt and jeans"
253
+ },
254
+ "2338": {
255
+ "(motion, 53733)": "the person crouching and placing his hands on his knees",
256
+ "(static, 53734)": "person with a black shirt and dark grey pants"
257
+ },
258
+ "4249": {
259
+ "(motion, 97957)": "a baseball player reaching out his arm to catch a ball",
260
+ "(static, 97958)": "a baseball player in green top"
261
+ },
262
+ "3917": {
263
+ "(motion, 89675)": "cow looking at camera",
264
+ "(static, 89676)": "a cow with an ear tag with the number 949 on it"
265
+ },
266
+ "1156": {
267
+ "(motion, 25761)": "man sitting on the couch using a laptop",
268
+ "(static, 25762)": "a man with a hat"
269
+ },
270
+ "1998": {
271
+ "(motion, 45619)": "a person watching his phone",
272
+ "(static, 45620)": "person wearing glasses"
273
+ },
274
+ "3571": {
275
+ "(motion, 81719)": "person looking at one's phone",
276
+ "(static, 81720)": "mature person with blonde hair and glasses"
277
+ },
278
+ "292": {
279
+ "(motion, 6707)": "a zebra lying down in dirt",
280
+ "(static, 6708)": "the zebra in the foreground"
281
+ },
282
+ "3367": {
283
+ "(motion, 76808)": "a zebra standing in the zoo",
284
+ "(static, 76809)": "a zebra in the background"
285
+ },
286
+ "2069": {
287
+ "(motion, 47212)": "person leaning forward on skis",
288
+ "(static, 47213)": "person in blue hat and jacket, black pants"
289
+ },
290
+ "4050": {
291
+ "(motion, 92834)": "person standing straight looking at another person",
292
+ "(static, 92835)": "a small person wearing purple pants"
293
+ },
294
+ "2953": {
295
+ "(motion, 67711)": "person who is looking away",
296
+ "(static, 67712)": "person in a suit"
297
+ },
298
+ "4280": {
299
+ "(motion, 98813)": "person pulling another person's tie",
300
+ "(static, 98814)": "a person in a white shirt"
301
+ },
302
+ "1743": {
303
+ "(motion, 39371)": "a person holding and looking at another person",
304
+ "(static, 39372)": "person with bald head and glasses"
305
+ },
306
+ "4598": {
307
+ "(motion, 13717)": "person playing with the remote controller",
308
+ "(static, 13718)": "small person in red shirt"
309
+ },
310
+ "3380": {
311
+ "(motion, 77052)": "a person cutting a cake",
312
+ "(static, 77053)": "a person in gray shirt that is not striped"
313
+ },
314
+ "3439": {
315
+ "(motion, 78305)": "a person holding a spatula getting readyy to have a cake",
316
+ "(static, 78306)": "a person in striped shirt"
317
+ },
318
+ "3355": {
319
+ "(motion, 76309)": "a man swining his bat",
320
+ "(static, 76310)": "a man in a baseball uniform with a brace on his left ankle"
321
+ },
322
+ "3409": {
323
+ "(motion, 77608)": "a man holding out his arm to catch a ball",
324
+ "(static, 77609)": "a man wearing a red vest with red shin guards"
325
+ },
326
+ "711": {
327
+ "(motion, 16184)": "the man holding a cat in his arms",
328
+ "(static, 16185)": "this is a man with thin rimmed glasses and a black scarf"
329
+ },
330
+ "3764": {
331
+ "(motion, 85913)": "person holding a remote and smilling",
332
+ "(static, 85914)": "person in a black t - shirt and not wearing glasses"
333
+ },
334
+ "113": {
335
+ "(motion, 2741)": "a sheep being fed by a little girl",
336
+ "(static, 2742)": "a sheep on the right"
337
+ },
338
+ "518": {
339
+ "(motion, 12021)": "a sheep eating grass with its head down",
340
+ "(static, 12022)": "a sheep on the left"
341
+ },
342
+ "3158": {
343
+ "(motion, 72128)": "a boy crouching and placing both hands on his knees",
344
+ "(static, 72129)": "boy wearing white baseball helmet , white baseball uniform with orange writing"
345
+ },
346
+ "3223": {
347
+ "(motion, 73555)": "a boy pitching the ball to a player",
348
+ "(static, 73556)": "a boy with the number 4 on his blue jersey"
349
+ },
350
+ "914": {
351
+ "(motion, 20478)": "a person standing on a surf board , riding a wave",
352
+ "(static, 20479)": "a person on the right"
353
+ },
354
+ "3568": {
355
+ "(motion, 81669)": "surfer laying down",
356
+ "(static, 81670)": "surfer on the left"
357
+ },
358
+ "592": {
359
+ "(motion, 13643)": "person sits on the floor watching tv",
360
+ "(static, 13644)": "person with a black hat and a beige shirt"
361
+ },
362
+ "2856": {
363
+ "(motion, 65208)": "person sitting on a chair watching another person play video games",
364
+ "(static, 65209)": "person in black shirt and jeans"
365
+ },
366
+ "4879": {
367
+ "(motion, 73469)": "person playing a video game",
368
+ "(static, 73470)": "blonde person dressed in brown"
369
+ },
370
+ "157": {
371
+ "(motion, 3682)": "a woman holding a plate and reaching for condiments",
372
+ "(static, 3683)": "woman wearing grey button up sweater"
373
+ },
374
+ "1774": {
375
+ "(motion, 40317)": "person being held by another person",
376
+ "(static, 40318)": "person with red hair, wearing a pink shirt"
377
+ },
378
+ "2354": {
379
+ "(motion, 53948)": "person with child , catching a frisby",
380
+ "(static, 53949)": "bigger person in white t - shirt"
381
+ },
382
+ "174": {
383
+ "(motion, 4179)": "a lamb eating grass",
384
+ "(static, 4180)": "a lamb to the left of another lamb"
385
+ },
386
+ "2369": {
387
+ "(motion, 54196)": "the sheep that is looking into the camera",
388
+ "(static, 54197)": "a white sheep with a black head on the right"
389
+ },
390
+ "4247": {
391
+ "(motion, 97897)": "a woman holding an umbrella on a bench",
392
+ "(static, 97898)": "woman on the right"
393
+ },
394
+ "1014": {
395
+ "(motion, 22621)": "man receiving an award",
396
+ "(static, 22622)": "a man in an orange and white uniform with a black cap"
397
+ },
398
+ "1080": {
399
+ "(motion, 24100)": "a man offers a trophy to anothe man",
400
+ "(static, 24101)": "a man in a suit"
401
+ },
402
+ "2272": {
403
+ "(motion, 51815)": "the baseball player catching a ball",
404
+ "(static, 51816)": "the baseball player in dark top and helmet"
405
+ },
406
+ "2495": {
407
+ "(motion, 56804)": "a baseball player swinging at a ball",
408
+ "(static, 56805)": "the baseball player in white uniform"
409
+ },
410
+ "3511": {
411
+ "(motion, 80309)": "person holding a cup",
412
+ "(static, 80310)": "person wearing pink shirt"
413
+ },
414
+ "3955": {
415
+ "(motion, 90542)": "person holding a remote control",
416
+ "(static, 90543)": "person in orange shirt"
417
+ },
418
+ "2409": {
419
+ "(motion, 55054)": "a man adjusting his head band",
420
+ "(static, 55055)": "man in orange and gray shirt"
421
+ },
422
+ "2775": {
423
+ "(motion, 63273)": "a person holding a remote control",
424
+ "(static, 63274)": "a tall person in white striped shirt and black pants"
425
+ },
426
+ "996": {
427
+ "(motion, 22281)": "a woman holding a baby",
428
+ "(static, 22282)": "woman wearing a black shirt and green apron"
429
+ },
430
+ "4789": {
431
+ "(motion, 52629)": "a person holding skies in one's hands",
432
+ "(static, 52630)": "a person with orange mirrored goggles"
433
+ },
434
+ "1028": {
435
+ "(motion, 22786)": "the cow standing up",
436
+ "(static, 22787)": "a cow in the middle"
437
+ },
438
+ "244": {
439
+ "(motion, 5666)": "a man holding wine glass",
440
+ "(static, 5668)": "a blonde man in a white shirt"
441
+ },
442
+ "3538": {
443
+ "(motion, 80923)": "the man throwing the ball from the picther ' s mound",
444
+ "(static, 80924)": "the man in front"
445
+ },
446
+ "557": {
447
+ "(motion, 12739)": "a baseball player getting ready to swing the bat",
448
+ "(static, 12740)": "a baseball player , wearing a white and blue uniform"
449
+ },
450
+ "4982": {
451
+ "(motion, 95870)": "cat sitting in front of television on a stand",
452
+ "(static, 95871)": "orange cat on the right side of the picture"
453
+ },
454
+ "4570": {
455
+ "(motion, 6638)": "a woman cutting a cake",
456
+ "(static, 6639)": "a woman wearing a long sleeve pink sweater"
457
+ },
458
+ "1698": {
459
+ "(motion, 38093)": "a baseball player swinging his bat",
460
+ "(static, 38094)": "a baseball player weaing a white uniform and blue helmet"
461
+ },
462
+ "3182": {
463
+ "(motion, 72616)": "the baseball player playing the catcher position",
464
+ "(static, 72617)": "the baseball player wearing a red and white uniform"
465
+ },
466
+ "846": {
467
+ "(motion, 19100)": "a man holding a toothbrush in his mouth",
468
+ "(static, 19101)": "a man wearing striped shirt"
469
+ },
470
+ "671": {
471
+ "(motion, 15227)": "person petting a horse",
472
+ "(static, 15228)": "person wearing a red jacket"
473
+ },
474
+ "3254": {
475
+ "(motion, 74216)": "person sitting in the chair",
476
+ "(static, 74217)": "person in the tan shirt wearing glasses"
477
+ },
478
+ "3318": {
479
+ "(motion, 75539)": "the person who is smashing cake in his own face",
480
+ "(static, 75540)": "person with a fake tie on its onesie"
481
+ },
482
+ "1424": {
483
+ "(motion, 31548)": "person watching another person eat",
484
+ "(static, 31549)": "person in the green shirt"
485
+ },
486
+ "3926": {
487
+ "(motion, 89831)": "person eating a sandwich",
488
+ "(static, 89832)": "person in orange top with sunglasses in one's head"
489
+ },
490
+ "862": {
491
+ "(motion, 19444)": "a man driving a bicycle and pulling a cart behind",
492
+ "(static, 19445)": "the man is wearing a pair of khaki shorts"
493
+ },
494
+ "2932": {
495
+ "(motion, 67140)": "man standing on bike",
496
+ "(static, 67141)": "man in blue jean shorts"
497
+ }
498
+ }
mbench/__init__.py ADDED
File without changes
mbench/__pycache__/transforms_video.cpython-39.pyc ADDED
Binary file (20 kB). View file
 
mbench/__pycache__/ytvos_ref.cpython-39.pyc ADDED
Binary file (7.4 kB). View file
 
mbench/check_image.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/check_image_numbered.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/check_image_revised.ipynb ADDED
@@ -0,0 +1,164 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 32,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import argparse\n",
11
+ "import sys\n",
12
+ "import opts\n",
13
+ "import matplotlib.pyplot as plt\n",
14
+ "import matplotlib.patches as patches\n",
15
+ "import textwrap\n",
16
+ "\n",
17
+ "from PIL import Image, ImageDraw\n",
18
+ "import json\n",
19
+ "import numpy as np\n",
20
+ "from mbench.ytvos_ref import build as build_ytvos_ref"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 26,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "img_folder = 'data/ref-youtube-vos/train'\n",
30
+ "text_colors = ['red', 'blue']"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 2,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "with open('mbench/result_revised50.json') as file:\n",
40
+ " data = json.load(file)"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 24,
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "def bounding_box(img):\n",
50
+ " rows = np.any(img, axis=1)\n",
51
+ " cols = np.any(img, axis=0)\n",
52
+ " rmin, rmax = np.where(rows)[0][[0, -1]]\n",
53
+ " cmin, cmax = np.where(cols)[0][[0, -1]]\n",
54
+ " return rmin, rmax, cmin, cmax # y1, y2, x1, x2 "
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 97,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "def showImageRef(vid_id):\n",
64
+ " vid_data = data[vid_id]\n",
65
+ " cats = list(vid_data.keys())\n",
66
+ "\n",
67
+ " for cat in cats:\n",
68
+ " cat_data = vid_data[cat]\n",
69
+ " frames = list(cat_data.keys())\n",
70
+ " \n",
71
+ " for frame in frames:\n",
72
+ " frame_data = cat_data[frame]\n",
73
+ " \n",
74
+ " img_path = os.path.join(img_folder, 'JPEGImages', vid_id, frame + '.jpg')\n",
75
+ " mask_path = os.path.join(img_folder, 'Annotations', vid_id, frame + '.png')\n",
76
+ " img = Image.open(img_path).convert('RGB')\n",
77
+ " mask = Image.open(mask_path).convert('P')\n",
78
+ " mask = np.array(mask)\n",
79
+ " \n",
80
+ " if frame_data:\n",
81
+ " obj_ids = list(frame_data.keys())\n",
82
+ " obj_nums = len(obj_ids)\n",
83
+ "\n",
84
+ " fig, axes = plt.subplots(1, obj_nums, figsize=(16, obj_nums))\n",
85
+ "\n",
86
+ " for i in range(len(obj_ids)):\n",
87
+ " obj_id = obj_ids[i]\n",
88
+ " obj_data = frame_data[obj_id]\n",
89
+ " if obj_data:\n",
90
+ " ref_exp = obj_data['ref_exp']\n",
91
+ " isValid = obj_data['isValid']\n",
92
+ "\n",
93
+ " obj_mask = (mask == int(obj_id)).astype(np.float32)\n",
94
+ " if (obj_mask > 0).any():\n",
95
+ " y1, y2, x1, x2 = bounding_box(obj_mask)\n",
96
+ " box = np.array([x1, y1, x2, y2])\n",
97
+ " else:\n",
98
+ " box = np.array([0, 0, 0, 0])\n",
99
+ " \n",
100
+ " if obj_nums == 1:\n",
101
+ " ax = axes\n",
102
+ " else:\n",
103
+ " ax = axes[i]\n",
104
+ " ax.imshow(img)\n",
105
+ " width, height = box[2] - box[0], box[3] - box[1]\n",
106
+ " rect = patches.Rectangle((x1, y1), width, height, linewidth=2, edgecolor='red', facecolor='none')\n",
107
+ " ax.add_patch(rect)\n",
108
+ "\n",
109
+ " wrapped_text = \"\\n\".join(textwrap.wrap(ref_exp, width=30))\n",
110
+ " ax.annotate(wrapped_text, xy=(0.5, -1.5), xycoords=\"axes fraction\", ha = \"center\", color=text_colors[isValid])\n",
111
+ " \n",
112
+ " plt.suptitle(f\"video: {vid_id} - cat: {cat} - frame: {frame}\")\n",
113
+ " plt.show()"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 142,
119
+ "metadata": {},
120
+ "outputs": [
121
+ {
122
+ "name": "stdout",
123
+ "output_type": "stream",
124
+ "text": [
125
+ "04667fabaa\n"
126
+ ]
127
+ }
128
+ ],
129
+ "source": [
130
+ "vid_id = list(data.keys())[49]\n",
131
+ "print(vid_id)\n",
132
+ "showImageRef(vid_id)"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": null,
138
+ "metadata": {},
139
+ "outputs": [],
140
+ "source": []
141
+ }
142
+ ],
143
+ "metadata": {
144
+ "kernelspec": {
145
+ "display_name": "referformer",
146
+ "language": "python",
147
+ "name": "referformer"
148
+ },
149
+ "language_info": {
150
+ "codemirror_mode": {
151
+ "name": "ipython",
152
+ "version": 3
153
+ },
154
+ "file_extension": ".py",
155
+ "mimetype": "text/x-python",
156
+ "name": "python",
157
+ "nbconvert_exporter": "python",
158
+ "pygments_lexer": "ipython3",
159
+ "version": "3.10.16"
160
+ }
161
+ },
162
+ "nbformat": 4,
163
+ "nbformat_minor": 2
164
+ }
mbench/gpt_ref-ytvos-revised.py ADDED
@@ -0,0 +1,428 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from mbench.ytvos_ref import build as build_ytvos_ref
6
+ import argparse
7
+ import opts
8
+
9
+ import sys
10
+ from pathlib import Path
11
+ import os
12
+ from os import path as osp
13
+ import skimage
14
+ from io import BytesIO
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ import regex as re
19
+ import json
20
+
21
+ import cv2
22
+ from PIL import Image, ImageDraw
23
+ import torch
24
+ from torchvision.transforms import functional as F
25
+
26
+ from skimage import measure # (pip install scikit-image)
27
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
28
+
29
+ import matplotlib.pyplot as plt
30
+ import matplotlib.patches as patches
31
+ from matplotlib.collections import PatchCollection
32
+ from matplotlib.patches import Rectangle
33
+
34
+
35
+ import ipywidgets as widgets
36
+ from IPython.display import display, clear_output
37
+
38
+ from openai import OpenAI
39
+ import base64
40
+
41
+ # Function to encode the image
42
+ def encode_image(image_path):
43
+ with open(image_path, "rb") as image_file:
44
+ return base64.b64encode(image_file.read()).decode("utf-8")
45
+
46
+ # Captioner
47
+ ytvos_category_valid_list = [
48
+ 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
49
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
50
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
51
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
52
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
53
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
54
+ ]
55
+ def getCaption(video_id, json_data):
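+ # For each category in the video: check that it is movable, ask GPT whether the frame
+ # shows multiple objects with distinct actions, and if so request a dense action-centric
+ # caption per frame. Returns (all_captions, valid_obj_ids).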
56
+ # Load the video's data
57
+ video_data = json_data[video_id]
58
+ frame_names = video_data['frame_names']
59
+ video_path = video_data['video_path']
60
+
61
+ cat_names = set()
62
+ all_captions = dict()
63
+ for obj_id in list(video_data['annotations'][0].keys()):
64
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
65
+
66
+ # cat_names : person, snowboard
67
+ # 1. Ask GPT directly whether this category can be the subject of an action
68
+ # 2. From the category list provided by ref-youtube-vos, keep only the category names we want to process
69
+
70
+ for cat_name in list(cat_names) :
71
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
72
+ image_captions = {}
73
+
74
+ captioner = OpenAI()
75
+
76
+ # Step 0: can this category be the subject of an action?
77
+ is_movable = False
78
+ if cat_name in ytvos_category_valid_list :
79
+ is_movable = True
80
+
81
+ # response_check = captioner.chat.completions.create(
82
+ # model="gpt-4o",
83
+ # messages=[
84
+ # {
85
+ # "role": "user",
86
+ # "content": f"""
87
+ # Can a {cat_name} be a subject of distinct actions or movements?
88
+ # For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
89
+ # However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
90
+ # Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
91
+ # Answer only YES or NONE.
92
+ # """
93
+ # }
94
+ # ],
95
+ # )
96
+ # response_check_content = response_check.choices[0].message.content.strip().lower()
97
+ # print(f"Movable Check for {cat_name}: {response_check_content}")
98
+
99
+ # if response_check_content == "yes": is_movable = True
100
+
101
+ if not is_movable:
102
+ print(f"Skipping {cat_name}: Determined to be non-movable.")
103
+ continue
104
+
105
+ for i in range(len(image_paths)):
106
+ image_path = image_paths[i]
107
+ frame_name = frame_names[i]
108
+ base64_image = encode_image(image_path)
109
+
110
+ # Step 1: filtering
111
+ #print(f"-----------category name: {cat_name}, frame name: {frame_name}")
112
+ response1 = captioner.chat.completions.create(
113
+ model="chatgpt-4o-latest",
114
+ messages=[
115
+ {
116
+ "role": "user",
117
+ "content": [
118
+ {
119
+ "type": "text",
120
+
121
+ "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
122
+ Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
123
+ Each action should be unique and clearly associated with a specific object.
124
+
125
+ Respond with YES if:
126
+ - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
127
+ - The {cat_name}s involve clear, distinguishable actions performed independently.
128
+
129
+ Respond with NONE if:
130
+ - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
131
+ - Actions are ambiguous, minor, or not clearly visible.
132
+
133
+ If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
134
+ If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
135
+
136
+ Answer only YES or NONE."""
137
+
138
+ },
139
+ {
140
+ "type": "image_url",
141
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
142
+ },
143
+ ],
144
+ }
145
+ ],
146
+ )
147
+ response_content = response1.choices[0].message.content
148
+ should_caption = True if "yes" in response_content.lower() else False
149
+ #print(f"are {cat_name}s distinguished by action: {response_content}")
150
+
151
+ # Step 2: generate a dense caption
152
+ if should_caption:
153
+ response2 = captioner.chat.completions.create(
154
+ model="chatgpt-4o-latest",
155
+ messages=[
156
+ {
157
+ "role": "user",
158
+ "content": [
159
+ {
160
+ "type": "text",
161
+
162
+ "text": f"""
163
+ Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
164
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
165
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
166
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
167
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
168
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
169
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
170
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
171
+ 8. Include interactions with objects or other entities when they are prominent and observable.
172
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
173
+ Output only the caption.""",
174
+ },
175
+ {
176
+ "type": "image_url",
177
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
178
+ },
179
+ ],
180
+ }
181
+ ],
182
+ )
183
+
184
+ caption = response2.choices[0].message.content
185
+ #print(f"{image_path} - {frame_name}: {caption}")
186
+ else:
187
+ caption = None
188
+
189
+ image_captions[frame_name] = caption
190
+ all_captions[cat_name] = image_captions
191
+
192
+ # final : also prepare valid object ids
193
+ valid_obj_ids = []
194
+ valid_cat_names = list(all_captions.keys())
195
+ for obj_id in list(video_data['annotations'][0].keys()):
196
+ cat = video_data['annotations'][0][obj_id]['category_name']
197
+ if cat in valid_cat_names : valid_obj_ids.append(obj_id)
198
+
199
+ return all_captions, valid_obj_ids
200
+
201
+ # Referring expression generator and QA filter
202
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
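+ # Draws the target object's bounding box, asks GPT whether the cropped object is
+ # identifiable, generates a referring expression from the dense caption, and QA-filters
+ # it (it must describe the boxed object and no other).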
203
+
204
+ # Draw the object's bounding box on the image
205
+ video_data = json_data[video_id]
206
+ frame_names = video_data['frame_names']
207
+ video_path = video_data['video_path']
208
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
209
+ frame_indx = frame_names.index(frame_name)
210
+ obj_data = video_data['annotations'][frame_indx][obj_id]
211
+
212
+ bbox = obj_data['bbox']
213
+ cat_name = obj_data['category_name']
214
+ valid = obj_data['valid']
215
+
216
+ if valid == 0:
217
+ print("Object not in this frame!")
218
+ return {}
219
+
220
+
221
+ x_min, y_min, x_max, y_max = bbox
222
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
223
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
224
+ plt.figure()
225
+ plt.imshow(I)
226
+ plt.axis('off')
227
+ plt.show()
228
+
229
+ #cropped object for visibility check
230
+ cropped_I = I[y_min:y_max, x_min:x_max]
231
+ pil_cropped_I = Image.fromarray(cropped_I)
232
+ buff_crop = BytesIO()
233
+ pil_cropped_I.save(buff_crop, format='JPEG')
234
+ base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
235
+
236
+ #entire image for referring expression generation
237
+ pil_I = Image.fromarray(I)
238
+ buff = BytesIO()
239
+ pil_I.save(buff, format='JPEG')
240
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
241
+
242
+ # Check whether the object is identifiable
243
+ generator = OpenAI()
244
+ response_check = generator.chat.completions.create(
245
+ model="chatgpt-4o-latest",
246
+ messages=[
247
+ {
248
+ "role": "user",
249
+ "content": [
250
+ {
251
+
252
+ "type": "text",
253
+ "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
254
+ Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
255
+
256
+ Guidelines:
257
+ - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
258
+ - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
259
+ - If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
260
+
261
+ Output only either YES or NONE.
262
+ """
263
+ },
264
+ {
265
+ "type": "image_url",
266
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
267
+ }
268
+ ]
269
+ },
270
+ ]
271
+ )
272
+
273
+ response_check_content = response_check.choices[0].message.content.strip().lower()
274
+ #print(f"is object {obj_id} visible: {response_check_content}")
275
+
276
+ if "yes" not in response_check_content:
277
+ print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
278
+ return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False}
279
+
280
+ # Generate the referring expression
281
+ # generator = OpenAI()
282
+ response = generator.chat.completions.create(
283
+ model="chatgpt-4o-latest",
284
+ messages=[
285
+ {
286
+ "role": "user",
287
+ "content": [
288
+ {
289
+ "type": "text",
290
+
291
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
292
+ Guidelines for creating the referring expression:
293
+ 1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
294
+ 2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
295
+ 3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
296
+ 4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
297
+ 5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
298
+ 6. Use '{cat_name}' as the noun for the referring expressions.
299
+ Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
300
+
301
+ {caption}
302
+ """
303
+ },
304
+ {
305
+ "type": "image_url",
306
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
307
+ },
308
+ # {
309
+ # "type": "image_url",
310
+ # "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
311
+ # }
312
+ ],
313
+ }
314
+ ],
315
+ )
316
+
317
+ ref_exp = response.choices[0].message.content.strip()
318
+
319
+ #QA filtering
320
+ # QA1: does the expression describe the target object?
321
+ filter = OpenAI()
322
+ response1 = filter.chat.completions.create(
323
+ model="chatgpt-4o-latest",
324
+ messages=[
325
+ {
326
+ "role": "user",
327
+ "content": [
328
+ {
329
+ "type": "text",
330
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
331
+ {ref_exp}""",
332
+ },
333
+ {
334
+ "type": "image_url",
335
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
336
+ },
337
+ ],
338
+ }
339
+ ],
340
+ )
341
+
342
+ response1_content = response1.choices[0].message.content
343
+ describesHighlighted = True if "yes" in response1_content.lower() else False
344
+
345
+ # QA2: does the expression avoid describing non-target objects?
346
+ response2 = filter.chat.completions.create(
347
+ model="chatgpt-4o-latest",
348
+ messages=[
349
+ {
350
+ "role": "user",
351
+ "content": [
352
+ {
353
+ "type": "text",
354
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
355
+ {ref_exp}""",
356
+ },
357
+ {
358
+ "type": "image_url",
359
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
360
+ },
361
+ ],
362
+ }
363
+ ],
364
+ )
365
+
366
+ response2_content = response2.choices[0].message.content
367
+ notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
368
+
369
+ isValid = True if describesHighlighted and notDescribesNotHighlighted else False
370
+
371
+ #print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
372
+ #print(f"ref exp: {ref_exp}")
373
+ #print("")
374
+
375
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
376
+
377
+
378
+ if __name__ == '__main__':
379
+ with open('mbench/sampled_frame3.json', 'r') as file:
380
+ data = json.load(file)
381
+
382
+ vid_ids = list(data.keys())
383
+ all_ref_exps = {}
384
+
385
+ os.environ['OPENAI_API_KEY'] = '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'
386
+
387
+ # For each vid_id in the dataset
388
+ for i in range(50):
389
+ vid_id = vid_ids[i]
390
+
391
+ # ==== generate captions ====
392
+ # print("=====================captioner========================")
393
+ captions, valid_obj_ids = getCaption(vid_id, data)
394
+ cats_in_vid = list(captions.keys())
395
+ # print()
396
+
397
+ # ==== generate referring expressions and run QA filtering ====
398
+ # print("=====================referring expression generator & QA filter========================")
399
+ ref_expressions = {}
400
+
401
+ # For each category
402
+ for cat_name in cats_in_vid:
403
+ if cat_name not in ref_expressions:
404
+ ref_expressions[cat_name] = {}
405
+ # For each video frame
406
+ for frame_name in data[vid_id]['frame_names']:
407
+ # print(f'--------category: {cat_name}, frame_name: {frame_name}')
408
+
409
+ if frame_name not in ref_expressions[cat_name]:
410
+ ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary
411
+ caption = captions[cat_name][frame_name]
412
+ if not caption : continue
413
+ else :
414
+ # For each object id
415
+ for obj_id in valid_obj_ids:
416
+ ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
417
+ ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp
418
+
419
+ all_ref_exps[vid_id] = ref_expressions
420
+
421
+
422
+ with open('mbench/result_revised50.json', 'w') as file:
423
+ json.dump(all_ref_exps, file, indent=4)
424
+
425
+
426
+
427
+
428
+
mbench/gpt_ref-ytvos.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/gpt_ref-ytvos.py ADDED
@@ -0,0 +1,302 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+ from pathlib import Path
10
+ import os
11
+ import skimage
12
+ from io import BytesIO
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ from openai import OpenAI
37
+ import base64
38
+
39
+ os.environ['OPENAI_API_KEY'] = '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'
40
+
41
+
42
+ ytvos_category_valid_list = [
43
+ 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
44
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
45
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
46
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
47
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
48
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
49
+ ]
50
+
51
+ # Function to encode the image
52
+ def encode_image(image_path):
53
+ with open(image_path, "rb") as image_file:
54
+ return base64.b64encode(image_file.read()).decode("utf-8")
55
+
56
+ def getCaption(video_id, json_data):
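+ # Single-category variant: per frame, ask GPT whether multiple objects are distinguishable
+ # by action; if so, request a dense caption. Returns {frame_name: caption or None}.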
57
+ # Load the video's data
58
+ video_data = json_data[video_id]
59
+ frame_names = video_data['frame_names']
60
+ video_path = video_data['video_path']
61
+
62
+ cat_names = set()
63
+ for obj_id in list(video_data['annotations'][0].keys()):
64
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
65
+
66
+ if len(cat_names) == 1:
67
+ cat_name = next(iter(cat_names))
68
+ else:
69
+ print("more than 2 categories")
70
+ return -1
71
+
72
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
73
+ image_captions = {}
74
+
75
+ captioner = OpenAI()
76
+ for i in range(len(image_paths)):
77
+ image_path = image_paths[i]
78
+ frame_name = frame_names[i]
79
+ base64_image = encode_image(image_path)
80
+
81
+ # Step 1: filtering
82
+ response1 = captioner.chat.completions.create(
83
+ model="gpt-4o-mini",
84
+ messages=[
85
+ {
86
+ "role": "user",
87
+ "content": [
88
+ {
89
+ "type": "text",
90
+ "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
91
+ },
92
+ {
93
+ "type": "image_url",
94
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
95
+ },
96
+ ],
97
+ }
98
+ ],
99
+ )
100
+ response_content = response1.choices[0].message.content
101
+ should_caption = True if "yes" in response_content.lower() else False
102
+
103
+ # Step 2: generate a dense caption
104
+ if should_caption:
105
+ response2 = captioner.chat.completions.create(
106
+ model="gpt-4o-mini",
107
+ messages=[
108
+ {
109
+ "role": "user",
110
+ "content": [
111
+ {
112
+ "type": "text",
113
+ "text": f"""
114
+ Describe the image in detail focusing on the {cat_name}s' actions.
115
+ 1. Each action should be prominent, clear and unique, describing the corresponding object only.
116
+ 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
117
+ 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
118
+ 4. Do not include actions that needs to be guessed or suggested.""",
119
+ },
120
+ {
121
+ "type": "image_url",
122
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
123
+ },
124
+ ],
125
+ }
126
+ ],
127
+ )
128
+
129
+ caption = response2.choices[0].message.content
130
+ else:
131
+ caption = None
132
+
133
+ image_captions[frame_name] = caption
134
+ return image_captions
135
+
136
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
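+ # Draws the object's bounding box on the frame, generates a referring expression from
+ # the dense caption, and QA-filters it with two yes/no checks.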
137
+ # Draw the object's bounding box on the image
138
+ video_data = json_data[video_id]
139
+ frame_names = video_data['frame_names']
140
+ video_path = video_data['video_path']
141
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
142
+ frame_indx = frame_names.index(frame_name)
143
+ obj_data = video_data['annotations'][frame_indx][obj_id]
144
+
145
+ bbox = obj_data['bbox']
146
+ cat_name = obj_data['category_name']
147
+ valid = obj_data['valid']
148
+
149
+ if valid == 0:
150
+ print("Object not in this frame!")
151
+ return {}
152
+
153
+
154
+ x_min, y_min, x_max, y_max = bbox
155
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
156
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
157
+ # plt.figure()
158
+ # plt.imshow(I)
159
+ # plt.axis('off')
160
+ # plt.show()
161
+ pil_I = Image.fromarray(I)
162
+ buff = BytesIO()
163
+ pil_I.save(buff, format='JPEG')
164
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
165
+
166
+ # Generate the referring expression
167
+ generator = OpenAI()
168
+ response = generator.chat.completions.create(
169
+ model="gpt-4o-mini",
170
+ messages=[
171
+ {
172
+ "role": "user",
173
+ "content": [
174
+ {
175
+ "type": "text",
176
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
177
+ 1. The referring expression describes the action and does not contain information about appearance or location in the picture.
178
+ 2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
179
+ 3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
180
+ 4. The referring expression should only describe the highlighted {cat_name} and not any other.
181
+ 5. Use '{cat_name}' as the noun for the referring expressions.
182
+ Output only the referring expression.
183
+ {caption}""",
184
+ },
185
+ {
186
+ "type": "image_url",
187
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
188
+ },
189
+ ],
190
+ }
191
+ ],
192
+ )
193
+
194
+ ref_exp = response.choices[0].message.content
195
+
196
+ #QA filtering
197
+ # QA1: does the expression describe the target object?
198
+ filter = OpenAI()
199
+ response1 = filter.chat.completions.create(
200
+ model="gpt-4o-mini",
201
+ messages=[
202
+ {
203
+ "role": "user",
204
+ "content": [
205
+ {
206
+ "type": "text",
207
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
208
+ {ref_exp}""",
209
+ },
210
+ {
211
+ "type": "image_url",
212
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
213
+ },
214
+ ],
215
+ }
216
+ ],
217
+ )
218
+
219
+ response1_content = response1.choices[0].message.content
220
+ describesHighlighted = True if "yes" in response1_content.lower() else False
221
+
222
+ # QA2: does the expression avoid describing non-target objects?
223
+ response2 = filter.chat.completions.create(
224
+ model="gpt-4o-mini",
225
+ messages=[
226
+ {
227
+ "role": "user",
228
+ "content": [
229
+ {
230
+ "type": "text",
231
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
232
+ {ref_exp}""",
233
+ },
234
+ {
235
+ "type": "image_url",
236
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
237
+ },
238
+ ],
239
+ }
240
+ ],
241
+ )
242
+
243
+ response2_content = response2.choices[0].message.content
244
+ describesNotHighlighted = True if "yes" in response2_content.lower() else False
245
+
246
+ isValid = True if describesHighlighted and not describesNotHighlighted else False
247
+
248
+ print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
249
+
250
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
251
+
252
+ def createRefExp(video_id, json_data):
253
+ video_data = json_data[video_id]
254
+ obj_ids = list(video_data['annotations'][0].keys())
255
+ frame_names = video_data['frame_names']
256
+
257
+ captions_per_frame = getCaption(video_id, json_data)
258
+
259
+ if captions_per_frame == -1:
260
+ print("There are more than 2 cateories")
261
+ return None
262
+
263
+
264
+ video_ref_exps = {}
265
+
266
+ for frame_name in frame_names:
267
+ frame_caption = captions_per_frame[frame_name]
268
+
269
+ if frame_caption == None:
270
+ video_ref_exps[frame_name] = None
271
+
272
+ else:
273
+ frame_ref_exps = {}
274
+ for obj_id in obj_ids:
275
+ exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
276
+ frame_ref_exps[obj_id] = exp_per_obj
277
+ video_ref_exps[frame_name] = frame_ref_exps
278
+
279
+ return video_ref_exps
280
+
281
+ if __name__ == '__main__':
282
+ with open('mbench/sampled_frame3.json', 'r') as file:
283
+ data = json.load(file)
284
+
285
+ videos = set()
286
+ with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
287
+ manual_select = list(file)
288
+ for frame in manual_select:
289
+ result = json.loads(frame)
290
+ videos.add(result['video'])
291
+ videos = list(videos)
292
+
293
+
294
+ all_video_refs = {}
295
+ for i in range(10):
296
+ video_id = videos[i]
297
+ video_ref = createRefExp(video_id, data)
298
+ all_video_refs[video_id] = video_ref
299
+
300
+ json_obj = json.dumps(all_video_refs, indent=4)
301
+ with open('mbench/result.json', 'w') as file:
302
+ file.write(json_obj)
mbench/gpt_ref-ytvos_numbered_cy.py ADDED
@@ -0,0 +1,460 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+
9
+ from mbench.ytvos_ref import build as build_ytvos_ref
10
+ import argparse
11
+ import opts
12
+
13
+ import sys
14
+ from pathlib import Path
15
+ import os
16
+ from os import path as osp
17
+ import skimage
18
+ from io import BytesIO
19
+
20
+ import numpy as np
21
+ import pandas as pd
22
+ import regex as re
23
+ import json
24
+
25
+ import cv2
26
+ from PIL import Image, ImageDraw
27
+ import torch
28
+ from torchvision.transforms import functional as F
29
+
30
+ from skimage import measure # (pip install scikit-image)
31
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
32
+
33
+ import matplotlib.pyplot as plt
34
+ import matplotlib.patches as patches
35
+ from matplotlib.collections import PatchCollection
36
+ from matplotlib.patches import Rectangle
37
+ import textwrap
38
+
39
+
40
+ import ipywidgets as widgets
41
+ from IPython.display import display, clear_output
42
+
43
+ from openai import OpenAI
44
+ import base64
45
+ import json
46
+
47
+ def number_objects_and_encode(idx, color_mask=False):
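+ # For every category in the sampled video, overlay each valid object's numeric ID on the
+ # frame (contour outline, or a translucent colored mask when color_mask=True) and return
+ # base64-encoded frames, per-frame object counts, and contour-only frames.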
48
+ encoded_frames = {}
49
+ contoured_frames = {} # New dictionary for original images
50
+ vid_cat_cnts = {}
51
+
52
+ vid_meta = metas[idx]
53
+ vid_data = train_dataset[idx]
54
+ vid_id = vid_meta['video']
55
+ frame_indx = vid_meta['sample_indx']
56
+ cat_names = set(vid_meta['obj_id_cat'].values())
57
+ imgs = vid_data[0]
58
+
59
+ for cat in cat_names:
60
+ cat_frames = []
61
+ contour_frames = []
62
+ frame_cat_cnts = {}
63
+
64
+ for i in range(imgs.size(0)):
65
+ frame_name = frame_indx[i]
66
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
67
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
68
+
69
+ frame_data = vid_data[2][frame_name]
70
+ obj_ids = list(frame_data.keys())
71
+
72
+ cat_cnt = 0
73
+
74
+ for j in range(len(obj_ids)):
75
+ obj_id = obj_ids[j]
76
+ obj_data = frame_data[obj_id]
77
+ obj_bbox = obj_data['bbox']
78
+ obj_valid = obj_data['valid']
79
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
80
+ obj_cat = obj_data['category_name']
81
+
82
+ if obj_cat == cat and obj_valid:
83
+ cat_cnt += 1
84
+
85
+ if color_mask == False:
86
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
87
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
88
+ for contour in contours: # avoid shadowing the outer frame index i
89
+ # Compute the contour centroid
90
+ moments = cv2.moments(contour)
91
+ if moments["m00"] != 0: # check whether a centroid can be computed
92
+ cx = int(moments["m10"] / moments["m00"])
93
+ cy = int(moments["m01"] / moments["m00"])
94
+ else:
95
+ cx, cy = contour[0][0] # fall back to a contour point when no centroid is available
96
+
97
+ # Text background (draw a black backdrop)
98
+ font = cv2.FONT_HERSHEY_SIMPLEX
99
+ text = obj_id
100
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
101
+ text_w, text_h = text_size
102
+
103
+ # Draw the text background (black)
104
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
105
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
106
+
107
+ # Draw the text (white)
108
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
109
+ font, 1, (255, 255, 255), 2)
110
+
111
+ else:
112
+ alpha = 0.08
113
+
114
+ colored_obj_mask = np.zeros_like(frame)
115
+ colored_obj_mask[obj_mask == 1] = colors[j]
116
+ frame[obj_mask == 1] = (
117
+ (1 - alpha) * frame[obj_mask == 1]
118
+ + alpha * colored_obj_mask[obj_mask == 1]
119
+ )
120
+
121
+
122
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
123
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
124
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
125
+
126
+
127
+
128
+ if len(contours) > 0:
129
+ largest_contour = max(contours, key=cv2.contourArea)
130
+ M = cv2.moments(largest_contour)
131
+ if M["m00"] != 0:
132
+ center_x = int(M["m10"] / M["m00"])
133
+ center_y = int(M["m01"] / M["m00"])
134
+ else:
135
+ center_x, center_y = 0, 0
136
+
137
+ font = cv2.FONT_HERSHEY_SIMPLEX
138
+ text = obj_id
139
+
140
+ font_scale = 0.9
141
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
142
+ text_x = center_x - text_size[0] // 1 # horizontal anchor of the text
143
+ text_y = center_y
144
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
145
+
146
+ # Compute the coordinates of the text background rectangle
147
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
148
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
149
+ rect_end = (text_x + text_size[0] + 5, text_y)
150
+
151
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
152
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
153
+
154
+ # plt.figure(figsize=(12, 8))
155
+ # plt.imshow(frame)
156
+ # plt.title(f"frame {frame_name}")
157
+ # plt.tight_layout()
158
+ # plt.axis('off')
159
+ # plt.show()
160
+
161
+ buffer = BytesIO()
162
+ frame = Image.fromarray(frame)
163
+ frame.save(buffer, format='jpeg')
164
+ buffer.seek(0)
165
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
166
+ frame_cat_cnts[frame_name] = cat_cnt
167
+
168
+ buffer.seek(0) # Reuse buffer instead of creating a new one
169
+ buffer.truncate()
170
+ frame_for_contour = Image.fromarray(frame_for_contour)
171
+ frame_for_contour.save(buffer, format='jpeg')
172
+ buffer.seek(0)
173
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
174
+
175
+ encoded_frames[cat] = cat_frames
176
+ contoured_frames[cat] = contour_frames
177
+ vid_cat_cnts[cat] = frame_cat_cnts
178
+
179
+ return encoded_frames, vid_cat_cnts, contoured_frames
180
+
181
+
182
+ def getCaption(idx, model='gpt-4o', color_mask=True):
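+ # Numbered-object variant: for each movable category with at least two labeled objects in
+ # a frame, ask GPT whether their actions are distinguishable, then request one
+ # action-centric referring expression per numbered object.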
183
+ vid_meta = metas[idx]
184
+ vid_data = train_dataset[idx]
185
+ vid_id = vid_meta['video']
186
+ print(f"vid id: {vid_id}\n")
187
+
188
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
189
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
190
+ all_captions = dict()
191
+
192
+ base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
193
+ #marked = "mask with boundary" if color_mask else "boundary"
194
+
195
+ for cat_name in list(cat_names) :
196
+
197
+ is_movable = False
198
+ if cat_name in ytvos_category_valid_list :
199
+ is_movable = True
200
+
201
+ if not is_movable:
202
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
203
+
204
+
205
+ image_captions = {}
206
+ captioner = OpenAI()
207
+ cat_base64_frames = base64_frames[cat_name]
208
+ cont_base64_frames = contoured_frames[cat_name]
209
+
210
+ for i in range(len(cat_base64_frames)):
211
+ frame_name = frame_indx[i]
212
+ cont_base64_image = cont_base64_frames[i]
213
+ base64_image = cat_base64_frames[i]
214
+ should_filter = False
215
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
216
+
217
+ if frame_cat_cnts >= 2:
218
+ should_filter = True
219
+ else:
220
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
221
+
222
+ if is_movable and should_filter:
223
+ # Step 1: filtering
224
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
225
+ caption_filter_text = f"""
226
+ You are a visual assistant analyzing a single frame from a video.
227
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
228
+
229
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
230
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
231
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
232
+
233
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
234
+
235
+ - Respond with "YES" if:
236
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
237
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
238
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
239
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
240
+
241
+ - Respond with "NONE" if:
242
+ 1) The actions or pose are not clearly differentiable or too similar.
243
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
244
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
245
+
246
+ Answer strictly with either "YES" or "NONE".
247
+ """
248
+
249
+ response1 = captioner.chat.completions.create(
250
+ model=model,
251
+ messages=[
252
+ {
253
+ "role": "user",
254
+ "content": [
255
+ {
256
+ "type": "text",
257
+ "text": caption_filter_text,
258
+ },
259
+ {
260
+ "type": "image_url",
261
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
262
+ }
263
+ ],
264
+ }
265
+ ],
266
+ )
267
+ response_content = response1.choices[0].message.content
268
+ should_caption = True if "yes" in response_content.lower() else False
269
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
270
+
271
+ else:
272
+ should_caption = False
273
+
274
+ # Step 2: generate a dense caption
275
+ dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
276
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
277
+ I want to use your expressions to create a action-centric referring expression dataset.
278
+ Therefore, your expressions for these {cat_name}s should describe unique action of each object.
279
+
280
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
281
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
282
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
283
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
284
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
285
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
286
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
287
+ 8. Include interactions with objects or other entities when they are prominent and observable.
288
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
289
+ 10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
290
+ 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
291
+ 12. Do not mention object IDs.
292
+ 13. Use '{cat_name}' as the noun for the referring expressions.
293
+
294
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
295
+ Output referring expressions for each object id.
296
+ """
297
+
298
+ dense_caption_prompt = f"""
299
+ You are a visual assistant analyzing a single frame of a video.
300
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
301
+
302
+ I want to use your expressions to create an **action-centric referring expression** dataset.
303
+ Please describe each {cat_name} using **clearly observable** and **specific** actions.
304
+
305
+ ---
306
+ ## Guidelines:
307
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
308
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
309
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
310
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
311
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
312
+ 6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions.
313
+ 7. Base your description on these action definitions:
314
+ - Avoid using the terms 'minimal' or 'slightly'.
315
+ - Describe general body movement, body position, or a prominent pattern (e.g. "lifting its head up", "facing towards the camera", "showing its back").
316
+ - Include details such as motion, facial expression, and object manipulation when they are visible.
317
+ - Describe movements with objects or other entities when they are prominent and observable; the expression should be specific.
318
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
319
+ ---
320
+
321
+ ## Output Format:
322
+ - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and follow this format:
323
+ object id. using {cat_name} as subject noun, action-oriented description
324
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
325
+ - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
326
+
327
+ ### Example
328
+ If the frame has 2 labeled bears, your output should look like:
329
+ 1. the bear reaching his right arm while leaning forward to capture the prey
330
+ 2. a bear standing upright facing right, touching the bike aside
331
+
332
+ ---
333
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
334
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
335
+ **Do not include markdown** in the output.
336
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
337
+ For each labeled {cat_name}, output referring expressions for each object id.
338
+ """
339
+ MAX_RETRIES = 2
340
+ retry_count = 0
341
+
342
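+ # Only frames that passed the action-distinguishability filter are captioned; the request
+ # is retried (up to MAX_RETRIES) until the reply looks like a numbered list rather than a refusal.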
+ if should_caption:
343
+ while retry_count < MAX_RETRIES:
344
+
345
+ response2 = captioner.chat.completions.create(
346
+ model=model,
347
+ messages=[
348
+ {
349
+ "role": "user",
350
+ "content": [
351
+ {
352
+ "type": "text",
353
+ "text": dense_caption_prompt,
354
+ },
355
+ {
356
+ "type": "image_url",
357
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
358
+ },
359
+ ],
360
+ }
361
+ ],
362
+ )
363
+
364
+ # caption = response2.choices[0].message.content
365
+ #print(f"{image_path} - {frame_name}: {caption}")
366
+
367
+ caption = response2.choices[0].message.content.strip()
368
+ caption_lower = caption.lower().lstrip()
369
+
370
+ if caption_lower.startswith("1.") and not any(
371
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
372
+ ):
373
+ break
374
+
375
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
376
+ retry_count += 1
377
+ time.sleep(2)
378
+
379
+ if retry_count == MAX_RETRIES:
380
+ caption = None
381
+ print("Max retries reached. Caption generation failed.")
382
+
383
+ else:
384
+ caption = None
385
+
386
+ image_captions[frame_name] = caption
387
+ all_captions[cat_name] = image_captions
388
+
389
+ # final : also prepare valid object ids
390
+ valid_obj_ids = dict()
391
+
392
+ for cat in cat_names:
393
+ if cat in ytvos_category_valid_list:
394
+ obj_id_cat = vid_meta['obj_id_cat']
395
+ valid_cat_ids = []
396
+ for obj_id in list(obj_id_cat.keys()):
397
+ if obj_id_cat[obj_id] == cat:
398
+ valid_cat_ids.append(obj_id)
399
+ valid_obj_ids[cat] = valid_cat_ids
400
+
401
+ return vid_id, all_captions, valid_obj_ids
402
+
403
+
404
+
405
+ if __name__ == '__main__':
406
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
407
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
408
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
409
+
410
+ args = parser.parse_args()
411
+
412
+ #================== Load data ===================
413
+ # Full dataset
414
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
415
+
416
+ # Metadata for the full dataset
417
+ metas = train_dataset.metas
418
+
419
+ # Eight candidate colors (RGB)
420
+ colors = [
421
+ (255, 0, 0), # Red
422
+ (0, 255, 0), # Green
423
+ (0, 0, 255), # Blue
424
+ (255, 255, 0), # Yellow
425
+ (255, 0, 255), # Magenta
426
+ (0, 255, 255), # Cyan
427
+ (128, 0, 128), # Purple
428
+ (255, 165, 0) # Orange
429
+ ]
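+ # Objects are drawn with colors[j], where j is the object's index within a frame, so this
+ # palette supports at most eight distinctly colored objects per frame.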
430
+
431
+ ytvos_category_valid_list = [
432
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
433
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
434
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
435
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
436
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
437
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
438
+ ]
439
+
440
+ #================== Run GPT ===================
441
+ os.environ['OPENAI_API_KEY'] = '<YOUR_OPENAI_API_KEY>'  # placeholder; supply a real key via the environment, never hard-code it
442
+
443
+ result_captions = {}
444
+ result_valid_obj_ids = {}
445
+
446
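+ # NOTE: range(370) assumes this split contains at least 370 videos; adjust the bound if the dataset size differs.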
+ for i in range(370):
447
+ vid_id, all_captions, valid_obj_ids = getCaption(i, color_mask=False)
448
+
449
+ if vid_id not in result_captions:
450
+ result_captions[vid_id] = all_captions
451
+ if vid_id not in result_valid_obj_ids:
452
+ result_valid_obj_ids[vid_id] = valid_obj_ids
453
+
454
+ print("Finished!", flush=True)
455
+
456
+ with open(args.save_caption_path, "w") as file:
457
+ json.dump(result_captions, file, indent=4)
458
+
459
+ with open(args.save_valid_obj_ids_path, "w") as file:
460
+ json.dump(result_valid_obj_ids, file, indent=4)
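+ # Example invocation (illustrative; script name and output paths are assumptions):
+ #   python mbench/gpt_ref-ytvos_numbered_cy.py \
+ #       --save_caption_path mbench/numbered_captions.json \
+ #       --save_valid_obj_ids_path mbench/numbered_valid_obj_ids.json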
mbench/gpt_ref-ytvos_numbered_cy_sanity.py ADDED
@@ -0,0 +1,643 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+ import random
9
+
10
+ from mbench.ytvos_ref import build as build_ytvos_ref
11
+ import argparse
12
+ import opts
13
+
14
+ import sys
15
+ from pathlib import Path
16
+ import os
17
+ from os import path as osp
18
+ import skimage
19
+ from io import BytesIO
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import regex as re
24
+ import json
25
+
26
+ import cv2
27
+ from PIL import Image, ImageDraw
28
+ import torch
29
+ from torchvision.transforms import functional as F
30
+
31
+ from skimage import measure # (pip install scikit-image)
32
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
33
+
34
+ import matplotlib.pyplot as plt
35
+ import matplotlib.patches as patches
36
+ from matplotlib.collections import PatchCollection
37
+ from matplotlib.patches import Rectangle
38
+ import textwrap
39
+
40
+
41
+ import ipywidgets as widgets
42
+ from IPython.display import display, clear_output
43
+
44
+ from openai import OpenAI
45
+ import base64
46
+ import json
47
+
48
+ def number_objects_and_encode(idx, color_mask=False):
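+ """Draw per-object contours and numeric ID labels on every sampled frame of video `idx`.
+ Returns three dicts keyed by category name: base64-encoded annotated JPEG frames,
+ base64-encoded contour-only frames, and per-frame counts of valid objects."""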
49
+ encoded_frames = {}
50
+ contoured_frames = {} # New dictionary for original images
51
+ vid_cat_cnts = {}
52
+
53
+ vid_meta = metas[idx]
54
+ vid_data = train_dataset[idx]
55
+ vid_id = vid_meta['video']
56
+ frame_indx = vid_meta['sample_indx']
57
+ cat_names = set(vid_meta['obj_id_cat'].values())
58
+ imgs = vid_data[0]
59
+
60
+ for cat in cat_names:
61
+ cat_frames = []
62
+ contour_frames = []
63
+ frame_cat_cnts = {}
64
+
65
+ for i in range(imgs.size(0)):
66
+ frame_name = frame_indx[i]
67
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
68
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
69
+
70
+ frame_data = vid_data[2][frame_name]
71
+ obj_ids = list(frame_data.keys())
72
+
73
+ cat_cnt = 0
74
+
75
+ for j in range(len(obj_ids)):
76
+ obj_id = obj_ids[j]
77
+ obj_data = frame_data[obj_id]
78
+ obj_bbox = obj_data['bbox']
79
+ obj_valid = obj_data['valid']
80
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
81
+ obj_cat = obj_data['category_name']
82
+
83
+ if obj_cat == cat and obj_valid:
84
+ cat_cnt += 1
85
+
86
+ if color_mask == False:
87
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
88
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
89
+ for contour in contours:
90
+ moments = cv2.moments(contour)
91
+ if moments["m00"] != 0:
92
+ cx = int(moments["m10"] / moments["m00"])
93
+ cy = int(moments["m01"] / moments["m00"])
94
+ else:
95
+ cx, cy = contour[0][0]
96
+
97
+ font = cv2.FONT_HERSHEY_SIMPLEX
98
+ text = obj_id
99
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
100
+ text_w, text_h = text_size
101
+
102
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
103
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
104
+
105
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
106
+ font, 1, (255, 255, 255), 2)
107
+
108
+ else:
109
+ alpha = 0.08
110
+
111
+ colored_obj_mask = np.zeros_like(frame)
112
+ colored_obj_mask[obj_mask == 1] = colors[j]
113
+ frame[obj_mask == 1] = (
114
+ (1 - alpha) * frame[obj_mask == 1]
115
+ + alpha * colored_obj_mask[obj_mask == 1]
116
+ )
117
+
118
+
119
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
120
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
121
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
122
+
123
+ if len(contours) > 0:
124
+ largest_contour = max(contours, key=cv2.contourArea)
125
+ M = cv2.moments(largest_contour)
126
+ if M["m00"] != 0:
127
+ center_x = int(M["m10"] / M["m00"])
128
+ center_y = int(M["m01"] / M["m00"])
129
+ else:
130
+ center_x, center_y = 0, 0
131
+
132
+ font = cv2.FONT_HERSHEY_SIMPLEX
133
+ text = obj_id
134
+
135
+ font_scale = 0.9
136
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
137
+ text_x = center_x - text_size[0] // 1
138
+ text_y = center_y
139
+
140
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
141
+ rect_end = (text_x + text_size[0] + 5, text_y)
142
+
143
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
144
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
145
+
146
+ # plt.figure(figsize=(12, 8))
147
+ # plt.imshow(frame)
148
+ # plt.title(f"frame {frame_name}")
149
+ # plt.tight_layout()
150
+ # plt.axis('off')
151
+ # plt.show()
152
+
153
+ buffer = BytesIO()
154
+ frame = Image.fromarray(frame)
155
+ frame.save(buffer, format='jpeg')
156
+ buffer.seek(0)
157
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
158
+ frame_cat_cnts[frame_name] = cat_cnt
159
+
160
+ buffer.seek(0) # Reuse buffer instead of creating a new one
161
+ buffer.truncate()
162
+ frame_for_contour = Image.fromarray(frame_for_contour)
163
+ frame_for_contour.save(buffer, format='jpeg')
164
+ buffer.seek(0)
165
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
166
+
167
+ encoded_frames[cat] = cat_frames
168
+ contoured_frames[cat] = contour_frames
169
+ vid_cat_cnts[cat] = frame_cat_cnts
170
+
171
+ return encoded_frames, contoured_frames, vid_cat_cnts
172
+
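+ # Example (sketch, assuming a "person" category is present in video 0): decode one
+ # annotated frame back into a PIL image for visual inspection:
+ #   Image.open(BytesIO(base64.b64decode(number_objects_and_encode(0)[0]["person"][0])))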
173
+
174
+ # def number_objects_and_encode(idx, color_mask=False):
175
+ # encoded_frames = {}
176
+ # contoured_frames = {} # New dictionary for original images
177
+ # vid_cat_cnts = {}
178
+
179
+ # vid_meta = metas[idx]
180
+ # vid_data = train_dataset[idx]
181
+ # vid_id = vid_meta['video']
182
+ # frame_indx = vid_meta['sample_indx']
183
+ # cat_names = set(vid_meta['obj_id_cat'].values())
184
+ # imgs = vid_data[0]
185
+
186
+ # for cat in cat_names:
187
+ # cat_frames = []
188
+ # contour_frames = []
189
+ # frame_cat_cnts = {}
190
+
191
+ # for i in range(imgs.size(0)):
192
+ # frame_name = frame_indx[i]
193
+ # frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
194
+ # frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
195
+
196
+ # frame_data = vid_data[2][frame_name]
197
+ # obj_ids = list(frame_data.keys())
198
+
199
+ # cat_cnt = 0
200
+
201
+ # for j in range(len(obj_ids)):
202
+ # obj_id = obj_ids[j]
203
+ # obj_data = frame_data[obj_id]
204
+ # obj_bbox = obj_data['bbox']
205
+ # obj_valid = obj_data['valid']
206
+ # obj_mask = obj_data['mask'].numpy().astype(np.uint8)
207
+ # obj_cat = obj_data['category_name']
208
+
209
+ # if obj_cat == cat and obj_valid:
210
+ # cat_cnt += 1
211
+
212
+ # contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
213
+ # cv2.drawContours(frame, contours, -1, colors[j], 3)
214
+ # cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
215
+
216
+ # if len(contours) > 0:
217
+ # largest_contour = max(contours, key=cv2.contourArea)
218
+ # M = cv2.moments(largest_contour)
219
+ # if M["m00"] != 0:
220
+ # center_x = int(M["m10"] / M["m00"])
221
+ # center_y = int(M["m01"] / M["m00"])
222
+ # else:
223
+ # center_x, center_y = 0, 0
224
+
225
+ # font = cv2.FONT_HERSHEY_SIMPLEX
226
+ # text = obj_id
227
+ # font_scale = 1.2
228
+ # text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
229
+ # text_x = center_x - text_size[0] // 1
230
+ # text_y = center_y
231
+
232
+ # rect_start = (text_x - 5, text_y - text_size[1] - 5)
233
+ # rect_end = (text_x + text_size[0] + 5, text_y + 3)
234
+
235
+ # contour_thickness = 1
236
+ # rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
237
+ # rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
238
+
239
+ # cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
240
+ # cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
241
+ # cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
242
+
243
+
244
+ # if color_mask:
245
+ # alpha = 0.08
246
+ # colored_obj_mask = np.zeros_like(frame)
247
+ # colored_obj_mask[obj_mask == 1] = colors[j]
248
+ # frame[obj_mask == 1] = (
249
+ # (1 - alpha) * frame[obj_mask == 1]
250
+ # + alpha * colored_obj_mask[obj_mask == 1]
251
+ # )
252
+
253
+ # # plt.figure(figsize=(12, 8))
254
+ # # plt.imshow(frame)
255
+ # # plt.title(f"frame {frame_name}")
256
+ # # plt.tight_layout()
257
+ # # plt.axis('off')
258
+ # # plt.show()
259
+
260
+ # buffer = BytesIO()
261
+ # frame = Image.fromarray(frame)
262
+ # frame.save(buffer, format='jpeg')
263
+ # buffer.seek(0)
264
+ # cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
265
+ # frame_cat_cnts[frame_name] = cat_cnt
266
+
267
+ # buffer.seek(0) # Reuse buffer instead of creating a new one
268
+ # buffer.truncate()
269
+ # frame_for_contour = Image.fromarray(frame_for_contour)
270
+ # frame_for_contour.save(buffer, format='jpeg')
271
+ # buffer.seek(0)
272
+ # contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
273
+
274
+ # encoded_frames[cat] = cat_frames
275
+ # contoured_frames[cat] = contour_frames
276
+ # vid_cat_cnts[cat] = frame_cat_cnts
277
+
278
+
279
+ # return encoded_frames, contoured_frames, vid_cat_cnts
280
+
281
+
282
+
283
+ def getCaption(idx, model='gpt-4o', color_mask=False):
284
+ vid_meta = metas[idx]
285
+ vid_data = train_dataset[idx]
286
+ vid_id = vid_meta['video']
287
+ print(f"vid id: {vid_id}\n")
288
+
289
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
290
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
291
+ all_captions = dict()
292
+
293
+
294
+ base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
295
+ #marked = "mask with boundary" if color_mask else "boundary"
296
+
297
+ for cat_name in list(cat_names) :
298
+
299
+ is_movable = False
300
+ if cat_name in ytvos_category_valid_list :
301
+ is_movable = True
302
+
303
+ if not is_movable:
304
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
305
+
306
+
307
+ image_captions = {}
308
+ captioner = OpenAI()
309
+ cat_base64_frames = base64_frames[cat_name]
310
+ # cont_base64_frames = contoured_frames[cat_name]
311
+
312
+ for i in range(len(cat_base64_frames)):
313
+ frame_name = frame_indx[i]
314
+ # cont_base64_image = cont_base64_frames[i]
315
+ base64_image = cat_base64_frames[i]
316
+ should_filter = False
317
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
318
+
319
+ if frame_cat_cnts >= 2:
320
+ should_filter = True
321
+ else:
322
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
323
+
324
+
325
+ if is_movable and should_filter:
326
+ # Step 1: filtering
327
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
328
+ caption_filter_text = f"""
329
+ You are a visual assistant analyzing a single frame from a video.
330
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
331
+
332
+ Are the {cat_name}s in the image each performing different, clearly recognizable actions or postures?
333
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
334
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
335
+
336
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
337
+
338
+ - Respond with "YES" if:
339
+ 1) Most of the {cat_name}s exhibit clearly different, unique actions or poses.
340
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
341
+ 2) There are clearly visible differences in action and posture that an observer can identify at a glance.
342
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
343
+
344
+ - Respond with "NONE" if:
345
+ 1) The actions or poses are not clearly differentiable or are too similar.
346
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
347
+ 3) Passive or Neutral Poses: Multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion.
348
+
349
+ Answer strictly with either "YES" or "NONE".
350
+ """
351
+
352
+ response1 = captioner.chat.completions.create(
353
+ model=model,
354
+ messages=[
355
+ {
356
+ "role": "user",
357
+ "content": [
358
+ {
359
+ "type": "text",
360
+ "text": caption_filter_text,
361
+ },
362
+ {
363
+ "type": "image_url",
364
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
365
+ }
366
+ ],
367
+ }
368
+ ],
369
+ )
370
+ response_content = response1.choices[0].message.content
371
+ should_caption = "yes" in response_content.lower()
372
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
373
+
374
+ else:
375
+ should_caption = False
376
+
377
+ # Step 2: generate dense captions
378
+ dense_caption_prompt_1 = f"""
379
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects is {cat_name}.
380
+
381
+ Please describe the labeled {cat_name}s in the image in detail, focusing on their actions and interactions.
382
+
383
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
384
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
385
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
386
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
387
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
388
+ 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
389
+ - expressions like 'seems to be', 'appears to be' are BANNED!
390
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
391
+ 8. Include interactions with objects or other entities when they are prominent and observable.
392
+ 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
393
+ 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
394
+ 11. Do not mention object IDs.
395
+ 12. Use '{cat_name}' as the noun for the referring expressions.
396
+
397
+ Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
398
+
399
+ - Your answer should contain details, and follow the following format:
400
+ object id. action-oriented description
401
+ (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
402
+ 2. a person bending over and touching his boots to tie the shoelace.)
403
+ - for action-oriented description, use {cat_name} as subject noun
404
+
405
+ **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
406
+ Please pay attention to the categories of these objects and don’t change them.
407
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
408
+ Output referring expressions for each object id. Please start your answer:"""
409
+
410
+
411
+ dense_caption_prompt_2 = f"""
412
+ You are an advanced visual language model analyzing a video frame.
413
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
414
+
415
+ Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
416
+ Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
417
+
418
+ ---
419
+ ## Key Guidelines:
420
+ 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
421
+ - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
422
+ - Avoid: "moving slightly to the side" (**(X) Too vague**)
423
+
424
+ 2. **Do not describe appearance, color, or position**—focus purely on the action.
425
+ - (X) "A large brown bear standing on the left"
426
+ - (O) "The bear is lifting its front paws and swiping forward."
427
+
428
+ 3. **Use dynamic, action-specific verbs** rather than passive descriptions.
429
+ - (O) "The giraffe is tilting its head and sniffing the ground."
430
+ - (X) "The giraffe is near a tree and looking around."
431
+
432
+ 4. **Avoid assumptions, emotions, or speculative phrasing.**
433
+ - (X) "The person seems excited" / "The person might be preparing to jump."
434
+ - (O) "The person is pushing its front legs against the rock and leaping forward."
435
+
436
+ 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
437
+ - expressions like 'seems to be', 'appears to be' are BANNED!
438
+ 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
439
+
440
+ 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
441
+ - **Each object should have a unique, descriptive action.**
442
+ - (X) "Two dogs are running."
443
+ - (O) "1. One dog is chasing another, its legs stretched mid-air.
444
+ 2. The other dog is looking back while speeding up."
445
+
446
+ ---
447
+ ## Output Format:
448
+ - Each labeled **{cat_name}** should have exactly **one line of description**.
449
+ - Format: `ID. {cat_name} + action-based description`
450
+ - (O) Example:
451
+ ```
452
+ 1. The person is leaning forward while opening a bag with both hands.
453
+ 2. The person is holding onto a rope and pulling themselves up.
454
+ ```
455
+ - **Ensure that each object is described individually.**
456
+ - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
457
+
458
+ ---
459
+ ## Additional Instructions:
460
+ - **Do NOT** use expressions like "it appears that..." or "it seems like...".
461
+ - **Do NOT** mention object IDs in the description (only use the provided format).
462
+ - **DO NOT** include markdown formatting (no bullet points, no asterisks).
463
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
464
+
465
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
466
+ """
467
+
468
+
469
+ dense_caption_prompt = f"""
470
+ You are a visual assistant analyzing a single frame of a video.
471
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
472
+
473
+ I am building an **action-centric referring expression** dataset.
474
+ Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
475
+
476
+ ---
477
+ ## Guidelines:
478
+ 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
479
+ 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
480
+ 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
481
+ 4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
482
+ 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
483
+ 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
484
+ 7. Base your descriptions on these principles:
485
+ - **Avoid words like 'minimal' or 'slightly'.**
486
+ - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
487
+ - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
488
+ - **Specify actions with other objects or entities** only when they are clear and observable.
489
+ - (O) "pushing another person"
490
+ - (X) "interacting with another object"
491
+
492
+ ---
493
+ ## Output Format:
494
+ - Each labeled **{cat_name}** must have **exactly one line**.
495
+ - Format: `ID. {cat_name} + action-based description`
496
+ - (O) Example:
497
+ ```
498
+ 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
499
+ 2. The person is pulling a baby carriage while smiling.
500
+ ```
501
+ - **Ensure each object is described individually.**
502
+ - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
503
+
504
+ ---
505
+ ## Example:
506
+ If the frame has two labeled **bears**, your output should be:
507
+ ```
508
+ 1. The bear is reaching out its right paw while leaning forward to catch prey.
509
+ 2. A bear is standing upright, facing right, and touching the bike beside it.
510
+ ```
511
+
512
+ ---
513
+ ## Additional Instructions:
514
+ - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
515
+ - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
516
+ - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
517
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
518
+
519
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
520
+
521
+
522
+ MAX_RETRIES = 3
523
+ retry_count = 0
524
+
525
+ if should_caption:
526
+ while retry_count < MAX_RETRIES:
527
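+ # One of the three prompt variants is sampled at random on every attempt so the
+ # generated captions are not biased toward a single phrasing.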
+ selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2, dense_caption_prompt_1])
528
+
529
+ response2 = captioner.chat.completions.create(
530
+ model=model,
531
+ messages=[
532
+ {
533
+ "role": "user",
534
+ "content": [
535
+ {
536
+ "type": "text",
537
+ "text": selected_prompt,
538
+ },
539
+ {
540
+ "type": "image_url",
541
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
542
+ },
543
+ ],
544
+ }
545
+ ],
546
+ )
547
+
548
+ # caption = response2.choices[0].message.content
549
+ #print(f"{image_path} - {frame_name}: {caption}")
550
+
551
+ caption = response2.choices[0].message.content.strip()
552
+ caption_lower = caption.lower().lstrip()
553
+
554
+ if caption_lower.startswith("1.") and not any(
555
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
556
+ ):
557
+ break
558
+
559
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
560
+ retry_count += 1
561
+ time.sleep(2)
562
+
563
+ if retry_count == MAX_RETRIES:
564
+ caption = None
565
+ print("Max retries reached. Caption generation failed.")
566
+
567
+ else:
568
+ caption = None
569
+
570
+ image_captions[frame_name] = caption
571
+ all_captions[cat_name] = image_captions
572
+
573
+ # final : also prepare valid object ids
574
+ valid_obj_ids = dict()
575
+
576
+ for cat in cat_names:
577
+ if cat in ytvos_category_valid_list:
578
+ obj_id_cat = vid_meta['obj_id_cat']
579
+ valid_cat_ids = []
580
+ for obj_id in list(obj_id_cat.keys()):
581
+ if obj_id_cat[obj_id] == cat:
582
+ valid_cat_ids.append(obj_id)
583
+ valid_obj_ids[cat] = valid_cat_ids
584
+
585
+ return vid_id, all_captions, valid_obj_ids
586
+
587
+
588
+ if __name__ == '__main__':
589
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
590
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
591
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
592
+
593
+ args = parser.parse_args()
594
+
595
+ #================== Load data ===================
596
+ # Full dataset
597
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
598
+
599
+ # Metadata for the full dataset
600
+ metas = train_dataset.metas
601
+
602
+ # Eight candidate colors (RGB)
603
+ colors = [
604
+ (255, 0, 0), # Red
605
+ (0, 255, 0), # Green
606
+ (0, 0, 255), # Blue
607
+ (255, 255, 0), # Yellow
608
+ (255, 0, 255), # Magenta
609
+ (0, 255, 255), # Cyan
610
+ (128, 0, 128), # Purple
611
+ (255, 165, 0) # Orange
612
+ ]
613
+
614
+ ytvos_category_valid_list = [
615
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
616
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
617
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
618
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
619
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
620
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
621
+ ]
622
+
623
+ #================== Run GPT ===================
624
+ os.environ['OPENAI_API_KEY'] = '<YOUR_OPENAI_API_KEY>'  # placeholder; supply a real key via the environment, never hard-code it
625
+
626
+ result_captions = {}
627
+ result_valid_obj_ids = {}
628
+
629
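+ # NOTE: range(370) assumes this split contains at least 370 videos; adjust the bound if the dataset size differs.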
+ for i in range(370):
630
+ vid_id, all_captions, valid_obj_ids = getCaption(i, color_mask=False)
631
+
632
+ if vid_id not in result_captions:
633
+ result_captions[vid_id] = all_captions
634
+ if vid_id not in result_valid_obj_ids:
635
+ result_valid_obj_ids[vid_id] = valid_obj_ids
636
+
637
+ print("Finished!", flush=True)
638
+
639
+ with open(args.save_caption_path, "w") as file:
640
+ json.dump(result_captions, file, indent=4)
641
+
642
+ with open(args.save_valid_obj_ids_path, "w") as file:
643
+ json.dump(result_valid_obj_ids, file, indent=4)
mbench/gpt_ref-ytvos_numbered_cy_sanity_2.py ADDED
@@ -0,0 +1,676 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+ import random
9
+
10
+ from mbench.ytvos_ref import build as build_ytvos_ref
11
+ import argparse
12
+ import opts
13
+
14
+ import sys
15
+ from pathlib import Path
16
+ import os
17
+ from os import path as osp
18
+ import skimage
19
+ from io import BytesIO
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import regex as re
24
+ import json
25
+
26
+ import cv2
27
+ from PIL import Image, ImageDraw
28
+ import torch
29
+ from torchvision.transforms import functional as F
30
+
31
+ from skimage import measure # (pip install scikit-image)
32
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
33
+
34
+ import matplotlib.pyplot as plt
35
+ import matplotlib.patches as patches
36
+ from matplotlib.collections import PatchCollection
37
+ from matplotlib.patches import Rectangle
38
+ import textwrap
39
+
40
+
41
+ import ipywidgets as widgets
42
+ from IPython.display import display, clear_output
43
+
44
+ from openai import OpenAI, APIConnectionError, OpenAIError
45
+ import base64
46
+ import json
47
+ import requests
48
+
49
+ def number_objects_and_encode_old(idx, color_mask=False):
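+ """Earlier annotation routine, seemingly kept for reference; getCaption below uses
+ number_objects_and_encode instead."""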
50
+ encoded_frames = {}
51
+ contoured_frames = {} # New dictionary for original images
52
+ vid_cat_cnts = {}
53
+
54
+ vid_meta = metas[idx]
55
+ vid_data = train_dataset[idx]
56
+ vid_id = vid_meta['video']
57
+ frame_indx = vid_meta['sample_indx']
58
+ cat_names = set(vid_meta['obj_id_cat'].values())
59
+ imgs = vid_data[0]
60
+
61
+ for cat in cat_names:
62
+ cat_frames = []
63
+ contour_frames = []
64
+ frame_cat_cnts = {}
65
+
66
+ for i in range(imgs.size(0)):
67
+ frame_name = frame_indx[i]
68
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
69
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
70
+
71
+ frame_data = vid_data[2][frame_name]
72
+ obj_ids = list(frame_data.keys())
73
+
74
+ cat_cnt = 0
75
+
76
+ for j in range(len(obj_ids)):
77
+ obj_id = obj_ids[j]
78
+ obj_data = frame_data[obj_id]
79
+ obj_bbox = obj_data['bbox']
80
+ obj_valid = obj_data['valid']
81
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
82
+ obj_cat = obj_data['category_name']
83
+
84
+ if obj_cat == cat and obj_valid:
85
+ cat_cnt += 1
86
+
87
+ if color_mask == False:
88
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
89
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
90
+ for i, contour in enumerate(contours):
91
+ moments = cv2.moments(contour)
92
+ if moments["m00"] != 0:
93
+ cx = int(moments["m10"] / moments["m00"])
94
+ cy = int(moments["m01"] / moments["m00"])
95
+ else:
96
+ cx, cy = contour[0][0]
97
+
98
+ font = cv2.FONT_HERSHEY_SIMPLEX
99
+ text = obj_id
100
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
101
+ text_w, text_h = text_size
102
+
103
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
104
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
105
+
106
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
107
+ font, 1, (255, 255, 255), 2)
108
+
109
+ else:
110
+ alpha = 0.08
111
+
112
+ colored_obj_mask = np.zeros_like(frame)
113
+ colored_obj_mask[obj_mask == 1] = colors[j]
114
+ frame[obj_mask == 1] = (
115
+ (1 - alpha) * frame[obj_mask == 1]
116
+ + alpha * colored_obj_mask[obj_mask == 1]
117
+ )
118
+
119
+
120
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
121
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
122
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
123
+
124
+ if len(contours) > 0:
125
+ largest_contour = max(contours, key=cv2.contourArea)
126
+ M = cv2.moments(largest_contour)
127
+ if M["m00"] != 0:
128
+ center_x = int(M["m10"] / M["m00"])
129
+ center_y = int(M["m01"] / M["m00"])
130
+ else:
131
+ center_x, center_y = 0, 0
132
+
133
+ font = cv2.FONT_HERSHEY_SIMPLEX
134
+ text = obj_id
135
+
136
+ font_scale = 0.9
137
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
138
+ text_x = center_x - text_size[0] // 1
139
+ text_y = center_y
140
+
141
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
142
+ rect_end = (text_x + text_size[0] + 5, text_y)
143
+
144
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
145
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
146
+
147
+ # plt.figure(figsize=(12, 8))
148
+ # plt.imshow(frame)
149
+ # plt.title(f"frame {frame_name}")
150
+ # plt.tight_layout()
151
+ # plt.axis('off')
152
+ # plt.show()
153
+
154
+ buffer = BytesIO()
155
+ frame = Image.fromarray(frame)
156
+ frame.save(buffer, format='jpeg')
157
+ buffer.seek(0)
158
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
159
+ frame_cat_cnts[frame_name] = cat_cnt
160
+
161
+ buffer.seek(0) # Reuse buffer instead of creating a new one
162
+ buffer.truncate()
163
+ frame_for_contour = Image.fromarray(frame_for_contour)
164
+ frame_for_contour.save(buffer, format='jpeg')
165
+ buffer.seek(0)
166
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
167
+
168
+ encoded_frames[cat] = cat_frames
169
+ contoured_frames[cat] = contour_frames
170
+ vid_cat_cnts[cat] = frame_cat_cnts
171
+
172
+ return encoded_frames, contoured_frames, vid_cat_cnts
173
+
174
+
175
+ def number_objects_and_encode(idx, color_mask=False):
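+ """Annotate each sampled frame of video `idx` with per-object contours and numeric ID labels.
+ Unlike the _old variant above, the ID label always gets a color-outlined box, and
+ color_mask=True additionally overlays a translucent mask on each object. Returns
+ base64-encoded annotated frames, contour-only frames, and per-frame object counts,
+ each keyed by category name."""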
176
+ encoded_frames = {}
177
+ contoured_frames = {} # New dictionary for original images
178
+ vid_cat_cnts = {}
179
+
180
+ vid_meta = metas[idx]
181
+ vid_data = train_dataset[idx]
182
+ vid_id = vid_meta['video']
183
+ frame_indx = vid_meta['sample_indx']
184
+ cat_names = set(vid_meta['obj_id_cat'].values())
185
+ imgs = vid_data[0]
186
+
187
+ for cat in cat_names:
188
+ cat_frames = []
189
+ contour_frames = []
190
+ frame_cat_cnts = {}
191
+
192
+ for i in range(imgs.size(0)):
193
+ frame_name = frame_indx[i]
194
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
195
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
196
+
197
+ frame_data = vid_data[2][frame_name]
198
+ obj_ids = list(frame_data.keys())
199
+
200
+ cat_cnt = 0
201
+
202
+ for j in range(len(obj_ids)):
203
+ obj_id = obj_ids[j]
204
+ obj_data = frame_data[obj_id]
205
+ obj_bbox = obj_data['bbox']
206
+ obj_valid = obj_data['valid']
207
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
208
+ obj_cat = obj_data['category_name']
209
+
210
+ if obj_cat == cat and obj_valid:
211
+ cat_cnt += 1
212
+
213
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
214
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
215
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
216
+
217
+ if len(contours) > 0:
218
+ largest_contour = max(contours, key=cv2.contourArea)
219
+ M = cv2.moments(largest_contour)
220
+ if M["m00"] != 0:
221
+ center_x = int(M["m10"] / M["m00"])
222
+ center_y = int(M["m01"] / M["m00"])
223
+ else:
224
+ center_x, center_y = 0, 0
225
+
226
+ font = cv2.FONT_HERSHEY_SIMPLEX
227
+ text = obj_id
228
+ font_scale = 1.2
229
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
230
+ text_x = center_x - text_size[0] // 1
231
+ text_y = center_y
232
+
233
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
234
+ rect_end = (text_x + text_size[0] + 5, text_y + 3)
235
+
236
+ contour_thickness = 1
237
+ rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
238
+ rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
239
+
240
+ cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
241
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
242
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
243
+
244
+
245
+ if color_mask:
246
+ alpha = 0.08
247
+ colored_obj_mask = np.zeros_like(frame)
248
+ colored_obj_mask[obj_mask == 1] = colors[j]
249
+ frame[obj_mask == 1] = (
250
+ (1 - alpha) * frame[obj_mask == 1]
251
+ + alpha * colored_obj_mask[obj_mask == 1]
252
+ )
253
+
254
+ # plt.figure(figsize=(12, 8))
255
+ # plt.imshow(frame)
256
+ # plt.title(f"frame {frame_name}")
257
+ # plt.tight_layout()
258
+ # plt.axis('off')
259
+ # plt.show()
260
+
261
+ buffer = BytesIO()
262
+ frame = Image.fromarray(frame)
263
+ frame.save(buffer, format='jpeg')
264
+ buffer.seek(0)
265
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
266
+ frame_cat_cnts[frame_name] = cat_cnt
267
+
268
+ buffer.seek(0) # Reuse buffer instead of creating a new one
269
+ buffer.truncate()
270
+ frame_for_contour = Image.fromarray(frame_for_contour)
271
+ frame_for_contour.save(buffer, format='jpeg')
272
+ buffer.seek(0)
273
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
274
+
275
+ encoded_frames[cat] = cat_frames
276
+ contoured_frames[cat] = contour_frames
277
+ vid_cat_cnts[cat] = frame_cat_cnts
278
+
279
+ return encoded_frames, contoured_frames, vid_cat_cnts
280
+
281
+
282
+
283
+ def getCaption(idx, model='gpt-4o'):
284
+ vid_meta = metas[idx]
285
+ vid_data = train_dataset[idx]
286
+ vid_id = vid_meta['video']
287
+ print(f"vid id: {vid_id}\n")
288
+
289
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
290
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
291
+ all_captions = dict()
292
+
293
+ # color_mask = random.choice([True, False])
294
+ color_mask = random.choices([False, True], weights=[60, 40])[0]
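+ # Roughly 40% of videos are annotated with a translucent color mask in addition to the contour labels.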
295
+
296
+ base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
297
+ #marked = "mask with boundary" if color_mask else "boundary"
298
+
299
+ for cat_name in list(cat_names) :
300
+
301
+ is_movable = False
302
+ if cat_name in ytvos_category_valid_list :
303
+ is_movable = True
304
+
305
+ if not is_movable:
306
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
307
+
308
+
309
+ image_captions = {}
310
+ captioner = OpenAI()
311
+ cat_base64_frames = base64_frames[cat_name]
312
+ # cont_base64_frames = contoured_frames[cat_name]
313
+
314
+ for i in range(len(cat_base64_frames)):
315
+ frame_name = frame_indx[i]
316
+ # cont_base64_image = cont_base64_frames[i]
317
+ base64_image = cat_base64_frames[i]
318
+ should_filter = False
319
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
320
+
321
+ if frame_cat_cnts >= 2:
322
+ should_filter = True
323
+ else:
324
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
325
+
326
+
327
+ if is_movable and should_filter:
328
+ #1단계: 필터링
329
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
330
+ caption_filter_text = f"""
331
+ You are a visual assistant analyzing a single frame from a video.
332
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
333
+
334
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
335
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
336
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
337
+
338
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
339
+
340
+ - Respond with "YES" if:
341
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
342
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
343
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
344
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
345
+
346
+ - Respond with "NONE" if:
347
+ 1) The actions or pose are not clearly differentiable or too similar.
348
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
349
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
350
+
351
+ Answer strictly with either "YES" or "NONE".
352
+ """
353
+
354
+ response1 = captioner.chat.completions.create(
355
+ model=model,
356
+ messages=[
357
+ {
358
+ "role": "user",
359
+ "content": [
360
+ {
361
+ "type": "text",
362
+ "text": caption_filter_text,
363
+ },
364
+ {
365
+ "type": "image_url",
366
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
367
+ }
368
+ ],
369
+ }
370
+ ],
371
+ )
372
+ response_content = response1.choices[0].message.content
373
+ should_caption = True if "yes" in response_content.lower() else False
374
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
375
+
376
+ else:
377
+ should_caption = False
378
+
379
+ #2단계: dense caption 만들기
380
+ dense_caption_prompt_1 = f"""
381
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects is {cat_name}.
382
+
383
+ Please describe the labeled {cat_name}s in the image in detail, focusing on their actions and interactions.
384
+
385
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
386
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
387
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
388
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
389
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
390
+ 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
391
+ - expressions like 'seems to be', 'appears to be' are BANNED!
392
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
393
+ 8. Include interactions with objects or other entities when they are prominent and observable.
394
+ 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
395
+ 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
396
+ 11. Do not mention object IDs.
397
+ 12. Use '{cat_name}' as the noun for the referring expressions.
398
+
399
+ Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
400
+
401
+ - Your answer should contain details, and follow the following format:
402
+ object id. action-oriented description
403
+ (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
404
+ 2. a person bending over and touching his boots to tie the shoelace.)
405
+ - for action-oriented description, use {cat_name} as subject noun
406
+
407
+ **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
408
+ Please pay attention to the categories of these objects and don’t change them.
409
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
410
+ Output referring expressions for each object id. Please start your answer:"""
411
+
412
+
413
+ dense_caption_prompt_2 = f"""
414
+ You are an advanced visual language model analyzing a video frame.
415
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
416
+
417
+ Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
418
+ Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
419
+
420
+ ---
421
+ ## Key Guidelines:
422
+ 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
423
+ - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
424
+ - Avoid: "moving slightly to the side" (**(X) Too vague**)
425
+
426
+ 2. **Do not describe appearance, color, or position**—focus purely on the action.
427
+ - (X) "A large brown bear standing on the left"
428
+ - (O) "The bear is lifting its front paws and swiping forward."
429
+
430
+ 3. **Use dynamic, action-specific verbs** rather than passive descriptions.
431
+ - (O) "The giraffe is tilting its head and sniffing the ground."
432
+ - (X) "The giraffe is near a tree and looking around."
433
+
434
+ 4. **Avoid assumptions, emotions, or speculative phrasing.**
435
+ - (X) "The person seems excited" / "The person might be preparing to jump."
436
+ - (O) "The person is pushing its front legs against the rock and leaping forward."
437
+
438
+ 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
439
+ - expressions like 'seems to be', 'appears to be' are BANNED!
440
+ 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
441
+
442
+ 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
443
+ - **Each object should have a unique, descriptive action.**
444
+ - (X) "Two dogs are running."
445
+ - (O) "1. One dog is chasing another, its legs stretched mid-air.
446
+ 2. The other dog is looking back while speeding up."
447
+
448
+ ---
449
+ ## Output Format:
450
+ - Each labeled **{cat_name}** should have exactly **one line of description**.
451
+ - Format: `ID. {cat_name} + action-based description`
452
+ - (O) Example:
453
+ ```
454
+ 1. The person is leaning forward while opening a bag with both hands.
455
+ 2. The person is holding onto a rope and pulling themselves up.
456
+ ```
457
+ - **Ensure that each object is described individually.**
458
+ - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
459
+
460
+ ---
461
+ ## Additional Instructions:
462
+ - **Do NOT** use expressions like "it appears that..." or "it seems like...".
463
+ - **Do NOT** mention object IDs in the description (only use the provided format).
464
+ - **DO NOT** include markdown formatting (no bullet points, no asterisks).
465
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
466
+
467
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
468
+ """
469
+
470
+
471
+ dense_caption_prompt = f"""
472
+ You are a visual assistant analyzing a single frame of a video.
473
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
474
+
475
+ I am building an **action-centric referring expression** dataset.
476
+ Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
477
+
478
+ ---
479
+ ## Guidelines:
480
+ 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
481
+ 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
482
+ 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
483
+ 4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
484
+ 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
485
+ 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
486
+ 7. Base your descriptions on these principles:
487
+ - **Avoid words like 'minimal' or 'slightly'.**
488
+ - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
489
+ - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
490
+ - **Specify actions with other objects or entities** only when they are clear and observable.
491
+ - (O) "pushing another person"
492
+ - (X) "interacting with another object"
493
+
494
+ ---
495
+ ## Output Format:
496
+ - Each labeled **{cat_name}** must have **exactly one line**.
497
+ - Format: `ID. {cat_name} + action-based description`
498
+ - (O) Example:
499
+ ```
500
+ 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
501
+ 2. The person is pulling a baby carriage while smiling.
502
+ ```
503
+ - **Ensure each object is described individually.**
504
+ - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
505
+
506
+ ---
507
+ ## Example:
508
+ If the frame has two labeled **bears**, your output should be:
509
+ ```
510
+ 1. The bear is reaching out its right paw while leaning forward to catch prey.
511
+ 2. A bear is standing upright, facing right, and touching the bike beside it.
512
+ ```
513
+
514
+ ---
515
+ ## Additional Instructions:
516
+ - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
517
+ - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
518
+ - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
519
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
520
+
521
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
522
+
523
+
524
+ MAX_RETRIES = 3
525
+ retry_count = 0
526
+
527
+ if should_caption:
528
+ while retry_count < MAX_RETRIES:
529
+ selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
530
+
531
+ response2 = captioner.chat.completions.create(
532
+ model=model,
533
+ messages=[
534
+ {
535
+ "role": "user",
536
+ "content": [
537
+ {
538
+ "type": "text",
539
+ "text": selected_prompt,
540
+ },
541
+ {
542
+ "type": "image_url",
543
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
544
+ },
545
+ ],
546
+ }
547
+ ],
548
+ )
549
+
550
+ # caption = response2.choices[0].message.content
551
+ #print(f"{image_path} - {frame_name}: {caption}")
552
+
553
+ caption = response2.choices[0].message.content.strip()
554
+ caption_lower = caption.lower().lstrip()
555
+
556
+ if caption_lower.startswith("1.") and not any(
557
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
558
+ ):
559
+ break
560
+
561
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
562
+ retry_count += 1
563
+ time.sleep(2)
564
+
565
+ if retry_count == MAX_RETRIES:
566
+ caption = None
567
+ print("Max retries reached. Caption generation failed.")
568
+
569
+ else:
570
+ caption = None
571
+
572
+ image_captions[frame_name] = caption
573
+ all_captions[cat_name] = image_captions
574
+
575
+ # Finally, also prepare the valid object ids
576
+ valid_obj_ids = dict()
577
+
578
+ for cat in cat_names:
579
+ if cat in ytvos_category_valid_list:
580
+ obj_id_cat = vid_meta['obj_id_cat']
581
+ valid_cat_ids = []
582
+ for obj_id in list(obj_id_cat.keys()):
583
+ if obj_id_cat[obj_id] == cat:
584
+ valid_cat_ids.append(obj_id)
585
+ valid_obj_ids[cat] = valid_cat_ids
586
+
587
+ return vid_id, all_captions, valid_obj_ids
588
+
589
+
590
+ if __name__ == '__main__':
591
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
592
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
593
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
594
+
595
+ args = parser.parse_args()
596
+
597
+ #==================Load data===================
598
+ # Full dataset
599
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
600
+
601
+ # Metadata for the full dataset
602
+ metas = train_dataset.metas
603
+
604
+ # 8 candidate colors (RGB format)
605
+ colors = [
606
+ (255, 0, 0), # Red
607
+ (0, 255, 0), # Green
608
+ (0, 0, 255), # Blue
609
+ (255, 255, 0), # Yellow
610
+ (255, 0, 255), # Magenta
611
+ (0, 255, 255), # Cyan
612
+ (128, 0, 128), # Purple
613
+ (255, 165, 0) # Orange
614
+ ]
615
+
616
+ ytvos_category_valid_list = [
617
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
618
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
619
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
620
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
621
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
622
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
623
+ ]
624
+
625
+ #==================Run GPT===================
626
+ os.environ['OPENAI_API_KEY'] = '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'
627
+
628
+ result_captions = {}
629
+ result_valid_obj_ids = {}
630
+
631
+ for i in range(len(metas)):
632
+ try:
633
+ vid_id, all_captions, valid_obj_ids = getCaption(i)
634
+
635
+ if vid_id not in result_captions:
636
+ result_captions[vid_id] = all_captions
637
+ if vid_id not in result_valid_obj_ids:
638
+ result_valid_obj_ids[vid_id] = valid_obj_ids
639
+
640
+ except (requests.exceptions.ConnectionError, APIConnectionError) as e:
641
+ print(f"created captions up to index {i-1}", flush=True)
642
+ print("Cannot process the request due to an internet connection problem:", e, flush=True)
643
+
644
+ with open(args.save_caption_path, "w") as file:
645
+ json.dump(result_captions, file, indent=4)
646
+
647
+ with open(args.save_valid_obj_ids_path, "w") as file:
648
+ json.dump(result_valid_obj_ids, file, indent=4)
649
+
650
+ except OpenAIError as e:
651
+ print(f"created captions up to index {i-1}", flush=True)
652
+ print("An OpenAI API error occurred:", e, flush=True)
653
+
654
+ with open(args.save_caption_path, "w") as file:
655
+ json.dump(result_captions, file, indent=4)
656
+
657
+ with open(args.save_valid_obj_ids_path, "w") as file:
658
+ json.dump(result_valid_obj_ids, file, indent=4)
659
+
660
+ except Exception as e:
661
+ print(f"created captions up to index {i-1}", flush=True)
662
+ print("An unknown error occurred:", e, flush=True)
663
+
664
+ with open(args.save_caption_path, "w") as file:
665
+ json.dump(result_captions, file, indent=4)
666
+
667
+ with open(args.save_valid_obj_ids_path, "w") as file:
668
+ json.dump(result_valid_obj_ids, file, indent=4)
669
+
670
+ print("Finished!", flush=True)
671
+
672
+ with open(args.save_caption_path, "w") as file:
673
+ json.dump(result_captions, file, indent=4)
674
+
675
+ with open(args.save_valid_obj_ids_path, "w") as file:
676
+ json.dump(result_valid_obj_ids, file, indent=4)
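A caption is accepted above only when it already looks like a numbered description list rather than a refusal. A minimal standalone sketch of that acceptance check (the helper name `is_valid_caption` is illustrative and not part of the script; the phrase list mirrors the one used in the retry loop):
```
def is_valid_caption(caption: str) -> bool:
    """Return True when the model output looks like a numbered caption list
    rather than a refusal, mirroring the check in the retry loop above."""
    text = caption.strip().lower()
    refusal_phrases = ["i'm sorry", "please", "can't help"]  # same phrases as above
    return text.startswith("1.") and not any(p in text for p in refusal_phrases)

print(is_valid_caption("1. The bear is climbing a tree."))     # True
print(is_valid_caption("I'm sorry, I can't help with that."))  # False
```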
mbench/gpt_test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/make_ref-ytvos_json.py ADDED
@@ -0,0 +1,108 @@
1
+ import sys
2
+ import os
3
+ from os import path as osp
4
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
5
+
6
+ from datasets import build_dataset
7
+ import argparse
8
+ import opts
9
+
10
+
11
+ from pathlib import Path
12
+ import io
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ #==================Build the JSON===================
37
+ def createJson(train_dataset, metas):
38
+ entire_json = {}
39
+
40
+ # Initialization
41
+ vid_idx = 0
42
+
43
+ while vid_idx < len(train_dataset):
44
+
45
+ # For a single video
46
+ video_data = {}
47
+ video_train_frames, video_train_info = train_dataset[vid_idx]
48
+ video_meta = metas[vid_idx]
49
+
50
+ video_id = video_meta['video']
51
+ video_data['bins'] = video_meta['bins']
52
+ bin_nums = len(video_meta['bins'])
53
+ obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())])
54
+
55
+ annotation_data = []
56
+ frame_names = []
57
+
58
+ for i in range(bin_nums):
59
+ bin_data = {}
60
+ for j in range(obj_nums):
61
+ obj_id = str(j+1)
62
+ try:
63
+ obj_data = {
64
+ "category_name":video_meta['obj_id_cat'][obj_id],
65
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :].tolist(),
66
+ "valid":video_train_info['valid'][i*obj_nums+j].item()
67
+ }
68
+ except:
69
+ obj_data = {}
70
+ bin_data[obj_id] = obj_data
71
+ annotation_data.append(bin_data)
72
+
73
+ video_data['annotations'] = annotation_data
74
+
75
+
76
+ sample_indx = metas[vid_idx]['sample_indx']
77
+ frames = metas[vid_idx]['frames']
78
+ for i in sample_indx:
79
+ frame_name = frames[i]
80
+ frame_names.append(frame_name)
81
+
82
+ video_data['frame_names'] = frame_names
83
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
84
+ entire_json[video_id] = video_data
85
+
86
+ vid_idx += 1
87
+
88
+ return entire_json
89
+
90
+
91
+ if __name__ == '__main__':
92
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
93
+ args = parser.parse_args()
94
+
95
+ #==================Load data===================
96
+ # Full dataset
97
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
98
+
99
+ # Metadata for the full dataset
100
+ metas = train_dataset.metas
101
+
102
+ #==================Build the JSON===================
103
+ entire_json_dict = createJson(train_dataset, metas)
104
+ print(type(entire_json_dict))
105
+ entire_json = json.dumps(entire_json_dict, indent=4)
106
+
107
+ with open('mbench/sampled_frame3.json', mode='w') as file:
108
+ file.write(entire_json)
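For reference, each value that createJson stores under a video id has the shape sketched below. The keys follow the code above; the concrete values are invented for illustration and depend on the dataset.
```
# Illustrative shape of one video entry in sampled_frame3.json (values are made up).
example_entry = {
    "bins": [[0, 5], [5, 10]],             # taken from video_meta['bins']
    "annotations": [                        # one dict per bin, keyed by object id
        {
            "1": {"category_name": "person",
                  "bbox": [10.0, 20.0, 110.0, 220.0],
                  "valid": 1},
            "2": {},                        # empty when the object has no annotation in that bin
        },
    ],
    "frame_names": ["00000", "00025"],      # sampled frame names from the meta
    "video_path": "<img_folder>/JPEGImages/<video_id>",
}
```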
mbench/numbered_captions_gpt-4o_final.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_captions_gpt-4o_no_mask_color.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_captions_gpt-4o_nomask_randcap.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_captions_gpt-4o_randcap.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_valid_obj_ids.json ADDED
@@ -0,0 +1,2153 @@
1
+ {
2
+ "003234408d": {
3
+ "penguin": [
4
+ "1",
5
+ "2",
6
+ "3",
7
+ "4",
8
+ "5"
9
+ ]
10
+ },
11
+ "0043f083b5": {
12
+ "sedan": [
13
+ "2",
14
+ "3"
15
+ ],
16
+ "bus": [
17
+ "1"
18
+ ]
19
+ },
20
+ "0044fa5fba": {
21
+ "giant_panda": [
22
+ "1"
23
+ ]
24
+ },
25
+ "005a527edd": {
26
+ "ape": [
27
+ "1",
28
+ "2"
29
+ ]
30
+ },
31
+ "0065b171f9": {
32
+ "giant_panda": [
33
+ "1"
34
+ ]
35
+ },
36
+ "00917dcfc4": {
37
+ "zebra": [
38
+ "1",
39
+ "2",
40
+ "3"
41
+ ]
42
+ },
43
+ "00a23ccf53": {
44
+ "shark": [
45
+ "1"
46
+ ]
47
+ },
48
+ "00ad5016a4": {
49
+ "airplane": [
50
+ "1"
51
+ ]
52
+ },
53
+ "01082ae388": {
54
+ "leopard": [
55
+ "1"
56
+ ]
57
+ },
58
+ "011ac0a06f": {
59
+ "ape": [
60
+ "1",
61
+ "2",
62
+ "3",
63
+ "4",
64
+ "5"
65
+ ]
66
+ },
67
+ "013099c098": {
68
+ "giant_panda": [
69
+ "1",
70
+ "2"
71
+ ]
72
+ },
73
+ "0155498c85": {
74
+ "motorbike": [
75
+ "2"
76
+ ],
77
+ "person": [
78
+ "1"
79
+ ]
80
+ },
81
+ "01694ad9c8": {
82
+ "bird": [
83
+ "1"
84
+ ]
85
+ },
86
+ "017ac35701": {
87
+ "giant_panda": [
88
+ "1"
89
+ ]
90
+ },
91
+ "01b80e8e1a": {
92
+ "zebra": [
93
+ "1",
94
+ "2"
95
+ ]
96
+ },
97
+ "01baa5a4e1": {},
98
+ "01c3111683": {
99
+ "whale": [
100
+ "1"
101
+ ]
102
+ },
103
+ "01c4cb5ffe": {
104
+ "person": [
105
+ "1",
106
+ "3"
107
+ ]
108
+ },
109
+ "01c76f0a82": {
110
+ "sedan": [
111
+ "1",
112
+ "4"
113
+ ]
114
+ },
115
+ "01c783268c": {
116
+ "ape": [
117
+ "1"
118
+ ],
119
+ "person": [
120
+ "2"
121
+ ]
122
+ },
123
+ "01e64dd36a": {
124
+ "cow": [
125
+ "1",
126
+ "2",
127
+ "3"
128
+ ]
129
+ },
130
+ "01ed275c6e": {
131
+ "giraffe": [
132
+ "1",
133
+ "2"
134
+ ]
135
+ },
136
+ "01ff60d1fa": {
137
+ "lizard": [
138
+ "1"
139
+ ]
140
+ },
141
+ "020cd28cd2": {
142
+ "person": [
143
+ "1"
144
+ ]
145
+ },
146
+ "02264db755": {
147
+ "fox": [
148
+ "1"
149
+ ]
150
+ },
151
+ "0248626d9a": {
152
+ "train": [
153
+ "1"
154
+ ]
155
+ },
156
+ "02668dbffa": {
157
+ "frog": [
158
+ "1"
159
+ ]
160
+ },
161
+ "0274193026": {
162
+ "person": [
163
+ "2"
164
+ ]
165
+ },
166
+ "02d28375aa": {
167
+ "fox": [
168
+ "1"
169
+ ]
170
+ },
171
+ "031ccc99b1": {
172
+ "person": [
173
+ "1",
174
+ "2",
175
+ "3"
176
+ ]
177
+ },
178
+ "0321b18c10": {
179
+ "person": [
180
+ "1",
181
+ "2"
182
+ ],
183
+ "elephant": [
184
+ "3"
185
+ ]
186
+ },
187
+ "0348a45bca": {
188
+ "fish": [
189
+ "1",
190
+ "2",
191
+ "3",
192
+ "4",
193
+ "5"
194
+ ]
195
+ },
196
+ "0355e92655": {
197
+ "person": [
198
+ "2"
199
+ ],
200
+ "boat": [
201
+ "3"
202
+ ]
203
+ },
204
+ "0358b938c1": {
205
+ "elephant": [
206
+ "1",
207
+ "2",
208
+ "3",
209
+ "4"
210
+ ]
211
+ },
212
+ "0368107cf1": {
213
+ "person": [
214
+ "1",
215
+ "2"
216
+ ]
217
+ },
218
+ "0379ddf557": {
219
+ "person": [
220
+ "1"
221
+ ]
222
+ },
223
+ "038b2cc71d": {
224
+ "lizard": [
225
+ "1"
226
+ ]
227
+ },
228
+ "038c15a5dd": {
229
+ "hedgehog": [
230
+ "1"
231
+ ]
232
+ },
233
+ "03a06cc98a": {
234
+ "giraffe": [
235
+ "1",
236
+ "2",
237
+ "3"
238
+ ]
239
+ },
240
+ "03a63e187f": {
241
+ "lizard": [
242
+ "1"
243
+ ]
244
+ },
245
+ "03c95b4dae": {
246
+ "elephant": [
247
+ "1",
248
+ "2",
249
+ "3"
250
+ ]
251
+ },
252
+ "03e2b57b0e": {
253
+ "lizard": [
254
+ "1"
255
+ ]
256
+ },
257
+ "04194e1248": {
258
+ "lizard": [
259
+ "1"
260
+ ]
261
+ },
262
+ "04259896e2": {
263
+ "lizard": [
264
+ "1"
265
+ ]
266
+ },
267
+ "0444918a5f": {
268
+ "truck": [
269
+ "1",
270
+ "2",
271
+ "3",
272
+ "4"
273
+ ]
274
+ },
275
+ "04460a7a52": {
276
+ "lizard": [
277
+ "1"
278
+ ]
279
+ },
280
+ "04474174a4": {
281
+ "ape": [
282
+ "1",
283
+ "2"
284
+ ]
285
+ },
286
+ "0450095513": {
287
+ "snail": [
288
+ "1"
289
+ ]
290
+ },
291
+ "045f00aed2": {
292
+ "tiger": [
293
+ "1"
294
+ ],
295
+ "person": [
296
+ "3"
297
+ ]
298
+ },
299
+ "04667fabaa": {
300
+ "parrot": [
301
+ "1"
302
+ ]
303
+ },
304
+ "04735c5030": {
305
+ "cat": [
306
+ "1",
307
+ "2"
308
+ ]
309
+ },
310
+ "04990d1915": {
311
+ "truck": [
312
+ "3"
313
+ ],
314
+ "sedan": [
315
+ "1"
316
+ ],
317
+ "bus": [
318
+ "2"
319
+ ]
320
+ },
321
+ "04d62d9d98": {
322
+ "person": [
323
+ "1"
324
+ ]
325
+ },
326
+ "04f21da964": {
327
+ "monkey": [
328
+ "1"
329
+ ]
330
+ },
331
+ "04fbad476e": {
332
+ "parrot": [
333
+ "1"
334
+ ]
335
+ },
336
+ "04fe256562": {
337
+ "truck": [
338
+ "2"
339
+ ],
340
+ "motorbike": [
341
+ "1"
342
+ ]
343
+ },
344
+ "0503bf89c9": {
345
+ "hedgehog": [
346
+ "1"
347
+ ]
348
+ },
349
+ "0536c9eed0": {
350
+ "cat": [
351
+ "1"
352
+ ]
353
+ },
354
+ "054acb238f": {
355
+ "owl": [
356
+ "1"
357
+ ]
358
+ },
359
+ "05579ca250": {
360
+ "person": [
361
+ "1"
362
+ ],
363
+ "sedan": [
364
+ "3"
365
+ ]
366
+ },
367
+ "056c200404": {},
368
+ "05774f3a2c": {
369
+ "ape": [
370
+ "1",
371
+ "2",
372
+ "3"
373
+ ]
374
+ },
375
+ "058a7592c8": {
376
+ "train": [
377
+ "1"
378
+ ]
379
+ },
380
+ "05a0a513df": {
381
+ "person": [
382
+ "1",
383
+ "2"
384
+ ]
385
+ },
386
+ "05a569d8aa": {
387
+ "cat": [
388
+ "1"
389
+ ],
390
+ "mouse": [
391
+ "2"
392
+ ]
393
+ },
394
+ "05aa652648": {
395
+ "ape": [
396
+ "1"
397
+ ]
398
+ },
399
+ "05d7715782": {},
400
+ "05e0b0f28f": {
401
+ "mouse": [
402
+ "1"
403
+ ],
404
+ "person": [
405
+ "2"
406
+ ]
407
+ },
408
+ "05fdbbdd7a": {},
409
+ "05ffcfed85": {
410
+ "monkey": [
411
+ "1",
412
+ "2"
413
+ ]
414
+ },
415
+ "0630391881": {
416
+ "person": [
417
+ "1"
418
+ ]
419
+ },
420
+ "06840b2bbe": {
421
+ "snake": [
422
+ "1"
423
+ ]
424
+ },
425
+ "068f7dce6f": {
426
+ "shark": [
427
+ "1"
428
+ ]
429
+ },
430
+ "0693719753": {
431
+ "turtle": [
432
+ "1",
433
+ "2"
434
+ ]
435
+ },
436
+ "06ce2b51fb": {
437
+ "person": [
438
+ "1",
439
+ "2"
440
+ ]
441
+ },
442
+ "06e224798e": {
443
+ "tiger": [
444
+ "1"
445
+ ]
446
+ },
447
+ "06ee361788": {
448
+ "duck": [
449
+ "1",
450
+ "2",
451
+ "3"
452
+ ]
453
+ },
454
+ "06fbb3fa2c": {
455
+ "eagle": [
456
+ "1"
457
+ ]
458
+ },
459
+ "0700264286": {
460
+ "cow": [
461
+ "1",
462
+ "2"
463
+ ]
464
+ },
465
+ "070c918ca7": {
466
+ "parrot": [
467
+ "1"
468
+ ]
469
+ },
470
+ "07129e14a4": {
471
+ "parrot": [
472
+ "1",
473
+ "2"
474
+ ],
475
+ "person": [
476
+ "3"
477
+ ]
478
+ },
479
+ "07177017e9": {
480
+ "motorbike": [
481
+ "1",
482
+ "2"
483
+ ]
484
+ },
485
+ "07238ffc58": {
486
+ "monkey": [
487
+ "1",
488
+ "2",
489
+ "3"
490
+ ]
491
+ },
492
+ "07353b2a89": {
493
+ "sheep": [
494
+ "1",
495
+ "2",
496
+ "3",
497
+ "4"
498
+ ]
499
+ },
500
+ "0738493cbf": {
501
+ "airplane": [
502
+ "1"
503
+ ]
504
+ },
505
+ "075926c651": {
506
+ "person": [
507
+ "1",
508
+ "2"
509
+ ]
510
+ },
511
+ "075c701292": {
512
+ "duck": [
513
+ "1",
514
+ "2",
515
+ "3",
516
+ "4"
517
+ ]
518
+ },
519
+ "0762ea9a30": {
520
+ "person": [
521
+ "1"
522
+ ]
523
+ },
524
+ "07652ee4af": {
525
+ "person": [
526
+ "1"
527
+ ]
528
+ },
529
+ "076f206928": {
530
+ "person": [
531
+ "3"
532
+ ],
533
+ "zebra": [
534
+ "1",
535
+ "2"
536
+ ]
537
+ },
538
+ "077d32af19": {
539
+ "train": [
540
+ "4"
541
+ ],
542
+ "person": [
543
+ "1",
544
+ "2",
545
+ "3"
546
+ ]
547
+ },
548
+ "079049275c": {
549
+ "mouse": [
550
+ "1"
551
+ ]
552
+ },
553
+ "07913cdda7": {
554
+ "train": [
555
+ "1"
556
+ ],
557
+ "person": [
558
+ "2",
559
+ "3"
560
+ ]
561
+ },
562
+ "07a11a35e8": {
563
+ "ape": [
564
+ "1",
565
+ "2"
566
+ ]
567
+ },
568
+ "07ac33b6df": {
569
+ "ape": [
570
+ "1"
571
+ ]
572
+ },
573
+ "07c62c3d11": {
574
+ "parrot": [
575
+ "1",
576
+ "2",
577
+ "3"
578
+ ]
579
+ },
580
+ "07cc1c7d74": {
581
+ "snake": [
582
+ "1"
583
+ ]
584
+ },
585
+ "080196ef01": {
586
+ "lizard": [
587
+ "1"
588
+ ]
589
+ },
590
+ "081207976e": {},
591
+ "081ae4fa44": {
592
+ "shark": [
593
+ "1",
594
+ "2"
595
+ ]
596
+ },
597
+ "081d8250cb": {
598
+ "person": [
599
+ "1"
600
+ ],
601
+ "sedan": [
602
+ "3"
603
+ ]
604
+ },
605
+ "082900c5d4": {
606
+ "duck": [
607
+ "1",
608
+ "2",
609
+ "3"
610
+ ]
611
+ },
612
+ "0860df21e2": {},
613
+ "0866d4c5e3": {
614
+ "bird": [
615
+ "1",
616
+ "2",
617
+ "3"
618
+ ]
619
+ },
620
+ "0891ac2eb6": {
621
+ "person": [
622
+ "1",
623
+ "2",
624
+ "3"
625
+ ]
626
+ },
627
+ "08931bc458": {
628
+ "person": [
629
+ "1"
630
+ ]
631
+ },
632
+ "08aa2705d5": {
633
+ "snake": [
634
+ "1"
635
+ ]
636
+ },
637
+ "08c8450db7": {},
638
+ "08d50b926c": {
639
+ "turtle": [
640
+ "1",
641
+ "2"
642
+ ]
643
+ },
644
+ "08e1e4de15": {
645
+ "monkey": [
646
+ "1",
647
+ "2",
648
+ "3",
649
+ "4"
650
+ ]
651
+ },
652
+ "08e48c1a48": {
653
+ "cow": [
654
+ "1"
655
+ ]
656
+ },
657
+ "08f561c65e": {
658
+ "giant_panda": [
659
+ "1"
660
+ ],
661
+ "person": [
662
+ "2"
663
+ ]
664
+ },
665
+ "08feb87790": {
666
+ "sheep": [
667
+ "1"
668
+ ]
669
+ },
670
+ "09049f6fe3": {
671
+ "mouse": [
672
+ "1",
673
+ "2"
674
+ ]
675
+ },
676
+ "092e4ff450": {
677
+ "snake": [
678
+ "1"
679
+ ]
680
+ },
681
+ "09338adea8": {
682
+ "whale": [
683
+ "1",
684
+ "2"
685
+ ]
686
+ },
687
+ "093c335ccc": {
688
+ "person": [
689
+ "2"
690
+ ]
691
+ },
692
+ "0970d28339": {
693
+ "ape": [
694
+ "1",
695
+ "2"
696
+ ]
697
+ },
698
+ "0974a213dc": {
699
+ "giraffe": [
700
+ "1",
701
+ "2",
702
+ "3"
703
+ ]
704
+ },
705
+ "097b471ed8": {
706
+ "cat": [
707
+ "1",
708
+ "2"
709
+ ]
710
+ },
711
+ "0990941758": {
712
+ "giant_panda": [
713
+ "1"
714
+ ]
715
+ },
716
+ "09a348f4fa": {
717
+ "lizard": [
718
+ "1"
719
+ ]
720
+ },
721
+ "09a6841288": {
722
+ "duck": [
723
+ "1",
724
+ "2"
725
+ ]
726
+ },
727
+ "09c5bad17b": {
728
+ "airplane": [
729
+ "1"
730
+ ]
731
+ },
732
+ "09c9ce80c7": {
733
+ "giant_panda": [
734
+ "1"
735
+ ]
736
+ },
737
+ "09ff54fef4": {
738
+ "fox": [
739
+ "1",
740
+ "2"
741
+ ]
742
+ },
743
+ "0a23765d15": {
744
+ "person": [
745
+ "1",
746
+ "2"
747
+ ]
748
+ },
749
+ "0a275e7f12": {
750
+ "elephant": [
751
+ "1"
752
+ ]
753
+ },
754
+ "0a2f2bd294": {
755
+ "motorbike": [
756
+ "1"
757
+ ]
758
+ },
759
+ "0a7a2514aa": {
760
+ "cat": [
761
+ "1"
762
+ ],
763
+ "lizard": [
764
+ "2"
765
+ ]
766
+ },
767
+ "0a7b27fde9": {
768
+ "parrot": [
769
+ "1",
770
+ "2"
771
+ ]
772
+ },
773
+ "0a8c467cc3": {
774
+ "fish": [
775
+ "1",
776
+ "2",
777
+ "3"
778
+ ]
779
+ },
780
+ "0ac8c560ae": {
781
+ "person": [
782
+ "2",
783
+ "3"
784
+ ]
785
+ },
786
+ "0b1627e896": {
787
+ "boat": [
788
+ "1"
789
+ ]
790
+ },
791
+ "0b285c47f6": {
792
+ "mouse": [
793
+ "1"
794
+ ]
795
+ },
796
+ "0b34ec1d55": {
797
+ "ape": [
798
+ "1"
799
+ ]
800
+ },
801
+ "0b5b5e8e5a": {
802
+ "person": [
803
+ "1"
804
+ ],
805
+ "sedan": [
806
+ "2"
807
+ ]
808
+ },
809
+ "0b68535614": {
810
+ "rabbit": [
811
+ "1"
812
+ ]
813
+ },
814
+ "0b6f9105fc": {
815
+ "rabbit": [
816
+ "1"
817
+ ]
818
+ },
819
+ "0b7dbfa3cb": {
820
+ "cow": [
821
+ "1"
822
+ ]
823
+ },
824
+ "0b9cea51ca": {
825
+ "whale": [
826
+ "1"
827
+ ]
828
+ },
829
+ "0b9d012be8": {
830
+ "camel": [
831
+ "1"
832
+ ]
833
+ },
834
+ "0bcfc4177d": {
835
+ "truck": [
836
+ "1"
837
+ ]
838
+ },
839
+ "0bd37b23c1": {
840
+ "motorbike": [
841
+ "1"
842
+ ]
843
+ },
844
+ "0bd864064c": {
845
+ "eagle": [
846
+ "1"
847
+ ]
848
+ },
849
+ "0c11c6bf7b": {
850
+ "deer": [
851
+ "1"
852
+ ]
853
+ },
854
+ "0c26bc77ac": {
855
+ "crocodile": [
856
+ "1"
857
+ ]
858
+ },
859
+ "0c3a04798c": {
860
+ "fish": [
861
+ "2"
862
+ ],
863
+ "duck": [
864
+ "1"
865
+ ]
866
+ },
867
+ "0c44a9d545": {
868
+ "tiger": [
869
+ "1"
870
+ ]
871
+ },
872
+ "0c817cc390": {
873
+ "dog": [
874
+ "2"
875
+ ],
876
+ "hedgehog": [
877
+ "1"
878
+ ]
879
+ },
880
+ "0ca839ee9a": {
881
+ "ape": [
882
+ "1",
883
+ "2"
884
+ ]
885
+ },
886
+ "0cd7ac0ac0": {
887
+ "rabbit": [
888
+ "1"
889
+ ]
890
+ },
891
+ "0ce06e0121": {
892
+ "parrot": [
893
+ "1",
894
+ "2"
895
+ ]
896
+ },
897
+ "0cfe974a89": {
898
+ "turtle": [
899
+ "1",
900
+ "2"
901
+ ]
902
+ },
903
+ "0d2fcc0dcd": {
904
+ "zebra": [
905
+ "1",
906
+ "2",
907
+ "3",
908
+ "4"
909
+ ]
910
+ },
911
+ "0d3aad05d2": {
912
+ "person": [
913
+ "1"
914
+ ]
915
+ },
916
+ "0d40b015f4": {
917
+ "person": [
918
+ "1"
919
+ ]
920
+ },
921
+ "0d97fba242": {
922
+ "dog": [
923
+ "1"
924
+ ],
925
+ "person": [
926
+ "2"
927
+ ]
928
+ },
929
+ "0d9cc80d7e": {
930
+ "person": [
931
+ "1",
932
+ "2",
933
+ "3"
934
+ ]
935
+ },
936
+ "0dab85b6d3": {
937
+ "lizard": [
938
+ "1",
939
+ "2"
940
+ ]
941
+ },
942
+ "0db5c427a5": {
943
+ "train": [
944
+ "1"
945
+ ]
946
+ },
947
+ "0dbaf284f1": {
948
+ "cat": [
949
+ "1",
950
+ "2"
951
+ ]
952
+ },
953
+ "0de4923598": {},
954
+ "0df28a9101": {
955
+ "turtle": [
956
+ "1",
957
+ "2",
958
+ "3"
959
+ ]
960
+ },
961
+ "0e04f636c4": {
962
+ "frog": [
963
+ "1"
964
+ ]
965
+ },
966
+ "0e05f0e232": {
967
+ "lizard": [
968
+ "1",
969
+ "2"
970
+ ]
971
+ },
972
+ "0e0930474b": {
973
+ "sedan": [
974
+ "1"
975
+ ],
976
+ "person": [
977
+ "2",
978
+ "3"
979
+ ]
980
+ },
981
+ "0e27472bea": {
982
+ "turtle": [
983
+ "1"
984
+ ]
985
+ },
986
+ "0e30020549": {
987
+ "parrot": [
988
+ "1"
989
+ ]
990
+ },
991
+ "0e621feb6c": {
992
+ "lizard": [
993
+ "1",
994
+ "2"
995
+ ]
996
+ },
997
+ "0e803c7d73": {},
998
+ "0e9ebe4e3c": {
999
+ "truck": [
1000
+ "1"
1001
+ ]
1002
+ },
1003
+ "0e9f2785ec": {
1004
+ "person": [
1005
+ "2"
1006
+ ]
1007
+ },
1008
+ "0ea68d418b": {
1009
+ "airplane": [
1010
+ "1"
1011
+ ]
1012
+ },
1013
+ "0eb403a222": {},
1014
+ "0ee92053d6": {
1015
+ "person": [
1016
+ "1"
1017
+ ]
1018
+ },
1019
+ "0eefca067f": {
1020
+ "giant_panda": [
1021
+ "1",
1022
+ "2"
1023
+ ]
1024
+ },
1025
+ "0f17fa6fcb": {
1026
+ "duck": [
1027
+ "1",
1028
+ "2",
1029
+ "3"
1030
+ ]
1031
+ },
1032
+ "0f1ac8e9a3": {
1033
+ "frog": [
1034
+ "1"
1035
+ ]
1036
+ },
1037
+ "0f202e9852": {
1038
+ "parrot": [
1039
+ "1"
1040
+ ]
1041
+ },
1042
+ "0f2ab8b1ff": {
1043
+ "dolphin": [
1044
+ "1",
1045
+ "2",
1046
+ "3"
1047
+ ]
1048
+ },
1049
+ "0f51a78756": {
1050
+ "sheep": [
1051
+ "1"
1052
+ ]
1053
+ },
1054
+ "0f5fbe16b0": {
1055
+ "raccoon": [
1056
+ "1",
1057
+ "2"
1058
+ ]
1059
+ },
1060
+ "0f6072077b": {
1061
+ "person": [
1062
+ "1",
1063
+ "2",
1064
+ "3"
1065
+ ]
1066
+ },
1067
+ "0f6b69b2f4": {
1068
+ "rabbit": [
1069
+ "1"
1070
+ ]
1071
+ },
1072
+ "0f6c2163de": {
1073
+ "snail": [
1074
+ "1"
1075
+ ]
1076
+ },
1077
+ "0f74ec5599": {
1078
+ "giant_panda": [
1079
+ "1"
1080
+ ]
1081
+ },
1082
+ "0f9683715b": {
1083
+ "elephant": [
1084
+ "1"
1085
+ ]
1086
+ },
1087
+ "0fa7b59356": {
1088
+ "duck": [
1089
+ "1"
1090
+ ]
1091
+ },
1092
+ "0fb173695b": {
1093
+ "person": [
1094
+ "3"
1095
+ ]
1096
+ },
1097
+ "0fc958cde2": {
1098
+ "owl": [
1099
+ "1"
1100
+ ]
1101
+ },
1102
+ "0fe7b1a621": {
1103
+ "parrot": [
1104
+ "1"
1105
+ ]
1106
+ },
1107
+ "0ffcdb491c": {
1108
+ "person": [
1109
+ "1",
1110
+ "2",
1111
+ "3"
1112
+ ]
1113
+ },
1114
+ "101caff7d4": {
1115
+ "giant_panda": [
1116
+ "1",
1117
+ "2"
1118
+ ]
1119
+ },
1120
+ "1022fe8417": {
1121
+ "person": [
1122
+ "1",
1123
+ "2",
1124
+ "3"
1125
+ ]
1126
+ },
1127
+ "1032e80b37": {
1128
+ "giraffe": [
1129
+ "1"
1130
+ ]
1131
+ },
1132
+ "103f501680": {
1133
+ "fish": [
1134
+ "1"
1135
+ ]
1136
+ },
1137
+ "104e64565f": {
1138
+ "elephant": [
1139
+ "1"
1140
+ ]
1141
+ },
1142
+ "104f1ab997": {
1143
+ "person": [
1144
+ "1",
1145
+ "2",
1146
+ "3"
1147
+ ]
1148
+ },
1149
+ "106242403f": {
1150
+ "person": [
1151
+ "1",
1152
+ "2"
1153
+ ]
1154
+ },
1155
+ "10b31f5431": {
1156
+ "person": [
1157
+ "1",
1158
+ "3",
1159
+ "4"
1160
+ ]
1161
+ },
1162
+ "10eced835e": {
1163
+ "giant_panda": [
1164
+ "1",
1165
+ "2"
1166
+ ]
1167
+ },
1168
+ "110d26fa3a": {
1169
+ "shark": [
1170
+ "1"
1171
+ ]
1172
+ },
1173
+ "1122c1d16a": {
1174
+ "parrot": [
1175
+ "1",
1176
+ "2",
1177
+ "3",
1178
+ "4",
1179
+ "5"
1180
+ ],
1181
+ "person": [
1182
+ "6"
1183
+ ]
1184
+ },
1185
+ "1145b49a5f": {
1186
+ "rabbit": [
1187
+ "1"
1188
+ ]
1189
+ },
1190
+ "11485838c2": {
1191
+ "giraffe": [
1192
+ "1",
1193
+ "2",
1194
+ "3"
1195
+ ]
1196
+ },
1197
+ "114e7676ec": {
1198
+ "person": [
1199
+ "1"
1200
+ ]
1201
+ },
1202
+ "1157472b95": {
1203
+ "parrot": [
1204
+ "1",
1205
+ "2"
1206
+ ]
1207
+ },
1208
+ "115ee1072c": {
1209
+ "cow": [
1210
+ "1"
1211
+ ]
1212
+ },
1213
+ "1171141012": {
1214
+ "turtle": [
1215
+ "1"
1216
+ ],
1217
+ "person": [
1218
+ "2"
1219
+ ]
1220
+ },
1221
+ "117757b4b8": {
1222
+ "snail": [
1223
+ "1"
1224
+ ]
1225
+ },
1226
+ "1178932d2f": {
1227
+ "motorbike": [
1228
+ "3"
1229
+ ],
1230
+ "person": [
1231
+ "1",
1232
+ "2"
1233
+ ]
1234
+ },
1235
+ "117cc76bda": {
1236
+ "whale": [
1237
+ "1"
1238
+ ]
1239
+ },
1240
+ "1180cbf814": {
1241
+ "fish": [
1242
+ "1",
1243
+ "2"
1244
+ ]
1245
+ },
1246
+ "1187bbd0e3": {
1247
+ "cat": [
1248
+ "1"
1249
+ ]
1250
+ },
1251
+ "1197e44b26": {
1252
+ "giant_panda": [
1253
+ "1"
1254
+ ]
1255
+ },
1256
+ "119cf20728": {
1257
+ "lizard": [
1258
+ "1"
1259
+ ]
1260
+ },
1261
+ "119dd54871": {
1262
+ "lion": [
1263
+ "1",
1264
+ "2"
1265
+ ]
1266
+ },
1267
+ "11a0c3b724": {
1268
+ "mouse": [
1269
+ "1",
1270
+ "2"
1271
+ ]
1272
+ },
1273
+ "11a6ba8c94": {
1274
+ "person": [
1275
+ "1",
1276
+ "2"
1277
+ ]
1278
+ },
1279
+ "11c722a456": {
1280
+ "turtle": [
1281
+ "1",
1282
+ "2"
1283
+ ]
1284
+ },
1285
+ "11cbcb0b4d": {
1286
+ "zebra": [
1287
+ "1"
1288
+ ]
1289
+ },
1290
+ "11ccf5e99d": {
1291
+ "person": [
1292
+ "2"
1293
+ ]
1294
+ },
1295
+ "11ce6f452e": {
1296
+ "person": [
1297
+ "1",
1298
+ "2",
1299
+ "3"
1300
+ ]
1301
+ },
1302
+ "11feabe596": {
1303
+ "rabbit": [
1304
+ "1"
1305
+ ]
1306
+ },
1307
+ "120cb9514d": {
1308
+ "person": [
1309
+ "1",
1310
+ "2",
1311
+ "3"
1312
+ ]
1313
+ },
1314
+ "12156b25b3": {
1315
+ "person": [
1316
+ "1"
1317
+ ]
1318
+ },
1319
+ "122896672d": {
1320
+ "person": [
1321
+ "1",
1322
+ "3"
1323
+ ]
1324
+ },
1325
+ "1233ac8596": {
1326
+ "dog": [
1327
+ "1"
1328
+ ]
1329
+ },
1330
+ "1239c87234": {
1331
+ "lizard": [
1332
+ "1"
1333
+ ]
1334
+ },
1335
+ "1250423f7c": {
1336
+ "person": [
1337
+ "2"
1338
+ ],
1339
+ "elephant": [
1340
+ "3",
1341
+ "4"
1342
+ ]
1343
+ },
1344
+ "1257a1bc67": {
1345
+ "snake": [
1346
+ "1"
1347
+ ]
1348
+ },
1349
+ "125d1b19dd": {
1350
+ "giant_panda": [
1351
+ "1",
1352
+ "2"
1353
+ ]
1354
+ },
1355
+ "126d203967": {
1356
+ "person": [
1357
+ "2"
1358
+ ]
1359
+ },
1360
+ "1295e19071": {
1361
+ "airplane": [
1362
+ "1"
1363
+ ]
1364
+ },
1365
+ "12ad198c54": {
1366
+ "person": [
1367
+ "1"
1368
+ ]
1369
+ },
1370
+ "12bddb2bcb": {
1371
+ "person": [
1372
+ "2"
1373
+ ]
1374
+ },
1375
+ "12ec9b93ee": {
1376
+ "giant_panda": [
1377
+ "1"
1378
+ ]
1379
+ },
1380
+ "12eebedc35": {
1381
+ "bird": [
1382
+ "1"
1383
+ ]
1384
+ },
1385
+ "132852e094": {
1386
+ "fox": [
1387
+ "1"
1388
+ ]
1389
+ },
1390
+ "1329409f2a": {
1391
+ "fish": [
1392
+ "1"
1393
+ ]
1394
+ },
1395
+ "13325cfa14": {
1396
+ "person": [
1397
+ "2"
1398
+ ]
1399
+ },
1400
+ "1336440745": {
1401
+ "mouse": [
1402
+ "1",
1403
+ "2"
1404
+ ]
1405
+ },
1406
+ "134d06dbf9": {
1407
+ "cat": [
1408
+ "1"
1409
+ ]
1410
+ },
1411
+ "135625b53d": {
1412
+ "parrot": [
1413
+ "1"
1414
+ ]
1415
+ },
1416
+ "13870016f9": {
1417
+ "cow": [
1418
+ "2",
1419
+ "3"
1420
+ ],
1421
+ "person": [
1422
+ "1"
1423
+ ]
1424
+ },
1425
+ "13960b3c84": {
1426
+ "giraffe": [
1427
+ "1",
1428
+ "2",
1429
+ "3"
1430
+ ]
1431
+ },
1432
+ "13adaad9d9": {
1433
+ "giant_panda": [
1434
+ "1"
1435
+ ]
1436
+ },
1437
+ "13ae097e20": {
1438
+ "giant_panda": [
1439
+ "1"
1440
+ ]
1441
+ },
1442
+ "13e3070469": {
1443
+ "zebra": [
1444
+ "1",
1445
+ "2",
1446
+ "3"
1447
+ ]
1448
+ },
1449
+ "13f6a8c20d": {
1450
+ "fish": [
1451
+ "1"
1452
+ ]
1453
+ },
1454
+ "1416925cf2": {
1455
+ "truck": [
1456
+ "1",
1457
+ "2"
1458
+ ]
1459
+ },
1460
+ "142d2621f5": {
1461
+ "motorbike": [
1462
+ "3"
1463
+ ],
1464
+ "person": [
1465
+ "1",
1466
+ "2"
1467
+ ]
1468
+ },
1469
+ "145d5d7c03": {
1470
+ "giant_panda": [
1471
+ "1"
1472
+ ]
1473
+ },
1474
+ "145fdc3ac5": {
1475
+ "lizard": [
1476
+ "1"
1477
+ ]
1478
+ },
1479
+ "1471274fa7": {
1480
+ "person": [
1481
+ "1"
1482
+ ]
1483
+ },
1484
+ "14a6b5a139": {
1485
+ "fish": [
1486
+ "1"
1487
+ ]
1488
+ },
1489
+ "14c21cea0d": {
1490
+ "monkey": [
1491
+ "1",
1492
+ "2"
1493
+ ]
1494
+ },
1495
+ "14dae0dc93": {
1496
+ "person": [
1497
+ "2"
1498
+ ]
1499
+ },
1500
+ "14f9bd22b5": {
1501
+ "tiger": [
1502
+ "1"
1503
+ ]
1504
+ },
1505
+ "14fd28ae99": {
1506
+ "parrot": [
1507
+ "1"
1508
+ ]
1509
+ },
1510
+ "15097d5d4e": {
1511
+ "parrot": [
1512
+ "1"
1513
+ ]
1514
+ },
1515
+ "150ea711f2": {
1516
+ "whale": [
1517
+ "1"
1518
+ ]
1519
+ },
1520
+ "1514e3563f": {
1521
+ "earless_seal": [
1522
+ "1",
1523
+ "2"
1524
+ ]
1525
+ },
1526
+ "152aaa3a9e": {
1527
+ "raccoon": [
1528
+ "1"
1529
+ ]
1530
+ },
1531
+ "152b7d3bd7": {
1532
+ "giant_panda": [
1533
+ "1"
1534
+ ]
1535
+ },
1536
+ "15617297cc": {
1537
+ "person": [
1538
+ "1"
1539
+ ]
1540
+ },
1541
+ "15abbe0c52": {
1542
+ "person": [
1543
+ "1"
1544
+ ]
1545
+ },
1546
+ "15d1fb3de5": {
1547
+ "cat": [
1548
+ "2"
1549
+ ],
1550
+ "owl": [
1551
+ "1"
1552
+ ]
1553
+ },
1554
+ "15f67b0fab": {
1555
+ "person": [
1556
+ "1"
1557
+ ]
1558
+ },
1559
+ "161eb59aad": {
1560
+ "cow": [
1561
+ "2",
1562
+ "3"
1563
+ ],
1564
+ "giraffe": [
1565
+ "1"
1566
+ ]
1567
+ },
1568
+ "16288ea47f": {
1569
+ "duck": [
1570
+ "1",
1571
+ "2"
1572
+ ]
1573
+ },
1574
+ "164410ce62": {
1575
+ "person": [
1576
+ "1"
1577
+ ]
1578
+ },
1579
+ "165c3c8cd4": {
1580
+ "person": [
1581
+ "1",
1582
+ "2",
1583
+ "3"
1584
+ ]
1585
+ },
1586
+ "165c42b41b": {
1587
+ "motorbike": [
1588
+ "2",
1589
+ "3"
1590
+ ],
1591
+ "person": [
1592
+ "1",
1593
+ "4"
1594
+ ]
1595
+ },
1596
+ "165ec9e22b": {
1597
+ "person": [
1598
+ "1",
1599
+ "2"
1600
+ ]
1601
+ },
1602
+ "1669502269": {
1603
+ "person": [
1604
+ "1"
1605
+ ]
1606
+ },
1607
+ "16763cccbb": {
1608
+ "ape": [
1609
+ "1"
1610
+ ]
1611
+ },
1612
+ "16adde065e": {
1613
+ "cat": [
1614
+ "2"
1615
+ ],
1616
+ "person": [
1617
+ "3"
1618
+ ]
1619
+ },
1620
+ "16af445362": {
1621
+ "airplane": [
1622
+ "1"
1623
+ ]
1624
+ },
1625
+ "16afd538ad": {
1626
+ "parrot": [
1627
+ "1",
1628
+ "2"
1629
+ ]
1630
+ },
1631
+ "16c3fa4d5d": {
1632
+ "sedan": [
1633
+ "1"
1634
+ ]
1635
+ },
1636
+ "16d1d65c27": {
1637
+ "monkey": [
1638
+ "1"
1639
+ ]
1640
+ },
1641
+ "16e8599e94": {
1642
+ "giant_panda": [
1643
+ "1"
1644
+ ]
1645
+ },
1646
+ "16fe9fb444": {
1647
+ "motorbike": [
1648
+ "1"
1649
+ ],
1650
+ "person": [
1651
+ "2"
1652
+ ]
1653
+ },
1654
+ "1705796b02": {
1655
+ "train": [
1656
+ "1"
1657
+ ]
1658
+ },
1659
+ "1724db7671": {
1660
+ "giant_panda": [
1661
+ "1"
1662
+ ]
1663
+ },
1664
+ "17418e81ea": {
1665
+ "shark": [
1666
+ "1"
1667
+ ]
1668
+ },
1669
+ "175169edbb": {
1670
+ "ape": [
1671
+ "1",
1672
+ "2"
1673
+ ]
1674
+ },
1675
+ "17622326fd": {
1676
+ "lizard": [
1677
+ "1"
1678
+ ]
1679
+ },
1680
+ "17656bae77": {
1681
+ "elephant": [
1682
+ "1"
1683
+ ]
1684
+ },
1685
+ "17b0d94172": {
1686
+ "airplane": [
1687
+ "1"
1688
+ ]
1689
+ },
1690
+ "17c220e4f6": {
1691
+ "giant_panda": [
1692
+ "1"
1693
+ ]
1694
+ },
1695
+ "17c7bcd146": {
1696
+ "train": [
1697
+ "1"
1698
+ ]
1699
+ },
1700
+ "17cb4afe89": {
1701
+ "tiger": [
1702
+ "1"
1703
+ ]
1704
+ },
1705
+ "17cd79a434": {
1706
+ "squirrel": [
1707
+ "1"
1708
+ ]
1709
+ },
1710
+ "17d18604c3": {
1711
+ "person": [
1712
+ "1",
1713
+ "2"
1714
+ ]
1715
+ },
1716
+ "17d8ca1a37": {
1717
+ "person": [
1718
+ "2"
1719
+ ],
1720
+ "owl": [
1721
+ "1"
1722
+ ]
1723
+ },
1724
+ "17e33f4330": {
1725
+ "monkey": [
1726
+ "1"
1727
+ ]
1728
+ },
1729
+ "17f7a6d805": {
1730
+ "snail": [
1731
+ "1"
1732
+ ]
1733
+ },
1734
+ "180abc8378": {
1735
+ "person": [
1736
+ "2"
1737
+ ],
1738
+ "owl": [
1739
+ "1"
1740
+ ]
1741
+ },
1742
+ "183ba3d652": {
1743
+ "motorbike": [
1744
+ "3"
1745
+ ],
1746
+ "person": [
1747
+ "2"
1748
+ ]
1749
+ },
1750
+ "185bf64702": {
1751
+ "zebra": [
1752
+ "1",
1753
+ "2"
1754
+ ]
1755
+ },
1756
+ "18913cc690": {
1757
+ "train": [
1758
+ "1"
1759
+ ]
1760
+ },
1761
+ "1892651815": {
1762
+ "camel": [
1763
+ "1"
1764
+ ]
1765
+ },
1766
+ "189ac8208a": {
1767
+ "giraffe": [
1768
+ "1",
1769
+ "2"
1770
+ ]
1771
+ },
1772
+ "189b44e92c": {
1773
+ "zebra": [
1774
+ "1"
1775
+ ]
1776
+ },
1777
+ "18ac264b76": {
1778
+ "person": [
1779
+ "2"
1780
+ ]
1781
+ },
1782
+ "18b245ab49": {
1783
+ "penguin": [
1784
+ "1",
1785
+ "2",
1786
+ "3",
1787
+ "4"
1788
+ ]
1789
+ },
1790
+ "18b5cebc34": {
1791
+ "mouse": [
1792
+ "1"
1793
+ ]
1794
+ },
1795
+ "18bad52083": {
1796
+ "parrot": [
1797
+ "1",
1798
+ "2"
1799
+ ]
1800
+ },
1801
+ "18bb5144d5": {
1802
+ "lizard": [
1803
+ "1"
1804
+ ]
1805
+ },
1806
+ "18c6f205c5": {
1807
+ "person": [
1808
+ "1",
1809
+ "2",
1810
+ "3"
1811
+ ]
1812
+ },
1813
+ "1903f9ea15": {
1814
+ "bird": [
1815
+ "1",
1816
+ "2",
1817
+ "3"
1818
+ ]
1819
+ },
1820
+ "1917b209f2": {
1821
+ "horse": [
1822
+ "2"
1823
+ ],
1824
+ "cow": [
1825
+ "3",
1826
+ "4"
1827
+ ],
1828
+ "person": [
1829
+ "1"
1830
+ ]
1831
+ },
1832
+ "191e74c01d": {
1833
+ "deer": [
1834
+ "1"
1835
+ ]
1836
+ },
1837
+ "19367bb94e": {
1838
+ "fish": [
1839
+ "1",
1840
+ "2",
1841
+ "3"
1842
+ ]
1843
+ },
1844
+ "193ffaa217": {
1845
+ "person": [
1846
+ "1",
1847
+ "2",
1848
+ "3"
1849
+ ]
1850
+ },
1851
+ "19696b67d3": {
1852
+ "cow": [
1853
+ "1"
1854
+ ]
1855
+ },
1856
+ "197f3ab6f3": {
1857
+ "giant_panda": [
1858
+ "1"
1859
+ ]
1860
+ },
1861
+ "1981e763cc": {
1862
+ "sheep": [
1863
+ "1",
1864
+ "2"
1865
+ ]
1866
+ },
1867
+ "198afe39ae": {
1868
+ "person": [
1869
+ "1"
1870
+ ]
1871
+ },
1872
+ "19a6e62b9b": {
1873
+ "monkey": [
1874
+ "1",
1875
+ "2"
1876
+ ]
1877
+ },
1878
+ "19b60d5335": {
1879
+ "hedgehog": [
1880
+ "1"
1881
+ ]
1882
+ },
1883
+ "19c00c11f9": {
1884
+ "person": [
1885
+ "1"
1886
+ ]
1887
+ },
1888
+ "19e061eb88": {
1889
+ "boat": [
1890
+ "1",
1891
+ "2"
1892
+ ]
1893
+ },
1894
+ "19e8bc6178": {
1895
+ "dog": [
1896
+ "1"
1897
+ ]
1898
+ },
1899
+ "19ee80dac6": {
1900
+ "person": [
1901
+ "1",
1902
+ "3",
1903
+ "4"
1904
+ ]
1905
+ },
1906
+ "1a25a9170a": {
1907
+ "cow": [
1908
+ "1"
1909
+ ],
1910
+ "person": [
1911
+ "2",
1912
+ "3"
1913
+ ]
1914
+ },
1915
+ "1a359a6c1a": {
1916
+ "sheep": [
1917
+ "1"
1918
+ ]
1919
+ },
1920
+ "1a3e87c566": {
1921
+ "frog": [
1922
+ "1"
1923
+ ]
1924
+ },
1925
+ "1a5fe06b00": {
1926
+ "bus": [
1927
+ "1"
1928
+ ]
1929
+ },
1930
+ "1a6c0fbd1e": {
1931
+ "person": [
1932
+ "1"
1933
+ ]
1934
+ },
1935
+ "1a6f3b5a4b": {
1936
+ "sedan": [
1937
+ "3"
1938
+ ]
1939
+ },
1940
+ "1a8afbad92": {
1941
+ "zebra": [
1942
+ "1",
1943
+ "2",
1944
+ "3"
1945
+ ]
1946
+ },
1947
+ "1a8bdc5842": {
1948
+ "parrot": [
1949
+ "1",
1950
+ "2"
1951
+ ]
1952
+ },
1953
+ "1a95752aca": {
1954
+ "duck": [
1955
+ "1",
1956
+ "2"
1957
+ ]
1958
+ },
1959
+ "1a9c131cb7": {
1960
+ "ape": [
1961
+ "1",
1962
+ "2",
1963
+ "3"
1964
+ ]
1965
+ },
1966
+ "1aa3da3ee3": {
1967
+ "sheep": [
1968
+ "1",
1969
+ "2",
1970
+ "3",
1971
+ "4"
1972
+ ]
1973
+ },
1974
+ "1ab27ec7ea": {
1975
+ "deer": [
1976
+ "1"
1977
+ ]
1978
+ },
1979
+ "1abf16d21d": {
1980
+ "turtle": [
1981
+ "1"
1982
+ ]
1983
+ },
1984
+ "1acd0f993b": {
1985
+ "dog": [
1986
+ "1"
1987
+ ],
1988
+ "person": [
1989
+ "3"
1990
+ ]
1991
+ },
1992
+ "1ad202e499": {
1993
+ "lizard": [
1994
+ "1",
1995
+ "2"
1996
+ ]
1997
+ },
1998
+ "1af8d2395d": {
1999
+ "airplane": [
2000
+ "4"
2001
+ ],
2002
+ "person": [
2003
+ "1",
2004
+ "2"
2005
+ ]
2006
+ },
2007
+ "1afd39a1fa": {
2008
+ "motorbike": [
2009
+ "2"
2010
+ ]
2011
+ },
2012
+ "1b2d31306f": {
2013
+ "lizard": [
2014
+ "1"
2015
+ ]
2016
+ },
2017
+ "1b3fa67f0e": {
2018
+ "airplane": [
2019
+ "1"
2020
+ ]
2021
+ },
2022
+ "1b43fa74b4": {
2023
+ "owl": [
2024
+ "1",
2025
+ "2"
2026
+ ]
2027
+ },
2028
+ "1b73ea9fc2": {
2029
+ "parrot": [
2030
+ "1"
2031
+ ]
2032
+ },
2033
+ "1b7e8bb255": {
2034
+ "person": [
2035
+ "2"
2036
+ ]
2037
+ },
2038
+ "1b8680f8cd": {
2039
+ "person": [
2040
+ "2",
2041
+ "3"
2042
+ ]
2043
+ },
2044
+ "1b883843c0": {
2045
+ "person": [
2046
+ "1",
2047
+ "2"
2048
+ ]
2049
+ },
2050
+ "1b8898785b": {
2051
+ "monkey": [
2052
+ "1",
2053
+ "2"
2054
+ ]
2055
+ },
2056
+ "1b88ba1aa4": {
2057
+ "giant_panda": [
2058
+ "1"
2059
+ ]
2060
+ },
2061
+ "1b96a498e5": {
2062
+ "ape": [
2063
+ "1"
2064
+ ]
2065
+ },
2066
+ "1bbc4c274f": {
2067
+ "fish": [
2068
+ "2"
2069
+ ]
2070
+ },
2071
+ "1bd87fe9ab": {
2072
+ "train": [
2073
+ "1"
2074
+ ]
2075
+ },
2076
+ "1c4090c75b": {
2077
+ "whale": [
2078
+ "1"
2079
+ ]
2080
+ },
2081
+ "1c41934f84": {
2082
+ "elephant": [
2083
+ "1",
2084
+ "2"
2085
+ ]
2086
+ },
2087
+ "1c72b04b56": {
2088
+ "lion": [
2089
+ "1"
2090
+ ]
2091
+ },
2092
+ "1c87955a3a": {
2093
+ "turtle": [
2094
+ "2"
2095
+ ],
2096
+ "crocodile": [
2097
+ "1"
2098
+ ]
2099
+ },
2100
+ "1c9f9eb792": {
2101
+ "person": [
2102
+ "2"
2103
+ ]
2104
+ },
2105
+ "1ca240fede": {
2106
+ "train": [
2107
+ "1"
2108
+ ]
2109
+ },
2110
+ "1ca5673803": {
2111
+ "person": [
2112
+ "1",
2113
+ "3"
2114
+ ]
2115
+ },
2116
+ "1cada35274": {
2117
+ "duck": [
2118
+ "1"
2119
+ ]
2120
+ },
2121
+ "1cb44b920d": {
2122
+ "eagle": [
2123
+ "1",
2124
+ "2"
2125
+ ]
2126
+ },
2127
+ "1cd10e62be": {
2128
+ "leopard": [
2129
+ "1"
2130
+ ]
2131
+ },
2132
+ "1d3087d5e5": {
2133
+ "fish": [
2134
+ "1",
2135
+ "2",
2136
+ "3",
2137
+ "4",
2138
+ "5"
2139
+ ]
2140
+ },
2141
+ "1d3685150a": {
2142
+ "person": [
2143
+ "1",
2144
+ "3"
2145
+ ]
2146
+ },
2147
+ "1d6ff083aa": {
2148
+ "person": [
2149
+ "1",
2150
+ "2"
2151
+ ]
2152
+ }
2153
+ }
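The file maps each video id to the object ids that carry each annotated category. A minimal sketch of loading and querying it with the standard library (the path is the file added here; the example entry is the first one shown above):
```
import json

# Load the {video_id: {category: [object ids]}} map written by the captioning script
with open("mbench/numbered_valid_obj_ids.json") as f:
    valid_obj_ids = json.load(f)

# Object ids annotated as "penguin" in video "003234408d"
print(valid_obj_ids["003234408d"]["penguin"])   # ['1', '2', '3', '4', '5']
```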
mbench/numbered_valid_obj_ids_gpt-4o.json ADDED
@@ -0,0 +1,2153 @@
1
+ {
2
+ "003234408d": {
3
+ "penguin": [
4
+ "1",
5
+ "2",
6
+ "3",
7
+ "4",
8
+ "5"
9
+ ]
10
+ },
11
+ "0043f083b5": {
12
+ "sedan": [
13
+ "2",
14
+ "3"
15
+ ],
16
+ "bus": [
17
+ "1"
18
+ ]
19
+ },
20
+ "0044fa5fba": {
21
+ "giant_panda": [
22
+ "1"
23
+ ]
24
+ },
25
+ "005a527edd": {
26
+ "ape": [
27
+ "1",
28
+ "2"
29
+ ]
30
+ },
31
+ "0065b171f9": {
32
+ "giant_panda": [
33
+ "1"
34
+ ]
35
+ },
36
+ "00917dcfc4": {
37
+ "zebra": [
38
+ "1",
39
+ "2",
40
+ "3"
41
+ ]
42
+ },
43
+ "00a23ccf53": {
44
+ "shark": [
45
+ "1"
46
+ ]
47
+ },
48
+ "00ad5016a4": {
49
+ "airplane": [
50
+ "1"
51
+ ]
52
+ },
53
+ "01082ae388": {
54
+ "leopard": [
55
+ "1"
56
+ ]
57
+ },
58
+ "011ac0a06f": {
59
+ "ape": [
60
+ "1",
61
+ "2",
62
+ "3",
63
+ "4",
64
+ "5"
65
+ ]
66
+ },
67
+ "013099c098": {
68
+ "giant_panda": [
69
+ "1",
70
+ "2"
71
+ ]
72
+ },
73
+ "0155498c85": {
74
+ "person": [
75
+ "1"
76
+ ],
77
+ "motorbike": [
78
+ "2"
79
+ ]
80
+ },
81
+ "01694ad9c8": {
82
+ "bird": [
83
+ "1"
84
+ ]
85
+ },
86
+ "017ac35701": {
87
+ "giant_panda": [
88
+ "1"
89
+ ]
90
+ },
91
+ "01b80e8e1a": {
92
+ "zebra": [
93
+ "1",
94
+ "2"
95
+ ]
96
+ },
97
+ "01baa5a4e1": {},
98
+ "01c3111683": {
99
+ "whale": [
100
+ "1"
101
+ ]
102
+ },
103
+ "01c4cb5ffe": {
104
+ "person": [
105
+ "1",
106
+ "3"
107
+ ]
108
+ },
109
+ "01c76f0a82": {
110
+ "sedan": [
111
+ "1",
112
+ "4"
113
+ ]
114
+ },
115
+ "01c783268c": {
116
+ "person": [
117
+ "2"
118
+ ],
119
+ "ape": [
120
+ "1"
121
+ ]
122
+ },
123
+ "01e64dd36a": {
124
+ "cow": [
125
+ "1",
126
+ "2",
127
+ "3"
128
+ ]
129
+ },
130
+ "01ed275c6e": {
131
+ "giraffe": [
132
+ "1",
133
+ "2"
134
+ ]
135
+ },
136
+ "01ff60d1fa": {
137
+ "lizard": [
138
+ "1"
139
+ ]
140
+ },
141
+ "020cd28cd2": {
142
+ "person": [
143
+ "1"
144
+ ]
145
+ },
146
+ "02264db755": {
147
+ "fox": [
148
+ "1"
149
+ ]
150
+ },
151
+ "0248626d9a": {
152
+ "train": [
153
+ "1"
154
+ ]
155
+ },
156
+ "02668dbffa": {
157
+ "frog": [
158
+ "1"
159
+ ]
160
+ },
161
+ "0274193026": {
162
+ "person": [
163
+ "2"
164
+ ]
165
+ },
166
+ "02d28375aa": {
167
+ "fox": [
168
+ "1"
169
+ ]
170
+ },
171
+ "031ccc99b1": {
172
+ "person": [
173
+ "1",
174
+ "2",
175
+ "3"
176
+ ]
177
+ },
178
+ "0321b18c10": {
179
+ "elephant": [
180
+ "3"
181
+ ],
182
+ "person": [
183
+ "1",
184
+ "2"
185
+ ]
186
+ },
187
+ "0348a45bca": {
188
+ "fish": [
189
+ "1",
190
+ "2",
191
+ "3",
192
+ "4",
193
+ "5"
194
+ ]
195
+ },
196
+ "0355e92655": {
197
+ "person": [
198
+ "2"
199
+ ],
200
+ "boat": [
201
+ "3"
202
+ ]
203
+ },
204
+ "0358b938c1": {
205
+ "elephant": [
206
+ "1",
207
+ "2",
208
+ "3",
209
+ "4"
210
+ ]
211
+ },
212
+ "0368107cf1": {
213
+ "person": [
214
+ "1",
215
+ "2"
216
+ ]
217
+ },
218
+ "0379ddf557": {
219
+ "person": [
220
+ "1"
221
+ ]
222
+ },
223
+ "038b2cc71d": {
224
+ "lizard": [
225
+ "1"
226
+ ]
227
+ },
228
+ "038c15a5dd": {
229
+ "hedgehog": [
230
+ "1"
231
+ ]
232
+ },
233
+ "03a06cc98a": {
234
+ "giraffe": [
235
+ "1",
236
+ "2",
237
+ "3"
238
+ ]
239
+ },
240
+ "03a63e187f": {
241
+ "lizard": [
242
+ "1"
243
+ ]
244
+ },
245
+ "03c95b4dae": {
246
+ "elephant": [
247
+ "1",
248
+ "2",
249
+ "3"
250
+ ]
251
+ },
252
+ "03e2b57b0e": {
253
+ "lizard": [
254
+ "1"
255
+ ]
256
+ },
257
+ "04194e1248": {
258
+ "lizard": [
259
+ "1"
260
+ ]
261
+ },
262
+ "04259896e2": {
263
+ "lizard": [
264
+ "1"
265
+ ]
266
+ },
267
+ "0444918a5f": {
268
+ "truck": [
269
+ "1",
270
+ "2",
271
+ "3",
272
+ "4"
273
+ ]
274
+ },
275
+ "04460a7a52": {
276
+ "lizard": [
277
+ "1"
278
+ ]
279
+ },
280
+ "04474174a4": {
281
+ "ape": [
282
+ "1",
283
+ "2"
284
+ ]
285
+ },
286
+ "0450095513": {
287
+ "snail": [
288
+ "1"
289
+ ]
290
+ },
291
+ "045f00aed2": {
292
+ "tiger": [
293
+ "1"
294
+ ],
295
+ "person": [
296
+ "3"
297
+ ]
298
+ },
299
+ "04667fabaa": {
300
+ "parrot": [
301
+ "1"
302
+ ]
303
+ },
304
+ "04735c5030": {
305
+ "cat": [
306
+ "1",
307
+ "2"
308
+ ]
309
+ },
310
+ "04990d1915": {
311
+ "bus": [
312
+ "2"
313
+ ],
314
+ "truck": [
315
+ "3"
316
+ ],
317
+ "sedan": [
318
+ "1"
319
+ ]
320
+ },
321
+ "04d62d9d98": {
322
+ "person": [
323
+ "1"
324
+ ]
325
+ },
326
+ "04f21da964": {
327
+ "monkey": [
328
+ "1"
329
+ ]
330
+ },
331
+ "04fbad476e": {
332
+ "parrot": [
333
+ "1"
334
+ ]
335
+ },
336
+ "04fe256562": {
337
+ "truck": [
338
+ "2"
339
+ ],
340
+ "motorbike": [
341
+ "1"
342
+ ]
343
+ },
344
+ "0503bf89c9": {
345
+ "hedgehog": [
346
+ "1"
347
+ ]
348
+ },
349
+ "0536c9eed0": {
350
+ "cat": [
351
+ "1"
352
+ ]
353
+ },
354
+ "054acb238f": {
355
+ "owl": [
356
+ "1"
357
+ ]
358
+ },
359
+ "05579ca250": {
360
+ "person": [
361
+ "1"
362
+ ],
363
+ "sedan": [
364
+ "3"
365
+ ]
366
+ },
367
+ "056c200404": {},
368
+ "05774f3a2c": {
369
+ "ape": [
370
+ "1",
371
+ "2",
372
+ "3"
373
+ ]
374
+ },
375
+ "058a7592c8": {
376
+ "train": [
377
+ "1"
378
+ ]
379
+ },
380
+ "05a0a513df": {
381
+ "person": [
382
+ "1",
383
+ "2"
384
+ ]
385
+ },
386
+ "05a569d8aa": {
387
+ "mouse": [
388
+ "2"
389
+ ],
390
+ "cat": [
391
+ "1"
392
+ ]
393
+ },
394
+ "05aa652648": {
395
+ "ape": [
396
+ "1"
397
+ ]
398
+ },
399
+ "05d7715782": {},
400
+ "05e0b0f28f": {
401
+ "mouse": [
402
+ "1"
403
+ ],
404
+ "person": [
405
+ "2"
406
+ ]
407
+ },
408
+ "05fdbbdd7a": {},
409
+ "05ffcfed85": {
410
+ "monkey": [
411
+ "1",
412
+ "2"
413
+ ]
414
+ },
415
+ "0630391881": {
416
+ "person": [
417
+ "1"
418
+ ]
419
+ },
420
+ "06840b2bbe": {
421
+ "snake": [
422
+ "1"
423
+ ]
424
+ },
425
+ "068f7dce6f": {
426
+ "shark": [
427
+ "1"
428
+ ]
429
+ },
430
+ "0693719753": {
431
+ "turtle": [
432
+ "1",
433
+ "2"
434
+ ]
435
+ },
436
+ "06ce2b51fb": {
437
+ "person": [
438
+ "1",
439
+ "2"
440
+ ]
441
+ },
442
+ "06e224798e": {
443
+ "tiger": [
444
+ "1"
445
+ ]
446
+ },
447
+ "06ee361788": {
448
+ "duck": [
449
+ "1",
450
+ "2",
451
+ "3"
452
+ ]
453
+ },
454
+ "06fbb3fa2c": {
455
+ "eagle": [
456
+ "1"
457
+ ]
458
+ },
459
+ "0700264286": {
460
+ "cow": [
461
+ "1",
462
+ "2"
463
+ ]
464
+ },
465
+ "070c918ca7": {
466
+ "parrot": [
467
+ "1"
468
+ ]
469
+ },
470
+ "07129e14a4": {
471
+ "parrot": [
472
+ "1",
473
+ "2"
474
+ ],
475
+ "person": [
476
+ "3"
477
+ ]
478
+ },
479
+ "07177017e9": {
480
+ "motorbike": [
481
+ "1",
482
+ "2"
483
+ ]
484
+ },
485
+ "07238ffc58": {
486
+ "monkey": [
487
+ "1",
488
+ "2",
489
+ "3"
490
+ ]
491
+ },
492
+ "07353b2a89": {
493
+ "sheep": [
494
+ "1",
495
+ "2",
496
+ "3",
497
+ "4"
498
+ ]
499
+ },
500
+ "0738493cbf": {
501
+ "airplane": [
502
+ "1"
503
+ ]
504
+ },
505
+ "075926c651": {
506
+ "person": [
507
+ "1",
508
+ "2"
509
+ ]
510
+ },
511
+ "075c701292": {
512
+ "duck": [
513
+ "1",
514
+ "2",
515
+ "3",
516
+ "4"
517
+ ]
518
+ },
519
+ "0762ea9a30": {
520
+ "person": [
521
+ "1"
522
+ ]
523
+ },
524
+ "07652ee4af": {
525
+ "person": [
526
+ "1"
527
+ ]
528
+ },
529
+ "076f206928": {
530
+ "zebra": [
531
+ "1",
532
+ "2"
533
+ ],
534
+ "person": [
535
+ "3"
536
+ ]
537
+ },
538
+ "077d32af19": {
539
+ "train": [
540
+ "4"
541
+ ],
542
+ "person": [
543
+ "1",
544
+ "2",
545
+ "3"
546
+ ]
547
+ },
548
+ "079049275c": {
549
+ "mouse": [
550
+ "1"
551
+ ]
552
+ },
553
+ "07913cdda7": {
554
+ "train": [
555
+ "1"
556
+ ],
557
+ "person": [
558
+ "2",
559
+ "3"
560
+ ]
561
+ },
562
+ "07a11a35e8": {
563
+ "ape": [
564
+ "1",
565
+ "2"
566
+ ]
567
+ },
568
+ "07ac33b6df": {
569
+ "ape": [
570
+ "1"
571
+ ]
572
+ },
573
+ "07c62c3d11": {
574
+ "parrot": [
575
+ "1",
576
+ "2",
577
+ "3"
578
+ ]
579
+ },
580
+ "07cc1c7d74": {
581
+ "snake": [
582
+ "1"
583
+ ]
584
+ },
585
+ "080196ef01": {
586
+ "lizard": [
587
+ "1"
588
+ ]
589
+ },
590
+ "081207976e": {},
591
+ "081ae4fa44": {
592
+ "shark": [
593
+ "1",
594
+ "2"
595
+ ]
596
+ },
597
+ "081d8250cb": {
598
+ "person": [
599
+ "1"
600
+ ],
601
+ "sedan": [
602
+ "3"
603
+ ]
604
+ },
605
+ "082900c5d4": {
606
+ "duck": [
607
+ "1",
608
+ "2",
609
+ "3"
610
+ ]
611
+ },
612
+ "0860df21e2": {},
613
+ "0866d4c5e3": {
614
+ "bird": [
615
+ "1",
616
+ "2",
617
+ "3"
618
+ ]
619
+ },
620
+ "0891ac2eb6": {
621
+ "person": [
622
+ "1",
623
+ "2",
624
+ "3"
625
+ ]
626
+ },
627
+ "08931bc458": {
628
+ "person": [
629
+ "1"
630
+ ]
631
+ },
632
+ "08aa2705d5": {
633
+ "snake": [
634
+ "1"
635
+ ]
636
+ },
637
+ "08c8450db7": {},
638
+ "08d50b926c": {
639
+ "turtle": [
640
+ "1",
641
+ "2"
642
+ ]
643
+ },
644
+ "08e1e4de15": {
645
+ "monkey": [
646
+ "1",
647
+ "2",
648
+ "3",
649
+ "4"
650
+ ]
651
+ },
652
+ "08e48c1a48": {
653
+ "cow": [
654
+ "1"
655
+ ]
656
+ },
657
+ "08f561c65e": {
658
+ "giant_panda": [
659
+ "1"
660
+ ],
661
+ "person": [
662
+ "2"
663
+ ]
664
+ },
665
+ "08feb87790": {
666
+ "sheep": [
667
+ "1"
668
+ ]
669
+ },
670
+ "09049f6fe3": {
671
+ "mouse": [
672
+ "1",
673
+ "2"
674
+ ]
675
+ },
676
+ "092e4ff450": {
677
+ "snake": [
678
+ "1"
679
+ ]
680
+ },
681
+ "09338adea8": {
682
+ "whale": [
683
+ "1",
684
+ "2"
685
+ ]
686
+ },
687
+ "093c335ccc": {
688
+ "person": [
689
+ "2"
690
+ ]
691
+ },
692
+ "0970d28339": {
693
+ "ape": [
694
+ "1",
695
+ "2"
696
+ ]
697
+ },
698
+ "0974a213dc": {
699
+ "giraffe": [
700
+ "1",
701
+ "2",
702
+ "3"
703
+ ]
704
+ },
705
+ "097b471ed8": {
706
+ "cat": [
707
+ "1",
708
+ "2"
709
+ ]
710
+ },
711
+ "0990941758": {
712
+ "giant_panda": [
713
+ "1"
714
+ ]
715
+ },
716
+ "09a348f4fa": {
717
+ "lizard": [
718
+ "1"
719
+ ]
720
+ },
721
+ "09a6841288": {
722
+ "duck": [
723
+ "1",
724
+ "2"
725
+ ]
726
+ },
727
+ "09c5bad17b": {
728
+ "airplane": [
729
+ "1"
730
+ ]
731
+ },
732
+ "09c9ce80c7": {
733
+ "giant_panda": [
734
+ "1"
735
+ ]
736
+ },
737
+ "09ff54fef4": {
738
+ "fox": [
739
+ "1",
740
+ "2"
741
+ ]
742
+ },
743
+ "0a23765d15": {
744
+ "person": [
745
+ "1",
746
+ "2"
747
+ ]
748
+ },
749
+ "0a275e7f12": {
750
+ "elephant": [
751
+ "1"
752
+ ]
753
+ },
754
+ "0a2f2bd294": {
755
+ "motorbike": [
756
+ "1"
757
+ ]
758
+ },
759
+ "0a7a2514aa": {
760
+ "lizard": [
761
+ "2"
762
+ ],
763
+ "cat": [
764
+ "1"
765
+ ]
766
+ },
767
+ "0a7b27fde9": {
768
+ "parrot": [
769
+ "1",
770
+ "2"
771
+ ]
772
+ },
773
+ "0a8c467cc3": {
774
+ "fish": [
775
+ "1",
776
+ "2",
777
+ "3"
778
+ ]
779
+ },
780
+ "0ac8c560ae": {
781
+ "person": [
782
+ "2",
783
+ "3"
784
+ ]
785
+ },
786
+ "0b1627e896": {
787
+ "boat": [
788
+ "1"
789
+ ]
790
+ },
791
+ "0b285c47f6": {
792
+ "mouse": [
793
+ "1"
794
+ ]
795
+ },
796
+ "0b34ec1d55": {
797
+ "ape": [
798
+ "1"
799
+ ]
800
+ },
801
+ "0b5b5e8e5a": {
802
+ "person": [
803
+ "1"
804
+ ],
805
+ "sedan": [
806
+ "2"
807
+ ]
808
+ },
809
+ "0b68535614": {
810
+ "rabbit": [
811
+ "1"
812
+ ]
813
+ },
814
+ "0b6f9105fc": {
815
+ "rabbit": [
816
+ "1"
817
+ ]
818
+ },
819
+ "0b7dbfa3cb": {
820
+ "cow": [
821
+ "1"
822
+ ]
823
+ },
824
+ "0b9cea51ca": {
825
+ "whale": [
826
+ "1"
827
+ ]
828
+ },
829
+ "0b9d012be8": {
830
+ "camel": [
831
+ "1"
832
+ ]
833
+ },
834
+ "0bcfc4177d": {
835
+ "truck": [
836
+ "1"
837
+ ]
838
+ },
839
+ "0bd37b23c1": {
840
+ "motorbike": [
841
+ "1"
842
+ ]
843
+ },
844
+ "0bd864064c": {
845
+ "eagle": [
846
+ "1"
847
+ ]
848
+ },
849
+ "0c11c6bf7b": {
850
+ "deer": [
851
+ "1"
852
+ ]
853
+ },
854
+ "0c26bc77ac": {
855
+ "crocodile": [
856
+ "1"
857
+ ]
858
+ },
859
+ "0c3a04798c": {
860
+ "duck": [
861
+ "1"
862
+ ],
863
+ "fish": [
864
+ "2"
865
+ ]
866
+ },
867
+ "0c44a9d545": {
868
+ "tiger": [
869
+ "1"
870
+ ]
871
+ },
872
+ "0c817cc390": {
873
+ "hedgehog": [
874
+ "1"
875
+ ],
876
+ "dog": [
877
+ "2"
878
+ ]
879
+ },
880
+ "0ca839ee9a": {
881
+ "ape": [
882
+ "1",
883
+ "2"
884
+ ]
885
+ },
886
+ "0cd7ac0ac0": {
887
+ "rabbit": [
888
+ "1"
889
+ ]
890
+ },
891
+ "0ce06e0121": {
892
+ "parrot": [
893
+ "1",
894
+ "2"
895
+ ]
896
+ },
897
+ "0cfe974a89": {
898
+ "turtle": [
899
+ "1",
900
+ "2"
901
+ ]
902
+ },
903
+ "0d2fcc0dcd": {
904
+ "zebra": [
905
+ "1",
906
+ "2",
907
+ "3",
908
+ "4"
909
+ ]
910
+ },
911
+ "0d3aad05d2": {
912
+ "person": [
913
+ "1"
914
+ ]
915
+ },
916
+ "0d40b015f4": {
917
+ "person": [
918
+ "1"
919
+ ]
920
+ },
921
+ "0d97fba242": {
922
+ "person": [
923
+ "2"
924
+ ],
925
+ "dog": [
926
+ "1"
927
+ ]
928
+ },
929
+ "0d9cc80d7e": {
930
+ "person": [
931
+ "1",
932
+ "2",
933
+ "3"
934
+ ]
935
+ },
936
+ "0dab85b6d3": {
937
+ "lizard": [
938
+ "1",
939
+ "2"
940
+ ]
941
+ },
942
+ "0db5c427a5": {
943
+ "train": [
944
+ "1"
945
+ ]
946
+ },
947
+ "0dbaf284f1": {
948
+ "cat": [
949
+ "1",
950
+ "2"
951
+ ]
952
+ },
953
+ "0de4923598": {},
954
+ "0df28a9101": {
955
+ "turtle": [
956
+ "1",
957
+ "2",
958
+ "3"
959
+ ]
960
+ },
961
+ "0e04f636c4": {
962
+ "frog": [
963
+ "1"
964
+ ]
965
+ },
966
+ "0e05f0e232": {
967
+ "lizard": [
968
+ "1",
969
+ "2"
970
+ ]
971
+ },
972
+ "0e0930474b": {
973
+ "person": [
974
+ "2",
975
+ "3"
976
+ ],
977
+ "sedan": [
978
+ "1"
979
+ ]
980
+ },
981
+ "0e27472bea": {
982
+ "turtle": [
983
+ "1"
984
+ ]
985
+ },
986
+ "0e30020549": {
987
+ "parrot": [
988
+ "1"
989
+ ]
990
+ },
991
+ "0e621feb6c": {
992
+ "lizard": [
993
+ "1",
994
+ "2"
995
+ ]
996
+ },
997
+ "0e803c7d73": {},
998
+ "0e9ebe4e3c": {
999
+ "truck": [
1000
+ "1"
1001
+ ]
1002
+ },
1003
+ "0e9f2785ec": {
1004
+ "person": [
1005
+ "2"
1006
+ ]
1007
+ },
1008
+ "0ea68d418b": {
1009
+ "airplane": [
1010
+ "1"
1011
+ ]
1012
+ },
1013
+ "0eb403a222": {},
1014
+ "0ee92053d6": {
1015
+ "person": [
1016
+ "1"
1017
+ ]
1018
+ },
1019
+ "0eefca067f": {
1020
+ "giant_panda": [
1021
+ "1",
1022
+ "2"
1023
+ ]
1024
+ },
1025
+ "0f17fa6fcb": {
1026
+ "duck": [
1027
+ "1",
1028
+ "2",
1029
+ "3"
1030
+ ]
1031
+ },
1032
+ "0f1ac8e9a3": {
1033
+ "frog": [
1034
+ "1"
1035
+ ]
1036
+ },
1037
+ "0f202e9852": {
1038
+ "parrot": [
1039
+ "1"
1040
+ ]
1041
+ },
1042
+ "0f2ab8b1ff": {
1043
+ "dolphin": [
1044
+ "1",
1045
+ "2",
1046
+ "3"
1047
+ ]
1048
+ },
1049
+ "0f51a78756": {
1050
+ "sheep": [
1051
+ "1"
1052
+ ]
1053
+ },
1054
+ "0f5fbe16b0": {
1055
+ "raccoon": [
1056
+ "1",
1057
+ "2"
1058
+ ]
1059
+ },
1060
+ "0f6072077b": {
1061
+ "person": [
1062
+ "1",
1063
+ "2",
1064
+ "3"
1065
+ ]
1066
+ },
1067
+ "0f6b69b2f4": {
1068
+ "rabbit": [
1069
+ "1"
1070
+ ]
1071
+ },
1072
+ "0f6c2163de": {
1073
+ "snail": [
1074
+ "1"
1075
+ ]
1076
+ },
1077
+ "0f74ec5599": {
1078
+ "giant_panda": [
1079
+ "1"
1080
+ ]
1081
+ },
1082
+ "0f9683715b": {
1083
+ "elephant": [
1084
+ "1"
1085
+ ]
1086
+ },
1087
+ "0fa7b59356": {
1088
+ "duck": [
1089
+ "1"
1090
+ ]
1091
+ },
1092
+ "0fb173695b": {
1093
+ "person": [
1094
+ "3"
1095
+ ]
1096
+ },
1097
+ "0fc958cde2": {
1098
+ "owl": [
1099
+ "1"
1100
+ ]
1101
+ },
1102
+ "0fe7b1a621": {
1103
+ "parrot": [
1104
+ "1"
1105
+ ]
1106
+ },
1107
+ "0ffcdb491c": {
1108
+ "person": [
1109
+ "1",
1110
+ "2",
1111
+ "3"
1112
+ ]
1113
+ },
1114
+ "101caff7d4": {
1115
+ "giant_panda": [
1116
+ "1",
1117
+ "2"
1118
+ ]
1119
+ },
1120
+ "1022fe8417": {
1121
+ "person": [
1122
+ "1",
1123
+ "2",
1124
+ "3"
1125
+ ]
1126
+ },
1127
+ "1032e80b37": {
1128
+ "giraffe": [
1129
+ "1"
1130
+ ]
1131
+ },
1132
+ "103f501680": {
1133
+ "fish": [
1134
+ "1"
1135
+ ]
1136
+ },
1137
+ "104e64565f": {
1138
+ "elephant": [
1139
+ "1"
1140
+ ]
1141
+ },
1142
+ "104f1ab997": {
1143
+ "person": [
1144
+ "1",
1145
+ "2",
1146
+ "3"
1147
+ ]
1148
+ },
1149
+ "106242403f": {
1150
+ "person": [
1151
+ "1",
1152
+ "2"
1153
+ ]
1154
+ },
1155
+ "10b31f5431": {
1156
+ "person": [
1157
+ "1",
1158
+ "3",
1159
+ "4"
1160
+ ]
1161
+ },
1162
+ "10eced835e": {
1163
+ "giant_panda": [
1164
+ "1",
1165
+ "2"
1166
+ ]
1167
+ },
1168
+ "110d26fa3a": {
1169
+ "shark": [
1170
+ "1"
1171
+ ]
1172
+ },
1173
+ "1122c1d16a": {
1174
+ "parrot": [
1175
+ "1",
1176
+ "2",
1177
+ "3",
1178
+ "4",
1179
+ "5"
1180
+ ],
1181
+ "person": [
1182
+ "6"
1183
+ ]
1184
+ },
1185
+ "1145b49a5f": {
1186
+ "rabbit": [
1187
+ "1"
1188
+ ]
1189
+ },
1190
+ "11485838c2": {
1191
+ "giraffe": [
1192
+ "1",
1193
+ "2",
1194
+ "3"
1195
+ ]
1196
+ },
1197
+ "114e7676ec": {
1198
+ "person": [
1199
+ "1"
1200
+ ]
1201
+ },
1202
+ "1157472b95": {
1203
+ "parrot": [
1204
+ "1",
1205
+ "2"
1206
+ ]
1207
+ },
1208
+ "115ee1072c": {
1209
+ "cow": [
1210
+ "1"
1211
+ ]
1212
+ },
1213
+ "1171141012": {
1214
+ "person": [
1215
+ "2"
1216
+ ],
1217
+ "turtle": [
1218
+ "1"
1219
+ ]
1220
+ },
1221
+ "117757b4b8": {
1222
+ "snail": [
1223
+ "1"
1224
+ ]
1225
+ },
1226
+ "1178932d2f": {
1227
+ "person": [
1228
+ "1",
1229
+ "2"
1230
+ ],
1231
+ "motorbike": [
1232
+ "3"
1233
+ ]
1234
+ },
1235
+ "117cc76bda": {
1236
+ "whale": [
1237
+ "1"
1238
+ ]
1239
+ },
1240
+ "1180cbf814": {
1241
+ "fish": [
1242
+ "1",
1243
+ "2"
1244
+ ]
1245
+ },
1246
+ "1187bbd0e3": {
1247
+ "cat": [
1248
+ "1"
1249
+ ]
1250
+ },
1251
+ "1197e44b26": {
1252
+ "giant_panda": [
1253
+ "1"
1254
+ ]
1255
+ },
1256
+ "119cf20728": {
1257
+ "lizard": [
1258
+ "1"
1259
+ ]
1260
+ },
1261
+ "119dd54871": {
1262
+ "lion": [
1263
+ "1",
1264
+ "2"
1265
+ ]
1266
+ },
1267
+ "11a0c3b724": {
1268
+ "mouse": [
1269
+ "1",
1270
+ "2"
1271
+ ]
1272
+ },
1273
+ "11a6ba8c94": {
1274
+ "person": [
1275
+ "1",
1276
+ "2"
1277
+ ]
1278
+ },
1279
+ "11c722a456": {
1280
+ "turtle": [
1281
+ "1",
1282
+ "2"
1283
+ ]
1284
+ },
1285
+ "11cbcb0b4d": {
1286
+ "zebra": [
1287
+ "1"
1288
+ ]
1289
+ },
1290
+ "11ccf5e99d": {
1291
+ "person": [
1292
+ "2"
1293
+ ]
1294
+ },
1295
+ "11ce6f452e": {
1296
+ "person": [
1297
+ "1",
1298
+ "2",
1299
+ "3"
1300
+ ]
1301
+ },
1302
+ "11feabe596": {
1303
+ "rabbit": [
1304
+ "1"
1305
+ ]
1306
+ },
1307
+ "120cb9514d": {
1308
+ "person": [
1309
+ "1",
1310
+ "2",
1311
+ "3"
1312
+ ]
1313
+ },
1314
+ "12156b25b3": {
1315
+ "person": [
1316
+ "1"
1317
+ ]
1318
+ },
1319
+ "122896672d": {
1320
+ "person": [
1321
+ "1",
1322
+ "3"
1323
+ ]
1324
+ },
1325
+ "1233ac8596": {
1326
+ "dog": [
1327
+ "1"
1328
+ ]
1329
+ },
1330
+ "1239c87234": {
1331
+ "lizard": [
1332
+ "1"
1333
+ ]
1334
+ },
1335
+ "1250423f7c": {
1336
+ "elephant": [
1337
+ "3",
1338
+ "4"
1339
+ ],
1340
+ "person": [
1341
+ "2"
1342
+ ]
1343
+ },
1344
+ "1257a1bc67": {
1345
+ "snake": [
1346
+ "1"
1347
+ ]
1348
+ },
1349
+ "125d1b19dd": {
1350
+ "giant_panda": [
1351
+ "1",
1352
+ "2"
1353
+ ]
1354
+ },
1355
+ "126d203967": {
1356
+ "person": [
1357
+ "2"
1358
+ ]
1359
+ },
1360
+ "1295e19071": {
1361
+ "airplane": [
1362
+ "1"
1363
+ ]
1364
+ },
1365
+ "12ad198c54": {
1366
+ "person": [
1367
+ "1"
1368
+ ]
1369
+ },
1370
+ "12bddb2bcb": {
1371
+ "person": [
1372
+ "2"
1373
+ ]
1374
+ },
1375
+ "12ec9b93ee": {
1376
+ "giant_panda": [
1377
+ "1"
1378
+ ]
1379
+ },
1380
+ "12eebedc35": {
1381
+ "bird": [
1382
+ "1"
1383
+ ]
1384
+ },
1385
+ "132852e094": {
1386
+ "fox": [
1387
+ "1"
1388
+ ]
1389
+ },
1390
+ "1329409f2a": {
1391
+ "fish": [
1392
+ "1"
1393
+ ]
1394
+ },
1395
+ "13325cfa14": {
1396
+ "person": [
1397
+ "2"
1398
+ ]
1399
+ },
1400
+ "1336440745": {
1401
+ "mouse": [
1402
+ "1",
1403
+ "2"
1404
+ ]
1405
+ },
1406
+ "134d06dbf9": {
1407
+ "cat": [
1408
+ "1"
1409
+ ]
1410
+ },
1411
+ "135625b53d": {
1412
+ "parrot": [
1413
+ "1"
1414
+ ]
1415
+ },
1416
+ "13870016f9": {
1417
+ "person": [
1418
+ "1"
1419
+ ],
1420
+ "cow": [
1421
+ "2",
1422
+ "3"
1423
+ ]
1424
+ },
1425
+ "13960b3c84": {
1426
+ "giraffe": [
1427
+ "1",
1428
+ "2",
1429
+ "3"
1430
+ ]
1431
+ },
1432
+ "13adaad9d9": {
1433
+ "giant_panda": [
1434
+ "1"
1435
+ ]
1436
+ },
1437
+ "13ae097e20": {
1438
+ "giant_panda": [
1439
+ "1"
1440
+ ]
1441
+ },
1442
+ "13e3070469": {
1443
+ "zebra": [
1444
+ "1",
1445
+ "2",
1446
+ "3"
1447
+ ]
1448
+ },
1449
+ "13f6a8c20d": {
1450
+ "fish": [
1451
+ "1"
1452
+ ]
1453
+ },
1454
+ "1416925cf2": {
1455
+ "truck": [
1456
+ "1",
1457
+ "2"
1458
+ ]
1459
+ },
1460
+ "142d2621f5": {
1461
+ "person": [
1462
+ "1",
1463
+ "2"
1464
+ ],
1465
+ "motorbike": [
1466
+ "3"
1467
+ ]
1468
+ },
1469
+ "145d5d7c03": {
1470
+ "giant_panda": [
1471
+ "1"
1472
+ ]
1473
+ },
1474
+ "145fdc3ac5": {
1475
+ "lizard": [
1476
+ "1"
1477
+ ]
1478
+ },
1479
+ "1471274fa7": {
1480
+ "person": [
1481
+ "1"
1482
+ ]
1483
+ },
1484
+ "14a6b5a139": {
1485
+ "fish": [
1486
+ "1"
1487
+ ]
1488
+ },
1489
+ "14c21cea0d": {
1490
+ "monkey": [
1491
+ "1",
1492
+ "2"
1493
+ ]
1494
+ },
1495
+ "14dae0dc93": {
1496
+ "person": [
1497
+ "2"
1498
+ ]
1499
+ },
1500
+ "14f9bd22b5": {
1501
+ "tiger": [
1502
+ "1"
1503
+ ]
1504
+ },
1505
+ "14fd28ae99": {
1506
+ "parrot": [
1507
+ "1"
1508
+ ]
1509
+ },
1510
+ "15097d5d4e": {
1511
+ "parrot": [
1512
+ "1"
1513
+ ]
1514
+ },
1515
+ "150ea711f2": {
1516
+ "whale": [
1517
+ "1"
1518
+ ]
1519
+ },
1520
+ "1514e3563f": {
1521
+ "earless_seal": [
1522
+ "1",
1523
+ "2"
1524
+ ]
1525
+ },
1526
+ "152aaa3a9e": {
1527
+ "raccoon": [
1528
+ "1"
1529
+ ]
1530
+ },
1531
+ "152b7d3bd7": {
1532
+ "giant_panda": [
1533
+ "1"
1534
+ ]
1535
+ },
1536
+ "15617297cc": {
1537
+ "person": [
1538
+ "1"
1539
+ ]
1540
+ },
1541
+ "15abbe0c52": {
1542
+ "person": [
1543
+ "1"
1544
+ ]
1545
+ },
1546
+ "15d1fb3de5": {
1547
+ "owl": [
1548
+ "1"
1549
+ ],
1550
+ "cat": [
1551
+ "2"
1552
+ ]
1553
+ },
1554
+ "15f67b0fab": {
1555
+ "person": [
1556
+ "1"
1557
+ ]
1558
+ },
1559
+ "161eb59aad": {
1560
+ "cow": [
1561
+ "2",
1562
+ "3"
1563
+ ],
1564
+ "giraffe": [
1565
+ "1"
1566
+ ]
1567
+ },
1568
+ "16288ea47f": {
1569
+ "duck": [
1570
+ "1",
1571
+ "2"
1572
+ ]
1573
+ },
1574
+ "164410ce62": {
1575
+ "person": [
1576
+ "1"
1577
+ ]
1578
+ },
1579
+ "165c3c8cd4": {
1580
+ "person": [
1581
+ "1",
1582
+ "2",
1583
+ "3"
1584
+ ]
1585
+ },
1586
+ "165c42b41b": {
1587
+ "person": [
1588
+ "1",
1589
+ "4"
1590
+ ],
1591
+ "motorbike": [
1592
+ "2",
1593
+ "3"
1594
+ ]
1595
+ },
1596
+ "165ec9e22b": {
1597
+ "person": [
1598
+ "1",
1599
+ "2"
1600
+ ]
1601
+ },
1602
+ "1669502269": {
1603
+ "person": [
1604
+ "1"
1605
+ ]
1606
+ },
1607
+ "16763cccbb": {
1608
+ "ape": [
1609
+ "1"
1610
+ ]
1611
+ },
1612
+ "16adde065e": {
1613
+ "person": [
1614
+ "3"
1615
+ ],
1616
+ "cat": [
1617
+ "2"
1618
+ ]
1619
+ },
1620
+ "16af445362": {
1621
+ "airplane": [
1622
+ "1"
1623
+ ]
1624
+ },
1625
+ "16afd538ad": {
1626
+ "parrot": [
1627
+ "1",
1628
+ "2"
1629
+ ]
1630
+ },
1631
+ "16c3fa4d5d": {
1632
+ "sedan": [
1633
+ "1"
1634
+ ]
1635
+ },
1636
+ "16d1d65c27": {
1637
+ "monkey": [
1638
+ "1"
1639
+ ]
1640
+ },
1641
+ "16e8599e94": {
1642
+ "giant_panda": [
1643
+ "1"
1644
+ ]
1645
+ },
1646
+ "16fe9fb444": {
1647
+ "person": [
1648
+ "2"
1649
+ ],
1650
+ "motorbike": [
1651
+ "1"
1652
+ ]
1653
+ },
1654
+ "1705796b02": {
1655
+ "train": [
1656
+ "1"
1657
+ ]
1658
+ },
1659
+ "1724db7671": {
1660
+ "giant_panda": [
1661
+ "1"
1662
+ ]
1663
+ },
1664
+ "17418e81ea": {
1665
+ "shark": [
1666
+ "1"
1667
+ ]
1668
+ },
1669
+ "175169edbb": {
1670
+ "ape": [
1671
+ "1",
1672
+ "2"
1673
+ ]
1674
+ },
1675
+ "17622326fd": {
1676
+ "lizard": [
1677
+ "1"
1678
+ ]
1679
+ },
1680
+ "17656bae77": {
1681
+ "elephant": [
1682
+ "1"
1683
+ ]
1684
+ },
1685
+ "17b0d94172": {
1686
+ "airplane": [
1687
+ "1"
1688
+ ]
1689
+ },
1690
+ "17c220e4f6": {
1691
+ "giant_panda": [
1692
+ "1"
1693
+ ]
1694
+ },
1695
+ "17c7bcd146": {
1696
+ "train": [
1697
+ "1"
1698
+ ]
1699
+ },
1700
+ "17cb4afe89": {
1701
+ "tiger": [
1702
+ "1"
1703
+ ]
1704
+ },
1705
+ "17cd79a434": {
1706
+ "squirrel": [
1707
+ "1"
1708
+ ]
1709
+ },
1710
+ "17d18604c3": {
1711
+ "person": [
1712
+ "1",
1713
+ "2"
1714
+ ]
1715
+ },
1716
+ "17d8ca1a37": {
1717
+ "person": [
1718
+ "2"
1719
+ ],
1720
+ "owl": [
1721
+ "1"
1722
+ ]
1723
+ },
1724
+ "17e33f4330": {
1725
+ "monkey": [
1726
+ "1"
1727
+ ]
1728
+ },
1729
+ "17f7a6d805": {
1730
+ "snail": [
1731
+ "1"
1732
+ ]
1733
+ },
1734
+ "180abc8378": {
1735
+ "person": [
1736
+ "2"
1737
+ ],
1738
+ "owl": [
1739
+ "1"
1740
+ ]
1741
+ },
1742
+ "183ba3d652": {
1743
+ "person": [
1744
+ "2"
1745
+ ],
1746
+ "motorbike": [
1747
+ "3"
1748
+ ]
1749
+ },
1750
+ "185bf64702": {
1751
+ "zebra": [
1752
+ "1",
1753
+ "2"
1754
+ ]
1755
+ },
1756
+ "18913cc690": {
1757
+ "train": [
1758
+ "1"
1759
+ ]
1760
+ },
1761
+ "1892651815": {
1762
+ "camel": [
1763
+ "1"
1764
+ ]
1765
+ },
1766
+ "189ac8208a": {
1767
+ "giraffe": [
1768
+ "1",
1769
+ "2"
1770
+ ]
1771
+ },
1772
+ "189b44e92c": {
1773
+ "zebra": [
1774
+ "1"
1775
+ ]
1776
+ },
1777
+ "18ac264b76": {
1778
+ "person": [
1779
+ "2"
1780
+ ]
1781
+ },
1782
+ "18b245ab49": {
1783
+ "penguin": [
1784
+ "1",
1785
+ "2",
1786
+ "3",
1787
+ "4"
1788
+ ]
1789
+ },
1790
+ "18b5cebc34": {
1791
+ "mouse": [
1792
+ "1"
1793
+ ]
1794
+ },
1795
+ "18bad52083": {
1796
+ "parrot": [
1797
+ "1",
1798
+ "2"
1799
+ ]
1800
+ },
1801
+ "18bb5144d5": {
1802
+ "lizard": [
1803
+ "1"
1804
+ ]
1805
+ },
1806
+ "18c6f205c5": {
1807
+ "person": [
1808
+ "1",
1809
+ "2",
1810
+ "3"
1811
+ ]
1812
+ },
1813
+ "1903f9ea15": {
1814
+ "bird": [
1815
+ "1",
1816
+ "2",
1817
+ "3"
1818
+ ]
1819
+ },
1820
+ "1917b209f2": {
1821
+ "horse": [
1822
+ "2"
1823
+ ],
1824
+ "person": [
1825
+ "1"
1826
+ ],
1827
+ "cow": [
1828
+ "3",
1829
+ "4"
1830
+ ]
1831
+ },
1832
+ "191e74c01d": {
1833
+ "deer": [
1834
+ "1"
1835
+ ]
1836
+ },
1837
+ "19367bb94e": {
1838
+ "fish": [
1839
+ "1",
1840
+ "2",
1841
+ "3"
1842
+ ]
1843
+ },
1844
+ "193ffaa217": {
1845
+ "person": [
1846
+ "1",
1847
+ "2",
1848
+ "3"
1849
+ ]
1850
+ },
1851
+ "19696b67d3": {
1852
+ "cow": [
1853
+ "1"
1854
+ ]
1855
+ },
1856
+ "197f3ab6f3": {
1857
+ "giant_panda": [
1858
+ "1"
1859
+ ]
1860
+ },
1861
+ "1981e763cc": {
1862
+ "sheep": [
1863
+ "1",
1864
+ "2"
1865
+ ]
1866
+ },
1867
+ "198afe39ae": {
1868
+ "person": [
1869
+ "1"
1870
+ ]
1871
+ },
1872
+ "19a6e62b9b": {
1873
+ "monkey": [
1874
+ "1",
1875
+ "2"
1876
+ ]
1877
+ },
1878
+ "19b60d5335": {
1879
+ "hedgehog": [
1880
+ "1"
1881
+ ]
1882
+ },
1883
+ "19c00c11f9": {
1884
+ "person": [
1885
+ "1"
1886
+ ]
1887
+ },
1888
+ "19e061eb88": {
1889
+ "boat": [
1890
+ "1",
1891
+ "2"
1892
+ ]
1893
+ },
1894
+ "19e8bc6178": {
1895
+ "dog": [
1896
+ "1"
1897
+ ]
1898
+ },
1899
+ "19ee80dac6": {
1900
+ "person": [
1901
+ "1",
1902
+ "3",
1903
+ "4"
1904
+ ]
1905
+ },
1906
+ "1a25a9170a": {
1907
+ "person": [
1908
+ "2",
1909
+ "3"
1910
+ ],
1911
+ "cow": [
1912
+ "1"
1913
+ ]
1914
+ },
1915
+ "1a359a6c1a": {
1916
+ "sheep": [
1917
+ "1"
1918
+ ]
1919
+ },
1920
+ "1a3e87c566": {
1921
+ "frog": [
1922
+ "1"
1923
+ ]
1924
+ },
1925
+ "1a5fe06b00": {
1926
+ "bus": [
1927
+ "1"
1928
+ ]
1929
+ },
1930
+ "1a6c0fbd1e": {
1931
+ "person": [
1932
+ "1"
1933
+ ]
1934
+ },
1935
+ "1a6f3b5a4b": {
1936
+ "sedan": [
1937
+ "3"
1938
+ ]
1939
+ },
1940
+ "1a8afbad92": {
1941
+ "zebra": [
1942
+ "1",
1943
+ "2",
1944
+ "3"
1945
+ ]
1946
+ },
1947
+ "1a8bdc5842": {
1948
+ "parrot": [
1949
+ "1",
1950
+ "2"
1951
+ ]
1952
+ },
1953
+ "1a95752aca": {
1954
+ "duck": [
1955
+ "1",
1956
+ "2"
1957
+ ]
1958
+ },
1959
+ "1a9c131cb7": {
1960
+ "ape": [
1961
+ "1",
1962
+ "2",
1963
+ "3"
1964
+ ]
1965
+ },
1966
+ "1aa3da3ee3": {
1967
+ "sheep": [
1968
+ "1",
1969
+ "2",
1970
+ "3",
1971
+ "4"
1972
+ ]
1973
+ },
1974
+ "1ab27ec7ea": {
1975
+ "deer": [
1976
+ "1"
1977
+ ]
1978
+ },
1979
+ "1abf16d21d": {
1980
+ "turtle": [
1981
+ "1"
1982
+ ]
1983
+ },
1984
+ "1acd0f993b": {
1985
+ "person": [
1986
+ "3"
1987
+ ],
1988
+ "dog": [
1989
+ "1"
1990
+ ]
1991
+ },
1992
+ "1ad202e499": {
1993
+ "lizard": [
1994
+ "1",
1995
+ "2"
1996
+ ]
1997
+ },
1998
+ "1af8d2395d": {
1999
+ "person": [
2000
+ "1",
2001
+ "2"
2002
+ ],
2003
+ "airplane": [
2004
+ "4"
2005
+ ]
2006
+ },
2007
+ "1afd39a1fa": {
2008
+ "motorbike": [
2009
+ "2"
2010
+ ]
2011
+ },
2012
+ "1b2d31306f": {
2013
+ "lizard": [
2014
+ "1"
2015
+ ]
2016
+ },
2017
+ "1b3fa67f0e": {
2018
+ "airplane": [
2019
+ "1"
2020
+ ]
2021
+ },
2022
+ "1b43fa74b4": {
2023
+ "owl": [
2024
+ "1",
2025
+ "2"
2026
+ ]
2027
+ },
2028
+ "1b73ea9fc2": {
2029
+ "parrot": [
2030
+ "1"
2031
+ ]
2032
+ },
2033
+ "1b7e8bb255": {
2034
+ "person": [
2035
+ "2"
2036
+ ]
2037
+ },
2038
+ "1b8680f8cd": {
2039
+ "person": [
2040
+ "2",
2041
+ "3"
2042
+ ]
2043
+ },
2044
+ "1b883843c0": {
2045
+ "person": [
2046
+ "1",
2047
+ "2"
2048
+ ]
2049
+ },
2050
+ "1b8898785b": {
2051
+ "monkey": [
2052
+ "1",
2053
+ "2"
2054
+ ]
2055
+ },
2056
+ "1b88ba1aa4": {
2057
+ "giant_panda": [
2058
+ "1"
2059
+ ]
2060
+ },
2061
+ "1b96a498e5": {
2062
+ "ape": [
2063
+ "1"
2064
+ ]
2065
+ },
2066
+ "1bbc4c274f": {
2067
+ "fish": [
2068
+ "2"
2069
+ ]
2070
+ },
2071
+ "1bd87fe9ab": {
2072
+ "train": [
2073
+ "1"
2074
+ ]
2075
+ },
2076
+ "1c4090c75b": {
2077
+ "whale": [
2078
+ "1"
2079
+ ]
2080
+ },
2081
+ "1c41934f84": {
2082
+ "elephant": [
2083
+ "1",
2084
+ "2"
2085
+ ]
2086
+ },
2087
+ "1c72b04b56": {
2088
+ "lion": [
2089
+ "1"
2090
+ ]
2091
+ },
2092
+ "1c87955a3a": {
2093
+ "crocodile": [
2094
+ "1"
2095
+ ],
2096
+ "turtle": [
2097
+ "2"
2098
+ ]
2099
+ },
2100
+ "1c9f9eb792": {
2101
+ "person": [
2102
+ "2"
2103
+ ]
2104
+ },
2105
+ "1ca240fede": {
2106
+ "train": [
2107
+ "1"
2108
+ ]
2109
+ },
2110
+ "1ca5673803": {
2111
+ "person": [
2112
+ "1",
2113
+ "3"
2114
+ ]
2115
+ },
2116
+ "1cada35274": {
2117
+ "duck": [
2118
+ "1"
2119
+ ]
2120
+ },
2121
+ "1cb44b920d": {
2122
+ "eagle": [
2123
+ "1",
2124
+ "2"
2125
+ ]
2126
+ },
2127
+ "1cd10e62be": {
2128
+ "leopard": [
2129
+ "1"
2130
+ ]
2131
+ },
2132
+ "1d3087d5e5": {
2133
+ "fish": [
2134
+ "1",
2135
+ "2",
2136
+ "3",
2137
+ "4",
2138
+ "5"
2139
+ ]
2140
+ },
2141
+ "1d3685150a": {
2142
+ "person": [
2143
+ "1",
2144
+ "3"
2145
+ ]
2146
+ },
2147
+ "1d6ff083aa": {
2148
+ "person": [
2149
+ "1",
2150
+ "2"
2151
+ ]
2152
+ }
2153
+ }
mbench/numbered_valid_obj_ids_gpt-4o_no_mask_color.json ADDED
@@ -0,0 +1,2153 @@
1
+ {
2
+ "003234408d": {
3
+ "penguin": [
4
+ "1",
5
+ "2",
6
+ "3",
7
+ "4",
8
+ "5"
9
+ ]
10
+ },
11
+ "0043f083b5": {
12
+ "bus": [
13
+ "1"
14
+ ],
15
+ "sedan": [
16
+ "2",
17
+ "3"
18
+ ]
19
+ },
20
+ "0044fa5fba": {
21
+ "giant_panda": [
22
+ "1"
23
+ ]
24
+ },
25
+ "005a527edd": {
26
+ "ape": [
27
+ "1",
28
+ "2"
29
+ ]
30
+ },
31
+ "0065b171f9": {
32
+ "giant_panda": [
33
+ "1"
34
+ ]
35
+ },
36
+ "00917dcfc4": {
37
+ "zebra": [
38
+ "1",
39
+ "2",
40
+ "3"
41
+ ]
42
+ },
43
+ "00a23ccf53": {
44
+ "shark": [
45
+ "1"
46
+ ]
47
+ },
48
+ "00ad5016a4": {
49
+ "airplane": [
50
+ "1"
51
+ ]
52
+ },
53
+ "01082ae388": {
54
+ "leopard": [
55
+ "1"
56
+ ]
57
+ },
58
+ "011ac0a06f": {
59
+ "ape": [
60
+ "1",
61
+ "2",
62
+ "3",
63
+ "4",
64
+ "5"
65
+ ]
66
+ },
67
+ "013099c098": {
68
+ "giant_panda": [
69
+ "1",
70
+ "2"
71
+ ]
72
+ },
73
+ "0155498c85": {
74
+ "person": [
75
+ "1"
76
+ ],
77
+ "motorbike": [
78
+ "2"
79
+ ]
80
+ },
81
+ "01694ad9c8": {
82
+ "bird": [
83
+ "1"
84
+ ]
85
+ },
86
+ "017ac35701": {
87
+ "giant_panda": [
88
+ "1"
89
+ ]
90
+ },
91
+ "01b80e8e1a": {
92
+ "zebra": [
93
+ "1",
94
+ "2"
95
+ ]
96
+ },
97
+ "01baa5a4e1": {},
98
+ "01c3111683": {
99
+ "whale": [
100
+ "1"
101
+ ]
102
+ },
103
+ "01c4cb5ffe": {
104
+ "person": [
105
+ "1",
106
+ "3"
107
+ ]
108
+ },
109
+ "01c76f0a82": {
110
+ "sedan": [
111
+ "1",
112
+ "4"
113
+ ]
114
+ },
115
+ "01c783268c": {
116
+ "person": [
117
+ "2"
118
+ ],
119
+ "ape": [
120
+ "1"
121
+ ]
122
+ },
123
+ "01e64dd36a": {
124
+ "cow": [
125
+ "1",
126
+ "2",
127
+ "3"
128
+ ]
129
+ },
130
+ "01ed275c6e": {
131
+ "giraffe": [
132
+ "1",
133
+ "2"
134
+ ]
135
+ },
136
+ "01ff60d1fa": {
137
+ "lizard": [
138
+ "1"
139
+ ]
140
+ },
141
+ "020cd28cd2": {
142
+ "person": [
143
+ "1"
144
+ ]
145
+ },
146
+ "02264db755": {
147
+ "fox": [
148
+ "1"
149
+ ]
150
+ },
151
+ "0248626d9a": {
152
+ "train": [
153
+ "1"
154
+ ]
155
+ },
156
+ "02668dbffa": {
157
+ "frog": [
158
+ "1"
159
+ ]
160
+ },
161
+ "0274193026": {
162
+ "person": [
163
+ "2"
164
+ ]
165
+ },
166
+ "02d28375aa": {
167
+ "fox": [
168
+ "1"
169
+ ]
170
+ },
171
+ "031ccc99b1": {
172
+ "person": [
173
+ "1",
174
+ "2",
175
+ "3"
176
+ ]
177
+ },
178
+ "0321b18c10": {
179
+ "person": [
180
+ "1",
181
+ "2"
182
+ ],
183
+ "elephant": [
184
+ "3"
185
+ ]
186
+ },
187
+ "0348a45bca": {
188
+ "fish": [
189
+ "1",
190
+ "2",
191
+ "3",
192
+ "4",
193
+ "5"
194
+ ]
195
+ },
196
+ "0355e92655": {
197
+ "boat": [
198
+ "3"
199
+ ],
200
+ "person": [
201
+ "2"
202
+ ]
203
+ },
204
+ "0358b938c1": {
205
+ "elephant": [
206
+ "1",
207
+ "2",
208
+ "3",
209
+ "4"
210
+ ]
211
+ },
212
+ "0368107cf1": {
213
+ "person": [
214
+ "1",
215
+ "2"
216
+ ]
217
+ },
218
+ "0379ddf557": {
219
+ "person": [
220
+ "1"
221
+ ]
222
+ },
223
+ "038b2cc71d": {
224
+ "lizard": [
225
+ "1"
226
+ ]
227
+ },
228
+ "038c15a5dd": {
229
+ "hedgehog": [
230
+ "1"
231
+ ]
232
+ },
233
+ "03a06cc98a": {
234
+ "giraffe": [
235
+ "1",
236
+ "2",
237
+ "3"
238
+ ]
239
+ },
240
+ "03a63e187f": {
241
+ "lizard": [
242
+ "1"
243
+ ]
244
+ },
245
+ "03c95b4dae": {
246
+ "elephant": [
247
+ "1",
248
+ "2",
249
+ "3"
250
+ ]
251
+ },
252
+ "03e2b57b0e": {
253
+ "lizard": [
254
+ "1"
255
+ ]
256
+ },
257
+ "04194e1248": {
258
+ "lizard": [
259
+ "1"
260
+ ]
261
+ },
262
+ "04259896e2": {
263
+ "lizard": [
264
+ "1"
265
+ ]
266
+ },
267
+ "0444918a5f": {
268
+ "truck": [
269
+ "1",
270
+ "2",
271
+ "3",
272
+ "4"
273
+ ]
274
+ },
275
+ "04460a7a52": {
276
+ "lizard": [
277
+ "1"
278
+ ]
279
+ },
280
+ "04474174a4": {
281
+ "ape": [
282
+ "1",
283
+ "2"
284
+ ]
285
+ },
286
+ "0450095513": {
287
+ "snail": [
288
+ "1"
289
+ ]
290
+ },
291
+ "045f00aed2": {
292
+ "tiger": [
293
+ "1"
294
+ ],
295
+ "person": [
296
+ "3"
297
+ ]
298
+ },
299
+ "04667fabaa": {
300
+ "parrot": [
301
+ "1"
302
+ ]
303
+ },
304
+ "04735c5030": {
305
+ "cat": [
306
+ "1",
307
+ "2"
308
+ ]
309
+ },
310
+ "04990d1915": {
311
+ "bus": [
312
+ "2"
313
+ ],
314
+ "truck": [
315
+ "3"
316
+ ],
317
+ "sedan": [
318
+ "1"
319
+ ]
320
+ },
321
+ "04d62d9d98": {
322
+ "person": [
323
+ "1"
324
+ ]
325
+ },
326
+ "04f21da964": {
327
+ "monkey": [
328
+ "1"
329
+ ]
330
+ },
331
+ "04fbad476e": {
332
+ "parrot": [
333
+ "1"
334
+ ]
335
+ },
336
+ "04fe256562": {
337
+ "motorbike": [
338
+ "1"
339
+ ],
340
+ "truck": [
341
+ "2"
342
+ ]
343
+ },
344
+ "0503bf89c9": {
345
+ "hedgehog": [
346
+ "1"
347
+ ]
348
+ },
349
+ "0536c9eed0": {
350
+ "cat": [
351
+ "1"
352
+ ]
353
+ },
354
+ "054acb238f": {
355
+ "owl": [
356
+ "1"
357
+ ]
358
+ },
359
+ "05579ca250": {
360
+ "person": [
361
+ "1"
362
+ ],
363
+ "sedan": [
364
+ "3"
365
+ ]
366
+ },
367
+ "056c200404": {},
368
+ "05774f3a2c": {
369
+ "ape": [
370
+ "1",
371
+ "2",
372
+ "3"
373
+ ]
374
+ },
375
+ "058a7592c8": {
376
+ "train": [
377
+ "1"
378
+ ]
379
+ },
380
+ "05a0a513df": {
381
+ "person": [
382
+ "1",
383
+ "2"
384
+ ]
385
+ },
386
+ "05a569d8aa": {
387
+ "cat": [
388
+ "1"
389
+ ],
390
+ "mouse": [
391
+ "2"
392
+ ]
393
+ },
394
+ "05aa652648": {
395
+ "ape": [
396
+ "1"
397
+ ]
398
+ },
399
+ "05d7715782": {},
400
+ "05e0b0f28f": {
401
+ "person": [
402
+ "2"
403
+ ],
404
+ "mouse": [
405
+ "1"
406
+ ]
407
+ },
408
+ "05fdbbdd7a": {},
409
+ "05ffcfed85": {
410
+ "monkey": [
411
+ "1",
412
+ "2"
413
+ ]
414
+ },
415
+ "0630391881": {
416
+ "person": [
417
+ "1"
418
+ ]
419
+ },
420
+ "06840b2bbe": {
421
+ "snake": [
422
+ "1"
423
+ ]
424
+ },
425
+ "068f7dce6f": {
426
+ "shark": [
427
+ "1"
428
+ ]
429
+ },
430
+ "0693719753": {
431
+ "turtle": [
432
+ "1",
433
+ "2"
434
+ ]
435
+ },
436
+ "06ce2b51fb": {
437
+ "person": [
438
+ "1",
439
+ "2"
440
+ ]
441
+ },
442
+ "06e224798e": {
443
+ "tiger": [
444
+ "1"
445
+ ]
446
+ },
447
+ "06ee361788": {
448
+ "duck": [
449
+ "1",
450
+ "2",
451
+ "3"
452
+ ]
453
+ },
454
+ "06fbb3fa2c": {
455
+ "eagle": [
456
+ "1"
457
+ ]
458
+ },
459
+ "0700264286": {
460
+ "cow": [
461
+ "1",
462
+ "2"
463
+ ]
464
+ },
465
+ "070c918ca7": {
466
+ "parrot": [
467
+ "1"
468
+ ]
469
+ },
470
+ "07129e14a4": {
471
+ "parrot": [
472
+ "1",
473
+ "2"
474
+ ],
475
+ "person": [
476
+ "3"
477
+ ]
478
+ },
479
+ "07177017e9": {
480
+ "motorbike": [
481
+ "1",
482
+ "2"
483
+ ]
484
+ },
485
+ "07238ffc58": {
486
+ "monkey": [
487
+ "1",
488
+ "2",
489
+ "3"
490
+ ]
491
+ },
492
+ "07353b2a89": {
493
+ "sheep": [
494
+ "1",
495
+ "2",
496
+ "3",
497
+ "4"
498
+ ]
499
+ },
500
+ "0738493cbf": {
501
+ "airplane": [
502
+ "1"
503
+ ]
504
+ },
505
+ "075926c651": {
506
+ "person": [
507
+ "1",
508
+ "2"
509
+ ]
510
+ },
511
+ "075c701292": {
512
+ "duck": [
513
+ "1",
514
+ "2",
515
+ "3",
516
+ "4"
517
+ ]
518
+ },
519
+ "0762ea9a30": {
520
+ "person": [
521
+ "1"
522
+ ]
523
+ },
524
+ "07652ee4af": {
525
+ "person": [
526
+ "1"
527
+ ]
528
+ },
529
+ "076f206928": {
530
+ "zebra": [
531
+ "1",
532
+ "2"
533
+ ],
534
+ "person": [
535
+ "3"
536
+ ]
537
+ },
538
+ "077d32af19": {
539
+ "train": [
540
+ "4"
541
+ ],
542
+ "person": [
543
+ "1",
544
+ "2",
545
+ "3"
546
+ ]
547
+ },
548
+ "079049275c": {
549
+ "mouse": [
550
+ "1"
551
+ ]
552
+ },
553
+ "07913cdda7": {
554
+ "train": [
555
+ "1"
556
+ ],
557
+ "person": [
558
+ "2",
559
+ "3"
560
+ ]
561
+ },
562
+ "07a11a35e8": {
563
+ "ape": [
564
+ "1",
565
+ "2"
566
+ ]
567
+ },
568
+ "07ac33b6df": {
569
+ "ape": [
570
+ "1"
571
+ ]
572
+ },
573
+ "07c62c3d11": {
574
+ "parrot": [
575
+ "1",
576
+ "2",
577
+ "3"
578
+ ]
579
+ },
580
+ "07cc1c7d74": {
581
+ "snake": [
582
+ "1"
583
+ ]
584
+ },
585
+ "080196ef01": {
586
+ "lizard": [
587
+ "1"
588
+ ]
589
+ },
590
+ "081207976e": {},
591
+ "081ae4fa44": {
592
+ "shark": [
593
+ "1",
594
+ "2"
595
+ ]
596
+ },
597
+ "081d8250cb": {
598
+ "person": [
599
+ "1"
600
+ ],
601
+ "sedan": [
602
+ "3"
603
+ ]
604
+ },
605
+ "082900c5d4": {
606
+ "duck": [
607
+ "1",
608
+ "2",
609
+ "3"
610
+ ]
611
+ },
612
+ "0860df21e2": {},
613
+ "0866d4c5e3": {
614
+ "bird": [
615
+ "1",
616
+ "2",
617
+ "3"
618
+ ]
619
+ },
620
+ "0891ac2eb6": {
621
+ "person": [
622
+ "1",
623
+ "2",
624
+ "3"
625
+ ]
626
+ },
627
+ "08931bc458": {
628
+ "person": [
629
+ "1"
630
+ ]
631
+ },
632
+ "08aa2705d5": {
633
+ "snake": [
634
+ "1"
635
+ ]
636
+ },
637
+ "08c8450db7": {},
638
+ "08d50b926c": {
639
+ "turtle": [
640
+ "1",
641
+ "2"
642
+ ]
643
+ },
644
+ "08e1e4de15": {
645
+ "monkey": [
646
+ "1",
647
+ "2",
648
+ "3",
649
+ "4"
650
+ ]
651
+ },
652
+ "08e48c1a48": {
653
+ "cow": [
654
+ "1"
655
+ ]
656
+ },
657
+ "08f561c65e": {
658
+ "person": [
659
+ "2"
660
+ ],
661
+ "giant_panda": [
662
+ "1"
663
+ ]
664
+ },
665
+ "08feb87790": {
666
+ "sheep": [
667
+ "1"
668
+ ]
669
+ },
670
+ "09049f6fe3": {
671
+ "mouse": [
672
+ "1",
673
+ "2"
674
+ ]
675
+ },
676
+ "092e4ff450": {
677
+ "snake": [
678
+ "1"
679
+ ]
680
+ },
681
+ "09338adea8": {
682
+ "whale": [
683
+ "1",
684
+ "2"
685
+ ]
686
+ },
687
+ "093c335ccc": {
688
+ "person": [
689
+ "2"
690
+ ]
691
+ },
692
+ "0970d28339": {
693
+ "ape": [
694
+ "1",
695
+ "2"
696
+ ]
697
+ },
698
+ "0974a213dc": {
699
+ "giraffe": [
700
+ "1",
701
+ "2",
702
+ "3"
703
+ ]
704
+ },
705
+ "097b471ed8": {
706
+ "cat": [
707
+ "1",
708
+ "2"
709
+ ]
710
+ },
711
+ "0990941758": {
712
+ "giant_panda": [
713
+ "1"
714
+ ]
715
+ },
716
+ "09a348f4fa": {
717
+ "lizard": [
718
+ "1"
719
+ ]
720
+ },
721
+ "09a6841288": {
722
+ "duck": [
723
+ "1",
724
+ "2"
725
+ ]
726
+ },
727
+ "09c5bad17b": {
728
+ "airplane": [
729
+ "1"
730
+ ]
731
+ },
732
+ "09c9ce80c7": {
733
+ "giant_panda": [
734
+ "1"
735
+ ]
736
+ },
737
+ "09ff54fef4": {
738
+ "fox": [
739
+ "1",
740
+ "2"
741
+ ]
742
+ },
743
+ "0a23765d15": {
744
+ "person": [
745
+ "1",
746
+ "2"
747
+ ]
748
+ },
749
+ "0a275e7f12": {
750
+ "elephant": [
751
+ "1"
752
+ ]
753
+ },
754
+ "0a2f2bd294": {
755
+ "motorbike": [
756
+ "1"
757
+ ]
758
+ },
759
+ "0a7a2514aa": {
760
+ "cat": [
761
+ "1"
762
+ ],
763
+ "lizard": [
764
+ "2"
765
+ ]
766
+ },
767
+ "0a7b27fde9": {
768
+ "parrot": [
769
+ "1",
770
+ "2"
771
+ ]
772
+ },
773
+ "0a8c467cc3": {
774
+ "fish": [
775
+ "1",
776
+ "2",
777
+ "3"
778
+ ]
779
+ },
780
+ "0ac8c560ae": {
781
+ "person": [
782
+ "2",
783
+ "3"
784
+ ]
785
+ },
786
+ "0b1627e896": {
787
+ "boat": [
788
+ "1"
789
+ ]
790
+ },
791
+ "0b285c47f6": {
792
+ "mouse": [
793
+ "1"
794
+ ]
795
+ },
796
+ "0b34ec1d55": {
797
+ "ape": [
798
+ "1"
799
+ ]
800
+ },
801
+ "0b5b5e8e5a": {
802
+ "person": [
803
+ "1"
804
+ ],
805
+ "sedan": [
806
+ "2"
807
+ ]
808
+ },
809
+ "0b68535614": {
810
+ "rabbit": [
811
+ "1"
812
+ ]
813
+ },
814
+ "0b6f9105fc": {
815
+ "rabbit": [
816
+ "1"
817
+ ]
818
+ },
819
+ "0b7dbfa3cb": {
820
+ "cow": [
821
+ "1"
822
+ ]
823
+ },
824
+ "0b9cea51ca": {
825
+ "whale": [
826
+ "1"
827
+ ]
828
+ },
829
+ "0b9d012be8": {
830
+ "camel": [
831
+ "1"
832
+ ]
833
+ },
834
+ "0bcfc4177d": {
835
+ "truck": [
836
+ "1"
837
+ ]
838
+ },
839
+ "0bd37b23c1": {
840
+ "motorbike": [
841
+ "1"
842
+ ]
843
+ },
844
+ "0bd864064c": {
845
+ "eagle": [
846
+ "1"
847
+ ]
848
+ },
849
+ "0c11c6bf7b": {
850
+ "deer": [
851
+ "1"
852
+ ]
853
+ },
854
+ "0c26bc77ac": {
855
+ "crocodile": [
856
+ "1"
857
+ ]
858
+ },
859
+ "0c3a04798c": {
860
+ "duck": [
861
+ "1"
862
+ ],
863
+ "fish": [
864
+ "2"
865
+ ]
866
+ },
867
+ "0c44a9d545": {
868
+ "tiger": [
869
+ "1"
870
+ ]
871
+ },
872
+ "0c817cc390": {
873
+ "hedgehog": [
874
+ "1"
875
+ ],
876
+ "dog": [
877
+ "2"
878
+ ]
879
+ },
880
+ "0ca839ee9a": {
881
+ "ape": [
882
+ "1",
883
+ "2"
884
+ ]
885
+ },
886
+ "0cd7ac0ac0": {
887
+ "rabbit": [
888
+ "1"
889
+ ]
890
+ },
891
+ "0ce06e0121": {
892
+ "parrot": [
893
+ "1",
894
+ "2"
895
+ ]
896
+ },
897
+ "0cfe974a89": {
898
+ "turtle": [
899
+ "1",
900
+ "2"
901
+ ]
902
+ },
903
+ "0d2fcc0dcd": {
904
+ "zebra": [
905
+ "1",
906
+ "2",
907
+ "3",
908
+ "4"
909
+ ]
910
+ },
911
+ "0d3aad05d2": {
912
+ "person": [
913
+ "1"
914
+ ]
915
+ },
916
+ "0d40b015f4": {
917
+ "person": [
918
+ "1"
919
+ ]
920
+ },
921
+ "0d97fba242": {
922
+ "person": [
923
+ "2"
924
+ ],
925
+ "dog": [
926
+ "1"
927
+ ]
928
+ },
929
+ "0d9cc80d7e": {
930
+ "person": [
931
+ "1",
932
+ "2",
933
+ "3"
934
+ ]
935
+ },
936
+ "0dab85b6d3": {
937
+ "lizard": [
938
+ "1",
939
+ "2"
940
+ ]
941
+ },
942
+ "0db5c427a5": {
943
+ "train": [
944
+ "1"
945
+ ]
946
+ },
947
+ "0dbaf284f1": {
948
+ "cat": [
949
+ "1",
950
+ "2"
951
+ ]
952
+ },
953
+ "0de4923598": {},
954
+ "0df28a9101": {
955
+ "turtle": [
956
+ "1",
957
+ "2",
958
+ "3"
959
+ ]
960
+ },
961
+ "0e04f636c4": {
962
+ "frog": [
963
+ "1"
964
+ ]
965
+ },
966
+ "0e05f0e232": {
967
+ "lizard": [
968
+ "1",
969
+ "2"
970
+ ]
971
+ },
972
+ "0e0930474b": {
973
+ "person": [
974
+ "2",
975
+ "3"
976
+ ],
977
+ "sedan": [
978
+ "1"
979
+ ]
980
+ },
981
+ "0e27472bea": {
982
+ "turtle": [
983
+ "1"
984
+ ]
985
+ },
986
+ "0e30020549": {
987
+ "parrot": [
988
+ "1"
989
+ ]
990
+ },
991
+ "0e621feb6c": {
992
+ "lizard": [
993
+ "1",
994
+ "2"
995
+ ]
996
+ },
997
+ "0e803c7d73": {},
998
+ "0e9ebe4e3c": {
999
+ "truck": [
1000
+ "1"
1001
+ ]
1002
+ },
1003
+ "0e9f2785ec": {
1004
+ "person": [
1005
+ "2"
1006
+ ]
1007
+ },
1008
+ "0ea68d418b": {
1009
+ "airplane": [
1010
+ "1"
1011
+ ]
1012
+ },
1013
+ "0eb403a222": {},
1014
+ "0ee92053d6": {
1015
+ "person": [
1016
+ "1"
1017
+ ]
1018
+ },
1019
+ "0eefca067f": {
1020
+ "giant_panda": [
1021
+ "1",
1022
+ "2"
1023
+ ]
1024
+ },
1025
+ "0f17fa6fcb": {
1026
+ "duck": [
1027
+ "1",
1028
+ "2",
1029
+ "3"
1030
+ ]
1031
+ },
1032
+ "0f1ac8e9a3": {
1033
+ "frog": [
1034
+ "1"
1035
+ ]
1036
+ },
1037
+ "0f202e9852": {
1038
+ "parrot": [
1039
+ "1"
1040
+ ]
1041
+ },
1042
+ "0f2ab8b1ff": {
1043
+ "dolphin": [
1044
+ "1",
1045
+ "2",
1046
+ "3"
1047
+ ]
1048
+ },
1049
+ "0f51a78756": {
1050
+ "sheep": [
1051
+ "1"
1052
+ ]
1053
+ },
1054
+ "0f5fbe16b0": {
1055
+ "raccoon": [
1056
+ "1",
1057
+ "2"
1058
+ ]
1059
+ },
1060
+ "0f6072077b": {
1061
+ "person": [
1062
+ "1",
1063
+ "2",
1064
+ "3"
1065
+ ]
1066
+ },
1067
+ "0f6b69b2f4": {
1068
+ "rabbit": [
1069
+ "1"
1070
+ ]
1071
+ },
1072
+ "0f6c2163de": {
1073
+ "snail": [
1074
+ "1"
1075
+ ]
1076
+ },
1077
+ "0f74ec5599": {
1078
+ "giant_panda": [
1079
+ "1"
1080
+ ]
1081
+ },
1082
+ "0f9683715b": {
1083
+ "elephant": [
1084
+ "1"
1085
+ ]
1086
+ },
1087
+ "0fa7b59356": {
1088
+ "duck": [
1089
+ "1"
1090
+ ]
1091
+ },
1092
+ "0fb173695b": {
1093
+ "person": [
1094
+ "3"
1095
+ ]
1096
+ },
1097
+ "0fc958cde2": {
1098
+ "owl": [
1099
+ "1"
1100
+ ]
1101
+ },
1102
+ "0fe7b1a621": {
1103
+ "parrot": [
1104
+ "1"
1105
+ ]
1106
+ },
1107
+ "0ffcdb491c": {
1108
+ "person": [
1109
+ "1",
1110
+ "2",
1111
+ "3"
1112
+ ]
1113
+ },
1114
+ "101caff7d4": {
1115
+ "giant_panda": [
1116
+ "1",
1117
+ "2"
1118
+ ]
1119
+ },
1120
+ "1022fe8417": {
1121
+ "person": [
1122
+ "1",
1123
+ "2",
1124
+ "3"
1125
+ ]
1126
+ },
1127
+ "1032e80b37": {
1128
+ "giraffe": [
1129
+ "1"
1130
+ ]
1131
+ },
1132
+ "103f501680": {
1133
+ "fish": [
1134
+ "1"
1135
+ ]
1136
+ },
1137
+ "104e64565f": {
1138
+ "elephant": [
1139
+ "1"
1140
+ ]
1141
+ },
1142
+ "104f1ab997": {
1143
+ "person": [
1144
+ "1",
1145
+ "2",
1146
+ "3"
1147
+ ]
1148
+ },
1149
+ "106242403f": {
1150
+ "person": [
1151
+ "1",
1152
+ "2"
1153
+ ]
1154
+ },
1155
+ "10b31f5431": {
1156
+ "person": [
1157
+ "1",
1158
+ "3",
1159
+ "4"
1160
+ ]
1161
+ },
1162
+ "10eced835e": {
1163
+ "giant_panda": [
1164
+ "1",
1165
+ "2"
1166
+ ]
1167
+ },
1168
+ "110d26fa3a": {
1169
+ "shark": [
1170
+ "1"
1171
+ ]
1172
+ },
1173
+ "1122c1d16a": {
1174
+ "parrot": [
1175
+ "1",
1176
+ "2",
1177
+ "3",
1178
+ "4",
1179
+ "5"
1180
+ ],
1181
+ "person": [
1182
+ "6"
1183
+ ]
1184
+ },
1185
+ "1145b49a5f": {
1186
+ "rabbit": [
1187
+ "1"
1188
+ ]
1189
+ },
1190
+ "11485838c2": {
1191
+ "giraffe": [
1192
+ "1",
1193
+ "2",
1194
+ "3"
1195
+ ]
1196
+ },
1197
+ "114e7676ec": {
1198
+ "person": [
1199
+ "1"
1200
+ ]
1201
+ },
1202
+ "1157472b95": {
1203
+ "parrot": [
1204
+ "1",
1205
+ "2"
1206
+ ]
1207
+ },
1208
+ "115ee1072c": {
1209
+ "cow": [
1210
+ "1"
1211
+ ]
1212
+ },
1213
+ "1171141012": {
1214
+ "person": [
1215
+ "2"
1216
+ ],
1217
+ "turtle": [
1218
+ "1"
1219
+ ]
1220
+ },
1221
+ "117757b4b8": {
1222
+ "snail": [
1223
+ "1"
1224
+ ]
1225
+ },
1226
+ "1178932d2f": {
1227
+ "person": [
1228
+ "1",
1229
+ "2"
1230
+ ],
1231
+ "motorbike": [
1232
+ "3"
1233
+ ]
1234
+ },
1235
+ "117cc76bda": {
1236
+ "whale": [
1237
+ "1"
1238
+ ]
1239
+ },
1240
+ "1180cbf814": {
1241
+ "fish": [
1242
+ "1",
1243
+ "2"
1244
+ ]
1245
+ },
1246
+ "1187bbd0e3": {
1247
+ "cat": [
1248
+ "1"
1249
+ ]
1250
+ },
1251
+ "1197e44b26": {
1252
+ "giant_panda": [
1253
+ "1"
1254
+ ]
1255
+ },
1256
+ "119cf20728": {
1257
+ "lizard": [
1258
+ "1"
1259
+ ]
1260
+ },
1261
+ "119dd54871": {
1262
+ "lion": [
1263
+ "1",
1264
+ "2"
1265
+ ]
1266
+ },
1267
+ "11a0c3b724": {
1268
+ "mouse": [
1269
+ "1",
1270
+ "2"
1271
+ ]
1272
+ },
1273
+ "11a6ba8c94": {
1274
+ "person": [
1275
+ "1",
1276
+ "2"
1277
+ ]
1278
+ },
1279
+ "11c722a456": {
1280
+ "turtle": [
1281
+ "1",
1282
+ "2"
1283
+ ]
1284
+ },
1285
+ "11cbcb0b4d": {
1286
+ "zebra": [
1287
+ "1"
1288
+ ]
1289
+ },
1290
+ "11ccf5e99d": {
1291
+ "person": [
1292
+ "2"
1293
+ ]
1294
+ },
1295
+ "11ce6f452e": {
1296
+ "person": [
1297
+ "1",
1298
+ "2",
1299
+ "3"
1300
+ ]
1301
+ },
1302
+ "11feabe596": {
1303
+ "rabbit": [
1304
+ "1"
1305
+ ]
1306
+ },
1307
+ "120cb9514d": {
1308
+ "person": [
1309
+ "1",
1310
+ "2",
1311
+ "3"
1312
+ ]
1313
+ },
1314
+ "12156b25b3": {
1315
+ "person": [
1316
+ "1"
1317
+ ]
1318
+ },
1319
+ "122896672d": {
1320
+ "person": [
1321
+ "1",
1322
+ "3"
1323
+ ]
1324
+ },
1325
+ "1233ac8596": {
1326
+ "dog": [
1327
+ "1"
1328
+ ]
1329
+ },
1330
+ "1239c87234": {
1331
+ "lizard": [
1332
+ "1"
1333
+ ]
1334
+ },
1335
+ "1250423f7c": {
1336
+ "person": [
1337
+ "2"
1338
+ ],
1339
+ "elephant": [
1340
+ "3",
1341
+ "4"
1342
+ ]
1343
+ },
1344
+ "1257a1bc67": {
1345
+ "snake": [
1346
+ "1"
1347
+ ]
1348
+ },
1349
+ "125d1b19dd": {
1350
+ "giant_panda": [
1351
+ "1",
1352
+ "2"
1353
+ ]
1354
+ },
1355
+ "126d203967": {
1356
+ "person": [
1357
+ "2"
1358
+ ]
1359
+ },
1360
+ "1295e19071": {
1361
+ "airplane": [
1362
+ "1"
1363
+ ]
1364
+ },
1365
+ "12ad198c54": {
1366
+ "person": [
1367
+ "1"
1368
+ ]
1369
+ },
1370
+ "12bddb2bcb": {
1371
+ "person": [
1372
+ "2"
1373
+ ]
1374
+ },
1375
+ "12ec9b93ee": {
1376
+ "giant_panda": [
1377
+ "1"
1378
+ ]
1379
+ },
1380
+ "12eebedc35": {
1381
+ "bird": [
1382
+ "1"
1383
+ ]
1384
+ },
1385
+ "132852e094": {
1386
+ "fox": [
1387
+ "1"
1388
+ ]
1389
+ },
1390
+ "1329409f2a": {
1391
+ "fish": [
1392
+ "1"
1393
+ ]
1394
+ },
1395
+ "13325cfa14": {
1396
+ "person": [
1397
+ "2"
1398
+ ]
1399
+ },
1400
+ "1336440745": {
1401
+ "mouse": [
1402
+ "1",
1403
+ "2"
1404
+ ]
1405
+ },
1406
+ "134d06dbf9": {
1407
+ "cat": [
1408
+ "1"
1409
+ ]
1410
+ },
1411
+ "135625b53d": {
1412
+ "parrot": [
1413
+ "1"
1414
+ ]
1415
+ },
1416
+ "13870016f9": {
1417
+ "person": [
1418
+ "1"
1419
+ ],
1420
+ "cow": [
1421
+ "2",
1422
+ "3"
1423
+ ]
1424
+ },
1425
+ "13960b3c84": {
1426
+ "giraffe": [
1427
+ "1",
1428
+ "2",
1429
+ "3"
1430
+ ]
1431
+ },
1432
+ "13adaad9d9": {
1433
+ "giant_panda": [
1434
+ "1"
1435
+ ]
1436
+ },
1437
+ "13ae097e20": {
1438
+ "giant_panda": [
1439
+ "1"
1440
+ ]
1441
+ },
1442
+ "13e3070469": {
1443
+ "zebra": [
1444
+ "1",
1445
+ "2",
1446
+ "3"
1447
+ ]
1448
+ },
1449
+ "13f6a8c20d": {
1450
+ "fish": [
1451
+ "1"
1452
+ ]
1453
+ },
1454
+ "1416925cf2": {
1455
+ "truck": [
1456
+ "1",
1457
+ "2"
1458
+ ]
1459
+ },
1460
+ "142d2621f5": {
1461
+ "person": [
1462
+ "1",
1463
+ "2"
1464
+ ],
1465
+ "motorbike": [
1466
+ "3"
1467
+ ]
1468
+ },
1469
+ "145d5d7c03": {
1470
+ "giant_panda": [
1471
+ "1"
1472
+ ]
1473
+ },
1474
+ "145fdc3ac5": {
1475
+ "lizard": [
1476
+ "1"
1477
+ ]
1478
+ },
1479
+ "1471274fa7": {
1480
+ "person": [
1481
+ "1"
1482
+ ]
1483
+ },
1484
+ "14a6b5a139": {
1485
+ "fish": [
1486
+ "1"
1487
+ ]
1488
+ },
1489
+ "14c21cea0d": {
1490
+ "monkey": [
1491
+ "1",
1492
+ "2"
1493
+ ]
1494
+ },
1495
+ "14dae0dc93": {
1496
+ "person": [
1497
+ "2"
1498
+ ]
1499
+ },
1500
+ "14f9bd22b5": {
1501
+ "tiger": [
1502
+ "1"
1503
+ ]
1504
+ },
1505
+ "14fd28ae99": {
1506
+ "parrot": [
1507
+ "1"
1508
+ ]
1509
+ },
1510
+ "15097d5d4e": {
1511
+ "parrot": [
1512
+ "1"
1513
+ ]
1514
+ },
1515
+ "150ea711f2": {
1516
+ "whale": [
1517
+ "1"
1518
+ ]
1519
+ },
1520
+ "1514e3563f": {
1521
+ "earless_seal": [
1522
+ "1",
1523
+ "2"
1524
+ ]
1525
+ },
1526
+ "152aaa3a9e": {
1527
+ "raccoon": [
1528
+ "1"
1529
+ ]
1530
+ },
1531
+ "152b7d3bd7": {
1532
+ "giant_panda": [
1533
+ "1"
1534
+ ]
1535
+ },
1536
+ "15617297cc": {
1537
+ "person": [
1538
+ "1"
1539
+ ]
1540
+ },
1541
+ "15abbe0c52": {
1542
+ "person": [
1543
+ "1"
1544
+ ]
1545
+ },
1546
+ "15d1fb3de5": {
1547
+ "cat": [
1548
+ "2"
1549
+ ],
1550
+ "owl": [
1551
+ "1"
1552
+ ]
1553
+ },
1554
+ "15f67b0fab": {
1555
+ "person": [
1556
+ "1"
1557
+ ]
1558
+ },
1559
+ "161eb59aad": {
1560
+ "cow": [
1561
+ "2",
1562
+ "3"
1563
+ ],
1564
+ "giraffe": [
1565
+ "1"
1566
+ ]
1567
+ },
1568
+ "16288ea47f": {
1569
+ "duck": [
1570
+ "1",
1571
+ "2"
1572
+ ]
1573
+ },
1574
+ "164410ce62": {
1575
+ "person": [
1576
+ "1"
1577
+ ]
1578
+ },
1579
+ "165c3c8cd4": {
1580
+ "person": [
1581
+ "1",
1582
+ "2",
1583
+ "3"
1584
+ ]
1585
+ },
1586
+ "165c42b41b": {
1587
+ "person": [
1588
+ "1",
1589
+ "4"
1590
+ ],
1591
+ "motorbike": [
1592
+ "2",
1593
+ "3"
1594
+ ]
1595
+ },
1596
+ "165ec9e22b": {
1597
+ "person": [
1598
+ "1",
1599
+ "2"
1600
+ ]
1601
+ },
1602
+ "1669502269": {
1603
+ "person": [
1604
+ "1"
1605
+ ]
1606
+ },
1607
+ "16763cccbb": {
1608
+ "ape": [
1609
+ "1"
1610
+ ]
1611
+ },
1612
+ "16adde065e": {
1613
+ "cat": [
1614
+ "2"
1615
+ ],
1616
+ "person": [
1617
+ "3"
1618
+ ]
1619
+ },
1620
+ "16af445362": {
1621
+ "airplane": [
1622
+ "1"
1623
+ ]
1624
+ },
1625
+ "16afd538ad": {
1626
+ "parrot": [
1627
+ "1",
1628
+ "2"
1629
+ ]
1630
+ },
1631
+ "16c3fa4d5d": {
1632
+ "sedan": [
1633
+ "1"
1634
+ ]
1635
+ },
1636
+ "16d1d65c27": {
1637
+ "monkey": [
1638
+ "1"
1639
+ ]
1640
+ },
1641
+ "16e8599e94": {
1642
+ "giant_panda": [
1643
+ "1"
1644
+ ]
1645
+ },
1646
+ "16fe9fb444": {
1647
+ "motorbike": [
1648
+ "1"
1649
+ ],
1650
+ "person": [
1651
+ "2"
1652
+ ]
1653
+ },
1654
+ "1705796b02": {
1655
+ "train": [
1656
+ "1"
1657
+ ]
1658
+ },
1659
+ "1724db7671": {
1660
+ "giant_panda": [
1661
+ "1"
1662
+ ]
1663
+ },
1664
+ "17418e81ea": {
1665
+ "shark": [
1666
+ "1"
1667
+ ]
1668
+ },
1669
+ "175169edbb": {
1670
+ "ape": [
1671
+ "1",
1672
+ "2"
1673
+ ]
1674
+ },
1675
+ "17622326fd": {
1676
+ "lizard": [
1677
+ "1"
1678
+ ]
1679
+ },
1680
+ "17656bae77": {
1681
+ "elephant": [
1682
+ "1"
1683
+ ]
1684
+ },
1685
+ "17b0d94172": {
1686
+ "airplane": [
1687
+ "1"
1688
+ ]
1689
+ },
1690
+ "17c220e4f6": {
1691
+ "giant_panda": [
1692
+ "1"
1693
+ ]
1694
+ },
1695
+ "17c7bcd146": {
1696
+ "train": [
1697
+ "1"
1698
+ ]
1699
+ },
1700
+ "17cb4afe89": {
1701
+ "tiger": [
1702
+ "1"
1703
+ ]
1704
+ },
1705
+ "17cd79a434": {
1706
+ "squirrel": [
1707
+ "1"
1708
+ ]
1709
+ },
1710
+ "17d18604c3": {
1711
+ "person": [
1712
+ "1",
1713
+ "2"
1714
+ ]
1715
+ },
1716
+ "17d8ca1a37": {
1717
+ "person": [
1718
+ "2"
1719
+ ],
1720
+ "owl": [
1721
+ "1"
1722
+ ]
1723
+ },
1724
+ "17e33f4330": {
1725
+ "monkey": [
1726
+ "1"
1727
+ ]
1728
+ },
1729
+ "17f7a6d805": {
1730
+ "snail": [
1731
+ "1"
1732
+ ]
1733
+ },
1734
+ "180abc8378": {
1735
+ "person": [
1736
+ "2"
1737
+ ],
1738
+ "owl": [
1739
+ "1"
1740
+ ]
1741
+ },
1742
+ "183ba3d652": {
1743
+ "person": [
1744
+ "2"
1745
+ ],
1746
+ "motorbike": [
1747
+ "3"
1748
+ ]
1749
+ },
1750
+ "185bf64702": {
1751
+ "zebra": [
1752
+ "1",
1753
+ "2"
1754
+ ]
1755
+ },
1756
+ "18913cc690": {
1757
+ "train": [
1758
+ "1"
1759
+ ]
1760
+ },
1761
+ "1892651815": {
1762
+ "camel": [
1763
+ "1"
1764
+ ]
1765
+ },
1766
+ "189ac8208a": {
1767
+ "giraffe": [
1768
+ "1",
1769
+ "2"
1770
+ ]
1771
+ },
1772
+ "189b44e92c": {
1773
+ "zebra": [
1774
+ "1"
1775
+ ]
1776
+ },
1777
+ "18ac264b76": {
1778
+ "person": [
1779
+ "2"
1780
+ ]
1781
+ },
1782
+ "18b245ab49": {
1783
+ "penguin": [
1784
+ "1",
1785
+ "2",
1786
+ "3",
1787
+ "4"
1788
+ ]
1789
+ },
1790
+ "18b5cebc34": {
1791
+ "mouse": [
1792
+ "1"
1793
+ ]
1794
+ },
1795
+ "18bad52083": {
1796
+ "parrot": [
1797
+ "1",
1798
+ "2"
1799
+ ]
1800
+ },
1801
+ "18bb5144d5": {
1802
+ "lizard": [
1803
+ "1"
1804
+ ]
1805
+ },
1806
+ "18c6f205c5": {
1807
+ "person": [
1808
+ "1",
1809
+ "2",
1810
+ "3"
1811
+ ]
1812
+ },
1813
+ "1903f9ea15": {
1814
+ "bird": [
1815
+ "1",
1816
+ "2",
1817
+ "3"
1818
+ ]
1819
+ },
1820
+ "1917b209f2": {
1821
+ "person": [
1822
+ "1"
1823
+ ],
1824
+ "cow": [
1825
+ "3",
1826
+ "4"
1827
+ ],
1828
+ "horse": [
1829
+ "2"
1830
+ ]
1831
+ },
1832
+ "191e74c01d": {
1833
+ "deer": [
1834
+ "1"
1835
+ ]
1836
+ },
1837
+ "19367bb94e": {
1838
+ "fish": [
1839
+ "1",
1840
+ "2",
1841
+ "3"
1842
+ ]
1843
+ },
1844
+ "193ffaa217": {
1845
+ "person": [
1846
+ "1",
1847
+ "2",
1848
+ "3"
1849
+ ]
1850
+ },
1851
+ "19696b67d3": {
1852
+ "cow": [
1853
+ "1"
1854
+ ]
1855
+ },
1856
+ "197f3ab6f3": {
1857
+ "giant_panda": [
1858
+ "1"
1859
+ ]
1860
+ },
1861
+ "1981e763cc": {
1862
+ "sheep": [
1863
+ "1",
1864
+ "2"
1865
+ ]
1866
+ },
1867
+ "198afe39ae": {
1868
+ "person": [
1869
+ "1"
1870
+ ]
1871
+ },
1872
+ "19a6e62b9b": {
1873
+ "monkey": [
1874
+ "1",
1875
+ "2"
1876
+ ]
1877
+ },
1878
+ "19b60d5335": {
1879
+ "hedgehog": [
1880
+ "1"
1881
+ ]
1882
+ },
1883
+ "19c00c11f9": {
1884
+ "person": [
1885
+ "1"
1886
+ ]
1887
+ },
1888
+ "19e061eb88": {
1889
+ "boat": [
1890
+ "1",
1891
+ "2"
1892
+ ]
1893
+ },
1894
+ "19e8bc6178": {
1895
+ "dog": [
1896
+ "1"
1897
+ ]
1898
+ },
1899
+ "19ee80dac6": {
1900
+ "person": [
1901
+ "1",
1902
+ "3",
1903
+ "4"
1904
+ ]
1905
+ },
1906
+ "1a25a9170a": {
1907
+ "person": [
1908
+ "2",
1909
+ "3"
1910
+ ],
1911
+ "cow": [
1912
+ "1"
1913
+ ]
1914
+ },
1915
+ "1a359a6c1a": {
1916
+ "sheep": [
1917
+ "1"
1918
+ ]
1919
+ },
1920
+ "1a3e87c566": {
1921
+ "frog": [
1922
+ "1"
1923
+ ]
1924
+ },
1925
+ "1a5fe06b00": {
1926
+ "bus": [
1927
+ "1"
1928
+ ]
1929
+ },
1930
+ "1a6c0fbd1e": {
1931
+ "person": [
1932
+ "1"
1933
+ ]
1934
+ },
1935
+ "1a6f3b5a4b": {
1936
+ "sedan": [
1937
+ "3"
1938
+ ]
1939
+ },
1940
+ "1a8afbad92": {
1941
+ "zebra": [
1942
+ "1",
1943
+ "2",
1944
+ "3"
1945
+ ]
1946
+ },
1947
+ "1a8bdc5842": {
1948
+ "parrot": [
1949
+ "1",
1950
+ "2"
1951
+ ]
1952
+ },
1953
+ "1a95752aca": {
1954
+ "duck": [
1955
+ "1",
1956
+ "2"
1957
+ ]
1958
+ },
1959
+ "1a9c131cb7": {
1960
+ "ape": [
1961
+ "1",
1962
+ "2",
1963
+ "3"
1964
+ ]
1965
+ },
1966
+ "1aa3da3ee3": {
1967
+ "sheep": [
1968
+ "1",
1969
+ "2",
1970
+ "3",
1971
+ "4"
1972
+ ]
1973
+ },
1974
+ "1ab27ec7ea": {
1975
+ "deer": [
1976
+ "1"
1977
+ ]
1978
+ },
1979
+ "1abf16d21d": {
1980
+ "turtle": [
1981
+ "1"
1982
+ ]
1983
+ },
1984
+ "1acd0f993b": {
1985
+ "person": [
1986
+ "3"
1987
+ ],
1988
+ "dog": [
1989
+ "1"
1990
+ ]
1991
+ },
1992
+ "1ad202e499": {
1993
+ "lizard": [
1994
+ "1",
1995
+ "2"
1996
+ ]
1997
+ },
1998
+ "1af8d2395d": {
1999
+ "person": [
2000
+ "1",
2001
+ "2"
2002
+ ],
2003
+ "airplane": [
2004
+ "4"
2005
+ ]
2006
+ },
2007
+ "1afd39a1fa": {
2008
+ "motorbike": [
2009
+ "2"
2010
+ ]
2011
+ },
2012
+ "1b2d31306f": {
2013
+ "lizard": [
2014
+ "1"
2015
+ ]
2016
+ },
2017
+ "1b3fa67f0e": {
2018
+ "airplane": [
2019
+ "1"
2020
+ ]
2021
+ },
2022
+ "1b43fa74b4": {
2023
+ "owl": [
2024
+ "1",
2025
+ "2"
2026
+ ]
2027
+ },
2028
+ "1b73ea9fc2": {
2029
+ "parrot": [
2030
+ "1"
2031
+ ]
2032
+ },
2033
+ "1b7e8bb255": {
2034
+ "person": [
2035
+ "2"
2036
+ ]
2037
+ },
2038
+ "1b8680f8cd": {
2039
+ "person": [
2040
+ "2",
2041
+ "3"
2042
+ ]
2043
+ },
2044
+ "1b883843c0": {
2045
+ "person": [
2046
+ "1",
2047
+ "2"
2048
+ ]
2049
+ },
2050
+ "1b8898785b": {
2051
+ "monkey": [
2052
+ "1",
2053
+ "2"
2054
+ ]
2055
+ },
2056
+ "1b88ba1aa4": {
2057
+ "giant_panda": [
2058
+ "1"
2059
+ ]
2060
+ },
2061
+ "1b96a498e5": {
2062
+ "ape": [
2063
+ "1"
2064
+ ]
2065
+ },
2066
+ "1bbc4c274f": {
2067
+ "fish": [
2068
+ "2"
2069
+ ]
2070
+ },
2071
+ "1bd87fe9ab": {
2072
+ "train": [
2073
+ "1"
2074
+ ]
2075
+ },
2076
+ "1c4090c75b": {
2077
+ "whale": [
2078
+ "1"
2079
+ ]
2080
+ },
2081
+ "1c41934f84": {
2082
+ "elephant": [
2083
+ "1",
2084
+ "2"
2085
+ ]
2086
+ },
2087
+ "1c72b04b56": {
2088
+ "lion": [
2089
+ "1"
2090
+ ]
2091
+ },
2092
+ "1c87955a3a": {
2093
+ "crocodile": [
2094
+ "1"
2095
+ ],
2096
+ "turtle": [
2097
+ "2"
2098
+ ]
2099
+ },
2100
+ "1c9f9eb792": {
2101
+ "person": [
2102
+ "2"
2103
+ ]
2104
+ },
2105
+ "1ca240fede": {
2106
+ "train": [
2107
+ "1"
2108
+ ]
2109
+ },
2110
+ "1ca5673803": {
2111
+ "person": [
2112
+ "1",
2113
+ "3"
2114
+ ]
2115
+ },
2116
+ "1cada35274": {
2117
+ "duck": [
2118
+ "1"
2119
+ ]
2120
+ },
2121
+ "1cb44b920d": {
2122
+ "eagle": [
2123
+ "1",
2124
+ "2"
2125
+ ]
2126
+ },
2127
+ "1cd10e62be": {
2128
+ "leopard": [
2129
+ "1"
2130
+ ]
2131
+ },
2132
+ "1d3087d5e5": {
2133
+ "fish": [
2134
+ "1",
2135
+ "2",
2136
+ "3",
2137
+ "4",
2138
+ "5"
2139
+ ]
2140
+ },
2141
+ "1d3685150a": {
2142
+ "person": [
2143
+ "1",
2144
+ "3"
2145
+ ]
2146
+ },
2147
+ "1d6ff083aa": {
2148
+ "person": [
2149
+ "1",
2150
+ "2"
2151
+ ]
2152
+ }
2153
+ }
mbench/numbered_valid_obj_ids_gpt-4o_nomask_randcap.json ADDED
@@ -0,0 +1,2153 @@
1
+ {
2
+ "003234408d": {
3
+ "penguin": [
4
+ "1",
5
+ "2",
6
+ "3",
7
+ "4",
8
+ "5"
9
+ ]
10
+ },
11
+ "0043f083b5": {
12
+ "sedan": [
13
+ "2",
14
+ "3"
15
+ ],
16
+ "bus": [
17
+ "1"
18
+ ]
19
+ },
20
+ "0044fa5fba": {
21
+ "giant_panda": [
22
+ "1"
23
+ ]
24
+ },
25
+ "005a527edd": {
26
+ "ape": [
27
+ "1",
28
+ "2"
29
+ ]
30
+ },
31
+ "0065b171f9": {
32
+ "giant_panda": [
33
+ "1"
34
+ ]
35
+ },
36
+ "00917dcfc4": {
37
+ "zebra": [
38
+ "1",
39
+ "2",
40
+ "3"
41
+ ]
42
+ },
43
+ "00a23ccf53": {
44
+ "shark": [
45
+ "1"
46
+ ]
47
+ },
48
+ "00ad5016a4": {
49
+ "airplane": [
50
+ "1"
51
+ ]
52
+ },
53
+ "01082ae388": {
54
+ "leopard": [
55
+ "1"
56
+ ]
57
+ },
58
+ "011ac0a06f": {
59
+ "ape": [
60
+ "1",
61
+ "2",
62
+ "3",
63
+ "4",
64
+ "5"
65
+ ]
66
+ },
67
+ "013099c098": {
68
+ "giant_panda": [
69
+ "1",
70
+ "2"
71
+ ]
72
+ },
73
+ "0155498c85": {
74
+ "motorbike": [
75
+ "2"
76
+ ],
77
+ "person": [
78
+ "1"
79
+ ]
80
+ },
81
+ "01694ad9c8": {
82
+ "bird": [
83
+ "1"
84
+ ]
85
+ },
86
+ "017ac35701": {
87
+ "giant_panda": [
88
+ "1"
89
+ ]
90
+ },
91
+ "01b80e8e1a": {
92
+ "zebra": [
93
+ "1",
94
+ "2"
95
+ ]
96
+ },
97
+ "01baa5a4e1": {},
98
+ "01c3111683": {
99
+ "whale": [
100
+ "1"
101
+ ]
102
+ },
103
+ "01c4cb5ffe": {
104
+ "person": [
105
+ "1",
106
+ "3"
107
+ ]
108
+ },
109
+ "01c76f0a82": {
110
+ "sedan": [
111
+ "1",
112
+ "4"
113
+ ]
114
+ },
115
+ "01c783268c": {
116
+ "ape": [
117
+ "1"
118
+ ],
119
+ "person": [
120
+ "2"
121
+ ]
122
+ },
123
+ "01e64dd36a": {
124
+ "cow": [
125
+ "1",
126
+ "2",
127
+ "3"
128
+ ]
129
+ },
130
+ "01ed275c6e": {
131
+ "giraffe": [
132
+ "1",
133
+ "2"
134
+ ]
135
+ },
136
+ "01ff60d1fa": {
137
+ "lizard": [
138
+ "1"
139
+ ]
140
+ },
141
+ "020cd28cd2": {
142
+ "person": [
143
+ "1"
144
+ ]
145
+ },
146
+ "02264db755": {
147
+ "fox": [
148
+ "1"
149
+ ]
150
+ },
151
+ "0248626d9a": {
152
+ "train": [
153
+ "1"
154
+ ]
155
+ },
156
+ "02668dbffa": {
157
+ "frog": [
158
+ "1"
159
+ ]
160
+ },
161
+ "0274193026": {
162
+ "person": [
163
+ "2"
164
+ ]
165
+ },
166
+ "02d28375aa": {
167
+ "fox": [
168
+ "1"
169
+ ]
170
+ },
171
+ "031ccc99b1": {
172
+ "person": [
173
+ "1",
174
+ "2",
175
+ "3"
176
+ ]
177
+ },
178
+ "0321b18c10": {
179
+ "elephant": [
180
+ "3"
181
+ ],
182
+ "person": [
183
+ "1",
184
+ "2"
185
+ ]
186
+ },
187
+ "0348a45bca": {
188
+ "fish": [
189
+ "1",
190
+ "2",
191
+ "3",
192
+ "4",
193
+ "5"
194
+ ]
195
+ },
196
+ "0355e92655": {
197
+ "boat": [
198
+ "3"
199
+ ],
200
+ "person": [
201
+ "2"
202
+ ]
203
+ },
204
+ "0358b938c1": {
205
+ "elephant": [
206
+ "1",
207
+ "2",
208
+ "3",
209
+ "4"
210
+ ]
211
+ },
212
+ "0368107cf1": {
213
+ "person": [
214
+ "1",
215
+ "2"
216
+ ]
217
+ },
218
+ "0379ddf557": {
219
+ "person": [
220
+ "1"
221
+ ]
222
+ },
223
+ "038b2cc71d": {
224
+ "lizard": [
225
+ "1"
226
+ ]
227
+ },
228
+ "038c15a5dd": {
229
+ "hedgehog": [
230
+ "1"
231
+ ]
232
+ },
233
+ "03a06cc98a": {
234
+ "giraffe": [
235
+ "1",
236
+ "2",
237
+ "3"
238
+ ]
239
+ },
240
+ "03a63e187f": {
241
+ "lizard": [
242
+ "1"
243
+ ]
244
+ },
245
+ "03c95b4dae": {
246
+ "elephant": [
247
+ "1",
248
+ "2",
249
+ "3"
250
+ ]
251
+ },
252
+ "03e2b57b0e": {
253
+ "lizard": [
254
+ "1"
255
+ ]
256
+ },
257
+ "04194e1248": {
258
+ "lizard": [
259
+ "1"
260
+ ]
261
+ },
262
+ "04259896e2": {
263
+ "lizard": [
264
+ "1"
265
+ ]
266
+ },
267
+ "0444918a5f": {
268
+ "truck": [
269
+ "1",
270
+ "2",
271
+ "3",
272
+ "4"
273
+ ]
274
+ },
275
+ "04460a7a52": {
276
+ "lizard": [
277
+ "1"
278
+ ]
279
+ },
280
+ "04474174a4": {
281
+ "ape": [
282
+ "1",
283
+ "2"
284
+ ]
285
+ },
286
+ "0450095513": {
287
+ "snail": [
288
+ "1"
289
+ ]
290
+ },
291
+ "045f00aed2": {
292
+ "person": [
293
+ "3"
294
+ ],
295
+ "tiger": [
296
+ "1"
297
+ ]
298
+ },
299
+ "04667fabaa": {
300
+ "parrot": [
301
+ "1"
302
+ ]
303
+ },
304
+ "04735c5030": {
305
+ "cat": [
306
+ "1",
307
+ "2"
308
+ ]
309
+ },
310
+ "04990d1915": {
311
+ "sedan": [
312
+ "1"
313
+ ],
314
+ "bus": [
315
+ "2"
316
+ ],
317
+ "truck": [
318
+ "3"
319
+ ]
320
+ },
321
+ "04d62d9d98": {
322
+ "person": [
323
+ "1"
324
+ ]
325
+ },
326
+ "04f21da964": {
327
+ "monkey": [
328
+ "1"
329
+ ]
330
+ },
331
+ "04fbad476e": {
332
+ "parrot": [
333
+ "1"
334
+ ]
335
+ },
336
+ "04fe256562": {
337
+ "motorbike": [
338
+ "1"
339
+ ],
340
+ "truck": [
341
+ "2"
342
+ ]
343
+ },
344
+ "0503bf89c9": {
345
+ "hedgehog": [
346
+ "1"
347
+ ]
348
+ },
349
+ "0536c9eed0": {
350
+ "cat": [
351
+ "1"
352
+ ]
353
+ },
354
+ "054acb238f": {
355
+ "owl": [
356
+ "1"
357
+ ]
358
+ },
359
+ "05579ca250": {
360
+ "sedan": [
361
+ "3"
362
+ ],
363
+ "person": [
364
+ "1"
365
+ ]
366
+ },
367
+ "056c200404": {},
368
+ "05774f3a2c": {
369
+ "ape": [
370
+ "1",
371
+ "2",
372
+ "3"
373
+ ]
374
+ },
375
+ "058a7592c8": {
376
+ "train": [
377
+ "1"
378
+ ]
379
+ },
380
+ "05a0a513df": {
381
+ "person": [
382
+ "1",
383
+ "2"
384
+ ]
385
+ },
386
+ "05a569d8aa": {
387
+ "mouse": [
388
+ "2"
389
+ ],
390
+ "cat": [
391
+ "1"
392
+ ]
393
+ },
394
+ "05aa652648": {
395
+ "ape": [
396
+ "1"
397
+ ]
398
+ },
399
+ "05d7715782": {},
400
+ "05e0b0f28f": {
401
+ "mouse": [
402
+ "1"
403
+ ],
404
+ "person": [
405
+ "2"
406
+ ]
407
+ },
408
+ "05fdbbdd7a": {},
409
+ "05ffcfed85": {
410
+ "monkey": [
411
+ "1",
412
+ "2"
413
+ ]
414
+ },
415
+ "0630391881": {
416
+ "person": [
417
+ "1"
418
+ ]
419
+ },
420
+ "06840b2bbe": {
421
+ "snake": [
422
+ "1"
423
+ ]
424
+ },
425
+ "068f7dce6f": {
426
+ "shark": [
427
+ "1"
428
+ ]
429
+ },
430
+ "0693719753": {
431
+ "turtle": [
432
+ "1",
433
+ "2"
434
+ ]
435
+ },
436
+ "06ce2b51fb": {
437
+ "person": [
438
+ "1",
439
+ "2"
440
+ ]
441
+ },
442
+ "06e224798e": {
443
+ "tiger": [
444
+ "1"
445
+ ]
446
+ },
447
+ "06ee361788": {
448
+ "duck": [
449
+ "1",
450
+ "2",
451
+ "3"
452
+ ]
453
+ },
454
+ "06fbb3fa2c": {
455
+ "eagle": [
456
+ "1"
457
+ ]
458
+ },
459
+ "0700264286": {
460
+ "cow": [
461
+ "1",
462
+ "2"
463
+ ]
464
+ },
465
+ "070c918ca7": {
466
+ "parrot": [
467
+ "1"
468
+ ]
469
+ },
470
+ "07129e14a4": {
471
+ "parrot": [
472
+ "1",
473
+ "2"
474
+ ],
475
+ "person": [
476
+ "3"
477
+ ]
478
+ },
479
+ "07177017e9": {
480
+ "motorbike": [
481
+ "1",
482
+ "2"
483
+ ]
484
+ },
485
+ "07238ffc58": {
486
+ "monkey": [
487
+ "1",
488
+ "2",
489
+ "3"
490
+ ]
491
+ },
492
+ "07353b2a89": {
493
+ "sheep": [
494
+ "1",
495
+ "2",
496
+ "3",
497
+ "4"
498
+ ]
499
+ },
500
+ "0738493cbf": {
501
+ "airplane": [
502
+ "1"
503
+ ]
504
+ },
505
+ "075926c651": {
506
+ "person": [
507
+ "1",
508
+ "2"
509
+ ]
510
+ },
511
+ "075c701292": {
512
+ "duck": [
513
+ "1",
514
+ "2",
515
+ "3",
516
+ "4"
517
+ ]
518
+ },
519
+ "0762ea9a30": {
520
+ "person": [
521
+ "1"
522
+ ]
523
+ },
524
+ "07652ee4af": {
525
+ "person": [
526
+ "1"
527
+ ]
528
+ },
529
+ "076f206928": {
530
+ "zebra": [
531
+ "1",
532
+ "2"
533
+ ],
534
+ "person": [
535
+ "3"
536
+ ]
537
+ },
538
+ "077d32af19": {
539
+ "train": [
540
+ "4"
541
+ ],
542
+ "person": [
543
+ "1",
544
+ "2",
545
+ "3"
546
+ ]
547
+ },
548
+ "079049275c": {
549
+ "mouse": [
550
+ "1"
551
+ ]
552
+ },
553
+ "07913cdda7": {
554
+ "train": [
555
+ "1"
556
+ ],
557
+ "person": [
558
+ "2",
559
+ "3"
560
+ ]
561
+ },
562
+ "07a11a35e8": {
563
+ "ape": [
564
+ "1",
565
+ "2"
566
+ ]
567
+ },
568
+ "07ac33b6df": {
569
+ "ape": [
570
+ "1"
571
+ ]
572
+ },
573
+ "07c62c3d11": {
574
+ "parrot": [
575
+ "1",
576
+ "2",
577
+ "3"
578
+ ]
579
+ },
580
+ "07cc1c7d74": {
581
+ "snake": [
582
+ "1"
583
+ ]
584
+ },
585
+ "080196ef01": {
586
+ "lizard": [
587
+ "1"
588
+ ]
589
+ },
590
+ "081207976e": {},
591
+ "081ae4fa44": {
592
+ "shark": [
593
+ "1",
594
+ "2"
595
+ ]
596
+ },
597
+ "081d8250cb": {
598
+ "sedan": [
599
+ "3"
600
+ ],
601
+ "person": [
602
+ "1"
603
+ ]
604
+ },
605
+ "082900c5d4": {
606
+ "duck": [
607
+ "1",
608
+ "2",
609
+ "3"
610
+ ]
611
+ },
612
+ "0860df21e2": {},
613
+ "0866d4c5e3": {
614
+ "bird": [
615
+ "1",
616
+ "2",
617
+ "3"
618
+ ]
619
+ },
620
+ "0891ac2eb6": {
621
+ "person": [
622
+ "1",
623
+ "2",
624
+ "3"
625
+ ]
626
+ },
627
+ "08931bc458": {
628
+ "person": [
629
+ "1"
630
+ ]
631
+ },
632
+ "08aa2705d5": {
633
+ "snake": [
634
+ "1"
635
+ ]
636
+ },
637
+ "08c8450db7": {},
638
+ "08d50b926c": {
639
+ "turtle": [
640
+ "1",
641
+ "2"
642
+ ]
643
+ },
644
+ "08e1e4de15": {
645
+ "monkey": [
646
+ "1",
647
+ "2",
648
+ "3",
649
+ "4"
650
+ ]
651
+ },
652
+ "08e48c1a48": {
653
+ "cow": [
654
+ "1"
655
+ ]
656
+ },
657
+ "08f561c65e": {
658
+ "person": [
659
+ "2"
660
+ ],
661
+ "giant_panda": [
662
+ "1"
663
+ ]
664
+ },
665
+ "08feb87790": {
666
+ "sheep": [
667
+ "1"
668
+ ]
669
+ },
670
+ "09049f6fe3": {
671
+ "mouse": [
672
+ "1",
673
+ "2"
674
+ ]
675
+ },
676
+ "092e4ff450": {
677
+ "snake": [
678
+ "1"
679
+ ]
680
+ },
681
+ "09338adea8": {
682
+ "whale": [
683
+ "1",
684
+ "2"
685
+ ]
686
+ },
687
+ "093c335ccc": {
688
+ "person": [
689
+ "2"
690
+ ]
691
+ },
692
+ "0970d28339": {
693
+ "ape": [
694
+ "1",
695
+ "2"
696
+ ]
697
+ },
698
+ "0974a213dc": {
699
+ "giraffe": [
700
+ "1",
701
+ "2",
702
+ "3"
703
+ ]
704
+ },
705
+ "097b471ed8": {
706
+ "cat": [
707
+ "1",
708
+ "2"
709
+ ]
710
+ },
711
+ "0990941758": {
712
+ "giant_panda": [
713
+ "1"
714
+ ]
715
+ },
716
+ "09a348f4fa": {
717
+ "lizard": [
718
+ "1"
719
+ ]
720
+ },
721
+ "09a6841288": {
722
+ "duck": [
723
+ "1",
724
+ "2"
725
+ ]
726
+ },
727
+ "09c5bad17b": {
728
+ "airplane": [
729
+ "1"
730
+ ]
731
+ },
732
+ "09c9ce80c7": {
733
+ "giant_panda": [
734
+ "1"
735
+ ]
736
+ },
737
+ "09ff54fef4": {
738
+ "fox": [
739
+ "1",
740
+ "2"
741
+ ]
742
+ },
743
+ "0a23765d15": {
744
+ "person": [
745
+ "1",
746
+ "2"
747
+ ]
748
+ },
749
+ "0a275e7f12": {
750
+ "elephant": [
751
+ "1"
752
+ ]
753
+ },
754
+ "0a2f2bd294": {
755
+ "motorbike": [
756
+ "1"
757
+ ]
758
+ },
759
+ "0a7a2514aa": {
760
+ "cat": [
761
+ "1"
762
+ ],
763
+ "lizard": [
764
+ "2"
765
+ ]
766
+ },
767
+ "0a7b27fde9": {
768
+ "parrot": [
769
+ "1",
770
+ "2"
771
+ ]
772
+ },
773
+ "0a8c467cc3": {
774
+ "fish": [
775
+ "1",
776
+ "2",
777
+ "3"
778
+ ]
779
+ },
780
+ "0ac8c560ae": {
781
+ "person": [
782
+ "2",
783
+ "3"
784
+ ]
785
+ },
786
+ "0b1627e896": {
787
+ "boat": [
788
+ "1"
789
+ ]
790
+ },
791
+ "0b285c47f6": {
792
+ "mouse": [
793
+ "1"
794
+ ]
795
+ },
796
+ "0b34ec1d55": {
797
+ "ape": [
798
+ "1"
799
+ ]
800
+ },
801
+ "0b5b5e8e5a": {
802
+ "sedan": [
803
+ "2"
804
+ ],
805
+ "person": [
806
+ "1"
807
+ ]
808
+ },
809
+ "0b68535614": {
810
+ "rabbit": [
811
+ "1"
812
+ ]
813
+ },
814
+ "0b6f9105fc": {
815
+ "rabbit": [
816
+ "1"
817
+ ]
818
+ },
819
+ "0b7dbfa3cb": {
820
+ "cow": [
821
+ "1"
822
+ ]
823
+ },
824
+ "0b9cea51ca": {
825
+ "whale": [
826
+ "1"
827
+ ]
828
+ },
829
+ "0b9d012be8": {
830
+ "camel": [
831
+ "1"
832
+ ]
833
+ },
834
+ "0bcfc4177d": {
835
+ "truck": [
836
+ "1"
837
+ ]
838
+ },
839
+ "0bd37b23c1": {
840
+ "motorbike": [
841
+ "1"
842
+ ]
843
+ },
844
+ "0bd864064c": {
845
+ "eagle": [
846
+ "1"
847
+ ]
848
+ },
849
+ "0c11c6bf7b": {
850
+ "deer": [
851
+ "1"
852
+ ]
853
+ },
854
+ "0c26bc77ac": {
855
+ "crocodile": [
856
+ "1"
857
+ ]
858
+ },
859
+ "0c3a04798c": {
860
+ "fish": [
861
+ "2"
862
+ ],
863
+ "duck": [
864
+ "1"
865
+ ]
866
+ },
867
+ "0c44a9d545": {
868
+ "tiger": [
869
+ "1"
870
+ ]
871
+ },
872
+ "0c817cc390": {
873
+ "hedgehog": [
874
+ "1"
875
+ ],
876
+ "dog": [
877
+ "2"
878
+ ]
879
+ },
880
+ "0ca839ee9a": {
881
+ "ape": [
882
+ "1",
883
+ "2"
884
+ ]
885
+ },
886
+ "0cd7ac0ac0": {
887
+ "rabbit": [
888
+ "1"
889
+ ]
890
+ },
891
+ "0ce06e0121": {
892
+ "parrot": [
893
+ "1",
894
+ "2"
895
+ ]
896
+ },
897
+ "0cfe974a89": {
898
+ "turtle": [
899
+ "1",
900
+ "2"
901
+ ]
902
+ },
903
+ "0d2fcc0dcd": {
904
+ "zebra": [
905
+ "1",
906
+ "2",
907
+ "3",
908
+ "4"
909
+ ]
910
+ },
911
+ "0d3aad05d2": {
912
+ "person": [
913
+ "1"
914
+ ]
915
+ },
916
+ "0d40b015f4": {
917
+ "person": [
918
+ "1"
919
+ ]
920
+ },
921
+ "0d97fba242": {
922
+ "dog": [
923
+ "1"
924
+ ],
925
+ "person": [
926
+ "2"
927
+ ]
928
+ },
929
+ "0d9cc80d7e": {
930
+ "person": [
931
+ "1",
932
+ "2",
933
+ "3"
934
+ ]
935
+ },
936
+ "0dab85b6d3": {
937
+ "lizard": [
938
+ "1",
939
+ "2"
940
+ ]
941
+ },
942
+ "0db5c427a5": {
943
+ "train": [
944
+ "1"
945
+ ]
946
+ },
947
+ "0dbaf284f1": {
948
+ "cat": [
949
+ "1",
950
+ "2"
951
+ ]
952
+ },
953
+ "0de4923598": {},
954
+ "0df28a9101": {
955
+ "turtle": [
956
+ "1",
957
+ "2",
958
+ "3"
959
+ ]
960
+ },
961
+ "0e04f636c4": {
962
+ "frog": [
963
+ "1"
964
+ ]
965
+ },
966
+ "0e05f0e232": {
967
+ "lizard": [
968
+ "1",
969
+ "2"
970
+ ]
971
+ },
972
+ "0e0930474b": {
973
+ "sedan": [
974
+ "1"
975
+ ],
976
+ "person": [
977
+ "2",
978
+ "3"
979
+ ]
980
+ },
981
+ "0e27472bea": {
982
+ "turtle": [
983
+ "1"
984
+ ]
985
+ },
986
+ "0e30020549": {
987
+ "parrot": [
988
+ "1"
989
+ ]
990
+ },
991
+ "0e621feb6c": {
992
+ "lizard": [
993
+ "1",
994
+ "2"
995
+ ]
996
+ },
997
+ "0e803c7d73": {},
998
+ "0e9ebe4e3c": {
999
+ "truck": [
1000
+ "1"
1001
+ ]
1002
+ },
1003
+ "0e9f2785ec": {
1004
+ "person": [
1005
+ "2"
1006
+ ]
1007
+ },
1008
+ "0ea68d418b": {
1009
+ "airplane": [
1010
+ "1"
1011
+ ]
1012
+ },
1013
+ "0eb403a222": {},
1014
+ "0ee92053d6": {
1015
+ "person": [
1016
+ "1"
1017
+ ]
1018
+ },
1019
+ "0eefca067f": {
1020
+ "giant_panda": [
1021
+ "1",
1022
+ "2"
1023
+ ]
1024
+ },
1025
+ "0f17fa6fcb": {
1026
+ "duck": [
1027
+ "1",
1028
+ "2",
1029
+ "3"
1030
+ ]
1031
+ },
1032
+ "0f1ac8e9a3": {
1033
+ "frog": [
1034
+ "1"
1035
+ ]
1036
+ },
1037
+ "0f202e9852": {
1038
+ "parrot": [
1039
+ "1"
1040
+ ]
1041
+ },
1042
+ "0f2ab8b1ff": {
1043
+ "dolphin": [
1044
+ "1",
1045
+ "2",
1046
+ "3"
1047
+ ]
1048
+ },
1049
+ "0f51a78756": {
1050
+ "sheep": [
1051
+ "1"
1052
+ ]
1053
+ },
1054
+ "0f5fbe16b0": {
1055
+ "raccoon": [
1056
+ "1",
1057
+ "2"
1058
+ ]
1059
+ },
1060
+ "0f6072077b": {
1061
+ "person": [
1062
+ "1",
1063
+ "2",
1064
+ "3"
1065
+ ]
1066
+ },
1067
+ "0f6b69b2f4": {
1068
+ "rabbit": [
1069
+ "1"
1070
+ ]
1071
+ },
1072
+ "0f6c2163de": {
1073
+ "snail": [
1074
+ "1"
1075
+ ]
1076
+ },
1077
+ "0f74ec5599": {
1078
+ "giant_panda": [
1079
+ "1"
1080
+ ]
1081
+ },
1082
+ "0f9683715b": {
1083
+ "elephant": [
1084
+ "1"
1085
+ ]
1086
+ },
1087
+ "0fa7b59356": {
1088
+ "duck": [
1089
+ "1"
1090
+ ]
1091
+ },
1092
+ "0fb173695b": {
1093
+ "person": [
1094
+ "3"
1095
+ ]
1096
+ },
1097
+ "0fc958cde2": {
1098
+ "owl": [
1099
+ "1"
1100
+ ]
1101
+ },
1102
+ "0fe7b1a621": {
1103
+ "parrot": [
1104
+ "1"
1105
+ ]
1106
+ },
1107
+ "0ffcdb491c": {
1108
+ "person": [
1109
+ "1",
1110
+ "2",
1111
+ "3"
1112
+ ]
1113
+ },
1114
+ "101caff7d4": {
1115
+ "giant_panda": [
1116
+ "1",
1117
+ "2"
1118
+ ]
1119
+ },
1120
+ "1022fe8417": {
1121
+ "person": [
1122
+ "1",
1123
+ "2",
1124
+ "3"
1125
+ ]
1126
+ },
1127
+ "1032e80b37": {
1128
+ "giraffe": [
1129
+ "1"
1130
+ ]
1131
+ },
1132
+ "103f501680": {
1133
+ "fish": [
1134
+ "1"
1135
+ ]
1136
+ },
1137
+ "104e64565f": {
1138
+ "elephant": [
1139
+ "1"
1140
+ ]
1141
+ },
1142
+ "104f1ab997": {
1143
+ "person": [
1144
+ "1",
1145
+ "2",
1146
+ "3"
1147
+ ]
1148
+ },
1149
+ "106242403f": {
1150
+ "person": [
1151
+ "1",
1152
+ "2"
1153
+ ]
1154
+ },
1155
+ "10b31f5431": {
1156
+ "person": [
1157
+ "1",
1158
+ "3",
1159
+ "4"
1160
+ ]
1161
+ },
1162
+ "10eced835e": {
1163
+ "giant_panda": [
1164
+ "1",
1165
+ "2"
1166
+ ]
1167
+ },
1168
+ "110d26fa3a": {
1169
+ "shark": [
1170
+ "1"
1171
+ ]
1172
+ },
1173
+ "1122c1d16a": {
1174
+ "parrot": [
1175
+ "1",
1176
+ "2",
1177
+ "3",
1178
+ "4",
1179
+ "5"
1180
+ ],
1181
+ "person": [
1182
+ "6"
1183
+ ]
1184
+ },
1185
+ "1145b49a5f": {
1186
+ "rabbit": [
1187
+ "1"
1188
+ ]
1189
+ },
1190
+ "11485838c2": {
1191
+ "giraffe": [
1192
+ "1",
1193
+ "2",
1194
+ "3"
1195
+ ]
1196
+ },
1197
+ "114e7676ec": {
1198
+ "person": [
1199
+ "1"
1200
+ ]
1201
+ },
1202
+ "1157472b95": {
1203
+ "parrot": [
1204
+ "1",
1205
+ "2"
1206
+ ]
1207
+ },
1208
+ "115ee1072c": {
1209
+ "cow": [
1210
+ "1"
1211
+ ]
1212
+ },
1213
+ "1171141012": {
1214
+ "turtle": [
1215
+ "1"
1216
+ ],
1217
+ "person": [
1218
+ "2"
1219
+ ]
1220
+ },
1221
+ "117757b4b8": {
1222
+ "snail": [
1223
+ "1"
1224
+ ]
1225
+ },
1226
+ "1178932d2f": {
1227
+ "motorbike": [
1228
+ "3"
1229
+ ],
1230
+ "person": [
1231
+ "1",
1232
+ "2"
1233
+ ]
1234
+ },
1235
+ "117cc76bda": {
1236
+ "whale": [
1237
+ "1"
1238
+ ]
1239
+ },
1240
+ "1180cbf814": {
1241
+ "fish": [
1242
+ "1",
1243
+ "2"
1244
+ ]
1245
+ },
1246
+ "1187bbd0e3": {
1247
+ "cat": [
1248
+ "1"
1249
+ ]
1250
+ },
1251
+ "1197e44b26": {
1252
+ "giant_panda": [
1253
+ "1"
1254
+ ]
1255
+ },
1256
+ "119cf20728": {
1257
+ "lizard": [
1258
+ "1"
1259
+ ]
1260
+ },
1261
+ "119dd54871": {
1262
+ "lion": [
1263
+ "1",
1264
+ "2"
1265
+ ]
1266
+ },
1267
+ "11a0c3b724": {
1268
+ "mouse": [
1269
+ "1",
1270
+ "2"
1271
+ ]
1272
+ },
1273
+ "11a6ba8c94": {
1274
+ "person": [
1275
+ "1",
1276
+ "2"
1277
+ ]
1278
+ },
1279
+ "11c722a456": {
1280
+ "turtle": [
1281
+ "1",
1282
+ "2"
1283
+ ]
1284
+ },
1285
+ "11cbcb0b4d": {
1286
+ "zebra": [
1287
+ "1"
1288
+ ]
1289
+ },
1290
+ "11ccf5e99d": {
1291
+ "person": [
1292
+ "2"
1293
+ ]
1294
+ },
1295
+ "11ce6f452e": {
1296
+ "person": [
1297
+ "1",
1298
+ "2",
1299
+ "3"
1300
+ ]
1301
+ },
1302
+ "11feabe596": {
1303
+ "rabbit": [
1304
+ "1"
1305
+ ]
1306
+ },
1307
+ "120cb9514d": {
1308
+ "person": [
1309
+ "1",
1310
+ "2",
1311
+ "3"
1312
+ ]
1313
+ },
1314
+ "12156b25b3": {
1315
+ "person": [
1316
+ "1"
1317
+ ]
1318
+ },
1319
+ "122896672d": {
1320
+ "person": [
1321
+ "1",
1322
+ "3"
1323
+ ]
1324
+ },
1325
+ "1233ac8596": {
1326
+ "dog": [
1327
+ "1"
1328
+ ]
1329
+ },
1330
+ "1239c87234": {
1331
+ "lizard": [
1332
+ "1"
1333
+ ]
1334
+ },
1335
+ "1250423f7c": {
1336
+ "elephant": [
1337
+ "3",
1338
+ "4"
1339
+ ],
1340
+ "person": [
1341
+ "2"
1342
+ ]
1343
+ },
1344
+ "1257a1bc67": {
1345
+ "snake": [
1346
+ "1"
1347
+ ]
1348
+ },
1349
+ "125d1b19dd": {
1350
+ "giant_panda": [
1351
+ "1",
1352
+ "2"
1353
+ ]
1354
+ },
1355
+ "126d203967": {
1356
+ "person": [
1357
+ "2"
1358
+ ]
1359
+ },
1360
+ "1295e19071": {
1361
+ "airplane": [
1362
+ "1"
1363
+ ]
1364
+ },
1365
+ "12ad198c54": {
1366
+ "person": [
1367
+ "1"
1368
+ ]
1369
+ },
1370
+ "12bddb2bcb": {
1371
+ "person": [
1372
+ "2"
1373
+ ]
1374
+ },
1375
+ "12ec9b93ee": {
1376
+ "giant_panda": [
1377
+ "1"
1378
+ ]
1379
+ },
1380
+ "12eebedc35": {
1381
+ "bird": [
1382
+ "1"
1383
+ ]
1384
+ },
1385
+ "132852e094": {
1386
+ "fox": [
1387
+ "1"
1388
+ ]
1389
+ },
1390
+ "1329409f2a": {
1391
+ "fish": [
1392
+ "1"
1393
+ ]
1394
+ },
1395
+ "13325cfa14": {
1396
+ "person": [
1397
+ "2"
1398
+ ]
1399
+ },
1400
+ "1336440745": {
1401
+ "mouse": [
1402
+ "1",
1403
+ "2"
1404
+ ]
1405
+ },
1406
+ "134d06dbf9": {
1407
+ "cat": [
1408
+ "1"
1409
+ ]
1410
+ },
1411
+ "135625b53d": {
1412
+ "parrot": [
1413
+ "1"
1414
+ ]
1415
+ },
1416
+ "13870016f9": {
1417
+ "cow": [
1418
+ "2",
1419
+ "3"
1420
+ ],
1421
+ "person": [
1422
+ "1"
1423
+ ]
1424
+ },
1425
+ "13960b3c84": {
1426
+ "giraffe": [
1427
+ "1",
1428
+ "2",
1429
+ "3"
1430
+ ]
1431
+ },
1432
+ "13adaad9d9": {
1433
+ "giant_panda": [
1434
+ "1"
1435
+ ]
1436
+ },
1437
+ "13ae097e20": {
1438
+ "giant_panda": [
1439
+ "1"
1440
+ ]
1441
+ },
1442
+ "13e3070469": {
1443
+ "zebra": [
1444
+ "1",
1445
+ "2",
1446
+ "3"
1447
+ ]
1448
+ },
1449
+ "13f6a8c20d": {
1450
+ "fish": [
1451
+ "1"
1452
+ ]
1453
+ },
1454
+ "1416925cf2": {
1455
+ "truck": [
1456
+ "1",
1457
+ "2"
1458
+ ]
1459
+ },
1460
+ "142d2621f5": {
1461
+ "motorbike": [
1462
+ "3"
1463
+ ],
1464
+ "person": [
1465
+ "1",
1466
+ "2"
1467
+ ]
1468
+ },
1469
+ "145d5d7c03": {
1470
+ "giant_panda": [
1471
+ "1"
1472
+ ]
1473
+ },
1474
+ "145fdc3ac5": {
1475
+ "lizard": [
1476
+ "1"
1477
+ ]
1478
+ },
1479
+ "1471274fa7": {
1480
+ "person": [
1481
+ "1"
1482
+ ]
1483
+ },
1484
+ "14a6b5a139": {
1485
+ "fish": [
1486
+ "1"
1487
+ ]
1488
+ },
1489
+ "14c21cea0d": {
1490
+ "monkey": [
1491
+ "1",
1492
+ "2"
1493
+ ]
1494
+ },
1495
+ "14dae0dc93": {
1496
+ "person": [
1497
+ "2"
1498
+ ]
1499
+ },
1500
+ "14f9bd22b5": {
1501
+ "tiger": [
1502
+ "1"
1503
+ ]
1504
+ },
1505
+ "14fd28ae99": {
1506
+ "parrot": [
1507
+ "1"
1508
+ ]
1509
+ },
1510
+ "15097d5d4e": {
1511
+ "parrot": [
1512
+ "1"
1513
+ ]
1514
+ },
1515
+ "150ea711f2": {
1516
+ "whale": [
1517
+ "1"
1518
+ ]
1519
+ },
1520
+ "1514e3563f": {
1521
+ "earless_seal": [
1522
+ "1",
1523
+ "2"
1524
+ ]
1525
+ },
1526
+ "152aaa3a9e": {
1527
+ "raccoon": [
1528
+ "1"
1529
+ ]
1530
+ },
1531
+ "152b7d3bd7": {
1532
+ "giant_panda": [
1533
+ "1"
1534
+ ]
1535
+ },
1536
+ "15617297cc": {
1537
+ "person": [
1538
+ "1"
1539
+ ]
1540
+ },
1541
+ "15abbe0c52": {
1542
+ "person": [
1543
+ "1"
1544
+ ]
1545
+ },
1546
+ "15d1fb3de5": {
1547
+ "owl": [
1548
+ "1"
1549
+ ],
1550
+ "cat": [
1551
+ "2"
1552
+ ]
1553
+ },
1554
+ "15f67b0fab": {
1555
+ "person": [
1556
+ "1"
1557
+ ]
1558
+ },
1559
+ "161eb59aad": {
1560
+ "giraffe": [
1561
+ "1"
1562
+ ],
1563
+ "cow": [
1564
+ "2",
1565
+ "3"
1566
+ ]
1567
+ },
1568
+ "16288ea47f": {
1569
+ "duck": [
1570
+ "1",
1571
+ "2"
1572
+ ]
1573
+ },
1574
+ "164410ce62": {
1575
+ "person": [
1576
+ "1"
1577
+ ]
1578
+ },
1579
+ "165c3c8cd4": {
1580
+ "person": [
1581
+ "1",
1582
+ "2",
1583
+ "3"
1584
+ ]
1585
+ },
1586
+ "165c42b41b": {
1587
+ "motorbike": [
1588
+ "2",
1589
+ "3"
1590
+ ],
1591
+ "person": [
1592
+ "1",
1593
+ "4"
1594
+ ]
1595
+ },
1596
+ "165ec9e22b": {
1597
+ "person": [
1598
+ "1",
1599
+ "2"
1600
+ ]
1601
+ },
1602
+ "1669502269": {
1603
+ "person": [
1604
+ "1"
1605
+ ]
1606
+ },
1607
+ "16763cccbb": {
1608
+ "ape": [
1609
+ "1"
1610
+ ]
1611
+ },
1612
+ "16adde065e": {
1613
+ "cat": [
1614
+ "2"
1615
+ ],
1616
+ "person": [
1617
+ "3"
1618
+ ]
1619
+ },
1620
+ "16af445362": {
1621
+ "airplane": [
1622
+ "1"
1623
+ ]
1624
+ },
1625
+ "16afd538ad": {
1626
+ "parrot": [
1627
+ "1",
1628
+ "2"
1629
+ ]
1630
+ },
1631
+ "16c3fa4d5d": {
1632
+ "sedan": [
1633
+ "1"
1634
+ ]
1635
+ },
1636
+ "16d1d65c27": {
1637
+ "monkey": [
1638
+ "1"
1639
+ ]
1640
+ },
1641
+ "16e8599e94": {
1642
+ "giant_panda": [
1643
+ "1"
1644
+ ]
1645
+ },
1646
+ "16fe9fb444": {
1647
+ "motorbike": [
1648
+ "1"
1649
+ ],
1650
+ "person": [
1651
+ "2"
1652
+ ]
1653
+ },
1654
+ "1705796b02": {
1655
+ "train": [
1656
+ "1"
1657
+ ]
1658
+ },
1659
+ "1724db7671": {
1660
+ "giant_panda": [
1661
+ "1"
1662
+ ]
1663
+ },
1664
+ "17418e81ea": {
1665
+ "shark": [
1666
+ "1"
1667
+ ]
1668
+ },
1669
+ "175169edbb": {
1670
+ "ape": [
1671
+ "1",
1672
+ "2"
1673
+ ]
1674
+ },
1675
+ "17622326fd": {
1676
+ "lizard": [
1677
+ "1"
1678
+ ]
1679
+ },
1680
+ "17656bae77": {
1681
+ "elephant": [
1682
+ "1"
1683
+ ]
1684
+ },
1685
+ "17b0d94172": {
1686
+ "airplane": [
1687
+ "1"
1688
+ ]
1689
+ },
1690
+ "17c220e4f6": {
1691
+ "giant_panda": [
1692
+ "1"
1693
+ ]
1694
+ },
1695
+ "17c7bcd146": {
1696
+ "train": [
1697
+ "1"
1698
+ ]
1699
+ },
1700
+ "17cb4afe89": {
1701
+ "tiger": [
1702
+ "1"
1703
+ ]
1704
+ },
1705
+ "17cd79a434": {
1706
+ "squirrel": [
1707
+ "1"
1708
+ ]
1709
+ },
1710
+ "17d18604c3": {
1711
+ "person": [
1712
+ "1",
1713
+ "2"
1714
+ ]
1715
+ },
1716
+ "17d8ca1a37": {
1717
+ "owl": [
1718
+ "1"
1719
+ ],
1720
+ "person": [
1721
+ "2"
1722
+ ]
1723
+ },
1724
+ "17e33f4330": {
1725
+ "monkey": [
1726
+ "1"
1727
+ ]
1728
+ },
1729
+ "17f7a6d805": {
1730
+ "snail": [
1731
+ "1"
1732
+ ]
1733
+ },
1734
+ "180abc8378": {
1735
+ "owl": [
1736
+ "1"
1737
+ ],
1738
+ "person": [
1739
+ "2"
1740
+ ]
1741
+ },
1742
+ "183ba3d652": {
1743
+ "motorbike": [
1744
+ "3"
1745
+ ],
1746
+ "person": [
1747
+ "2"
1748
+ ]
1749
+ },
1750
+ "185bf64702": {
1751
+ "zebra": [
1752
+ "1",
1753
+ "2"
1754
+ ]
1755
+ },
1756
+ "18913cc690": {
1757
+ "train": [
1758
+ "1"
1759
+ ]
1760
+ },
1761
+ "1892651815": {
1762
+ "camel": [
1763
+ "1"
1764
+ ]
1765
+ },
1766
+ "189ac8208a": {
1767
+ "giraffe": [
1768
+ "1",
1769
+ "2"
1770
+ ]
1771
+ },
1772
+ "189b44e92c": {
1773
+ "zebra": [
1774
+ "1"
1775
+ ]
1776
+ },
1777
+ "18ac264b76": {
1778
+ "person": [
1779
+ "2"
1780
+ ]
1781
+ },
1782
+ "18b245ab49": {
1783
+ "penguin": [
1784
+ "1",
1785
+ "2",
1786
+ "3",
1787
+ "4"
1788
+ ]
1789
+ },
1790
+ "18b5cebc34": {
1791
+ "mouse": [
1792
+ "1"
1793
+ ]
1794
+ },
1795
+ "18bad52083": {
1796
+ "parrot": [
1797
+ "1",
1798
+ "2"
1799
+ ]
1800
+ },
1801
+ "18bb5144d5": {
1802
+ "lizard": [
1803
+ "1"
1804
+ ]
1805
+ },
1806
+ "18c6f205c5": {
1807
+ "person": [
1808
+ "1",
1809
+ "2",
1810
+ "3"
1811
+ ]
1812
+ },
1813
+ "1903f9ea15": {
1814
+ "bird": [
1815
+ "1",
1816
+ "2",
1817
+ "3"
1818
+ ]
1819
+ },
1820
+ "1917b209f2": {
1821
+ "cow": [
1822
+ "3",
1823
+ "4"
1824
+ ],
1825
+ "horse": [
1826
+ "2"
1827
+ ],
1828
+ "person": [
1829
+ "1"
1830
+ ]
1831
+ },
1832
+ "191e74c01d": {
1833
+ "deer": [
1834
+ "1"
1835
+ ]
1836
+ },
1837
+ "19367bb94e": {
1838
+ "fish": [
1839
+ "1",
1840
+ "2",
1841
+ "3"
1842
+ ]
1843
+ },
1844
+ "193ffaa217": {
1845
+ "person": [
1846
+ "1",
1847
+ "2",
1848
+ "3"
1849
+ ]
1850
+ },
1851
+ "19696b67d3": {
1852
+ "cow": [
1853
+ "1"
1854
+ ]
1855
+ },
1856
+ "197f3ab6f3": {
1857
+ "giant_panda": [
1858
+ "1"
1859
+ ]
1860
+ },
1861
+ "1981e763cc": {
1862
+ "sheep": [
1863
+ "1",
1864
+ "2"
1865
+ ]
1866
+ },
1867
+ "198afe39ae": {
1868
+ "person": [
1869
+ "1"
1870
+ ]
1871
+ },
1872
+ "19a6e62b9b": {
1873
+ "monkey": [
1874
+ "1",
1875
+ "2"
1876
+ ]
1877
+ },
1878
+ "19b60d5335": {
1879
+ "hedgehog": [
1880
+ "1"
1881
+ ]
1882
+ },
1883
+ "19c00c11f9": {
1884
+ "person": [
1885
+ "1"
1886
+ ]
1887
+ },
1888
+ "19e061eb88": {
1889
+ "boat": [
1890
+ "1",
1891
+ "2"
1892
+ ]
1893
+ },
1894
+ "19e8bc6178": {
1895
+ "dog": [
1896
+ "1"
1897
+ ]
1898
+ },
1899
+ "19ee80dac6": {
1900
+ "person": [
1901
+ "1",
1902
+ "3",
1903
+ "4"
1904
+ ]
1905
+ },
1906
+ "1a25a9170a": {
1907
+ "person": [
1908
+ "2",
1909
+ "3"
1910
+ ],
1911
+ "cow": [
1912
+ "1"
1913
+ ]
1914
+ },
1915
+ "1a359a6c1a": {
1916
+ "sheep": [
1917
+ "1"
1918
+ ]
1919
+ },
1920
+ "1a3e87c566": {
1921
+ "frog": [
1922
+ "1"
1923
+ ]
1924
+ },
1925
+ "1a5fe06b00": {
1926
+ "bus": [
1927
+ "1"
1928
+ ]
1929
+ },
1930
+ "1a6c0fbd1e": {
1931
+ "person": [
1932
+ "1"
1933
+ ]
1934
+ },
1935
+ "1a6f3b5a4b": {
1936
+ "sedan": [
1937
+ "3"
1938
+ ]
1939
+ },
1940
+ "1a8afbad92": {
1941
+ "zebra": [
1942
+ "1",
1943
+ "2",
1944
+ "3"
1945
+ ]
1946
+ },
1947
+ "1a8bdc5842": {
1948
+ "parrot": [
1949
+ "1",
1950
+ "2"
1951
+ ]
1952
+ },
1953
+ "1a95752aca": {
1954
+ "duck": [
1955
+ "1",
1956
+ "2"
1957
+ ]
1958
+ },
1959
+ "1a9c131cb7": {
1960
+ "ape": [
1961
+ "1",
1962
+ "2",
1963
+ "3"
1964
+ ]
1965
+ },
1966
+ "1aa3da3ee3": {
1967
+ "sheep": [
1968
+ "1",
1969
+ "2",
1970
+ "3",
1971
+ "4"
1972
+ ]
1973
+ },
1974
+ "1ab27ec7ea": {
1975
+ "deer": [
1976
+ "1"
1977
+ ]
1978
+ },
1979
+ "1abf16d21d": {
1980
+ "turtle": [
1981
+ "1"
1982
+ ]
1983
+ },
1984
+ "1acd0f993b": {
1985
+ "dog": [
1986
+ "1"
1987
+ ],
1988
+ "person": [
1989
+ "3"
1990
+ ]
1991
+ },
1992
+ "1ad202e499": {
1993
+ "lizard": [
1994
+ "1",
1995
+ "2"
1996
+ ]
1997
+ },
1998
+ "1af8d2395d": {
1999
+ "airplane": [
2000
+ "4"
2001
+ ],
2002
+ "person": [
2003
+ "1",
2004
+ "2"
2005
+ ]
2006
+ },
2007
+ "1afd39a1fa": {
2008
+ "motorbike": [
2009
+ "2"
2010
+ ]
2011
+ },
2012
+ "1b2d31306f": {
2013
+ "lizard": [
2014
+ "1"
2015
+ ]
2016
+ },
2017
+ "1b3fa67f0e": {
2018
+ "airplane": [
2019
+ "1"
2020
+ ]
2021
+ },
2022
+ "1b43fa74b4": {
2023
+ "owl": [
2024
+ "1",
2025
+ "2"
2026
+ ]
2027
+ },
2028
+ "1b73ea9fc2": {
2029
+ "parrot": [
2030
+ "1"
2031
+ ]
2032
+ },
2033
+ "1b7e8bb255": {
2034
+ "person": [
2035
+ "2"
2036
+ ]
2037
+ },
2038
+ "1b8680f8cd": {
2039
+ "person": [
2040
+ "2",
2041
+ "3"
2042
+ ]
2043
+ },
2044
+ "1b883843c0": {
2045
+ "person": [
2046
+ "1",
2047
+ "2"
2048
+ ]
2049
+ },
2050
+ "1b8898785b": {
2051
+ "monkey": [
2052
+ "1",
2053
+ "2"
2054
+ ]
2055
+ },
2056
+ "1b88ba1aa4": {
2057
+ "giant_panda": [
2058
+ "1"
2059
+ ]
2060
+ },
2061
+ "1b96a498e5": {
2062
+ "ape": [
2063
+ "1"
2064
+ ]
2065
+ },
2066
+ "1bbc4c274f": {
2067
+ "fish": [
2068
+ "2"
2069
+ ]
2070
+ },
2071
+ "1bd87fe9ab": {
2072
+ "train": [
2073
+ "1"
2074
+ ]
2075
+ },
2076
+ "1c4090c75b": {
2077
+ "whale": [
2078
+ "1"
2079
+ ]
2080
+ },
2081
+ "1c41934f84": {
2082
+ "elephant": [
2083
+ "1",
2084
+ "2"
2085
+ ]
2086
+ },
2087
+ "1c72b04b56": {
2088
+ "lion": [
2089
+ "1"
2090
+ ]
2091
+ },
2092
+ "1c87955a3a": {
2093
+ "crocodile": [
2094
+ "1"
2095
+ ],
2096
+ "turtle": [
2097
+ "2"
2098
+ ]
2099
+ },
2100
+ "1c9f9eb792": {
2101
+ "person": [
2102
+ "2"
2103
+ ]
2104
+ },
2105
+ "1ca240fede": {
2106
+ "train": [
2107
+ "1"
2108
+ ]
2109
+ },
2110
+ "1ca5673803": {
2111
+ "person": [
2112
+ "1",
2113
+ "3"
2114
+ ]
2115
+ },
2116
+ "1cada35274": {
2117
+ "duck": [
2118
+ "1"
2119
+ ]
2120
+ },
2121
+ "1cb44b920d": {
2122
+ "eagle": [
2123
+ "1",
2124
+ "2"
2125
+ ]
2126
+ },
2127
+ "1cd10e62be": {
2128
+ "leopard": [
2129
+ "1"
2130
+ ]
2131
+ },
2132
+ "1d3087d5e5": {
2133
+ "fish": [
2134
+ "1",
2135
+ "2",
2136
+ "3",
2137
+ "4",
2138
+ "5"
2139
+ ]
2140
+ },
2141
+ "1d3685150a": {
2142
+ "person": [
2143
+ "1",
2144
+ "3"
2145
+ ]
2146
+ },
2147
+ "1d6ff083aa": {
2148
+ "person": [
2149
+ "1",
2150
+ "2"
2151
+ ]
2152
+ }
2153
+ }
mbench/numbered_valid_obj_ids_gpt-4o_randcap.json ADDED
@@ -0,0 +1,2153 @@
1
+ {
2
+ "003234408d": {
3
+ "penguin": [
4
+ "1",
5
+ "2",
6
+ "3",
7
+ "4",
8
+ "5"
9
+ ]
10
+ },
11
+ "0043f083b5": {
12
+ "bus": [
13
+ "1"
14
+ ],
15
+ "sedan": [
16
+ "2",
17
+ "3"
18
+ ]
19
+ },
20
+ "0044fa5fba": {
21
+ "giant_panda": [
22
+ "1"
23
+ ]
24
+ },
25
+ "005a527edd": {
26
+ "ape": [
27
+ "1",
28
+ "2"
29
+ ]
30
+ },
31
+ "0065b171f9": {
32
+ "giant_panda": [
33
+ "1"
34
+ ]
35
+ },
36
+ "00917dcfc4": {
37
+ "zebra": [
38
+ "1",
39
+ "2",
40
+ "3"
41
+ ]
42
+ },
43
+ "00a23ccf53": {
44
+ "shark": [
45
+ "1"
46
+ ]
47
+ },
48
+ "00ad5016a4": {
49
+ "airplane": [
50
+ "1"
51
+ ]
52
+ },
53
+ "01082ae388": {
54
+ "leopard": [
55
+ "1"
56
+ ]
57
+ },
58
+ "011ac0a06f": {
59
+ "ape": [
60
+ "1",
61
+ "2",
62
+ "3",
63
+ "4",
64
+ "5"
65
+ ]
66
+ },
67
+ "013099c098": {
68
+ "giant_panda": [
69
+ "1",
70
+ "2"
71
+ ]
72
+ },
73
+ "0155498c85": {
74
+ "motorbike": [
75
+ "2"
76
+ ],
77
+ "person": [
78
+ "1"
79
+ ]
80
+ },
81
+ "01694ad9c8": {
82
+ "bird": [
83
+ "1"
84
+ ]
85
+ },
86
+ "017ac35701": {
87
+ "giant_panda": [
88
+ "1"
89
+ ]
90
+ },
91
+ "01b80e8e1a": {
92
+ "zebra": [
93
+ "1",
94
+ "2"
95
+ ]
96
+ },
97
+ "01baa5a4e1": {},
98
+ "01c3111683": {
99
+ "whale": [
100
+ "1"
101
+ ]
102
+ },
103
+ "01c4cb5ffe": {
104
+ "person": [
105
+ "1",
106
+ "3"
107
+ ]
108
+ },
109
+ "01c76f0a82": {
110
+ "sedan": [
111
+ "1",
112
+ "4"
113
+ ]
114
+ },
115
+ "01c783268c": {
116
+ "person": [
117
+ "2"
118
+ ],
119
+ "ape": [
120
+ "1"
121
+ ]
122
+ },
123
+ "01e64dd36a": {
124
+ "cow": [
125
+ "1",
126
+ "2",
127
+ "3"
128
+ ]
129
+ },
130
+ "01ed275c6e": {
131
+ "giraffe": [
132
+ "1",
133
+ "2"
134
+ ]
135
+ },
136
+ "01ff60d1fa": {
137
+ "lizard": [
138
+ "1"
139
+ ]
140
+ },
141
+ "020cd28cd2": {
142
+ "person": [
143
+ "1"
144
+ ]
145
+ },
146
+ "02264db755": {
147
+ "fox": [
148
+ "1"
149
+ ]
150
+ },
151
+ "0248626d9a": {
152
+ "train": [
153
+ "1"
154
+ ]
155
+ },
156
+ "02668dbffa": {
157
+ "frog": [
158
+ "1"
159
+ ]
160
+ },
161
+ "0274193026": {
162
+ "person": [
163
+ "2"
164
+ ]
165
+ },
166
+ "02d28375aa": {
167
+ "fox": [
168
+ "1"
169
+ ]
170
+ },
171
+ "031ccc99b1": {
172
+ "person": [
173
+ "1",
174
+ "2",
175
+ "3"
176
+ ]
177
+ },
178
+ "0321b18c10": {
179
+ "elephant": [
180
+ "3"
181
+ ],
182
+ "person": [
183
+ "1",
184
+ "2"
185
+ ]
186
+ },
187
+ "0348a45bca": {
188
+ "fish": [
189
+ "1",
190
+ "2",
191
+ "3",
192
+ "4",
193
+ "5"
194
+ ]
195
+ },
196
+ "0355e92655": {
197
+ "boat": [
198
+ "3"
199
+ ],
200
+ "person": [
201
+ "2"
202
+ ]
203
+ },
204
+ "0358b938c1": {
205
+ "elephant": [
206
+ "1",
207
+ "2",
208
+ "3",
209
+ "4"
210
+ ]
211
+ },
212
+ "0368107cf1": {
213
+ "person": [
214
+ "1",
215
+ "2"
216
+ ]
217
+ },
218
+ "0379ddf557": {
219
+ "person": [
220
+ "1"
221
+ ]
222
+ },
223
+ "038b2cc71d": {
224
+ "lizard": [
225
+ "1"
226
+ ]
227
+ },
228
+ "038c15a5dd": {
229
+ "hedgehog": [
230
+ "1"
231
+ ]
232
+ },
233
+ "03a06cc98a": {
234
+ "giraffe": [
235
+ "1",
236
+ "2",
237
+ "3"
238
+ ]
239
+ },
240
+ "03a63e187f": {
241
+ "lizard": [
242
+ "1"
243
+ ]
244
+ },
245
+ "03c95b4dae": {
246
+ "elephant": [
247
+ "1",
248
+ "2",
249
+ "3"
250
+ ]
251
+ },
252
+ "03e2b57b0e": {
253
+ "lizard": [
254
+ "1"
255
+ ]
256
+ },
257
+ "04194e1248": {
258
+ "lizard": [
259
+ "1"
260
+ ]
261
+ },
262
+ "04259896e2": {
263
+ "lizard": [
264
+ "1"
265
+ ]
266
+ },
267
+ "0444918a5f": {
268
+ "truck": [
269
+ "1",
270
+ "2",
271
+ "3",
272
+ "4"
273
+ ]
274
+ },
275
+ "04460a7a52": {
276
+ "lizard": [
277
+ "1"
278
+ ]
279
+ },
280
+ "04474174a4": {
281
+ "ape": [
282
+ "1",
283
+ "2"
284
+ ]
285
+ },
286
+ "0450095513": {
287
+ "snail": [
288
+ "1"
289
+ ]
290
+ },
291
+ "045f00aed2": {
292
+ "person": [
293
+ "3"
294
+ ],
295
+ "tiger": [
296
+ "1"
297
+ ]
298
+ },
299
+ "04667fabaa": {
300
+ "parrot": [
301
+ "1"
302
+ ]
303
+ },
304
+ "04735c5030": {
305
+ "cat": [
306
+ "1",
307
+ "2"
308
+ ]
309
+ },
310
+ "04990d1915": {
311
+ "truck": [
312
+ "3"
313
+ ],
314
+ "bus": [
315
+ "2"
316
+ ],
317
+ "sedan": [
318
+ "1"
319
+ ]
320
+ },
321
+ "04d62d9d98": {
322
+ "person": [
323
+ "1"
324
+ ]
325
+ },
326
+ "04f21da964": {
327
+ "monkey": [
328
+ "1"
329
+ ]
330
+ },
331
+ "04fbad476e": {
332
+ "parrot": [
333
+ "1"
334
+ ]
335
+ },
336
+ "04fe256562": {
337
+ "motorbike": [
338
+ "1"
339
+ ],
340
+ "truck": [
341
+ "2"
342
+ ]
343
+ },
344
+ "0503bf89c9": {
345
+ "hedgehog": [
346
+ "1"
347
+ ]
348
+ },
349
+ "0536c9eed0": {
350
+ "cat": [
351
+ "1"
352
+ ]
353
+ },
354
+ "054acb238f": {
355
+ "owl": [
356
+ "1"
357
+ ]
358
+ },
359
+ "05579ca250": {
360
+ "person": [
361
+ "1"
362
+ ],
363
+ "sedan": [
364
+ "3"
365
+ ]
366
+ },
367
+ "056c200404": {},
368
+ "05774f3a2c": {
369
+ "ape": [
370
+ "1",
371
+ "2",
372
+ "3"
373
+ ]
374
+ },
375
+ "058a7592c8": {
376
+ "train": [
377
+ "1"
378
+ ]
379
+ },
380
+ "05a0a513df": {
381
+ "person": [
382
+ "1",
383
+ "2"
384
+ ]
385
+ },
386
+ "05a569d8aa": {
387
+ "cat": [
388
+ "1"
389
+ ],
390
+ "mouse": [
391
+ "2"
392
+ ]
393
+ },
394
+ "05aa652648": {
395
+ "ape": [
396
+ "1"
397
+ ]
398
+ },
399
+ "05d7715782": {},
400
+ "05e0b0f28f": {
401
+ "mouse": [
402
+ "1"
403
+ ],
404
+ "person": [
405
+ "2"
406
+ ]
407
+ },
408
+ "05fdbbdd7a": {},
409
+ "05ffcfed85": {
410
+ "monkey": [
411
+ "1",
412
+ "2"
413
+ ]
414
+ },
415
+ "0630391881": {
416
+ "person": [
417
+ "1"
418
+ ]
419
+ },
420
+ "06840b2bbe": {
421
+ "snake": [
422
+ "1"
423
+ ]
424
+ },
425
+ "068f7dce6f": {
426
+ "shark": [
427
+ "1"
428
+ ]
429
+ },
430
+ "0693719753": {
431
+ "turtle": [
432
+ "1",
433
+ "2"
434
+ ]
435
+ },
436
+ "06ce2b51fb": {
437
+ "person": [
438
+ "1",
439
+ "2"
440
+ ]
441
+ },
442
+ "06e224798e": {
443
+ "tiger": [
444
+ "1"
445
+ ]
446
+ },
447
+ "06ee361788": {
448
+ "duck": [
449
+ "1",
450
+ "2",
451
+ "3"
452
+ ]
453
+ },
454
+ "06fbb3fa2c": {
455
+ "eagle": [
456
+ "1"
457
+ ]
458
+ },
459
+ "0700264286": {
460
+ "cow": [
461
+ "1",
462
+ "2"
463
+ ]
464
+ },
465
+ "070c918ca7": {
466
+ "parrot": [
467
+ "1"
468
+ ]
469
+ },
470
+ "07129e14a4": {
471
+ "person": [
472
+ "3"
473
+ ],
474
+ "parrot": [
475
+ "1",
476
+ "2"
477
+ ]
478
+ },
479
+ "07177017e9": {
480
+ "motorbike": [
481
+ "1",
482
+ "2"
483
+ ]
484
+ },
485
+ "07238ffc58": {
486
+ "monkey": [
487
+ "1",
488
+ "2",
489
+ "3"
490
+ ]
491
+ },
492
+ "07353b2a89": {
493
+ "sheep": [
494
+ "1",
495
+ "2",
496
+ "3",
497
+ "4"
498
+ ]
499
+ },
500
+ "0738493cbf": {
501
+ "airplane": [
502
+ "1"
503
+ ]
504
+ },
505
+ "075926c651": {
506
+ "person": [
507
+ "1",
508
+ "2"
509
+ ]
510
+ },
511
+ "075c701292": {
512
+ "duck": [
513
+ "1",
514
+ "2",
515
+ "3",
516
+ "4"
517
+ ]
518
+ },
519
+ "0762ea9a30": {
520
+ "person": [
521
+ "1"
522
+ ]
523
+ },
524
+ "07652ee4af": {
525
+ "person": [
526
+ "1"
527
+ ]
528
+ },
529
+ "076f206928": {
530
+ "person": [
531
+ "3"
532
+ ],
533
+ "zebra": [
534
+ "1",
535
+ "2"
536
+ ]
537
+ },
538
+ "077d32af19": {
539
+ "train": [
540
+ "4"
541
+ ],
542
+ "person": [
543
+ "1",
544
+ "2",
545
+ "3"
546
+ ]
547
+ },
548
+ "079049275c": {
549
+ "mouse": [
550
+ "1"
551
+ ]
552
+ },
553
+ "07913cdda7": {
554
+ "train": [
555
+ "1"
556
+ ],
557
+ "person": [
558
+ "2",
559
+ "3"
560
+ ]
561
+ },
562
+ "07a11a35e8": {
563
+ "ape": [
564
+ "1",
565
+ "2"
566
+ ]
567
+ },
568
+ "07ac33b6df": {
569
+ "ape": [
570
+ "1"
571
+ ]
572
+ },
573
+ "07c62c3d11": {
574
+ "parrot": [
575
+ "1",
576
+ "2",
577
+ "3"
578
+ ]
579
+ },
580
+ "07cc1c7d74": {
581
+ "snake": [
582
+ "1"
583
+ ]
584
+ },
585
+ "080196ef01": {
586
+ "lizard": [
587
+ "1"
588
+ ]
589
+ },
590
+ "081207976e": {},
591
+ "081ae4fa44": {
592
+ "shark": [
593
+ "1",
594
+ "2"
595
+ ]
596
+ },
597
+ "081d8250cb": {
598
+ "person": [
599
+ "1"
600
+ ],
601
+ "sedan": [
602
+ "3"
603
+ ]
604
+ },
605
+ "082900c5d4": {
606
+ "duck": [
607
+ "1",
608
+ "2",
609
+ "3"
610
+ ]
611
+ },
612
+ "0860df21e2": {},
613
+ "0866d4c5e3": {
614
+ "bird": [
615
+ "1",
616
+ "2",
617
+ "3"
618
+ ]
619
+ },
620
+ "0891ac2eb6": {
621
+ "person": [
622
+ "1",
623
+ "2",
624
+ "3"
625
+ ]
626
+ },
627
+ "08931bc458": {
628
+ "person": [
629
+ "1"
630
+ ]
631
+ },
632
+ "08aa2705d5": {
633
+ "snake": [
634
+ "1"
635
+ ]
636
+ },
637
+ "08c8450db7": {},
638
+ "08d50b926c": {
639
+ "turtle": [
640
+ "1",
641
+ "2"
642
+ ]
643
+ },
644
+ "08e1e4de15": {
645
+ "monkey": [
646
+ "1",
647
+ "2",
648
+ "3",
649
+ "4"
650
+ ]
651
+ },
652
+ "08e48c1a48": {
653
+ "cow": [
654
+ "1"
655
+ ]
656
+ },
657
+ "08f561c65e": {
658
+ "person": [
659
+ "2"
660
+ ],
661
+ "giant_panda": [
662
+ "1"
663
+ ]
664
+ },
665
+ "08feb87790": {
666
+ "sheep": [
667
+ "1"
668
+ ]
669
+ },
670
+ "09049f6fe3": {
671
+ "mouse": [
672
+ "1",
673
+ "2"
674
+ ]
675
+ },
676
+ "092e4ff450": {
677
+ "snake": [
678
+ "1"
679
+ ]
680
+ },
681
+ "09338adea8": {
682
+ "whale": [
683
+ "1",
684
+ "2"
685
+ ]
686
+ },
687
+ "093c335ccc": {
688
+ "person": [
689
+ "2"
690
+ ]
691
+ },
692
+ "0970d28339": {
693
+ "ape": [
694
+ "1",
695
+ "2"
696
+ ]
697
+ },
698
+ "0974a213dc": {
699
+ "giraffe": [
700
+ "1",
701
+ "2",
702
+ "3"
703
+ ]
704
+ },
705
+ "097b471ed8": {
706
+ "cat": [
707
+ "1",
708
+ "2"
709
+ ]
710
+ },
711
+ "0990941758": {
712
+ "giant_panda": [
713
+ "1"
714
+ ]
715
+ },
716
+ "09a348f4fa": {
717
+ "lizard": [
718
+ "1"
719
+ ]
720
+ },
721
+ "09a6841288": {
722
+ "duck": [
723
+ "1",
724
+ "2"
725
+ ]
726
+ },
727
+ "09c5bad17b": {
728
+ "airplane": [
729
+ "1"
730
+ ]
731
+ },
732
+ "09c9ce80c7": {
733
+ "giant_panda": [
734
+ "1"
735
+ ]
736
+ },
737
+ "09ff54fef4": {
738
+ "fox": [
739
+ "1",
740
+ "2"
741
+ ]
742
+ },
743
+ "0a23765d15": {
744
+ "person": [
745
+ "1",
746
+ "2"
747
+ ]
748
+ },
749
+ "0a275e7f12": {
750
+ "elephant": [
751
+ "1"
752
+ ]
753
+ },
754
+ "0a2f2bd294": {
755
+ "motorbike": [
756
+ "1"
757
+ ]
758
+ },
759
+ "0a7a2514aa": {
760
+ "cat": [
761
+ "1"
762
+ ],
763
+ "lizard": [
764
+ "2"
765
+ ]
766
+ },
767
+ "0a7b27fde9": {
768
+ "parrot": [
769
+ "1",
770
+ "2"
771
+ ]
772
+ },
773
+ "0a8c467cc3": {
774
+ "fish": [
775
+ "1",
776
+ "2",
777
+ "3"
778
+ ]
779
+ },
780
+ "0ac8c560ae": {
781
+ "person": [
782
+ "2",
783
+ "3"
784
+ ]
785
+ },
786
+ "0b1627e896": {
787
+ "boat": [
788
+ "1"
789
+ ]
790
+ },
791
+ "0b285c47f6": {
792
+ "mouse": [
793
+ "1"
794
+ ]
795
+ },
796
+ "0b34ec1d55": {
797
+ "ape": [
798
+ "1"
799
+ ]
800
+ },
801
+ "0b5b5e8e5a": {
802
+ "person": [
803
+ "1"
804
+ ],
805
+ "sedan": [
806
+ "2"
807
+ ]
808
+ },
809
+ "0b68535614": {
810
+ "rabbit": [
811
+ "1"
812
+ ]
813
+ },
814
+ "0b6f9105fc": {
815
+ "rabbit": [
816
+ "1"
817
+ ]
818
+ },
819
+ "0b7dbfa3cb": {
820
+ "cow": [
821
+ "1"
822
+ ]
823
+ },
824
+ "0b9cea51ca": {
825
+ "whale": [
826
+ "1"
827
+ ]
828
+ },
829
+ "0b9d012be8": {
830
+ "camel": [
831
+ "1"
832
+ ]
833
+ },
834
+ "0bcfc4177d": {
835
+ "truck": [
836
+ "1"
837
+ ]
838
+ },
839
+ "0bd37b23c1": {
840
+ "motorbike": [
841
+ "1"
842
+ ]
843
+ },
844
+ "0bd864064c": {
845
+ "eagle": [
846
+ "1"
847
+ ]
848
+ },
849
+ "0c11c6bf7b": {
850
+ "deer": [
851
+ "1"
852
+ ]
853
+ },
854
+ "0c26bc77ac": {
855
+ "crocodile": [
856
+ "1"
857
+ ]
858
+ },
859
+ "0c3a04798c": {
860
+ "fish": [
861
+ "2"
862
+ ],
863
+ "duck": [
864
+ "1"
865
+ ]
866
+ },
867
+ "0c44a9d545": {
868
+ "tiger": [
869
+ "1"
870
+ ]
871
+ },
872
+ "0c817cc390": {
873
+ "hedgehog": [
874
+ "1"
875
+ ],
876
+ "dog": [
877
+ "2"
878
+ ]
879
+ },
880
+ "0ca839ee9a": {
881
+ "ape": [
882
+ "1",
883
+ "2"
884
+ ]
885
+ },
886
+ "0cd7ac0ac0": {
887
+ "rabbit": [
888
+ "1"
889
+ ]
890
+ },
891
+ "0ce06e0121": {
892
+ "parrot": [
893
+ "1",
894
+ "2"
895
+ ]
896
+ },
897
+ "0cfe974a89": {
898
+ "turtle": [
899
+ "1",
900
+ "2"
901
+ ]
902
+ },
903
+ "0d2fcc0dcd": {
904
+ "zebra": [
905
+ "1",
906
+ "2",
907
+ "3",
908
+ "4"
909
+ ]
910
+ },
911
+ "0d3aad05d2": {
912
+ "person": [
913
+ "1"
914
+ ]
915
+ },
916
+ "0d40b015f4": {
917
+ "person": [
918
+ "1"
919
+ ]
920
+ },
921
+ "0d97fba242": {
922
+ "dog": [
923
+ "1"
924
+ ],
925
+ "person": [
926
+ "2"
927
+ ]
928
+ },
929
+ "0d9cc80d7e": {
930
+ "person": [
931
+ "1",
932
+ "2",
933
+ "3"
934
+ ]
935
+ },
936
+ "0dab85b6d3": {
937
+ "lizard": [
938
+ "1",
939
+ "2"
940
+ ]
941
+ },
942
+ "0db5c427a5": {
943
+ "train": [
944
+ "1"
945
+ ]
946
+ },
947
+ "0dbaf284f1": {
948
+ "cat": [
949
+ "1",
950
+ "2"
951
+ ]
952
+ },
953
+ "0de4923598": {},
954
+ "0df28a9101": {
955
+ "turtle": [
956
+ "1",
957
+ "2",
958
+ "3"
959
+ ]
960
+ },
961
+ "0e04f636c4": {
962
+ "frog": [
963
+ "1"
964
+ ]
965
+ },
966
+ "0e05f0e232": {
967
+ "lizard": [
968
+ "1",
969
+ "2"
970
+ ]
971
+ },
972
+ "0e0930474b": {
973
+ "person": [
974
+ "2",
975
+ "3"
976
+ ],
977
+ "sedan": [
978
+ "1"
979
+ ]
980
+ },
981
+ "0e27472bea": {
982
+ "turtle": [
983
+ "1"
984
+ ]
985
+ },
986
+ "0e30020549": {
987
+ "parrot": [
988
+ "1"
989
+ ]
990
+ },
991
+ "0e621feb6c": {
992
+ "lizard": [
993
+ "1",
994
+ "2"
995
+ ]
996
+ },
997
+ "0e803c7d73": {},
998
+ "0e9ebe4e3c": {
999
+ "truck": [
1000
+ "1"
1001
+ ]
1002
+ },
1003
+ "0e9f2785ec": {
1004
+ "person": [
1005
+ "2"
1006
+ ]
1007
+ },
1008
+ "0ea68d418b": {
1009
+ "airplane": [
1010
+ "1"
1011
+ ]
1012
+ },
1013
+ "0eb403a222": {},
1014
+ "0ee92053d6": {
1015
+ "person": [
1016
+ "1"
1017
+ ]
1018
+ },
1019
+ "0eefca067f": {
1020
+ "giant_panda": [
1021
+ "1",
1022
+ "2"
1023
+ ]
1024
+ },
1025
+ "0f17fa6fcb": {
1026
+ "duck": [
1027
+ "1",
1028
+ "2",
1029
+ "3"
1030
+ ]
1031
+ },
1032
+ "0f1ac8e9a3": {
1033
+ "frog": [
1034
+ "1"
1035
+ ]
1036
+ },
1037
+ "0f202e9852": {
1038
+ "parrot": [
1039
+ "1"
1040
+ ]
1041
+ },
1042
+ "0f2ab8b1ff": {
1043
+ "dolphin": [
1044
+ "1",
1045
+ "2",
1046
+ "3"
1047
+ ]
1048
+ },
1049
+ "0f51a78756": {
1050
+ "sheep": [
1051
+ "1"
1052
+ ]
1053
+ },
1054
+ "0f5fbe16b0": {
1055
+ "raccoon": [
1056
+ "1",
1057
+ "2"
1058
+ ]
1059
+ },
1060
+ "0f6072077b": {
1061
+ "person": [
1062
+ "1",
1063
+ "2",
1064
+ "3"
1065
+ ]
1066
+ },
1067
+ "0f6b69b2f4": {
1068
+ "rabbit": [
1069
+ "1"
1070
+ ]
1071
+ },
1072
+ "0f6c2163de": {
1073
+ "snail": [
1074
+ "1"
1075
+ ]
1076
+ },
1077
+ "0f74ec5599": {
1078
+ "giant_panda": [
1079
+ "1"
1080
+ ]
1081
+ },
1082
+ "0f9683715b": {
1083
+ "elephant": [
1084
+ "1"
1085
+ ]
1086
+ },
1087
+ "0fa7b59356": {
1088
+ "duck": [
1089
+ "1"
1090
+ ]
1091
+ },
1092
+ "0fb173695b": {
1093
+ "person": [
1094
+ "3"
1095
+ ]
1096
+ },
1097
+ "0fc958cde2": {
1098
+ "owl": [
1099
+ "1"
1100
+ ]
1101
+ },
1102
+ "0fe7b1a621": {
1103
+ "parrot": [
1104
+ "1"
1105
+ ]
1106
+ },
1107
+ "0ffcdb491c": {
1108
+ "person": [
1109
+ "1",
1110
+ "2",
1111
+ "3"
1112
+ ]
1113
+ },
1114
+ "101caff7d4": {
1115
+ "giant_panda": [
1116
+ "1",
1117
+ "2"
1118
+ ]
1119
+ },
1120
+ "1022fe8417": {
1121
+ "person": [
1122
+ "1",
1123
+ "2",
1124
+ "3"
1125
+ ]
1126
+ },
1127
+ "1032e80b37": {
1128
+ "giraffe": [
1129
+ "1"
1130
+ ]
1131
+ },
1132
+ "103f501680": {
1133
+ "fish": [
1134
+ "1"
1135
+ ]
1136
+ },
1137
+ "104e64565f": {
1138
+ "elephant": [
1139
+ "1"
1140
+ ]
1141
+ },
1142
+ "104f1ab997": {
1143
+ "person": [
1144
+ "1",
1145
+ "2",
1146
+ "3"
1147
+ ]
1148
+ },
1149
+ "106242403f": {
1150
+ "person": [
1151
+ "1",
1152
+ "2"
1153
+ ]
1154
+ },
1155
+ "10b31f5431": {
1156
+ "person": [
1157
+ "1",
1158
+ "3",
1159
+ "4"
1160
+ ]
1161
+ },
1162
+ "10eced835e": {
1163
+ "giant_panda": [
1164
+ "1",
1165
+ "2"
1166
+ ]
1167
+ },
1168
+ "110d26fa3a": {
1169
+ "shark": [
1170
+ "1"
1171
+ ]
1172
+ },
1173
+ "1122c1d16a": {
1174
+ "person": [
1175
+ "6"
1176
+ ],
1177
+ "parrot": [
1178
+ "1",
1179
+ "2",
1180
+ "3",
1181
+ "4",
1182
+ "5"
1183
+ ]
1184
+ },
1185
+ "1145b49a5f": {
1186
+ "rabbit": [
1187
+ "1"
1188
+ ]
1189
+ },
1190
+ "11485838c2": {
1191
+ "giraffe": [
1192
+ "1",
1193
+ "2",
1194
+ "3"
1195
+ ]
1196
+ },
1197
+ "114e7676ec": {
1198
+ "person": [
1199
+ "1"
1200
+ ]
1201
+ },
1202
+ "1157472b95": {
1203
+ "parrot": [
1204
+ "1",
1205
+ "2"
1206
+ ]
1207
+ },
1208
+ "115ee1072c": {
1209
+ "cow": [
1210
+ "1"
1211
+ ]
1212
+ },
1213
+ "1171141012": {
1214
+ "turtle": [
1215
+ "1"
1216
+ ],
1217
+ "person": [
1218
+ "2"
1219
+ ]
1220
+ },
1221
+ "117757b4b8": {
1222
+ "snail": [
1223
+ "1"
1224
+ ]
1225
+ },
1226
+ "1178932d2f": {
1227
+ "motorbike": [
1228
+ "3"
1229
+ ],
1230
+ "person": [
1231
+ "1",
1232
+ "2"
1233
+ ]
1234
+ },
1235
+ "117cc76bda": {
1236
+ "whale": [
1237
+ "1"
1238
+ ]
1239
+ },
1240
+ "1180cbf814": {
1241
+ "fish": [
1242
+ "1",
1243
+ "2"
1244
+ ]
1245
+ },
1246
+ "1187bbd0e3": {
1247
+ "cat": [
1248
+ "1"
1249
+ ]
1250
+ },
1251
+ "1197e44b26": {
1252
+ "giant_panda": [
1253
+ "1"
1254
+ ]
1255
+ },
1256
+ "119cf20728": {
1257
+ "lizard": [
1258
+ "1"
1259
+ ]
1260
+ },
1261
+ "119dd54871": {
1262
+ "lion": [
1263
+ "1",
1264
+ "2"
1265
+ ]
1266
+ },
1267
+ "11a0c3b724": {
1268
+ "mouse": [
1269
+ "1",
1270
+ "2"
1271
+ ]
1272
+ },
1273
+ "11a6ba8c94": {
1274
+ "person": [
1275
+ "1",
1276
+ "2"
1277
+ ]
1278
+ },
1279
+ "11c722a456": {
1280
+ "turtle": [
1281
+ "1",
1282
+ "2"
1283
+ ]
1284
+ },
1285
+ "11cbcb0b4d": {
1286
+ "zebra": [
1287
+ "1"
1288
+ ]
1289
+ },
1290
+ "11ccf5e99d": {
1291
+ "person": [
1292
+ "2"
1293
+ ]
1294
+ },
1295
+ "11ce6f452e": {
1296
+ "person": [
1297
+ "1",
1298
+ "2",
1299
+ "3"
1300
+ ]
1301
+ },
1302
+ "11feabe596": {
1303
+ "rabbit": [
1304
+ "1"
1305
+ ]
1306
+ },
1307
+ "120cb9514d": {
1308
+ "person": [
1309
+ "1",
1310
+ "2",
1311
+ "3"
1312
+ ]
1313
+ },
1314
+ "12156b25b3": {
1315
+ "person": [
1316
+ "1"
1317
+ ]
1318
+ },
1319
+ "122896672d": {
1320
+ "person": [
1321
+ "1",
1322
+ "3"
1323
+ ]
1324
+ },
1325
+ "1233ac8596": {
1326
+ "dog": [
1327
+ "1"
1328
+ ]
1329
+ },
1330
+ "1239c87234": {
1331
+ "lizard": [
1332
+ "1"
1333
+ ]
1334
+ },
1335
+ "1250423f7c": {
1336
+ "elephant": [
1337
+ "3",
1338
+ "4"
1339
+ ],
1340
+ "person": [
1341
+ "2"
1342
+ ]
1343
+ },
1344
+ "1257a1bc67": {
1345
+ "snake": [
1346
+ "1"
1347
+ ]
1348
+ },
1349
+ "125d1b19dd": {
1350
+ "giant_panda": [
1351
+ "1",
1352
+ "2"
1353
+ ]
1354
+ },
1355
+ "126d203967": {
1356
+ "person": [
1357
+ "2"
1358
+ ]
1359
+ },
1360
+ "1295e19071": {
1361
+ "airplane": [
1362
+ "1"
1363
+ ]
1364
+ },
1365
+ "12ad198c54": {
1366
+ "person": [
1367
+ "1"
1368
+ ]
1369
+ },
1370
+ "12bddb2bcb": {
1371
+ "person": [
1372
+ "2"
1373
+ ]
1374
+ },
1375
+ "12ec9b93ee": {
1376
+ "giant_panda": [
1377
+ "1"
1378
+ ]
1379
+ },
1380
+ "12eebedc35": {
1381
+ "bird": [
1382
+ "1"
1383
+ ]
1384
+ },
1385
+ "132852e094": {
1386
+ "fox": [
1387
+ "1"
1388
+ ]
1389
+ },
1390
+ "1329409f2a": {
1391
+ "fish": [
1392
+ "1"
1393
+ ]
1394
+ },
1395
+ "13325cfa14": {
1396
+ "person": [
1397
+ "2"
1398
+ ]
1399
+ },
1400
+ "1336440745": {
1401
+ "mouse": [
1402
+ "1",
1403
+ "2"
1404
+ ]
1405
+ },
1406
+ "134d06dbf9": {
1407
+ "cat": [
1408
+ "1"
1409
+ ]
1410
+ },
1411
+ "135625b53d": {
1412
+ "parrot": [
1413
+ "1"
1414
+ ]
1415
+ },
1416
+ "13870016f9": {
1417
+ "cow": [
1418
+ "2",
1419
+ "3"
1420
+ ],
1421
+ "person": [
1422
+ "1"
1423
+ ]
1424
+ },
1425
+ "13960b3c84": {
1426
+ "giraffe": [
1427
+ "1",
1428
+ "2",
1429
+ "3"
1430
+ ]
1431
+ },
1432
+ "13adaad9d9": {
1433
+ "giant_panda": [
1434
+ "1"
1435
+ ]
1436
+ },
1437
+ "13ae097e20": {
1438
+ "giant_panda": [
1439
+ "1"
1440
+ ]
1441
+ },
1442
+ "13e3070469": {
1443
+ "zebra": [
1444
+ "1",
1445
+ "2",
1446
+ "3"
1447
+ ]
1448
+ },
1449
+ "13f6a8c20d": {
1450
+ "fish": [
1451
+ "1"
1452
+ ]
1453
+ },
1454
+ "1416925cf2": {
1455
+ "truck": [
1456
+ "1",
1457
+ "2"
1458
+ ]
1459
+ },
1460
+ "142d2621f5": {
1461
+ "motorbike": [
1462
+ "3"
1463
+ ],
1464
+ "person": [
1465
+ "1",
1466
+ "2"
1467
+ ]
1468
+ },
1469
+ "145d5d7c03": {
1470
+ "giant_panda": [
1471
+ "1"
1472
+ ]
1473
+ },
1474
+ "145fdc3ac5": {
1475
+ "lizard": [
1476
+ "1"
1477
+ ]
1478
+ },
1479
+ "1471274fa7": {
1480
+ "person": [
1481
+ "1"
1482
+ ]
1483
+ },
1484
+ "14a6b5a139": {
1485
+ "fish": [
1486
+ "1"
1487
+ ]
1488
+ },
1489
+ "14c21cea0d": {
1490
+ "monkey": [
1491
+ "1",
1492
+ "2"
1493
+ ]
1494
+ },
1495
+ "14dae0dc93": {
1496
+ "person": [
1497
+ "2"
1498
+ ]
1499
+ },
1500
+ "14f9bd22b5": {
1501
+ "tiger": [
1502
+ "1"
1503
+ ]
1504
+ },
1505
+ "14fd28ae99": {
1506
+ "parrot": [
1507
+ "1"
1508
+ ]
1509
+ },
1510
+ "15097d5d4e": {
1511
+ "parrot": [
1512
+ "1"
1513
+ ]
1514
+ },
1515
+ "150ea711f2": {
1516
+ "whale": [
1517
+ "1"
1518
+ ]
1519
+ },
1520
+ "1514e3563f": {
1521
+ "earless_seal": [
1522
+ "1",
1523
+ "2"
1524
+ ]
1525
+ },
1526
+ "152aaa3a9e": {
1527
+ "raccoon": [
1528
+ "1"
1529
+ ]
1530
+ },
1531
+ "152b7d3bd7": {
1532
+ "giant_panda": [
1533
+ "1"
1534
+ ]
1535
+ },
1536
+ "15617297cc": {
1537
+ "person": [
1538
+ "1"
1539
+ ]
1540
+ },
1541
+ "15abbe0c52": {
1542
+ "person": [
1543
+ "1"
1544
+ ]
1545
+ },
1546
+ "15d1fb3de5": {
1547
+ "owl": [
1548
+ "1"
1549
+ ],
1550
+ "cat": [
1551
+ "2"
1552
+ ]
1553
+ },
1554
+ "15f67b0fab": {
1555
+ "person": [
1556
+ "1"
1557
+ ]
1558
+ },
1559
+ "161eb59aad": {
1560
+ "cow": [
1561
+ "2",
1562
+ "3"
1563
+ ],
1564
+ "giraffe": [
1565
+ "1"
1566
+ ]
1567
+ },
1568
+ "16288ea47f": {
1569
+ "duck": [
1570
+ "1",
1571
+ "2"
1572
+ ]
1573
+ },
1574
+ "164410ce62": {
1575
+ "person": [
1576
+ "1"
1577
+ ]
1578
+ },
1579
+ "165c3c8cd4": {
1580
+ "person": [
1581
+ "1",
1582
+ "2",
1583
+ "3"
1584
+ ]
1585
+ },
1586
+ "165c42b41b": {
1587
+ "motorbike": [
1588
+ "2",
1589
+ "3"
1590
+ ],
1591
+ "person": [
1592
+ "1",
1593
+ "4"
1594
+ ]
1595
+ },
1596
+ "165ec9e22b": {
1597
+ "person": [
1598
+ "1",
1599
+ "2"
1600
+ ]
1601
+ },
1602
+ "1669502269": {
1603
+ "person": [
1604
+ "1"
1605
+ ]
1606
+ },
1607
+ "16763cccbb": {
1608
+ "ape": [
1609
+ "1"
1610
+ ]
1611
+ },
1612
+ "16adde065e": {
1613
+ "cat": [
1614
+ "2"
1615
+ ],
1616
+ "person": [
1617
+ "3"
1618
+ ]
1619
+ },
1620
+ "16af445362": {
1621
+ "airplane": [
1622
+ "1"
1623
+ ]
1624
+ },
1625
+ "16afd538ad": {
1626
+ "parrot": [
1627
+ "1",
1628
+ "2"
1629
+ ]
1630
+ },
1631
+ "16c3fa4d5d": {
1632
+ "sedan": [
1633
+ "1"
1634
+ ]
1635
+ },
1636
+ "16d1d65c27": {
1637
+ "monkey": [
1638
+ "1"
1639
+ ]
1640
+ },
1641
+ "16e8599e94": {
1642
+ "giant_panda": [
1643
+ "1"
1644
+ ]
1645
+ },
1646
+ "16fe9fb444": {
1647
+ "motorbike": [
1648
+ "1"
1649
+ ],
1650
+ "person": [
1651
+ "2"
1652
+ ]
1653
+ },
1654
+ "1705796b02": {
1655
+ "train": [
1656
+ "1"
1657
+ ]
1658
+ },
1659
+ "1724db7671": {
1660
+ "giant_panda": [
1661
+ "1"
1662
+ ]
1663
+ },
1664
+ "17418e81ea": {
1665
+ "shark": [
1666
+ "1"
1667
+ ]
1668
+ },
1669
+ "175169edbb": {
1670
+ "ape": [
1671
+ "1",
1672
+ "2"
1673
+ ]
1674
+ },
1675
+ "17622326fd": {
1676
+ "lizard": [
1677
+ "1"
1678
+ ]
1679
+ },
1680
+ "17656bae77": {
1681
+ "elephant": [
1682
+ "1"
1683
+ ]
1684
+ },
1685
+ "17b0d94172": {
1686
+ "airplane": [
1687
+ "1"
1688
+ ]
1689
+ },
1690
+ "17c220e4f6": {
1691
+ "giant_panda": [
1692
+ "1"
1693
+ ]
1694
+ },
1695
+ "17c7bcd146": {
1696
+ "train": [
1697
+ "1"
1698
+ ]
1699
+ },
1700
+ "17cb4afe89": {
1701
+ "tiger": [
1702
+ "1"
1703
+ ]
1704
+ },
1705
+ "17cd79a434": {
1706
+ "squirrel": [
1707
+ "1"
1708
+ ]
1709
+ },
1710
+ "17d18604c3": {
1711
+ "person": [
1712
+ "1",
1713
+ "2"
1714
+ ]
1715
+ },
1716
+ "17d8ca1a37": {
1717
+ "owl": [
1718
+ "1"
1719
+ ],
1720
+ "person": [
1721
+ "2"
1722
+ ]
1723
+ },
1724
+ "17e33f4330": {
1725
+ "monkey": [
1726
+ "1"
1727
+ ]
1728
+ },
1729
+ "17f7a6d805": {
1730
+ "snail": [
1731
+ "1"
1732
+ ]
1733
+ },
1734
+ "180abc8378": {
1735
+ "owl": [
1736
+ "1"
1737
+ ],
1738
+ "person": [
1739
+ "2"
1740
+ ]
1741
+ },
1742
+ "183ba3d652": {
1743
+ "person": [
1744
+ "2"
1745
+ ],
1746
+ "motorbike": [
1747
+ "3"
1748
+ ]
1749
+ },
1750
+ "185bf64702": {
1751
+ "zebra": [
1752
+ "1",
1753
+ "2"
1754
+ ]
1755
+ },
1756
+ "18913cc690": {
1757
+ "train": [
1758
+ "1"
1759
+ ]
1760
+ },
1761
+ "1892651815": {
1762
+ "camel": [
1763
+ "1"
1764
+ ]
1765
+ },
1766
+ "189ac8208a": {
1767
+ "giraffe": [
1768
+ "1",
1769
+ "2"
1770
+ ]
1771
+ },
1772
+ "189b44e92c": {
1773
+ "zebra": [
1774
+ "1"
1775
+ ]
1776
+ },
1777
+ "18ac264b76": {
1778
+ "person": [
1779
+ "2"
1780
+ ]
1781
+ },
1782
+ "18b245ab49": {
1783
+ "penguin": [
1784
+ "1",
1785
+ "2",
1786
+ "3",
1787
+ "4"
1788
+ ]
1789
+ },
1790
+ "18b5cebc34": {
1791
+ "mouse": [
1792
+ "1"
1793
+ ]
1794
+ },
1795
+ "18bad52083": {
1796
+ "parrot": [
1797
+ "1",
1798
+ "2"
1799
+ ]
1800
+ },
1801
+ "18bb5144d5": {
1802
+ "lizard": [
1803
+ "1"
1804
+ ]
1805
+ },
1806
+ "18c6f205c5": {
1807
+ "person": [
1808
+ "1",
1809
+ "2",
1810
+ "3"
1811
+ ]
1812
+ },
1813
+ "1903f9ea15": {
1814
+ "bird": [
1815
+ "1",
1816
+ "2",
1817
+ "3"
1818
+ ]
1819
+ },
1820
+ "1917b209f2": {
1821
+ "cow": [
1822
+ "3",
1823
+ "4"
1824
+ ],
1825
+ "person": [
1826
+ "1"
1827
+ ],
1828
+ "horse": [
1829
+ "2"
1830
+ ]
1831
+ },
1832
+ "191e74c01d": {
1833
+ "deer": [
1834
+ "1"
1835
+ ]
1836
+ },
1837
+ "19367bb94e": {
1838
+ "fish": [
1839
+ "1",
1840
+ "2",
1841
+ "3"
1842
+ ]
1843
+ },
1844
+ "193ffaa217": {
1845
+ "person": [
1846
+ "1",
1847
+ "2",
1848
+ "3"
1849
+ ]
1850
+ },
1851
+ "19696b67d3": {
1852
+ "cow": [
1853
+ "1"
1854
+ ]
1855
+ },
1856
+ "197f3ab6f3": {
1857
+ "giant_panda": [
1858
+ "1"
1859
+ ]
1860
+ },
1861
+ "1981e763cc": {
1862
+ "sheep": [
1863
+ "1",
1864
+ "2"
1865
+ ]
1866
+ },
1867
+ "198afe39ae": {
1868
+ "person": [
1869
+ "1"
1870
+ ]
1871
+ },
1872
+ "19a6e62b9b": {
1873
+ "monkey": [
1874
+ "1",
1875
+ "2"
1876
+ ]
1877
+ },
1878
+ "19b60d5335": {
1879
+ "hedgehog": [
1880
+ "1"
1881
+ ]
1882
+ },
1883
+ "19c00c11f9": {
1884
+ "person": [
1885
+ "1"
1886
+ ]
1887
+ },
1888
+ "19e061eb88": {
1889
+ "boat": [
1890
+ "1",
1891
+ "2"
1892
+ ]
1893
+ },
1894
+ "19e8bc6178": {
1895
+ "dog": [
1896
+ "1"
1897
+ ]
1898
+ },
1899
+ "19ee80dac6": {
1900
+ "person": [
1901
+ "1",
1902
+ "3",
1903
+ "4"
1904
+ ]
1905
+ },
1906
+ "1a25a9170a": {
1907
+ "cow": [
1908
+ "1"
1909
+ ],
1910
+ "person": [
1911
+ "2",
1912
+ "3"
1913
+ ]
1914
+ },
1915
+ "1a359a6c1a": {
1916
+ "sheep": [
1917
+ "1"
1918
+ ]
1919
+ },
1920
+ "1a3e87c566": {
1921
+ "frog": [
1922
+ "1"
1923
+ ]
1924
+ },
1925
+ "1a5fe06b00": {
1926
+ "bus": [
1927
+ "1"
1928
+ ]
1929
+ },
1930
+ "1a6c0fbd1e": {
1931
+ "person": [
1932
+ "1"
1933
+ ]
1934
+ },
1935
+ "1a6f3b5a4b": {
1936
+ "sedan": [
1937
+ "3"
1938
+ ]
1939
+ },
1940
+ "1a8afbad92": {
1941
+ "zebra": [
1942
+ "1",
1943
+ "2",
1944
+ "3"
1945
+ ]
1946
+ },
1947
+ "1a8bdc5842": {
1948
+ "parrot": [
1949
+ "1",
1950
+ "2"
1951
+ ]
1952
+ },
1953
+ "1a95752aca": {
1954
+ "duck": [
1955
+ "1",
1956
+ "2"
1957
+ ]
1958
+ },
1959
+ "1a9c131cb7": {
1960
+ "ape": [
1961
+ "1",
1962
+ "2",
1963
+ "3"
1964
+ ]
1965
+ },
1966
+ "1aa3da3ee3": {
1967
+ "sheep": [
1968
+ "1",
1969
+ "2",
1970
+ "3",
1971
+ "4"
1972
+ ]
1973
+ },
1974
+ "1ab27ec7ea": {
1975
+ "deer": [
1976
+ "1"
1977
+ ]
1978
+ },
1979
+ "1abf16d21d": {
1980
+ "turtle": [
1981
+ "1"
1982
+ ]
1983
+ },
1984
+ "1acd0f993b": {
1985
+ "dog": [
1986
+ "1"
1987
+ ],
1988
+ "person": [
1989
+ "3"
1990
+ ]
1991
+ },
1992
+ "1ad202e499": {
1993
+ "lizard": [
1994
+ "1",
1995
+ "2"
1996
+ ]
1997
+ },
1998
+ "1af8d2395d": {
1999
+ "person": [
2000
+ "1",
2001
+ "2"
2002
+ ],
2003
+ "airplane": [
2004
+ "4"
2005
+ ]
2006
+ },
2007
+ "1afd39a1fa": {
2008
+ "motorbike": [
2009
+ "2"
2010
+ ]
2011
+ },
2012
+ "1b2d31306f": {
2013
+ "lizard": [
2014
+ "1"
2015
+ ]
2016
+ },
2017
+ "1b3fa67f0e": {
2018
+ "airplane": [
2019
+ "1"
2020
+ ]
2021
+ },
2022
+ "1b43fa74b4": {
2023
+ "owl": [
2024
+ "1",
2025
+ "2"
2026
+ ]
2027
+ },
2028
+ "1b73ea9fc2": {
2029
+ "parrot": [
2030
+ "1"
2031
+ ]
2032
+ },
2033
+ "1b7e8bb255": {
2034
+ "person": [
2035
+ "2"
2036
+ ]
2037
+ },
2038
+ "1b8680f8cd": {
2039
+ "person": [
2040
+ "2",
2041
+ "3"
2042
+ ]
2043
+ },
2044
+ "1b883843c0": {
2045
+ "person": [
2046
+ "1",
2047
+ "2"
2048
+ ]
2049
+ },
2050
+ "1b8898785b": {
2051
+ "monkey": [
2052
+ "1",
2053
+ "2"
2054
+ ]
2055
+ },
2056
+ "1b88ba1aa4": {
2057
+ "giant_panda": [
2058
+ "1"
2059
+ ]
2060
+ },
2061
+ "1b96a498e5": {
2062
+ "ape": [
2063
+ "1"
2064
+ ]
2065
+ },
2066
+ "1bbc4c274f": {
2067
+ "fish": [
2068
+ "2"
2069
+ ]
2070
+ },
2071
+ "1bd87fe9ab": {
2072
+ "train": [
2073
+ "1"
2074
+ ]
2075
+ },
2076
+ "1c4090c75b": {
2077
+ "whale": [
2078
+ "1"
2079
+ ]
2080
+ },
2081
+ "1c41934f84": {
2082
+ "elephant": [
2083
+ "1",
2084
+ "2"
2085
+ ]
2086
+ },
2087
+ "1c72b04b56": {
2088
+ "lion": [
2089
+ "1"
2090
+ ]
2091
+ },
2092
+ "1c87955a3a": {
2093
+ "crocodile": [
2094
+ "1"
2095
+ ],
2096
+ "turtle": [
2097
+ "2"
2098
+ ]
2099
+ },
2100
+ "1c9f9eb792": {
2101
+ "person": [
2102
+ "2"
2103
+ ]
2104
+ },
2105
+ "1ca240fede": {
2106
+ "train": [
2107
+ "1"
2108
+ ]
2109
+ },
2110
+ "1ca5673803": {
2111
+ "person": [
2112
+ "1",
2113
+ "3"
2114
+ ]
2115
+ },
2116
+ "1cada35274": {
2117
+ "duck": [
2118
+ "1"
2119
+ ]
2120
+ },
2121
+ "1cb44b920d": {
2122
+ "eagle": [
2123
+ "1",
2124
+ "2"
2125
+ ]
2126
+ },
2127
+ "1cd10e62be": {
2128
+ "leopard": [
2129
+ "1"
2130
+ ]
2131
+ },
2132
+ "1d3087d5e5": {
2133
+ "fish": [
2134
+ "1",
2135
+ "2",
2136
+ "3",
2137
+ "4",
2138
+ "5"
2139
+ ]
2140
+ },
2141
+ "1d3685150a": {
2142
+ "person": [
2143
+ "1",
2144
+ "3"
2145
+ ]
2146
+ },
2147
+ "1d6ff083aa": {
2148
+ "person": [
2149
+ "1",
2150
+ "2"
2151
+ ]
2152
+ }
2153
+ }
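The added JSON above maps each video ID to its annotated object categories and, per category, the object IDs that are kept as valid (stored as strings). As a minimal sketch of how such a mapping could be consumed, the snippet below loads it and flattens it into (video_id, category, object_id) triples; the file path and variable names are assumptions for illustration, not part of the repository.

```python
import json

# Assumed path: adjust to wherever the valid-object-id JSON is stored.
with open("valid_obj_ids.json") as f:
    # Schema inferred from the diff above:
    # video_id -> {category_name -> [object_id, ...] as strings}
    valid_obj_ids = json.load(f)

# Flatten into (video_id, category, object_id) triples for easier iteration.
triples = [
    (video_id, category, obj_id)
    for video_id, categories in valid_obj_ids.items()
    for category, obj_ids in categories.items()
    for obj_id in obj_ids
]

print(len(triples))
print(triples[:3])  # each triple looks like ('1178932d2f', 'motorbike', '3')
```

Note that the object IDs are strings in this file, so comparing them against integer annotation IDs would require an explicit int(...) cast.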