dianecy committed on
Commit 5c8ef86 · verified · 1 Parent(s): 8d82201

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. .history/datasets/__init___20250113130205.py +40 -0
  2. .history/datasets/ytvos_20241227174300.py +246 -0
  3. .history/datasets/ytvos_20250113131154.py +246 -0
  4. .history/datasets/ytvos_20250113131303.py +246 -0
  5. .history/datasets/ytvos_ref_20250113130047.py +237 -0
  6. .history/datasets/ytvos_ref_20250113131359.py +239 -0
  7. .history/datasets/ytvos_ref_20250113162825.py +244 -0
  8. .history/datasets/ytvos_ref_20250113163406.py +250 -0
  9. .history/datasets/ytvos_ref_20250113163605.py +250 -0
  10. .history/datasets/ytvos_ref_20250113180729.py +250 -0
  11. .history/datasets/ytvos_ref_20250114201918.py +253 -0
  12. .history/datasets/ytvos_ref_20250114202502.py +250 -0
  13. .history/datasets/ytvos_ref_20250114205233.py +252 -0
  14. .history/datasets/ytvos_ref_20250114210537.py +250 -0
  15. .history/make_ref-ytvos/annotate_ref_ytvos_20241227174304.py +288 -0
  16. .history/make_ref-ytvos/annotate_ref_ytvos_20250113111315.py +288 -0
  17. davis2017/utils.py +174 -0
  18. inference_davis.py +330 -0
  19. main.py +243 -0
  20. main_joint.py +198 -0
  21. main_pretrain.py +304 -0
  22. make_refcoco/refcocog_google/motion_split_generation_grefg_val.ipynb +0 -0
  23. make_refcoco/refcocog_google/part4_ref_id.txt +130 -0
  24. make_refcoco/refcocog_google/revised_refid_part4.json +506 -0
  25. make_refcoco/refcocog_umd/motion_split_generation.ipynb +0 -0
  26. make_refcoco/refcocog_umd/part4_ref_id.txt +126 -0
  27. make_refcoco/refcocog_umd/revised_refid_part4.json +498 -0
  28. mbench/__init__.py +0 -0
  29. mbench/__pycache__/transforms_video.cpython-39.pyc +0 -0
  30. mbench/__pycache__/ytvos_ref.cpython-39.pyc +0 -0
  31. mbench/check_image.ipynb +0 -0
  32. mbench/check_image_numbered.ipynb +0 -0
  33. mbench/check_image_revised.ipynb +164 -0
  34. mbench/gpt_ref-ytvos-revised.py +428 -0
  35. mbench/gpt_ref-ytvos.ipynb +0 -0
  36. mbench/gpt_ref-ytvos.py +302 -0
  37. mbench/gpt_ref-ytvos_numbered_cy.py +460 -0
  38. mbench/gpt_ref-ytvos_numbered_cy_sanity.py +643 -0
  39. mbench/gpt_ref-ytvos_numbered_cy_sanity_2.py +676 -0
  40. mbench/gpt_test.ipynb +0 -0
  41. mbench/make_ref-ytvos_json.py +108 -0
  42. mbench/numbered_captions_gpt-4o_final.json +0 -0
  43. mbench/numbered_captions_gpt-4o_no_mask_color.json +0 -0
  44. mbench/numbered_captions_gpt-4o_nomask_randcap.json +0 -0
  45. mbench/numbered_captions_gpt-4o_randcap.json +0 -0
  46. mbench/numbered_valid_obj_ids.json +2153 -0
  47. mbench/numbered_valid_obj_ids_gpt-4o.json +2153 -0
  48. mbench/numbered_valid_obj_ids_gpt-4o_no_mask_color.json +2153 -0
  49. mbench/numbered_valid_obj_ids_gpt-4o_nomask_randcap.json +2153 -0
  50. mbench/numbered_valid_obj_ids_gpt-4o_randcap.json +2153 -0
.history/datasets/__init___20250113130205.py ADDED
@@ -0,0 +1,40 @@
import torch.utils.data
import torchvision

from .ytvos import build as build_ytvos
from .ytvos_ref import build as build_ytvos_ref
from .davis import build as build_davis
from .a2d import build as build_a2d
from .jhmdb import build as build_jhmdb
from .refexp import build as build_refexp
from .concat_dataset import build as build_joint


def get_coco_api_from_dataset(dataset):
    for _ in range(10):
        # if isinstance(dataset, torchvision.datasets.CocoDetection):
        #     break
        if isinstance(dataset, torch.utils.data.Subset):
            dataset = dataset.dataset
    if isinstance(dataset, torchvision.datasets.CocoDetection):
        return dataset.coco


def build_dataset(dataset_file: str, image_set: str, args):
    if dataset_file == 'ytvos':
        return build_ytvos(image_set, args)
    if dataset_file == 'ytvos_ref':
        return build_ytvos_ref(image_set, args)
    if dataset_file == 'davis':
        return build_davis(image_set, args)
    if dataset_file == 'a2d':
        return build_a2d(image_set, args)
    if dataset_file == 'jhmdb':
        return build_jhmdb(image_set, args)
    # for pretraining
    if dataset_file == "refcoco" or dataset_file == "refcoco+" or dataset_file == "refcocog":
        return build_refexp(dataset_file, image_set, args)
    # for joint training of refcoco and ytvos
    if dataset_file == 'joint':
        return build_joint(image_set, args)
    raise ValueError(f'dataset {dataset_file} not supported')
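A minimal usage sketch of the `build_dataset` dispatcher above; the `Namespace` fields and their values are illustrative assumptions, chosen only to match what the `build` functions below read from `args`:

from argparse import Namespace
from datasets import build_dataset

# hypothetical args object; field names mirror what build() reads in datasets/ytvos.py
args = Namespace(ytvos_path='data/ref-youtube-vos', masks=True,
                 num_frames=5, max_skip=3)
train_set = build_dataset('ytvos', image_set='train', args=args)
imgs, target = train_set[0]  # imgs: [T, 3, H, W]; target: per-clip dict of boxes/masks/caption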
.history/datasets/ytvos_20241227174300.py ADDED
@@ -0,0 +1,246 @@
"""
Ref-YoutubeVOS data loader
"""
from pathlib import Path

import torch
from torch.autograd.grad_mode import F
from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

from datasets.categories import ytvos_category_dict as category_dict


class YTVOSDataset(Dataset):
    """
    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
    through the Youtube-VOS referring video object segmentation competition page at:
    https://competitions.codalab.org/competitions/29139
    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
    currently only be done on the competition 'validation' subset using the competition's server, as
    annotations were publicly released only for the 'train' subset of the competition.

    """
    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int):
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        # create video meta data
        self.prepare_metas()

        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
        print('\n')

    def prepare_metas(self):
        # read object information
        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
            subset_metas_by_video = json.load(f)['videos']

        # read expression data
        with open(str(self.ann_file), 'r') as f:
            subset_expressions_by_video = json.load(f)['videos']
        self.videos = list(subset_expressions_by_video.keys())

        self.metas = []
        for vid in self.videos:
            vid_meta = subset_metas_by_video[vid]
            vid_data = subset_expressions_by_video[vid]
            vid_frames = sorted(vid_data['frames'])
            vid_len = len(vid_frames)
            for exp_id, exp_dict in vid_data['expressions'].items():
                for frame_id in range(0, vid_len, self.num_frames):
                    meta = {}
                    meta['video'] = vid
                    meta['exp'] = exp_dict['exp']
                    meta['obj_id'] = int(exp_dict['obj_id'])
                    meta['frames'] = vid_frames
                    meta['frame_id'] = frame_id
                    # get object category
                    obj_id = exp_dict['obj_id']
                    meta['category'] = vid_meta['objects'][obj_id]['category']
                    self.metas.append(meta)

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            meta = self.metas[idx]  # dict

            video, exp, obj_id, category, frames, frame_id = \
                meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id']
            # clean up the caption
            exp = " ".join(exp.lower().split())
            category_id = category_dict[category]
            vid_len = len(frames)

            num_frames = self.num_frames
            # random sparse sample
            sample_indx = [frame_id]
            if self.num_frames != 1:
                # local sample
                sample_id_before = random.randint(1, 3)
                sample_id_after = random.randint(1, 3)
                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
                sample_indx.extend(local_indx)
                sample_indx = list(set(sample_indx))

                # global sampling
                if num_frames > 3:
                    all_inds = list(range(vid_len))
                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
                    global_n = num_frames - len(sample_indx)
                    if len(global_inds) > global_n:
                        select_id = random.sample(range(len(global_inds)), global_n)
                        for s_id in select_id:
                            sample_indx.append(global_inds[s_id])
                    elif vid_len >= global_n:  # sample long range global frames
                        select_id = random.sample(range(vid_len), global_n)
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
                    else:
                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
            sample_indx.sort()

            # read frames and masks
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for j in range(self.num_frames):
                frame_indx = sample_indx[j]
                frame_name = frames[frame_indx]
                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
                img = Image.open(img_path).convert('RGB')
                mask = Image.open(mask_path).convert('P')

                # create the target
                label = torch.tensor(category_id)
                mask = np.array(mask)
                mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
                if (mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(mask)
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                else:  # some frame didn't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                mask = torch.from_numpy(mask)

                # append
                imgs.append(img)
                labels.append(label)
                masks.append(mask)
                boxes.append(box)

            # transform
            w, h = img.size
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            target = {
                'frames_idx': torch.tensor(sample_indx),  # [T,]
                'labels': labels,                         # [T,]
                'boxes': boxes,                           # [T, 4], xyxy
                'masks': masks,                           # [T, H, W]
                'valid': torch.tensor(valid),             # [T,]
                'caption': exp,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)])
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
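The clip construction in `__getitem__` above combines a local window around the anchor frame with global fill-in frames; the helper below restates that sampling rule in isolation as a standalone reading aid (names and the example values are illustrative, not part of this commit):

import random

def sparse_sample(vid_len, frame_id, num_frames):
    # anchor frame plus one random neighbour on each side (local window)
    sample_indx = [frame_id]
    if num_frames != 1:
        before, after = random.randint(1, 3), random.randint(1, 3)
        sample_indx.extend([max(0, frame_id - before), min(vid_len - 1, frame_id + after)])
        sample_indx = list(set(sample_indx))
        if num_frames > 3:
            # fill remaining slots from outside the local window; fall back to
            # repeated indices when the video is shorter than num_frames
            all_inds = list(range(vid_len))
            global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
            global_n = num_frames - len(sample_indx)
            if len(global_inds) > global_n:
                sample_indx += random.sample(global_inds, global_n)
            elif vid_len >= global_n:
                sample_indx += random.sample(all_inds, global_n)
            else:
                sample_indx += random.sample(all_inds, global_n - vid_len) + all_inds
    sample_indx.sort()
    return sample_indx

print(sparse_sample(vid_len=30, frame_id=10, num_frames=5))  # e.g. [3, 8, 10, 13, 27]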
.history/datasets/ytvos_20250113131154.py ADDED
@@ -0,0 +1,246 @@
"""
Ref-YoutubeVOS data loader
"""
from pathlib import Path

import torch
from torch.autograd.grad_mode import F
from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

from datasets.categories import ytvos_category_dict as category_dict


class YTVOSDataset(Dataset):
    """
    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
    through the Youtube-VOS referring video object segmentation competition page at:
    https://competitions.codalab.org/competitions/29139
    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
    currently only be done on the competition 'validation' subset using the competition's server, as
    annotations were publicly released only for the 'train' subset of the competition.

    """
    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int):
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        # create video meta data
        self.prepare_metas()

        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
        print('\n')

    def prepare_metas(self):
        # read object information
        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
            subset_metas_by_video = json.load(f)['videos']

        # read expression data
        with open(str(self.ann_file), 'r') as f:
            subset_expressions_by_video = json.load(f)['videos']
        self.videos = list(subset_expressions_by_video.keys())

        self.metas = []
        for vid in self.videos:
            vid_meta = subset_metas_by_video[vid]
            vid_data = subset_expressions_by_video[vid]
            vid_frames = sorted(vid_data['frames'])
            vid_len = len(vid_frames)
            for exp_id, exp_dict in vid_data['expressions'].items():
                for frame_id in range(0, vid_len, self.num_frames):
                    meta = {}
                    meta['video'] = vid
                    meta['exp'] = exp_dict['exp']
                    meta['obj_id'] = int(exp_dict['obj_id'])
                    meta['frames'] = vid_frames
                    meta['frame_id'] = frame_id
                    # get object category
                    obj_id = exp_dict['obj_id']
                    meta['category'] = vid_meta['objects'][obj_id]['category']
                    self.metas.append(meta)

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            meta = self.metas[idx]  # dict

            video, exp, obj_id, category, frames, frame_id = \
                meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id']
            # clean up the caption
            exp = " ".join(exp.lower().split())
            category_id = category_dict[category]
            vid_len = len(frames)

            num_frames = self.num_frames
            # random sparse sample
            sample_indx = [frame_id]
            if self.num_frames != 1:
                # local sample
                sample_id_before = random.randint(1, 3)
                sample_id_after = random.randint(1, 3)
                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
                sample_indx.extend(local_indx)
                sample_indx = list(set(sample_indx))

                # global sampling
                if num_frames > 3:
                    all_inds = list(range(vid_len))
                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
                    global_n = num_frames - len(sample_indx)
                    if len(global_inds) > global_n:
                        select_id = random.sample(range(len(global_inds)), global_n)
                        for s_id in select_id:
                            sample_indx.append(global_inds[s_id])
                    elif vid_len >= global_n:  # sample long range global frames
                        select_id = random.sample(range(vid_len), global_n)
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
                    else:
                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
            sample_indx.sort()

            # read frames and masks
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for j in range(self.num_frames):
                frame_indx = sample_indx[j]
                frame_name = frames[frame_indx]
                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
                img = Image.open(img_path).convert('RGB')
                mask = Image.open(mask_path).convert('P')

                # create the target
                label = torch.tensor(category_id)
                mask = np.array(mask)
                mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
                if (mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(mask)
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                else:  # some frame didn't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                mask = torch.from_numpy(mask)

                # append
                imgs.append(img)
                labels.append(label)
                masks.append(mask)
                boxes.append(box)

            # transform
            w, h = img.size
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            target = {
                'frames_idx': torch.tensor(sample_indx),  # [T,]
                'labels': labels,                         # [T,]
                'boxes': boxes,                           # [T, 4], xyxy
                'masks': masks,                           # [T, H, W]
                'valid': torch.tensor(valid),             # [T,]
                'caption': exp,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)])
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
.history/datasets/ytvos_20250113131303.py ADDED
@@ -0,0 +1,246 @@
"""
Ref-YoutubeVOS data loader
"""
from pathlib import Path

import torch
from torch.autograd.grad_mode import F
from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

from datasets.categories import ytvos_category_dict as category_dict


class YTVOSDataset(Dataset):
    """
    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
    through the Youtube-VOS referring video object segmentation competition page at:
    https://competitions.codalab.org/competitions/29139
    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
    currently only be done on the competition 'validation' subset using the competition's server, as
    annotations were publicly released only for the 'train' subset of the competition.

    """
    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int):
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        # create video meta data
        self.prepare_metas()

        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
        print('\n')

    def prepare_metas(self):
        # read object information
        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
            subset_metas_by_video = json.load(f)['videos']

        # read expression data
        with open(str(self.ann_file), 'r') as f:
            subset_expressions_by_video = json.load(f)['videos']
        self.videos = list(subset_expressions_by_video.keys())

        self.metas = []
        for vid in self.videos:
            vid_meta = subset_metas_by_video[vid]
            vid_data = subset_expressions_by_video[vid]
            vid_frames = sorted(vid_data['frames'])
            vid_len = len(vid_frames)
            for exp_id, exp_dict in vid_data['expressions'].items():
                for frame_id in range(0, vid_len, self.num_frames):
                    meta = {}
                    meta['video'] = vid
                    meta['exp'] = exp_dict['exp']
                    meta['obj_id'] = int(exp_dict['obj_id'])
                    meta['frames'] = vid_frames
                    meta['frame_id'] = frame_id
                    # get object category
                    obj_id = exp_dict['obj_id']
                    meta['category'] = vid_meta['objects'][obj_id]['category']
                    self.metas.append(meta)

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            meta = self.metas[idx]  # dict

            video, exp, obj_id, category, frames, frame_id = \
                meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id']
            # clean up the caption
            exp = " ".join(exp.lower().split())
            category_id = category_dict[category]
            vid_len = len(frames)

            num_frames = self.num_frames
            # random sparse sample
            sample_indx = [frame_id]
            if self.num_frames != 1:
                # local sample
                sample_id_before = random.randint(1, 3)
                sample_id_after = random.randint(1, 3)
                local_indx = [max(0, frame_id - sample_id_before), min(vid_len - 1, frame_id + sample_id_after)]
                sample_indx.extend(local_indx)
                sample_indx = list(set(sample_indx))

                # global sampling
                if num_frames > 3:
                    all_inds = list(range(vid_len))
                    global_inds = all_inds[:min(sample_indx)] + all_inds[max(sample_indx):]
                    global_n = num_frames - len(sample_indx)
                    if len(global_inds) > global_n:
                        select_id = random.sample(range(len(global_inds)), global_n)
                        for s_id in select_id:
                            sample_indx.append(global_inds[s_id])
                    elif vid_len >= global_n:  # sample long range global frames
                        select_id = random.sample(range(vid_len), global_n)
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
                    else:
                        select_id = random.sample(range(vid_len), global_n - vid_len) + list(range(vid_len))
                        for s_id in select_id:
                            sample_indx.append(all_inds[s_id])
            sample_indx.sort()

            # read frames and masks
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for j in range(self.num_frames):
                frame_indx = sample_indx[j]
                frame_name = frames[frame_indx]
                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
                img = Image.open(img_path).convert('RGB')
                mask = Image.open(mask_path).convert('P')

                # create the target
                label = torch.tensor(category_id)
                mask = np.array(mask)
                mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
                if (mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(mask)
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                else:  # some frame didn't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                mask = torch.from_numpy(mask)

                # append
                imgs.append(img)
                labels.append(label)
                masks.append(mask)
                boxes.append(box)

            # transform
            w, h = img.size
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            target = {
                'frames_idx': torch.tensor(sample_indx),  # [T,]
                'labels': labels,                         # [T,]
                'boxes': boxes,                           # [T, 4], xyxy
                'masks': masks,                           # [T, H, W]
                'valid': torch.tensor(valid),             # [T,]
                'caption': exp,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)])
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
.history/datasets/ytvos_ref_20250113130047.py ADDED
@@ -0,0 +1,237 @@
"""
Ref-YoutubeVOS data loader
"""
from pathlib import Path

import torch
from torch.autograd.grad_mode import F
from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

from datasets.categories import ytvos_category_dict as category_dict


class YTVOSDataset(Dataset):
    """
    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
    through the Youtube-VOS referring video object segmentation competition page at:
    https://competitions.codalab.org/competitions/29139
    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
    currently only be done on the competition 'validation' subset using the competition's server, as
    annotations were publicly released only for the 'train' subset of the competition.

    """
    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int):
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        # create video meta data
        self.prepare_metas()

        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
        print('\n')

    def prepare_metas(self):
        # read object information
        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
            subset_metas_by_video = json.load(f)['videos']

        # read expression data
        with open(str(self.ann_file), 'r') as f:
            subset_expressions_by_video = json.load(f)['videos']
        self.videos = list(subset_expressions_by_video.keys())

        self.metas = []
        for vid in self.videos:
            vid_meta = subset_metas_by_video[vid]
            vid_data = subset_expressions_by_video[vid]
            vid_frames = sorted(vid_data['frames'])
            vid_len = len(vid_frames)

            for exp_id, exp_dict in vid_data['expressions'].items():
                # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
                start_idx, end_idx = 2, vid_len - 2
                bin_size = (end_idx - start_idx) // 4

                bins = []
                for i in range(4):
                    bin_start = start_idx + i * bin_size
                    bin_end = bin_start + bin_size if i < 3 else end_idx

                    bins.append((bin_start, bin_end))

                meta = {
                    'video': vid,
                    'exp': exp_dict['exp'],
                    'obj_id': int(exp_dict['obj_id']),
                    'frames': vid_frames,
                    'bins': bins,
                    'category': vid_meta['objects'][int(exp_dict['obj_id'])]['category']
                }
                self.metas.append(meta)

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            meta = self.metas[idx]  # dict

            video, exp, obj_id, category, frames, bins = \
                meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins']

            # clean up the caption
            exp = " ".join(exp.lower().split())
            category_id = category_dict[category]
            vid_len = len(frames)

            # num_frames = self.num_frames

            # Random sample one frame from each bin
            sample_indx = []
            for start_idx, end_idx in bins:
                sample_indx.append(random.randint(start_idx, end_idx - 1))
            sample_indx.sort()  # Ensure indices are in order

            # read frames and masks
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for frame_indx in sample_indx:
                frame_name = frames[frame_indx]
                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
                img = Image.open(img_path).convert('RGB')
                mask = Image.open(mask_path).convert('P')

                # create the target
                label = torch.tensor(category_id)
                mask = np.array(mask)
                mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
                if (mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(mask)
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                else:  # some frame didn't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                mask = torch.from_numpy(mask)

                # append
                imgs.append(img)
                labels.append(label)
                masks.append(mask)
                boxes.append(box)

            # transform
            w, h = img.size
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            target = {
                'frames_idx': torch.tensor(sample_indx),  # [T,]
                'labels': labels,                         # [T,]
                'boxes': boxes,                           # [T, 4], xyxy
                'masks': masks,                           # [T, H, W]
                'valid': torch.tensor(valid),             # [T,]
                'caption': exp,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)])
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
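`prepare_metas` above splits the usable frame range (dropping the first and last two frames) into four equal bins, and `__getitem__` then draws one random frame per bin; below is a standalone sketch of that rule (the helper names, keyword parameters, and the example `vid_len` are illustrative assumptions, with values matching the constants in the file):

import random

def make_bins(vid_len, num_bins=4, margin=2):
    # usable range excludes the first and last `margin` frames
    start_idx, end_idx = margin, vid_len - margin
    bin_size = (end_idx - start_idx) // num_bins
    bins = []
    for i in range(num_bins):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < num_bins - 1 else end_idx
        bins.append((bin_start, bin_end))
    return bins

def sample_one_per_bin(bins):
    # one random frame index per bin, kept in temporal order
    return sorted(random.randint(start, end - 1) for start, end in bins)

bins = make_bins(vid_len=36)     # [(2, 10), (10, 18), (18, 26), (26, 34)]
print(sample_one_per_bin(bins))  # e.g. [5, 13, 21, 30]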
.history/datasets/ytvos_ref_20250113131359.py ADDED
@@ -0,0 +1,239 @@
"""
Ref-YoutubeVOS data loader
"""
from pathlib import Path

import torch
from torch.autograd.grad_mode import F
from torch.utils.data import Dataset
import datasets.transforms_video as T

import os
from PIL import Image
import json
import numpy as np
import random

from datasets.categories import ytvos_category_dict as category_dict


class YTVOSDataset(Dataset):
    """
    A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
    dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
    through the Youtube-VOS referring video object segmentation competition page at:
    https://competitions.codalab.org/competitions/29139
    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
    currently only be done on the competition 'validation' subset using the competition's server, as
    annotations were publicly released only for the 'train' subset of the competition.

    """
    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
                 num_frames: int, max_skip: int):
        self.img_folder = img_folder
        self.ann_file = ann_file
        self._transforms = transforms
        self.return_masks = return_masks  # not used
        self.num_frames = num_frames
        self.max_skip = max_skip
        # create video meta data
        self.prepare_metas()

        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
        print('\n')

    def prepare_metas(self):
        # read object information
        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
            subset_metas_by_video = json.load(f)['videos']

        # read expression data
        with open(str(self.ann_file), 'r') as f:
            subset_expressions_by_video = json.load(f)['videos']
        self.videos = list(subset_expressions_by_video.keys())

        self.metas = []
        for vid in self.videos:
            vid_meta = subset_metas_by_video[vid]
            vid_data = subset_expressions_by_video[vid]
            vid_frames = sorted(vid_data['frames'])
            vid_len = len(vid_frames)

            for exp_id, exp_dict in vid_data['expressions'].items():
                # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
                start_idx, end_idx = 2, vid_len - 2
                bin_size = (end_idx - start_idx) // 4

                bins = []
                for i in range(4):
                    bin_start = start_idx + i * bin_size
                    bin_end = bin_start + bin_size if i < 3 else end_idx

                    bins.append((bin_start, bin_end))

                meta = {
                    'video': vid,
                    'exp': exp_dict['exp'],
                    'obj_id': int(exp_dict['obj_id']),
                    'frames': vid_frames,
                    'bins': bins,
                    'category': vid_meta['objects'][exp_dict['obj_id']]['category']
                }
                self.metas.append(meta)

    @staticmethod
    def bounding_box(img):
        rows = np.any(img, axis=1)
        cols = np.any(img, axis=0)
        rmin, rmax = np.where(rows)[0][[0, -1]]
        cmin, cmax = np.where(cols)[0][[0, -1]]
        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

    def __len__(self):
        return len(self.metas)

    def __getitem__(self, idx):
        instance_check = False
        while not instance_check:
            meta = self.metas[idx]  # dict

            video, exp, obj_id, category, frames, bins = \
                meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['bins']

            # clean up the caption
            exp = " ".join(exp.lower().split())
            category_id = category_dict[category]
            vid_len = len(frames)

            # num_frames = self.num_frames

            # Random sample one frame from each bin
            sample_indx = []
            for start_idx, end_idx in bins:
                sample_indx.append(random.randint(start_idx, end_idx - 1))
            sample_indx.sort()  # Ensure indices are in order

            # read frames and masks
            imgs, labels, boxes, masks, valid = [], [], [], [], []
            for frame_indx in sample_indx:
                frame_name = frames[frame_indx]
                img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
                mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
                img = Image.open(img_path).convert('RGB')
                mask = Image.open(mask_path).convert('P')

                # create the target
                label = torch.tensor(category_id)
                mask = np.array(mask)
                mask = (mask == obj_id).astype(np.float32)  # 0,1 binary
                if (mask > 0).any():
                    y1, y2, x1, x2 = self.bounding_box(mask)
                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
                    valid.append(1)
                else:  # some frame didn't contain the instance
                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
                    valid.append(0)
                mask = torch.from_numpy(mask)

                # append
                imgs.append(img)
                labels.append(label)
                masks.append(mask)
                boxes.append(box)

            # transform
            w, h = img.size
            labels = torch.stack(labels, dim=0)
            boxes = torch.stack(boxes, dim=0)
            boxes[:, 0::2].clamp_(min=0, max=w)
            boxes[:, 1::2].clamp_(min=0, max=h)
            masks = torch.stack(masks, dim=0)
            target = {
                'frames_idx': torch.tensor(sample_indx),  # [T,]
                'labels': labels,                         # [T,]
                'boxes': boxes,                           # [T, 4], xyxy
                'masks': masks,                           # [T, H, W]
                'valid': torch.tensor(valid),             # [T,]
                'caption': exp,
                'orig_size': torch.as_tensor([int(h), int(w)]),
                'size': torch.as_tensor([int(h), int(w)])
            }

            # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
            if self._transforms:
                imgs, target = self._transforms(imgs, target)
                imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
            else:
                imgs = np.array(imgs)
                imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))

            # FIXME: handle "valid", since some box may be removed due to random crop
            if torch.any(target['valid'] == 1):  # at least one instance
                instance_check = True
            else:
                idx = random.randint(0, self.__len__() - 1)

        return imgs, target


def make_coco_transforms(image_set, max_size=640):
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    scales = [288, 320, 352, 392, 416, 448, 480, 512]

    if image_set == 'train':
        return T.Compose([
            T.RandomHorizontalFlip(),
            T.PhotometricDistort(),
            T.RandomSelect(
                T.Compose([
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ]),
                T.Compose([
                    T.RandomResize([400, 500, 600]),
                    T.RandomSizeCrop(384, 600),
                    T.RandomResize(scales, max_size=max_size),
                    T.Check(),
                ])
            ),
            normalize,
        ])

    # we do not use the 'val' set since the annotations are inaccessible
    if image_set == 'val':
        return T.Compose([
            T.RandomResize([360], max_size=640),
            normalize,
        ])

    raise ValueError(f'unknown {image_set}')


def build(image_set, args):
    root = Path(args.ytvos_path)
    assert root.exists(), f'provided YTVOS path {root} does not exist'
    PATHS = {
        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not used actually
    }
    img_folder, ann_file = PATHS[image_set]
    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
    #                        num_frames=args.num_frames, max_skip=args.max_skip)
    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
                           num_frames=args.num_frames, max_skip=args.max_skip)
    return dataset
.history/datasets/ytvos_ref_20250113162825.py ADDED
@@ -0,0 +1,244 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ for vid in self.videos:
61
+ vid_meta = subset_metas_by_video[vid]
62
+ vid_data = subset_expressions_by_video[vid]
63
+ vid_frames = sorted(vid_data['frames'])
64
+ vid_len = len(vid_frames)
65
+
66
+ for exp_id, exp_dict in vid_data['expressions'].items():
67
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
68
+ start_idx , end_idx = 2, vid_len-2
69
+ bin_size = (end_idx - start_idx) // 4
70
+
71
+ bins = []
72
+ for i in range(4):
73
+ bin_start = start_idx + i * bin_size
74
+ bin_end = bin_start + bin_size if i < 3 else end_idx
75
+
76
+ bins.append((bin_start, bin_end))
77
+
78
+ # Random sample one frame from each bin
79
+ sample_indx = []
80
+ for start_idx, end_idx in bins:
81
+ try:
82
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
83
+ except ValueError:
84
+ print(bins)
85
+ sample_indx.sort() # Ensure indices are in order
86
+
87
+
88
+ for frame_id in sample_indx:
89
+ meta = {
90
+ 'video': vid,
91
+ 'exp': exp_dict['exp'],
92
+ 'obj_id': int(exp_dict['obj_id']),
93
+ 'frames': vid_frames,
94
+ 'frame_id' : frame_id,
95
+ 'sample_frames_id' : sample_indx,
96
+ 'bins': bins,
97
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
98
+ }
99
+ self.metas.append(meta)
100
+
101
+
102
+ @staticmethod
103
+ def bounding_box(img):
104
+ rows = np.any(img, axis=1)
105
+ cols = np.any(img, axis=0)
106
+ rmin, rmax = np.where(rows)[0][[0, -1]]
107
+ cmin, cmax = np.where(cols)[0][[0, -1]]
108
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
109
+
110
+ def __len__(self):
111
+ return len(self.metas)
112
+
113
+ def __getitem__(self, idx):
114
+ instance_check = False
115
+ while not instance_check:
116
+ meta = self.metas[idx] # dict
117
+
118
+
119
+ video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
120
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], metas['frame_id'], metas['sample_frames_id'], meta['bins']
121
+
122
+
123
+ # clean up the caption
124
+ exp = " ".join(exp.lower().split())
125
+ category_id = category_dict[category]
126
+ vid_len = len(frames)
127
+
128
+ # num_frames = self.num_frames
129
+
130
+ # read frames and masks
131
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
132
+ for frame_indx in sample_frames_id:
133
+ frame_name = frames[frame_indx]
134
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
135
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
136
+ img = Image.open(img_path).convert('RGB')
137
+ mask = Image.open(mask_path).convert('P')
138
+
139
+ # create the target
140
+ label = torch.tensor(category_id)
141
+ mask = np.array(mask)
142
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
143
+ if (mask > 0).any():
144
+ y1, y2, x1, x2 = self.bounding_box(mask)
145
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
146
+ valid.append(1)
147
+ else: # some frame didn't contain the instance
148
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
149
+ valid.append(0)
150
+ mask = torch.from_numpy(mask)
151
+
152
+ # append
153
+ imgs.append(img)
154
+ labels.append(label)
155
+ masks.append(mask)
156
+ boxes.append(box)
157
+
158
+ # transform
159
+ w, h = img.size
160
+ labels = torch.stack(labels, dim=0)
161
+ boxes = torch.stack(boxes, dim=0)
162
+ boxes[:, 0::2].clamp_(min=0, max=w)
163
+ boxes[:, 1::2].clamp_(min=0, max=h)
164
+ masks = torch.stack(masks, dim=0)
165
+ target = {
166
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
167
+ 'labels': labels, # [T,]
168
+ 'boxes': boxes, # [T, 4], xyxy
169
+ 'masks': masks, # [T, H, W]
170
+ 'valid': torch.tensor(valid), # [T,]
171
+ 'caption': exp,
172
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
173
+ 'size': torch.as_tensor([int(h), int(w)])
174
+ }
175
+
176
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
177
+ if self._transforms:
178
+ imgs, target = self._transforms(imgs, target)
179
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
180
+ else:
181
+ imgs = np.array(imgs)
182
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
183
+
184
+
185
+ # FIXME: handle "valid", since some box may be removed due to random crop
186
+ if torch.any(target['valid'] == 1): # at least one instance
187
+ instance_check = True
188
+ else:
189
+ idx = random.randint(0, self.__len__() - 1)
190
+
191
+ return imgs, target
192
+
193
+
194
+ def make_coco_transforms(image_set, max_size=640):
195
+ normalize = T.Compose([
196
+ T.ToTensor(),
197
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
198
+ ])
199
+
200
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
201
+
202
+ if image_set == 'train':
203
+ return T.Compose([
204
+ T.RandomHorizontalFlip(),
205
+ T.PhotometricDistort(),
206
+ T.RandomSelect(
207
+ T.Compose([
208
+ T.RandomResize(scales, max_size=max_size),
209
+ T.Check(),
210
+ ]),
211
+ T.Compose([
212
+ T.RandomResize([400, 500, 600]),
213
+ T.RandomSizeCrop(384, 600),
214
+ T.RandomResize(scales, max_size=max_size),
215
+ T.Check(),
216
+ ])
217
+ ),
218
+ normalize,
219
+ ])
220
+
221
+ # we do not use the 'val' set since the annotations are inaccessible
222
+ if image_set == 'val':
223
+ return T.Compose([
224
+ T.RandomResize([360], max_size=640),
225
+ normalize,
226
+ ])
227
+
228
+ raise ValueError(f'unknown {image_set}')
229
+
230
+
231
+ def build(image_set, args):
232
+ root = Path(args.ytvos_path)
233
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
234
+ PATHS = {
235
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
236
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
237
+ }
238
+ img_folder, ann_file = PATHS[image_set]
239
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
240
+ # num_frames=args.num_frames, max_skip=args.max_skip)
241
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
242
+ num_frames=args.num_frames, max_skip=args.max_skip)
243
+ return dataset
244
+
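A minimal standalone sketch of the four-bin frame sampling used in prepare_metas above (the helper name sample_four_frames is ours, not part of the repository). It reproduces only the index arithmetic, which also shows where the ValueError guarded by the try/except comes from: bin_size drops to 0 once vid_len < 8, and the later snapshots below skip anything shorter than 11 frames instead.

import random

def sample_four_frames(vid_len: int):
    # Same index arithmetic as prepare_metas: drop the first two and last two frames.
    start_idx, end_idx = 2, vid_len - 2
    bin_size = (end_idx - start_idx) // 4

    bins = []
    for i in range(4):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < 3 else end_idx
        bins.append((bin_start, bin_end))

    # randint(s, e - 1) needs e - 1 >= s, i.e. bin_size >= 1; with bin_size == 0
    # it raises ValueError, the case the except branch above swallows.
    sample_indx = sorted(random.randint(s, e - 1) for s, e in bins)
    return bins, sample_indx

# Example: a 40-frame video is binned over indices [2, 38) and yields four sorted indices.
print(sample_four_frames(40))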
.history/datasets/ytvos_ref_20250113163406.py ADDED
@@ -0,0 +1,250 @@
 
 
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+ for exp_id, exp_dict in vid_data['expressions'].items():
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ for frame_id in sample_indx:
93
+ meta = {
94
+ 'video': vid,
95
+ 'exp': exp_dict['exp'],
96
+ 'obj_id': int(exp_dict['obj_id']),
97
+ 'frames': vid_frames,
98
+ 'frame_id' : frame_id,
99
+ 'sample_frames_id' : sample_indx,
100
+ 'bins': bins,
101
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
102
+ }
103
+ self.metas.append(meta)
104
+
105
+ print(f"skipped {skip_vid_count} short videos")
106
+
107
+
108
+ @staticmethod
109
+ def bounding_box(img):
110
+ rows = np.any(img, axis=1)
111
+ cols = np.any(img, axis=0)
112
+ rmin, rmax = np.where(rows)[0][[0, -1]]
113
+ cmin, cmax = np.where(cols)[0][[0, -1]]
114
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
115
+
116
+ def __len__(self):
117
+ return len(self.metas)
118
+
119
+ def __getitem__(self, idx):
120
+ instance_check = False
121
+ while not instance_check:
122
+ meta = self.metas[idx] # dict
123
+
124
+
125
+ video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
126
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id'], meta['sample_frames_id'], meta['bins']
127
+
128
+
129
+ # clean up the caption
130
+ exp = " ".join(exp.lower().split())
131
+ category_id = category_dict[category]
132
+ vid_len = len(frames)
133
+
134
+ # num_frames = self.num_frames
135
+
136
+ # read frames and masks
137
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
138
+ for frame_indx in sample_frames_id:
139
+ frame_name = frames[frame_indx]
140
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
141
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
142
+ img = Image.open(img_path).convert('RGB')
143
+ mask = Image.open(mask_path).convert('P')
144
+
145
+ # create the target
146
+ label = torch.tensor(category_id)
147
+ mask = np.array(mask)
148
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
149
+ if (mask > 0).any():
150
+ y1, y2, x1, x2 = self.bounding_box(mask)
151
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
152
+ valid.append(1)
153
+ else: # some frame didn't contain the instance
154
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
155
+ valid.append(0)
156
+ mask = torch.from_numpy(mask)
157
+
158
+ # append
159
+ imgs.append(img)
160
+ labels.append(label)
161
+ masks.append(mask)
162
+ boxes.append(box)
163
+
164
+ # transform
165
+ w, h = img.size
166
+ labels = torch.stack(labels, dim=0)
167
+ boxes = torch.stack(boxes, dim=0)
168
+ boxes[:, 0::2].clamp_(min=0, max=w)
169
+ boxes[:, 1::2].clamp_(min=0, max=h)
170
+ masks = torch.stack(masks, dim=0)
171
+ target = {
172
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
173
+ 'labels': labels, # [T,]
174
+ 'boxes': boxes, # [T, 4], xyxy
175
+ 'masks': masks, # [T, H, W]
176
+ 'valid': torch.tensor(valid), # [T,]
177
+ 'caption': exp,
178
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
179
+ 'size': torch.as_tensor([int(h), int(w)])
180
+ }
181
+
182
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
183
+ if self._transforms:
184
+ imgs, target = self._transforms(imgs, target)
185
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
186
+ else:
187
+ imgs = np.array(imgs)
188
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
189
+
190
+
191
+ # FIXME: handle "valid", since some box may be removed due to random crop
192
+ if torch.any(target['valid'] == 1): # at least one instance
193
+ instance_check = True
194
+ else:
195
+ idx = random.randint(0, self.__len__() - 1)
196
+
197
+ return imgs, target
198
+
199
+
200
+ def make_coco_transforms(image_set, max_size=640):
201
+ normalize = T.Compose([
202
+ T.ToTensor(),
203
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
204
+ ])
205
+
206
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
207
+
208
+ if image_set == 'train':
209
+ return T.Compose([
210
+ T.RandomHorizontalFlip(),
211
+ T.PhotometricDistort(),
212
+ T.RandomSelect(
213
+ T.Compose([
214
+ T.RandomResize(scales, max_size=max_size),
215
+ T.Check(),
216
+ ]),
217
+ T.Compose([
218
+ T.RandomResize([400, 500, 600]),
219
+ T.RandomSizeCrop(384, 600),
220
+ T.RandomResize(scales, max_size=max_size),
221
+ T.Check(),
222
+ ])
223
+ ),
224
+ normalize,
225
+ ])
226
+
227
+ # we do not use the 'val' set since the annotations are inaccessible
228
+ if image_set == 'val':
229
+ return T.Compose([
230
+ T.RandomResize([360], max_size=640),
231
+ normalize,
232
+ ])
233
+
234
+ raise ValueError(f'unknown {image_set}')
235
+
236
+
237
+ def build(image_set, args):
238
+ root = Path(args.ytvos_path)
239
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
240
+ PATHS = {
241
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
242
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
243
+ }
244
+ img_folder, ann_file = PATHS[image_set]
245
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
246
+ # num_frames=args.num_frames, max_skip=args.max_skip)
247
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
248
+ num_frames=args.num_frames, max_skip=args.max_skip)
249
+ return dataset
250
+
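A quick standalone check of the bounding_box helper defined in the class above; the toy mask is made up for illustration. It also shows why __getitem__ tests (mask > 0).any() first: on an all-zero mask, np.where returns empty arrays and the [0, -1] indexing would fail.

import numpy as np

def bounding_box(img):
    # verbatim logic from YTVOSDataset.bounding_box
    rows = np.any(img, axis=1)
    cols = np.any(img, axis=0)
    rmin, rmax = np.where(rows)[0][[0, -1]]
    cmin, cmax = np.where(cols)[0][[0, -1]]
    return rmin, rmax, cmin, cmax  # y1, y2, x1, x2

mask = np.zeros((6, 8), dtype=np.float32)
mask[2:5, 3:7] = 1.0                  # a 3x4 blob standing in for one object
y1, y2, x1, x2 = bounding_box(mask)
print(y1, y2, x1, x2)                 # -> 2 4 3 6 (inclusive pixel extents)
# __getitem__ then packs these as an xyxy box: [x1, y1, x2, y2].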
.history/datasets/ytvos_ref_20250113163605.py ADDED
@@ -0,0 +1,250 @@
 
 
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+ for exp_id, exp_dict in vid_data['expressions'].items():
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ for frame_id in sample_indx:
93
+ meta = {
94
+ 'video': vid,
95
+ 'exp': exp_dict['exp'],
96
+ 'obj_id': int(exp_dict['obj_id']),
97
+ 'frames': vid_frames,
98
+ 'frame_id' : frame_id,
99
+ 'sample_frames_id' : sample_indx,
100
+ 'bins': bins,
101
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
102
+ }
103
+ self.metas.append(meta)
104
+
105
+ print(f"skipped {skip_vid_count} short videos")
106
+
107
+
108
+ @staticmethod
109
+ def bounding_box(img):
110
+ rows = np.any(img, axis=1)
111
+ cols = np.any(img, axis=0)
112
+ rmin, rmax = np.where(rows)[0][[0, -1]]
113
+ cmin, cmax = np.where(cols)[0][[0, -1]]
114
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
115
+
116
+ def __len__(self):
117
+ return len(self.metas)
118
+
119
+ def __getitem__(self, idx):
120
+ instance_check = False
121
+ while not instance_check:
122
+ meta = self.metas[idx] # dict
123
+
124
+
125
+ video, exp, obj_id, category, frames, frame_id, sample_frames_id, bins = \
126
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['frame_id'], meta['sample_frames_id'], meta['bins']
127
+
128
+
129
+ # clean up the caption
130
+ exp = " ".join(exp.lower().split())
131
+ category_id = category_dict[category]
132
+ vid_len = len(frames)
133
+
134
+ # num_frames = self.num_frames
135
+
136
+ # read frames and masks
137
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
138
+ for frame_indx in sample_frames_id:
139
+ frame_name = frames[frame_indx]
140
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
141
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
142
+ img = Image.open(img_path).convert('RGB')
143
+ mask = Image.open(mask_path).convert('P')
144
+
145
+ # create the target
146
+ label = torch.tensor(category_id)
147
+ mask = np.array(mask)
148
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
149
+ if (mask > 0).any():
150
+ y1, y2, x1, x2 = self.bounding_box(mask)
151
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
152
+ valid.append(1)
153
+ else: # some frame didn't contain the instance
154
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
155
+ valid.append(0)
156
+ mask = torch.from_numpy(mask)
157
+
158
+ # append
159
+ imgs.append(img)
160
+ labels.append(label)
161
+ masks.append(mask)
162
+ boxes.append(box)
163
+
164
+ # transform
165
+ w, h = img.size
166
+ labels = torch.stack(labels, dim=0)
167
+ boxes = torch.stack(boxes, dim=0)
168
+ boxes[:, 0::2].clamp_(min=0, max=w)
169
+ boxes[:, 1::2].clamp_(min=0, max=h)
170
+ masks = torch.stack(masks, dim=0)
171
+ target = {
172
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
173
+ 'labels': labels, # [T,]
174
+ 'boxes': boxes, # [T, 4], xyxy
175
+ 'masks': masks, # [T, H, W]
176
+ 'valid': torch.tensor(valid), # [T,]
177
+ 'caption': exp,
178
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
179
+ 'size': torch.as_tensor([int(h), int(w)])
180
+ }
181
+
182
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
183
+ if self._transforms:
184
+ imgs, target = self._transforms(imgs, target)
185
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
186
+ else:
187
+ imgs = np.array(imgs)
188
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
189
+
190
+
191
+ # FIXME: handle "valid", since some box may be removed due to random crop
192
+ if torch.any(target['valid'] == 1): # at least one instance
193
+ instance_check = True
194
+ else:
195
+ idx = random.randint(0, self.__len__() - 1)
196
+
197
+ return imgs, target
198
+
199
+
200
+ def make_coco_transforms(image_set, max_size=640):
201
+ normalize = T.Compose([
202
+ T.ToTensor(),
203
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
204
+ ])
205
+
206
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
207
+
208
+ if image_set == 'train':
209
+ return T.Compose([
210
+ T.RandomHorizontalFlip(),
211
+ T.PhotometricDistort(),
212
+ T.RandomSelect(
213
+ T.Compose([
214
+ T.RandomResize(scales, max_size=max_size),
215
+ T.Check(),
216
+ ]),
217
+ T.Compose([
218
+ T.RandomResize([400, 500, 600]),
219
+ T.RandomSizeCrop(384, 600),
220
+ T.RandomResize(scales, max_size=max_size),
221
+ T.Check(),
222
+ ])
223
+ ),
224
+ normalize,
225
+ ])
226
+
227
+ # we do not use the 'val' set since the annotations are inaccessible
228
+ if image_set == 'val':
229
+ return T.Compose([
230
+ T.RandomResize([360], max_size=640),
231
+ normalize,
232
+ ])
233
+
234
+ raise ValueError(f'unknown {image_set}')
235
+
236
+
237
+ def build(image_set, args):
238
+ root = Path(args.ytvos_path)
239
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
240
+ PATHS = {
241
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
242
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
243
+ }
244
+ img_folder, ann_file = PATHS[image_set]
245
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
246
+ # num_frames=args.num_frames, max_skip=args.max_skip)
247
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
248
+ num_frames=args.num_frames, max_skip=args.max_skip)
249
+ return dataset
250
+
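Assuming a dataset has already been built from this snapshot with transforms=None (see the build() sketch after the next snapshot), a small shape sanity check for one returned sample; the helper below is illustrative and not part of the repository. T is always 4 here because one frame is drawn from each bin.

def check_one_item(dataset, idx=0):
    imgs, target = dataset[idx]                            # as returned by __getitem__
    assert imgs.shape[0] == 4 and imgs.shape[1] == 3       # [T, 3, H, W] uint8 frames
    assert tuple(target['boxes'].shape) == (4, 4)          # per-frame xyxy boxes in pixels
    assert target['masks'].shape[0] == 4                   # [T, H, W] binary masks
    assert target['valid'].numel() == 4 and target['valid'].any()
    return imgs.shape, target['caption']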
.history/datasets/ytvos_ref_20250113180729.py ADDED
@@ -0,0 +1,250 @@
 
 
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+ for exp_id, exp_dict in vid_data['expressions'].items():
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ for sample_id in sample_indx:
93
+ meta = {
94
+ 'video': vid,
95
+ 'exp': exp_dict['exp'],
96
+ 'obj_id': int(exp_dict['obj_id']),
97
+ 'frames': vid_frames,
98
+ 'sample_id' : sample_id,
99
+ 'sample_frames_id' : sample_indx,
100
+ 'bins': bins,
101
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
102
+ }
103
+ self.metas.append(meta)
104
+
105
+ print(f"skipped {skip_vid_count} short videos")
106
+
107
+
108
+ @staticmethod
109
+ def bounding_box(img):
110
+ rows = np.any(img, axis=1)
111
+ cols = np.any(img, axis=0)
112
+ rmin, rmax = np.where(rows)[0][[0, -1]]
113
+ cmin, cmax = np.where(cols)[0][[0, -1]]
114
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
115
+
116
+ def __len__(self):
117
+ return len(self.metas)
118
+
119
+ def __getitem__(self, idx):
120
+ instance_check = False
121
+ while not instance_check:
122
+ meta = self.metas[idx] # dict
123
+
124
+
125
+ video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
126
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
127
+
128
+
129
+ # clean up the caption
130
+ exp = " ".join(exp.lower().split())
131
+ category_id = category_dict[category]
132
+ vid_len = len(frames)
133
+
134
+ # num_frames = self.num_frames
135
+
136
+ # read frames and masks
137
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
138
+ for frame_indx in sample_frames_id:
139
+ frame_name = frames[frame_indx]
140
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
141
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
142
+ img = Image.open(img_path).convert('RGB')
143
+ mask = Image.open(mask_path).convert('P')
144
+
145
+ # create the target
146
+ label = torch.tensor(category_id)
147
+ mask = np.array(mask)
148
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
149
+ if (mask > 0).any():
150
+ y1, y2, x1, x2 = self.bounding_box(mask)
151
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
152
+ valid.append(1)
153
+ else: # some frame didn't contain the instance
154
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
155
+ valid.append(0)
156
+ mask = torch.from_numpy(mask)
157
+
158
+ # append
159
+ imgs.append(img)
160
+ labels.append(label)
161
+ masks.append(mask)
162
+ boxes.append(box)
163
+
164
+ # transform
165
+ w, h = img.size
166
+ labels = torch.stack(labels, dim=0)
167
+ boxes = torch.stack(boxes, dim=0)
168
+ boxes[:, 0::2].clamp_(min=0, max=w)
169
+ boxes[:, 1::2].clamp_(min=0, max=h)
170
+ masks = torch.stack(masks, dim=0)
171
+ target = {
172
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
173
+ 'labels': labels, # [T,]
174
+ 'boxes': boxes, # [T, 4], xyxy
175
+ 'masks': masks, # [T, H, W]
176
+ 'valid': torch.tensor(valid), # [T,]
177
+ 'caption': exp,
178
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
179
+ 'size': torch.as_tensor([int(h), int(w)])
180
+ }
181
+
182
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
183
+ if self._transforms:
184
+ imgs, target = self._transforms(imgs, target)
185
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
186
+ else:
187
+ imgs = np.array(imgs)
188
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
189
+
190
+
191
+ # FIXME: handle "valid", since some box may be removed due to random crop
192
+ if torch.any(target['valid'] == 1): # at least one instance
193
+ instance_check = True
194
+ else:
195
+ idx = random.randint(0, self.__len__() - 1)
196
+
197
+ return imgs, target
198
+
199
+
200
+ def make_coco_transforms(image_set, max_size=640):
201
+ normalize = T.Compose([
202
+ T.ToTensor(),
203
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
204
+ ])
205
+
206
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
207
+
208
+ if image_set == 'train':
209
+ return T.Compose([
210
+ T.RandomHorizontalFlip(),
211
+ T.PhotometricDistort(),
212
+ T.RandomSelect(
213
+ T.Compose([
214
+ T.RandomResize(scales, max_size=max_size),
215
+ T.Check(),
216
+ ]),
217
+ T.Compose([
218
+ T.RandomResize([400, 500, 600]),
219
+ T.RandomSizeCrop(384, 600),
220
+ T.RandomResize(scales, max_size=max_size),
221
+ T.Check(),
222
+ ])
223
+ ),
224
+ normalize,
225
+ ])
226
+
227
+ # we do not use the 'val' set since the annotations are inaccessible
228
+ if image_set == 'val':
229
+ return T.Compose([
230
+ T.RandomResize([360], max_size=640),
231
+ normalize,
232
+ ])
233
+
234
+ raise ValueError(f'unknown {image_set}')
235
+
236
+
237
+ def build(image_set, args):
238
+ root = Path(args.ytvos_path)
239
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
240
+ PATHS = {
241
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
242
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
243
+ }
244
+ img_folder, ann_file = PATHS[image_set]
245
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
246
+ # num_frames=args.num_frames, max_skip=args.max_skip)
247
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
248
+ num_frames=args.num_frames, max_skip=args.max_skip)
249
+ return dataset
250
+
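A hedged sketch of how the build() function above is typically driven; the argument values are placeholders and the real training script defines these flags itself (only ytvos_path, masks, num_frames and max_skip are read here). The import targets the live datasets/ytvos_ref.py module that these .history snapshots track.

from argparse import Namespace

from datasets.ytvos_ref import build

args = Namespace(
    ytvos_path='/path/to/ref-youtube-vos',   # must contain train/ and meta_expressions/
    masks=True,
    num_frames=4,
    max_skip=3,
)
dataset = build('train', args)   # asserts the path exists, then builds the per-frame metas
print(len(dataset))              # one entry per (video, expression, sampled frame)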
.history/datasets/ytvos_ref_20250114201918.py ADDED
@@ -0,0 +1,253 @@
 
 
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+ print(f"vid_data: {vid_data}")
74
+ print(f"vid_meta: {vid_meta}")
75
+ return
76
+ for exp_id, exp_dict in vid_data['expressions'].items():
77
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
78
+ start_idx , end_idx = 2, vid_len-2
79
+ bin_size = (end_idx - start_idx) // 4
80
+
81
+ bins = []
82
+ for i in range(4):
83
+ bin_start = start_idx + i * bin_size
84
+ bin_end = bin_start + bin_size if i < 3 else end_idx
85
+
86
+ bins.append((bin_start, bin_end))
87
+
88
+ # Random sample one frame from each bin
89
+ sample_indx = []
90
+ for start_idx, end_idx in bins:
91
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
92
+ sample_indx.sort() # Ensure indices are in order
93
+
94
+
95
+ for sample_id in sample_indx:
96
+ meta = {
97
+ 'video': vid,
98
+ 'exp': exp_dict['exp'],
99
+ 'obj_id': int(exp_dict['obj_id']),
100
+ 'frames': vid_frames,
101
+ 'sample_id' : sample_id,
102
+ 'sample_frames_id' : sample_indx,
103
+ 'bins': bins,
104
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
105
+ }
106
+ self.metas.append(meta)
107
+
108
+ print(f"skipped {skip_vid_count} short videos")
109
+
110
+
111
+ @staticmethod
112
+ def bounding_box(img):
113
+ rows = np.any(img, axis=1)
114
+ cols = np.any(img, axis=0)
115
+ rmin, rmax = np.where(rows)[0][[0, -1]]
116
+ cmin, cmax = np.where(cols)[0][[0, -1]]
117
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
118
+
119
+ def __len__(self):
120
+ return len(self.metas)
121
+
122
+ def __getitem__(self, idx):
123
+ instance_check = False
124
+ while not instance_check:
125
+ meta = self.metas[idx] # dict
126
+
127
+
128
+ video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
129
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
130
+
131
+
132
+ # clean up the caption
133
+ exp = " ".join(exp.lower().split())
134
+ category_id = category_dict[category]
135
+ vid_len = len(frames)
136
+
137
+ # num_frames = self.num_frames
138
+
139
+ # read frames and masks
140
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
141
+ for frame_indx in sample_frames_id:
142
+ frame_name = frames[frame_indx]
143
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
144
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
145
+ img = Image.open(img_path).convert('RGB')
146
+ mask = Image.open(mask_path).convert('P')
147
+
148
+ # create the target
149
+ label = torch.tensor(category_id)
150
+ mask = np.array(mask)
151
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
152
+ if (mask > 0).any():
153
+ y1, y2, x1, x2 = self.bounding_box(mask)
154
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
155
+ valid.append(1)
156
+ else: # some frame didn't contain the instance
157
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
158
+ valid.append(0)
159
+ mask = torch.from_numpy(mask)
160
+
161
+ # append
162
+ imgs.append(img)
163
+ labels.append(label)
164
+ masks.append(mask)
165
+ boxes.append(box)
166
+
167
+ # transform
168
+ w, h = img.size
169
+ labels = torch.stack(labels, dim=0)
170
+ boxes = torch.stack(boxes, dim=0)
171
+ boxes[:, 0::2].clamp_(min=0, max=w)
172
+ boxes[:, 1::2].clamp_(min=0, max=h)
173
+ masks = torch.stack(masks, dim=0)
174
+ target = {
175
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
176
+ 'labels': labels, # [T,]
177
+ 'boxes': boxes, # [T, 4], xyxy
178
+ 'masks': masks, # [T, H, W]
179
+ 'valid': torch.tensor(valid), # [T,]
180
+ 'caption': exp,
181
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
182
+ 'size': torch.as_tensor([int(h), int(w)])
183
+ }
184
+
185
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
186
+ if self._transforms:
187
+ imgs, target = self._transforms(imgs, target)
188
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
189
+ else:
190
+ imgs = np.array(imgs)
191
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
192
+
193
+
194
+ # FIXME: handle "valid", since some box may be removed due to random crop
195
+ if torch.any(target['valid'] == 1): # at least one instance
196
+ instance_check = True
197
+ else:
198
+ idx = random.randint(0, self.__len__() - 1)
199
+
200
+ return imgs, target
201
+
202
+
203
+ def make_coco_transforms(image_set, max_size=640):
204
+ normalize = T.Compose([
205
+ T.ToTensor(),
206
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
207
+ ])
208
+
209
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
210
+
211
+ if image_set == 'train':
212
+ return T.Compose([
213
+ T.RandomHorizontalFlip(),
214
+ T.PhotometricDistort(),
215
+ T.RandomSelect(
216
+ T.Compose([
217
+ T.RandomResize(scales, max_size=max_size),
218
+ T.Check(),
219
+ ]),
220
+ T.Compose([
221
+ T.RandomResize([400, 500, 600]),
222
+ T.RandomSizeCrop(384, 600),
223
+ T.RandomResize(scales, max_size=max_size),
224
+ T.Check(),
225
+ ])
226
+ ),
227
+ normalize,
228
+ ])
229
+
230
+ # we do not use the 'val' set since the annotations are inaccessible
231
+ if image_set == 'val':
232
+ return T.Compose([
233
+ T.RandomResize([360], max_size=640),
234
+ normalize,
235
+ ])
236
+
237
+ raise ValueError(f'unknown {image_set}')
238
+
239
+
240
+ def build(image_set, args):
241
+ root = Path(args.ytvos_path)
242
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
243
+ PATHS = {
244
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
245
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
246
+ }
247
+ img_folder, ann_file = PATHS[image_set]
248
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
249
+ # num_frames=args.num_frames, max_skip=args.max_skip)
250
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
251
+ num_frames=args.num_frames, max_skip=args.max_skip)
252
+ return dataset
253
+
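For reference, the commented-out constructor call above is where make_coco_transforms would be wired in. A minimal sketch of applying it directly, following the (imgs, target) calling convention used in __getitem__; pil_frames and target are placeholders for a list of PIL frames and the target dict built there, and the function name is ours.

def augment_clip(pil_frames, target):
    transforms = make_coco_transforms('train', max_size=640)
    imgs, target = transforms(pil_frames, target)    # video-level ops act on the whole clip
    # boxes come back normalized to [0, 1] and in cxcywh, per the comment in __getitem__
    return torch.stack(imgs, dim=0), target          # [T, 3, H, W]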
.history/datasets/ytvos_ref_20250114202502.py ADDED
@@ -0,0 +1,250 @@
 
 
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+ for exp_id, exp_dict in vid_data['expressions'].items():
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ for sample_id in sample_indx:
93
+ meta = {
94
+ 'video': vid,
95
+ 'exp': exp_dict['exp'],
96
+ 'obj_id': int(exp_dict['obj_id']),
97
+ 'frames': vid_frames,
98
+ 'sample_id' : sample_id,
99
+ 'sample_frames_id' : sample_indx,
100
+ 'bins': bins,
101
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
102
+ }
103
+ self.metas.append(meta)
104
+
105
+ print(f"skipped {skip_vid_count} short videos")
106
+
107
+
108
+ @staticmethod
109
+ def bounding_box(img):
110
+ rows = np.any(img, axis=1)
111
+ cols = np.any(img, axis=0)
112
+ rmin, rmax = np.where(rows)[0][[0, -1]]
113
+ cmin, cmax = np.where(cols)[0][[0, -1]]
114
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
115
+
116
+ def __len__(self):
117
+ return len(self.metas)
118
+
119
+ def __getitem__(self, idx):
120
+ instance_check = False
121
+ while not instance_check:
122
+ meta = self.metas[idx] # dict
123
+
124
+
125
+ video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
126
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
127
+
128
+
129
+ # clean up the caption
130
+ exp = " ".join(exp.lower().split())
131
+ category_id = category_dict[category]
132
+ vid_len = len(frames)
133
+
134
+ # num_frames = self.num_frames
135
+
136
+ # read frames and masks
137
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
138
+ for frame_indx in sample_frames_id:
139
+ frame_name = frames[frame_indx]
140
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
141
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
142
+ img = Image.open(img_path).convert('RGB')
143
+ mask = Image.open(mask_path).convert('P')
144
+
145
+ # create the target
146
+ label = torch.tensor(category_id)
147
+ mask = np.array(mask)
148
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
149
+ if (mask > 0).any():
150
+ y1, y2, x1, x2 = self.bounding_box(mask)
151
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
152
+ valid.append(1)
153
+ else: # some frame didn't contain the instance
154
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
155
+ valid.append(0)
156
+ mask = torch.from_numpy(mask)
157
+
158
+ # append
159
+ imgs.append(img)
160
+ labels.append(label)
161
+ masks.append(mask)
162
+ boxes.append(box)
163
+
164
+ # transform
165
+ w, h = img.size
166
+ labels = torch.stack(labels, dim=0)
167
+ boxes = torch.stack(boxes, dim=0)
168
+ boxes[:, 0::2].clamp_(min=0, max=w)
169
+ boxes[:, 1::2].clamp_(min=0, max=h)
170
+ masks = torch.stack(masks, dim=0)
171
+ target = {
172
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
173
+ 'labels': labels, # [T,]
174
+ 'boxes': boxes, # [T, 4], xyxy
175
+ 'masks': masks, # [T, H, W]
176
+ 'valid': torch.tensor(valid), # [T,]
177
+ 'caption': exp,
178
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
179
+ 'size': torch.as_tensor([int(h), int(w)])
180
+ }
181
+
182
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
183
+ if self._transforms:
184
+ imgs, target = self._transforms(imgs, target)
185
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
186
+ else:
187
+ imgs = np.array(imgs)
188
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
189
+
190
+
191
+ # FIXME: handle "valid", since some box may be removed due to random crop
192
+ if torch.any(target['valid'] == 1): # at least one instance
193
+ instance_check = True
194
+ else:
195
+ idx = random.randint(0, self.__len__() - 1)
196
+
197
+ return imgs, target
198
+
199
+
200
+ def make_coco_transforms(image_set, max_size=640):
201
+ normalize = T.Compose([
202
+ T.ToTensor(),
203
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
204
+ ])
205
+
206
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
207
+
208
+ if image_set == 'train':
209
+ return T.Compose([
210
+ T.RandomHorizontalFlip(),
211
+ T.PhotometricDistort(),
212
+ T.RandomSelect(
213
+ T.Compose([
214
+ T.RandomResize(scales, max_size=max_size),
215
+ T.Check(),
216
+ ]),
217
+ T.Compose([
218
+ T.RandomResize([400, 500, 600]),
219
+ T.RandomSizeCrop(384, 600),
220
+ T.RandomResize(scales, max_size=max_size),
221
+ T.Check(),
222
+ ])
223
+ ),
224
+ normalize,
225
+ ])
226
+
227
+ # we do not use the 'val' set since the annotations are inaccessible
228
+ if image_set == 'val':
229
+ return T.Compose([
230
+ T.RandomResize([360], max_size=640),
231
+ normalize,
232
+ ])
233
+
234
+ raise ValueError(f'unknown {image_set}')
235
+
236
+
237
+ def build(image_set, args):
238
+ root = Path(args.ytvos_path)
239
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
240
+ PATHS = {
241
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
242
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
243
+ }
244
+ img_folder, ann_file = PATHS[image_set]
245
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
246
+ # num_frames=args.num_frames, max_skip=args.max_skip)
247
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
248
+ num_frames=args.num_frames, max_skip=args.max_skip)
249
+ return dataset
250
+
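The per-object mask extraction in __getitem__ relies on the Annotations PNGs being palette ('P' mode) images whose pixel values are object ids, with 0 as background; a small self-contained illustration with a synthetic annotation image (the array values are made up):

import numpy as np
from PIL import Image

ids = np.zeros((4, 6), dtype=np.uint8)
ids[1:3, 2:5] = 2                        # object id 2 occupies a small patch
ann = Image.fromarray(ids, mode='P')     # stands in for Annotations/<video>/<frame>.png

obj_id = 2                               # the 'obj_id' stored in the meta dict
mask = (np.array(ann) == obj_id).astype(np.float32)
print(mask.sum())                        # 6.0 pixels of object 2 in this frame
# A frame where the object is absent yields an all-zero mask, which is why
# __getitem__ records valid = 0 and a [0, 0, 0, 0] box for that frame.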
.history/datasets/ytvos_ref_20250114205233.py ADDED
@@ -0,0 +1,252 @@
 
 
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.vid_meta, self.vid_data = self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ return vid_meta, vid_data
69
+
70
+ if vid_len < 11:
71
+ #print(f"Too short video: {vid} with frame length {vid_len}")
72
+ skip_vid_count += 1
73
+ continue
74
+
75
+ for exp_id, exp_dict in vid_data['expressions'].items():
76
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
77
+ start_idx , end_idx = 2, vid_len-2
78
+ bin_size = (end_idx - start_idx) // 4
79
+
80
+ bins = []
81
+ for i in range(4):
82
+ bin_start = start_idx + i * bin_size
83
+ bin_end = bin_start + bin_size if i < 3 else end_idx
84
+
85
+ bins.append((bin_start, bin_end))
86
+
87
+ # Random sample one frame from each bin
88
+ sample_indx = []
89
+ for start_idx, end_idx in bins:
90
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
91
+ sample_indx.sort() # Ensure indices are in order
92
+
93
+
94
+ for sample_id in sample_indx:
95
+ meta = {
96
+ 'video': vid,
97
+ 'exp': exp_dict['exp'],
98
+ 'obj_id': int(exp_dict['obj_id']),
99
+ 'frames': vid_frames,
100
+ 'sample_id' : sample_id,
101
+ 'sample_frames_id' : sample_indx,
102
+ 'bins': bins,
103
+ 'category': vid_meta['objects'][exp_dict['obj_id']]['category']
104
+ }
105
+ self.metas.append(meta)
106
+
107
+ print(f"skipped {skip_vid_count} short videos")
108
+
109
+
110
+ @staticmethod
111
+ def bounding_box(img):
112
+ rows = np.any(img, axis=1)
113
+ cols = np.any(img, axis=0)
114
+ rmin, rmax = np.where(rows)[0][[0, -1]]
115
+ cmin, cmax = np.where(cols)[0][[0, -1]]
116
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
117
+
118
+ def __len__(self):
119
+ return len(self.metas)
120
+
121
+ def __getitem__(self, idx):
122
+ instance_check = False
123
+ while not instance_check:
124
+ meta = self.metas[idx] # dict
125
+
126
+
127
+ video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
128
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
129
+
130
+
131
+ # clean up the caption
132
+ exp = " ".join(exp.lower().split())
133
+ category_id = category_dict[category]
134
+ vid_len = len(frames)
135
+
136
+ # num_frames = self.num_frames
137
+
138
+ # read frames and masks
139
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
140
+ for frame_indx in sample_frames_id:
141
+ frame_name = frames[frame_indx]
142
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
143
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
144
+ img = Image.open(img_path).convert('RGB')
145
+ mask = Image.open(mask_path).convert('P')
146
+
147
+ # create the target
148
+ label = torch.tensor(category_id)
149
+ mask = np.array(mask)
150
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
151
+ if (mask > 0).any():
152
+ y1, y2, x1, x2 = self.bounding_box(mask)
153
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
154
+ valid.append(1)
155
+ else: # some frame didn't contain the instance
156
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
157
+ valid.append(0)
158
+ mask = torch.from_numpy(mask)
159
+
160
+ # append
161
+ imgs.append(img)
162
+ labels.append(label)
163
+ masks.append(mask)
164
+ boxes.append(box)
165
+
166
+ # transform
167
+ w, h = img.size
168
+ labels = torch.stack(labels, dim=0)
169
+ boxes = torch.stack(boxes, dim=0)
170
+ boxes[:, 0::2].clamp_(min=0, max=w)
171
+ boxes[:, 1::2].clamp_(min=0, max=h)
172
+ masks = torch.stack(masks, dim=0)
173
+ target = {
174
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
175
+ 'labels': labels, # [T,]
176
+ 'boxes': boxes, # [T, 4], xyxy
177
+ 'masks': masks, # [T, H, W]
178
+ 'valid': torch.tensor(valid), # [T,]
179
+ 'caption': exp,
180
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
181
+ 'size': torch.as_tensor([int(h), int(w)])
182
+ }
183
+
184
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
185
+ if self._transforms:
186
+ imgs, target = self._transforms(imgs, target)
187
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
188
+ else:
189
+ imgs = np.array(imgs)
190
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
191
+
192
+
193
+ # FIXME: handle "valid", since some box may be removed due to random crop
194
+ if torch.any(target['valid'] == 1): # at least one instance
195
+ instance_check = True
196
+ else:
197
+ idx = random.randint(0, self.__len__() - 1)
198
+
199
+ return imgs, target
200
+
201
+
202
+ def make_coco_transforms(image_set, max_size=640):
203
+ normalize = T.Compose([
204
+ T.ToTensor(),
205
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
206
+ ])
207
+
208
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
209
+
210
+ if image_set == 'train':
211
+ return T.Compose([
212
+ T.RandomHorizontalFlip(),
213
+ T.PhotometricDistort(),
214
+ T.RandomSelect(
215
+ T.Compose([
216
+ T.RandomResize(scales, max_size=max_size),
217
+ T.Check(),
218
+ ]),
219
+ T.Compose([
220
+ T.RandomResize([400, 500, 600]),
221
+ T.RandomSizeCrop(384, 600),
222
+ T.RandomResize(scales, max_size=max_size),
223
+ T.Check(),
224
+ ])
225
+ ),
226
+ normalize,
227
+ ])
228
+
229
+ # we do not use the 'val' set since the annotations are inaccessible
230
+ if image_set == 'val':
231
+ return T.Compose([
232
+ T.RandomResize([360], max_size=640),
233
+ normalize,
234
+ ])
235
+
236
+ raise ValueError(f'unknown {image_set}')
237
+
238
+
239
+ def build(image_set, args):
240
+ root = Path(args.ytvos_path)
241
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
242
+ PATHS = {
243
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
244
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
245
+ }
246
+ img_folder, ann_file = PATHS[image_set]
247
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
248
+ # num_frames=args.num_frames, max_skip=args.max_skip)
249
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
250
+ num_frames=args.num_frames, max_skip=args.max_skip)
251
+ return dataset
252
+
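Note: the prepare_metas logic above drops the first and last two frames, splits the remaining range into four equal bins, and samples one random frame index per bin. A minimal standalone sketch of that rule (hypothetical helper name, illustrative only, not part of the repo):

import random

def sample_one_frame_per_bin(vid_len, num_bins=4, margin=2, seed=None):
    # Exclude the first/last `margin` frames, split the rest into `num_bins`
    # equal bins, and draw one random frame index from each bin.
    rng = random.Random(seed)
    start_idx, end_idx = margin, vid_len - margin
    bin_size = (end_idx - start_idx) // num_bins
    bins = []
    for i in range(num_bins):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < num_bins - 1 else end_idx
        bins.append((bin_start, bin_end))
    sample_indx = sorted(rng.randint(lo, hi - 1) for lo, hi in bins)
    return bins, sample_indx

# e.g. a 30-frame clip -> bins (2, 8), (8, 14), (14, 20), (20, 28) and four sorted indices
print(sample_one_frame_per_bin(30, seed=0))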
.history/datasets/ytvos_ref_20250114210537.py ADDED
@@ -0,0 +1,250 @@
1
+ """
2
+ Ref-YoutubeVOS data loader
3
+ """
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from torch.autograd.grad_mode import F
8
+ from torch.utils.data import Dataset
9
+ import datasets.transforms_video as T
10
+
11
+ import os
12
+ from PIL import Image
13
+ import json
14
+ import numpy as np
15
+ import random
16
+
17
+ from datasets.categories import ytvos_category_dict as category_dict
18
+
19
+
20
+ class YTVOSDataset(Dataset):
21
+ """
22
+ A dataset class for the Refer-Youtube-VOS dataset which was first introduced in the paper:
23
+ "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
24
+ (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
25
+ The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the first
26
+ dataset is not publicly available anymore as now only the harder 'full-video' subset is available to download
27
+ through the Youtube-VOS referring video object segmentation competition page at:
28
+ https://competitions.codalab.org/competitions/29139
29
+ Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
30
+ two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
31
+ currently only be done on the competition 'validation' subset using the competition's server, as
32
+ annotations were publicly released only for the 'train' subset of the competition.
33
+
34
+ """
35
+ def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
36
+ num_frames: int, max_skip: int):
37
+ self.img_folder = img_folder
38
+ self.ann_file = ann_file
39
+ self._transforms = transforms
40
+ self.return_masks = return_masks # not used
41
+ self.num_frames = num_frames
42
+ self.max_skip = max_skip
43
+ # create video meta data
44
+ self.prepare_metas()
45
+
46
+ print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
47
+ print('\n')
48
+
49
+ def prepare_metas(self):
50
+ # read object information
51
+ with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
52
+ subset_metas_by_video = json.load(f)['videos']
53
+
54
+ # read expression data
55
+ with open(str(self.ann_file), 'r') as f:
56
+ subset_expressions_by_video = json.load(f)['videos']
57
+ self.videos = list(subset_expressions_by_video.keys())
58
+
59
+ self.metas = []
60
+ skip_vid_count = 0
61
+
62
+ for vid in self.videos:
63
+ vid_meta = subset_metas_by_video[vid]
64
+ vid_data = subset_expressions_by_video[vid]
65
+ vid_frames = sorted(vid_data['frames'])
66
+ vid_len = len(vid_frames)
67
+
68
+ if vid_len < 11:
69
+ #print(f"Too short video: {vid} with frame length {vid_len}")
70
+ skip_vid_count += 1
71
+ continue
72
+
73
+
74
+ # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
75
+ start_idx , end_idx = 2, vid_len-2
76
+ bin_size = (end_idx - start_idx) // 4
77
+
78
+ bins = []
79
+ for i in range(4):
80
+ bin_start = start_idx + i * bin_size
81
+ bin_end = bin_start + bin_size if i < 3 else end_idx
82
+
83
+ bins.append((bin_start, bin_end))
84
+
85
+ # Random sample one frame from each bin
86
+ sample_indx = []
87
+ for start_idx, end_idx in bins:
88
+ sample_indx.append(random.randint(start_idx, end_idx - 1))
89
+ sample_indx.sort() # Ensure indices are in order
90
+
91
+
92
+ meta = {
93
+ 'video':vid,
94
+ 'sample_indx':sample_indx,
95
+ 'bins':bins
96
+ }
97
+ obj_id_cat = {}
98
+ for exp_id, exp_dict in vid_data['expressions'].items():
99
+ obj_id = exp_dict['obj_id']
100
+ if obj_id not in obj_id_cat:
101
+ obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
102
+ meta['obj_id_cat'] = obj_id_cat
103
+ self.metas.append(meta)
104
+
105
+ print(f"skipped {skip_vid_count} short videos")
106
+
107
+
108
+ @staticmethod
109
+ def bounding_box(img):
110
+ rows = np.any(img, axis=1)
111
+ cols = np.any(img, axis=0)
112
+ rmin, rmax = np.where(rows)[0][[0, -1]]
113
+ cmin, cmax = np.where(cols)[0][[0, -1]]
114
+ return rmin, rmax, cmin, cmax # y1, y2, x1, x2
115
+
116
+ def __len__(self):
117
+ return len(self.metas)
118
+
119
+ def __getitem__(self, idx):
120
+ instance_check = False
121
+ while not instance_check:
122
+ meta = self.metas[idx] # dict
123
+
124
+
125
+ video, exp, obj_id, category, frames, sample_id, sample_frames_id, bins = \
126
+ meta['video'], meta['exp'], meta['obj_id'], meta['category'], meta['frames'], meta['sample_id'], meta['sample_frames_id'], meta['bins']
127
+
128
+
129
+ # clean up the caption
130
+ exp = " ".join(exp.lower().split())
131
+ category_id = category_dict[category]
132
+ vid_len = len(frames)
133
+
134
+ # num_frames = self.num_frames
135
+
136
+ # read frames and masks
137
+ imgs, labels, boxes, masks, valid = [], [], [], [], []
138
+ for frame_indx in sample_frames_id:
139
+ frame_name = frames[frame_indx]
140
+ img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
141
+ mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
142
+ img = Image.open(img_path).convert('RGB')
143
+ mask = Image.open(mask_path).convert('P')
144
+
145
+ # create the target
146
+ label = torch.tensor(category_id)
147
+ mask = np.array(mask)
148
+ mask = (mask==obj_id).astype(np.float32) # 0,1 binary
149
+ if (mask > 0).any():
150
+ y1, y2, x1, x2 = self.bounding_box(mask)
151
+ box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
152
+ valid.append(1)
153
+ else: # some frame didn't contain the instance
154
+ box = torch.tensor([0, 0, 0, 0]).to(torch.float)
155
+ valid.append(0)
156
+ mask = torch.from_numpy(mask)
157
+
158
+ # append
159
+ imgs.append(img)
160
+ labels.append(label)
161
+ masks.append(mask)
162
+ boxes.append(box)
163
+
164
+ # transform
165
+ w, h = img.size
166
+ labels = torch.stack(labels, dim=0)
167
+ boxes = torch.stack(boxes, dim=0)
168
+ boxes[:, 0::2].clamp_(min=0, max=w)
169
+ boxes[:, 1::2].clamp_(min=0, max=h)
170
+ masks = torch.stack(masks, dim=0)
171
+ target = {
172
+ 'frames_idx': torch.tensor(sample_frames_id), # [T,]
173
+ 'labels': labels, # [T,]
174
+ 'boxes': boxes, # [T, 4], xyxy
175
+ 'masks': masks, # [T, H, W]
176
+ 'valid': torch.tensor(valid), # [T,]
177
+ 'caption': exp,
178
+ 'orig_size': torch.as_tensor([int(h), int(w)]),
179
+ 'size': torch.as_tensor([int(h), int(w)])
180
+ }
181
+
182
+ # "boxes" normalize to [0, 1] and transform from xyxy to cxcywh in self._transform
183
+ if self._transforms:
184
+ imgs, target = self._transforms(imgs, target)
185
+ imgs = torch.stack(imgs, dim=0) # [T, 3, H, W]
186
+ else:
187
+ imgs = np.array(imgs)
188
+ imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
189
+
190
+
191
+ # FIXME: handle "valid", since some box may be removed due to random crop
192
+ if torch.any(target['valid'] == 1): # at least one instance
193
+ instance_check = True
194
+ else:
195
+ idx = random.randint(0, self.__len__() - 1)
196
+
197
+ return imgs, target
198
+
199
+
200
+ def make_coco_transforms(image_set, max_size=640):
201
+ normalize = T.Compose([
202
+ T.ToTensor(),
203
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
204
+ ])
205
+
206
+ scales = [288, 320, 352, 392, 416, 448, 480, 512]
207
+
208
+ if image_set == 'train':
209
+ return T.Compose([
210
+ T.RandomHorizontalFlip(),
211
+ T.PhotometricDistort(),
212
+ T.RandomSelect(
213
+ T.Compose([
214
+ T.RandomResize(scales, max_size=max_size),
215
+ T.Check(),
216
+ ]),
217
+ T.Compose([
218
+ T.RandomResize([400, 500, 600]),
219
+ T.RandomSizeCrop(384, 600),
220
+ T.RandomResize(scales, max_size=max_size),
221
+ T.Check(),
222
+ ])
223
+ ),
224
+ normalize,
225
+ ])
226
+
227
+ # we do not use the 'val' set since the annotations are inaccessible
228
+ if image_set == 'val':
229
+ return T.Compose([
230
+ T.RandomResize([360], max_size=640),
231
+ normalize,
232
+ ])
233
+
234
+ raise ValueError(f'unknown {image_set}')
235
+
236
+
237
+ def build(image_set, args):
238
+ root = Path(args.ytvos_path)
239
+ assert root.exists(), f'provided YTVOS path {root} does not exist'
240
+ PATHS = {
241
+ "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
242
+ "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"), # not used actually
243
+ }
244
+ img_folder, ann_file = PATHS[image_set]
245
+ # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
246
+ # num_frames=args.num_frames, max_skip=args.max_skip)
247
+ dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
248
+ num_frames=args.num_frames, max_skip=args.max_skip)
249
+ return dataset
250
+
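The bounding_box helper in both snapshots derives a tight box from a binary mask via row/column projections; a quick self-contained check of that idea on a toy mask (illustrative values only):

import numpy as np

mask = np.zeros((6, 8), dtype=np.float32)
mask[2:5, 3:7] = 1.0  # object covers rows 2-4 and cols 3-6

rows = np.any(mask, axis=1)
cols = np.any(mask, axis=0)
rmin, rmax = np.where(rows)[0][[0, -1]]
cmin, cmax = np.where(cols)[0][[0, -1]]
print(rmin, rmax, cmin, cmax)  # 2 4 3 6, i.e. (y1, y2, x1, x2)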
.history/make_ref-ytvos/annotate_ref_ytvos_20241227174304.py ADDED
@@ -0,0 +1,288 @@
1
+ from datasets import build_dataset
2
+ import argparse
3
+ import opts
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ from os import path as osp
8
+ import io
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ import regex as re
13
+ import json
14
+
15
+ import cv2
16
+ from PIL import Image
17
+ import torch
18
+ from torchvision.transforms import functional as F
19
+
20
+ from skimage import measure # (pip install scikit-image)
21
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
22
+
23
+ import matplotlib.pyplot as plt
24
+ from matplotlib.collections import PatchCollection
25
+ from matplotlib.patches import Rectangle
26
+
27
+ import ipywidgets as widgets
28
+ from IPython.display import display, clear_output
29
+
30
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
31
+ args = parser.parse_args()
32
+
33
+ #================== Load data ===================
34
+ # Entire dataset
35
+ train_dataset = build_dataset('ytvos', image_set = 'train', args = args)
36
+
37
+ # Metadata for the entire dataset
38
+ metas = train_dataset.metas
39
+
40
+ # Filtered frames
41
+ selected_frames_df = pd.read_json("selected_frames4.jsonl", lines = True)
42
+
43
+ #================== Mask creation functions ===================
44
+ def prepare_mask_for_pil(mask_tensor):
45
+ mask_array = mask_tensor.squeeze(0).cpu().numpy()
46
+ mask_array = (mask_array * 255).astype(np.uint8)
47
+ mask_image = Image.fromarray(mask_array)
48
+ return mask_image
49
+
50
+ def create_sub_masks(mask_image):
51
+ width, height = mask_image.size
52
+
53
+ sub_masks = {}
54
+ for x in range(width):
55
+ for y in range(height):
56
+ # Get the RGB values of the pixel
57
+ pixel = mask_image.getpixel((x, y))
58
+
59
+ # If the pixel is not black...
60
+ if pixel != 0 :
61
+ # Check to see if we've created a sub-mask...
62
+ pixel_str = str(pixel)
63
+ sub_mask = sub_masks.get(pixel_str)
64
+ if sub_mask is None:
65
+ # Create a sub-mask (one bit per pixel) and add to the dictionary
66
+ # Note: we add 1 pixel of padding in each direction
67
+ # because the contours module doesn't handle cases
68
+ # where pixels bleed to the edge of the image
69
+ sub_masks[pixel_str] = Image.new('1', (width+2, height+2))
70
+
71
+ # Set the pixel value to 1 (default is 0), accounting for padding
72
+ sub_masks[pixel_str].putpixel((x+1, y+1), 1)
73
+ return sub_masks
74
+
75
+ #================== Mask annotation function ===================
76
+ def create_sub_mask_annotation(sub_mask, image_id, annotation_id, is_crowd):
77
+ # Find contours (boundary lines) around each sub-mask
78
+ # Note: there could be multiple contours if the object
79
+ # is partially occluded. (E.g. an elephant behind a tree)
80
+ contours = measure.find_contours(sub_mask, 0.5, positive_orientation='low')
81
+
82
+ segmentations = []
83
+ polygons = []
84
+ for contour in contours:
85
+ # Flip from (row, col) representation to (x, y)
86
+ # and subtract the padding pixel
87
+ for i in range(len(contour)):
88
+ row, col = contour[i]
89
+ contour[i] = (col - 1, row - 1)
90
+
91
+ # Make a polygon and simplify it
92
+ poly = Polygon(contour)
93
+ poly = poly.simplify(1.0, preserve_topology=False)
94
+ polygons.append(poly)
95
+ segmentation = np.array(poly.exterior.coords).ravel().tolist()
96
+ segmentations.append(segmentation)
97
+
98
+ # Combine the polygons to calculate the bounding box and area
99
+ multi_poly = MultiPolygon(polygons)
100
+ x, y, max_x, max_y = multi_poly.bounds
101
+ width = max_x - x
102
+ height = max_y - y
103
+ bbox = (x, y, width, height)
104
+ area = multi_poly.area
105
+
106
+ annotation = {
107
+ 'segmentation': segmentations,
108
+ 'iscrowd': is_crowd,
109
+ 'image_id': image_id,
110
+ 'id': annotation_id,
111
+ 'bbox': bbox,
112
+ 'area': area
113
+ }
114
+ return annotation
115
+
116
+ #================== Visualization function ===================
117
+ # annotation dictionary as input
118
+ def showRef(annotation, image_dir, seg_box='seg'):
119
+ ax = plt.gca()
120
+ I = io.imread(osp.join(image_dir, annotation['file_name']))
121
+ ax.imshow(I)
122
+
123
+
124
+ for sid, sent in enumerate(annotation['sentences']):
125
+ print('%s. %s' % (sid + 1, sent))
126
+
127
+ if seg_box == 'seg':
128
+ polygons = []
129
+ color = []
130
+ c = (np.random.random((1, 3)) * 0.6 + 0.4).tolist()[0]
131
+
132
+ if type(annotation['segmentation'][0]) == list:
133
+ # polygon used for refcoco*
134
+ for seg in annotation['segmentation']:
135
+ poly = np.array(seg).reshape((int(len(seg) / 2), 2))
136
+ polygons.append(Polygon(poly))
137
+ color.append(c)
138
+
139
+ p = PatchCollection(polygons,
140
+ facecolors=(221/255, 160/255, 221/255), # light purple
141
+ linewidths=0,
142
+ alpha=0.4)
143
+ ax.add_collection(p)
144
+
145
+ p = PatchCollection(polygons,
146
+ facecolors='none',
147
+ edgecolors=color,
148
+ linewidths=2)
149
+ ax.add_collection(p)
150
+ # else:
151
+ # # mask used for refclef
152
+ # rle = annotation['segmentation']
153
+ # m = mask.decode(rle)
154
+ # img = np.ones((m.shape[0], m.shape[1], 3))
155
+ # color_mask = np.array([2.0, 166.0, 101.0]) / 255
156
+ # for i in range(3):
157
+ # img[:, :, i] = color_mask[i]
158
+ # ax.imshow(np.dstack((img, m * 0.5)))
159
+
160
+ # bounding box
161
+ elif seg_box == 'box':
162
+ bbox = annotation['bbox']
163
+ box_plot = Rectangle((bbox[0], bbox[1]),
164
+ bbox[2],
165
+ bbox[3],
166
+ fill=False,
167
+ edgecolor='green',
168
+ linewidth=3)
169
+ ax.add_patch(box_plot)
170
+
171
+ #================== Function that ties everything together ===================
172
+ def create_dict_from_selected_images(selected_frames_df):
173
+
174
+ image_id = 0
175
+ anno_id = 0
176
+ train_idx = 0
177
+
178
+ with open("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_instances2.jsonl", "w") as f:
179
+
180
+ for selected_idx in range(len(selected_frames_df)):
181
+ selected = selected_frames_df.loc[selected_idx]
182
+ selected_vid_id = selected['video']
183
+ selected_frame_id = selected['frame_id']
184
+
185
+ for obj_id in selected['objects'].keys():
186
+
187
+ selected_exp = selected['objects'][obj_id][0] # caption
188
+ selected_verb = selected['objects'][obj_id][1] # verb
189
+
190
+ train_idx = next(
191
+ idx for idx, meta in enumerate(metas)
192
+ if meta['video'] == selected_vid_id
193
+ and meta['frame_id'] == selected_frame_id
194
+ and meta['obj_id'] == int(obj_id)
195
+ and meta['exp'] == selected_exp
196
+ )
197
+
198
+ train_frames, train_info = train_dataset[train_idx]
199
+
200
+ try:
201
+ valid_frame_loc = train_info['frames_idx'].tolist().index(selected_frame_id) # index of the valid frame
202
+ except ValueError:
203
+ print(f"selected vid id: {selected_vid_id}, metas['frame_id']: {metas[train_idx]['frame_id']}, selected frame id: {selected_frame_id}, train_info['frames_idx']: {train_info['frames_idx'].tolist()}")
204
+
205
+
206
+ frame = train_frames[valid_frame_loc] # the corresponding frame
207
+ frame = F.to_pil_image(frame)
208
+
209
+ image_file_name = f"{selected_vid_id}_{str(selected_frame_id).rjust(5, '0')}"
210
+
211
+ # Save the original frame
212
+ save_dir = Path("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_frames")
213
+ #save_dir.mkdir(exist_ok=True)
214
+ save_path = save_dir / f"{image_file_name}.png"
215
+ #frame.save(save_path)
216
+
217
+ # Category
218
+ label = train_info['labels'][valid_frame_loc].item() #category id
219
+ category_name = metas[train_idx]['category'] #category name
220
+
221
+ # Box info
222
+ box = train_info['boxes'][valid_frame_loc]
223
+
224
+ # Annotation tools ########################################################################
225
+ mask = train_info['masks'][valid_frame_loc]
226
+ # print(mask.shape)
227
+
228
+ # Only to check that the frame and mask match
229
+ # plt.imshow(frame.permute(1, 2, 0))
230
+ # mask_color = np.zeros((*mask.shape, 3), dtype = np.uint8)
231
+ # mask_color[mask == 1] = [255, 0, 0]
232
+ # plt.imshow(mask_color, alpha = 0.5)
233
+ # plt.show()
234
+
235
+
236
+ mask_image = prepare_mask_for_pil(mask)
237
+ sub_masks = create_sub_masks(mask_image)
238
+
239
+ for color, sub_mask in sub_masks.items():
240
+ # print(f"Color: {color}, Sub-mask size: {sub_mask.size}")
241
+ sub_mask_array = np.array(sub_mask, dtype=np.uint8)
242
+ annotation = create_sub_mask_annotation(sub_mask_array, image_id, anno_id, is_crowd = 0)
243
+ anno_id += 1
244
+ image_id += 1
245
+
246
+ # Add the file path
247
+ annotation['file_name'] = f"{image_file_name}.png"
248
+
249
+ # Remove unnecessary fields
250
+ annotation.pop('iscrowd', None)
251
+ annotation.pop('image_id', None)
252
+ annotation.pop('id', None)
253
+
254
+ valid = train_info['valid'][valid_frame_loc]
255
+ orig_size = train_info['orig_size']
256
+ size = train_info['size']
257
+ caption = metas[train_idx]['exp']
258
+
259
+ # Add filename, height, width
260
+ #annotation['file_name'] = save_path
261
+ annotation['height'] = orig_size[0].item()
262
+ annotation['width'] = orig_size[1].item()
263
+
264
+ # Add category id, name, and sentence dictionary
265
+ annotation['label'] = label
266
+ annotation['category_name'] = category_name
267
+ sentence_dict = {
268
+ "tokens" : caption.split(' '),
269
+ "raw" : caption,
270
+ "sent" : re.sub(r'[^A-Za-z0-9\s]+', '', caption.lower())
271
+ }
272
+ annotation['sentences'] = sentence_dict
273
+ ############################################################################################
274
+ # double check for segmentation annotation
275
+ # orig_img_np = draw_polygon_on_image(frame, annotation['segmentation'])
276
+ # plt.imshow(orig_img_np)
277
+ # plt.axis('off')
278
+ # plt.show()
279
+
280
+ # showRef(annotation, save_dir)
281
+ ############################################################################################
282
+
283
+ # Final write
284
+ f.write(json.dumps(annotation) + "\n")
285
+ f.flush()
286
+
287
+ if __name__ == '__main__':
288
+ create_dict_from_selected_images(selected_frames_df)
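The create_sub_mask_annotation function above converts a binary sub-mask into COCO-style polygon segmentations with skimage contours and shapely; a small sanity-check sketch on a synthetic blob (toy data; the 1-pixel padding subtraction of the real pipeline is omitted here):

import numpy as np
from skimage import measure
from shapely.geometry import Polygon, MultiPolygon

sub_mask = np.zeros((12, 12), dtype=np.uint8)
sub_mask[3:9, 4:10] = 1  # a 6x6 square blob surrounded by background

contours = measure.find_contours(sub_mask, 0.5, positive_orientation='low')
# flip each contour from (row, col) to (x, y) and simplify it
polygons = [Polygon(c[:, ::-1]).simplify(1.0, preserve_topology=False) for c in contours]
multi = MultiPolygon(polygons)
x, y, max_x, max_y = multi.bounds
print('bbox:', (x, y, max_x - x, max_y - y), 'area:', multi.area)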
.history/make_ref-ytvos/annotate_ref_ytvos_20250113111315.py ADDED
@@ -0,0 +1,288 @@
1
+ from datasets import build_dataset
2
+ import argparse
3
+ import opts
4
+
5
+ import sys
6
+ from pathlib import Path
7
+ from os import path as osp
8
+ import io
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ import regex as re
13
+ import json
14
+
15
+ import cv2
16
+ from PIL import Image
17
+ import torch
18
+ from torchvision.transforms import functional as F
19
+
20
+ from skimage import measure # (pip install scikit-image)
21
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
22
+
23
+ import matplotlib.pyplot as plt
24
+ from matplotlib.collections import PatchCollection
25
+ from matplotlib.patches import Rectangle
26
+
27
+ import ipywidgets as widgets
28
+ from IPython.display import display, clear_output
29
+
30
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
31
+ args = parser.parse_args()
32
+
33
+ #================== Load data ===================
34
+ # Entire dataset
35
+ train_dataset = build_dataset('ytvos', image_set = 'train', args = args)
36
+
37
+ # Metadata for the entire dataset
38
+ metas = train_dataset.metas
39
+
40
+ # Filtered frames
41
+ selected_frames_df = pd.read_json("selected_frames4.jsonl", lines = True)
42
+
43
+ #================== Mask creation functions ===================
44
+ def prepare_mask_for_pil(mask_tensor):
45
+ mask_array = mask_tensor.squeeze(0).cpu().numpy()
46
+ mask_array = (mask_array * 255).astype(np.uint8)
47
+ mask_image = Image.fromarray(mask_array)
48
+ return mask_image
49
+
50
+ def create_sub_masks(mask_image):
51
+ width, height = mask_image.size
52
+
53
+ sub_masks = {}
54
+ for x in range(width):
55
+ for y in range(height):
56
+ # Get the RGB values of the pixel
57
+ pixel = mask_image.getpixel((x, y))
58
+
59
+ # If the pixel is not black...
60
+ if pixel != 0 :
61
+ # Check to see if we've created a sub-mask...
62
+ pixel_str = str(pixel)
63
+ sub_mask = sub_masks.get(pixel_str)
64
+ if sub_mask is None:
65
+ # Create a sub-mask (one bit per pixel) and add to the dictionary
66
+ # Note: we add 1 pixel of padding in each direction
67
+ # because the contours module doesn't handle cases
68
+ # where pixels bleed to the edge of the image
69
+ sub_masks[pixel_str] = Image.new('1', (width+2, height+2))
70
+
71
+ # Set the pixel value to 1 (default is 0), accounting for padding
72
+ sub_masks[pixel_str].putpixel((x+1, y+1), 1)
73
+ return sub_masks
74
+
75
+ #================== Mask annotation function ===================
76
+ def create_sub_mask_annotation(sub_mask, image_id, annotation_id, is_crowd):
77
+ # Find contours (boundary lines) around each sub-mask
78
+ # Note: there could be multiple contours if the object
79
+ # is partially occluded. (E.g. an elephant behind a tree)
80
+ contours = measure.find_contours(sub_mask, 0.5, positive_orientation='low')
81
+
82
+ segmentations = []
83
+ polygons = []
84
+ for contour in contours:
85
+ # Flip from (row, col) representation to (x, y)
86
+ # and subtract the padding pixel
87
+ for i in range(len(contour)):
88
+ row, col = contour[i]
89
+ contour[i] = (col - 1, row - 1)
90
+
91
+ # Make a polygon and simplify it
92
+ poly = Polygon(contour)
93
+ poly = poly.simplify(1.0, preserve_topology=False)
94
+ polygons.append(poly)
95
+ segmentation = np.array(poly.exterior.coords).ravel().tolist()
96
+ segmentations.append(segmentation)
97
+
98
+ # Combine the polygons to calculate the bounding box and area
99
+ multi_poly = MultiPolygon(polygons)
100
+ x, y, max_x, max_y = multi_poly.bounds
101
+ width = max_x - x
102
+ height = max_y - y
103
+ bbox = (x, y, width, height)
104
+ area = multi_poly.area
105
+
106
+ annotation = {
107
+ 'segmentation': segmentations,
108
+ 'iscrowd': is_crowd,
109
+ 'image_id': image_id,
110
+ 'id': annotation_id,
111
+ 'bbox': bbox,
112
+ 'area': area
113
+ }
114
+ return annotation
115
+
116
+ #================== Visualization function ===================
117
+ # annotation dictionary as input
118
+ def showRef(annotation, image_dir, seg_box='seg'):
119
+ ax = plt.gca()
120
+ I = io.imread(osp.join(image_dir, annotation['file_name']))
121
+ ax.imshow(I)
122
+
123
+
124
+ for sid, sent in enumerate(annotation['sentences']):
125
+ print('%s. %s' % (sid + 1, sent))
126
+
127
+ if seg_box == 'seg':
128
+ polygons = []
129
+ color = []
130
+ c = (np.random.random((1, 3)) * 0.6 + 0.4).tolist()[0]
131
+
132
+ if type(annotation['segmentation'][0]) == list:
133
+ # polygon used for refcoco*
134
+ for seg in annotation['segmentation']:
135
+ poly = np.array(seg).reshape((int(len(seg) / 2), 2))
136
+ polygons.append(Polygon(poly))
137
+ color.append(c)
138
+
139
+ p = PatchCollection(polygons,
140
+ facecolors=(221/255, 160/255, 221/255), # light purple
141
+ linewidths=0,
142
+ alpha=0.4)
143
+ ax.add_collection(p)
144
+
145
+ p = PatchCollection(polygons,
146
+ facecolors='none',
147
+ edgecolors=color,
148
+ linewidths=2)
149
+ ax.add_collection(p)
150
+ # else:
151
+ # # mask used for refclef
152
+ # rle = annotation['segmentation']
153
+ # m = mask.decode(rle)
154
+ # img = np.ones((m.shape[0], m.shape[1], 3))
155
+ # color_mask = np.array([2.0, 166.0, 101.0]) / 255
156
+ # for i in range(3):
157
+ # img[:, :, i] = color_mask[i]
158
+ # ax.imshow(np.dstack((img, m * 0.5)))
159
+
160
+ # bounding box
161
+ elif seg_box == 'box':
162
+ bbox = annotation['bbox']
163
+ box_plot = Rectangle((bbox[0], bbox[1]),
164
+ bbox[2],
165
+ bbox[3],
166
+ fill=False,
167
+ edgecolor='green',
168
+ linewidth=3)
169
+ ax.add_patch(box_plot)
170
+
171
+ #================== Function that ties everything together ===================
172
+ def create_dict_from_selected_images(selected_frames_df):
173
+
174
+ image_id = 0
175
+ anno_id = 0
176
+ train_idx = 0
177
+
178
+ with open("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_instances2.jsonl", "w") as f:
179
+
180
+ for selected_idx in range(len(selected_frames_df)):
181
+ selected = selected_frames_df.loc[selected_idx]
182
+ selected_vid_id = selected['video']
183
+ selected_frame_id = selected['frame_id']
184
+
185
+ for obj_id in selected['objects'].keys():
186
+
187
+ selected_exp = selected['objects'][obj_id][0] # caption
188
+ selected_verb = selected['objects'][obj_id][1] # verb
189
+
190
+ train_idx = next(
191
+ idx for idx, meta in enumerate(metas)
192
+ if meta['video'] == selected_vid_id
193
+ and meta['frame_id'] == selected_frame_id
194
+ and meta['obj_id'] == int(obj_id)
195
+ and meta['exp'] == selected_exp
196
+ )
197
+
198
+ train_frames, train_info = train_dataset[train_idx]
199
+
200
+ try:
201
+ valid_frame_loc = train_info['frames_idx'].tolist().index(selected_frame_id) # index of the valid frame
202
+ except ValueError:
203
+ print(f"selected vid id: {selected_vid_id}, metas['frame_id']: {metas[train_idx]['frame_id']}, selected frame id: {selected_frame_id}, train_info['frames_idx']: {train_info['frames_idx'].tolist()}")
204
+
205
+
206
+ frame = train_frames[valid_frame_loc] # the corresponding frame
207
+ frame = F.to_pil_image(frame)
208
+
209
+ image_file_name = f"{selected_vid_id}_{str(selected_frame_id).rjust(5, '0')}"
210
+
211
+ # Save the original frame
212
+ save_dir = Path("/home/yejin/data/data/dataset/VRIS/mbench/ytvos/selected_frames")
213
+ #save_dir.mkdir(exist_ok=True)
214
+ save_path = save_dir / f"{image_file_name}.png"
215
+ #frame.save(save_path)
216
+
217
+ # Category
218
+ label = train_info['labels'][valid_frame_loc].item() #category id
219
+ category_name = metas[train_idx]['category'] #category name
220
+
221
+ # Box info
222
+ box = train_info['boxes'][valid_frame_loc]
223
+
224
+ # Annotation tools ########################################################################
225
+ mask = train_info['masks'][valid_frame_loc]
226
+ # print(mask.shape)
227
+
228
+ # Only to check that the frame and mask match
229
+ # plt.imshow(frame.permute(1, 2, 0))
230
+ # mask_color = np.zeros((*mask.shape, 3), dtype = np.uint8)
231
+ # mask_color[mask == 1] = [255, 0, 0]
232
+ # plt.imshow(mask_color, alpha = 0.5)
233
+ # plt.show()
234
+
235
+
236
+ mask_image = prepare_mask_for_pil(mask)
237
+ sub_masks = create_sub_masks(mask_image)
238
+
239
+ for color, sub_mask in sub_masks.items():
240
+ # print(f"Color: {color}, Sub-mask size: {sub_mask.size}")
241
+ sub_mask_array = np.array(sub_mask, dtype=np.uint8)
242
+ annotation = create_sub_mask_annotation(sub_mask_array, image_id, anno_id, is_crowd = 0)
243
+ anno_id += 1
244
+ image_id += 1
245
+
246
+ # Add the file path
247
+ annotation['file_name'] = f"{image_file_name}.png"
248
+
249
+ # Remove unnecessary fields
250
+ annotation.pop('iscrowd', None)
251
+ annotation.pop('image_id', None)
252
+ annotation.pop('id', None)
253
+
254
+ valid = train_info['valid'][valid_frame_loc]
255
+ orig_size = train_info['orig_size']
256
+ size = train_info['size']
257
+ caption = metas[train_idx]['exp']
258
+
259
+ # Add filename, height, width
260
+ #annotation['file_name'] = save_path
261
+ annotation['height'] = orig_size[0].item()
262
+ annotation['width'] = orig_size[1].item()
263
+
264
+ # Add category id, name, and sentence dictionary
265
+ annotation['label'] = label
266
+ annotation['category_name'] = category_name
267
+ sentence_dict = {
268
+ "tokens" : caption.split(' '),
269
+ "raw" : caption,
270
+ "sent" : re.sub(r'[^A-Za-z0-9\s]+', '', caption.lower())
271
+ }
272
+ annotation['sentences'] = sentence_dict
273
+ ############################################################################################
274
+ # double check for segmentation annotation
275
+ # orig_img_np = draw_polygon_on_image(frame, annotation['segmentation'])
276
+ # plt.imshow(orig_img_np)
277
+ # plt.axis('off')
278
+ # plt.show()
279
+
280
+ # showRef(annotation, save_dir)
281
+ ############################################################################################
282
+
283
+ # Final write
284
+ f.write(json.dumps(annotation) + "\n")
285
+ f.flush()
286
+
287
+ # if __name__ == '__main__':
288
+ # create_dict_from_selected_images(selected_frames_df)
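The sentences entry written by both annotate scripts keeps the raw caption, its whitespace tokens, and a lower-cased copy stripped of punctuation; a tiny illustration of that normalization on a made-up caption (stdlib re behaves the same as the regex package here):

import re

caption = "A man, riding a bike!"
sentence_dict = {
    "tokens": caption.split(' '),
    "raw": caption,
    "sent": re.sub(r'[^A-Za-z0-9\s]+', '', caption.lower()),
}
print(sentence_dict["sent"])  # a man riding a bike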
davis2017/utils.py ADDED
@@ -0,0 +1,174 @@
1
+ import os
2
+ import errno
3
+ import numpy as np
4
+ from PIL import Image
5
+ import warnings
6
+ from davis2017.davis import DAVIS
7
+
8
+
9
+ def _pascal_color_map(N=256, normalized=False):
10
+ """
11
+ Python implementation of the color map function for the PASCAL VOC data set.
12
+ Official Matlab version can be found in the PASCAL VOC devkit
13
+ http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html#devkit
14
+ """
15
+
16
+ def bitget(byteval, idx):
17
+ return (byteval & (1 << idx)) != 0
18
+
19
+ dtype = 'float32' if normalized else 'uint8'
20
+ cmap = np.zeros((N, 3), dtype=dtype)
21
+ for i in range(N):
22
+ r = g = b = 0
23
+ c = i
24
+ for j in range(8):
25
+ r = r | (bitget(c, 0) << 7 - j)
26
+ g = g | (bitget(c, 1) << 7 - j)
27
+ b = b | (bitget(c, 2) << 7 - j)
28
+ c = c >> 3
29
+
30
+ cmap[i] = np.array([r, g, b])
31
+
32
+ cmap = cmap / 255 if normalized else cmap
33
+ return cmap
34
+
35
+
36
+ def overlay_semantic_mask(im, ann, alpha=0.5, colors=None, contour_thickness=None):
37
+ im, ann = np.asarray(im, dtype=np.uint8), np.asarray(ann, dtype=int)  # np.int was removed in NumPy 1.24+
38
+ if im.shape[:-1] != ann.shape:
39
+ raise ValueError('First two dimensions of `im` and `ann` must match')
40
+ if im.shape[-1] != 3:
41
+ raise ValueError('im must have three channels at the 3rd dimension')
42
+
43
+ colors = colors or _pascal_color_map()
44
+ colors = np.asarray(colors, dtype=np.uint8)
45
+
46
+ mask = colors[ann]
47
+ fg = im * alpha + (1 - alpha) * mask
48
+
49
+ img = im.copy()
50
+ img[ann > 0] = fg[ann > 0]
51
+
52
+ if contour_thickness: # pragma: no cover
53
+ import cv2
54
+ for obj_id in np.unique(ann[ann > 0]):
55
+ contours = cv2.findContours((ann == obj_id).astype(
56
+ np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2:]
57
+ cv2.drawContours(img, contours[0], -1, colors[obj_id].tolist(),
58
+ contour_thickness)
59
+ return img
60
+
61
+
62
+ def generate_obj_proposals(davis_root, subset, num_proposals, save_path):
63
+ dataset = DAVIS(davis_root, subset=subset, codalab=True)
64
+ for seq in dataset.get_sequences():
65
+ save_dir = os.path.join(save_path, seq)
66
+ if os.path.exists(save_dir):
67
+ continue
68
+ all_gt_masks, all_masks_id = dataset.get_all_masks(seq, True)
69
+ img_size = all_gt_masks.shape[2:]
70
+ num_rows = int(np.ceil(np.sqrt(num_proposals)))
71
+ proposals = np.zeros((num_proposals, len(all_masks_id), *img_size))
72
+ height_slices = np.floor(np.arange(0, img_size[0] + 1, img_size[0]/num_rows)).astype(np.uint).tolist()
73
+ width_slices = np.floor(np.arange(0, img_size[1] + 1, img_size[1]/num_rows)).astype(np.uint).tolist()
74
+ ii = 0
75
+ prev_h, prev_w = 0, 0
76
+ for h in height_slices[1:]:
77
+ for w in width_slices[1:]:
78
+ proposals[ii, :, prev_h:h, prev_w:w] = 1
79
+ prev_w = w
80
+ ii += 1
81
+ if ii == num_proposals:
82
+ break
83
+ prev_h, prev_w = h, 0
84
+ if ii == num_proposals:
85
+ break
86
+
87
+ os.makedirs(save_dir, exist_ok=True)
88
+ for i, mask_id in enumerate(all_masks_id):
89
+ mask = np.sum(proposals[:, i, ...] * np.arange(1, proposals.shape[0] + 1)[:, None, None], axis=0)
90
+ save_mask(mask, os.path.join(save_dir, f'{mask_id}.png'))
91
+
92
+
93
+ def generate_random_permutation_gt_obj_proposals(davis_root, subset, save_path):
94
+ dataset = DAVIS(davis_root, subset=subset, codalab=True)
95
+ for seq in dataset.get_sequences():
96
+ gt_masks, all_masks_id = dataset.get_all_masks(seq, True)
97
+ obj_swap = np.random.permutation(np.arange(gt_masks.shape[0]))
98
+ gt_masks = gt_masks[obj_swap, ...]
99
+ save_dir = os.path.join(save_path, seq)
100
+ os.makedirs(save_dir, exist_ok=True)
101
+ for i, mask_id in enumerate(all_masks_id):
102
+ mask = np.sum(gt_masks[:, i, ...] * np.arange(1, gt_masks.shape[0] + 1)[:, None, None], axis=0)
103
+ save_mask(mask, os.path.join(save_dir, f'{mask_id}.png'))
104
+
105
+
106
+ def color_map(N=256, normalized=False):
107
+ def bitget(byteval, idx):
108
+ return ((byteval & (1 << idx)) != 0)
109
+
110
+ dtype = 'float32' if normalized else 'uint8'
111
+ cmap = np.zeros((N, 3), dtype=dtype)
112
+ for i in range(N):
113
+ r = g = b = 0
114
+ c = i
115
+ for j in range(8):
116
+ r = r | (bitget(c, 0) << 7-j)
117
+ g = g | (bitget(c, 1) << 7-j)
118
+ b = b | (bitget(c, 2) << 7-j)
119
+ c = c >> 3
120
+
121
+ cmap[i] = np.array([r, g, b])
122
+
123
+ cmap = cmap/255 if normalized else cmap
124
+ return cmap
125
+
126
+
127
+ def save_mask(mask, img_path):
128
+ if np.max(mask) > 255:
129
+ raise ValueError('Maximum id pixel value is 255')
130
+ mask_img = Image.fromarray(mask.astype(np.uint8))
131
+ mask_img.putpalette(color_map().flatten().tolist())
132
+ mask_img.save(img_path)
133
+
134
+
135
+ def db_statistics(per_frame_values):
136
+ """ Compute mean,recall and decay from per-frame evaluation.
137
+ Arguments:
138
+ per_frame_values (ndarray): per-frame evaluation
139
+
140
+ Returns:
141
+ M,O,D (float,float,float):
142
+ return evaluation statistics: mean,recall,decay.
143
+ """
144
+
145
+ # strip off nan values
146
+ with warnings.catch_warnings():
147
+ warnings.simplefilter("ignore", category=RuntimeWarning)
148
+ M = np.nanmean(per_frame_values)
149
+ O = np.nanmean(per_frame_values > 0.5)
150
+
151
+ N_bins = 4
152
+ ids = np.round(np.linspace(1, len(per_frame_values), N_bins + 1) + 1e-10) - 1
153
+ ids = ids.astype(np.uint8)
154
+
155
+ D_bins = [per_frame_values[ids[i]:ids[i + 1] + 1] for i in range(0, 4)]
156
+
157
+ with warnings.catch_warnings():
158
+ warnings.simplefilter("ignore", category=RuntimeWarning)
159
+ D = np.nanmean(D_bins[0]) - np.nanmean(D_bins[3])
160
+
161
+ return M, O, D
162
+
163
+
164
+ def list_files(dir, extension=".png"):
165
+ return [os.path.splitext(file_)[0] for file_ in os.listdir(dir) if file_.endswith(extension)]
166
+
167
+
168
+ def force_symlink(file1, file2):
169
+ try:
170
+ os.symlink(file1, file2)
171
+ except OSError as e:
172
+ if e.errno == errno.EEXIST:
173
+ os.remove(file2)
174
+ os.symlink(file1, file2)
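A small usage sketch of db_statistics above on a synthetic per-frame J curve (toy numbers; assumes the davis2017 package defined here is importable from the working directory):

import numpy as np
from davis2017.utils import db_statistics

per_frame_j = np.linspace(0.9, 0.4, num=20)  # quality decaying over 20 frames
M, O, D = db_statistics(per_frame_j)         # mean, recall (fraction > 0.5), decay
print(f"mean={M:.3f} recall={O:.3f} decay={D:.3f}")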
inference_davis.py ADDED
@@ -0,0 +1,330 @@
1
+ '''
2
+ Inference code for ReferFormer, on Ref-DAVIS 2017
3
+ Modified from DETR (https://github.com/facebookresearch/detr)
4
+ Ref-DAVIS17 does not support visualization
5
+ '''
6
+ import argparse
7
+ import json
8
+ import random
9
+ import time
10
+ from pathlib import Path
11
+
12
+ import numpy as np
13
+ import torch
14
+
15
+
16
+ import util.misc as utils
17
+ from models import build_model
18
+ import torchvision.transforms as T
19
+ import matplotlib.pyplot as plt
20
+ import os
21
+ import cv2
22
+ from PIL import Image, ImageDraw
23
+ import math
24
+ import torch.nn.functional as F
25
+ import json
26
+
27
+
28
+ import opts
29
+ from tqdm import tqdm
30
+
31
+ import multiprocessing as mp
32
+ import threading
33
+
34
+ from tools.colormap import colormap
35
+
36
+
37
+ # colormap
38
+ color_list = colormap()
39
+ color_list = color_list.astype('uint8').tolist()
40
+
41
+ # build transform
42
+ transform = T.Compose([
43
+ T.Resize(360),
44
+ T.ToTensor(),
45
+ T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
46
+ ])
47
+
48
+
49
+ def main(args):
50
+ args.dataset_file = "davis"
51
+ args.masks = True
52
+ args.batch_size = 1
53
+ print("Inference only supports batch size = 1")
54
+ print(args)
55
+
56
+ # fix the seed for reproducibility
57
+ seed = args.seed + utils.get_rank()
58
+ torch.manual_seed(seed)
59
+ np.random.seed(seed)
60
+ random.seed(seed)
61
+
62
+ split = args.split
63
+ # save path
64
+ output_dir = args.output_dir
65
+ save_path_prefix = os.path.join(output_dir, split)
66
+ if not os.path.exists(save_path_prefix):
67
+ os.makedirs(save_path_prefix)
68
+
69
+ save_visualize_path_prefix = os.path.join(output_dir, split + '_images')
70
+ if args.visualize:
71
+ if not os.path.exists(save_visualize_path_prefix):
72
+ os.makedirs(save_visualize_path_prefix)
73
+
74
+ # load data
75
+ root = Path(args.davis_path) # data/ref-davis
76
+ img_folder = os.path.join(root, split, "JPEGImages")
77
+ meta_file = os.path.join(root, "meta_expressions", split, "meta_expressions.json")
78
+ with open(meta_file, "r") as f:
79
+ data = json.load(f)["videos"]
80
+ video_list = list(data.keys())
81
+
82
+ # create subprocess
83
+ thread_num = args.ngpu
84
+ global result_dict
85
+ result_dict = mp.Manager().dict()
86
+
87
+ processes = []
88
+ lock = threading.Lock()
89
+
90
+ video_num = len(video_list)
91
+ per_thread_video_num = math.ceil(float(video_num) / float(thread_num))
92
+
93
+ start_time = time.time()
94
+ print('Start inference')
95
+ for i in range(thread_num):
96
+ if i == thread_num - 1:
97
+ sub_video_list = video_list[i * per_thread_video_num:]
98
+ else:
99
+ sub_video_list = video_list[i * per_thread_video_num: (i + 1) * per_thread_video_num]
100
+ p = mp.Process(target=sub_processor, args=(lock, i, args, data,
101
+ save_path_prefix, save_visualize_path_prefix,
102
+ img_folder, sub_video_list))
103
+ p.start()
104
+ processes.append(p)
105
+
106
+ for p in processes:
107
+ p.join()
108
+
109
+ end_time = time.time()
110
+ total_time = end_time - start_time
111
+
112
+ result_dict = dict(result_dict)
113
+ num_all_frames_gpus = 0
114
+ for pid, num_all_frames in result_dict.items():
115
+ num_all_frames_gpus += num_all_frames
116
+
117
+ print("Total inference time: %.4f s" %(total_time))
118
+
119
+
120
+ def sub_processor(lock, pid, args, data, save_path_prefix, save_visualize_path_prefix, img_folder, video_list):
121
+ text = 'processor %d' % pid
122
+ with lock:
123
+ progress = tqdm(
124
+ total=len(video_list),
125
+ position=pid,
126
+ desc=text,
127
+ ncols=0
128
+ )
129
+ torch.cuda.set_device(pid)
130
+
131
+ # model
132
+ model, criterion, _ = build_model(args)
133
+ device = args.device
134
+ model.to(device)
135
+
136
+ model_without_ddp = model
137
+ n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
138
+
139
+ if pid == 0:
140
+ print('number of params:', n_parameters)
141
+
142
+ if args.resume:
143
+ checkpoint = torch.load(args.resume, map_location='cpu')
144
+ missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
145
+ unexpected_keys = [k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops'))]
146
+ if len(missing_keys) > 0:
147
+ print('Missing Keys: {}'.format(missing_keys))
148
+ if len(unexpected_keys) > 0:
149
+ print('Unexpected Keys: {}'.format(unexpected_keys))
150
+ else:
151
+ raise ValueError('Please specify the checkpoint for inference.')
152
+
153
+ # get palette
154
+ palette_img = os.path.join(args.davis_path, "valid/Annotations/blackswan/00000.png")
155
+ palette = Image.open(palette_img).getpalette()
156
+
157
+ # start inference
158
+ num_all_frames = 0
159
+ model.eval()
160
+
161
+ # 1. for each video
162
+ for video in video_list:
163
+ metas = []
164
+
165
+ expressions = data[video]["expressions"]
166
+ expression_list = list(expressions.keys())
167
+ num_expressions = len(expression_list)
168
+ video_len = len(data[video]["frames"])
169
+
170
+ # read all the anno meta
171
+ for i in range(num_expressions):
172
+ meta = {}
173
+ meta["video"] = video
174
+ meta["exp"] = expressions[expression_list[i]]["exp"]
175
+ meta["exp_id"] = expression_list[i] # start from 0
176
+ meta["frames"] = data[video]["frames"]
177
+ metas.append(meta)
178
+ meta = metas
179
+
180
+ # since there are 4 annotations
181
+ num_obj = num_expressions // 4
182
+
183
+ # 2. for each annotator
184
+ for anno_id in range(4): # 4 annotators
185
+ anno_logits = []
186
+ anno_masks = [] # [num_obj+1, video_len, h, w], +1 for background
187
+
188
+ for obj_id in range(num_obj):
189
+ i = obj_id * 4 + anno_id
190
+ video_name = meta[i]["video"]
191
+ exp = meta[i]["exp"]
192
+ exp_id = meta[i]["exp_id"]
193
+ frames = meta[i]["frames"]
194
+
195
+ video_len = len(frames)
196
+ # NOTE: the im2col_step for MSDeformAttention is set as 64
197
+ # so the max length for a clip is 64
198
+ # store the video pred results
199
+ all_pred_logits = []
200
+ all_pred_masks = []
201
+
202
+ # 3. for each clip
203
+ for clip_id in range(0, video_len, 36):
204
+ frames_ids = [x for x in range(video_len)]
205
+ clip_frames_ids = frames_ids[clip_id : clip_id + 36]
206
+ clip_len = len(clip_frames_ids)
207
+
208
+ # load the clip images
209
+ imgs = []
210
+ for t in clip_frames_ids:
211
+ frame = frames[t]
212
+ img_path = os.path.join(img_folder, video_name, frame + ".jpg")
213
+ img = Image.open(img_path).convert('RGB')
214
+ origin_w, origin_h = img.size
215
+ imgs.append(transform(img)) # list[Img]
216
+
217
+ imgs = torch.stack(imgs, dim=0).to(args.device) # [video_len, 3, H, W]
218
+ img_h, img_w = imgs.shape[-2:]
219
+ size = torch.as_tensor([int(img_h), int(img_w)]).to(args.device)
220
+ target = {"size": size}
221
+
222
+ with torch.no_grad():
223
+ outputs = model([imgs], [exp], [target])
224
+
225
+ pred_logits = outputs["pred_logits"][0] # [t, q, k]
226
+ pred_masks = outputs["pred_masks"][0] # [t, q, h, w]
227
+
228
+ # according to pred_logits, select the query index
229
+ pred_scores = pred_logits.sigmoid() # [t, q, k]
230
+ pred_scores = pred_scores.mean(0) # [q, K]
231
+ max_scores, _ = pred_scores.max(-1) # [q,]
232
+ _, max_ind = max_scores.max(-1) # [1,]
233
+ max_inds = max_ind.repeat(clip_len)
234
+ pred_masks = pred_masks[range(clip_len), max_inds, ...] # [t, h, w]
235
+ pred_masks = pred_masks.unsqueeze(0)
236
+
237
+ pred_masks = F.interpolate(pred_masks, size=(origin_h, origin_w), mode='bilinear', align_corners=False)
238
+ pred_masks = pred_masks.sigmoid()[0] # [t, h, w], NOTE: here mask is score
239
+
240
+ # store the clip results
241
+ pred_logits = pred_logits[range(clip_len), max_inds] # [t, k]
242
+ all_pred_logits.append(pred_logits)
243
+ all_pred_masks.append(pred_masks)
244
+
245
+ all_pred_logits = torch.cat(all_pred_logits, dim=0) # (video_len, K)
246
+ all_pred_masks = torch.cat(all_pred_masks, dim=0) # (video_len, h, w)
247
+ anno_logits.append(all_pred_logits)
248
+ anno_masks.append(all_pred_masks)
249
+
250
+ # handle a complete image (all objects of an annotator)
251
+ anno_logits = torch.stack(anno_logits) # [num_obj, video_len, k]
252
+ anno_masks = torch.stack(anno_masks) # [num_obj, video_len, h, w]
253
+ t, h, w = anno_masks.shape[-3:]
254
+ anno_masks[anno_masks < 0.5] = 0.0
255
+ background = 0.1 * torch.ones(1, t, h, w).to(args.device)
256
+ anno_masks = torch.cat([background, anno_masks], dim=0) # [num_obj+1, video_len, h, w]
257
+ out_masks = torch.argmax(anno_masks, dim=0) # int, the value indicate which object, [video_len, h, w]
258
+
259
+ out_masks = out_masks.detach().cpu().numpy().astype(np.uint8) # [video_len, h, w]
260
+
261
+ # save results
262
+ anno_save_path = os.path.join(save_path_prefix, f"anno_{anno_id}", video)
263
+ if not os.path.exists(anno_save_path):
264
+ os.makedirs(anno_save_path)
265
+ for f in range(out_masks.shape[0]):
266
+ img_E = Image.fromarray(out_masks[f])
267
+ img_E.putpalette(palette)
268
+ img_E.save(os.path.join(anno_save_path, '{:05d}.png'.format(f)))
269
+
270
+
271
+ with lock:
272
+ progress.update(1)
273
+ result_dict[str(pid)] = num_all_frames
274
+ with lock:
275
+ progress.close()
276
+
277
+
278
+
279
+ # Post-process functions
280
+ def box_cxcywh_to_xyxy(x):
281
+ x_c, y_c, w, h = x.unbind(1)
282
+ b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
283
+ (x_c + 0.5 * w), (y_c + 0.5 * h)]
284
+ return torch.stack(b, dim=1)
285
+
286
+ def rescale_bboxes(out_bbox, size):
287
+ img_w, img_h = size
288
+ b = box_cxcywh_to_xyxy(out_bbox)
289
+ b = b.cpu() * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
290
+ return b
291
+
292
+
293
+ # Visualization functions
294
+ def draw_reference_points(draw, reference_points, img_size, color):
295
+ W, H = img_size
296
+ for i, ref_point in enumerate(reference_points):
297
+ init_x, init_y = ref_point
298
+ x, y = W * init_x, H * init_y
299
+ cur_color = color
300
+ draw.line((x-10, y, x+10, y), tuple(cur_color), width=4)
301
+ draw.line((x, y-10, x, y+10), tuple(cur_color), width=4)
302
+
303
+ def draw_sample_points(draw, sample_points, img_size, color_list):
304
+ alpha = 255
305
+ for i, samples in enumerate(sample_points):
306
+ for sample in samples:
307
+ x, y = sample
308
+ cur_color = color_list[i % len(color_list)][::-1]
309
+ cur_color += [alpha]
310
+ draw.ellipse((x-2, y-2, x+2, y+2),
311
+ fill=tuple(cur_color), outline=tuple(cur_color), width=1)
312
+
313
+ def vis_add_mask(img, mask, color):
314
+ origin_img = np.asarray(img.convert('RGB')).copy()
315
+ color = np.array(color)
316
+
317
+ mask = mask.reshape(mask.shape[0], mask.shape[1]).astype('uint8') # np
318
+ mask = mask > 0.5
319
+
320
+ origin_img[mask] = origin_img[mask] * 0.5 + color * 0.5
321
+ origin_img = Image.fromarray(origin_img)
322
+ return origin_img
323
+
324
+
325
+
326
+ if __name__ == '__main__':
327
+ parser = argparse.ArgumentParser('ReferFormer inference script', parents=[opts.get_args_parser()])
328
+ args = parser.parse_args()
329
+ main(args)
330
+
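The per-annotator merging step in sub_processor above thresholds each object's score map at 0.5, prepends a constant 0.1 background channel, and takes an argmax over objects to get a single index map; a toy sketch of just that step (random scores stand in for model outputs):

import torch

num_obj, t, h, w = 3, 2, 4, 4
obj_scores = torch.rand(num_obj, t, h, w)   # per-object mask scores in [0, 1]
obj_scores[obj_scores < 0.5] = 0.0          # suppress low-confidence pixels
background = 0.1 * torch.ones(1, t, h, w)   # background wins wherever every object was suppressed
merged = torch.argmax(torch.cat([background, obj_scores], dim=0), dim=0)
print(merged.shape, merged.unique())        # torch.Size([2, 4, 4]); values in 0..num_obj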
main.py ADDED
@@ -0,0 +1,243 @@
1
+ """
2
+ Training script of ReferFormer
3
+ Modified from DETR (https://github.com/facebookresearch/detr)
4
+ """
5
+ import argparse
6
+ import datetime
7
+ import json
8
+ import random
9
+ import time
10
+ from pathlib import Path
11
+
12
+ import numpy as np
13
+ import torch
14
+ from torch.utils.data import DataLoader, DistributedSampler
15
+
16
+ import util.misc as utils
17
+ import datasets.samplers as samplers
18
+ from datasets import build_dataset, get_coco_api_from_dataset
19
+ from engine import train_one_epoch, evaluate, evaluate_a2d
20
+ from models import build_model
21
+
22
+ from tools.load_pretrained_weights import pre_trained_model_to_finetune
23
+
24
+ import opts
25
+
26
+
27
+
28
+ def main(args):
29
+ args.masks = True
30
+
31
+ utils.init_distributed_mode(args)
32
+ print("git:\n {}\n".format(utils.get_sha()))
33
+ print(args)
34
+
35
+ print(f'\n Run on {args.dataset_file} dataset.')
36
+ print('\n')
37
+
38
+ device = torch.device(args.device)
39
+
40
+ # fix the seed for reproducibility
41
+ seed = args.seed + utils.get_rank()
42
+ torch.manual_seed(seed)
43
+ np.random.seed(seed)
44
+ random.seed(seed)
45
+
46
+ model, criterion, postprocessor = build_model(args)
47
+ model.to(device)
48
+
49
+ model_without_ddp = model
50
+ if args.distributed:
51
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
52
+ model_without_ddp = model.module
53
+
54
+ # for n, p in model_without_ddp.named_parameters():
55
+ # print(n)
56
+
57
+ n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
58
+ print('number of params:', n_parameters)
59
+
60
+ def match_name_keywords(n, name_keywords):
61
+ out = False
62
+ for b in name_keywords:
63
+ if b in n:
64
+ out = True
65
+ break
66
+ return out
67
+
68
+
69
+ param_dicts = [
70
+ {
71
+ "params":
72
+ [p for n, p in model_without_ddp.named_parameters()
73
+ if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_text_encoder_names)
74
+ and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
75
+ "lr": args.lr,
76
+ },
77
+ {
78
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
79
+ "lr": args.lr_backbone,
80
+ },
81
+ {
82
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_text_encoder_names) and p.requires_grad],
83
+ "lr": args.lr_text_encoder,
84
+ },
85
+ {
86
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
87
+ "lr": args.lr * args.lr_linear_proj_mult,
88
+ }
89
+ ]
90
+ optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
91
+ weight_decay=args.weight_decay)
92
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_drop)
93
+
94
+ # no validation ground truth for ytvos dataset
95
+ dataset_train = build_dataset(args.dataset_file, image_set='train', args=args)
96
+
97
+ if args.distributed:
98
+ if args.cache_mode:
99
+ sampler_train = samplers.NodeDistributedSampler(dataset_train)
100
+ else:
101
+ sampler_train = samplers.DistributedSampler(dataset_train)
102
+ else:
103
+ sampler_train = torch.utils.data.RandomSampler(dataset_train)
104
+
105
+ batch_sampler_train = torch.utils.data.BatchSampler(
106
+ sampler_train, args.batch_size, drop_last=True)
107
+
108
+ data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
109
+ collate_fn=utils.collate_fn, num_workers=args.num_workers)
110
+
111
+ # A2D-Sentences
112
+ if args.dataset_file == 'a2d' or args.dataset_file == 'jhmdb':
113
+ dataset_val = build_dataset(args.dataset_file, image_set='val', args=args)
114
+ if args.distributed:
115
+ if args.cache_mode:
116
+ sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
117
+ else:
118
+ sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
119
+ else:
120
+ sampler_val = torch.utils.data.SequentialSampler(dataset_val)
121
+ data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
122
+ drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers,
123
+ pin_memory=True)
124
+
125
+
126
+ if args.dataset_file == "davis":
127
+ assert args.pretrained_weights is not None, "Please provide the pretrained weight to finetune for Ref-DAVIS17"
128
+ print("============================================>")
129
+ print("Ref-DAVIS17 are finetuned using the checkpoint trained on Ref-Youtube-VOS")
130
+ print("Load checkpoint weights from {} ...".format(args.pretrained_weights))
131
+ checkpoint = torch.load(args.pretrained_weights, map_location="cpu")
132
+ checkpoint_dict = pre_trained_model_to_finetune(checkpoint, args)
133
+ model_without_ddp.load_state_dict(checkpoint_dict, strict=False)
134
+ print("============================================>")
135
+
136
+ if args.dataset_file == "jhmdb":
137
+ assert args.resume is not None, "Please provide the checkpoint to resume for JHMDB-Sentences"
138
+ print("============================================>")
139
+ print("JHMDB-Sentences are directly evaluated using the checkpoint trained on A2D-Sentences")
140
+ print("Load checkpoint weights from {} ...".format(args.pretrained_weights))
141
+ # load checkpoint in the args.resume
142
+ print("============================================>")
143
+
144
+ # for Ref-Youtube-VOS and A2D-Sentences
145
+ # finetune using the pretrained weights on Ref-COCO
146
+ if args.dataset_file != "davis" and args.dataset_file != "jhmdb" and args.pretrained_weights is not None:
147
+ print("============================================>")
148
+ print("Load pretrained weights from {} ...".format(args.pretrained_weights))
149
+ checkpoint = torch.load(args.pretrained_weights, map_location="cpu")
150
+ checkpoint_dict = pre_trained_model_to_finetune(checkpoint, args)
151
+ model_without_ddp.load_state_dict(checkpoint_dict, strict=False)
152
+ print("============================================>")
153
+
154
+
155
+ output_dir = Path(args.output_dir)
156
+ if args.resume:
157
+ if args.resume.startswith('https'):
158
+ checkpoint = torch.hub.load_state_dict_from_url(
159
+ args.resume, map_location='cpu', check_hash=True)
160
+ else:
161
+ checkpoint = torch.load(args.resume, map_location='cpu')
162
+ missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
163
+ unexpected_keys = [k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops'))]
164
+ if len(missing_keys) > 0:
165
+ print('Missing Keys: {}'.format(missing_keys))
166
+ if len(unexpected_keys) > 0:
167
+ print('Unexpected Keys: {}'.format(unexpected_keys))
168
+ if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
169
+ import copy
170
+ p_groups = copy.deepcopy(optimizer.param_groups)
171
+ optimizer.load_state_dict(checkpoint['optimizer'])
172
+ for pg, pg_old in zip(optimizer.param_groups, p_groups):
173
+ pg['lr'] = pg_old['lr']
174
+ pg['initial_lr'] = pg_old['initial_lr']
175
+ print(optimizer.param_groups)
176
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
177
+ # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
178
+ args.override_resumed_lr_drop = True
179
+ if args.override_resumed_lr_drop:
180
+ print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
181
+ lr_scheduler.step_size = args.lr_drop
182
+ lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
183
+ lr_scheduler.step(lr_scheduler.last_epoch)
184
+ args.start_epoch = checkpoint['epoch'] + 1
185
+
186
+ if args.eval:
187
+ assert args.dataset_file == 'a2d' or args.dataset_file == 'jhmdb', \
188
+ 'Only A2D-Sentences and JHMDB-Sentences datasets support evaluation'
189
+ test_stats = evaluate_a2d(model, data_loader_val, postprocessor, device, args)
190
+ return
191
+
192
+
193
+ print("Start training")
194
+ start_time = time.time()
195
+ for epoch in range(args.start_epoch, args.epochs):
196
+ if args.distributed:
197
+ sampler_train.set_epoch(epoch)
198
+ train_stats = train_one_epoch(
199
+ model, criterion, data_loader_train, optimizer, device, epoch,
200
+ args.clip_max_norm)
201
+ lr_scheduler.step()
202
+ if args.output_dir:
203
+ checkpoint_paths = [output_dir / 'checkpoint.pth']
204
+ # extra checkpoint before LR drop and every epochs
205
+ # if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 1 == 0:
206
+ if (epoch + 1) % 1 == 0:
207
+ checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
208
+ for checkpoint_path in checkpoint_paths:
209
+ utils.save_on_master({
210
+ 'model': model_without_ddp.state_dict(),
211
+ 'optimizer': optimizer.state_dict(),
212
+ 'lr_scheduler': lr_scheduler.state_dict(),
213
+ 'epoch': epoch,
214
+ 'args': args,
215
+ }, checkpoint_path)
216
+
217
+ log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
218
+ 'epoch': epoch,
219
+ 'n_parameters': n_parameters}
220
+
221
+ if args.dataset_file == 'a2d':
222
+ test_stats = evaluate_a2d(model, data_loader_val, postprocessor, device, args)
223
+ log_stats.update({**{f'{k}': v for k, v in test_stats.items()}})
224
+
225
+ if args.output_dir and utils.is_main_process():
226
+ with (output_dir / "log.txt").open("a") as f:
227
+ f.write(json.dumps(log_stats) + "\n")
228
+
229
+
230
+ total_time = time.time() - start_time
231
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
232
+ print('Training time {}'.format(total_time_str))
233
+
234
+
235
+ if __name__ == '__main__':
236
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
237
+ args = parser.parse_args()
238
+ if args.output_dir:
239
+ Path(args.output_dir).mkdir(parents=True, exist_ok=True)
240
+ main(args)
241
+
242
+
243
+
main_joint.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training script of ReferFormer
3
+ Modified from DETR (https://github.com/facebookresearch/detr)
4
+ """
5
+ import argparse
6
+ import datetime
7
+ import json
8
+ import random
9
+ import time
10
+ from pathlib import Path
11
+
12
+ import numpy as np
13
+ import torch
14
+ from torch.utils.data import DataLoader, DistributedSampler
15
+
16
+ import util.misc as utils
17
+ import datasets.samplers as samplers
18
+ from datasets import build_dataset, get_coco_api_from_dataset
19
+ from engine import train_one_epoch, evaluate, evaluate_a2d
20
+ from models import build_model
21
+
22
+ from tools.load_pretrained_weights import pre_trained_model_to_finetune
23
+
24
+ import opts
25
+
26
+
27
+
28
+ def main(args):
29
+ args.masks = True
30
+ args.dataset_file = 'joint' # joint training of ytvos and refcoco
31
+ args.binary = 1 # only run on binary referred
32
+
33
+ utils.init_distributed_mode(args)
34
+ print("git:\n {}\n".format(utils.get_sha()))
35
+ print(args)
36
+
37
+ print(f'\n Run on {args.dataset_file} dataset.')
38
+ print('\n')
39
+
40
+ device = torch.device(args.device)
41
+
42
+ # fix the seed for reproducibility
43
+ seed = args.seed + utils.get_rank()
44
+ torch.manual_seed(seed)
45
+ np.random.seed(seed)
46
+ random.seed(seed)
47
+
48
+ model, criterion, postprocessor = build_model(args)
49
+ model.to(device)
50
+
51
+ model_without_ddp = model
52
+ if args.distributed:
53
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
54
+ model_without_ddp = model.module
55
+
56
+ # for n, p in model_without_ddp.named_parameters():
57
+ # print(n)
58
+
59
+ n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
60
+ print('number of params:', n_parameters)
61
+
62
+ def match_name_keywords(n, name_keywords):
63
+ out = False
64
+ for b in name_keywords:
65
+ if b in n:
66
+ out = True
67
+ break
68
+ return out
69
+
70
+
71
+ param_dicts = [
72
+ {
73
+ "params":
74
+ [p for n, p in model_without_ddp.named_parameters()
75
+ if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_text_encoder_names)
76
+ and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
77
+ "lr": args.lr,
78
+ },
79
+ {
80
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
81
+ "lr": args.lr_backbone,
82
+ },
83
+ {
84
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_text_encoder_names) and p.requires_grad],
85
+ "lr": args.lr_text_encoder,
86
+ },
87
+ {
88
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
89
+ "lr": args.lr * args.lr_linear_proj_mult,
90
+ }
91
+ ]
92
+ optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
93
+ weight_decay=args.weight_decay)
94
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_drop)
95
+
96
+ # no validation ground truth for ytvos dataset
97
+ dataset_train = build_dataset(args.dataset_file, image_set='train', args=args)
98
+
99
+ if args.distributed:
100
+ if args.cache_mode:
101
+ sampler_train = samplers.NodeDistributedSampler(dataset_train)
102
+ else:
103
+ sampler_train = samplers.DistributedSampler(dataset_train)
104
+ else:
105
+ sampler_train = torch.utils.data.RandomSampler(dataset_train)
106
+
107
+ batch_sampler_train = torch.utils.data.BatchSampler(
108
+ sampler_train, args.batch_size, drop_last=True)
109
+
110
+ data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
111
+ collate_fn=utils.collate_fn, num_workers=args.num_workers)
112
+
113
+
114
+ output_dir = Path(args.output_dir)
115
+ if args.resume:
116
+ if args.resume.startswith('https'):
117
+ checkpoint = torch.hub.load_state_dict_from_url(
118
+ args.resume, map_location='cpu', check_hash=True)
119
+ else:
120
+ checkpoint = torch.load(args.resume, map_location='cpu')
121
+ missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
122
+ unexpected_keys = [k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops'))]
123
+ if len(missing_keys) > 0:
124
+ print('Missing Keys: {}'.format(missing_keys))
125
+ if len(unexpected_keys) > 0:
126
+ print('Unexpected Keys: {}'.format(unexpected_keys))
127
+ if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
128
+ import copy
129
+ p_groups = copy.deepcopy(optimizer.param_groups)
130
+ optimizer.load_state_dict(checkpoint['optimizer'])
131
+ for pg, pg_old in zip(optimizer.param_groups, p_groups):
132
+ pg['lr'] = pg_old['lr']
133
+ pg['initial_lr'] = pg_old['initial_lr']
134
+ print(optimizer.param_groups)
135
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
136
+ # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
137
+ args.override_resumed_lr_drop = True
138
+ if args.override_resumed_lr_drop:
139
+ print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
140
+ lr_scheduler.step_size = args.lr_drop
141
+ lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
142
+ lr_scheduler.step(lr_scheduler.last_epoch)
143
+ args.start_epoch = checkpoint['epoch'] + 1
144
+
145
+ if args.eval:
146
+ assert args.dataset_file == 'a2d' or args.dataset_file == 'jhmdb', \
147
+ 'Only A2D-Sentences and JHMDB-Sentences datasets support evaluation'
148
+ test_stats = evaluate_a2d(model, data_loader_val, postprocessor, device, args)
149
+ return
150
+
151
+
152
+ print("Start training")
153
+ start_time = time.time()
154
+ for epoch in range(args.start_epoch, args.epochs):
155
+ if args.distributed:
156
+ sampler_train.set_epoch(epoch)
157
+ train_stats = train_one_epoch(
158
+ model, criterion, data_loader_train, optimizer, device, epoch,
159
+ args.clip_max_norm)
160
+ lr_scheduler.step()
161
+ if args.output_dir:
162
+ checkpoint_paths = [output_dir / 'checkpoint.pth']
163
+ # extra checkpoint before LR drop and every epochs
164
+ # if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 1 == 0:
165
+ if (epoch + 1) % 1 == 0:
166
+ checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
167
+ for checkpoint_path in checkpoint_paths:
168
+ utils.save_on_master({
169
+ 'model': model_without_ddp.state_dict(),
170
+ 'optimizer': optimizer.state_dict(),
171
+ 'lr_scheduler': lr_scheduler.state_dict(),
172
+ 'epoch': epoch,
173
+ 'args': args,
174
+ }, checkpoint_path)
175
+
176
+ log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
177
+ 'epoch': epoch,
178
+ 'n_parameters': n_parameters}
179
+
180
+
181
+ if args.output_dir and utils.is_main_process():
182
+ with (output_dir / "log.txt").open("a") as f:
183
+ f.write(json.dumps(log_stats) + "\n")
184
+
185
+
186
+ total_time = time.time() - start_time
187
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
188
+ print('Training time {}'.format(total_time_str))
189
+
190
+
191
+ if __name__ == '__main__':
192
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
193
+ args = parser.parse_args()
194
+ if args.output_dir:
195
+ Path(args.output_dir).mkdir(parents=True, exist_ok=True)
196
+ main(args)
197
+
198
+
main_pretrain.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import datetime
3
+ import json
4
+ import random
5
+ import time
6
+ from pathlib import Path
7
+ from collections import namedtuple
8
+ from functools import partial
9
+
10
+ import os
11
+ import numpy as np
12
+ import torch
13
+ from torch.utils.data import DataLoader, DistributedSampler
14
+
15
+ import util.misc as utils
16
+ import datasets.samplers as samplers
17
+ from datasets.coco_eval import CocoEvaluator
18
+ from datasets import build_dataset, get_coco_api_from_dataset
19
+ from engine import evaluate, train_one_epoch
20
+ from models import build_model
21
+ from models.postprocessors import build_postprocessors
22
+
23
+ import opts
24
+
25
+
26
+
27
+ def main(args):
28
+ # set environ
29
+ os.environ["MDETR_CPU_REDUCE"] = "1"
30
+
31
+ args.masks = True
32
+ assert args.dataset_file in ["refcoco", "refcoco+", "refcocog", "all"]
33
+
34
+ utils.init_distributed_mode(args)
35
+ print("git:\n {}\n".format(utils.get_sha()))
36
+ print(args)
37
+
38
+ device = torch.device(args.device)
39
+
40
+ # fix the seed for reproducibility
41
+ seed = args.seed + utils.get_rank()
42
+ torch.manual_seed(seed)
43
+ np.random.seed(seed)
44
+ random.seed(seed)
45
+
46
+ model, criterion, postprocessors = build_model(args)
47
+ model.to(device)
48
+
49
+ model_without_ddp = model
50
+ if args.distributed:
51
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
52
+ model_without_ddp = model.module
53
+ n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
54
+ print('number of params:', n_parameters)
55
+
56
+ # lr_backbone_names = ["backbone.0", "text_encoder"]
57
+ def match_name_keywords(n, name_keywords):
58
+ out = False
59
+ for b in name_keywords:
60
+ if b in n:
61
+ out = True
62
+ break
63
+ return out
64
+
65
+ # for n, p in model_without_ddp.named_parameters():
66
+ # print(n)
67
+
68
+ param_dicts = [
69
+ {
70
+ "params":
71
+ [p for n, p in model_without_ddp.named_parameters()
72
+ if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_text_encoder_names)
73
+ and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
74
+ "lr": args.lr,
75
+ },
76
+ {
77
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
78
+ "lr": args.lr_backbone,
79
+ },
80
+ {
81
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_text_encoder_names) and p.requires_grad],
82
+ "lr": args.lr_text_encoder,
83
+ },
84
+ {
85
+ "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
86
+ "lr": args.lr * args.lr_linear_proj_mult,
87
+ }
88
+ ]
89
+ optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
90
+ weight_decay=args.weight_decay)
91
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, args.lr_drop)
92
+
93
+ # build train dataset
94
+ if args.dataset_file != "all":
95
+ dataset_train = build_dataset(args.dataset_file, image_set='train', args=args)
96
+ else:
97
+ dataset_names = ["refcoco", "refcoco+", "refcocog"]
98
+ dataset_train = torch.utils.data.ConcatDataset(
99
+ [build_dataset(name, image_set="train", args=args) for name in dataset_names]
100
+ )
101
+
102
+ print("\nTrain dataset sample number: ", len(dataset_train))
103
+ print("\n")
104
+
105
+ if args.distributed:
106
+ if args.cache_mode:
107
+ sampler_train = samplers.NodeDistributedSampler(dataset_train)
108
+ else:
109
+ sampler_train = samplers.DistributedSampler(dataset_train)
110
+ else:
111
+ sampler_train = torch.utils.data.RandomSampler(dataset_train)
112
+
113
+ batch_sampler_train = torch.utils.data.BatchSampler(
114
+ sampler_train, args.batch_size, drop_last=True)
115
+
116
+ data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
117
+ collate_fn=utils.collate_fn, num_workers=args.num_workers,
118
+ pin_memory=True)
119
+
120
+ # build val datasets
121
+ Val_all = namedtuple(typename="val_data", field_names=["dataset_name", "dataloader", "base_ds", "evaluator_list"])
122
+ if args.dataset_file != "all":
123
+ dataset_names = [args.dataset_file]
124
+ else:
125
+ dataset_names = ["refcoco", "refcoco+", "refcocog"]
126
+
127
+ val_tuples = []
128
+ for name in dataset_names:
129
+ dataset_val = build_dataset(name, image_set="val", args=args)
130
+ sampler_val = (
131
+ samplers.DistributedSampler(dataset_val, shuffle=False) if args.distributed else torch.utils.data.SequentialSampler(dataset_val)
132
+ )
133
+ data_loader_val = DataLoader(
134
+ dataset_val,
135
+ args.batch_size,
136
+ sampler=sampler_val,
137
+ drop_last=False,
138
+ collate_fn=utils.collate_fn,
139
+ num_workers=args.num_workers,
140
+ )
141
+ base_ds = get_coco_api_from_dataset(dataset_val)
142
+ val_tuples.append(Val_all(dataset_name=name, dataloader=data_loader_val, base_ds=base_ds, evaluator_list=None))
143
+
144
+ # build evaluator list for dataset_val
145
+ def build_evaluator_list(base_ds, dataset_name):
146
+ """Helper function to build the list of evaluators for a given dataset"""
147
+ evaluator_list = []
148
+ iou_types = ["bbox"]
149
+ if args.masks:
150
+ iou_types.append("segm")
151
+
152
+ evaluator_list.append(CocoEvaluator(base_ds, tuple(iou_types), useCats=False))
153
+ # TODO: currently ont support RefExpEvaluator (memory error)
154
+ return evaluator_list
155
+
156
+
157
+
158
+ output_dir = Path(args.output_dir)
159
+ if args.resume:
160
+ print("Resume from {}".format(args.resume))
161
+ if args.resume.startswith('https'):
162
+ checkpoint = torch.hub.load_state_dict_from_url(
163
+ args.resume, map_location='cpu', check_hash=True)
164
+ else:
165
+ checkpoint = torch.load(args.resume, map_location='cpu')
166
+ missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
167
+ unexpected_keys = [k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops'))]
168
+ if len(missing_keys) > 0:
169
+ print('Missing Keys: {}'.format(missing_keys))
170
+ if len(unexpected_keys) > 0:
171
+ print('Unexpected Keys: {}'.format(unexpected_keys))
172
+ if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
173
+ import copy
174
+ p_groups = copy.deepcopy(optimizer.param_groups)
175
+ optimizer.load_state_dict(checkpoint['optimizer'])
176
+ for pg, pg_old in zip(optimizer.param_groups, p_groups):
177
+ pg['lr'] = pg_old['lr']
178
+ pg['initial_lr'] = pg_old['initial_lr']
179
+ print(optimizer.param_groups)
180
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
181
+ # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
182
+ args.override_resumed_lr_drop = True
183
+ if args.override_resumed_lr_drop:
184
+ print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
185
+ lr_scheduler.step_size = args.lr_drop
186
+ lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
187
+ lr_scheduler.step(lr_scheduler.last_epoch)
188
+ args.start_epoch = checkpoint['epoch'] + 1
189
+ if not args.eval:
190
+ test_stats = {}
191
+ for i, item in enumerate(val_tuples):
192
+ evaluator_list = build_evaluator_list(item.base_ds, item.dataset_name)
193
+ postprocessors = build_postprocessors(args, item.dataset_name)
194
+ item = item._replace(evaluator_list=evaluator_list)
195
+ print(f"Evaluating {item.dataset_name}")
196
+ curr_test_stats = evaluate(
197
+ model=model,
198
+ criterion=criterion,
199
+ postprocessors=postprocessors,
200
+ data_loader=item.dataloader,
201
+ evaluator_list=item.evaluator_list,
202
+ device=device,
203
+ args=args,
204
+ )
205
+ test_stats.update({item.dataset_name + "_" + k: v for k, v in curr_test_stats.items()})
206
+
207
+ log_stats = {
208
+ **{f"test_{k}": v for k, v in test_stats.items()},
209
+ "n_parameters": n_parameters,
210
+ }
211
+ print(log_stats)
212
+
213
+
214
+ if args.eval:
215
+ print("Evaluating......")
216
+ test_stats = {}
217
+ for i, item in enumerate(val_tuples):
218
+ evaluator_list = build_evaluator_list(item.base_ds, item.dataset_name)
219
+ postprocessors = build_postprocessors(args, item.dataset_name)
220
+ item = item._replace(evaluator_list=evaluator_list)
221
+ print(f"Evaluating {item.dataset_name}")
222
+ curr_test_stats = evaluate(
223
+ model=model,
224
+ criterion=criterion,
225
+ postprocessors=postprocessors,
226
+ data_loader=item.dataloader,
227
+ evaluator_list=item.evaluator_list,
228
+ device=device,
229
+ args=args,
230
+ )
231
+ test_stats.update({item.dataset_name + "_" + k: v for k, v in curr_test_stats.items()})
232
+
233
+ log_stats = {
234
+ **{f"test_{k}": v for k, v in test_stats.items()},
235
+ "n_parameters": n_parameters,
236
+ }
237
+ print(log_stats)
238
+
239
+ return
240
+
241
+
242
+ print("Start training")
243
+ start_time = time.time()
244
+ for epoch in range(args.start_epoch, args.epochs):
245
+ if args.distributed:
246
+ sampler_train.set_epoch(epoch)
247
+ train_stats = train_one_epoch(
248
+ model, criterion, data_loader_train, optimizer, device, epoch,
249
+ args.clip_max_norm)
250
+ lr_scheduler.step()
251
+ if args.output_dir:
252
+ checkpoint_paths = [output_dir / 'checkpoint.pth']
253
+ # extra checkpoint before LR drop and every epochs
254
+ # if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 1 == 0:
255
+ if (epoch + 1) % 1 == 0:
256
+ checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
257
+ for checkpoint_path in checkpoint_paths:
258
+ utils.save_on_master({
259
+ 'model': model_without_ddp.state_dict(),
260
+ 'optimizer': optimizer.state_dict(),
261
+ 'lr_scheduler': lr_scheduler.state_dict(),
262
+ 'epoch': epoch,
263
+ 'args': args,
264
+ }, checkpoint_path)
265
+
266
+ test_stats = {}
267
+ for i, item in enumerate(val_tuples):
268
+ evaluator_list = build_evaluator_list(item.base_ds, item.dataset_name)
269
+ postprocessors = build_postprocessors(args, item.dataset_name)
270
+ item = item._replace(evaluator_list=evaluator_list)
271
+ print(f"Evaluating {item.dataset_name}")
272
+ curr_test_stats = evaluate(
273
+ model=model,
274
+ criterion=criterion,
275
+ postprocessors=postprocessors,
276
+ data_loader=item.dataloader,
277
+ evaluator_list=item.evaluator_list,
278
+ device=device,
279
+ args=args,
280
+ )
281
+ test_stats.update({item.dataset_name + "_" + k: v for k, v in curr_test_stats.items()})
282
+
283
+ log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
284
+ **{f'test_{k}': v for k, v in test_stats.items()},
285
+ 'epoch': epoch,
286
+ 'n_parameters': n_parameters}
287
+
288
+ if args.output_dir and utils.is_main_process():
289
+ with (output_dir / "log.txt").open("a") as f:
290
+ f.write(json.dumps(log_stats) + "\n")
291
+
292
+
293
+ total_time = time.time() - start_time
294
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
295
+ print('Training time {}'.format(total_time_str))
296
+
297
+
298
+ if __name__ == '__main__':
299
+ parser = argparse.ArgumentParser('ReferFormer pretrain training and evaluation script', parents=[opts.get_args_parser()])
300
+ args = parser.parse_args()
301
+ if args.output_dir:
302
+ Path(args.output_dir).mkdir(parents=True, exist_ok=True)
303
+ main(args)
304
+
make_refcoco/refcocog_google/motion_split_generation_grefg_val.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
make_refcoco/refcocog_google/part4_ref_id.txt ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 4859
2
+ 678
3
+ 162
4
+ 3052
5
+ 2355
6
+ 3408
7
+ 834
8
+ 328
9
+ 1646
10
+ 4400
11
+ 3683
12
+ 3788
13
+ 4701
14
+ 1211
15
+ 2138
16
+ 3510
17
+ 899
18
+ 293
19
+ 3196
20
+ 1939
21
+ 2659
22
+ 2849
23
+ 756
24
+ 4573
25
+ 4514
26
+ 304
27
+ 3465
28
+ 1092
29
+ 2025
30
+ 1701
31
+ 2958
32
+ 4793
33
+ 1247
34
+ 1841
35
+ 4404
36
+ 4536
37
+ 2787
38
+ 3377
39
+ 3889
40
+ 2194
41
+ 2969
42
+ 1951
43
+ 508
44
+ 2312
45
+ 3948
46
+ 1388
47
+ 2690
48
+ 1109
49
+ 1374
50
+ 3475
51
+ 1333
52
+ 2068
53
+ 2824
54
+ 2294
55
+ 2446
56
+ 4771
57
+ 2686
58
+ 4558
59
+ 1499
60
+ 4303
61
+ 1376
62
+ 3544
63
+ 1858
64
+ 434
65
+ 3024
66
+ 513
67
+ 693
68
+ 2523
69
+ 4987
70
+ 3133
71
+ 4041
72
+ 2105
73
+ 135
74
+ 3613
75
+ 1722
76
+ 1607
77
+ 2761
78
+ 2454
79
+ 1603
80
+ 4794
81
+ 2485
82
+ 3280
83
+ 3336
84
+ 3118
85
+ 4494
86
+ 3004
87
+ 127
88
+ 3389
89
+ 2568
90
+ 2283
91
+ 1530
92
+ 4251
93
+ 2540
94
+ 2870
95
+ 4946
96
+ 113
97
+ 711
98
+ 3209
99
+ 3620
100
+ 4382
101
+ 2861
102
+ 3954
103
+ 1984
104
+ 2069
105
+ 2016
106
+ 1153
107
+ 3614
108
+ 198
109
+ 3012
110
+ 4247
111
+ 2205
112
+ 4831
113
+ 4534
114
+ 638
115
+ 1419
116
+ 1992
117
+ 542
118
+ 2223
119
+ 4865
120
+ 751
121
+ 3540
122
+ 3765
123
+ 2879
124
+ 4529
125
+ 2131
126
+ 1306
127
+ 3508
128
+ 4165
129
+ 4126
130
+ 388
make_refcoco/refcocog_google/revised_refid_part4.json ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "4859": {
3
+ "(motion, 101105)": "man sitting on the ground playing wii",
4
+ "(static, 101106)": "man in white and light blue t - shirt"
5
+ },
6
+ "678": {
7
+ "(motion, 14720)": "the man crouching inside the plane",
8
+ "(static, 14721)": "the man wearing white hat"
9
+ },
10
+ "162": {
11
+ "(motion, 2908)": "the man resting his face on his hands",
12
+ "(static, 2909)": "the man with a plastic bag between his feet"
13
+ },
14
+ "3052": {
15
+ "(motion, 63901)": "person looking at a book",
16
+ "(static, 63902)": "person wearing a hat and backpack"
17
+ },
18
+ "2355": {
19
+ "(motion, 49522)": "the cat sitting in the chair",
20
+ "(static, 49523)": "cat on left side"
21
+ },
22
+ "3408": {
23
+ "(motion, 71397)": "a man bending and judging a tennis match",
24
+ "(static, 71398)": "a man wearing a red shirt and black pants"
25
+ },
26
+ "834": {
27
+ "(motion, 17983)": "a giraffe who is eating hay out of a feeder",
28
+ "(static, 17984)": "the giraffe on the right side of the pole"
29
+ },
30
+ "328": {
31
+ "(motion, 6730)": "person bending over",
32
+ "(static, 6731)": "big person in blue cap"
33
+ },
34
+ "1646": {
35
+ "(motion, 35169)": "person about to hit a ball",
36
+ "(static, 35170)": "person wearing shirt and pants"
37
+ },
38
+ "4400": {
39
+ "(motion, 91825)": "boy sitting on his skateboard and looking at another boy",
40
+ "(static, 91826)": "boy wearing dark t - shirt and jeans"
41
+ },
42
+ "3683": {
43
+ "(motion, 77184)": "a man dishing up food",
44
+ "(static, 77185)": "a man in military camo and a black hat on the right"
45
+ },
46
+ "3788": {
47
+ "(motion, 79367)": "a black cat sitting and starring",
48
+ "(static, 79368)": "a cat with a heart shaped tag"
49
+ },
50
+ "4701": {
51
+ "(motion, 97795)": "person whose tie is being pulled by another person",
52
+ "(static, 97796)": "person in blue shirt with a red undone tie"
53
+ },
54
+ "1211": {
55
+ "(motion, 26003)": "person putting arm around another person",
56
+ "(static, 26004)": "person with backpack"
57
+ },
58
+ "2138": {
59
+ "(motion, 45446)": "a person sleeping on the top bunk",
60
+ "(static, 45447)": "a person in a green shirt and brown shorts"
61
+ },
62
+ "3510": {
63
+ "(motion, 73478)": "personn sitting in a train compartment and reading book",
64
+ "(static, 73479)": "person in striped shirt"
65
+ },
66
+ "899": {
67
+ "(motion, 19308)": "a man serving soup",
68
+ "(static, 19309)": "a man with tattoo on his arm"
69
+ },
70
+ "293": {
71
+ "(motion, 5939)": "a lady laughing and looking at another lady",
72
+ "(static, 5940)": "a lady with dark hair and a dark shirt"
73
+ },
74
+ "3196": {
75
+ "(motion, 67017)": "person holding a pen",
76
+ "(static, 67018)": "person in a brown suit"
77
+ },
78
+ "1939": {
79
+ "(motion, 41076)": "a person sitting cross legged on the beach",
80
+ "(static, 41077)": "person in khakis and a white shirt with yellow flowers"
81
+ },
82
+ "2659": {
83
+ "(motion, 56121)": "person helping another cross a stream",
84
+ "(static, 56122)": "person in white dress"
85
+ },
86
+ "2849": {
87
+ "(motion, 59798)": "person looking down drinking a glass of wine",
88
+ "(static, 59799)": "person on the right side not wearing glasses"
89
+ },
90
+ "756": {
91
+ "(motion, 16375)": "the woman about to pick up a slice of pizza",
92
+ "(static, 16376)": "a woman with a flower shirt"
93
+ },
94
+ "4573": {
95
+ "(motion, 95258)": "person reaching for another person with the frisbee",
96
+ "(static, 95259)": "person with blue and white striped shirt on"
97
+ },
98
+ "4514": {
99
+ "(motion, 94061)": "person running behind",
100
+ "(static, 94062)": "person in dark brown top and jeans"
101
+ },
102
+ "304": {
103
+ "(motion, 6165)": "person resting her head in hand and crossing one's legs",
104
+ "(static, 6166)": "the person in pink jacket"
105
+ },
106
+ "3465": {
107
+ "(motion, 72753)": "person sitting on a love seat and watching others play wii",
108
+ "(static, 72754)": "person in a black shirt and white shorts"
109
+ },
110
+ "1092": {
111
+ "(motion, 23796)": "a bear standing up with its mouth open",
112
+ "(static, 23797)": "a bear on the right"
113
+ },
114
+ "2025": {
115
+ "(motion, 42838)": "the person leading the horse",
116
+ "(static, 42839)": "the person in gray top and jeans"
117
+ },
118
+ "1701": {
119
+ "(motion, 36094)": "giraffe biting off of a tree",
120
+ "(static, 36095)": "tall giraffe on the right"
121
+ },
122
+ "2958": {
123
+ "(motion, 62137)": "person playing with dog",
124
+ "(static, 62138)": "balding person wearing brown hoodie"
125
+ },
126
+ "4793": {
127
+ "(motion, 99824)": "the girl eating and looking at her plate",
128
+ "(static, 99825)": "the girl wearing a pink shirt"
129
+ },
130
+ "1247": {
131
+ "(motion, 26624)": "the person holding the bat",
132
+ "(static, 26625)": "the person in white t - shirt and grey pants"
133
+ },
134
+ "1841": {
135
+ "(motion, 38888)": "person resting hands on other people's shoulders",
136
+ "(static, 38889)": "tallest person wearing bright suit"
137
+ },
138
+ "4404": {
139
+ "(motion, 91907)": "a elephant whose trunk pointing to the floor , may be touching",
140
+ "(static, 91908)": "elephant more on the right side of the picture"
141
+ },
142
+ "4536": {
143
+ "(motion, 94448)": "a person reaching for the microwave looking at the camera",
144
+ "(static, 94449)": "person in black t shirt"
145
+ },
146
+ "2787": {
147
+ "(motion, 58740)": "a giraffe snacking on the tree",
148
+ "(static, 58741)": "a giraffe on the right"
149
+ },
150
+ "3377": {
151
+ "(motion, 70765)": "a zebra resting its head on another zebra ' s back",
152
+ "(static, 70766)": "a zebra on the left"
153
+ },
154
+ "3889": {
155
+ "(motion, 81051)": "a man holding a basket of pastries",
156
+ "(static, 81052)": "a man wearing grey hoodie"
157
+ },
158
+ "2194": {
159
+ "(motion, 46507)": "standing dog",
160
+ "(static, 46508)": "a black and white dog with a blue collar tag"
161
+ },
162
+ "508": {
163
+ "(motion, 11146)": "person being held by another person",
164
+ "(static, 11147)": "person dressed in a red suit and blue cap"
165
+ },
166
+ "2312": {
167
+ "(motion, 48847)": "a bird standing on a table",
168
+ "(static, 48848)": "a bird on the left"
169
+ },
170
+ "3948": {
171
+ "(motion, 82190)": "the woman who is squinting in one eye",
172
+ "(static, 82191)": "a blue eyed brown haired woman not wearing glasses"
173
+ },
174
+ "1388": {
175
+ "(motion, 29353)": "person holding another person while watching giraffe drink water",
176
+ "(static, 29354)": "person in brown shirt with bag"
177
+ },
178
+ "2690": {
179
+ "(motion, 56849)": "a man about to kick a ball",
180
+ "(static, 56850)": "a man in all white with number 23 on his chest"
181
+ },
182
+ "1109": {
183
+ "(motion, 24043)": "man holding the ktie",
184
+ "(static, 24044)": "man on the right"
185
+ },
186
+ "1374": {
187
+ "(motion, 29120)": "person arranging pansts of another person",
188
+ "(static, 29121)": "the person with in the black tuxedo and glasses in his head"
189
+ },
190
+ "3475": {
191
+ "(motion, 72951)": "woman holding the horse",
192
+ "(static, 72952)": "a woman wearing spectacles with violet shirt and flourecent colour waist vest"
193
+ },
194
+ "1333": {
195
+ "(motion, 28225)": "a person holding another person",
196
+ "(static, 28226)": "a person in a pink and orange flannel shirt"
197
+ },
198
+ "2068": {
199
+ "(motion, 43909)": "person standing and playing wii",
200
+ "(static, 43910)": "person wearing black t - shirt"
201
+ },
202
+ "2824": {
203
+ "(motion, 59394)": "person standing besides a table crossing arms",
204
+ "(static, 59395)": "person with glasses and long hair"
205
+ },
206
+ "2294": {
207
+ "(motion, 48483)": "a person sitting on bike holding another person",
208
+ "(static, 48484)": "a person with a helmet on the head"
209
+ },
210
+ "2446": {
211
+ "(motion, 51355)": "an elephant that has it ' s trunk pointing towards the water",
212
+ "(static, 51356)": "elephant on the left"
213
+ },
214
+ "2686": {
215
+ "(motion, 56783)": "a man staring at another man",
216
+ "(static, 56784)": "a man in an orange tie"
217
+ },
218
+ "4558": {
219
+ "(motion, 94950)": "a zebra facing the camera",
220
+ "(static, 94951)": "a small zebra beside a larger zebra"
221
+ },
222
+ "1499": {
223
+ "(motion, 32051)": "a man resting on a metal fence",
224
+ "(static, 32052)": "a man in white shirt and polka dot tie"
225
+ },
226
+ "4303": {
227
+ "(motion, 89833)": "a man throwing a banana",
228
+ "(static, 89834)": "a man in bike gear on the right of the picture"
229
+ },
230
+ "1376": {
231
+ "(motion, 29146)": "a man sitting down with his hands together",
232
+ "(static, 29147)": "a man with a purple shirt and khaki pants "
233
+ },
234
+ "3544": {
235
+ "(motion, 74100)": "the man holding a riding crop",
236
+ "(static, 74101)": "man in black shirt and slacks on the left"
237
+ },
238
+ "1858": {
239
+ "(motion, 39103)": "a bull standing",
240
+ "(static, 39104)": "a white and brown bull on the left of the picture"
241
+ },
242
+ "434": {
243
+ "(motion, 9561)": "the man looking down",
244
+ "(static, 9562)": "the man on the left"
245
+ },
246
+ "3024": {
247
+ "(motion, 63345)": "a baseball player sliding into a base",
248
+ "(static, 63346)": "baseball player wearing the number 12"
249
+ },
250
+ "513": {
251
+ "(motion, 11239)": "a man riding on a skateboard as his picture is being taken",
252
+ "(static, 11240)": "a man in a purple t - shirt and ripped jeans"
253
+ },
254
+ "693": {
255
+ "(motion, 14989)": "a person standing",
256
+ "(static, 14990)": "a small person"
257
+ },
258
+ "2523": {
259
+ "(motion, 53103)": "a baseball player sliding into home plate and getting tagged by the catcher",
260
+ "(static, 53104)": "a la dodgers player on the right of the picture"
261
+ },
262
+ "4987": {
263
+ "(motion, 104145)": "a girl punching out her arm while playing an interactive video game",
264
+ "(static, 104146)": "girl wearing grey and white stripes and sweatpants"
265
+ },
266
+ "4041": {
267
+ "(motion, 84159)": "soccer player about to kick soccer ball",
268
+ "(static, 84160)": "soccer player wearing black t - shirt and black gloves"
269
+ },
270
+ "2105": {
271
+ "(motion, 44674)": "a baseball player holding his arm up to catch a ball",
272
+ "(static, 44675)": "a baseball player wearing helmet and vest"
273
+ },
274
+ "135": {
275
+ "(motion, 2353)": "dog resting it ' s head on a table",
276
+ "(static, 2354)": "golden dog"
277
+ },
278
+ "3613": {
279
+ "(motion, 75580)": "person talking to another person while crossing legs",
280
+ "(static, 75581)": "person with long sleeve shirt, jeans and cap"
281
+ },
282
+ "1722": {
283
+ "(motion, 36451)": "person pulling another person's tie",
284
+ "(static, 36452)": "blonde person in black dress"
285
+ },
286
+ "1607": {
287
+ "(motion, 34281)": "a person reading a book to another person he ' s holding",
288
+ "(static, 34282)": "a bald person wearing a beige t - shirt and gray jeans"
289
+ },
290
+ "2761": {
291
+ "(motion, 58225)": "girl propping her chin on her hand",
292
+ "(static, 58226)": "girl in a pink shirt near window"
293
+ },
294
+ "2454": {
295
+ "(motion, 51492)": "a man looking at laptop",
296
+ "(static, 51493)": "the man with glasses and painted fingernails"
297
+ },
298
+ "1603": {
299
+ "(motion, 34234)": "person eating a donut",
300
+ "(static, 34235)": "person with the black beanie"
301
+ },
302
+ "4794": {
303
+ "(motion, 99868)": "a duck that is looking straight ahead",
304
+ "(static, 99869)": "the duck on the right side"
305
+ },
306
+ "2485": {
307
+ "(motion, 52246)": "a person reaching across the net",
308
+ "(static, 52247)": "tallest person in a grey shirt and shorts"
309
+ },
310
+ "3280": {
311
+ "(motion, 68799)": "a boy walking towards his skate board",
312
+ "(static, 68800)": "a boy in a striped shirt"
313
+ },
314
+ "3336": {
315
+ "(motion, 69882)": "person holding a piece of chocolate cake",
316
+ "(static, 69883)": "person wearing a purple dress"
317
+ },
318
+ "3118": {
319
+ "(motion, 65349)": "giraffe stretching its neck straight up",
320
+ "(static, 65350)": "taller giraffe"
321
+ },
322
+ "4494": {
323
+ "(motion, 93729)": "man touching the frisbee",
324
+ "(static, 93730)": "a man in a white shirt"
325
+ },
326
+ "3004": {
327
+ "(motion, 62940)": "person crouching to catch a ball",
328
+ "(static, 62941)": "person in a red uniform and helmet"
329
+ },
330
+ "127": {
331
+ "(motion, 2256)": "a person holding a plate",
332
+ "(static, 2257)": "the person in the purple coat"
333
+ },
334
+ "3389": {
335
+ "(motion, 70905)": "person waving",
336
+ "(static, 70906)": "person in black sneakers"
337
+ },
338
+ "2568": {
339
+ "(motion, 54256)": "person looking at phone",
340
+ "(static, 54257)": "blonde person on the right"
341
+ },
342
+ "2283": {
343
+ "(motion, 48251)": "the cook holding a plate",
344
+ "(static, 48252)": "middle cook of three cooks"
345
+ },
346
+ "1530": {
347
+ "(motion, 32639)": "person petting the cat",
348
+ "(static, 32640)": "person with sleeves rolled up"
349
+ },
350
+ "4251": {
351
+ "(motion, 88833)": "a person reading a book",
352
+ "(static, 88834)": "person in a striped jacket "
353
+ },
354
+ "2540": {
355
+ "(motion, 53539)": "a man reaching out his right arm holding a controller",
356
+ "(static, 53540)": "a man in red shirt and black jeans"
357
+ },
358
+ "2870": {
359
+ "(motion, 60169)": "a person watching horse riding",
360
+ "(static, 60170)": "a person in a white jacket and beige pants"
361
+ },
362
+ "4946": {
363
+ "(motion, 103092)": "a man about to hit a ball",
364
+ "(static, 103093)": "a man in red shirt and blue vest"
365
+ },
366
+ "113": {
367
+ "(motion, 1973)": "person holding phone",
368
+ "(static, 1974)": "person with a black shirt and brown coat"
369
+ },
370
+ "711": {
371
+ "(motion, 15398)": "girl crouching and holding an umbrella",
372
+ "(static, 15399)": "girl wearing light green socks on the left"
373
+ },
374
+ "3209": {
375
+ "(motion, 67236)": "the person that is sliding into home , getting tagged out by the catcher",
376
+ "(static, 67237)": "the person in the white vest over the blue shirt"
377
+ },
378
+ "3620": {
379
+ "(motion, 75711)": "person petting a horse",
380
+ "(static, 75712)": "a person in white t - shirt"
381
+ },
382
+ "4382": {
383
+ "(motion, 91559)": "horse being hugged by a person",
384
+ "(static, 91560)": "white and brown horse"
385
+ },
386
+ "2861": {
387
+ "(motion, 60004)": "a man playing tennis",
388
+ "(static, 60005)": "a man wearing a blue shirt and white shorts"
389
+ },
390
+ "3954": {
391
+ "(motion, 82306)": "a person putting gloves on",
392
+ "(static, 82307)": "person with dark blue jumper"
393
+ },
394
+ "1984": {
395
+ "(motion, 42076)": "a person being held by another person",
396
+ "(static, 42077)": "little person on pink skiis with yellow parka on"
397
+ },
398
+ "2069": {
399
+ "(motion, 43945)": "a person helping another person ski",
400
+ "(static, 43946)": "a big person in white jumper and backpack"
401
+ },
402
+ "2016": {
403
+ "(motion, 42686)": "person putting food in the oven",
404
+ "(static, 42687)": "person in green t - shirt"
405
+ },
406
+ "1153": {
407
+ "(motion, 25076)": "a giraffe , with head lowered , crosses in front of another giraffe",
408
+ "(static, 25077)": "giraffe in the middle"
409
+ },
410
+ "3614": {
411
+ "(motion, 75583)": "a man in explaining something on a tablet",
412
+ "(static, 75584)": "a man with a blue cap and striped shirt"
413
+ },
414
+ "198": {
415
+ "(motion, 3830)": "a giraffe bending down to eat grass",
416
+ "(static, 3831)": "giraffe in front"
417
+ },
418
+ "3012": {
419
+ "(motion, 63097)": "person standing with hands on hips",
420
+ "(static, 63098)": "person in a white collared shirt and jeans"
421
+ },
422
+ "4247": {
423
+ "(motion, 88808)": "man pointing toward another man",
424
+ "(static, 88809)": "man in plaid shirt"
425
+ },
426
+ "2205": {
427
+ "(motion, 46674)": "person bending over",
428
+ "(static, 46675)": "person in red shirt and cap"
429
+ },
430
+ "4831": {
431
+ "(motion, 100694)": "person holding bat in hands",
432
+ "(static, 100695)": "person wearing light blue shirt and glass"
433
+ },
434
+ "4534": {
435
+ "(motion, 94419)": "the bird not drinking",
436
+ "(static, 94420)": "the bird on the left"
437
+ },
438
+ "638": {
439
+ "(motion, 13717)": "person sitting on another person's lap and holding the remote controller",
440
+ "(static, 13718)": "small person in red shirt"
441
+ },
442
+ "1419": {
443
+ "(motion, 30082)": "person squatting on the ground to catch a ball",
444
+ "(static, 30083)": "person in red and white wearing glove"
445
+ },
446
+ "1992": {
447
+ "(motion, 42197)": "a person reaching for a cupcake",
448
+ "(static, 42198)": "a person in a blue vest"
449
+ },
450
+ "542": {
451
+ "(motion, 11877)": "man receiving food",
452
+ "(static, 11878)": "a black man in a black shirt"
453
+ },
454
+ "2223": {
455
+ "(motion, 47051)": "person sitting a chair holding a protest sign",
456
+ "(static, 47052)": "old person in grey t - shirt and blue jeans"
457
+ },
458
+ "4865": {
459
+ "(motion, 101219)": "person being held by another person",
460
+ "(static, 101220)": "a young person wearing a yellow shirt"
461
+ },
462
+ "751": {
463
+ "(motion, 16247)": "person holding a painting brush",
464
+ "(static, 16248)": "person wearing white top and cap"
465
+ },
466
+ "3540": {
467
+ "(motion, 74039)": "a man swinging a bat",
468
+ "(static, 74040)": "a man in a blue baseball shirt and white pants"
469
+ },
470
+ "3765": {
471
+ "(motion, 78908)": "person sitting",
472
+ "(static, 78909)": "person wearing white shirt and red shoes"
473
+ },
474
+ "2879": {
475
+ "(motion, 60471)": "bear standing against the fence",
476
+ "(static, 60472)": "a small bear on the right"
477
+ },
478
+ "4529": {
479
+ "(motion, 94312)": "kid holding out left arm playing wii",
480
+ "(static, 94313)": "kid in a green and red sweatshirt"
481
+ },
482
+ "2131": {
483
+ "(motion, 45308)": "man putting both hands behind his head",
484
+ "(static, 45309)": "a man with the pool noodle"
485
+ },
486
+ "1306": {
487
+ "(motion, 27841)": "a cow eating grass",
488
+ "(static, 27842)": "the cow on the right"
489
+ },
490
+ "3508": {
491
+ "(motion, 73469)": "a person standing and playing a video game",
492
+ "(static, 73470)": "a little person dressed in brown"
493
+ },
494
+ "4165": {
495
+ "(motion, 87036)": "a child holding feathers",
496
+ "(static, 87037)": "a child wearing green t - shirt"
497
+ },
498
+ "4126": {
499
+ "(motion, 86073)": "a person standing and reading a book",
500
+ "(static, 86074)": "a person in a suit"
501
+ },
502
+ "388": {
503
+ "(motion, 8339)": "a man holding up an umbrella in the rain for a man who is fixing a tire",
504
+ "(static, 8340)": "a man wearing glasses in a red jacket"
505
+ }
506
+ }
make_refcoco/refcocog_umd/motion_split_generation.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
make_refcoco/refcocog_umd/part4_ref_id.txt ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 1679
2
+ 4048
3
+ 2530
4
+ 4385
5
+ 5018
6
+ 2290
7
+ 2347
8
+ 3143
9
+ 4745
10
+ 1688
11
+ 944
12
+ 3477
13
+ 2497
14
+ 4110
15
+ 2011
16
+ 2884
17
+ 1076
18
+ 4803
19
+ 3508
20
+ 169
21
+ 258
22
+ 3661
23
+ 4831
24
+ 2214
25
+ 2266
26
+ 2477
27
+ 5005
28
+ 2919
29
+ 1850
30
+ 3757
31
+ 524
32
+ 4363
33
+ 2976
34
+ 838
35
+ 3044
36
+ 2426
37
+ 2113
38
+ 2327
39
+ 4727
40
+ 859
41
+ 935
42
+ 1105
43
+ 395
44
+ 771
45
+ 2942
46
+ 41
47
+ 885
48
+ 4862
49
+ 1246
50
+ 3346
51
+ 3657
52
+ 540
53
+ 3364
54
+ 1880
55
+ 1949
56
+ 1620
57
+ 2902
58
+ 397
59
+ 732
60
+ 1173
61
+ 2920
62
+ 1643
63
+ 1454
64
+ 1725
65
+ 2338
66
+ 4249
67
+ 3917
68
+ 1156
69
+ 1998
70
+ 3571
71
+ 292
72
+ 3367
73
+ 2069
74
+ 4050
75
+ 2953
76
+ 4280
77
+ 1743
78
+ 4598
79
+ 3380
80
+ 3439
81
+ 3355
82
+ 3409
83
+ 711
84
+ 3764
85
+ 113
86
+ 518
87
+ 3158
88
+ 3223
89
+ 914
90
+ 3568
91
+ 592
92
+ 2856
93
+ 4879
94
+ 157
95
+ 1774
96
+ 2354
97
+ 174
98
+ 2369
99
+ 4247
100
+ 1014
101
+ 1080
102
+ 2272
103
+ 2495
104
+ 3511
105
+ 3955
106
+ 2409
107
+ 2775
108
+ 996
109
+ 4789
110
+ 1028
111
+ 244
112
+ 3538
113
+ 557
114
+ 1810
115
+ 4982
116
+ 4570
117
+ 1698
118
+ 3182
119
+ 846
120
+ 671
121
+ 3254
122
+ 3318
123
+ 1424
124
+ 3926
125
+ 862
126
+ 2932
make_refcoco/refcocog_umd/revised_refid_part4.json ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "1679": {
3
+ "(motion, 37582)": "player holding a baseball glove",
4
+ "(static, 37583)": "a blurred player"
5
+ },
6
+ "4048": {
7
+ "(motion, 92810)": "player hitting a ball with a baseball bat",
8
+ "(static, 92811)": "player with number 18 on his back"
9
+ },
10
+ "2530": {
11
+ "(motion, 57782)": "man crouching ready to catch a ball",
12
+ "(static, 57783)": "man with 55 on his back"
13
+ },
14
+ "4385": {
15
+ "(motion, 101410)": "man leaning on one leg watching the players",
16
+ "(static, 101411)": "man in gray pants"
17
+ },
18
+ "5018": {
19
+ "(motion, 102413)": "man standing ready to swing his bat",
20
+ "(static, 102414)": "man in front of the other two men"
21
+ },
22
+ "2290": {
23
+ "(motion, 52302)": "sheep standing in the pasture next to a sitting sheep",
24
+ "(static, 52303)": "the front most sheep"
25
+ },
26
+ "2347": {
27
+ "(motion, 53861)": "a sheep sitting down in the grass",
28
+ "(static, 53862)": "a sheep in the background"
29
+ },
30
+ "3143": {
31
+ "(motion, 71854)": "a horse being led by it ' s trainer",
32
+ "(static, 71855)": "a horse in front of the picture"
33
+ },
34
+ "1688": {
35
+ "(motion, 37818)": "zebra eating grass",
36
+ "(static, 37819)": "the zebra in the middle with its face near the ground"
37
+ },
38
+ "944": {
39
+ "(motion, 21007)": "a bird touching its neck with its right feet",
40
+ "(static, 21008)": "a bird in the back"
41
+ },
42
+ "3477": {
43
+ "(motion, 79163)": "the bird standing and looking to the left",
44
+ "(static, 79164)": "bird with both feet in the water"
45
+ },
46
+ "2497": {
47
+ "(motion, 56845)": "person holding a baseball bat",
48
+ "(static, 56846)": "person in blue baseball cap"
49
+ },
50
+ "4110": {
51
+ "(motion, 94298)": "person sitting and watching children play a ballgame",
52
+ "(static, 94299)": "person wearing a white shirt and black leggings"
53
+ },
54
+ "2011": {
55
+ "(motion, 45909)": "a woman talking on her cell phone",
56
+ "(static, 45910)": "a blonde woman wearing a blue shirt and white shorts"
57
+ },
58
+ "2884": {
59
+ "(motion, 65819)": "a woman looking at her phone",
60
+ "(static, 65820)": "a woman with black hair wearing jeans, a striped gray shirt and flip flops"
61
+ },
62
+ "1076": {
63
+ "(motion, 24000)": "person crossing a stream of water",
64
+ "(static, 24001)": "person wearing jeans and a green vest"
65
+ },
66
+ "4803": {
67
+ "(motion, 56121)": "person helping the other cross a stream",
68
+ "(static, 56122)": "person in white dress"
69
+ },
70
+ "3508": {
71
+ "(motion, 80112)": "baseball player placing his hands on his hips",
72
+ "(static, 80113)": "a baseball player named datz"
73
+ },
74
+ "169": {
75
+ "(motion, 4002)": "person feeding a giraffe",
76
+ "(static, 4003)": "a small person in light blue shirt"
77
+ },
78
+ "258": {
79
+ "(motion, 5988)": "person holding a child",
80
+ "(static, 5989)": "person wearing glasses and navy shirt"
81
+ },
82
+ "3661": {
83
+ "(motion, 83542)": "person sitting on the floor",
84
+ "(static, 83543)": "person in a grey shirt and dark pants"
85
+ },
86
+ "4831": {
87
+ "(motion, 62137)": "person sitting on couch and playing with a dog",
88
+ "(static, 62138)": "bald person wearing jeans and brown hoodie"
89
+ },
90
+ "2214": {
91
+ "(motion, 50208)": "a woman eating a donut",
92
+ "(static, 50209)": "a brown hair woman in gray sweater"
93
+ },
94
+ "2266": {
95
+ "(motion, 51661)": "a woman holding a purse",
96
+ "(static, 51662)": "a woman with blonde hair and a black shirt"
97
+ },
98
+ "2477": {
99
+ "(motion, 56429)": "girl talking and looking at another girl",
100
+ "(static, 56430)": "girl in black"
101
+ },
102
+ "5005": {
103
+ "(motion, 99824)": "girl eating and looking at her plate",
104
+ "(static, 99825)": "girl wearing a pink shirt"
105
+ },
106
+ "2919": {
107
+ "(motion, 66832)": "person riding a bike",
108
+ "(static, 66833)": "asian person wearing black jacket"
109
+ },
110
+ "1850": {
111
+ "(motion, 42078)": "man placing his hand on another man's shoulder",
112
+ "(static, 42079)": "a man who is wearing a red color tie"
113
+ },
114
+ "3757": {
115
+ "(motion, 85761)": "boy holding a cell phone",
116
+ "(static, 85762)": "boy in a blue hoodie"
117
+ },
118
+ "524": {
119
+ "(motion, 12089)": "a zebra that is not eating grass",
120
+ "(static, 12090)": "a zebra on the far right"
121
+ },
122
+ "4363": {
123
+ "(motion, 100914)": "elephant holding up its trunk",
124
+ "(static, 100915)": "an elephant in front of another"
125
+ },
126
+ "2976": {
127
+ "(motion, 68306)": "girl eating food from her right hand",
128
+ "(static, 68307)": "a girl in a black flowered top"
129
+ },
130
+ "838": {
131
+ "(motion, 18887)": "man leaning on bike on boat",
132
+ "(static, 18888)": "a man not wearing a hat"
133
+ },
134
+ "3044": {
135
+ "(motion, 69755)": "man rowing boat",
136
+ "(static, 69756)": "a man on the left side of the picture"
137
+ },
138
+ "2426": {
139
+ "(motion, 55424)": "the baseball player facing towards the right not doing a high five",
140
+ "(static, 55425)": "baseball player in catcher ' s uniform"
141
+ },
142
+ "2113": {
143
+ "(motion, 47984)": "person that is dancing",
144
+ "(static, 47985)": "person with the thick beard, glasses and a hat"
145
+ },
146
+ "2327": {
147
+ "(motion, 53376)": "person bathing another person",
148
+ "(static, 53377)": "person in a floral print dress and hat"
149
+ },
150
+ "4727": {
151
+ "(motion, 39103)": "a bull laying down",
152
+ "(static, 39104)": "a white and brown bull on the right"
153
+ },
154
+ "859": {
155
+ "(motion, 19350)": "cat sitting on a luggage and staring at the camera",
156
+ "(static, 19351)": "cat infront of another cat"
157
+ },
158
+ "935": {
159
+ "(motion, 20809)": "cat laying down on a bag",
160
+ "(static, 20810)": "cat behind another cat"
161
+ },
162
+ "1105": {
163
+ "(motion, 24654)": "an elephant stepping on a large log",
164
+ "(static, 24655)": "elephant on far right"
165
+ },
166
+ "395": {
167
+ "(motion, 8819)": "person placing her hands on one's hips",
168
+ "(static, 8820)": "person on the far left"
169
+ },
170
+ "771": {
171
+ "(motion, 17614)": "person holding a child on one's shoulders",
172
+ "(static, 17615)": "tall person on the right"
173
+ },
174
+ "2942": {
175
+ "(motion, 67334)": "person sitting on another person's shoulders",
176
+ "(static, 67335)": "small person on the right"
177
+ },
178
+ "41": {
179
+ "(motion, 961)": "a lady pouring wine in a glass",
180
+ "(static, 962)": "a lady in black tank top"
181
+ },
182
+ "885": {
183
+ "(motion, 19926)": "person feeding another person with a bottle",
184
+ "(static, 19927)": "person in black blouse"
185
+ },
186
+ "4862": {
187
+ "(motion, 69276)": "person drinking from a bottle",
188
+ "(static, 69277)": "small person in white pajamas"
189
+ },
190
+ "1246": {
191
+ "(motion, 27831)": "person holding a laptop",
192
+ "(static, 27832)": "person with curly brown hair wearing jeans"
193
+ },
194
+ "3346": {
195
+ "(motion, 76051)": "person filing her nails",
196
+ "(static, 76052)": "person wearing a red robe and has a towel on her head"
197
+ },
198
+ "3657": {
199
+ "(motion, 83493)": "person holding a bottle and listening to music",
200
+ "(static, 83494)": "person wearing black in headphones"
201
+ },
202
+ "540": {
203
+ "(motion, 12381)": "the woman is swinging the controller",
204
+ "(static, 12382)": "woman in brown top on the right"
205
+ },
206
+ "3364": {
207
+ "(motion, 76757)": "the woman looking at the camera and opening her mouth",
208
+ "(static, 76758)": "a woman wearing a brown hooded sweatshirt on the left"
209
+ },
210
+ "1880": {
211
+ "(motion, 42973)": "man looking ahead at the tv",
212
+ "(static, 42974)": "a man in a white shirt"
213
+ },
214
+ "1949": {
215
+ "(motion, 44400)": "a man looking at his phone",
216
+ "(static, 44401)": "man in black t - shirt and cap"
217
+ },
218
+ "1620": {
219
+ "(motion, 36248)": "person playing tennis",
220
+ "(static, 36249)": "person in red tank top and black shorts"
221
+ },
222
+ "2902": {
223
+ "(motion, 66297)": "person sitting and watching a tennis game",
224
+ "(static, 66298)": "person in blue top"
225
+ },
226
+ "397": {
227
+ "(motion, 8843)": "giraffe bending its head down",
228
+ "(static, 8844)": "giraffe on the far right"
229
+ },
230
+ "732": {
231
+ "(motion, 16725)": "baseball player squatting and watching closely to judge a play",
232
+ "(static, 16726)": "baseball player in black top and gray pants"
233
+ },
234
+ "1173": {
235
+ "(motion, 26074)": "a man swinging a bat",
236
+ "(static, 26075)": "a man in blue and grey"
237
+ },
238
+ "2920": {
239
+ "(motion, 66854)": "a man reaching out his left arm to catch a ball",
240
+ "(static, 66855)": "a man in red uniform and helmet"
241
+ },
242
+ "1643": {
243
+ "(motion, 36762)": "a man smiling looking down at other people",
244
+ "(static, 36763)": "a man in a grey suite wearing a pink tie"
245
+ },
246
+ "1454": {
247
+ "(motion, 32177)": "person in putting hands in one's pockets",
248
+ "(static, 32178)": "person in gray shirt and jeans"
249
+ },
250
+ "1725": {
251
+ "(motion, 38835)": "person crossing her arms walking with another person",
252
+ "(static, 38836)": "person in a black shirt and jeans"
253
+ },
254
+ "2338": {
255
+ "(motion, 53733)": "the person crouching and placing his hands on his knees",
256
+ "(static, 53734)": "person with a black shirt and dark grey pants"
257
+ },
258
+ "4249": {
259
+ "(motion, 97957)": "a baseball player reaching out his arm to catch a ball",
260
+ "(static, 97958)": "a baseball player in green top"
261
+ },
262
+ "3917": {
263
+ "(motion, 89675)": "cow looking at camera",
264
+ "(static, 89676)": "a cow with an ear tag with the number 949 on it"
265
+ },
266
+ "1156": {
267
+ "(motion, 25761)": "man sitting on the couch using a laptop",
268
+ "(static, 25762)": "a man with a hat"
269
+ },
270
+ "1998": {
271
+ "(motion, 45619)": "a person watching his phone",
272
+ "(static, 45620)": "person wearing glasses"
273
+ },
274
+ "3571": {
275
+ "(motion, 81719)": "person looking at one's phone",
276
+ "(static, 81720)": "mature person with blonde hair and glasses"
277
+ },
278
+ "292": {
279
+ "(motion, 6707)": "a zebra lying down in dirt",
280
+ "(static, 6708)": "the zebra in the foreground"
281
+ },
282
+ "3367": {
283
+ "(motion, 76808)": "a zebra standing in the zoo",
284
+ "(static, 76809)": "a zebra in the background"
285
+ },
286
+ "2069": {
287
+ "(motion, 47212)": "person leaning forward on skis",
288
+ "(static, 47213)": "person in blue hat and jacket, black pants"
289
+ },
290
+ "4050": {
291
+ "(motion, 92834)": "person standing straight looking at another person",
292
+ "(static, 92835)": "a small person wearing purple pants"
293
+ },
294
+ "2953": {
295
+ "(motion, 67711)": "person who is looking away",
296
+ "(static, 67712)": "person in a suit"
297
+ },
298
+ "4280": {
299
+ "(motion, 98813)": "person pulling another person's tie",
300
+ "(static, 98814)": "a person in a white shirt"
301
+ },
302
+ "1743": {
303
+ "(motion, 39371)": "a person holding and looking at another person",
304
+ "(static, 39372)": "person with bald head and glasses"
305
+ },
306
+ "4598": {
307
+ "(motion, 13717)": "person playing with the remote controller",
308
+ "(static, 13718)": "small person in red shirt"
309
+ },
310
+ "3380": {
311
+ "(motion, 77052)": "a person cutting a cake",
312
+ "(static, 77053)": "a person in gray shirt that is not striped"
313
+ },
314
+ "3439": {
315
+ "(motion, 78305)": "a person holding a spatula getting readyy to have a cake",
316
+ "(static, 78306)": "a person in striped shirt"
317
+ },
318
+ "3355": {
319
+ "(motion, 76309)": "a man swining his bat",
320
+ "(static, 76310)": "a man in a baseball uniform with a brace on his left ankle"
321
+ },
322
+ "3409": {
323
+ "(motion, 77608)": "a man holding out his arm to catch a ball",
324
+ "(static, 77609)": "a man wearing a red vest with red shin guards"
325
+ },
326
+ "711": {
327
+ "(motion, 16184)": "the man holding a cat in his arms",
328
+ "(static, 16185)": "this is a man with thin rimmed glasses and a black scarf"
329
+ },
330
+ "3764": {
331
+ "(motion, 85913)": "person holding a remote and smilling",
332
+ "(static, 85914)": "person in a black t - shirt and not wearing glasses"
333
+ },
334
+ "113": {
335
+ "(motion, 2741)": "a sheep being fed by a little girl",
336
+ "(static, 2742)": "a sheep on the right"
337
+ },
338
+ "518": {
339
+ "(motion, 12021)": "a sheep eating grass with its head down",
340
+ "(static, 12022)": "a sheep on the left"
341
+ },
342
+ "3158": {
343
+ "(motion, 72128)": "a boy crouching and placing both hands on his knees",
344
+ "(static, 72129)": "boy wearing white baseball helmet , white baseball uniform with orange writing"
345
+ },
346
+ "3223": {
347
+ "(motion, 73555)": "a boy pitching the ball to a player",
348
+ "(static, 73556)": "a boy with the number 4 on his blue jersey"
349
+ },
350
+ "914": {
351
+ "(motion, 20478)": "a person standing on a surf board , riding a wave",
352
+ "(static, 20479)": "a person on the right"
353
+ },
354
+ "3568": {
355
+ "(motion, 81669)": "surfer laying down",
356
+ "(static, 81670)": "surfer on the left"
357
+ },
358
+ "592": {
359
+ "(motion, 13643)": "person sits on the floor watching tv",
360
+ "(static, 13644)": "person with a black hat and a beige shirt"
361
+ },
362
+ "2856": {
363
+ "(motion, 65208)": "person sitting on a chair watching another person play video games",
364
+ "(static, 65209)": "person in black shirt and jeans"
365
+ },
366
+ "4879": {
367
+ "(motion, 73469)": "person playing a video game",
368
+ "(static, 73470)": "blonde person dressed in brown"
369
+ },
370
+ "157": {
371
+ "(motion, 3682)": "a woman holding a plate and reaching for condiments",
372
+ "(static, 3683)": "woman wearing grey button up sweater"
373
+ },
374
+ "1774": {
375
+ "(motion, 40317)": "person being held by another person",
376
+ "(static, 40318)": "person with red hair, wearing a pink shirt"
377
+ },
378
+ "2354": {
379
+ "(motion, 53948)": "person with child , catching a frisby",
380
+ "(static, 53949)": "bigger person in white t - shirt"
381
+ },
382
+ "174": {
383
+ "(motion, 4179)": "a lamb eating grass",
384
+ "(static, 4180)": "a lamb to the left of another lamb"
385
+ },
386
+ "2369": {
387
+ "(motion, 54196)": "the sheep that is looking into the camera",
388
+ "(static, 54197)": "a white sheep with a black head on the right"
389
+ },
390
+ "4247": {
391
+ "(motion, 97897)": "a woman holding an umbrella on a bench",
392
+ "(static, 97898)": "woman on the right"
393
+ },
394
+ "1014": {
395
+ "(motion, 22621)": "man receiving an award",
396
+ "(static, 22622)": "a man in an orange and white uniform with a black cap"
397
+ },
398
+ "1080": {
399
+ "(motion, 24100)": "a man offers a trophy to anothe man",
400
+ "(static, 24101)": "a man in a suit"
401
+ },
402
+ "2272": {
403
+ "(motion, 51815)": "the baseball player catching a ball",
404
+ "(static, 51816)": "the baseball player in dark top and helmet"
405
+ },
406
+ "2495": {
407
+ "(motion, 56804)": "a baseball player swinging at a ball",
408
+ "(static, 56805)": "the baseball player in white uniform"
409
+ },
410
+ "3511": {
411
+ "(motion, 80309)": "person holding a cup",
412
+ "(static, 80310)": "person wearing pink shirt"
413
+ },
414
+ "3955": {
415
+ "(motion, 90542)": "person holding a remote control",
416
+ "(static, 90543)": "person in orange shirt"
417
+ },
418
+ "2409": {
419
+ "(motion, 55054)": "a man adjusting his head band",
420
+ "(static, 55055)": "man in orange and gray shirt"
421
+ },
422
+ "2775": {
423
+ "(motion, 63273)": "a person holding a remote control",
424
+ "(static, 63274)": "a tall person in white striped shirt and black pants"
425
+ },
426
+ "996": {
427
+ "(motion, 22281)": "a woman holding a baby",
428
+ "(static, 22282)": "woman wearing a black shirt and green apron"
429
+ },
430
+ "4789": {
431
+ "(motion, 52629)": "a person holding skies in one's hands",
432
+ "(static, 52630)": "a person with orange mirrored goggles"
433
+ },
434
+ "1028": {
435
+ "(motion, 22786)": "the cow standing up",
436
+ "(static, 22787)": "a cow in the middle"
437
+ },
438
+ "244": {
439
+ "(motion, 5666)": "a man holding wine glass",
440
+ "(static, 5668)": "a blonde man in a white shirt"
441
+ },
442
+ "3538": {
443
+ "(motion, 80923)": "the man throwing the ball from the picther ' s mound",
444
+ "(static, 80924)": "the man in front"
445
+ },
446
+ "557": {
447
+ "(motion, 12739)": "a baseball player getting ready to swing the bat",
448
+ "(static, 12740)": "a baseball player , wearing a white and blue uniform"
449
+ },
450
+ "4982": {
451
+ "(motion, 95870)": "cat sitting in front of television on a stand",
452
+ "(static, 95871)": "orange cat on the right side of the picture"
453
+ },
454
+ "4570": {
455
+ "(motion, 6638)": "a woman cutting a cake",
456
+ "(static, 6639)": "a woman wearing a long sleeve pink sweater"
457
+ },
458
+ "1698": {
459
+ "(motion, 38093)": "a baseball player swinging his bat",
460
+ "(static, 38094)": "a baseball player weaing a white uniform and blue helmet"
461
+ },
462
+ "3182": {
463
+ "(motion, 72616)": "the baseball player playing the catcher position",
464
+ "(static, 72617)": "the baseball player wearing a red and white uniform"
465
+ },
466
+ "846": {
467
+ "(motion, 19100)": "a man holding a toothbrush in his mouth",
468
+ "(static, 19101)": "a man wearing striped shirt"
469
+ },
470
+ "671": {
471
+ "(motion, 15227)": "person petting a horse",
472
+ "(static, 15228)": "person wearing a red jacket"
473
+ },
474
+ "3254": {
475
+ "(motion, 74216)": "person sitting in the chair",
476
+ "(static, 74217)": "person in the tan shirt wearing glasses"
477
+ },
478
+ "3318": {
479
+ "(motion, 75539)": "the person who is smashing cake in his own face",
480
+ "(static, 75540)": "person with a fake tie on its onesie"
481
+ },
482
+ "1424": {
483
+ "(motion, 31548)": "person watching another person eat",
484
+ "(static, 31549)": "person in the green shirt"
485
+ },
486
+ "3926": {
487
+ "(motion, 89831)": "person eating a sandwich",
488
+ "(static, 89832)": "person in orange top with sunglasses in one's head"
489
+ },
490
+ "862": {
491
+ "(motion, 19444)": "a man driving a bicycle and pulling a cart behind",
492
+ "(static, 19445)": "the man is wearing a pair of khaki shorts"
493
+ },
494
+ "2932": {
495
+ "(motion, 67140)": "man standing on bike",
496
+ "(static, 67141)": "man in blue jean shorts"
497
+ }
498
+ }
mbench/__init__.py ADDED
File without changes
mbench/__pycache__/transforms_video.cpython-39.pyc ADDED
Binary file (20 kB). View file
 
mbench/__pycache__/ytvos_ref.cpython-39.pyc ADDED
Binary file (7.4 kB). View file
 
mbench/check_image.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/check_image_numbered.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/check_image_revised.ipynb ADDED
@@ -0,0 +1,164 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 32,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import argparse\n",
11
+ "import sys\n",
12
+ "import opts\n",
13
+ "import matplotlib.pyplot as plt\n",
14
+ "import matplotlib.patches as patches\n",
15
+ "import textwrap\n",
16
+ "\n",
17
+ "from PIL import Image, ImageDraw\n",
18
+ "import json\n",
19
+ "import numpy as np\n",
20
+ "from mbench.ytvos_ref import build as build_ytvos_ref"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 26,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "img_folder = 'data/ref-youtube-vos/train'\n",
30
+ "text_colors = ['red', 'blue']"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 2,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "with open('mbench/result_revised50.json') as file:\n",
40
+ " data = json.load(file)"
41
+ ]
42
+ },
43
+ {
44
+ "cell_type": "code",
45
+ "execution_count": 24,
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "def bounding_box(img):\n",
50
+ " rows = np.any(img, axis=1)\n",
51
+ " cols = np.any(img, axis=0)\n",
52
+ " rmin, rmax = np.where(rows)[0][[0, -1]]\n",
53
+ " cmin, cmax = np.where(cols)[0][[0, -1]]\n",
54
+ " return rmin, rmax, cmin, cmax # y1, y2, x1, x2 "
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 97,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "def showImageRef(vid_id):\n",
64
+ " vid_data = data[vid_id]\n",
65
+ " cats = list(vid_data.keys())\n",
66
+ "\n",
67
+ " for cat in cats:\n",
68
+ " cat_data = vid_data[cat]\n",
69
+ " frames = list(cat_data.keys())\n",
70
+ " \n",
71
+ " for frame in frames:\n",
72
+ " frame_data = cat_data[frame]\n",
73
+ " \n",
74
+ " img_path = os.path.join(img_folder, 'JPEGImages', vid_id, frame + '.jpg')\n",
75
+ " mask_path = os.path.join(img_folder, 'Annotations', vid_id, frame + '.png')\n",
76
+ " img = Image.open(img_path).convert('RGB')\n",
77
+ " mask = Image.open(mask_path).convert('P')\n",
78
+ " mask = np.array(mask)\n",
79
+ " \n",
80
+ " if frame_data:\n",
81
+ " obj_ids = list(frame_data.keys())\n",
82
+ " obj_nums = len(obj_ids)\n",
83
+ "\n",
84
+ " fig, axes = plt.subplots(1, obj_nums, figsize=(16, obj_nums))\n",
85
+ "\n",
86
+ " for i in range(len(obj_ids)):\n",
87
+ " obj_id = obj_ids[i]\n",
88
+ " obj_data = frame_data[obj_id]\n",
89
+ " if obj_data:\n",
90
+ " ref_exp = obj_data['ref_exp']\n",
91
+ " isValid = obj_data['isValid']\n",
92
+ "\n",
93
+ " obj_mask = (mask == int(obj_id)).astype(np.float32)\n",
94
+ " if (obj_mask > 0).any():\n",
95
+ " y1, y2, x1, x2 = bounding_box(obj_mask)\n",
96
+ " box = np.array([x1, y1, x2, y2])\n",
97
+ " else:\n",
98
+ " box = np.array([0, 0, 0, 0])\n",
99
+ " \n",
100
+ " if obj_nums == 1:\n",
101
+ " ax = axes\n",
102
+ " else:\n",
103
+ " ax = axes[i]\n",
104
+ " ax.imshow(img)\n",
105
+ " width, height = box[2] - box[0], box[3] - box[1]\n",
106
+ " rect = patches.Rectangle((x1, y1), width, height, linewidth=2, edgecolor='red', facecolor='none')\n",
107
+ " ax.add_patch(rect)\n",
108
+ "\n",
109
+ " wrapped_text = \"\\n\".join(textwrap.wrap(ref_exp, width=30))\n",
110
+ " ax.annotate(wrapped_text, xy=(0.5, -1.5), xycoords=\"axes fraction\", ha = \"center\", color=text_colors[isValid])\n",
111
+ " \n",
112
+ " plt.suptitle(f\"video: {vid_id} - cat: {cat} - frame: {frame}\")\n",
113
+ " plt.show()"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 142,
119
+ "metadata": {},
120
+ "outputs": [
121
+ {
122
+ "name": "stdout",
123
+ "output_type": "stream",
124
+ "text": [
125
+ "04667fabaa\n"
126
+ ]
127
+ }
128
+ ],
129
+ "source": [
130
+ "vid_id = list(data.keys())[49]\n",
131
+ "print(vid_id)\n",
132
+ "showImageRef(vid_id)"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": null,
138
+ "metadata": {},
139
+ "outputs": [],
140
+ "source": []
141
+ }
142
+ ],
143
+ "metadata": {
144
+ "kernelspec": {
145
+ "display_name": "referformer",
146
+ "language": "python",
147
+ "name": "referformer"
148
+ },
149
+ "language_info": {
150
+ "codemirror_mode": {
151
+ "name": "ipython",
152
+ "version": 3
153
+ },
154
+ "file_extension": ".py",
155
+ "mimetype": "text/x-python",
156
+ "name": "python",
157
+ "nbconvert_exporter": "python",
158
+ "pygments_lexer": "ipython3",
159
+ "version": "3.10.16"
160
+ }
161
+ },
162
+ "nbformat": 4,
163
+ "nbformat_minor": 2
164
+ }
mbench/gpt_ref-ytvos-revised.py ADDED
@@ -0,0 +1,428 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from mbench.ytvos_ref import build as build_ytvos_ref
6
+ import argparse
7
+ import opts
8
+
9
+ import sys
10
+ from pathlib import Path
11
+ import os
12
+ from os import path as osp
13
+ import skimage
14
+ from io import BytesIO
15
+
16
+ import numpy as np
17
+ import pandas as pd
18
+ import regex as re
19
+ import json
20
+
21
+ import cv2
22
+ from PIL import Image, ImageDraw
23
+ import torch
24
+ from torchvision.transforms import functional as F
25
+
26
+ from skimage import measure # (pip install scikit-image)
27
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
28
+
29
+ import matplotlib.pyplot as plt
30
+ import matplotlib.patches as patches
31
+ from matplotlib.collections import PatchCollection
32
+ from matplotlib.patches import Rectangle
33
+
34
+
35
+ import ipywidgets as widgets
36
+ from IPython.display import display, clear_output
37
+
38
+ from openai import OpenAI
39
+ import base64
40
+
41
+ # Function to encode the image
42
+ def encode_image(image_path):
43
+ with open(image_path, "rb") as image_file:
44
+ return base64.b64encode(image_file.read()).decode("utf-8")
45
+
46
+ # Captioner
47
+ ytvos_category_valid_list = [
48
+ 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
49
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
50
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
51
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
52
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
53
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
54
+ ]
55
+ def getCaption(video_id, json_data):
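+ # For each category in the video: check that it is movable, ask GPT whether the frame
+ # shows multiple objects with distinct actions, and if so request a dense action-centric
+ # caption per frame. Returns (all_captions, valid_obj_ids).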
56
+ # Load the video's data
57
+ video_data = json_data[video_id]
58
+ frame_names = video_data['frame_names']
59
+ video_path = video_data['video_path']
60
+
61
+ cat_names = set()
62
+ all_captions = dict()
63
+ for obj_id in list(video_data['annotations'][0].keys()):
64
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
65
+
66
+ # cat_names : person, snowboard
67
+ # 1. Ask GPT directly whether this category can be the subject of an action
68
+ # 2. From the category list provided by ref-youtube-vos, keep only the category names we want to process
69
+
70
+ for cat_name in list(cat_names) :
71
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
72
+ image_captions = {}
73
+
74
+ captioner = OpenAI()
75
+
76
+ # Step 0: can this category be the subject of an action?
77
+ is_movable = False
78
+ if cat_name in ytvos_category_valid_list :
79
+ is_movable = True
80
+
81
+ # response_check = captioner.chat.completions.create(
82
+ # model="gpt-4o",
83
+ # messages=[
84
+ # {
85
+ # "role": "user",
86
+ # "content": f"""
87
+ # Can a {cat_name} be a subject of distinct actions or movements?
88
+ # For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
89
+ # However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
90
+ # Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
91
+ # Answer only YES or NONE.
92
+ # """
93
+ # }
94
+ # ],
95
+ # )
96
+ # response_check_content = response_check.choices[0].message.content.strip().lower()
97
+ # print(f"Movable Check for {cat_name}: {response_check_content}")
98
+
99
+ # if response_check_content == "yes": is_movable = True
100
+
101
+ if not is_movable:
102
+ print(f"Skipping {cat_name}: Determined to be non-movable.")
103
+ continue
104
+
105
+ for i in range(len(image_paths)):
106
+ image_path = image_paths[i]
107
+ frame_name = frame_names[i]
108
+ base64_image = encode_image(image_path)
109
+
110
+ # Step 1: filtering
111
+ #print(f"-----------category name: {cat_name}, frame name: {frame_name}")
112
+ response1 = captioner.chat.completions.create(
113
+ model="chatgpt-4o-latest",
114
+ messages=[
115
+ {
116
+ "role": "user",
117
+ "content": [
118
+ {
119
+ "type": "text",
120
+
121
+ "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
122
+ Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
123
+ Each action should be unique and clearly associated with a specific object.
124
+
125
+ Respond with YES if:
126
+ - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
127
+ - The {cat_name}s involve clear, distinguishable actions performed independently.
128
+
129
+ Respond with NONE if:
130
+ - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
131
+ - Actions are ambiguous, minor, or not clearly visible.
132
+
133
+ If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
134
+ If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
135
+
136
+ Answer only YES or NONE."""
137
+
138
+ },
139
+ {
140
+ "type": "image_url",
141
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
142
+ },
143
+ ],
144
+ }
145
+ ],
146
+ )
147
+ response_content = response1.choices[0].message.content
148
+ should_caption = True if "yes" in response_content.lower() else False
149
+ #print(f"are {cat_name}s distinguished by action: {response_content}")
150
+
151
+ # Step 2: generate a dense caption
152
+ if should_caption:
153
+ response2 = captioner.chat.completions.create(
154
+ model="chatgpt-4o-latest",
155
+ messages=[
156
+ {
157
+ "role": "user",
158
+ "content": [
159
+ {
160
+ "type": "text",
161
+
162
+ "text": f"""
163
+ Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
164
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
165
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
166
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
167
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
168
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
169
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
170
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
171
+ 8. Include interactions with objects or other entities when they are prominent and observable.
172
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
173
+ Output only the caption.""",
174
+ },
175
+ {
176
+ "type": "image_url",
177
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
178
+ },
179
+ ],
180
+ }
181
+ ],
182
+ )
183
+
184
+ caption = response2.choices[0].message.content
185
+ #print(f"{image_path} - {frame_name}: {caption}")
186
+ else:
187
+ caption = None
188
+
189
+ image_captions[frame_name] = caption
190
+ all_captions[cat_name] = image_captions
191
+
192
+ # final : also prepare valid object ids
193
+ valid_obj_ids = []
194
+ valid_cat_names = list(all_captions.keys())
195
+ for obj_id in list(video_data['annotations'][0].keys()):
196
+ cat = video_data['annotations'][0][obj_id]['category_name']
197
+ if cat in valid_cat_names : valid_obj_ids.append(obj_id)
198
+
199
+ return all_captions, valid_obj_ids
200
+
201
+ # Referring expression generator and QA filter
202
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
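+ # Draws the target object's bounding box, asks GPT whether the cropped object is
+ # identifiable, generates a referring expression from the dense caption, and QA-filters
+ # it (it must describe the boxed object and no other).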
203
+
204
+ # Draw the object's bounding box on the image
205
+ video_data = json_data[video_id]
206
+ frame_names = video_data['frame_names']
207
+ video_path = video_data['video_path']
208
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
209
+ frame_indx = frame_names.index(frame_name)
210
+ obj_data = video_data['annotations'][frame_indx][obj_id]
211
+
212
+ bbox = obj_data['bbox']
213
+ cat_name = obj_data['category_name']
214
+ valid = obj_data['valid']
215
+
216
+ if valid == 0:
217
+ print("Object not in this frame!")
218
+ return {}
219
+
220
+
221
+ x_min, y_min, x_max, y_max = bbox
222
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
223
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
224
+ plt.figure()
225
+ plt.imshow(I)
226
+ plt.axis('off')
227
+ plt.show()
228
+
229
+ #cropped object for visibility check
230
+ cropped_I = I[y_min:y_max, x_min:x_max]
231
+ pil_cropped_I = Image.fromarray(cropped_I)
232
+ buff_crop = BytesIO()
233
+ pil_cropped_I.save(buff_crop, format='JPEG')
234
+ base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
235
+
236
+ #entire image for referring expression generation
237
+ pil_I = Image.fromarray(I)
238
+ buff = BytesIO()
239
+ pil_I.save(buff, format='JPEG')
240
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
241
+
242
+ # Check whether the object is identifiable
243
+ generator = OpenAI()
244
+ response_check = generator.chat.completions.create(
245
+ model="chatgpt-4o-latest",
246
+ messages=[
247
+ {
248
+ "role": "user",
249
+ "content": [
250
+ {
251
+
252
+ "type": "text",
253
+ "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
254
+ Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
255
+
256
+ Guidelines:
257
+ - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
258
+ - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
259
+ - If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
260
+
261
+ Output only either YES or NONE.
262
+ """
263
+ },
264
+ {
265
+ "type": "image_url",
266
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
267
+ }
268
+ ]
269
+ },
270
+ ]
271
+ )
272
+
273
+ response_check_content = response_check.choices[0].message.content.strip().lower()
274
+ #print(f"is object {obj_id} visible: {response_check_content}")
275
+
276
+ if "yes" not in response_check_content:
277
+ print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
278
+ return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : False}
279
+
280
+ # Generate the referring expression
281
+ # generator = OpenAI()
282
+ response = generator.chat.completions.create(
283
+ model="chatgpt-4o-latest",
284
+ messages=[
285
+ {
286
+ "role": "user",
287
+ "content": [
288
+ {
289
+ "type": "text",
290
+
291
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
292
+ Guidelines for creating the referring expression:
293
+ 1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
294
+ 2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
295
+ 3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
296
+ 4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
297
+ 5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
298
+ 6. Use '{cat_name}' as the noun for the referring expressions.
299
+ Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
300
+
301
+ {caption}
302
+ """
303
+ },
304
+ {
305
+ "type": "image_url",
306
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
307
+ },
308
+ # {
309
+ # "type": "image_url",
310
+ # "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
311
+ # }
312
+ ],
313
+ }
314
+ ],
315
+ )
316
+
317
+ ref_exp = response.choices[0].message.content.strip()
318
+
319
+ #QA filtering
320
+ # QA1: does the expression describe the target object?
321
+ filter = OpenAI()
322
+ response1 = filter.chat.completions.create(
323
+ model="chatgpt-4o-latest",
324
+ messages=[
325
+ {
326
+ "role": "user",
327
+ "content": [
328
+ {
329
+ "type": "text",
330
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
331
+ {ref_exp}""",
332
+ },
333
+ {
334
+ "type": "image_url",
335
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
336
+ },
337
+ ],
338
+ }
339
+ ],
340
+ )
341
+
342
+ response1_content = response1.choices[0].message.content
343
+ describesHighlighted = True if "yes" in response1_content.lower() else False
344
+
345
+ # QA2: does the expression avoid describing non-target objects?
346
+ response2 = filter.chat.completions.create(
347
+ model="chatgpt-4o-latest",
348
+ messages=[
349
+ {
350
+ "role": "user",
351
+ "content": [
352
+ {
353
+ "type": "text",
354
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
355
+ {ref_exp}""",
356
+ },
357
+ {
358
+ "type": "image_url",
359
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
360
+ },
361
+ ],
362
+ }
363
+ ],
364
+ )
365
+
366
+ response2_content = response2.choices[0].message.content
367
+ notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
368
+
369
+ isValid = True if describesHighlighted and notDescribesNotHighlighted else False
370
+
371
+ #print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
372
+ #print(f"ref exp: {ref_exp}")
373
+ #print("")
374
+
375
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
376
+
377
+
378
+ if __name__ == '__main__':
379
+ with open('mbench/sampled_frame3.json', 'r') as file:
380
+ data = json.load(file)
381
+
382
+ vid_ids = list(data.keys())
383
+ all_ref_exps = {}
384
+
385
+ os.environ['OPENAI_API_KEY'] = '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'
386
+
387
+ # For each vid_id in the dataset
388
+ for i in range(50):
389
+ vid_id = vid_ids[i]
390
+
391
+ # ==== generate captions ====
392
+ # print("=====================captioner========================")
393
+ captions, valid_obj_ids = getCaption(vid_id, data)
394
+ cats_in_vid = list(captions.keys())
395
+ # print()
396
+
397
+ # ==== generate referring expressions and run QA filtering ====
398
+ # print("=====================referring expression generator & QA filter========================")
399
+ ref_expressions = {}
400
+
401
+ # For each category
402
+ for cat_name in cats_in_vid:
403
+ if cat_name not in ref_expressions:
404
+ ref_expressions[cat_name] = {}
405
+ # For each video frame
406
+ for frame_name in data[vid_id]['frame_names']:
407
+ # print(f'--------category: {cat_name}, frame_name: {frame_name}')
408
+
409
+ if frame_name not in ref_expressions[cat_name]:
410
+ ref_expressions[cat_name][frame_name] = {} # Create frame-level dictionary
411
+ caption = captions[cat_name][frame_name]
412
+ if not caption : continue
413
+ else :
414
+ # For each object id
415
+ for obj_id in valid_obj_ids:
416
+ ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
417
+ ref_expressions[cat_name][frame_name][obj_id] = ref_exp # Store ref_exp
418
+
419
+ all_ref_exps[vid_id] = ref_expressions
420
+
421
+
422
+ with open('mbench/result_revised50.json', 'w') as file:
423
+ json.dump(all_ref_exps, file, indent=4)
424
+
425
+
426
+
427
+
428
+
mbench/gpt_ref-ytvos.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/gpt_ref-ytvos.py ADDED
@@ -0,0 +1,302 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+ from pathlib import Path
10
+ import os
11
+ import skimage
12
+ from io import BytesIO
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ from openai import OpenAI
37
+ import base64
38
+
39
+ os.environ['OPENAI_API_KEY'] = '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'
40
+
41
+
42
+ ytvos_category_valid_list = [
43
+ 'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
44
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
45
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
46
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
47
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
48
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
49
+ ]
50
+
51
+ # Function to encode the image
52
+ def encode_image(image_path):
53
+ with open(image_path, "rb") as image_file:
54
+ return base64.b64encode(image_file.read()).decode("utf-8")
55
+
56
+ def getCaption(video_id, json_data):
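+ # Single-category variant: per frame, ask GPT whether multiple objects are distinguishable
+ # by action; if so, request a dense caption. Returns {frame_name: caption or None}.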
57
+ # Load the video's data
58
+ video_data = json_data[video_id]
59
+ frame_names = video_data['frame_names']
60
+ video_path = video_data['video_path']
61
+
62
+ cat_names = set()
63
+ for obj_id in list(video_data['annotations'][0].keys()):
64
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
65
+
66
+ if len(cat_names) == 1:
67
+ cat_name = next(iter(cat_names))
68
+ else:
69
+ print("more than 2 categories")
70
+ return -1
71
+
72
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
73
+ image_captions = {}
74
+
75
+ captioner = OpenAI()
76
+ for i in range(len(image_paths)):
77
+ image_path = image_paths[i]
78
+ frame_name = frame_names[i]
79
+ base64_image = encode_image(image_path)
80
+
81
+ # Step 1: filtering
82
+ response1 = captioner.chat.completions.create(
83
+ model="gpt-4o-mini",
84
+ messages=[
85
+ {
86
+ "role": "user",
87
+ "content": [
88
+ {
89
+ "type": "text",
90
+ "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
91
+ },
92
+ {
93
+ "type": "image_url",
94
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
95
+ },
96
+ ],
97
+ }
98
+ ],
99
+ )
100
+ response_content = response1.choices[0].message.content
101
+ should_caption = True if "yes" in response_content.lower() else False
102
+
103
+ # Step 2: generate a dense caption
104
+ if should_caption:
105
+ response2 = captioner.chat.completions.create(
106
+ model="gpt-4o-mini",
107
+ messages=[
108
+ {
109
+ "role": "user",
110
+ "content": [
111
+ {
112
+ "type": "text",
113
+ "text": f"""
114
+ Describe the image in detail focusing on the {cat_name}s' actions.
115
+ 1. Each action should be prominent, clear and unique, describing the corresponding object only.
116
+ 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
117
+ 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
118
+ 4. Do not include actions that needs to be guessed or suggested.""",
119
+ },
120
+ {
121
+ "type": "image_url",
122
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
123
+ },
124
+ ],
125
+ }
126
+ ],
127
+ )
128
+
129
+ caption = response2.choices[0].message.content
130
+ else:
131
+ caption = None
132
+
133
+ image_captions[frame_name] = caption
134
+ return image_captions
135
+
136
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
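+ # Draws the object's bounding box on the frame, generates a referring expression from
+ # the dense caption, and QA-filters it with two yes/no checks.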
137
+ # Draw the object's bounding box on the image
138
+ video_data = json_data[video_id]
139
+ frame_names = video_data['frame_names']
140
+ video_path = video_data['video_path']
141
+ I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
142
+ frame_indx = frame_names.index(frame_name)
143
+ obj_data = video_data['annotations'][frame_indx][obj_id]
144
+
145
+ bbox = obj_data['bbox']
146
+ cat_name = obj_data['category_name']
147
+ valid = obj_data['valid']
148
+
149
+ if valid == 0:
150
+ print("Object not in this frame!")
151
+ return {}
152
+
153
+
154
+ x_min, y_min, x_max, y_max = bbox
155
+ x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
156
+ cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
157
+ # plt.figure()
158
+ # plt.imshow(I)
159
+ # plt.axis('off')
160
+ # plt.show()
161
+ pil_I = Image.fromarray(I)
162
+ buff = BytesIO()
163
+ pil_I.save(buff, format='JPEG')
164
+ base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
165
+
166
+ # Generate the referring expression
167
+ generator = OpenAI()
168
+ response = generator.chat.completions.create(
169
+ model="gpt-4o-mini",
170
+ messages=[
171
+ {
172
+ "role": "user",
173
+ "content": [
174
+ {
175
+ "type": "text",
176
+ "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
177
+ 1. The referring expression describes the action and does not contain information about appearance or location in the picture.
178
+ 2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
179
+ 3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
180
+ 4. The referring expression should only describe the highlighted {cat_name} and not any other.
181
+ 5. Use '{cat_name}' as the noun for the referring expressions.
182
+ Output only the referring expression.
183
+ {caption}""",
184
+ },
185
+ {
186
+ "type": "image_url",
187
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
188
+ },
189
+ ],
190
+ }
191
+ ],
192
+ )
193
+
194
+ ref_exp = response.choices[0].message.content
195
+
196
+ #QA filtering
197
+ # QA1: does the expression describe the target object?
198
+ filter = OpenAI()
199
+ response1 = filter.chat.completions.create(
200
+ model="gpt-4o-mini",
201
+ messages=[
202
+ {
203
+ "role": "user",
204
+ "content": [
205
+ {
206
+ "type": "text",
207
+ "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
208
+ {ref_exp}""",
209
+ },
210
+ {
211
+ "type": "image_url",
212
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
213
+ },
214
+ ],
215
+ }
216
+ ],
217
+ )
218
+
219
+ response1_content = response1.choices[0].message.content
220
+ describesHighlighted = True if "yes" in response1_content.lower() else False
221
+
222
+ # QA2: does the expression avoid describing non-target objects?
223
+ response2 = filter.chat.completions.create(
224
+ model="gpt-4o-mini",
225
+ messages=[
226
+ {
227
+ "role": "user",
228
+ "content": [
229
+ {
230
+ "type": "text",
231
+ "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
232
+ {ref_exp}""",
233
+ },
234
+ {
235
+ "type": "image_url",
236
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
237
+ },
238
+ ],
239
+ }
240
+ ],
241
+ )
242
+
243
+ response2_content = response2.choices[0].message.content
244
+ describesNotHighlighted = True if "yes" in response2_content.lower() else False
245
+
246
+ isValid = True if describesHighlighted and not describesNotHighlighted else False
247
+
248
+ print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
249
+
250
+ return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid" : isValid}
251
+
252
+ def createRefExp(video_id, json_data):
253
+ video_data = json_data[video_id]
254
+ obj_ids = list(video_data['annotations'][0].keys())
255
+ frame_names = video_data['frame_names']
256
+
257
+ captions_per_frame = getCaption(video_id, json_data)
258
+
259
+ if captions_per_frame == -1:
260
+ print("There are more than 2 cateories")
261
+ return None
262
+
263
+
264
+ video_ref_exps = {}
265
+
266
+ for frame_name in frame_names:
267
+ frame_caption = captions_per_frame[frame_name]
268
+
269
+ if frame_caption == None:
270
+ video_ref_exps[frame_name] = None
271
+
272
+ else:
273
+ frame_ref_exps = {}
274
+ for obj_id in obj_ids:
275
+ exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
276
+ frame_ref_exps[obj_id] = exp_per_obj
277
+ video_ref_exps[frame_name] = frame_ref_exps
278
+
279
+ return video_ref_exps
280
+
281
+ if __name__ == '__main__':
282
+ with open('mbench/sampled_frame3.json', 'r') as file:
283
+ data = json.load(file)
284
+
285
+ videos = set()
286
+ with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
287
+ manual_select = list(file)
288
+ for frame in manual_select:
289
+ result = json.loads(frame)
290
+ videos.add(result['video'])
291
+ videos = list(videos)
292
+
293
+
294
+ all_video_refs = {}
295
+ for i in range(10):
296
+ video_id = videos[i]
297
+ video_ref = createRefExp(video_id, data)
298
+ all_video_refs[video_id] = video_ref
299
+
300
+ json_obj = json.dumps(all_video_refs, indent=4)
301
+ with open('mbench/result.json', 'w') as file:
302
+ file.write(json_obj)
mbench/gpt_ref-ytvos_numbered_cy.py ADDED
@@ -0,0 +1,460 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+
9
+ from mbench.ytvos_ref import build as build_ytvos_ref
10
+ import argparse
11
+ import opts
12
+
13
+ import sys
14
+ from pathlib import Path
15
+ import os
16
+ from os import path as osp
17
+ import skimage
18
+ from io import BytesIO
19
+
20
+ import numpy as np
21
+ import pandas as pd
22
+ import regex as re
23
+ import json
24
+
25
+ import cv2
26
+ from PIL import Image, ImageDraw
27
+ import torch
28
+ from torchvision.transforms import functional as F
29
+
30
+ from skimage import measure # (pip install scikit-image)
31
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
32
+
33
+ import matplotlib.pyplot as plt
34
+ import matplotlib.patches as patches
35
+ from matplotlib.collections import PatchCollection
36
+ from matplotlib.patches import Rectangle
37
+ import textwrap
38
+
39
+
40
+ import ipywidgets as widgets
41
+ from IPython.display import display, clear_output
42
+
43
+ from openai import OpenAI
44
+ import base64
45
+ import json
46
+
47
+ def number_objects_and_encode(idx, color_mask=False):
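+ # For every category in the sampled video, overlay each valid object's numeric ID on the
+ # frame (contour outline, or a translucent colored mask when color_mask=True) and return
+ # base64-encoded frames, per-frame object counts, and contour-only frames.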
48
+ encoded_frames = {}
49
+ contoured_frames = {} # New dictionary for original images
50
+ vid_cat_cnts = {}
51
+
52
+ vid_meta = metas[idx]
53
+ vid_data = train_dataset[idx]
54
+ vid_id = vid_meta['video']
55
+ frame_indx = vid_meta['sample_indx']
56
+ cat_names = set(vid_meta['obj_id_cat'].values())
57
+ imgs = vid_data[0]
58
+
59
+ for cat in cat_names:
60
+ cat_frames = []
61
+ contour_frames = []
62
+ frame_cat_cnts = {}
63
+
64
+ for i in range(imgs.size(0)):
65
+ frame_name = frame_indx[i]
66
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
67
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
68
+
69
+ frame_data = vid_data[2][frame_name]
70
+ obj_ids = list(frame_data.keys())
71
+
72
+ cat_cnt = 0
73
+
74
+ for j in range(len(obj_ids)):
75
+ obj_id = obj_ids[j]
76
+ obj_data = frame_data[obj_id]
77
+ obj_bbox = obj_data['bbox']
78
+ obj_valid = obj_data['valid']
79
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
80
+ obj_cat = obj_data['category_name']
81
+
82
+ if obj_cat == cat and obj_valid:
83
+ cat_cnt += 1
84
+
85
+ if color_mask == False:
86
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
87
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
88
+ for contour in contours: # avoid shadowing the outer frame index i
89
+ # Compute the contour centroid
90
+ moments = cv2.moments(contour)
91
+ if moments["m00"] != 0: # check whether a centroid can be computed
92
+ cx = int(moments["m10"] / moments["m00"])
93
+ cy = int(moments["m01"] / moments["m00"])
94
+ else:
95
+ cx, cy = contour[0][0] # fall back to a contour point when no centroid is available
96
+
97
+ # Text background (draw a black backdrop)
98
+ font = cv2.FONT_HERSHEY_SIMPLEX
99
+ text = obj_id
100
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
101
+ text_w, text_h = text_size
102
+
103
+ # Draw the text background (black)
104
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
105
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
106
+
107
+ # Draw the text (white)
108
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
109
+ font, 1, (255, 255, 255), 2)
110
+
111
+ else:
112
+ alpha = 0.08
113
+
114
+ colored_obj_mask = np.zeros_like(frame)
115
+ colored_obj_mask[obj_mask == 1] = colors[j]
116
+ frame[obj_mask == 1] = (
117
+ (1 - alpha) * frame[obj_mask == 1]
118
+ + alpha * colored_obj_mask[obj_mask == 1]
119
+ )
120
+
121
+
122
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
123
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
124
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
125
+
126
+
127
+
128
+ if len(contours) > 0:
129
+ largest_contour = max(contours, key=cv2.contourArea)
130
+ M = cv2.moments(largest_contour)
131
+ if M["m00"] != 0:
132
+ center_x = int(M["m10"] / M["m00"])
133
+ center_y = int(M["m01"] / M["m00"])
134
+ else:
135
+ center_x, center_y = 0, 0
136
+
137
+ font = cv2.FONT_HERSHEY_SIMPLEX
138
+ text = obj_id
139
+
140
+ font_scale = 0.9
141
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
142
+ text_x = center_x - text_size[0] // 1 # horizontal anchor of the text
143
+ text_y = center_y
144
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
145
+
146
+ # Compute the coordinates of the text background rectangle
147
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
148
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
149
+ rect_end = (text_x + text_size[0] + 5, text_y)
150
+
151
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
152
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
153
+
154
+ # plt.figure(figsize=(12, 8))
155
+ # plt.imshow(frame)
156
+ # plt.title(f"frame {frame_name}")
157
+ # plt.tight_layout()
158
+ # plt.axis('off')
159
+ # plt.show()
160
+
161
+ buffer = BytesIO()
162
+ frame = Image.fromarray(frame)
163
+ frame.save(buffer, format='jpeg')
164
+ buffer.seek(0)
165
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
166
+ frame_cat_cnts[frame_name] = cat_cnt
167
+
168
+ buffer.seek(0) # Reuse buffer instead of creating a new one
169
+ buffer.truncate()
170
+ frame_for_contour = Image.fromarray(frame_for_contour)
171
+ frame_for_contour.save(buffer, format='jpeg')
172
+ buffer.seek(0)
173
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
174
+
175
+ encoded_frames[cat] = cat_frames
176
+ contoured_frames[cat] = contour_frames
177
+ vid_cat_cnts[cat] = frame_cat_cnts
178
+
179
+ return encoded_frames, vid_cat_cnts, contoured_frames
180
+
181
+
182
+ def getCaption(idx, model='gpt-4o', color_mask=True):
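+ # Numbered-object variant: for each movable category with at least two labeled objects in
+ # a frame, ask GPT whether their actions are distinguishable, then request one
+ # action-centric referring expression per numbered object.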
183
+ vid_meta = metas[idx]
184
+ vid_data = train_dataset[idx]
185
+ vid_id = vid_meta['video']
186
+ print(f"vid id: {vid_id}\n")
187
+
188
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
189
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
190
+ all_captions = dict()
191
+
192
+ base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
193
+ #marked = "mask with boundary" if color_mask else "boundary"
194
+
195
+ for cat_name in list(cat_names) :
196
+
197
+ is_movable = False
198
+ if cat_name in ytvos_category_valid_list :
199
+ is_movable = True
200
+
201
+ if not is_movable:
202
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
203
+
204
+
205
+ image_captions = {}
206
+ captioner = OpenAI()
207
+ cat_base64_frames = base64_frames[cat_name]
208
+ cont_base64_frames = contoured_frames[cat_name]
209
+
210
+ for i in range(len(cat_base64_frames)):
211
+ frame_name = frame_indx[i]
212
+ cont_base64_image = cont_base64_frames[i]
213
+ base64_image = cat_base64_frames[i]
214
+ should_filter = False
215
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
216
+
217
+ if frame_cat_cnts >= 2:
218
+ should_filter = True
219
+ else:
220
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
221
+
222
+ if is_movable and should_filter:
223
+ # Step 1: filtering
224
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
225
+ caption_filter_text = f"""
226
+ You are a visual assistant analyzing a single frame from a video.
227
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
228
+
229
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
230
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
231
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
232
+
233
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
234
+
235
+ - Respond with "YES" if:
236
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
237
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
238
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
239
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
240
+
241
+ - Respond with "NONE" if:
242
+ 1) The actions or pose are not clearly differentiable or too similar.
243
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
244
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
245
+
246
+ Answer strictly with either "YES" or "NONE".
247
+ """
248
+
249
+ response1 = captioner.chat.completions.create(
250
+ model=model,
251
+ messages=[
252
+ {
253
+ "role": "user",
254
+ "content": [
255
+ {
256
+ "type": "text",
257
+ "text": caption_filter_text,
258
+ },
259
+ {
260
+ "type": "image_url",
261
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
262
+ }
263
+ ],
264
+ }
265
+ ],
266
+ )
267
+ response_content = response1.choices[0].message.content
268
+ should_caption = True if "yes" in response_content.lower() else False
269
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
270
+
271
+ else:
272
+ should_caption = False
273
+
274
+ # Step 2: generate a dense caption
275
+ dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
276
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
277
+ I want to use your expressions to create a action-centric referring expression dataset.
278
+ Therefore, your expressions for these {cat_name}s should describe unique action of each object.
279
+
280
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
281
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
282
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
283
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
284
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
285
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
286
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
287
+ 8. Include interactions with objects or other entities when they are prominent and observable.
288
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
289
+ 10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
290
+ 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
291
+ 12. Do not mention object IDs.
292
+ 13. Use '{cat_name}' as the noun for the referring expressions.
293
+
294
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
295
+ Output referring expressions for each object id.
296
+ """
297
+
298
+ dense_caption_prompt = f"""
299
+ You are a visual assistant analyzing a single frame of a video.
300
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
301
+
302
+ I want to use your expressions to create an **action-centric referring expression** dataset.
303
+ Please describe each {cat_name} using **clearly observable** and **specific** actions.
304
+
305
+ ---
306
+ ## Guidelines:
307
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
308
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
309
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
310
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
311
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
312
+ 6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions.
313
+ 7. Base your description on these action definitions:
314
+ - Avoid using the terms 'minimal' or 'slightly'.
315
+ - Describe general body movement, body position, or a prominent pattern (e.g. "lifting its head up", "facing towards the camera", "showing its back").
316
+ - Include details such as motion, facial expression, and object manipulation when they are visible.
317
+ - Describe movements with objects or other entities when they are prominent and observable; the expression should be specific.
318
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
319
+ ---
320
+
321
+ ## Output Format:
322
+ - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and follow this format:
323
+ object id. using {cat_name} as subject noun, action-oriented description
324
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
325
+ - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
326
+
327
+ ### Example
328
+ If the frame has 2 labeled bears, your output should look like:
329
+ 1. the bear reaching his right arm while leaning forward to capture the prey
330
+ 2. a bear standing upright facing right, touching the bike aside
331
+
332
+ ---
333
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
334
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
335
+ **Do not include markdown** in the output.
336
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
337
+ For each labeled {cat_name}, output referring expressions for each object id.
338
+ """
339
+ MAX_RETRIES = 2
340
+ retry_count = 0
341
+
342
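+ # Only frames that passed the action-distinguishability filter are captioned; the request
+ # is retried (up to MAX_RETRIES) until the reply looks like a numbered list rather than a refusal.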
+ if should_caption:
343
+ while retry_count < MAX_RETRIES:
344
+
345
+ response2 = captioner.chat.completions.create(
346
+ model=model,
347
+ messages=[
348
+ {
349
+ "role": "user",
350
+ "content": [
351
+ {
352
+ "type": "text",
353
+ "text": dense_caption_prompt,
354
+ },
355
+ {
356
+ "type": "image_url",
357
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
358
+ },
359
+ ],
360
+ }
361
+ ],
362
+ )
363
+
364
+ # caption = response2.choices[0].message.content
365
+ #print(f"{image_path} - {frame_name}: {caption}")
366
+
367
+ caption = response2.choices[0].message.content.strip()
368
+ caption_lower = caption.lower().lstrip()
369
+
370
+ if caption_lower.startswith("1.") and not any(
371
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
372
+ ):
373
+ break
374
+
375
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
376
+ retry_count += 1
377
+ time.sleep(2)
378
+
379
+ if retry_count == MAX_RETRIES:
380
+ caption = None
381
+ print("Max retries reached. Caption generation failed.")
382
+
383
+ else:
384
+ caption = None
385
+
386
+ image_captions[frame_name] = caption
387
+ all_captions[cat_name] = image_captions
388
+
389
+ # final : also prepare valid object ids
390
+ valid_obj_ids = dict()
391
+
392
+ for cat in cat_names:
393
+ if cat in ytvos_category_valid_list:
394
+ obj_id_cat = vid_meta['obj_id_cat']
395
+ valid_cat_ids = []
396
+ for obj_id in list(obj_id_cat.keys()):
397
+ if obj_id_cat[obj_id] == cat:
398
+ valid_cat_ids.append(obj_id)
399
+ valid_obj_ids[cat] = valid_cat_ids
400
+
401
+ return vid_id, all_captions, valid_obj_ids
402
+
403
+
404
+
405
+ if __name__ == '__main__':
406
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
407
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
408
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
409
+
410
+ args = parser.parse_args()
411
+
412
+ #================== Load data ===================
413
+ # Full dataset
414
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
415
+
416
+ # Metadata for the full dataset
417
+ metas = train_dataset.metas
418
+
419
+ # Eight candidate colors (RGB)
420
+ colors = [
421
+ (255, 0, 0), # Red
422
+ (0, 255, 0), # Green
423
+ (0, 0, 255), # Blue
424
+ (255, 255, 0), # Yellow
425
+ (255, 0, 255), # Magenta
426
+ (0, 255, 255), # Cyan
427
+ (128, 0, 128), # Purple
428
+ (255, 165, 0) # Orange
429
+ ]
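+ # Objects are drawn with colors[j], where j is the object's index within a frame, so this
+ # palette supports at most eight distinctly colored objects per frame.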
430
+
431
+ ytvos_category_valid_list = [
432
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
433
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
434
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
435
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
436
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
437
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
438
+ ]
439
+
440
+ #================== Run GPT ===================
441
+ os.environ['OPENAI_API_KEY'] = '<YOUR_OPENAI_API_KEY>'  # placeholder; supply a real key via the environment, never hard-code it
442
+
443
+ result_captions = {}
444
+ result_valid_obj_ids = {}
445
+
446
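+ # NOTE: range(370) assumes this split contains at least 370 videos; adjust the bound if the dataset size differs.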
+ for i in range(370):
447
+ vid_id, all_captions, valid_obj_ids = getCaption(i, color_mask=False)
448
+
449
+ if vid_id not in result_captions:
450
+ result_captions[vid_id] = all_captions
451
+ if vid_id not in result_valid_obj_ids:
452
+ result_valid_obj_ids[vid_id] = valid_obj_ids
453
+
454
+ print("Finished!", flush=True)
455
+
456
+ with open(args.save_caption_path, "w") as file:
457
+ json.dump(result_captions, file, indent=4)
458
+
459
+ with open(args.save_valid_obj_ids_path, "w") as file:
460
+ json.dump(result_valid_obj_ids, file, indent=4)
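+ # Example invocation (illustrative; script name and output paths are assumptions):
+ #   python mbench/gpt_ref-ytvos_numbered_cy.py \
+ #       --save_caption_path mbench/numbered_captions.json \
+ #       --save_valid_obj_ids_path mbench/numbered_valid_obj_ids.json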
mbench/gpt_ref-ytvos_numbered_cy_sanity.py ADDED
@@ -0,0 +1,643 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+ import random
9
+
10
+ from mbench.ytvos_ref import build as build_ytvos_ref
11
+ import argparse
12
+ import opts
13
+
14
+ import sys
15
+ from pathlib import Path
16
+ import os
17
+ from os import path as osp
18
+ import skimage
19
+ from io import BytesIO
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import regex as re
24
+ import json
25
+
26
+ import cv2
27
+ from PIL import Image, ImageDraw
28
+ import torch
29
+ from torchvision.transforms import functional as F
30
+
31
+ from skimage import measure # (pip install scikit-image)
32
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
33
+
34
+ import matplotlib.pyplot as plt
35
+ import matplotlib.patches as patches
36
+ from matplotlib.collections import PatchCollection
37
+ from matplotlib.patches import Rectangle
38
+ import textwrap
39
+
40
+
41
+ import ipywidgets as widgets
42
+ from IPython.display import display, clear_output
43
+
44
+ from openai import OpenAI
45
+ import base64
46
+ import json
47
+
48
+ def number_objects_and_encode(idx, color_mask=False):
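+ """Draw per-object contours and numeric ID labels on every sampled frame of video `idx`.
+ Returns three dicts keyed by category name: base64-encoded annotated JPEG frames,
+ base64-encoded contour-only frames, and per-frame counts of valid objects."""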
49
+ encoded_frames = {}
50
+ contoured_frames = {} # New dictionary for original images
51
+ vid_cat_cnts = {}
52
+
53
+ vid_meta = metas[idx]
54
+ vid_data = train_dataset[idx]
55
+ vid_id = vid_meta['video']
56
+ frame_indx = vid_meta['sample_indx']
57
+ cat_names = set(vid_meta['obj_id_cat'].values())
58
+ imgs = vid_data[0]
59
+
60
+ for cat in cat_names:
61
+ cat_frames = []
62
+ contour_frames = []
63
+ frame_cat_cnts = {}
64
+
65
+ for i in range(imgs.size(0)):
66
+ frame_name = frame_indx[i]
67
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
68
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
69
+
70
+ frame_data = vid_data[2][frame_name]
71
+ obj_ids = list(frame_data.keys())
72
+
73
+ cat_cnt = 0
74
+
75
+ for j in range(len(obj_ids)):
76
+ obj_id = obj_ids[j]
77
+ obj_data = frame_data[obj_id]
78
+ obj_bbox = obj_data['bbox']
79
+ obj_valid = obj_data['valid']
80
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
81
+ obj_cat = obj_data['category_name']
82
+
83
+ if obj_cat == cat and obj_valid:
84
+ cat_cnt += 1
85
+
86
+ if color_mask == False:
87
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
88
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
89
+ for contour in contours:
90
+ moments = cv2.moments(contour)
91
+ if moments["m00"] != 0:
92
+ cx = int(moments["m10"] / moments["m00"])
93
+ cy = int(moments["m01"] / moments["m00"])
94
+ else:
95
+ cx, cy = contour[0][0]
96
+
97
+ font = cv2.FONT_HERSHEY_SIMPLEX
98
+ text = obj_id
99
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
100
+ text_w, text_h = text_size
101
+
102
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
103
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
104
+
105
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
106
+ font, 1, (255, 255, 255), 2)
107
+
108
+ else:
109
+ alpha = 0.08
110
+
111
+ colored_obj_mask = np.zeros_like(frame)
112
+ colored_obj_mask[obj_mask == 1] = colors[j]
113
+ frame[obj_mask == 1] = (
114
+ (1 - alpha) * frame[obj_mask == 1]
115
+ + alpha * colored_obj_mask[obj_mask == 1]
116
+ )
117
+
118
+
119
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
120
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
121
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
122
+
123
+ if len(contours) > 0:
124
+ largest_contour = max(contours, key=cv2.contourArea)
125
+ M = cv2.moments(largest_contour)
126
+ if M["m00"] != 0:
127
+ center_x = int(M["m10"] / M["m00"])
128
+ center_y = int(M["m01"] / M["m00"])
129
+ else:
130
+ center_x, center_y = 0, 0
131
+
132
+ font = cv2.FONT_HERSHEY_SIMPLEX
133
+ text = obj_id
134
+
135
+ font_scale = 0.9
136
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
137
+ text_x = center_x - text_size[0] // 1
138
+ text_y = center_y
139
+
140
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
141
+ rect_end = (text_x + text_size[0] + 5, text_y)
142
+
143
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
144
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
145
+
146
+ # plt.figure(figsize=(12, 8))
147
+ # plt.imshow(frame)
148
+ # plt.title(f"frame {frame_name}")
149
+ # plt.tight_layout()
150
+ # plt.axis('off')
151
+ # plt.show()
152
+
153
+ buffer = BytesIO()
154
+ frame = Image.fromarray(frame)
155
+ frame.save(buffer, format='jpeg')
156
+ buffer.seek(0)
157
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
158
+ frame_cat_cnts[frame_name] = cat_cnt
159
+
160
+ buffer.seek(0) # Reuse buffer instead of creating a new one
161
+ buffer.truncate()
162
+ frame_for_contour = Image.fromarray(frame_for_contour)
163
+ frame_for_contour.save(buffer, format='jpeg')
164
+ buffer.seek(0)
165
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
166
+
167
+ encoded_frames[cat] = cat_frames
168
+ contoured_frames[cat] = contour_frames
169
+ vid_cat_cnts[cat] = frame_cat_cnts
170
+
171
+ return encoded_frames, contoured_frames, vid_cat_cnts
172
+
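+ # Example (sketch, assuming a "person" category is present in video 0): decode one
+ # annotated frame back into a PIL image for visual inspection:
+ #   Image.open(BytesIO(base64.b64decode(number_objects_and_encode(0)[0]["person"][0])))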
173
+
174
+ # def number_objects_and_encode(idx, color_mask=False):
175
+ # encoded_frames = {}
176
+ # contoured_frames = {} # New dictionary for original images
177
+ # vid_cat_cnts = {}
178
+
179
+ # vid_meta = metas[idx]
180
+ # vid_data = train_dataset[idx]
181
+ # vid_id = vid_meta['video']
182
+ # frame_indx = vid_meta['sample_indx']
183
+ # cat_names = set(vid_meta['obj_id_cat'].values())
184
+ # imgs = vid_data[0]
185
+
186
+ # for cat in cat_names:
187
+ # cat_frames = []
188
+ # contour_frames = []
189
+ # frame_cat_cnts = {}
190
+
191
+ # for i in range(imgs.size(0)):
192
+ # frame_name = frame_indx[i]
193
+ # frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
194
+ # frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
195
+
196
+ # frame_data = vid_data[2][frame_name]
197
+ # obj_ids = list(frame_data.keys())
198
+
199
+ # cat_cnt = 0
200
+
201
+ # for j in range(len(obj_ids)):
202
+ # obj_id = obj_ids[j]
203
+ # obj_data = frame_data[obj_id]
204
+ # obj_bbox = obj_data['bbox']
205
+ # obj_valid = obj_data['valid']
206
+ # obj_mask = obj_data['mask'].numpy().astype(np.uint8)
207
+ # obj_cat = obj_data['category_name']
208
+
209
+ # if obj_cat == cat and obj_valid:
210
+ # cat_cnt += 1
211
+
212
+ # contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
213
+ # cv2.drawContours(frame, contours, -1, colors[j], 3)
214
+ # cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
215
+
216
+ # if len(contours) > 0:
217
+ # largest_contour = max(contours, key=cv2.contourArea)
218
+ # M = cv2.moments(largest_contour)
219
+ # if M["m00"] != 0:
220
+ # center_x = int(M["m10"] / M["m00"])
221
+ # center_y = int(M["m01"] / M["m00"])
222
+ # else:
223
+ # center_x, center_y = 0, 0
224
+
225
+ # font = cv2.FONT_HERSHEY_SIMPLEX
226
+ # text = obj_id
227
+ # font_scale = 1.2
228
+ # text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
229
+ # text_x = center_x - text_size[0] // 1
230
+ # text_y = center_y
231
+
232
+ # rect_start = (text_x - 5, text_y - text_size[1] - 5)
233
+ # rect_end = (text_x + text_size[0] + 5, text_y + 3)
234
+
235
+ # contour_thickness = 1
236
+ # rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
237
+ # rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
238
+
239
+ # cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
240
+ # cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
241
+ # cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
242
+
243
+
244
+ # if color_mask:
245
+ # alpha = 0.08
246
+ # colored_obj_mask = np.zeros_like(frame)
247
+ # colored_obj_mask[obj_mask == 1] = colors[j]
248
+ # frame[obj_mask == 1] = (
249
+ # (1 - alpha) * frame[obj_mask == 1]
250
+ # + alpha * colored_obj_mask[obj_mask == 1]
251
+ # )
252
+
253
+ # # plt.figure(figsize=(12, 8))
254
+ # # plt.imshow(frame)
255
+ # # plt.title(f"frame {frame_name}")
256
+ # # plt.tight_layout()
257
+ # # plt.axis('off')
258
+ # # plt.show()
259
+
260
+ # buffer = BytesIO()
261
+ # frame = Image.fromarray(frame)
262
+ # frame.save(buffer, format='jpeg')
263
+ # buffer.seek(0)
264
+ # cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
265
+ # frame_cat_cnts[frame_name] = cat_cnt
266
+
267
+ # buffer.seek(0) # Reuse buffer instead of creating a new one
268
+ # buffer.truncate()
269
+ # frame_for_contour = Image.fromarray(frame_for_contour)
270
+ # frame_for_contour.save(buffer, format='jpeg')
271
+ # buffer.seek(0)
272
+ # contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
273
+
274
+ # encoded_frames[cat] = cat_frames
275
+ # contoured_frames[cat] = contour_frames
276
+ # vid_cat_cnts[cat] = frame_cat_cnts
277
+
278
+
279
+ # return encoded_frames, contoured_frames, vid_cat_cnts
280
+
281
+
282
+
283
+ def getCaption(idx, model='gpt-4o', color_mask=False):
284
+ vid_meta = metas[idx]
285
+ vid_data = train_dataset[idx]
286
+ vid_id = vid_meta['video']
287
+ print(f"vid id: {vid_id}\n")
288
+
289
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
290
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
291
+ all_captions = dict()
292
+
293
+
294
+ base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
295
+ #marked = "mask with boundary" if color_mask else "boundary"
296
+
297
+ for cat_name in list(cat_names) :
298
+
299
+ is_movable = False
300
+ if cat_name in ytvos_category_valid_list :
301
+ is_movable = True
302
+
303
+ if not is_movable:
304
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
305
+
306
+
307
+ image_captions = {}
308
+ captioner = OpenAI()
309
+ cat_base64_frames = base64_frames[cat_name]
310
+ # cont_base64_frames = contoured_frames[cat_name]
311
+
312
+ for i in range(len(cat_base64_frames)):
313
+ frame_name = frame_indx[i]
314
+ # cont_base64_image = cont_base64_frames[i]
315
+ base64_image = cat_base64_frames[i]
316
+ should_filter = False
317
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
318
+
319
+ if frame_cat_cnts >= 2:
320
+ should_filter = True
321
+ else:
322
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
323
+
324
+
325
+ if is_movable and should_filter:
326
+ # Step 1: filtering
327
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
328
+ caption_filter_text = f"""
329
+ You are a visual assistant analyzing a single frame from a video.
330
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
331
+
332
+ Are the {cat_name}s in the image each performing different, clearly recognizable actions or postures?
333
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
334
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
335
+
336
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
337
+
338
+ - Respond with "YES" if:
339
+ 1) Most of the {cat_name}s exhibit clearly different, unique actions or poses.
340
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
341
+ 2) There are clearly visible differences in action and posture that an observer can identify at a glance.
342
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
343
+
344
+ - Respond with "NONE" if:
345
+ 1) The actions or poses are not clearly differentiable or are too similar.
346
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
347
+ 3) Passive or Neutral Poses: Multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion.
348
+
349
+ Answer strictly with either "YES" or "NONE".
350
+ """
351
+
352
+ response1 = captioner.chat.completions.create(
353
+ model=model,
354
+ messages=[
355
+ {
356
+ "role": "user",
357
+ "content": [
358
+ {
359
+ "type": "text",
360
+ "text": caption_filter_text,
361
+ },
362
+ {
363
+ "type": "image_url",
364
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
365
+ }
366
+ ],
367
+ }
368
+ ],
369
+ )
370
+ response_content = response1.choices[0].message.content
371
+ should_caption = "yes" in response_content.lower()
372
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
373
+
374
+ else:
375
+ should_caption = False
376
+
377
+ # Step 2: generate dense captions
378
+ dense_caption_prompt_1 = f"""
379
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects is {cat_name}.
380
+
381
+ Please describe the labeled {cat_name}s in the image in detail, focusing on their actions and interactions.
382
+
383
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
384
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
385
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
386
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
387
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
388
+ 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
389
+ - expressions like 'seems to be', 'appears to be' are BANNED!
390
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
391
+ 8. Include interactions with objects or other entities when they are prominent and observable.
392
+ 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
393
+ 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
394
+ 11. Do not mention object IDs.
395
+ 12. Use '{cat_name}' as the noun for the referring expressions.
396
+
397
+ Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
398
+
399
+ - Your answer should contain details, and follow the following format:
400
+ object id. action-oriented description
401
+ (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
402
+ 2. a person bending over and touching his boots to tie the shoelace.)
403
+ - for action-oriented description, use {cat_name} as subject noun
404
+
405
+ **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
406
+ Please pay attention to the categories of these objects and don’t change them.
407
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
408
+ Output referring expressions for each object id. Please start your answer:"""
409
+
410
+
411
+ dense_caption_prompt_2 = f"""
412
+ You are an advanced visual language model analyzing a video frame.
413
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
414
+
415
+ Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
416
+ Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
417
+
418
+ ---
419
+ ## Key Guidelines:
420
+ 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
421
+ - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
422
+ - Avoid: "moving slightly to the side" (**(X) Too vague**)
423
+
424
+ 2. **Do not describe appearance, color, or position**—focus purely on the action.
425
+ - (X) "A large brown bear standing on the left"
426
+ - (O) "The bear is lifting its front paws and swiping forward."
427
+
428
+ 3. **Use dynamic, action-specific verbs** rather than passive descriptions.
429
+ - (O) "The giraffe is tilting its head and sniffing the ground."
430
+ - (X) "The giraffe is near a tree and looking around."
431
+
432
+ 4. **Avoid assumptions, emotions, or speculative phrasing.**
433
+ - (X) "The person seems excited" / "The person might be preparing to jump."
434
+ - (O) "The person is pushing its front legs against the rock and leaping forward."
435
+
436
+ 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
437
+ - expressions like 'seems to be', 'appears to be' are BANNED!
438
+ 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
439
+
440
+ 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
441
+ - **Each object should have a unique, descriptive action.**
442
+ - (X) "Two dogs are running."
443
+ - (O) "1. One dog is chasing another, its legs stretched mid-air.
444
+ 2. The other dog is looking back while speeding up."
445
+
446
+ ---
447
+ ## Output Format:
448
+ - Each labeled **{cat_name}** should have exactly **one line of description**.
449
+ - Format: `ID. {cat_name} + action-based description`
450
+ - (O) Example:
451
+ ```
452
+ 1. The person is leaning forward while opening a bag with both hands.
453
+ 2. The person is holding onto a rope and pulling themselves up.
454
+ ```
455
+ - **Ensure that each object is described individually.**
456
+ - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
457
+
458
+ ---
459
+ ## Additional Instructions:
460
+ - **Do NOT** use expressions like "it appears that..." or "it seems like...".
461
+ - **Do NOT** mention object IDs in the description (only use the provided format).
462
+ - **DO NOT** include markdown formatting (no bullet points, no asterisks).
463
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
464
+
465
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
466
+ """
467
+
468
+
469
+ dense_caption_prompt = f"""
470
+ You are a visual assistant analyzing a single frame of a video.
471
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
472
+
473
+ I am building an **action-centric referring expression** dataset.
474
+ Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
475
+
476
+ ---
477
+ ## Guidelines:
478
+ 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
479
+ 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
480
+ 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
481
+ 4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
482
+ 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
483
+ 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
484
+ 7. Base your descriptions on these principles:
485
+ - **Avoid words like 'minimal' or 'slightly'.**
486
+ - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
487
+ - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
488
+ - **Specify actions with other objects or entities** only when they are clear and observable.
489
+ - (O) "pushing another person"
490
+ - (X) "interacting with another object"
491
+
492
+ ---
493
+ ## Output Format:
494
+ - Each labeled **{cat_name}** must have **exactly one line**.
495
+ - Format: `ID. {cat_name} + action-based description`
496
+ - (O) Example:
497
+ ```
498
+ 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
499
+ 2. The person is pulling a baby carriage while smiling.
500
+ ```
501
+ - **Ensure each object is described individually.**
502
+ - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
503
+
504
+ ---
505
+ ## Example:
506
+ If the frame has two labeled **bears**, your output should be:
507
+ ```
508
+ 1. The bear is reaching out its right paw while leaning forward to catch prey.
509
+ 2. A bear is standing upright, facing right, and touching the bike beside it.
510
+ ```
511
+
512
+ ---
513
+ ## Additional Instructions:
514
+ - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
515
+ - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
516
+ - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
517
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
518
+
519
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
520
+
521
+
522
+ MAX_RETRIES = 3
523
+ retry_count = 0
524
+
525
+ if should_caption:
526
+ while retry_count < MAX_RETRIES:
527
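+ # One of the three prompt variants is sampled at random on every attempt so the
+ # generated captions are not biased toward a single phrasing.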
+ selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2, dense_caption_prompt_1])
528
+
529
+ response2 = captioner.chat.completions.create(
530
+ model=model,
531
+ messages=[
532
+ {
533
+ "role": "user",
534
+ "content": [
535
+ {
536
+ "type": "text",
537
+ "text": selected_prompt,
538
+ },
539
+ {
540
+ "type": "image_url",
541
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
542
+ },
543
+ ],
544
+ }
545
+ ],
546
+ )
547
+
548
+ # caption = response2.choices[0].message.content
549
+ #print(f"{image_path} - {frame_name}: {caption}")
550
+
551
+ caption = response2.choices[0].message.content.strip()
552
+ caption_lower = caption.lower().lstrip()
553
+
554
+ if caption_lower.startswith("1.") and not any(
555
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
556
+ ):
557
+ break
558
+
559
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
560
+ retry_count += 1
561
+ time.sleep(2)
562
+
563
+ if retry_count == MAX_RETRIES:
564
+ caption = None
565
+ print("Max retries reached. Caption generation failed.")
566
+
567
+ else:
568
+ caption = None
569
+
570
+ image_captions[frame_name] = caption
571
+ all_captions[cat_name] = image_captions
572
+
573
+ # final : also prepare valid object ids
574
+ valid_obj_ids = dict()
575
+
576
+ for cat in cat_names:
577
+ if cat in ytvos_category_valid_list:
578
+ obj_id_cat = vid_meta['obj_id_cat']
579
+ valid_cat_ids = []
580
+ for obj_id in list(obj_id_cat.keys()):
581
+ if obj_id_cat[obj_id] == cat:
582
+ valid_cat_ids.append(obj_id)
583
+ valid_obj_ids[cat] = valid_cat_ids
584
+
585
+ return vid_id, all_captions, valid_obj_ids
586
+
587
+
588
+ if __name__ == '__main__':
589
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
590
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
591
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
592
+
593
+ args = parser.parse_args()
594
+
595
+ #================== Load data ===================
596
+ # Full dataset
597
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
598
+
599
+ # Metadata for the full dataset
600
+ metas = train_dataset.metas
601
+
602
+ # Eight candidate colors (RGB)
603
+ colors = [
604
+ (255, 0, 0), # Red
605
+ (0, 255, 0), # Green
606
+ (0, 0, 255), # Blue
607
+ (255, 255, 0), # Yellow
608
+ (255, 0, 255), # Magenta
609
+ (0, 255, 255), # Cyan
610
+ (128, 0, 128), # Purple
611
+ (255, 165, 0) # Orange
612
+ ]
613
+
614
+ ytvos_category_valid_list = [
615
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
616
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
617
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
618
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
619
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
620
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
621
+ ]
622
+
623
+ #================== Run GPT ===================
624
+ os.environ['OPENAI_API_KEY'] = '<YOUR_OPENAI_API_KEY>'  # placeholder; supply a real key via the environment, never hard-code it
625
+
626
+ result_captions = {}
627
+ result_valid_obj_ids = {}
628
+
629
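+ # NOTE: range(370) assumes this split contains at least 370 videos; adjust the bound if the dataset size differs.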
+ for i in range(370):
630
+ vid_id, all_captions, valid_obj_ids = getCaption(i, color_mask=False)
631
+
632
+ if vid_id not in result_captions:
633
+ result_captions[vid_id] = all_captions
634
+ if vid_id not in result_valid_obj_ids:
635
+ result_valid_obj_ids[vid_id] = valid_obj_ids
636
+
637
+ print("Finished!", flush=True)
638
+
639
+ with open(args.save_caption_path, "w") as file:
640
+ json.dump(result_captions, file, indent=4)
641
+
642
+ with open(args.save_valid_obj_ids_path, "w") as file:
643
+ json.dump(result_valid_obj_ids, file, indent=4)
mbench/gpt_ref-ytvos_numbered_cy_sanity_2.py ADDED
@@ -0,0 +1,676 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+ import random
9
+
10
+ from mbench.ytvos_ref import build as build_ytvos_ref
11
+ import argparse
12
+ import opts
13
+
14
+ import sys
15
+ from pathlib import Path
16
+ import os
17
+ from os import path as osp
18
+ import skimage
19
+ from io import BytesIO
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import regex as re
24
+ import json
25
+
26
+ import cv2
27
+ from PIL import Image, ImageDraw
28
+ import torch
29
+ from torchvision.transforms import functional as F
30
+
31
+ from skimage import measure # (pip install scikit-image)
32
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
33
+
34
+ import matplotlib.pyplot as plt
35
+ import matplotlib.patches as patches
36
+ from matplotlib.collections import PatchCollection
37
+ from matplotlib.patches import Rectangle
38
+ import textwrap
39
+
40
+
41
+ import ipywidgets as widgets
42
+ from IPython.display import display, clear_output
43
+
44
+ from openai import OpenAI, APIConnectionError, OpenAIError
45
+ import base64
46
+ import json
47
+ import requests
48
+
49
+ def number_objects_and_encode_old(idx, color_mask=False):
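+ """Earlier annotation routine, seemingly kept for reference; getCaption below uses
+ number_objects_and_encode instead."""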
50
+ encoded_frames = {}
51
+ contoured_frames = {} # New dictionary for original images
52
+ vid_cat_cnts = {}
53
+
54
+ vid_meta = metas[idx]
55
+ vid_data = train_dataset[idx]
56
+ vid_id = vid_meta['video']
57
+ frame_indx = vid_meta['sample_indx']
58
+ cat_names = set(vid_meta['obj_id_cat'].values())
59
+ imgs = vid_data[0]
60
+
61
+ for cat in cat_names:
62
+ cat_frames = []
63
+ contour_frames = []
64
+ frame_cat_cnts = {}
65
+
66
+ for i in range(imgs.size(0)):
67
+ frame_name = frame_indx[i]
68
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
69
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
70
+
71
+ frame_data = vid_data[2][frame_name]
72
+ obj_ids = list(frame_data.keys())
73
+
74
+ cat_cnt = 0
75
+
76
+ for j in range(len(obj_ids)):
77
+ obj_id = obj_ids[j]
78
+ obj_data = frame_data[obj_id]
79
+ obj_bbox = obj_data['bbox']
80
+ obj_valid = obj_data['valid']
81
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
82
+ obj_cat = obj_data['category_name']
83
+
84
+ if obj_cat == cat and obj_valid:
85
+ cat_cnt += 1
86
+
87
+ if color_mask == False:
88
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
89
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
90
+ for i, contour in enumerate(contours):
91
+ moments = cv2.moments(contour)
92
+ if moments["m00"] != 0:
93
+ cx = int(moments["m10"] / moments["m00"])
94
+ cy = int(moments["m01"] / moments["m00"])
95
+ else:
96
+ cx, cy = contour[0][0]
97
+
98
+ font = cv2.FONT_HERSHEY_SIMPLEX
99
+ text = obj_id
100
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
101
+ text_w, text_h = text_size
102
+
103
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
104
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
105
+
106
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
107
+ font, 1, (255, 255, 255), 2)
108
+
109
+ else:
110
+ alpha = 0.08
111
+
112
+ colored_obj_mask = np.zeros_like(frame)
113
+ colored_obj_mask[obj_mask == 1] = colors[j]
114
+ frame[obj_mask == 1] = (
115
+ (1 - alpha) * frame[obj_mask == 1]
116
+ + alpha * colored_obj_mask[obj_mask == 1]
117
+ )
118
+
119
+
120
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
121
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
122
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
123
+
124
+ if len(contours) > 0:
125
+ largest_contour = max(contours, key=cv2.contourArea)
126
+ M = cv2.moments(largest_contour)
127
+ if M["m00"] != 0:
128
+ center_x = int(M["m10"] / M["m00"])
129
+ center_y = int(M["m01"] / M["m00"])
130
+ else:
131
+ center_x, center_y = 0, 0
132
+
133
+ font = cv2.FONT_HERSHEY_SIMPLEX
134
+ text = obj_id
135
+
136
+ font_scale = 0.9
137
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
138
+ text_x = center_x - text_size[0] // 1
139
+ text_y = center_y
140
+
141
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
142
+ rect_end = (text_x + text_size[0] + 5, text_y)
143
+
144
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
145
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
146
+
147
+ # plt.figure(figsize=(12, 8))
148
+ # plt.imshow(frame)
149
+ # plt.title(f"frame {frame_name}")
150
+ # plt.tight_layout()
151
+ # plt.axis('off')
152
+ # plt.show()
153
+
154
+ buffer = BytesIO()
155
+ frame = Image.fromarray(frame)
156
+ frame.save(buffer, format='jpeg')
157
+ buffer.seek(0)
158
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
159
+ frame_cat_cnts[frame_name] = cat_cnt
160
+
161
+ buffer.seek(0) # Reuse buffer instead of creating a new one
162
+ buffer.truncate()
163
+ frame_for_contour = Image.fromarray(frame_for_contour)
164
+ frame_for_contour.save(buffer, format='jpeg')
165
+ buffer.seek(0)
166
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
167
+
168
+ encoded_frames[cat] = cat_frames
169
+ contoured_frames[cat] = contour_frames
170
+ vid_cat_cnts[cat] = frame_cat_cnts
171
+
172
+ return encoded_frames, contoured_frames, vid_cat_cnts
173
+
174
+
175
+ def number_objects_and_encode(idx, color_mask=False):
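+ """Annotate each sampled frame of video `idx` with per-object contours and numeric ID labels.
+ Unlike the _old variant above, the ID label always gets a color-outlined box, and
+ color_mask=True additionally overlays a translucent mask on each object. Returns
+ base64-encoded annotated frames, contour-only frames, and per-frame object counts,
+ each keyed by category name."""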
176
+ encoded_frames = {}
177
+ contoured_frames = {} # New dictionary for original images
178
+ vid_cat_cnts = {}
179
+
180
+ vid_meta = metas[idx]
181
+ vid_data = train_dataset[idx]
182
+ vid_id = vid_meta['video']
183
+ frame_indx = vid_meta['sample_indx']
184
+ cat_names = set(vid_meta['obj_id_cat'].values())
185
+ imgs = vid_data[0]
186
+
187
+ for cat in cat_names:
188
+ cat_frames = []
189
+ contour_frames = []
190
+ frame_cat_cnts = {}
191
+
192
+ for i in range(imgs.size(0)):
193
+ frame_name = frame_indx[i]
194
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
195
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
196
+
197
+ frame_data = vid_data[2][frame_name]
198
+ obj_ids = list(frame_data.keys())
199
+
200
+ cat_cnt = 0
201
+
202
+ for j in range(len(obj_ids)):
203
+ obj_id = obj_ids[j]
204
+ obj_data = frame_data[obj_id]
205
+ obj_bbox = obj_data['bbox']
206
+ obj_valid = obj_data['valid']
207
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
208
+ obj_cat = obj_data['category_name']
209
+
210
+ if obj_cat == cat and obj_valid:
211
+ cat_cnt += 1
212
+
213
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
214
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
215
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
216
+
217
+ if len(contours) > 0:
218
+ largest_contour = max(contours, key=cv2.contourArea)
219
+ M = cv2.moments(largest_contour)
220
+ if M["m00"] != 0:
221
+ center_x = int(M["m10"] / M["m00"])
222
+ center_y = int(M["m01"] / M["m00"])
223
+ else:
224
+ center_x, center_y = 0, 0
225
+
226
+ font = cv2.FONT_HERSHEY_SIMPLEX
227
+ text = obj_id
228
+ font_scale = 1.2
229
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
230
+ text_x = center_x - text_size[0] // 1
231
+ text_y = center_y
232
+
233
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
234
+ rect_end = (text_x + text_size[0] + 5, text_y + 3)
235
+
236
+ contour_thickness = 1
237
+ rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
238
+ rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
239
+
240
+ cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
241
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
242
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
243
+
244
+
245
+ if color_mask:
246
+ alpha = 0.08
247
+ colored_obj_mask = np.zeros_like(frame)
248
+ colored_obj_mask[obj_mask == 1] = colors[j]
249
+ frame[obj_mask == 1] = (
250
+ (1 - alpha) * frame[obj_mask == 1]
251
+ + alpha * colored_obj_mask[obj_mask == 1]
252
+ )
253
+
254
+ # plt.figure(figsize=(12, 8))
255
+ # plt.imshow(frame)
256
+ # plt.title(f"frame {frame_name}")
257
+ # plt.tight_layout()
258
+ # plt.axis('off')
259
+ # plt.show()
260
+
261
+ buffer = BytesIO()
262
+ frame = Image.fromarray(frame)
263
+ frame.save(buffer, format='jpeg')
264
+ buffer.seek(0)
265
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
266
+ frame_cat_cnts[frame_name] = cat_cnt
267
+
268
+ buffer.seek(0) # Reuse buffer instead of creating a new one
269
+ buffer.truncate()
270
+ frame_for_contour = Image.fromarray(frame_for_contour)
271
+ frame_for_contour.save(buffer, format='jpeg')
272
+ buffer.seek(0)
273
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
274
+
275
+ encoded_frames[cat] = cat_frames
276
+ contoured_frames[cat] = contour_frames
277
+ vid_cat_cnts[cat] = frame_cat_cnts
278
+
279
+ return encoded_frames, contoured_frames, vid_cat_cnts
280
+
281
+
282
+
283
+ def getCaption(idx, model='gpt-4o'):
284
+ vid_meta = metas[idx]
285
+ vid_data = train_dataset[idx]
286
+ vid_id = vid_meta['video']
287
+ print(f"vid id: {vid_id}\n")
288
+
289
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
290
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
291
+ all_captions = dict()
292
+
293
+ # color_mask = random.choice([True, False])
294
+ color_mask = random.choices([False, True], weights=[60, 40])[0]
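+ # Roughly 40% of videos are annotated with a translucent color mask in addition to the contour labels.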
295
+
296
+ base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
297
+ #marked = "mask with boundary" if color_mask else "boundary"
298
+
299
+ for cat_name in list(cat_names) :
300
+
301
+ is_movable = False
302
+ if cat_name in ytvos_category_valid_list :
303
+ is_movable = True
304
+
305
+ if not is_movable:
306
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
307
+
308
+
309
+ image_captions = {}
310
+ captioner = OpenAI()
311
+ cat_base64_frames = base64_frames[cat_name]
312
+ # cont_base64_frames = contoured_frames[cat_name]
313
+
314
+ for i in range(len(cat_base64_frames)):
315
+ frame_name = frame_indx[i]
316
+ # cont_base64_image = cont_base64_frames[i]
317
+ base64_image = cat_base64_frames[i]
318
+ should_filter = False
319
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
320
+
321
+ if frame_cat_cnts >= 2:
322
+ should_filter = True
323
+ else:
324
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
325
+
326
+
327
+ if is_movable and should_filter:
328
+ #1단계: 필터링
329
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
330
+ caption_filter_text = f"""
331
+ You are a visual assistant analyzing a single frame from a video.
332
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
333
+
334
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
335
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
336
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
337
+
338
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
339
+
340
+ - Respond with "YES" if:
341
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
342
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
343
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
344
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
345
+
346
+ - Respond with "NONE" if:
347
+ 1) The actions or pose are not clearly differentiable or too similar.
348
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
349
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
350
+
351
+ Answer strictly with either "YES" or "NONE".
352
+ """
353
+
354
+ response1 = captioner.chat.completions.create(
355
+ model=model,
356
+ messages=[
357
+ {
358
+ "role": "user",
359
+ "content": [
360
+ {
361
+ "type": "text",
362
+ "text": caption_filter_text,
363
+ },
364
+ {
365
+ "type": "image_url",
366
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
367
+ }
368
+ ],
369
+ }
370
+ ],
371
+ )
372
+ response_content = response1.choices[0].message.content
373
+ should_caption = True if "yes" in response_content.lower() else False
374
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
375
+
376
+ else:
377
+ should_caption = False
378
+
379
+ #2단계: dense caption 만들기
380
+ dense_caption_prompt_1 = f"""
381
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects is {cat_name}.
382
+
383
+ Please describe the labeled {cat_name}s in the image in detail, focusing on their actions and interactions.
384
+
385
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
386
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
387
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
388
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
389
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
390
+ 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
391
+ - expressions like 'seems to be', 'appears to be' are BANNED!
392
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
393
+ 8. Include interactions with objects or other entities when they are prominent and observable.
394
+ 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
395
+ 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
396
+ 11. Do not mention object IDs.
397
+ 12. Use '{cat_name}' as the noun for the referring expressions.
398
+
399
+ Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
400
+
401
+ - Your answer should contain details, and follow the following format:
402
+ object id. action-oriented description
403
+ (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
404
+ 2. a person bending over and touching his boots to tie the shoelace.)
405
+ - for action-oriented description, use {cat_name} as subject noun
406
+
407
+ **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
408
+ Please pay attention to the categories of these objects and don’t change them.
409
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
410
+ Output referring expressions for each object id. Please start your answer:"""
411
+
412
+
413
+ dense_caption_prompt_2 = f"""
414
+ You are an advanced visual language model analyzing a video frame.
415
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
416
+
417
+ Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
418
+ Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
419
+
420
+ ---
421
+ ## Key Guidelines:
422
+ 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
423
+ - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
424
+ - Avoid: "moving slightly to the side" (**(X) Too vague**)
425
+
426
+ 2. **Do not describe appearance, color, or position**—focus purely on the action.
427
+ - (X) "A large brown bear standing on the left"
428
+ - (O) "The bear is lifting its front paws and swiping forward."
429
+
430
+ 3. **Use dynamic, action-specific verbs** rather than passive descriptions.
431
+ - (O) "The giraffe is tilting its head and sniffing the ground."
432
+ - (X) "The giraffe is near a tree and looking around."
433
+
434
+ 4. **Avoid assumptions, emotions, or speculative phrasing.**
435
+ - (X) "The person seems excited" / "The person might be preparing to jump."
436
+ - (O) "The person is pushing its front legs against the rock and leaping forward."
437
+
438
+ 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
439
+ - expressions like 'seems to be', 'appears to be' are BANNED!
440
+ 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
441
+
442
+ 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
443
+ - **Each object should have a unique, descriptive action.**
444
+ - (X) "Two dogs are running."
445
+ - (O) "1. One dog is chasing another, its legs stretched mid-air.
446
+ 2. The other dog is looking back while speeding up."
447
+
448
+ ---
449
+ ## Output Format:
450
+ - Each labeled **{cat_name}** should have exactly **one line of description**.
451
+ - Format: `ID. {cat_name} + action-based description`
452
+ - (O) Example:
453
+ ```
454
+ 1. The person is leaning forward while opening a bag with both hands.
455
+ 2. The person is holding onto a rope and pulling themselves up.
456
+ ```
457
+ - **Ensure that each object is described individually.**
458
+ - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
459
+
460
+ ---
461
+ ## Additional Instructions:
462
+ - **Do NOT** use expressions like "it appears that..." or "it seems like...".
463
+ - **Do NOT** mention object IDs in the description (only use the provided format).
464
+ - **DO NOT** include markdown formatting (no bullet points, no asterisks).
465
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
466
+
467
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
468
+ """
469
+
470
+
471
+ dense_caption_prompt = f"""
472
+ You are a visual assistant analyzing a single frame of a video.
473
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
474
+
475
+ I am building an **action-centric referring expression** dataset.
476
+ Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
477
+
478
+ ---
479
+ ## Guidelines:
480
+ 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
481
+ 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
482
+ 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
483
+ 4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
484
+ 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
485
+ 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
486
+ 7. Base your descriptions on these principles:
487
+ - **Avoid words like 'minimal' or 'slightly'.**
488
+ - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
489
+ - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
490
+ - **Specify actions with other objects or entities** only when they are clear and observable.
491
+ - (O) "pushing another person"
492
+ - (X) "interacting with another object"
493
+
494
+ ---
495
+ ## Output Format:
496
+ - Each labeled **{cat_name}** must have **exactly one line**.
497
+ - Format: `ID. {cat_name} + action-based description`
498
+ - (O) Example:
499
+ ```
500
+ 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
501
+ 2. The person is pulling a baby carriage while smiling.
502
+ ```
503
+ - **Ensure each object is described individually.**
504
+ - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
505
+
506
+ ---
507
+ ## Example:
508
+ If the frame has two labeled **bears**, your output should be:
509
+ ```
510
+ 1. The bear is reaching out its right paw while leaning forward to catch prey.
511
+ 2. A bear is standing upright, facing right, and touching the bike beside it.
512
+ ```
513
+
514
+ ---
515
+ ## Additional Instructions:
516
+ - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
517
+ - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
518
+ - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
519
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
520
+
521
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
522
+
523
+
524
+ MAX_RETRIES = 3
525
+ retry_count = 0
526
+
527
+ if should_caption:
528
+ while retry_count < MAX_RETRIES:
529
+ selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
530
+
531
+ response2 = captioner.chat.completions.create(
532
+ model=model,
533
+ messages=[
534
+ {
535
+ "role": "user",
536
+ "content": [
537
+ {
538
+ "type": "text",
539
+ "text": selected_prompt,
540
+ },
541
+ {
542
+ "type": "image_url",
543
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
544
+ },
545
+ ],
546
+ }
547
+ ],
548
+ )
549
+
550
+ # caption = response2.choices[0].message.content
551
+ #print(f"{image_path} - {frame_name}: {caption}")
552
+
553
+ caption = response2.choices[0].message.content.strip()
554
+ caption_lower = caption.lower().lstrip()
555
+
556
+ if caption_lower.startswith("1.") and not any(
557
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
558
+ ):
559
+ break
560
+
561
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
562
+ retry_count += 1
563
+ time.sleep(2)
564
+
565
+ if retry_count == MAX_RETRIES:
566
+ caption = None
567
+ print("Max retries reached. Caption generation failed.")
568
+
569
+ else:
570
+ caption = None
571
+
572
+ image_captions[frame_name] = caption
573
+ all_captions[cat_name] = image_captions
574
+
575
+ # Finally, also prepare the valid object ids
576
+ valid_obj_ids = dict()
577
+
578
+ for cat in cat_names:
579
+ if cat in ytvos_category_valid_list:
580
+ obj_id_cat = vid_meta['obj_id_cat']
581
+ valid_cat_ids = []
582
+ for obj_id in list(obj_id_cat.keys()):
583
+ if obj_id_cat[obj_id] == cat:
584
+ valid_cat_ids.append(obj_id)
585
+ valid_obj_ids[cat] = valid_cat_ids
586
+
587
+ return vid_id, all_captions, valid_obj_ids
588
+
589
+
590
+ if __name__ == '__main__':
591
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
592
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
593
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
594
+
595
+ args = parser.parse_args()
596
+
597
+ #==================Load data===================
598
+ # Full dataset
599
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
600
+
601
+ # Metadata for the full dataset
602
+ metas = train_dataset.metas
603
+
604
+ # 8 candidate colors (RGB format)
605
+ colors = [
606
+ (255, 0, 0), # Red
607
+ (0, 255, 0), # Green
608
+ (0, 0, 255), # Blue
609
+ (255, 255, 0), # Yellow
610
+ (255, 0, 255), # Magenta
611
+ (0, 255, 255), # Cyan
612
+ (128, 0, 128), # Purple
613
+ (255, 165, 0) # Orange
614
+ ]
615
+
616
+ ytvos_category_valid_list = [
617
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
618
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
619
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
620
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
621
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
622
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
623
+ ]
624
+
625
+ #==================Run GPT===================
626
+ os.environ['OPENAI_API_KEY'] = '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'
627
+
628
+ result_captions = {}
629
+ result_valid_obj_ids = {}
630
+
631
+ for i in range(len(metas)):
632
+ try:
633
+ vid_id, all_captions, valid_obj_ids = getCaption(i)
634
+
635
+ if vid_id not in result_captions:
636
+ result_captions[vid_id] = all_captions
637
+ if vid_id not in result_valid_obj_ids:
638
+ result_valid_obj_ids[vid_id] = valid_obj_ids
639
+
640
+ except (requests.exceptions.ConnectionError, APIConnectionError) as e:
641
+ print(f"created captions up to index {i-1}", flush=True)
642
+ print("Cannot process the request due to an internet connection problem:", e, flush=True)
643
+
644
+ with open(args.save_caption_path, "w") as file:
645
+ json.dump(result_captions, file, indent=4)
646
+
647
+ with open(args.save_valid_obj_ids_path, "w") as file:
648
+ json.dump(result_valid_obj_ids, file, indent=4)
649
+
650
+ except OpenAIError as e:
651
+ print(f"created captions up to index {i-1}", flush=True)
652
+ print("An OpenAI API error occurred:", e, flush=True)
653
+
654
+ with open(args.save_caption_path, "w") as file:
655
+ json.dump(result_captions, file, indent=4)
656
+
657
+ with open(args.save_valid_obj_ids_path, "w") as file:
658
+ json.dump(result_valid_obj_ids, file, indent=4)
659
+
660
+ except Exception as e:
661
+ print(f"created captions up to index {i-1}", flush=True)
662
+ print("An unknown error occurred:", e, flush=True)
663
+
664
+ with open(args.save_caption_path, "w") as file:
665
+ json.dump(result_captions, file, indent=4)
666
+
667
+ with open(args.save_valid_obj_ids_path, "w") as file:
668
+ json.dump(result_valid_obj_ids, file, indent=4)
669
+
670
+ print("Finished!", flush=True)
671
+
672
+ with open(args.save_caption_path, "w") as file:
673
+ json.dump(result_captions, file, indent=4)
674
+
675
+ with open(args.save_valid_obj_ids_path, "w") as file:
676
+ json.dump(result_valid_obj_ids, file, indent=4)
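A caption is accepted above only when it already looks like a numbered description list rather than a refusal. A minimal standalone sketch of that acceptance check (the helper name `is_valid_caption` is illustrative and not part of the script; the phrase list mirrors the one used in the retry loop):
```
def is_valid_caption(caption: str) -> bool:
    """Return True when the model output looks like a numbered caption list
    rather than a refusal, mirroring the check in the retry loop above."""
    text = caption.strip().lower()
    refusal_phrases = ["i'm sorry", "please", "can't help"]  # same phrases as above
    return text.startswith("1.") and not any(p in text for p in refusal_phrases)

print(is_valid_caption("1. The bear is climbing a tree."))     # True
print(is_valid_caption("I'm sorry, I can't help with that."))  # False
```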
mbench/gpt_test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/make_ref-ytvos_json.py ADDED
@@ -0,0 +1,108 @@
1
+ import sys
2
+ import os
3
+ from os import path as osp
4
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
5
+
6
+ from datasets import build_dataset
7
+ import argparse
8
+ import opts
9
+
10
+
11
+ from pathlib import Path
12
+ import io
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ #==================Build the JSON===================
37
+ def createJson(train_dataset, metas):
38
+ entire_json = {}
39
+
40
+ # Initialization
41
+ vid_idx = 0
42
+
43
+ while vid_idx < len(train_dataset):
44
+
45
+ # For a single video
46
+ video_data = {}
47
+ video_train_frames, video_train_info = train_dataset[vid_idx]
48
+ video_meta = metas[vid_idx]
49
+
50
+ video_id = video_meta['video']
51
+ video_data['bins'] = video_meta['bins']
52
+ bin_nums = len(video_meta['bins'])
53
+ obj_nums = max([int(k) for k in list(video_meta['obj_id_cat'].keys())])
54
+
55
+ annotation_data = []
56
+ frame_names = []
57
+
58
+ for i in range(bin_nums):
59
+ bin_data = {}
60
+ for j in range(obj_nums):
61
+ obj_id = str(j+1)
62
+ try:
63
+ obj_data = {
64
+ "category_name":video_meta['obj_id_cat'][obj_id],
65
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :].tolist(),
66
+ "valid":video_train_info['valid'][i*obj_nums+j].item()
67
+ }
68
+ except:
69
+ obj_data = {}
70
+ bin_data[obj_id] = obj_data
71
+ annotation_data.append(bin_data)
72
+
73
+ video_data['annotations'] = annotation_data
74
+
75
+
76
+ sample_indx = metas[vid_idx]['sample_indx']
77
+ frames = metas[vid_idx]['frames']
78
+ for i in sample_indx:
79
+ frame_name = frames[i]
80
+ frame_names.append(frame_name)
81
+
82
+ video_data['frame_names'] = frame_names
83
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
84
+ entire_json[video_id] = video_data
85
+
86
+ vid_idx += 1
87
+
88
+ return entire_json
89
+
90
+
91
+ if __name__ == '__main__':
92
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
93
+ args = parser.parse_args()
94
+
95
+ #==================Load data===================
96
+ # Full dataset
97
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
98
+
99
+ # Metadata for the full dataset
100
+ metas = train_dataset.metas
101
+
102
+ #==================Build the JSON===================
103
+ entire_json_dict = createJson(train_dataset, metas)
104
+ print(type(entire_json_dict))
105
+ entire_json = json.dumps(entire_json_dict, indent=4)
106
+
107
+ with open('mbench/sampled_frame3.json', mode='w') as file:
108
+ file.write(entire_json)
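For reference, each value that createJson stores under a video id has the shape sketched below. The keys follow the code above; the concrete values are invented for illustration and depend on the dataset.
```
# Illustrative shape of one video entry in sampled_frame3.json (values are made up).
example_entry = {
    "bins": [[0, 5], [5, 10]],             # taken from video_meta['bins']
    "annotations": [                        # one dict per bin, keyed by object id
        {
            "1": {"category_name": "person",
                  "bbox": [10.0, 20.0, 110.0, 220.0],
                  "valid": 1},
            "2": {},                        # empty when the object has no annotation in that bin
        },
    ],
    "frame_names": ["00000", "00025"],      # sampled frame names from the meta
    "video_path": "<img_folder>/JPEGImages/<video_id>",
}
```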
mbench/numbered_captions_gpt-4o_final.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_captions_gpt-4o_no_mask_color.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_captions_gpt-4o_nomask_randcap.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_captions_gpt-4o_randcap.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_valid_obj_ids.json ADDED
@@ -0,0 +1,2153 @@
1
+ {
2
+ "003234408d": {
3
+ "penguin": [
4
+ "1",
5
+ "2",
6
+ "3",
7
+ "4",
8
+ "5"
9
+ ]
10
+ },
11
+ "0043f083b5": {
12
+ "sedan": [
13
+ "2",
14
+ "3"
15
+ ],
16
+ "bus": [
17
+ "1"
18
+ ]
19
+ },
20
+ "0044fa5fba": {
21
+ "giant_panda": [
22
+ "1"
23
+ ]
24
+ },
25
+ "005a527edd": {
26
+ "ape": [
27
+ "1",
28
+ "2"
29
+ ]
30
+ },
31
+ "0065b171f9": {
32
+ "giant_panda": [
33
+ "1"
34
+ ]
35
+ },
36
+ "00917dcfc4": {
37
+ "zebra": [
38
+ "1",
39
+ "2",
40
+ "3"
41
+ ]
42
+ },
43
+ "00a23ccf53": {
44
+ "shark": [
45
+ "1"
46
+ ]
47
+ },
48
+ "00ad5016a4": {
49
+ "airplane": [
50
+ "1"
51
+ ]
52
+ },
53
+ "01082ae388": {
54
+ "leopard": [
55
+ "1"
56
+ ]
57
+ },
58
+ "011ac0a06f": {
59
+ "ape": [
60
+ "1",
61
+ "2",
62
+ "3",
63
+ "4",
64
+ "5"
65
+ ]
66
+ },
67
+ "013099c098": {
68
+ "giant_panda": [
69
+ "1",
70
+ "2"
71
+ ]
72
+ },
73
+ "0155498c85": {
74
+ "motorbike": [
75
+ "2"
76
+ ],
77
+ "person": [
78
+ "1"
79
+ ]
80
+ },
81
+ "01694ad9c8": {
82
+ "bird": [
83
+ "1"
84
+ ]
85
+ },
86
+ "017ac35701": {
87
+ "giant_panda": [
88
+ "1"
89
+ ]
90
+ },
91
+ "01b80e8e1a": {
92
+ "zebra": [
93
+ "1",
94
+ "2"
95
+ ]
96
+ },
97
+ "01baa5a4e1": {},
98
+ "01c3111683": {
99
+ "whale": [
100
+ "1"
101
+ ]
102
+ },
103
+ "01c4cb5ffe": {
104
+ "person": [
105
+ "1",
106
+ "3"
107
+ ]
108
+ },
109
+ "01c76f0a82": {
110
+ "sedan": [
111
+ "1",
112
+ "4"
113
+ ]
114
+ },
115
+ "01c783268c": {
116
+ "ape": [
117
+ "1"
118
+ ],
119
+ "person": [
120
+ "2"
121
+ ]
122
+ },
123
+ "01e64dd36a": {
124
+ "cow": [
125
+ "1",
126
+ "2",
127
+ "3"
128
+ ]
129
+ },
130
+ "01ed275c6e": {
131
+ "giraffe": [
132
+ "1",
133
+ "2"
134
+ ]
135
+ },
136
+ "01ff60d1fa": {
137
+ "lizard": [
138
+ "1"
139
+ ]
140
+ },
141
+ "020cd28cd2": {
142
+ "person": [
143
+ "1"
144
+ ]
145
+ },
146
+ "02264db755": {
147
+ "fox": [
148
+ "1"
149
+ ]
150
+ },
151
+ "0248626d9a": {
152
+ "train": [
153
+ "1"
154
+ ]
155
+ },
156
+ "02668dbffa": {
157
+ "frog": [
158
+ "1"
159
+ ]
160
+ },
161
+ "0274193026": {
162
+ "person": [
163
+ "2"
164
+ ]
165
+ },
166
+ "02d28375aa": {
167
+ "fox": [
168
+ "1"
169
+ ]
170
+ },
171
+ "031ccc99b1": {
172
+ "person": [
173
+ "1",
174
+ "2",
175
+ "3"
176
+ ]
177
+ },
178
+ "0321b18c10": {
179
+ "person": [
180
+ "1",
181
+ "2"
182
+ ],
183
+ "elephant": [
184
+ "3"
185
+ ]
186
+ },
187
+ "0348a45bca": {
188
+ "fish": [
189
+ "1",
190
+ "2",
191
+ "3",
192
+ "4",
193
+ "5"
194
+ ]
195
+ },
196
+ "0355e92655": {
197
+ "person": [
198
+ "2"
199
+ ],
200
+ "boat": [
201
+ "3"
202
+ ]
203
+ },
204
+ "0358b938c1": {
205
+ "elephant": [
206
+ "1",
207
+ "2",
208
+ "3",
209
+ "4"
210
+ ]
211
+ },
212
+ "0368107cf1": {
213
+ "person": [
214
+ "1",
215
+ "2"
216
+ ]
217
+ },
218
+ "0379ddf557": {
219
+ "person": [
220
+ "1"
221
+ ]
222
+ },
223
+ "038b2cc71d": {
224
+ "lizard": [
225
+ "1"
226
+ ]
227
+ },
228
+ "038c15a5dd": {
229
+ "hedgehog": [
230
+ "1"
231
+ ]
232
+ },
233
+ "03a06cc98a": {
234
+ "giraffe": [
235
+ "1",
236
+ "2",
237
+ "3"
238
+ ]
239
+ },
240
+ "03a63e187f": {
241
+ "lizard": [
242
+ "1"
243
+ ]
244
+ },
245
+ "03c95b4dae": {
246
+ "elephant": [
247
+ "1",
248
+ "2",
249
+ "3"
250
+ ]
251
+ },
252
+ "03e2b57b0e": {
253
+ "lizard": [
254
+ "1"
255
+ ]
256
+ },
257
+ "04194e1248": {
258
+ "lizard": [
259
+ "1"
260
+ ]
261
+ },
262
+ "04259896e2": {
263
+ "lizard": [
264
+ "1"
265
+ ]
266
+ },
267
+ "0444918a5f": {
268
+ "truck": [
269
+ "1",
270
+ "2",
271
+ "3",
272
+ "4"
273
+ ]
274
+ },
275
+ "04460a7a52": {
276
+ "lizard": [
277
+ "1"
278
+ ]
279
+ },
280
+ "04474174a4": {
281
+ "ape": [
282
+ "1",
283
+ "2"
284
+ ]
285
+ },
286
+ "0450095513": {
287
+ "snail": [
288
+ "1"
289
+ ]
290
+ },
291
+ "045f00aed2": {
292
+ "tiger": [
293
+ "1"
294
+ ],
295
+ "person": [
296
+ "3"
297
+ ]
298
+ },
299
+ "04667fabaa": {
300
+ "parrot": [
301
+ "1"
302
+ ]
303
+ },
304
+ "04735c5030": {
305
+ "cat": [
306
+ "1",
307
+ "2"
308
+ ]
309
+ },
310
+ "04990d1915": {
311
+ "truck": [
312
+ "3"
313
+ ],
314
+ "sedan": [
315
+ "1"
316
+ ],
317
+ "bus": [
318
+ "2"
319
+ ]
320
+ },
321
+ "04d62d9d98": {
322
+ "person": [
323
+ "1"
324
+ ]
325
+ },
326
+ "04f21da964": {
327
+ "monkey": [
328
+ "1"
329
+ ]
330
+ },
331
+ "04fbad476e": {
332
+ "parrot": [
333
+ "1"
334
+ ]
335
+ },
336
+ "04fe256562": {
337
+ "truck": [
338
+ "2"
339
+ ],
340
+ "motorbike": [
341
+ "1"
342
+ ]
343
+ },
344
+ "0503bf89c9": {
345
+ "hedgehog": [
346
+ "1"
347
+ ]
348
+ },
349
+ "0536c9eed0": {
350
+ "cat": [
351
+ "1"
352
+ ]
353
+ },
354
+ "054acb238f": {
355
+ "owl": [
356
+ "1"
357
+ ]
358
+ },
359
+ "05579ca250": {
360
+ "person": [
361
+ "1"
362
+ ],
363
+ "sedan": [
364
+ "3"
365
+ ]
366
+ },
367
+ "056c200404": {},
368
+ "05774f3a2c": {
369
+ "ape": [
370
+ "1",
371
+ "2",
372
+ "3"
373
+ ]
374
+ },
375
+ "058a7592c8": {
376
+ "train": [
377
+ "1"
378
+ ]
379
+ },
380
+ "05a0a513df": {
381
+ "person": [
382
+ "1",
383
+ "2"
384
+ ]
385
+ },
386
+ "05a569d8aa": {
387
+ "cat": [
388
+ "1"
389
+ ],
390
+ "mouse": [
391
+ "2"
392
+ ]
393
+ },
394
+ "05aa652648": {
395
+ "ape": [
396
+ "1"
397
+ ]
398
+ },
399
+ "05d7715782": {},
400
+ "05e0b0f28f": {
401
+ "mouse": [
402
+ "1"
403
+ ],
404
+ "person": [
405
+ "2"
406
+ ]
407
+ },
408
+ "05fdbbdd7a": {},
409
+ "05ffcfed85": {
410
+ "monkey": [
411
+ "1",
412
+ "2"
413
+ ]
414
+ },
415
+ "0630391881": {
416
+ "person": [
417
+ "1"
418
+ ]
419
+ },
420
+ "06840b2bbe": {
421
+ "snake": [
422
+ "1"
423
+ ]
424
+ },
425
+ "068f7dce6f": {
426
+ "shark": [
427
+ "1"
428
+ ]
429
+ },
430
+ "0693719753": {
431
+ "turtle": [
432
+ "1",
433
+ "2"
434
+ ]
435
+ },
436
+ "06ce2b51fb": {
437
+ "person": [
438
+ "1",
439
+ "2"
440
+ ]
441
+ },
442
+ "06e224798e": {
443
+ "tiger": [
444
+ "1"
445
+ ]
446
+ },
447
+ "06ee361788": {
448
+ "duck": [
449
+ "1",
450
+ "2",
451
+ "3"
452
+ ]
453
+ },
454
+ "06fbb3fa2c": {
455
+ "eagle": [
456
+ "1"
457
+ ]
458
+ },
459
+ "0700264286": {
460
+ "cow": [
461
+ "1",
462
+ "2"
463
+ ]
464
+ },
465
+ "070c918ca7": {
466
+ "parrot": [
467
+ "1"
468
+ ]
469
+ },
470
+ "07129e14a4": {
471
+ "parrot": [
472
+ "1",
473
+ "2"
474
+ ],
475
+ "person": [
476
+ "3"
477
+ ]
478
+ },
479
+ "07177017e9": {
480
+ "motorbike": [
481
+ "1",
482
+ "2"
483
+ ]
484
+ },
485
+ "07238ffc58": {
486
+ "monkey": [
487
+ "1",
488
+ "2",
489
+ "3"
490
+ ]
491
+ },
492
+ "07353b2a89": {
493
+ "sheep": [
494
+ "1",
495
+ "2",
496
+ "3",
497
+ "4"
498
+ ]
499
+ },
500
+ "0738493cbf": {
501
+ "airplane": [
502
+ "1"
503
+ ]
504
+ },
505
+ "075926c651": {
506
+ "person": [
507
+ "1",
508
+ "2"
509
+ ]
510
+ },
511
+ "075c701292": {
512
+ "duck": [
513
+ "1",
514
+ "2",
515
+ "3",
516
+ "4"
517
+ ]
518
+ },
519
+ "0762ea9a30": {
520
+ "person": [
521
+ "1"
522
+ ]
523
+ },
524
+ "07652ee4af": {
525
+ "person": [
526
+ "1"
527
+ ]
528
+ },
529
+ "076f206928": {
530
+ "person": [
531
+ "3"
532
+ ],
533
+ "zebra": [
534
+ "1",
535
+ "2"
536
+ ]
537
+ },
538
+ "077d32af19": {
539
+ "train": [
540
+ "4"
541
+ ],
542
+ "person": [
543
+ "1",
544
+ "2",
545
+ "3"
546
+ ]
547
+ },
548
+ "079049275c": {
549
+ "mouse": [
550
+ "1"
551
+ ]
552
+ },
553
+ "07913cdda7": {
554
+ "train": [
555
+ "1"
556
+ ],
557
+ "person": [
558
+ "2",
559
+ "3"
560
+ ]
561
+ },
562
+ "07a11a35e8": {
563
+ "ape": [
564
+ "1",
565
+ "2"
566
+ ]
567
+ },
568
+ "07ac33b6df": {
569
+ "ape": [
570
+ "1"
571
+ ]
572
+ },
573
+ "07c62c3d11": {
574
+ "parrot": [
575
+ "1",
576
+ "2",
577
+ "3"
578
+ ]
579
+ },
580
+ "07cc1c7d74": {
581
+ "snake": [
582
+ "1"
583
+ ]
584
+ },
585
+ "080196ef01": {
586
+ "lizard": [
587
+ "1"
588
+ ]
589
+ },
590
+ "081207976e": {},
591
+ "081ae4fa44": {
592
+ "shark": [
593
+ "1",
594
+ "2"
595
+ ]
596
+ },
597
+ "081d8250cb": {
598
+ "person": [
599
+ "1"
600
+ ],
601
+ "sedan": [
602
+ "3"
603
+ ]
604
+ },
605
+ "082900c5d4": {
606
+ "duck": [
607
+ "1",
608
+ "2",
609
+ "3"
610
+ ]
611
+ },
612
+ "0860df21e2": {},
613
+ "0866d4c5e3": {
614
+ "bird": [
615
+ "1",
616
+ "2",
617
+ "3"
618
+ ]
619
+ },
620
+ "0891ac2eb6": {
621
+ "person": [
622
+ "1",
623
+ "2",
624
+ "3"
625
+ ]
626
+ },
627
+ "08931bc458": {
628
+ "person": [
629
+ "1"
630
+ ]
631
+ },
632
+ "08aa2705d5": {
633
+ "snake": [
634
+ "1"
635
+ ]
636
+ },
637
+ "08c8450db7": {},
638
+ "08d50b926c": {
639
+ "turtle": [
640
+ "1",
641
+ "2"
642
+ ]
643
+ },
644
+ "08e1e4de15": {
645
+ "monkey": [
646
+ "1",
647
+ "2",
648
+ "3",
649
+ "4"
650
+ ]
651
+ },
652
+ "08e48c1a48": {
653
+ "cow": [
654
+ "1"
655
+ ]
656
+ },
657
+ "08f561c65e": {
658
+ "giant_panda": [
659
+ "1"
660
+ ],
661
+ "person": [
662
+ "2"
663
+ ]
664
+ },
665
+ "08feb87790": {
666
+ "sheep": [
667
+ "1"
668
+ ]
669
+ },
670
+ "09049f6fe3": {
671
+ "mouse": [
672
+ "1",
673
+ "2"
674
+ ]
675
+ },
676
+ "092e4ff450": {
677
+ "snake": [
678
+ "1"
679
+ ]
680
+ },
681
+ "09338adea8": {
682
+ "whale": [
683
+ "1",
684
+ "2"
685
+ ]
686
+ },
687
+ "093c335ccc": {
688
+ "person": [
689
+ "2"
690
+ ]
691
+ },
692
+ "0970d28339": {
693
+ "ape": [
694
+ "1",
695
+ "2"
696
+ ]
697
+ },
698
+ "0974a213dc": {
699
+ "giraffe": [
700
+ "1",
701
+ "2",
702
+ "3"
703
+ ]
704
+ },
705
+ "097b471ed8": {
706
+ "cat": [
707
+ "1",
708
+ "2"
709
+ ]
710
+ },
711
+ "0990941758": {
712
+ "giant_panda": [
713
+ "1"
714
+ ]
715
+ },
716
+ "09a348f4fa": {
717
+ "lizard": [
718
+ "1"
719
+ ]
720
+ },
721
+ "09a6841288": {
722
+ "duck": [
723
+ "1",
724
+ "2"
725
+ ]
726
+ },
727
+ "09c5bad17b": {
728
+ "airplane": [
729
+ "1"
730
+ ]
731
+ },
732
+ "09c9ce80c7": {
733
+ "giant_panda": [
734
+ "1"
735
+ ]
736
+ },
737
+ "09ff54fef4": {
738
+ "fox": [
739
+ "1",
740
+ "2"
741
+ ]
742
+ },
743
+ "0a23765d15": {
744
+ "person": [
745
+ "1",
746
+ "2"
747
+ ]
748
+ },
749
+ "0a275e7f12": {
750
+ "elephant": [
751
+ "1"
752
+ ]
753
+ },
754
+ "0a2f2bd294": {
755
+ "motorbike": [
756
+ "1"
757
+ ]
758
+ },
759
+ "0a7a2514aa": {
760
+ "cat": [
761
+ "1"
762
+ ],
763
+ "lizard": [
764
+ "2"
765
+ ]
766
+ },
767
+ "0a7b27fde9": {
768
+ "parrot": [
769
+ "1",
770
+ "2"
771
+ ]
772
+ },
773
+ "0a8c467cc3": {
774
+ "fish": [
775
+ "1",
776
+ "2",
777
+ "3"
778
+ ]
779
+ },
780
+ "0ac8c560ae": {
781
+ "person": [
782
+ "2",
783
+ "3"
784
+ ]
785
+ },
786
+ "0b1627e896": {
787
+ "boat": [
788
+ "1"
789
+ ]
790
+ },
791
+ "0b285c47f6": {
792
+ "mouse": [
793
+ "1"
794
+ ]
795
+ },
796
+ "0b34ec1d55": {
797
+ "ape": [
798
+ "1"
799
+ ]
800
+ },
801
+ "0b5b5e8e5a": {
802
+ "person": [
803
+ "1"
804
+ ],
805
+ "sedan": [
806
+ "2"
807
+ ]
808
+ },
809
+ "0b68535614": {
810
+ "rabbit": [
811
+ "1"
812
+ ]
813
+ },
814
+ "0b6f9105fc": {
815
+ "rabbit": [
816
+ "1"
817
+ ]
818
+ },
819
+ "0b7dbfa3cb": {
820
+ "cow": [
821
+ "1"
822
+ ]
823
+ },
824
+ "0b9cea51ca": {
825
+ "whale": [
826
+ "1"
827
+ ]
828
+ },
829
+ "0b9d012be8": {
830
+ "camel": [
831
+ "1"
832
+ ]
833
+ },
834
+ "0bcfc4177d": {
835
+ "truck": [
836
+ "1"
837
+ ]
838
+ },
839
+ "0bd37b23c1": {
840
+ "motorbike": [
841
+ "1"
842
+ ]
843
+ },
844
+ "0bd864064c": {
845
+ "eagle": [
846
+ "1"
847
+ ]
848
+ },
849
+ "0c11c6bf7b": {
850
+ "deer": [
851
+ "1"
852
+ ]
853
+ },
854
+ "0c26bc77ac": {
855
+ "crocodile": [
856
+ "1"
857
+ ]
858
+ },
859
+ "0c3a04798c": {
860
+ "fish": [
861
+ "2"
862
+ ],
863
+ "duck": [
864
+ "1"
865
+ ]
866
+ },
867
+ "0c44a9d545": {
868
+ "tiger": [
869
+ "1"
870
+ ]
871
+ },
872
+ "0c817cc390": {
873
+ "dog": [
874
+ "2"
875
+ ],
876
+ "hedgehog": [
877
+ "1"
878
+ ]
879
+ },
880
+ "0ca839ee9a": {
881
+ "ape": [
882
+ "1",
883
+ "2"
884
+ ]
885
+ },
886
+ "0cd7ac0ac0": {
887
+ "rabbit": [
888
+ "1"
889
+ ]
890
+ },
891
+ "0ce06e0121": {
892
+ "parrot": [
893
+ "1",
894
+ "2"
895
+ ]
896
+ },
897
+ "0cfe974a89": {
898
+ "turtle": [
899
+ "1",
900
+ "2"
901
+ ]
902
+ },
903
+ "0d2fcc0dcd": {
904
+ "zebra": [
905
+ "1",
906
+ "2",
907
+ "3",
908
+ "4"
909
+ ]
910
+ },
911
+ "0d3aad05d2": {
912
+ "person": [
913
+ "1"
914
+ ]
915
+ },
916
+ "0d40b015f4": {
917
+ "person": [
918
+ "1"
919
+ ]
920
+ },
921
+ "0d97fba242": {
922
+ "dog": [
923
+ "1"
924
+ ],
925
+ "person": [
926
+ "2"
927
+ ]
928
+ },
929
+ "0d9cc80d7e": {
930
+ "person": [
931
+ "1",
932
+ "2",
933
+ "3"
934
+ ]
935
+ },
936
+ "0dab85b6d3": {
937
+ "lizard": [
938
+ "1",
939
+ "2"
940
+ ]
941
+ },
942
+ "0db5c427a5": {
943
+ "train": [
944
+ "1"
945
+ ]
946
+ },
947
+ "0dbaf284f1": {
948
+ "cat": [
949
+ "1",
950
+ "2"
951
+ ]
952
+ },
953
+ "0de4923598": {},
954
+ "0df28a9101": {
955
+ "turtle": [
956
+ "1",
957
+ "2",
958
+ "3"
959
+ ]
960
+ },
961
+ "0e04f636c4": {
962
+ "frog": [
963
+ "1"
964
+ ]
965
+ },
966
+ "0e05f0e232": {
967
+ "lizard": [
968
+ "1",
969
+ "2"
970
+ ]
971
+ },
972
+ "0e0930474b": {
973
+ "sedan": [
974
+ "1"
975
+ ],
976
+ "person": [
977
+ "2",
978
+ "3"
979
+ ]
980
+ },
981
+ "0e27472bea": {
982
+ "turtle": [
983
+ "1"
984
+ ]
985
+ },
986
+ "0e30020549": {
987
+ "parrot": [
988
+ "1"
989
+ ]
990
+ },
991
+ "0e621feb6c": {
992
+ "lizard": [
993
+ "1",
994
+ "2"
995
+ ]
996
+ },
997
+ "0e803c7d73": {},
998
+ "0e9ebe4e3c": {
999
+ "truck": [
1000
+ "1"
1001
+ ]
1002
+ },
1003
+ "0e9f2785ec": {
1004
+ "person": [
1005
+ "2"
1006
+ ]
1007
+ },
1008
+ "0ea68d418b": {
1009
+ "airplane": [
1010
+ "1"
1011
+ ]
1012
+ },
1013
+ "0eb403a222": {},
1014
+ "0ee92053d6": {
1015
+ "person": [
1016
+ "1"
1017
+ ]
1018
+ },
1019
+ "0eefca067f": {
1020
+ "giant_panda": [
1021
+ "1",
1022
+ "2"
1023
+ ]
1024
+ },
1025
+ "0f17fa6fcb": {
1026
+ "duck": [
1027
+ "1",
1028
+ "2",
1029
+ "3"
1030
+ ]
1031
+ },
1032
+ "0f1ac8e9a3": {
1033
+ "frog": [
1034
+ "1"
1035
+ ]
1036
+ },
1037
+ "0f202e9852": {
1038
+ "parrot": [
1039
+ "1"
1040
+ ]
1041
+ },
1042
+ "0f2ab8b1ff": {
1043
+ "dolphin": [
1044
+ "1",
1045
+ "2",
1046
+ "3"
1047
+ ]
1048
+ },
1049
+ "0f51a78756": {
1050
+ "sheep": [
1051
+ "1"
1052
+ ]
1053
+ },
1054
+ "0f5fbe16b0": {
1055
+ "raccoon": [
1056
+ "1",
1057
+ "2"
1058
+ ]
1059
+ },
1060
+ "0f6072077b": {
1061
+ "person": [
1062
+ "1",
1063
+ "2",
1064
+ "3"
1065
+ ]
1066
+ },
1067
+ "0f6b69b2f4": {
1068
+ "rabbit": [
1069
+ "1"
1070
+ ]
1071
+ },
1072
+ "0f6c2163de": {
1073
+ "snail": [
1074
+ "1"
1075
+ ]
1076
+ },
1077
+ "0f74ec5599": {
1078
+ "giant_panda": [
1079
+ "1"
1080
+ ]
1081
+ },
1082
+ "0f9683715b": {
1083
+ "elephant": [
1084
+ "1"
1085
+ ]
1086
+ },
1087
+ "0fa7b59356": {
1088
+ "duck": [
1089
+ "1"
1090
+ ]
1091
+ },
1092
+ "0fb173695b": {
1093
+ "person": [
1094
+ "3"
1095
+ ]
1096
+ },
1097
+ "0fc958cde2": {
1098
+ "owl": [
1099
+ "1"
1100
+ ]
1101
+ },
1102
+ "0fe7b1a621": {
1103
+ "parrot": [
1104
+ "1"
1105
+ ]
1106
+ },
1107
+ "0ffcdb491c": {
1108
+ "person": [
1109
+ "1",
1110
+ "2",
1111
+ "3"
1112
+ ]
1113
+ },
1114
+ "101caff7d4": {
1115
+ "giant_panda": [
1116
+ "1",
1117
+ "2"
1118
+ ]
1119
+ },
1120
+ "1022fe8417": {
1121
+ "person": [
1122
+ "1",
1123
+ "2",
1124
+ "3"
1125
+ ]
1126
+ },
1127
+ "1032e80b37": {
1128
+ "giraffe": [
1129
+ "1"
1130
+ ]
1131
+ },
1132
+ "103f501680": {
1133
+ "fish": [
1134
+ "1"
1135
+ ]
1136
+ },
1137
+ "104e64565f": {
1138
+ "elephant": [
1139
+ "1"
1140
+ ]
1141
+ },
1142
+ "104f1ab997": {
1143
+ "person": [
1144
+ "1",
1145
+ "2",
1146
+ "3"
1147
+ ]
1148
+ },
1149
+ "106242403f": {
1150
+ "person": [
1151
+ "1",
1152
+ "2"
1153
+ ]
1154
+ },
1155
+ "10b31f5431": {
1156
+ "person": [
1157
+ "1",
1158
+ "3",
1159
+ "4"
1160
+ ]
1161
+ },
1162
+ "10eced835e": {
1163
+ "giant_panda": [
1164
+ "1",
1165
+ "2"
1166
+ ]
1167
+ },
1168
+ "110d26fa3a": {
1169
+ "shark": [
1170
+ "1"
1171
+ ]
1172
+ },
1173
+ "1122c1d16a": {
1174
+ "parrot": [
1175
+ "1",
1176
+ "2",
1177
+ "3",
1178
+ "4",
1179
+ "5"
1180
+ ],
1181
+ "person": [
1182
+ "6"
1183
+ ]
1184
+ },
1185
+ "1145b49a5f": {
1186
+ "rabbit": [
1187
+ "1"
1188
+ ]
1189
+ },
1190
+ "11485838c2": {
1191
+ "giraffe": [
1192
+ "1",
1193
+ "2",
1194
+ "3"
1195
+ ]
1196
+ },
1197
+ "114e7676ec": {
1198
+ "person": [
1199
+ "1"
1200
+ ]
1201
+ },
1202
+ "1157472b95": {
1203
+ "parrot": [
1204
+ "1",
1205
+ "2"
1206
+ ]
1207
+ },
1208
+ "115ee1072c": {
1209
+ "cow": [
1210
+ "1"
1211
+ ]
1212
+ },
1213
+ "1171141012": {
1214
+ "turtle": [
1215
+ "1"
1216
+ ],
1217
+ "person": [
1218
+ "2"
1219
+ ]
1220
+ },
1221
+ "117757b4b8": {
1222
+ "snail": [
1223
+ "1"
1224
+ ]
1225
+ },
1226
+ "1178932d2f": {
1227
+ "motorbike": [
1228
+ "3"
1229
+ ],
1230
+ "person": [
1231
+ "1",
1232
+ "2"
1233
+ ]
1234
+ },
1235
+ "117cc76bda": {
1236
+ "whale": [
1237
+ "1"
1238
+ ]
1239
+ },
1240
+ "1180cbf814": {
1241
+ "fish": [
1242
+ "1",
1243
+ "2"
1244
+ ]
1245
+ },
1246
+ "1187bbd0e3": {
1247
+ "cat": [
1248
+ "1"
1249
+ ]
1250
+ },
1251
+ "1197e44b26": {
1252
+ "giant_panda": [
1253
+ "1"
1254
+ ]
1255
+ },
1256
+ "119cf20728": {
1257
+ "lizard": [
1258
+ "1"
1259
+ ]
1260
+ },
1261
+ "119dd54871": {
1262
+ "lion": [
1263
+ "1",
1264
+ "2"
1265
+ ]
1266
+ },
1267
+ "11a0c3b724": {
1268
+ "mouse": [
1269
+ "1",
1270
+ "2"
1271
+ ]
1272
+ },
1273
+ "11a6ba8c94": {
1274
+ "person": [
1275
+ "1",
1276
+ "2"
1277
+ ]
1278
+ },
1279
+ "11c722a456": {
1280
+ "turtle": [
1281
+ "1",
1282
+ "2"
1283
+ ]
1284
+ },
1285
+ "11cbcb0b4d": {
1286
+ "zebra": [
1287
+ "1"
1288
+ ]
1289
+ },
1290
+ "11ccf5e99d": {
1291
+ "person": [
1292
+ "2"
1293
+ ]
1294
+ },
1295
+ "11ce6f452e": {
1296
+ "person": [
1297
+ "1",
1298
+ "2",
1299
+ "3"
1300
+ ]
1301
+ },
1302
+ "11feabe596": {
1303
+ "rabbit": [
1304
+ "1"
1305
+ ]
1306
+ },
1307
+ "120cb9514d": {
1308
+ "person": [
1309
+ "1",
1310
+ "2",
1311
+ "3"
1312
+ ]
1313
+ },
1314
+ "12156b25b3": {
1315
+ "person": [
1316
+ "1"
1317
+ ]
1318
+ },
1319
+ "122896672d": {
1320
+ "person": [
1321
+ "1",
1322
+ "3"
1323
+ ]
1324
+ },
1325
+ "1233ac8596": {
1326
+ "dog": [
1327
+ "1"
1328
+ ]
1329
+ },
1330
+ "1239c87234": {
1331
+ "lizard": [
1332
+ "1"
1333
+ ]
1334
+ },
1335
+ "1250423f7c": {
1336
+ "person": [
1337
+ "2"
1338
+ ],
1339
+ "elephant": [
1340
+ "3",
1341
+ "4"
1342
+ ]
1343
+ },
1344
+ "1257a1bc67": {
1345
+ "snake": [
1346
+ "1"
1347
+ ]
1348
+ },
1349
+ "125d1b19dd": {
1350
+ "giant_panda": [
1351
+ "1",
1352
+ "2"
1353
+ ]
1354
+ },
1355
+ "126d203967": {
1356
+ "person": [
1357
+ "2"
1358
+ ]
1359
+ },
1360
+ "1295e19071": {
1361
+ "airplane": [
1362
+ "1"
1363
+ ]
1364
+ },
1365
+ "12ad198c54": {
1366
+ "person": [
1367
+ "1"
1368
+ ]
1369
+ },
1370
+ "12bddb2bcb": {
1371
+ "person": [
1372
+ "2"
1373
+ ]
1374
+ },
1375
+ "12ec9b93ee": {
1376
+ "giant_panda": [
1377
+ "1"
1378
+ ]
1379
+ },
1380
+ "12eebedc35": {
1381
+ "bird": [
1382
+ "1"
1383
+ ]
1384
+ },
1385
+ "132852e094": {
1386
+ "fox": [
1387
+ "1"
1388
+ ]
1389
+ },
1390
+ "1329409f2a": {
1391
+ "fish": [
1392
+ "1"
1393
+ ]
1394
+ },
1395
+ "13325cfa14": {
1396
+ "person": [
1397
+ "2"
1398
+ ]
1399
+ },
1400
+ "1336440745": {
1401
+ "mouse": [
1402
+ "1",
1403
+ "2"
1404
+ ]
1405
+ },
1406
+ "134d06dbf9": {
1407
+ "cat": [
1408
+ "1"
1409
+ ]
1410
+ },
1411
+ "135625b53d": {
1412
+ "parrot": [
1413
+ "1"
1414
+ ]
1415
+ },
1416
+ "13870016f9": {
1417
+ "cow": [
1418
+ "2",
1419
+ "3"
1420
+ ],
1421
+ "person": [
1422
+ "1"
1423
+ ]
1424
+ },
1425
+ "13960b3c84": {
1426
+ "giraffe": [
1427
+ "1",
1428
+ "2",
1429
+ "3"
1430
+ ]
1431
+ },
1432
+ "13adaad9d9": {
1433
+ "giant_panda": [
1434
+ "1"
1435
+ ]
1436
+ },
1437
+ "13ae097e20": {
1438
+ "giant_panda": [
1439
+ "1"
1440
+ ]
1441
+ },
1442
+ "13e3070469": {
1443
+ "zebra": [
1444
+ "1",
1445
+ "2",
1446
+ "3"
1447
+ ]
1448
+ },
1449
+ "13f6a8c20d": {
1450
+ "fish": [
1451
+ "1"
1452
+ ]
1453
+ },
1454
+ "1416925cf2": {
1455
+ "truck": [
1456
+ "1",
1457
+ "2"
1458
+ ]
1459
+ },
1460
+ "142d2621f5": {
1461
+ "motorbike": [
1462
+ "3"
1463
+ ],
1464
+ "person": [
1465
+ "1",
1466
+ "2"
1467
+ ]
1468
+ },
1469
+ "145d5d7c03": {
1470
+ "giant_panda": [
1471
+ "1"
1472
+ ]
1473
+ },
1474
+ "145fdc3ac5": {
1475
+ "lizard": [
1476
+ "1"
1477
+ ]
1478
+ },
1479
+ "1471274fa7": {
1480
+ "person": [
1481
+ "1"
1482
+ ]
1483
+ },
1484
+ "14a6b5a139": {
1485
+ "fish": [
1486
+ "1"
1487
+ ]
1488
+ },
1489
+ "14c21cea0d": {
1490
+ "monkey": [
1491
+ "1",
1492
+ "2"
1493
+ ]
1494
+ },
1495
+ "14dae0dc93": {
1496
+ "person": [
1497
+ "2"
1498
+ ]
1499
+ },
1500
+ "14f9bd22b5": {
1501
+ "tiger": [
1502
+ "1"
1503
+ ]
1504
+ },
1505
+ "14fd28ae99": {
1506
+ "parrot": [
1507
+ "1"
1508
+ ]
1509
+ },
1510
+ "15097d5d4e": {
1511
+ "parrot": [
1512
+ "1"
1513
+ ]
1514
+ },
1515
+ "150ea711f2": {
1516
+ "whale": [
1517
+ "1"
1518
+ ]
1519
+ },
1520
+ "1514e3563f": {
1521
+ "earless_seal": [
1522
+ "1",
1523
+ "2"
1524
+ ]
1525
+ },
1526
+ "152aaa3a9e": {
1527
+ "raccoon": [
1528
+ "1"
1529
+ ]
1530
+ },
1531
+ "152b7d3bd7": {
1532
+ "giant_panda": [
1533
+ "1"
1534
+ ]
1535
+ },
1536
+ "15617297cc": {
1537
+ "person": [
1538
+ "1"
1539
+ ]
1540
+ },
1541
+ "15abbe0c52": {
1542
+ "person": [
1543
+ "1"
1544
+ ]
1545
+ },
1546
+ "15d1fb3de5": {
1547
+ "cat": [
1548
+ "2"
1549
+ ],
1550
+ "owl": [
1551
+ "1"
1552
+ ]
1553
+ },
1554
+ "15f67b0fab": {
1555
+ "person": [
1556
+ "1"
1557
+ ]
1558
+ },
1559
+ "161eb59aad": {
1560
+ "cow": [
1561
+ "2",
1562
+ "3"
1563
+ ],
1564
+ "giraffe": [
1565
+ "1"
1566
+ ]
1567
+ },
1568
+ "16288ea47f": {
1569
+ "duck": [
1570
+ "1",
1571
+ "2"
1572
+ ]
1573
+ },
1574
+ "164410ce62": {
1575
+ "person": [
1576
+ "1"
1577
+ ]
1578
+ },
1579
+ "165c3c8cd4": {
1580
+ "person": [
1581
+ "1",
1582
+ "2",
1583
+ "3"
1584
+ ]
1585
+ },
1586
+ "165c42b41b": {
1587
+ "motorbike": [
1588
+ "2",
1589
+ "3"
1590
+ ],
1591
+ "person": [
1592
+ "1",
1593
+ "4"
1594
+ ]
1595
+ },
1596
+ "165ec9e22b": {
1597
+ "person": [
1598
+ "1",
1599
+ "2"
1600
+ ]
1601
+ },
1602
+ "1669502269": {
1603
+ "person": [
1604
+ "1"
1605
+ ]
1606
+ },
1607
+ "16763cccbb": {
1608
+ "ape": [
1609
+ "1"
1610
+ ]
1611
+ },
1612
+ "16adde065e": {
1613
+ "cat": [
1614
+ "2"
1615
+ ],
1616
+ "person": [
1617
+ "3"
1618
+ ]
1619
+ },
1620
+ "16af445362": {
1621
+ "airplane": [
1622
+ "1"
1623
+ ]
1624
+ },
1625
+ "16afd538ad": {
1626
+ "parrot": [
1627
+ "1",
1628
+ "2"
1629
+ ]
1630
+ },
1631
+ "16c3fa4d5d": {
1632
+ "sedan": [
1633
+ "1"
1634
+ ]
1635
+ },
1636
+ "16d1d65c27": {
1637
+ "monkey": [
1638
+ "1"
1639
+ ]
1640
+ },
1641
+ "16e8599e94": {
1642
+ "giant_panda": [
1643
+ "1"
1644
+ ]
1645
+ },
1646
+ "16fe9fb444": {
1647
+ "motorbike": [
1648
+ "1"
1649
+ ],
1650
+ "person": [
1651
+ "2"
1652
+ ]
1653
+ },
1654
+ "1705796b02": {
1655
+ "train": [
1656
+ "1"
1657
+ ]
1658
+ },
1659
+ "1724db7671": {
1660
+ "giant_panda": [
1661
+ "1"
1662
+ ]
1663
+ },
1664
+ "17418e81ea": {
1665
+ "shark": [
1666
+ "1"
1667
+ ]
1668
+ },
1669
+ "175169edbb": {
1670
+ "ape": [
1671
+ "1",
1672
+ "2"
1673
+ ]
1674
+ },
1675
+ "17622326fd": {
1676
+ "lizard": [
1677
+ "1"
1678
+ ]
1679
+ },
1680
+ "17656bae77": {
1681
+ "elephant": [
1682
+ "1"
1683
+ ]
1684
+ },
1685
+ "17b0d94172": {
1686
+ "airplane": [
1687
+ "1"
1688
+ ]
1689
+ },
1690
+ "17c220e4f6": {
1691
+ "giant_panda": [
1692
+ "1"
1693
+ ]
1694
+ },
1695
+ "17c7bcd146": {
1696
+ "train": [
1697
+ "1"
1698
+ ]
1699
+ },
1700
+ "17cb4afe89": {
1701
+ "tiger": [
1702
+ "1"
1703
+ ]
1704
+ },
1705
+ "17cd79a434": {
1706
+ "squirrel": [
1707
+ "1"
1708
+ ]
1709
+ },
1710
+ "17d18604c3": {
1711
+ "person": [
1712
+ "1",
1713
+ "2"
1714
+ ]
1715
+ },
1716
+ "17d8ca1a37": {
1717
+ "person": [
1718
+ "2"
1719
+ ],
1720
+ "owl": [
1721
+ "1"
1722
+ ]
1723
+ },
1724
+ "17e33f4330": {
1725
+ "monkey": [
1726
+ "1"
1727
+ ]
1728
+ },
1729
+ "17f7a6d805": {
1730
+ "snail": [
1731
+ "1"
1732
+ ]
1733
+ },
1734
+ "180abc8378": {
1735
+ "person": [
1736
+ "2"
1737
+ ],
1738
+ "owl": [
1739
+ "1"
1740
+ ]
1741
+ },
1742
+ "183ba3d652": {
1743
+ "motorbike": [
1744
+ "3"
1745
+ ],
1746
+ "person": [
1747
+ "2"
1748
+ ]
1749
+ },
1750
+ "185bf64702": {
1751
+ "zebra": [
1752
+ "1",
1753
+ "2"
1754
+ ]
1755
+ },
1756
+ "18913cc690": {
1757
+ "train": [
1758
+ "1"
1759
+ ]
1760
+ },
1761
+ "1892651815": {
1762
+ "camel": [
1763
+ "1"
1764
+ ]
1765
+ },
1766
+ "189ac8208a": {
1767
+ "giraffe": [
1768
+ "1",
1769
+ "2"
1770
+ ]
1771
+ },
1772
+ "189b44e92c": {
1773
+ "zebra": [
1774
+ "1"
1775
+ ]
1776
+ },
1777
+ "18ac264b76": {
1778
+ "person": [
1779
+ "2"
1780
+ ]
1781
+ },
1782
+ "18b245ab49": {
1783
+ "penguin": [
1784
+ "1",
1785
+ "2",
1786
+ "3",
1787
+ "4"
1788
+ ]
1789
+ },
1790
+ "18b5cebc34": {
1791
+ "mouse": [
1792
+ "1"
1793
+ ]
1794
+ },
1795
+ "18bad52083": {
1796
+ "parrot": [
1797
+ "1",
1798
+ "2"
1799
+ ]
1800
+ },
1801
+ "18bb5144d5": {
1802
+ "lizard": [
1803
+ "1"
1804
+ ]
1805
+ },
1806
+ "18c6f205c5": {
1807
+ "person": [
1808
+ "1",
1809
+ "2",
1810
+ "3"
1811
+ ]
1812
+ },
1813
+ "1903f9ea15": {
1814
+ "bird": [
1815
+ "1",
1816
+ "2",
1817
+ "3"
1818
+ ]
1819
+ },
1820
+ "1917b209f2": {
1821
+ "horse": [
1822
+ "2"
1823
+ ],
1824
+ "cow": [
1825
+ "3",
1826
+ "4"
1827
+ ],
1828
+ "person": [
1829
+ "1"
1830
+ ]
1831
+ },
1832
+ "191e74c01d": {
1833
+ "deer": [
1834
+ "1"
1835
+ ]
1836
+ },
1837
+ "19367bb94e": {
1838
+ "fish": [
1839
+ "1",
1840
+ "2",
1841
+ "3"
1842
+ ]
1843
+ },
1844
+ "193ffaa217": {
1845
+ "person": [
1846
+ "1",
1847
+ "2",
1848
+ "3"
1849
+ ]
1850
+ },
1851
+ "19696b67d3": {
1852
+ "cow": [
1853
+ "1"
1854
+ ]
1855
+ },
1856
+ "197f3ab6f3": {
1857
+ "giant_panda": [
1858
+ "1"
1859
+ ]
1860
+ },
1861
+ "1981e763cc": {
1862
+ "sheep": [
1863
+ "1",
1864
+ "2"
1865
+ ]
1866
+ },
1867
+ "198afe39ae": {
1868
+ "person": [
1869
+ "1"
1870
+ ]
1871
+ },
1872
+ "19a6e62b9b": {
1873
+ "monkey": [
1874
+ "1",
1875
+ "2"
1876
+ ]
1877
+ },
1878
+ "19b60d5335": {
1879
+ "hedgehog": [
1880
+ "1"
1881
+ ]
1882
+ },
1883
+ "19c00c11f9": {
1884
+ "person": [
1885
+ "1"
1886
+ ]
1887
+ },
1888
+ "19e061eb88": {
1889
+ "boat": [
1890
+ "1",
1891
+ "2"
1892
+ ]
1893
+ },
1894
+ "19e8bc6178": {
1895
+ "dog": [
1896
+ "1"
1897
+ ]
1898
+ },
1899
+ "19ee80dac6": {
1900
+ "person": [
1901
+ "1",
1902
+ "3",
1903
+ "4"
1904
+ ]
1905
+ },
1906
+ "1a25a9170a": {
1907
+ "cow": [
1908
+ "1"
1909
+ ],
1910
+ "person": [
1911
+ "2",
1912
+ "3"
1913
+ ]
1914
+ },
1915
+ "1a359a6c1a": {
1916
+ "sheep": [
1917
+ "1"
1918
+ ]
1919
+ },
1920
+ "1a3e87c566": {
1921
+ "frog": [
1922
+ "1"
1923
+ ]
1924
+ },
1925
+ "1a5fe06b00": {
1926
+ "bus": [
1927
+ "1"
1928
+ ]
1929
+ },
1930
+ "1a6c0fbd1e": {
1931
+ "person": [
1932
+ "1"
1933
+ ]
1934
+ },
1935
+ "1a6f3b5a4b": {
1936
+ "sedan": [
1937
+ "3"
1938
+ ]
1939
+ },
1940
+ "1a8afbad92": {
1941
+ "zebra": [
1942
+ "1",
1943
+ "2",
1944
+ "3"
1945
+ ]
1946
+ },
1947
+ "1a8bdc5842": {
1948
+ "parrot": [
1949
+ "1",
1950
+ "2"
1951
+ ]
1952
+ },
1953
+ "1a95752aca": {
1954
+ "duck": [
1955
+ "1",
1956
+ "2"
1957
+ ]
1958
+ },
1959
+ "1a9c131cb7": {
1960
+ "ape": [
1961
+ "1",
1962
+ "2",
1963
+ "3"
1964
+ ]
1965
+ },
1966
+ "1aa3da3ee3": {
1967
+ "sheep": [
1968
+ "1",
1969
+ "2",
1970
+ "3",
1971
+ "4"
1972
+ ]
1973
+ },
1974
+ "1ab27ec7ea": {
1975
+ "deer": [
1976
+ "1"
1977
+ ]
1978
+ },
1979
+ "1abf16d21d": {
1980
+ "turtle": [
1981
+ "1"
1982
+ ]
1983
+ },
1984
+ "1acd0f993b": {
1985
+ "dog": [
1986
+ "1"
1987
+ ],
1988
+ "person": [
1989
+ "3"
1990
+ ]
1991
+ },
1992
+ "1ad202e499": {
1993
+ "lizard": [
1994
+ "1",
1995
+ "2"
1996
+ ]
1997
+ },
1998
+ "1af8d2395d": {
1999
+ "airplane": [
2000
+ "4"
2001
+ ],
2002
+ "person": [
2003
+ "1",
2004
+ "2"
2005
+ ]
2006
+ },
2007
+ "1afd39a1fa": {
2008
+ "motorbike": [
2009
+ "2"
2010
+ ]
2011
+ },
2012
+ "1b2d31306f": {
2013
+ "lizard": [
2014
+ "1"
2015
+ ]
2016
+ },
2017
+ "1b3fa67f0e": {
2018
+ "airplane": [
2019
+ "1"
2020
+ ]
2021
+ },
2022
+ "1b43fa74b4": {
2023
+ "owl": [
2024
+ "1",
2025
+ "2"
2026
+ ]
2027
+ },
2028
+ "1b73ea9fc2": {
2029
+ "parrot": [
2030
+ "1"
2031
+ ]
2032
+ },
2033
+ "1b7e8bb255": {
2034
+ "person": [
2035
+ "2"
2036
+ ]
2037
+ },
2038
+ "1b8680f8cd": {
2039
+ "person": [
2040
+ "2",
2041
+ "3"
2042
+ ]
2043
+ },
2044
+ "1b883843c0": {
2045
+ "person": [
2046
+ "1",
2047
+ "2"
2048
+ ]
2049
+ },
2050
+ "1b8898785b": {
2051
+ "monkey": [
2052
+ "1",
2053
+ "2"
2054
+ ]
2055
+ },
2056
+ "1b88ba1aa4": {
2057
+ "giant_panda": [
2058
+ "1"
2059
+ ]
2060
+ },
2061
+ "1b96a498e5": {
2062
+ "ape": [
2063
+ "1"
2064
+ ]
2065
+ },
2066
+ "1bbc4c274f": {
2067
+ "fish": [
2068
+ "2"
2069
+ ]
2070
+ },
2071
+ "1bd87fe9ab": {
2072
+ "train": [
2073
+ "1"
2074
+ ]
2075
+ },
2076
+ "1c4090c75b": {
2077
+ "whale": [
2078
+ "1"
2079
+ ]
2080
+ },
2081
+ "1c41934f84": {
2082
+ "elephant": [
2083
+ "1",
2084
+ "2"
2085
+ ]
2086
+ },
2087
+ "1c72b04b56": {
2088
+ "lion": [
2089
+ "1"
2090
+ ]
2091
+ },
2092
+ "1c87955a3a": {
2093
+ "turtle": [
2094
+ "2"
2095
+ ],
2096
+ "crocodile": [
2097
+ "1"
2098
+ ]
2099
+ },
2100
+ "1c9f9eb792": {
2101
+ "person": [
2102
+ "2"
2103
+ ]
2104
+ },
2105
+ "1ca240fede": {
2106
+ "train": [
2107
+ "1"
2108
+ ]
2109
+ },
2110
+ "1ca5673803": {
2111
+ "person": [
2112
+ "1",
2113
+ "3"
2114
+ ]
2115
+ },
2116
+ "1cada35274": {
2117
+ "duck": [
2118
+ "1"
2119
+ ]
2120
+ },
2121
+ "1cb44b920d": {
2122
+ "eagle": [
2123
+ "1",
2124
+ "2"
2125
+ ]
2126
+ },
2127
+ "1cd10e62be": {
2128
+ "leopard": [
2129
+ "1"
2130
+ ]
2131
+ },
2132
+ "1d3087d5e5": {
2133
+ "fish": [
2134
+ "1",
2135
+ "2",
2136
+ "3",
2137
+ "4",
2138
+ "5"
2139
+ ]
2140
+ },
2141
+ "1d3685150a": {
2142
+ "person": [
2143
+ "1",
2144
+ "3"
2145
+ ]
2146
+ },
2147
+ "1d6ff083aa": {
2148
+ "person": [
2149
+ "1",
2150
+ "2"
2151
+ ]
2152
+ }
2153
+ }
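The file maps each video id to the object ids that carry each annotated category. A minimal sketch of loading and querying it with the standard library (the path is the file added here; the example entry is the first one shown above):
```
import json

# Load the {video_id: {category: [object ids]}} map written by the captioning script
with open("mbench/numbered_valid_obj_ids.json") as f:
    valid_obj_ids = json.load(f)

# Object ids annotated as "penguin" in video "003234408d"
print(valid_obj_ids["003234408d"]["penguin"])   # ['1', '2', '3', '4', '5']
```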
mbench/numbered_valid_obj_ids_gpt-4o.json ADDED
@@ -0,0 +1,2153 @@
1
+ {
2
+ "003234408d": {
3
+ "penguin": [
4
+ "1",
5
+ "2",
6
+ "3",
7
+ "4",
8
+ "5"
9
+ ]
10
+ },
11
+ "0043f083b5": {
12
+ "sedan": [
13
+ "2",
14
+ "3"
15
+ ],
16
+ "bus": [
17
+ "1"
18
+ ]
19
+ },
20
+ "0044fa5fba": {
21
+ "giant_panda": [
22
+ "1"
23
+ ]
24
+ },
25
+ "005a527edd": {
26
+ "ape": [
27
+ "1",
28
+ "2"
29
+ ]
30
+ },
31
+ "0065b171f9": {
32
+ "giant_panda": [
33
+ "1"
34
+ ]
35
+ },
36
+ "00917dcfc4": {
37
+ "zebra": [
38
+ "1",
39
+ "2",
40
+ "3"
41
+ ]
42
+ },
43
+ "00a23ccf53": {
44
+ "shark": [
45
+ "1"
46
+ ]
47
+ },
48
+ "00ad5016a4": {
49
+ "airplane": [
50
+ "1"
51
+ ]
52
+ },
53
+ "01082ae388": {
54
+ "leopard": [
55
+ "1"
56
+ ]
57
+ },
58
+ "011ac0a06f": {
59
+ "ape": [
60
+ "1",
61
+ "2",
62
+ "3",
63
+ "4",
64
+ "5"
65
+ ]
66
+ },
67
+ "013099c098": {
68
+ "giant_panda": [
69
+ "1",
70
+ "2"
71
+ ]
72
+ },
73
+ "0155498c85": {
74
+ "person": [
75
+ "1"
76
+ ],
77
+ "motorbike": [
78
+ "2"
79
+ ]
80
+ },
81
+ "01694ad9c8": {
82
+ "bird": [
83
+ "1"
84
+ ]
85
+ },
86
+ "017ac35701": {
87
+ "giant_panda": [
88
+ "1"
89
+ ]
90
+ },
91
+ "01b80e8e1a": {
92
+ "zebra": [
93
+ "1",
94
+ "2"
95
+ ]
96
+ },
97
+ "01baa5a4e1": {},
98
+ "01c3111683": {
99
+ "whale": [
100
+ "1"
101
+ ]
102
+ },
103
+ "01c4cb5ffe": {
104
+ "person": [
105
+ "1",
106
+ "3"
107
+ ]
108
+ },
109
+ "01c76f0a82": {
110
+ "sedan": [
111
+ "1",
112
+ "4"
113
+ ]
114
+ },
115
+ "01c783268c": {
116
+ "person": [
117
+ "2"
118
+ ],
119
+ "ape": [
120
+ "1"
121
+ ]
122
+ },
123
+ "01e64dd36a": {
124
+ "cow": [
125
+ "1",
126
+ "2",
127
+ "3"
128
+ ]
129
+ },
130
+ "01ed275c6e": {
131
+ "giraffe": [
132
+ "1",
133
+ "2"
134
+ ]
135
+ },
136
+ "01ff60d1fa": {
137
+ "lizard": [
138
+ "1"
139
+ ]
140
+ },
141
+ "020cd28cd2": {
142
+ "person": [
143
+ "1"
144
+ ]
145
+ },
146
+ "02264db755": {
147
+ "fox": [
148
+ "1"
149
+ ]
150
+ },
151
+ "0248626d9a": {
152
+ "train": [
153
+ "1"
154
+ ]
155
+ },
156
+ "02668dbffa": {
157
+ "frog": [
158
+ "1"
159
+ ]
160
+ },
161
+ "0274193026": {
162
+ "person": [
163
+ "2"
164
+ ]
165
+ },
166
+ "02d28375aa": {
167
+ "fox": [
168
+ "1"
169
+ ]
170
+ },
171
+ "031ccc99b1": {
172
+ "person": [
173
+ "1",
174
+ "2",
175
+ "3"
176
+ ]
177
+ },
178
+ "0321b18c10": {
179
+ "elephant": [
180
+ "3"
181
+ ],
182
+ "person": [
183
+ "1",
184
+ "2"
185
+ ]
186
+ },
187
+ "0348a45bca": {
188
+ "fish": [
189
+ "1",
190
+ "2",
191
+ "3",
192
+ "4",
193
+ "5"
194
+ ]
195
+ },
196
+ "0355e92655": {
197
+ "person": [
198
+ "2"
199
+ ],
200
+ "boat": [
201
+ "3"
202
+ ]
203
+ },
204
+ "0358b938c1": {
205
+ "elephant": [
206
+ "1",
207
+ "2",
208
+ "3",
209
+ "4"
210
+ ]
211
+ },
212
+ "0368107cf1": {
213
+ "person": [
214
+ "1",
215
+ "2"
216
+ ]
217
+ },
218
+ "0379ddf557": {
219
+ "person": [
220
+ "1"
221
+ ]
222
+ },
223
+ "038b2cc71d": {
224
+ "lizard": [
225
+ "1"
226
+ ]
227
+ },
228
+ "038c15a5dd": {
229
+ "hedgehog": [
230
+ "1"
231
+ ]
232
+ },
233
+ "03a06cc98a": {
234
+ "giraffe": [
235
+ "1",
236
+ "2",
237
+ "3"
238
+ ]
239
+ },
240
+ "03a63e187f": {
241
+ "lizard": [
242
+ "1"
243
+ ]
244
+ },
245
+ "03c95b4dae": {
246
+ "elephant": [
247
+ "1",
248
+ "2",
249
+ "3"
250
+ ]
251
+ },
252
+ "03e2b57b0e": {
253
+ "lizard": [
254
+ "1"
255
+ ]
256
+ },
257
+ "04194e1248": {
258
+ "lizard": [
259
+ "1"
260
+ ]
261
+ },
262
+ "04259896e2": {
263
+ "lizard": [
264
+ "1"
265
+ ]
266
+ },
267
+ "0444918a5f": {
268
+ "truck": [
269
+ "1",
270
+ "2",
271
+ "3",
272
+ "4"
273
+ ]
274
+ },
275
+ "04460a7a52": {
276
+ "lizard": [
277
+ "1"
278
+ ]
279
+ },
280
+ "04474174a4": {
281
+ "ape": [
282
+ "1",
283
+ "2"
284
+ ]
285
+ },
286
+ "0450095513": {
287
+ "snail": [
288
+ "1"
289
+ ]
290
+ },
291
+ "045f00aed2": {
292
+ "tiger": [
293
+ "1"
294
+ ],
295
+ "person": [
296
+ "3"
297
+ ]
298
+ },
299
+ "04667fabaa": {
300
+ "parrot": [
301
+ "1"
302
+ ]
303
+ },
304
+ "04735c5030": {
305
+ "cat": [
306
+ "1",
307
+ "2"
308
+ ]
309
+ },
310
+ "04990d1915": {
311
+ "bus": [
312
+ "2"
313
+ ],
314
+ "truck": [
315
+ "3"
316
+ ],
317
+ "sedan": [
318
+ "1"
319
+ ]
320
+ },
321
+ "04d62d9d98": {
322
+ "person": [
323
+ "1"
324
+ ]
325
+ },
326
+ "04f21da964": {
327
+ "monkey": [
328
+ "1"
329
+ ]
330
+ },
331
+ "04fbad476e": {
332
+ "parrot": [
333
+ "1"
334
+ ]
335
+ },
336
+ "04fe256562": {
337
+ "truck": [
338
+ "2"
339
+ ],
340
+ "motorbike": [
341
+ "1"
342
+ ]
343
+ },
344
+ "0503bf89c9": {
345
+ "hedgehog": [
346
+ "1"
347
+ ]
348
+ },
349
+ "0536c9eed0": {
350
+ "cat": [
351
+ "1"
352
+ ]
353
+ },
354
+ "054acb238f": {
355
+ "owl": [
356
+ "1"
357
+ ]
358
+ },
359
+ "05579ca250": {
360
+ "person": [
361
+ "1"
362
+ ],
363
+ "sedan": [
364
+ "3"
365
+ ]
366
+ },
367
+ "056c200404": {},
368
+ "05774f3a2c": {
369
+ "ape": [
370
+ "1",
371
+ "2",
372
+ "3"
373
+ ]
374
+ },
375
+ "058a7592c8": {
376
+ "train": [
377
+ "1"
378
+ ]
379
+ },
380
+ "05a0a513df": {
381
+ "person": [
382
+ "1",
383
+ "2"
384
+ ]
385
+ },
386
+ "05a569d8aa": {
387
+ "mouse": [
388
+ "2"
389
+ ],
390
+ "cat": [
391
+ "1"
392
+ ]
393
+ },
394
+ "05aa652648": {
395
+ "ape": [
396
+ "1"
397
+ ]
398
+ },
399
+ "05d7715782": {},
400
+ "05e0b0f28f": {
401
+ "mouse": [
402
+ "1"
403
+ ],
404
+ "person": [
405
+ "2"
406
+ ]
407
+ },
408
+ "05fdbbdd7a": {},
409
+ "05ffcfed85": {
410
+ "monkey": [
411
+ "1",
412
+ "2"
413
+ ]
414
+ },
415
+ "0630391881": {
416
+ "person": [
417
+ "1"
418
+ ]
419
+ },
420
+ "06840b2bbe": {
421
+ "snake": [
422
+ "1"
423
+ ]
424
+ },
425
+ "068f7dce6f": {
426
+ "shark": [
427
+ "1"
428
+ ]
429
+ },
430
+ "0693719753": {
431
+ "turtle": [
432
+ "1",
433
+ "2"
434
+ ]
435
+ },
436
+ "06ce2b51fb": {
437
+ "person": [
438
+ "1",
439
+ "2"
440
+ ]
441
+ },
442
+ "06e224798e": {
443
+ "tiger": [
444
+ "1"
445
+ ]
446
+ },
447
+ "06ee361788": {
448
+ "duck": [
449
+ "1",
450
+ "2",
451
+ "3"
452
+ ]
453
+ },
454
+ "06fbb3fa2c": {
455
+ "eagle": [
456
+ "1"
457
+ ]
458
+ },
459
+ "0700264286": {
460
+ "cow": [
461
+ "1",
462
+ "2"
463
+ ]
464
+ },
465
+ "070c918ca7": {
466
+ "parrot": [
467
+ "1"
468
+ ]
469
+ },
470
+ "07129e14a4": {
471
+ "parrot": [
472
+ "1",
473
+ "2"
474
+ ],
475
+ "person": [
476
+ "3"
477
+ ]
478
+ },
479
+ "07177017e9": {
480
+ "motorbike": [
481
+ "1",
482
+ "2"
483
+ ]
484
+ },
485
+ "07238ffc58": {
486
+ "monkey": [
487
+ "1",
488
+ "2",
489
+ "3"
490
+ ]
491
+ },
492
+ "07353b2a89": {
493
+ "sheep": [
494
+ "1",
495
+ "2",
496
+ "3",
497
+ "4"
498
+ ]
499
+ },
500
+ "0738493cbf": {
501
+ "airplane": [
502
+ "1"
503
+ ]
504
+ },
505
+ "075926c651": {
506
+ "person": [
507
+ "1",
508
+ "2"
509
+ ]
510
+ },
511
+ "075c701292": {
512
+ "duck": [
513
+ "1",
514
+ "2",
515
+ "3",
516
+ "4"
517
+ ]
518
+ },
519
+ "0762ea9a30": {
520
+ "person": [
521
+ "1"
522
+ ]
523
+ },
524
+ "07652ee4af": {
525
+ "person": [
526
+ "1"
527
+ ]
528
+ },
529
+ "076f206928": {
530
+ "zebra": [
531
+ "1",
532
+ "2"
533
+ ],
534
+ "person": [
535
+ "3"
536
+ ]
537
+ },
538
+ "077d32af19": {
539
+ "train": [
540
+ "4"
541
+ ],
542
+ "person": [
543
+ "1",
544
+ "2",
545
+ "3"
546
+ ]
547
+ },
548
+ "079049275c": {
549
+ "mouse": [
550
+ "1"
551
+ ]
552
+ },
553
+ "07913cdda7": {
554
+ "train": [
555
+ "1"
556
+ ],
557
+ "person": [
558
+ "2",
559
+ "3"
560
+ ]
561
+ },
562
+ "07a11a35e8": {
563
+ "ape": [
564
+ "1",
565
+ "2"
566
+ ]
567
+ },
568
+ "07ac33b6df": {
569
+ "ape": [
570
+ "1"
571
+ ]
572
+ },
573
+ "07c62c3d11": {
574
+ "parrot": [
575
+ "1",
576
+ "2",
577
+ "3"
578
+ ]
579
+ },
580
+ "07cc1c7d74": {
581
+ "snake": [
582
+ "1"
583
+ ]
584
+ },
585
+ "080196ef01": {
586
+ "lizard": [
587
+ "1"
588
+ ]
589
+ },
590
+ "081207976e": {},
591
+ "081ae4fa44": {
592
+ "shark": [
593
+ "1",
594
+ "2"
595
+ ]
596
+ },
597
+ "081d8250cb": {
598
+ "person": [
599
+ "1"
600
+ ],
601
+ "sedan": [
602
+ "3"
603
+ ]
604
+ },
605
+ "082900c5d4": {
606
+ "duck": [
607
+ "1",
608
+ "2",
609
+ "3"
610
+ ]
611
+ },
612
+ "0860df21e2": {},
613
+ "0866d4c5e3": {
614
+ "bird": [
615
+ "1",
616
+ "2",
617
+ "3"
618
+ ]
619
+ },
620
+ "0891ac2eb6": {
621
+ "person": [
622
+ "1",
623
+ "2",
624
+ "3"
625
+ ]
626
+ },
627
+ "08931bc458": {
628
+ "person": [
629
+ "1"
630
+ ]
631
+ },
632
+ "08aa2705d5": {
633
+ "snake": [
634
+ "1"
635
+ ]
636
+ },
637
+ "08c8450db7": {},
638
+ "08d50b926c": {
639
+ "turtle": [
640
+ "1",
641
+ "2"
642
+ ]
643
+ },
644
+ "08e1e4de15": {
645
+ "monkey": [
646
+ "1",
647
+ "2",
648
+ "3",
649
+ "4"
650
+ ]
651
+ },
652
+ "08e48c1a48": {
653
+ "cow": [
654
+ "1"
655
+ ]
656
+ },
657
+ "08f561c65e": {
658
+ "giant_panda": [
659
+ "1"
660
+ ],
661
+ "person": [
662
+ "2"
663
+ ]
664
+ },
665
+ "08feb87790": {
666
+ "sheep": [
667
+ "1"
668
+ ]
669
+ },
670
+ "09049f6fe3": {
671
+ "mouse": [
672
+ "1",
673
+ "2"
674
+ ]
675
+ },
676
+ "092e4ff450": {
677
+ "snake": [
678
+ "1"
679
+ ]
680
+ },
681
+ "09338adea8": {
682
+ "whale": [
683
+ "1",
684
+ "2"
685
+ ]
686
+ },
687
+ "093c335ccc": {
688
+ "person": [
689
+ "2"
690
+ ]
691
+ },
692
+ "0970d28339": {
693
+ "ape": [
694
+ "1",
695
+ "2"
696
+ ]
697
+ },
698
+ "0974a213dc": {
699
+ "giraffe": [
700
+ "1",
701
+ "2",
702
+ "3"
703
+ ]
704
+ },
705
+ "097b471ed8": {
706
+ "cat": [
707
+ "1",
708
+ "2"
709
+ ]
710
+ },
711
+ "0990941758": {
712
+ "giant_panda": [
713
+ "1"
714
+ ]
715
+ },
716
+ "09a348f4fa": {
717
+ "lizard": [
718
+ "1"
719
+ ]
720
+ },
721
+ "09a6841288": {
722
+ "duck": [
723
+ "1",
724
+ "2"
725
+ ]
726
+ },
727
+ "09c5bad17b": {
728
+ "airplane": [
729
+ "1"
730
+ ]
731
+ },
732
+ "09c9ce80c7": {
733
+ "giant_panda": [
734
+ "1"
735
+ ]
736
+ },
737
+ "09ff54fef4": {
738
+ "fox": [
739
+ "1",
740
+ "2"
741
+ ]
742
+ },
743
+ "0a23765d15": {
744
+ "person": [
745
+ "1",
746
+ "2"
747
+ ]
748
+ },
749
+ "0a275e7f12": {
750
+ "elephant": [
751
+ "1"
752
+ ]
753
+ },
754
+ "0a2f2bd294": {
755
+ "motorbike": [
756
+ "1"
757
+ ]
758
+ },
759
+ "0a7a2514aa": {
760
+ "lizard": [
761
+ "2"
762
+ ],
763
+ "cat": [
764
+ "1"
765
+ ]
766
+ },
767
+ "0a7b27fde9": {
768
+ "parrot": [
769
+ "1",
770
+ "2"
771
+ ]
772
+ },
773
+ "0a8c467cc3": {
774
+ "fish": [
775
+ "1",
776
+ "2",
777
+ "3"
778
+ ]
779
+ },
780
+ "0ac8c560ae": {
781
+ "person": [
782
+ "2",
783
+ "3"
784
+ ]
785
+ },
786
+ "0b1627e896": {
787
+ "boat": [
788
+ "1"
789
+ ]
790
+ },
791
+ "0b285c47f6": {
792
+ "mouse": [
793
+ "1"
794
+ ]
795
+ },
796
+ "0b34ec1d55": {
797
+ "ape": [
798
+ "1"
799
+ ]
800
+ },
801
+ "0b5b5e8e5a": {
802
+ "person": [
803
+ "1"
804
+ ],
805
+ "sedan": [
806
+ "2"
807
+ ]
808
+ },
809
+ "0b68535614": {
810
+ "rabbit": [
811
+ "1"
812
+ ]
813
+ },
814
+ "0b6f9105fc": {
815
+ "rabbit": [
816
+ "1"
817
+ ]
818
+ },
819
+ "0b7dbfa3cb": {
820
+ "cow": [
821
+ "1"
822
+ ]
823
+ },
824
+ "0b9cea51ca": {
825
+ "whale": [
826
+ "1"
827
+ ]
828
+ },
829
+ "0b9d012be8": {
830
+ "camel": [
831
+ "1"
832
+ ]
833
+ },
834
+ "0bcfc4177d": {
835
+ "truck": [
836
+ "1"
837
+ ]
838
+ },
839
+ "0bd37b23c1": {
840
+ "motorbike": [
841
+ "1"
842
+ ]
843
+ },
844
+ "0bd864064c": {
845
+ "eagle": [
846
+ "1"
847
+ ]
848
+ },
849
+ "0c11c6bf7b": {
850
+ "deer": [
851
+ "1"
852
+ ]
853
+ },
854
+ "0c26bc77ac": {
855
+ "crocodile": [
856
+ "1"
857
+ ]
858
+ },
859
+ "0c3a04798c": {
860
+ "duck": [
861
+ "1"
862
+ ],
863
+ "fish": [
864
+ "2"
865
+ ]
866
+ },
867
+ "0c44a9d545": {
868
+ "tiger": [
869
+ "1"
870
+ ]
871
+ },
872
+ "0c817cc390": {
873
+ "hedgehog": [
874
+ "1"
875
+ ],
876
+ "dog": [
877
+ "2"
878
+ ]
879
+ },
880
+ "0ca839ee9a": {
881
+ "ape": [
882
+ "1",
883
+ "2"
884
+ ]
885
+ },
886
+ "0cd7ac0ac0": {
887
+ "rabbit": [
888
+ "1"
889
+ ]
890
+ },
891
+ "0ce06e0121": {
892
+ "parrot": [
893
+ "1",
894
+ "2"
895
+ ]
896
+ },
897
+ "0cfe974a89": {
898
+ "turtle": [
899
+ "1",
900
+ "2"
901
+ ]
902
+ },
903
+ "0d2fcc0dcd": {
904
+ "zebra": [
905
+ "1",
906
+ "2",
907
+ "3",
908
+ "4"
909
+ ]
910
+ },
911
+ "0d3aad05d2": {
912
+ "person": [
913
+ "1"
914
+ ]
915
+ },
916
+ "0d40b015f4": {
917
+ "person": [
918
+ "1"
919
+ ]
920
+ },
921
+ "0d97fba242": {
922
+ "person": [
923
+ "2"
924
+ ],
925
+ "dog": [
926
+ "1"
927
+ ]
928
+ },
929
+ "0d9cc80d7e": {
930
+ "person": [
931
+ "1",
932
+ "2",
933
+ "3"
934
+ ]
935
+ },
936
+ "0dab85b6d3": {
937
+ "lizard": [
938
+ "1",
939
+ "2"
940
+ ]
941
+ },
942
+ "0db5c427a5": {
943
+ "train": [
944
+ "1"
945
+ ]
946
+ },
947
+ "0dbaf284f1": {
948
+ "cat": [
949
+ "1",
950
+ "2"
951
+ ]
952
+ },
953
+ "0de4923598": {},
954
+ "0df28a9101": {
955
+ "turtle": [
956
+ "1",
957
+ "2",
958
+ "3"
959
+ ]
960
+ },
961
+ "0e04f636c4": {
962
+ "frog": [
963
+ "1"
964
+ ]
965
+ },
966
+ "0e05f0e232": {
967
+ "lizard": [
968
+ "1",
969
+ "2"
970
+ ]
971
+ },
972
+ "0e0930474b": {
973
+ "person": [
974
+ "2",
975
+ "3"
976
+ ],
977
+ "sedan": [
978
+ "1"
979
+ ]
980
+ },
981
+ "0e27472bea": {
982
+ "turtle": [
983
+ "1"
984
+ ]
985
+ },
986
+ "0e30020549": {
987
+ "parrot": [
988
+ "1"
989
+ ]
990
+ },
991
+ "0e621feb6c": {
992
+ "lizard": [
993
+ "1",
994
+ "2"
995
+ ]
996
+ },
997
+ "0e803c7d73": {},
998
+ "0e9ebe4e3c": {
999
+ "truck": [
1000
+ "1"
1001
+ ]
1002
+ },
1003
+ "0e9f2785ec": {
1004
+ "person": [
1005
+ "2"
1006
+ ]
1007
+ },
1008
+ "0ea68d418b": {
1009
+ "airplane": [
1010
+ "1"
1011
+ ]
1012
+ },
1013
+ "0eb403a222": {},
1014
+ "0ee92053d6": {
1015
+ "person": [
1016
+ "1"
1017
+ ]
1018
+ },
1019
+ "0eefca067f": {
1020
+ "giant_panda": [
1021
+ "1",
1022
+ "2"
1023
+ ]
1024
+ },
1025
+ "0f17fa6fcb": {
1026
+ "duck": [
1027
+ "1",
1028
+ "2",
1029
+ "3"
1030
+ ]
1031
+ },
1032
+ "0f1ac8e9a3": {
1033
+ "frog": [
1034
+ "1"
1035
+ ]
1036
+ },
1037
+ "0f202e9852": {
1038
+ "parrot": [
1039
+ "1"
1040
+ ]
1041
+ },
1042
+ "0f2ab8b1ff": {
1043
+ "dolphin": [
1044
+ "1",
1045
+ "2",
1046
+ "3"
1047
+ ]
1048
+ },
1049
+ "0f51a78756": {
1050
+ "sheep": [
1051
+ "1"
1052
+ ]
1053
+ },
1054
+ "0f5fbe16b0": {
1055
+ "raccoon": [
1056
+ "1",
1057
+ "2"
1058
+ ]
1059
+ },
1060
+ "0f6072077b": {
1061
+ "person": [
1062
+ "1",
1063
+ "2",
1064
+ "3"
1065
+ ]
1066
+ },
1067
+ "0f6b69b2f4": {
1068
+ "rabbit": [
1069
+ "1"
1070
+ ]
1071
+ },
1072
+ "0f6c2163de": {
1073
+ "snail": [
1074
+ "1"
1075
+ ]
1076
+ },
1077
+ "0f74ec5599": {
1078
+ "giant_panda": [
1079
+ "1"
1080
+ ]
1081
+ },
1082
+ "0f9683715b": {
1083
+ "elephant": [
1084
+ "1"
1085
+ ]
1086
+ },
1087
+ "0fa7b59356": {
1088
+ "duck": [
1089
+ "1"
1090
+ ]
1091
+ },
1092
+ "0fb173695b": {
1093
+ "person": [
1094
+ "3"
1095
+ ]
1096
+ },
1097
+ "0fc958cde2": {
1098
+ "owl": [
1099
+ "1"
1100
+ ]
1101
+ },
1102
+ "0fe7b1a621": {
1103
+ "parrot": [
1104
+ "1"
1105
+ ]
1106
+ },
1107
+ "0ffcdb491c": {
1108
+ "person": [
1109
+ "1",
1110
+ "2",
1111
+ "3"
1112
+ ]
1113
+ },
1114
+ "101caff7d4": {
1115
+ "giant_panda": [
1116
+ "1",
1117
+ "2"
1118
+ ]
1119
+ },
1120
+ "1022fe8417": {
1121
+ "person": [
1122
+ "1",
1123
+ "2",
1124
+ "3"
1125
+ ]
1126
+ },
1127
+ "1032e80b37": {
1128
+ "giraffe": [
1129
+ "1"
1130
+ ]
1131
+ },
1132
+ "103f501680": {
1133
+ "fish": [
1134
+ "1"
1135
+ ]
1136
+ },
1137
+ "104e64565f": {
1138
+ "elephant": [
1139
+ "1"
1140
+ ]
1141
+ },
1142
+ "104f1ab997": {
1143
+ "person": [
1144
+ "1",
1145
+ "2",
1146
+ "3"
1147
+ ]
1148
+ },
1149
+ "106242403f": {
1150
+ "person": [
1151
+ "1",
1152
+ "2"
1153
+ ]
1154
+ },
1155
+ "10b31f5431": {
1156
+ "person": [
1157
+ "1",
1158
+ "3",
1159
+ "4"
1160
+ ]
1161
+ },
1162
+ "10eced835e": {
1163
+ "giant_panda": [
1164
+ "1",
1165
+ "2"
1166
+ ]
1167
+ },
1168
+ "110d26fa3a": {
1169
+ "shark": [
1170
+ "1"
1171
+ ]
1172
+ },
1173
+ "1122c1d16a": {
1174
+ "parrot": [
1175
+ "1",
1176
+ "2",
1177
+ "3",
1178
+ "4",
1179
+ "5"
1180
+ ],
1181
+ "person": [
1182
+ "6"
1183
+ ]
1184
+ },
1185
+ "1145b49a5f": {
1186
+ "rabbit": [
1187
+ "1"
1188
+ ]
1189
+ },
1190
+ "11485838c2": {
1191
+ "giraffe": [
1192
+ "1",
1193
+ "2",
1194
+ "3"
1195
+ ]
1196
+ },
1197
+ "114e7676ec": {
1198
+ "person": [
1199
+ "1"
1200
+ ]
1201
+ },
1202
+ "1157472b95": {
1203
+ "parrot": [
1204
+ "1",
1205
+ "2"
1206
+ ]
1207
+ },
1208
+ "115ee1072c": {
1209
+ "cow": [
1210
+ "1"
1211
+ ]
1212
+ },
1213
+ "1171141012": {
1214
+ "person": [
1215
+ "2"
1216
+ ],
1217
+ "turtle": [
1218
+ "1"
1219
+ ]
1220
+ },
1221
+ "117757b4b8": {
1222
+ "snail": [
1223
+ "1"
1224
+ ]
1225
+ },
1226
+ "1178932d2f": {
1227
+ "person": [
1228
+ "1",
1229
+ "2"
1230
+ ],
1231
+ "motorbike": [
1232
+ "3"
1233
+ ]
1234
+ },
1235
+ "117cc76bda": {
1236
+ "whale": [
1237
+ "1"
1238
+ ]
1239
+ },
1240
+ "1180cbf814": {
1241
+ "fish": [
1242
+ "1",
1243
+ "2"
1244
+ ]
1245
+ },
1246
+ "1187bbd0e3": {
1247
+ "cat": [
1248
+ "1"
1249
+ ]
1250
+ },
1251
+ "1197e44b26": {
1252
+ "giant_panda": [
1253
+ "1"
1254
+ ]
1255
+ },
1256
+ "119cf20728": {
1257
+ "lizard": [
1258
+ "1"
1259
+ ]
1260
+ },
1261
+ "119dd54871": {
1262
+ "lion": [
1263
+ "1",
1264
+ "2"
1265
+ ]
1266
+ },
1267
+ "11a0c3b724": {
1268
+ "mouse": [
1269
+ "1",
1270
+ "2"
1271
+ ]
1272
+ },
1273
+ "11a6ba8c94": {
1274
+ "person": [
1275
+ "1",
1276
+ "2"
1277
+ ]
1278
+ },
1279
+ "11c722a456": {
1280
+ "turtle": [
1281
+ "1",
1282
+ "2"
1283
+ ]
1284
+ },
1285
+ "11cbcb0b4d": {
1286
+ "zebra": [
1287
+ "1"
1288
+ ]
1289
+ },
1290
+ "11ccf5e99d": {
1291
+ "person": [
1292
+ "2"
1293
+ ]
1294
+ },
1295
+ "11ce6f452e": {
1296
+ "person": [
1297
+ "1",
1298
+ "2",
1299
+ "3"
1300
+ ]
1301
+ },
1302
+ "11feabe596": {
1303
+ "rabbit": [
1304
+ "1"
1305
+ ]
1306
+ },
1307
+ "120cb9514d": {
1308
+ "person": [
1309
+ "1",
1310
+ "2",
1311
+ "3"
1312
+ ]
1313
+ },
1314
+ "12156b25b3": {
1315
+ "person": [
1316
+ "1"
1317
+ ]
1318
+ },
1319
+ "122896672d": {
1320
+ "person": [
1321
+ "1",
1322
+ "3"
1323
+ ]
1324
+ },
1325
+ "1233ac8596": {
1326
+ "dog": [
1327
+ "1"
1328
+ ]
1329
+ },
1330
+ "1239c87234": {
1331
+ "lizard": [
1332
+ "1"
1333
+ ]
1334
+ },
1335
+ "1250423f7c": {
1336
+ "elephant": [
1337
+ "3",
1338
+ "4"
1339
+ ],
1340
+ "person": [
1341
+ "2"
1342
+ ]
1343
+ },
1344
+ "1257a1bc67": {
1345
+ "snake": [
1346
+ "1"
1347
+ ]
1348
+ },
1349
+ "125d1b19dd": {
1350
+ "giant_panda": [
1351
+ "1",
1352
+ "2"
1353
+ ]
1354
+ },
1355
+ "126d203967": {
1356
+ "person": [
1357
+ "2"
1358
+ ]
1359
+ },
1360
+ "1295e19071": {
1361
+ "airplane": [
1362
+ "1"
1363
+ ]
1364
+ },
1365
+ "12ad198c54": {
1366
+ "person": [
1367
+ "1"
1368
+ ]
1369
+ },
1370
+ "12bddb2bcb": {
1371
+ "person": [
1372
+ "2"
1373
+ ]
1374
+ },
1375
+ "12ec9b93ee": {
1376
+ "giant_panda": [
1377
+ "1"
1378
+ ]
1379
+ },
1380
+ "12eebedc35": {
1381
+ "bird": [
1382
+ "1"
1383
+ ]
1384
+ },
1385
+ "132852e094": {
1386
+ "fox": [
1387
+ "1"
1388
+ ]
1389
+ },
1390
+ "1329409f2a": {
1391
+ "fish": [
1392
+ "1"
1393
+ ]
1394
+ },
1395
+ "13325cfa14": {
1396
+ "person": [
1397
+ "2"
1398
+ ]
1399
+ },
1400
+ "1336440745": {
1401
+ "mouse": [
1402
+ "1",
1403
+ "2"
1404
+ ]
1405
+ },
1406
+ "134d06dbf9": {
1407
+ "cat": [
1408
+ "1"
1409
+ ]
1410
+ },
1411
+ "135625b53d": {
1412
+ "parrot": [
1413
+ "1"
1414
+ ]
1415
+ },
1416
+ "13870016f9": {
1417
+ "person": [
1418
+ "1"
1419
+ ],
1420
+ "cow": [
1421
+ "2",
1422
+ "3"
1423
+ ]
1424
+ },
1425
+ "13960b3c84": {
1426
+ "giraffe": [
1427
+ "1",
1428
+ "2",
1429
+ "3"
1430
+ ]
1431
+ },
1432
+ "13adaad9d9": {
1433
+ "giant_panda": [
1434
+ "1"
1435
+ ]
1436
+ },
1437
+ "13ae097e20": {
1438
+ "giant_panda": [
1439
+ "1"
1440
+ ]
1441
+ },
1442
+ "13e3070469": {
1443
+ "zebra": [
1444
+ "1",
1445
+ "2",
1446
+ "3"
1447
+ ]
1448
+ },
1449
+ "13f6a8c20d": {
1450
+ "fish": [
1451
+ "1"
1452
+ ]
1453
+ },
1454
+ "1416925cf2": {
1455
+ "truck": [
1456
+ "1",
1457
+ "2"
1458
+ ]
1459
+ },
1460
+ "142d2621f5": {
1461
+ "person": [
1462
+ "1",
1463
+ "2"
1464
+ ],
1465
+ "motorbike": [
1466
+ "3"
1467
+ ]
1468
+ },
1469
+ "145d5d7c03": {
1470
+ "giant_panda": [
1471
+ "1"
1472
+ ]
1473
+ },
1474
+ "145fdc3ac5": {
1475
+ "lizard": [
1476
+ "1"
1477
+ ]
1478
+ },
1479
+ "1471274fa7": {
1480
+ "person": [
1481
+ "1"
1482
+ ]
1483
+ },
1484
+ "14a6b5a139": {
1485
+ "fish": [
1486
+ "1"
1487
+ ]
1488
+ },
1489
+ "14c21cea0d": {
1490
+ "monkey": [
1491
+ "1",
1492
+ "2"
1493
+ ]
1494
+ },
1495
+ "14dae0dc93": {
1496
+ "person": [
1497
+ "2"
1498
+ ]
1499
+ },
1500
+ "14f9bd22b5": {
1501
+ "tiger": [
1502
+ "1"
1503
+ ]
1504
+ },
1505
+ "14fd28ae99": {
1506
+ "parrot": [
1507
+ "1"
1508
+ ]
1509
+ },
1510
+ "15097d5d4e": {
1511
+ "parrot": [
1512
+ "1"
1513
+ ]
1514
+ },
1515
+ "150ea711f2": {
1516
+ "whale": [
1517
+ "1"
1518
+ ]
1519
+ },
1520
+ "1514e3563f": {
1521
+ "earless_seal": [
1522
+ "1",
1523
+ "2"
1524
+ ]
1525
+ },
1526
+ "152aaa3a9e": {
1527
+ "raccoon": [
1528
+ "1"
1529
+ ]
1530
+ },
1531
+ "152b7d3bd7": {
1532
+ "giant_panda": [
1533
+ "1"
1534
+ ]
1535
+ },
1536
+ "15617297cc": {
1537
+ "person": [
1538
+ "1"
1539
+ ]
1540
+ },
1541
+ "15abbe0c52": {
1542
+ "person": [
1543
+ "1"
1544
+ ]
1545
+ },
1546
+ "15d1fb3de5": {
1547
+ "owl": [
1548
+ "1"
1549
+ ],
1550
+ "cat": [
1551
+ "2"
1552
+ ]
1553
+ },
1554
+ "15f67b0fab": {
1555
+ "person": [
1556
+ "1"
1557
+ ]
1558
+ },
1559
+ "161eb59aad": {
1560
+ "cow": [
1561
+ "2",
1562
+ "3"
1563
+ ],
1564
+ "giraffe": [
1565
+ "1"
1566
+ ]
1567
+ },
1568
+ "16288ea47f": {
1569
+ "duck": [
1570
+ "1",
1571
+ "2"
1572
+ ]
1573
+ },
1574
+ "164410ce62": {
1575
+ "person": [
1576
+ "1"
1577
+ ]
1578
+ },
1579
+ "165c3c8cd4": {
1580
+ "person": [
1581
+ "1",
1582
+ "2",
1583
+ "3"
1584
+ ]
1585
+ },
1586
+ "165c42b41b": {
1587
+ "person": [
1588
+ "1",
1589
+ "4"
1590
+ ],
1591
+ "motorbike": [
1592
+ "2",
1593
+ "3"
1594
+ ]
1595
+ },
1596
+ "165ec9e22b": {
1597
+ "person": [
1598
+ "1",
1599
+ "2"
1600
+ ]
1601
+ },
1602
+ "1669502269": {
1603
+ "person": [
1604
+ "1"
1605
+ ]
1606
+ },
1607
+ "16763cccbb": {
1608
+ "ape": [
1609
+ "1"
1610
+ ]
1611
+ },
1612
+ "16adde065e": {
1613
+ "person": [
1614
+ "3"
1615
+ ],
1616
+ "cat": [
1617
+ "2"
1618
+ ]
1619
+ },
1620
+ "16af445362": {
1621
+ "airplane": [
1622
+ "1"
1623
+ ]
1624
+ },
1625
+ "16afd538ad": {
1626
+ "parrot": [
1627
+ "1",
1628
+ "2"
1629
+ ]
1630
+ },
1631
+ "16c3fa4d5d": {
1632
+ "sedan": [
1633
+ "1"
1634
+ ]
1635
+ },
1636
+ "16d1d65c27": {
1637
+ "monkey": [
1638
+ "1"
1639
+ ]
1640
+ },
1641
+ "16e8599e94": {
1642
+ "giant_panda": [
1643
+ "1"
1644
+ ]
1645
+ },
1646
+ "16fe9fb444": {
1647
+ "person": [
1648
+ "2"
1649
+ ],
1650
+ "motorbike": [
1651
+ "1"
1652
+ ]
1653
+ },
1654
+ "1705796b02": {
1655
+ "train": [
1656
+ "1"
1657
+ ]
1658
+ },
1659
+ "1724db7671": {
1660
+ "giant_panda": [
1661
+ "1"
1662
+ ]
1663
+ },
1664
+ "17418e81ea": {
1665
+ "shark": [
1666
+ "1"
1667
+ ]
1668
+ },
1669
+ "175169edbb": {
1670
+ "ape": [
1671
+ "1",
1672
+ "2"
1673
+ ]
1674
+ },
1675
+ "17622326fd": {
1676
+ "lizard": [
1677
+ "1"
1678
+ ]
1679
+ },
1680
+ "17656bae77": {
1681
+ "elephant": [
1682
+ "1"
1683
+ ]
1684
+ },
1685
+ "17b0d94172": {
1686
+ "airplane": [
1687
+ "1"
1688
+ ]
1689
+ },
1690
+ "17c220e4f6": {
1691
+ "giant_panda": [
1692
+ "1"
1693
+ ]
1694
+ },
1695
+ "17c7bcd146": {
1696
+ "train": [
1697
+ "1"
1698
+ ]
1699
+ },
1700
+ "17cb4afe89": {
1701
+ "tiger": [
1702
+ "1"
1703
+ ]
1704
+ },
1705
+ "17cd79a434": {
1706
+ "squirrel": [
1707
+ "1"
1708
+ ]
1709
+ },
1710
+ "17d18604c3": {
1711
+ "person": [
1712
+ "1",
1713
+ "2"
1714
+ ]
1715
+ },
1716
+ "17d8ca1a37": {
1717
+ "person": [
1718
+ "2"
1719
+ ],
1720
+ "owl": [
1721
+ "1"
1722
+ ]
1723
+ },
1724
+ "17e33f4330": {
1725
+ "monkey": [
1726
+ "1"
1727
+ ]
1728
+ },
1729
+ "17f7a6d805": {
1730
+ "snail": [
1731
+ "1"
1732
+ ]
1733
+ },
1734
+ "180abc8378": {
1735
+ "person": [
1736
+ "2"
1737
+ ],
1738
+ "owl": [
1739
+ "1"
1740
+ ]
1741
+ },
1742
+ "183ba3d652": {
1743
+ "person": [
1744
+ "2"
1745
+ ],
1746
+ "motorbike": [
1747
+ "3"
1748
+ ]
1749
+ },
1750
+ "185bf64702": {
1751
+ "zebra": [
1752
+ "1",
1753
+ "2"
1754
+ ]
1755
+ },
1756
+ "18913cc690": {
1757
+ "train": [
1758
+ "1"
1759
+ ]
1760
+ },
1761
+ "1892651815": {
1762
+ "camel": [
1763
+ "1"
1764
+ ]
1765
+ },
1766
+ "189ac8208a": {
1767
+ "giraffe": [
1768
+ "1",
1769
+ "2"
1770
+ ]
1771
+ },
1772
+ "189b44e92c": {
1773
+ "zebra": [
1774
+ "1"
1775
+ ]
1776
+ },
1777
+ "18ac264b76": {
1778
+ "person": [
1779
+ "2"
1780
+ ]
1781
+ },
1782
+ "18b245ab49": {
1783
+ "penguin": [
1784
+ "1",
1785
+ "2",
1786
+ "3",
1787
+ "4"
1788
+ ]
1789
+ },
1790
+ "18b5cebc34": {
1791
+ "mouse": [
1792
+ "1"
1793
+ ]
1794
+ },
1795
+ "18bad52083": {
1796
+ "parrot": [
1797
+ "1",
1798
+ "2"
1799
+ ]
1800
+ },
1801
+ "18bb5144d5": {
1802
+ "lizard": [
1803
+ "1"
1804
+ ]
1805
+ },
1806
+ "18c6f205c5": {
1807
+ "person": [
1808
+ "1",
1809
+ "2",
1810
+ "3"
1811
+ ]
1812
+ },
1813
+ "1903f9ea15": {
1814
+ "bird": [
1815
+ "1",
1816
+ "2",
1817
+ "3"
1818
+ ]
1819
+ },
1820
+ "1917b209f2": {
1821
+ "horse": [
1822
+ "2"
1823
+ ],
1824
+ "person": [
1825
+ "1"
1826
+ ],
1827
+ "cow": [
1828
+ "3",
1829
+ "4"
1830
+ ]
1831
+ },
1832
+ "191e74c01d": {
1833
+ "deer": [
1834
+ "1"
1835
+ ]
1836
+ },
1837
+ "19367bb94e": {
1838
+ "fish": [
1839
+ "1",
1840
+ "2",
1841
+ "3"
1842
+ ]
1843
+ },
1844
+ "193ffaa217": {
1845
+ "person": [
1846
+ "1",
1847
+ "2",
1848
+ "3"
1849
+ ]
1850
+ },
1851
+ "19696b67d3": {
1852
+ "cow": [
1853
+ "1"
1854
+ ]
1855
+ },
1856
+ "197f3ab6f3": {
1857
+ "giant_panda": [
1858
+ "1"
1859
+ ]
1860
+ },
1861
+ "1981e763cc": {
1862
+ "sheep": [
1863
+ "1",
1864
+ "2"
1865
+ ]
1866
+ },
1867
+ "198afe39ae": {
1868
+ "person": [
1869
+ "1"
1870
+ ]
1871
+ },
1872
+ "19a6e62b9b": {
1873
+ "monkey": [
1874
+ "1",
1875
+ "2"
1876
+ ]
1877
+ },
1878
+ "19b60d5335": {
1879
+ "hedgehog": [
1880
+ "1"
1881
+ ]
1882
+ },
1883
+ "19c00c11f9": {
1884
+ "person": [
1885
+ "1"
1886
+ ]
1887
+ },
1888
+ "19e061eb88": {
1889
+ "boat": [
1890
+ "1",
1891
+ "2"
1892
+ ]
1893
+ },
1894
+ "19e8bc6178": {
1895
+ "dog": [
1896
+ "1"
1897
+ ]
1898
+ },
1899
+ "19ee80dac6": {
1900
+ "person": [
1901
+ "1",
1902
+ "3",
1903
+ "4"
1904
+ ]
1905
+ },
1906
+ "1a25a9170a": {
1907
+ "person": [
1908
+ "2",
1909
+ "3"
1910
+ ],
1911
+ "cow": [
1912
+ "1"
1913
+ ]
1914
+ },
1915
+ "1a359a6c1a": {
1916
+ "sheep": [
1917
+ "1"
1918
+ ]
1919
+ },
1920
+ "1a3e87c566": {
1921
+ "frog": [
1922
+ "1"
1923
+ ]
1924
+ },
1925
+ "1a5fe06b00": {
1926
+ "bus": [
1927
+ "1"
1928
+ ]
1929
+ },
1930
+ "1a6c0fbd1e": {
1931
+ "person": [
1932
+ "1"
1933
+ ]
1934
+ },
1935
+ "1a6f3b5a4b": {
1936
+ "sedan": [
1937
+ "3"
1938
+ ]
1939
+ },
1940
+ "1a8afbad92": {
1941
+ "zebra": [
1942
+ "1",
1943
+ "2",
1944
+ "3"
1945
+ ]
1946
+ },
1947
+ "1a8bdc5842": {
1948
+ "parrot": [
1949
+ "1",
1950
+ "2"
1951
+ ]
1952
+ },
1953
+ "1a95752aca": {
1954
+ "duck": [
1955
+ "1",
1956
+ "2"
1957
+ ]
1958
+ },
1959
+ "1a9c131cb7": {
1960
+ "ape": [
1961
+ "1",
1962
+ "2",
1963
+ "3"
1964
+ ]
1965
+ },
1966
+ "1aa3da3ee3": {
1967
+ "sheep": [
1968
+ "1",
1969
+ "2",
1970
+ "3",
1971
+ "4"
1972
+ ]
1973
+ },
1974
+ "1ab27ec7ea": {
1975
+ "deer": [
1976
+ "1"
1977
+ ]
1978
+ },
1979
+ "1abf16d21d": {
1980
+ "turtle": [
1981
+ "1"
1982
+ ]
1983
+ },
1984
+ "1acd0f993b": {
1985
+ "person": [
1986
+ "3"
1987
+ ],
1988
+ "dog": [
1989
+ "1"
1990
+ ]
1991
+ },
1992
+ "1ad202e499": {
1993
+ "lizard": [
1994
+ "1",
1995
+ "2"
1996
+ ]
1997
+ },
1998
+ "1af8d2395d": {
1999
+ "person": [
2000
+ "1",
2001
+ "2"
2002
+ ],
2003
+ "airplane": [
2004
+ "4"
2005
+ ]
2006
+ },
2007
+ "1afd39a1fa": {
2008
+ "motorbike": [
2009
+ "2"
2010
+ ]
2011
+ },
2012
+ "1b2d31306f": {
2013
+ "lizard": [
2014
+ "1"
2015
+ ]
2016
+ },
2017
+ "1b3fa67f0e": {
2018
+ "airplane": [
2019
+ "1"
2020
+ ]
2021
+ },
2022
+ "1b43fa74b4": {
2023
+ "owl": [
2024
+ "1",
2025
+ "2"
2026
+ ]
2027
+ },
2028
+ "1b73ea9fc2": {
2029
+ "parrot": [
2030
+ "1"
2031
+ ]
2032
+ },
2033
+ "1b7e8bb255": {
2034
+ "person": [
2035
+ "2"
2036
+ ]
2037
+ },
2038
+ "1b8680f8cd": {
2039
+ "person": [
2040
+ "2",
2041
+ "3"
2042
+ ]
2043
+ },
2044
+ "1b883843c0": {
2045
+ "person": [
2046
+ "1",
2047
+ "2"
2048
+ ]
2049
+ },
2050
+ "1b8898785b": {
2051
+ "monkey": [
2052
+ "1",
2053
+ "2"
2054
+ ]
2055
+ },
2056
+ "1b88ba1aa4": {
2057
+ "giant_panda": [
2058
+ "1"
2059
+ ]
2060
+ },
2061
+ "1b96a498e5": {
2062
+ "ape": [
2063
+ "1"
2064
+ ]
2065
+ },
2066
+ "1bbc4c274f": {
2067
+ "fish": [
2068
+ "2"
2069
+ ]
2070
+ },
2071
+ "1bd87fe9ab": {
2072
+ "train": [
2073
+ "1"
2074
+ ]
2075
+ },
2076
+ "1c4090c75b": {
2077
+ "whale": [
2078
+ "1"
2079
+ ]
2080
+ },
2081
+ "1c41934f84": {
2082
+ "elephant": [
2083
+ "1",
2084
+ "2"
2085
+ ]
2086
+ },
2087
+ "1c72b04b56": {
2088
+ "lion": [
2089
+ "1"
2090
+ ]
2091
+ },
2092
+ "1c87955a3a": {
2093
+ "crocodile": [
2094
+ "1"
2095
+ ],
2096
+ "turtle": [
2097
+ "2"
2098
+ ]
2099
+ },
2100
+ "1c9f9eb792": {
2101
+ "person": [
2102
+ "2"
2103
+ ]
2104
+ },
2105
+ "1ca240fede": {
2106
+ "train": [
2107
+ "1"
2108
+ ]
2109
+ },
2110
+ "1ca5673803": {
2111
+ "person": [
2112
+ "1",
2113
+ "3"
2114
+ ]
2115
+ },
2116
+ "1cada35274": {
2117
+ "duck": [
2118
+ "1"
2119
+ ]
2120
+ },
2121
+ "1cb44b920d": {
2122
+ "eagle": [
2123
+ "1",
2124
+ "2"
2125
+ ]
2126
+ },
2127
+ "1cd10e62be": {
2128
+ "leopard": [
2129
+ "1"
2130
+ ]
2131
+ },
2132
+ "1d3087d5e5": {
2133
+ "fish": [
2134
+ "1",
2135
+ "2",
2136
+ "3",
2137
+ "4",
2138
+ "5"
2139
+ ]
2140
+ },
2141
+ "1d3685150a": {
2142
+ "person": [
2143
+ "1",
2144
+ "3"
2145
+ ]
2146
+ },
2147
+ "1d6ff083aa": {
2148
+ "person": [
2149
+ "1",
2150
+ "2"
2151
+ ]
2152
+ }
2153
+ }
mbench/numbered_valid_obj_ids_gpt-4o_no_mask_color.json ADDED
@@ -0,0 +1,2153 @@
1
+ {
2
+ "003234408d": {
3
+ "penguin": [
4
+ "1",
5
+ "2",
6
+ "3",
7
+ "4",
8
+ "5"
9
+ ]
10
+ },
11
+ "0043f083b5": {
12
+ "bus": [
13
+ "1"
14
+ ],
15
+ "sedan": [
16
+ "2",
17
+ "3"
18
+ ]
19
+ },
20
+ "0044fa5fba": {
21
+ "giant_panda": [
22
+ "1"
23
+ ]
24
+ },
25
+ "005a527edd": {
26
+ "ape": [
27
+ "1",
28
+ "2"
29
+ ]
30
+ },
31
+ "0065b171f9": {
32
+ "giant_panda": [
33
+ "1"
34
+ ]
35
+ },
36
+ "00917dcfc4": {
37
+ "zebra": [
38
+ "1",
39
+ "2",
40
+ "3"
41
+ ]
42
+ },
43
+ "00a23ccf53": {
44
+ "shark": [
45
+ "1"
46
+ ]
47
+ },
48
+ "00ad5016a4": {
49
+ "airplane": [
50
+ "1"
51
+ ]
52
+ },
53
+ "01082ae388": {
54
+ "leopard": [
55
+ "1"
56
+ ]
57
+ },
58
+ "011ac0a06f": {
59
+ "ape": [
60
+ "1",
61
+ "2",
62
+ "3",
63
+ "4",
64
+ "5"
65
+ ]
66
+ },
67
+ "013099c098": {
68
+ "giant_panda": [
69
+ "1",
70
+ "2"
71
+ ]
72
+ },
73
+ "0155498c85": {
74
+ "person": [
75
+ "1"
76
+ ],
77
+ "motorbike": [
78
+ "2"
79
+ ]
80
+ },
81
+ "01694ad9c8": {
82
+ "bird": [
83
+ "1"
84
+ ]
85
+ },
86
+ "017ac35701": {
87
+ "giant_panda": [
88
+ "1"
89
+ ]
90
+ },
91
+ "01b80e8e1a": {
92
+ "zebra": [
93
+ "1",
94
+ "2"
95
+ ]
96
+ },
97
+ "01baa5a4e1": {},
98
+ "01c3111683": {
99
+ "whale": [
100
+ "1"
101
+ ]
102
+ },
103
+ "01c4cb5ffe": {
104
+ "person": [
105
+ "1",
106
+ "3"
107
+ ]
108
+ },
109
+ "01c76f0a82": {
110
+ "sedan": [
111
+ "1",
112
+ "4"
113
+ ]
114
+ },
115
+ "01c783268c": {
116
+ "person": [
117
+ "2"
118
+ ],
119
+ "ape": [
120
+ "1"
121
+ ]
122
+ },
123
+ "01e64dd36a": {
124
+ "cow": [
125
+ "1",
126
+ "2",
127
+ "3"
128
+ ]
129
+ },
130
+ "01ed275c6e": {
131
+ "giraffe": [
132
+ "1",
133
+ "2"
134
+ ]
135
+ },
136
+ "01ff60d1fa": {
137
+ "lizard": [
138
+ "1"
139
+ ]
140
+ },
141
+ "020cd28cd2": {
142
+ "person": [
143
+ "1"
144
+ ]
145
+ },
146
+ "02264db755": {
147
+ "fox": [
148
+ "1"
149
+ ]
150
+ },
151
+ "0248626d9a": {
152
+ "train": [
153
+ "1"
154
+ ]
155
+ },
156
+ "02668dbffa": {
157
+ "frog": [
158
+ "1"
159
+ ]
160
+ },
161
+ "0274193026": {
162
+ "person": [
163
+ "2"
164
+ ]
165
+ },
166
+ "02d28375aa": {
167
+ "fox": [
168
+ "1"
169
+ ]
170
+ },
171
+ "031ccc99b1": {
172
+ "person": [
173
+ "1",
174
+ "2",
175
+ "3"
176
+ ]
177
+ },
178
+ "0321b18c10": {
179
+ "person": [
180
+ "1",
181
+ "2"
182
+ ],
183
+ "elephant": [
184
+ "3"
185
+ ]
186
+ },
187
+ "0348a45bca": {
188
+ "fish": [
189
+ "1",
190
+ "2",
191
+ "3",
192
+ "4",
193
+ "5"
194
+ ]
195
+ },
196
+ "0355e92655": {
197
+ "boat": [
198
+ "3"
199
+ ],
200
+ "person": [
201
+ "2"
202
+ ]
203
+ },
204
+ "0358b938c1": {
205
+ "elephant": [
206
+ "1",
207
+ "2",
208
+ "3",
209
+ "4"
210
+ ]
211
+ },
212
+ "0368107cf1": {
213
+ "person": [
214
+ "1",
215
+ "2"
216
+ ]
217
+ },
218
+ "0379ddf557": {
219
+ "person": [
220
+ "1"
221
+ ]
222
+ },
223
+ "038b2cc71d": {
224
+ "lizard": [
225
+ "1"
226
+ ]
227
+ },
228
+ "038c15a5dd": {
229
+ "hedgehog": [
230
+ "1"
231
+ ]
232
+ },
233
+ "03a06cc98a": {
234
+ "giraffe": [
235
+ "1",
236
+ "2",
237
+ "3"
238
+ ]
239
+ },
240
+ "03a63e187f": {
241
+ "lizard": [
242
+ "1"
243
+ ]
244
+ },
245
+ "03c95b4dae": {
246
+ "elephant": [
247
+ "1",
248
+ "2",
249
+ "3"
250
+ ]
251
+ },
252
+ "03e2b57b0e": {
253
+ "lizard": [
254
+ "1"
255
+ ]
256
+ },
257
+ "04194e1248": {
258
+ "lizard": [
259
+ "1"
260
+ ]
261
+ },
262
+ "04259896e2": {
263
+ "lizard": [
264
+ "1"
265
+ ]
266
+ },
267
+ "0444918a5f": {
268
+ "truck": [
269
+ "1",
270
+ "2",
271
+ "3",
272
+ "4"
273
+ ]
274
+ },
275
+ "04460a7a52": {
276
+ "lizard": [
277
+ "1"
278
+ ]
279
+ },
280
+ "04474174a4": {
281
+ "ape": [
282
+ "1",
283
+ "2"
284
+ ]
285
+ },
286
+ "0450095513": {
287
+ "snail": [
288
+ "1"
289
+ ]
290
+ },
291
+ "045f00aed2": {
292
+ "tiger": [
293
+ "1"
294
+ ],
295
+ "person": [
296
+ "3"
297
+ ]
298
+ },
299
+ "04667fabaa": {
300
+ "parrot": [
301
+ "1"
302
+ ]
303
+ },
304
+ "04735c5030": {
305
+ "cat": [
306
+ "1",
307
+ "2"
308
+ ]
309
+ },
310
+ "04990d1915": {
311
+ "bus": [
312
+ "2"
313
+ ],
314
+ "truck": [
315
+ "3"
316
+ ],
317
+ "sedan": [
318
+ "1"
319
+ ]
320
+ },
321
+ "04d62d9d98": {
322
+ "person": [
323
+ "1"
324
+ ]
325
+ },
326
+ "04f21da964": {
327
+ "monkey": [
328
+ "1"
329
+ ]
330
+ },
331
+ "04fbad476e": {
332
+ "parrot": [
333
+ "1"
334
+ ]
335
+ },
336
+ "04fe256562": {
337
+ "motorbike": [
338
+ "1"
339
+ ],
340
+ "truck": [
341
+ "2"
342
+ ]
343
+ },
344
+ "0503bf89c9": {
345
+ "hedgehog": [
346
+ "1"
347
+ ]
348
+ },
349
+ "0536c9eed0": {
350
+ "cat": [
351
+ "1"
352
+ ]
353
+ },
354
+ "054acb238f": {
355
+ "owl": [
356
+ "1"
357
+ ]
358
+ },
359
+ "05579ca250": {
360
+ "person": [
361
+ "1"
362
+ ],
363
+ "sedan": [
364
+ "3"
365
+ ]
366
+ },
367
+ "056c200404": {},
368
+ "05774f3a2c": {
369
+ "ape": [
370
+ "1",
371
+ "2",
372
+ "3"
373
+ ]
374
+ },
375
+ "058a7592c8": {
376
+ "train": [
377
+ "1"
378
+ ]
379
+ },
380
+ "05a0a513df": {
381
+ "person": [
382
+ "1",
383
+ "2"
384
+ ]
385
+ },
386
+ "05a569d8aa": {
387
+ "cat": [
388
+ "1"
389
+ ],
390
+ "mouse": [
391
+ "2"
392
+ ]
393
+ },
394
+ "05aa652648": {
395
+ "ape": [
396
+ "1"
397
+ ]
398
+ },
399
+ "05d7715782": {},
400
+ "05e0b0f28f": {
401
+ "person": [
402
+ "2"
403
+ ],
404
+ "mouse": [
405
+ "1"
406
+ ]
407
+ },
408
+ "05fdbbdd7a": {},
409
+ "05ffcfed85": {
410
+ "monkey": [
411
+ "1",
412
+ "2"
413
+ ]
414
+ },
415
+ "0630391881": {
416
+ "person": [
417
+ "1"
418
+ ]
419
+ },
420
+ "06840b2bbe": {
421
+ "snake": [
422
+ "1"
423
+ ]
424
+ },
425
+ "068f7dce6f": {
426
+ "shark": [
427
+ "1"
428
+ ]
429
+ },
430
+ "0693719753": {
431
+ "turtle": [
432
+ "1",
433
+ "2"
434
+ ]
435
+ },
436
+ "06ce2b51fb": {
437
+ "person": [
438
+ "1",
439
+ "2"
440
+ ]
441
+ },
442
+ "06e224798e": {
443
+ "tiger": [
444
+ "1"
445
+ ]
446
+ },
447
+ "06ee361788": {
448
+ "duck": [
449
+ "1",
450
+ "2",
451
+ "3"
452
+ ]
453
+ },
454
+ "06fbb3fa2c": {
455
+ "eagle": [
456
+ "1"
457
+ ]
458
+ },
459
+ "0700264286": {
460
+ "cow": [
461
+ "1",
462
+ "2"
463
+ ]
464
+ },
465
+ "070c918ca7": {
466
+ "parrot": [
467
+ "1"
468
+ ]
469
+ },
470
+ "07129e14a4": {
471
+ "parrot": [
472
+ "1",
473
+ "2"
474
+ ],
475
+ "person": [
476
+ "3"
477
+ ]
478
+ },
479
+ "07177017e9": {
480
+ "motorbike": [
481
+ "1",
482
+ "2"
483
+ ]
484
+ },
485
+ "07238ffc58": {
486
+ "monkey": [
487
+ "1",
488
+ "2",
489
+ "3"
490
+ ]
491
+ },
492
+ "07353b2a89": {
493
+ "sheep": [
494
+ "1",
495
+ "2",
496
+ "3",
497
+ "4"
498
+ ]
499
+ },
500
+ "0738493cbf": {
501
+ "airplane": [
502
+ "1"
503
+ ]
504
+ },
505
+ "075926c651": {
506
+ "person": [
507
+ "1",
508
+ "2"
509
+ ]
510
+ },
511
+ "075c701292": {
512
+ "duck": [
513
+ "1",
514
+ "2",
515
+ "3",
516
+ "4"
517
+ ]
518
+ },
519
+ "0762ea9a30": {
520
+ "person": [
521
+ "1"
522
+ ]
523
+ },
524
+ "07652ee4af": {
525
+ "person": [
526
+ "1"
527
+ ]
528
+ },
529
+ "076f206928": {
530
+ "zebra": [
531
+ "1",
532
+ "2"
533
+ ],
534
+ "person": [
535
+ "3"
536
+ ]
537
+ },
538
+ "077d32af19": {
539
+ "train": [
540
+ "4"
541
+ ],
542
+ "person": [
543
+ "1",
544
+ "2",
545
+ "3"
546
+ ]
547
+ },
548
+ "079049275c": {
549
+ "mouse": [
550
+ "1"
551
+ ]
552
+ },
553
+ "07913cdda7": {
554
+ "train": [
555
+ "1"
556
+ ],
557
+ "person": [
558
+ "2",
559
+ "3"
560
+ ]
561
+ },
562
+ "07a11a35e8": {
563
+ "ape": [
564
+ "1",
565
+ "2"
566
+ ]
567
+ },
568
+ "07ac33b6df": {
569
+ "ape": [
570
+ "1"
571
+ ]
572
+ },
573
+ "07c62c3d11": {
574
+ "parrot": [
575
+ "1",
576
+ "2",
577
+ "3"
578
+ ]
579
+ },
580
+ "07cc1c7d74": {
581
+ "snake": [
582
+ "1"
583
+ ]
584
+ },
585
+ "080196ef01": {
586
+ "lizard": [
587
+ "1"
588
+ ]
589
+ },
590
+ "081207976e": {},
591
+ "081ae4fa44": {
592
+ "shark": [
593
+ "1",
594
+ "2"
595
+ ]
596
+ },
597
+ "081d8250cb": {
598
+ "person": [
599
+ "1"
600
+ ],
601
+ "sedan": [
602
+ "3"
603
+ ]
604
+ },
605
+ "082900c5d4": {
606
+ "duck": [
607
+ "1",
608
+ "2",
609
+ "3"
610
+ ]
611
+ },
612
+ "0860df21e2": {},
613
+ "0866d4c5e3": {
614
+ "bird": [
615
+ "1",
616
+ "2",
617
+ "3"
618
+ ]
619
+ },
620
+ "0891ac2eb6": {
621
+ "person": [
622
+ "1",
623
+ "2",
624
+ "3"
625
+ ]
626
+ },
627
+ "08931bc458": {
628
+ "person": [
629
+ "1"
630
+ ]
631
+ },
632
+ "08aa2705d5": {
633
+ "snake": [
634
+ "1"
635
+ ]
636
+ },
637
+ "08c8450db7": {},
638
+ "08d50b926c": {
639
+ "turtle": [
640
+ "1",
641
+ "2"
642
+ ]
643
+ },
644
+ "08e1e4de15": {
645
+ "monkey": [
646
+ "1",
647
+ "2",
648
+ "3",
649
+ "4"
650
+ ]
651
+ },
652
+ "08e48c1a48": {
653
+ "cow": [
654
+ "1"
655
+ ]
656
+ },
657
+ "08f561c65e": {
658
+ "person": [
659
+ "2"
660
+ ],
661
+ "giant_panda": [
662
+ "1"
663
+ ]
664
+ },
665
+ "08feb87790": {
666
+ "sheep": [
667
+ "1"
668
+ ]
669
+ },
670
+ "09049f6fe3": {
671
+ "mouse": [
672
+ "1",
673
+ "2"
674
+ ]
675
+ },
676
+ "092e4ff450": {
677
+ "snake": [
678
+ "1"
679
+ ]
680
+ },
681
+ "09338adea8": {
682
+ "whale": [
683
+ "1",
684
+ "2"
685
+ ]
686
+ },
687
+ "093c335ccc": {
688
+ "person": [
689
+ "2"
690
+ ]
691
+ },
692
+ "0970d28339": {
693
+ "ape": [
694
+ "1",
695
+ "2"
696
+ ]
697
+ },
698
+ "0974a213dc": {
699
+ "giraffe": [
700
+ "1",
701
+ "2",
702
+ "3"
703
+ ]
704
+ },
705
+ "097b471ed8": {
706
+ "cat": [
707
+ "1",
708
+ "2"
709
+ ]
710
+ },
711
+ "0990941758": {
712
+ "giant_panda": [
713
+ "1"
714
+ ]
715
+ },
716
+ "09a348f4fa": {
717
+ "lizard": [
718
+ "1"
719
+ ]
720
+ },
721
+ "09a6841288": {
722
+ "duck": [
723
+ "1",
724
+ "2"
725
+ ]
726
+ },
727
+ "09c5bad17b": {
728
+ "airplane": [
729
+ "1"
730
+ ]
731
+ },
732
+ "09c9ce80c7": {
733
+ "giant_panda": [
734
+ "1"
735
+ ]
736
+ },
737
+ "09ff54fef4": {
738
+ "fox": [
739
+ "1",
740
+ "2"
741
+ ]
742
+ },
743
+ "0a23765d15": {
744
+ "person": [
745
+ "1",
746
+ "2"
747
+ ]
748
+ },
749
+ "0a275e7f12": {
750
+ "elephant": [
751
+ "1"
752
+ ]
753
+ },
754
+ "0a2f2bd294": {
755
+ "motorbike": [
756
+ "1"
757
+ ]
758
+ },
759
+ "0a7a2514aa": {
760
+ "cat": [
761
+ "1"
762
+ ],
763
+ "lizard": [
764
+ "2"
765
+ ]
766
+ },
767
+ "0a7b27fde9": {
768
+ "parrot": [
769
+ "1",
770
+ "2"
771
+ ]
772
+ },
773
+ "0a8c467cc3": {
774
+ "fish": [
775
+ "1",
776
+ "2",
777
+ "3"
778
+ ]
779
+ },
780
+ "0ac8c560ae": {
781
+ "person": [
782
+ "2",
783
+ "3"
784
+ ]
785
+ },
786
+ "0b1627e896": {
787
+ "boat": [
788
+ "1"
789
+ ]
790
+ },
791
+ "0b285c47f6": {
792
+ "mouse": [
793
+ "1"
794
+ ]
795
+ },
796
+ "0b34ec1d55": {
797
+ "ape": [
798
+ "1"
799
+ ]
800
+ },
801
+ "0b5b5e8e5a": {
802
+ "person": [
803
+ "1"
804
+ ],
805
+ "sedan": [
806
+ "2"
807
+ ]
808
+ },
809
+ "0b68535614": {
810
+ "rabbit": [
811
+ "1"
812
+ ]
813
+ },
814
+ "0b6f9105fc": {
815
+ "rabbit": [
816
+ "1"
817
+ ]
818
+ },
819
+ "0b7dbfa3cb": {
820
+ "cow": [
821
+ "1"
822
+ ]
823
+ },
824
+ "0b9cea51ca": {
825
+ "whale": [
826
+ "1"
827
+ ]
828
+ },
829
+ "0b9d012be8": {
830
+ "camel": [
831
+ "1"
832
+ ]
833
+ },
834
+ "0bcfc4177d": {
835
+ "truck": [
836
+ "1"
837
+ ]
838
+ },
839
+ "0bd37b23c1": {
840
+ "motorbike": [
841
+ "1"
842
+ ]
843
+ },
844
+ "0bd864064c": {
845
+ "eagle": [
846
+ "1"
847
+ ]
848
+ },
849
+ "0c11c6bf7b": {
850
+ "deer": [
851
+ "1"
852
+ ]
853
+ },
854
+ "0c26bc77ac": {
855
+ "crocodile": [
856
+ "1"
857
+ ]
858
+ },
859
+ "0c3a04798c": {
860
+ "duck": [
861
+ "1"
862
+ ],
863
+ "fish": [
864
+ "2"
865
+ ]
866
+ },
867
+ "0c44a9d545": {
868
+ "tiger": [
869
+ "1"
870
+ ]
871
+ },
872
+ "0c817cc390": {
873
+ "hedgehog": [
874
+ "1"
875
+ ],
876
+ "dog": [
877
+ "2"
878
+ ]
879
+ },
880
+ "0ca839ee9a": {
881
+ "ape": [
882
+ "1",
883
+ "2"
884
+ ]
885
+ },
886
+ "0cd7ac0ac0": {
887
+ "rabbit": [
888
+ "1"
889
+ ]
890
+ },
891
+ "0ce06e0121": {
892
+ "parrot": [
893
+ "1",
894
+ "2"
895
+ ]
896
+ },
897
+ "0cfe974a89": {
898
+ "turtle": [
899
+ "1",
900
+ "2"
901
+ ]
902
+ },
903
+ "0d2fcc0dcd": {
904
+ "zebra": [
905
+ "1",
906
+ "2",
907
+ "3",
908
+ "4"
909
+ ]
910
+ },
911
+ "0d3aad05d2": {
912
+ "person": [
913
+ "1"
914
+ ]
915
+ },
916
+ "0d40b015f4": {
917
+ "person": [
918
+ "1"
919
+ ]
920
+ },
921
+ "0d97fba242": {
922
+ "person": [
923
+ "2"
924
+ ],
925
+ "dog": [
926
+ "1"
927
+ ]
928
+ },
929
+ "0d9cc80d7e": {
930
+ "person": [
931
+ "1",
932
+ "2",
933
+ "3"
934
+ ]
935
+ },
936
+ "0dab85b6d3": {
937
+ "lizard": [
938
+ "1",
939
+ "2"
940
+ ]
941
+ },
942
+ "0db5c427a5": {
943
+ "train": [
944
+ "1"
945
+ ]
946
+ },
947
+ "0dbaf284f1": {
948
+ "cat": [
949
+ "1",
950
+ "2"
951
+ ]
952
+ },
953
+ "0de4923598": {},
954
+ "0df28a9101": {
955
+ "turtle": [
956
+ "1",
957
+ "2",
958
+ "3"
959
+ ]
960
+ },
961
+ "0e04f636c4": {
962
+ "frog": [
963
+ "1"
964
+ ]
965
+ },
966
+ "0e05f0e232": {
967
+ "lizard": [
968
+ "1",
969
+ "2"
970
+ ]
971
+ },
972
+ "0e0930474b": {
973
+ "person": [
974
+ "2",
975
+ "3"
976
+ ],
977
+ "sedan": [
978
+ "1"
979
+ ]
980
+ },
981
+ "0e27472bea": {
982
+ "turtle": [
983
+ "1"
984
+ ]
985
+ },
986
+ "0e30020549": {
987
+ "parrot": [
988
+ "1"
989
+ ]
990
+ },
991
+ "0e621feb6c": {
992
+ "lizard": [
993
+ "1",
994
+ "2"
995
+ ]
996
+ },
997
+ "0e803c7d73": {},
998
+ "0e9ebe4e3c": {
999
+ "truck": [
1000
+ "1"
1001
+ ]
1002
+ },
1003
+ "0e9f2785ec": {
1004
+ "person": [
1005
+ "2"
1006
+ ]
1007
+ },
1008
+ "0ea68d418b": {
1009
+ "airplane": [
1010
+ "1"
1011
+ ]
1012
+ },
1013
+ "0eb403a222": {},
1014
+ "0ee92053d6": {
1015
+ "person": [
1016
+ "1"
1017
+ ]
1018
+ },
1019
+ "0eefca067f": {
1020
+ "giant_panda": [
1021
+ "1",
1022
+ "2"
1023
+ ]
1024
+ },
1025
+ "0f17fa6fcb": {
1026
+ "duck": [
1027
+ "1",
1028
+ "2",
1029
+ "3"
1030
+ ]
1031
+ },
1032
+ "0f1ac8e9a3": {
1033
+ "frog": [
1034
+ "1"
1035
+ ]
1036
+ },
1037
+ "0f202e9852": {
1038
+ "parrot": [
1039
+ "1"
1040
+ ]
1041
+ },
1042
+ "0f2ab8b1ff": {
1043
+ "dolphin": [
1044
+ "1",
1045
+ "2",
1046
+ "3"
1047
+ ]
1048
+ },
1049
+ "0f51a78756": {
1050
+ "sheep": [
1051
+ "1"
1052
+ ]
1053
+ },
1054
+ "0f5fbe16b0": {
1055
+ "raccoon": [
1056
+ "1",
1057
+ "2"
1058
+ ]
1059
+ },
1060
+ "0f6072077b": {
1061
+ "person": [
1062
+ "1",
1063
+ "2",
1064
+ "3"
1065
+ ]
1066
+ },
1067
+ "0f6b69b2f4": {
1068
+ "rabbit": [
1069
+ "1"
1070
+ ]
1071
+ },
1072
+ "0f6c2163de": {
1073
+ "snail": [
1074
+ "1"
1075
+ ]
1076
+ },
1077
+ "0f74ec5599": {
1078
+ "giant_panda": [
1079
+ "1"
1080
+ ]
1081
+ },
1082
+ "0f9683715b": {
1083
+ "elephant": [
1084
+ "1"
1085
+ ]
1086
+ },
1087
+ "0fa7b59356": {
1088
+ "duck": [
1089
+ "1"
1090
+ ]
1091
+ },
1092
+ "0fb173695b": {
1093
+ "person": [
1094
+ "3"
1095
+ ]
1096
+ },
1097
+ "0fc958cde2": {
1098
+ "owl": [
1099
+ "1"
1100
+ ]
1101
+ },
1102
+ "0fe7b1a621": {
1103
+ "parrot": [
1104
+ "1"
1105
+ ]
1106
+ },
1107
+ "0ffcdb491c": {
1108
+ "person": [
1109
+ "1",
1110
+ "2",
1111
+ "3"
1112
+ ]
1113
+ },
1114
+ "101caff7d4": {
1115
+ "giant_panda": [
1116
+ "1",
1117
+ "2"
1118
+ ]
1119
+ },
1120
+ "1022fe8417": {
1121
+ "person": [
1122
+ "1",
1123
+ "2",
1124
+ "3"
1125
+ ]
1126
+ },
1127
+ "1032e80b37": {
1128
+ "giraffe": [
1129
+ "1"
1130
+ ]
1131
+ },
1132
+ "103f501680": {
1133
+ "fish": [
1134
+ "1"
1135
+ ]
1136
+ },
1137
+ "104e64565f": {
1138
+ "elephant": [
1139
+ "1"
1140
+ ]
1141
+ },
1142
+ "104f1ab997": {
1143
+ "person": [
1144
+ "1",
1145
+ "2",
1146
+ "3"
1147
+ ]
1148
+ },
1149
+ "106242403f": {
1150
+ "person": [
1151
+ "1",
1152
+ "2"
1153
+ ]
1154
+ },
1155
+ "10b31f5431": {
1156
+ "person": [
1157
+ "1",
1158
+ "3",
1159
+ "4"
1160
+ ]
1161
+ },
1162
+ "10eced835e": {
1163
+ "giant_panda": [
1164
+ "1",
1165
+ "2"
1166
+ ]
1167
+ },
1168
+ "110d26fa3a": {
1169
+ "shark": [
1170
+ "1"
1171
+ ]
1172
+ },
1173
+ "1122c1d16a": {
1174
+ "parrot": [
1175
+ "1",
1176
+ "2",
1177
+ "3",
1178
+ "4",
1179
+ "5"
1180
+ ],
1181
+ "person": [
1182
+ "6"
1183
+ ]
1184
+ },
1185
+ "1145b49a5f": {
1186
+ "rabbit": [
1187
+ "1"
1188
+ ]
1189
+ },
1190
+ "11485838c2": {
1191
+ "giraffe": [
1192
+ "1",
1193
+ "2",
1194
+ "3"
1195
+ ]
1196
+ },
1197
+ "114e7676ec": {
1198
+ "person": [
1199
+ "1"
1200
+ ]
1201
+ },
1202
+ "1157472b95": {
1203
+ "parrot": [
1204
+ "1",
1205
+ "2"
1206
+ ]
1207
+ },
1208
+ "115ee1072c": {
1209
+ "cow": [
1210
+ "1"
1211
+ ]
1212
+ },
1213
+ "1171141012": {
1214
+ "person": [
1215
+ "2"
1216
+ ],
1217
+ "turtle": [
1218
+ "1"
1219
+ ]
1220
+ },
1221
+ "117757b4b8": {
1222
+ "snail": [
1223
+ "1"
1224
+ ]
1225
+ },
1226
+ "1178932d2f": {
1227
+ "person": [
1228
+ "1",
1229
+ "2"
1230
+ ],
1231
+ "motorbike": [
1232
+ "3"
1233
+ ]
1234
+ },
1235
+ "117cc76bda": {
1236
+ "whale": [
1237
+ "1"
1238
+ ]
1239
+ },
1240
+ "1180cbf814": {
1241
+ "fish": [
1242
+ "1",
1243
+ "2"
1244
+ ]
1245
+ },
1246
+ "1187bbd0e3": {
1247
+ "cat": [
1248
+ "1"
1249
+ ]
1250
+ },
1251
+ "1197e44b26": {
1252
+ "giant_panda": [
1253
+ "1"
1254
+ ]
1255
+ },
1256
+ "119cf20728": {
1257
+ "lizard": [
1258
+ "1"
1259
+ ]
1260
+ },
1261
+ "119dd54871": {
1262
+ "lion": [
1263
+ "1",
1264
+ "2"
1265
+ ]
1266
+ },
1267
+ "11a0c3b724": {
1268
+ "mouse": [
1269
+ "1",
1270
+ "2"
1271
+ ]
1272
+ },
1273
+ "11a6ba8c94": {
1274
+ "person": [
1275
+ "1",
1276
+ "2"
1277
+ ]
1278
+ },
1279
+ "11c722a456": {
1280
+ "turtle": [
1281
+ "1",
1282
+ "2"
1283
+ ]
1284
+ },
1285
+ "11cbcb0b4d": {
1286
+ "zebra": [
1287
+ "1"
1288
+ ]
1289
+ },
1290
+ "11ccf5e99d": {
1291
+ "person": [
1292
+ "2"
1293
+ ]
1294
+ },
1295
+ "11ce6f452e": {
1296
+ "person": [
1297
+ "1",
1298
+ "2",
1299
+ "3"
1300
+ ]
1301
+ },
1302
+ "11feabe596": {
1303
+ "rabbit": [
1304
+ "1"
1305
+ ]
1306
+ },
1307
+ "120cb9514d": {
1308
+ "person": [
1309
+ "1",
1310
+ "2",
1311
+ "3"
1312
+ ]
1313
+ },
1314
+ "12156b25b3": {
1315
+ "person": [
1316
+ "1"
1317
+ ]
1318
+ },
1319
+ "122896672d": {
1320
+ "person": [
1321
+ "1",
1322
+ "3"
1323
+ ]
1324
+ },
1325
+ "1233ac8596": {
1326
+ "dog": [
1327
+ "1"
1328
+ ]
1329
+ },
1330
+ "1239c87234": {
1331
+ "lizard": [
1332
+ "1"
1333
+ ]
1334
+ },
1335
+ "1250423f7c": {
1336
+ "person": [
1337
+ "2"
1338
+ ],
1339
+ "elephant": [
1340
+ "3",
1341
+ "4"
1342
+ ]
1343
+ },
1344
+ "1257a1bc67": {
1345
+ "snake": [
1346
+ "1"
1347
+ ]
1348
+ },
1349
+ "125d1b19dd": {
1350
+ "giant_panda": [
1351
+ "1",
1352
+ "2"
1353
+ ]
1354
+ },
1355
+ "126d203967": {
1356
+ "person": [
1357
+ "2"
1358
+ ]
1359
+ },
1360
+ "1295e19071": {
1361
+ "airplane": [
1362
+ "1"
1363
+ ]
1364
+ },
1365
+ "12ad198c54": {
1366
+ "person": [
1367
+ "1"
1368
+ ]
1369
+ },
1370
+ "12bddb2bcb": {
1371
+ "person": [
1372
+ "2"
1373
+ ]
1374
+ },
1375
+ "12ec9b93ee": {
1376
+ "giant_panda": [
1377
+ "1"
1378
+ ]
1379
+ },
1380
+ "12eebedc35": {
1381
+ "bird": [
1382
+ "1"
1383
+ ]
1384
+ },
1385
+ "132852e094": {
1386
+ "fox": [
1387
+ "1"
1388
+ ]
1389
+ },
1390
+ "1329409f2a": {
1391
+ "fish": [
1392
+ "1"
1393
+ ]
1394
+ },
1395
+ "13325cfa14": {
1396
+ "person": [
1397
+ "2"
1398
+ ]
1399
+ },
1400
+ "1336440745": {
1401
+ "mouse": [
1402
+ "1",
1403
+ "2"
1404
+ ]
1405
+ },
1406
+ "134d06dbf9": {
1407
+ "cat": [
1408
+ "1"
1409
+ ]
1410
+ },
1411
+ "135625b53d": {
1412
+ "parrot": [
1413
+ "1"
1414
+ ]
1415
+ },
1416
+ "13870016f9": {
1417
+ "person": [
1418
+ "1"
1419
+ ],
1420
+ "cow": [
1421
+ "2",
1422
+ "3"
1423
+ ]
1424
+ },
1425
+ "13960b3c84": {
1426
+ "giraffe": [
1427
+ "1",
1428
+ "2",
1429
+ "3"
1430
+ ]
1431
+ },
1432
+ "13adaad9d9": {
1433
+ "giant_panda": [
1434
+ "1"
1435
+ ]
1436
+ },
1437
+ "13ae097e20": {
1438
+ "giant_panda": [
1439
+ "1"
1440
+ ]
1441
+ },
1442
+ "13e3070469": {
1443
+ "zebra": [
1444
+ "1",
1445
+ "2",
1446
+ "3"
1447
+ ]
1448
+ },
1449
+ "13f6a8c20d": {
1450
+ "fish": [
1451
+ "1"
1452
+ ]
1453
+ },
1454
+ "1416925cf2": {
1455
+ "truck": [
1456
+ "1",
1457
+ "2"
1458
+ ]
1459
+ },
1460
+ "142d2621f5": {
1461
+ "person": [
1462
+ "1",
1463
+ "2"
1464
+ ],
1465
+ "motorbike": [
1466
+ "3"
1467
+ ]
1468
+ },
1469
+ "145d5d7c03": {
1470
+ "giant_panda": [
1471
+ "1"
1472
+ ]
1473
+ },
1474
+ "145fdc3ac5": {
1475
+ "lizard": [
1476
+ "1"
1477
+ ]
1478
+ },
1479
+ "1471274fa7": {
1480
+ "person": [
1481
+ "1"
1482
+ ]
1483
+ },
1484
+ "14a6b5a139": {
1485
+ "fish": [
1486
+ "1"
1487
+ ]
1488
+ },
1489
+ "14c21cea0d": {
1490
+ "monkey": [
1491
+ "1",
1492
+ "2"
1493
+ ]
1494
+ },
1495
+ "14dae0dc93": {
1496
+ "person": [
1497
+ "2"
1498
+ ]
1499
+ },
1500
+ "14f9bd22b5": {
1501
+ "tiger": [
1502
+ "1"
1503
+ ]
1504
+ },
1505
+ "14fd28ae99": {
1506
+ "parrot": [
1507
+ "1"
1508
+ ]
1509
+ },
1510
+ "15097d5d4e": {
1511
+ "parrot": [
1512
+ "1"
1513
+ ]
1514
+ },
1515
+ "150ea711f2": {
1516
+ "whale": [
1517
+ "1"
1518
+ ]
1519
+ },
1520
+ "1514e3563f": {
1521
+ "earless_seal": [
1522
+ "1",
1523
+ "2"
1524
+ ]
1525
+ },
1526
+ "152aaa3a9e": {
1527
+ "raccoon": [
1528
+ "1"
1529
+ ]
1530
+ },
1531
+ "152b7d3bd7": {
1532
+ "giant_panda": [
1533
+ "1"
1534
+ ]
1535
+ },
1536
+ "15617297cc": {
1537
+ "person": [
1538
+ "1"
1539
+ ]
1540
+ },
1541
+ "15abbe0c52": {
1542
+ "person": [
1543
+ "1"
1544
+ ]
1545
+ },
1546
+ "15d1fb3de5": {
1547
+ "cat": [
1548
+ "2"
1549
+ ],
1550
+ "owl": [
1551
+ "1"
1552
+ ]
1553
+ },
1554
+ "15f67b0fab": {
1555
+ "person": [
1556
+ "1"
1557
+ ]
1558
+ },
1559
+ "161eb59aad": {
1560
+ "cow": [
1561
+ "2",
1562
+ "3"
1563
+ ],
1564
+ "giraffe": [
1565
+ "1"
1566
+ ]
1567
+ },
1568
+ "16288ea47f": {
1569
+ "duck": [
1570
+ "1",
1571
+ "2"
1572
+ ]
1573
+ },
1574
+ "164410ce62": {
1575
+ "person": [
1576
+ "1"
1577
+ ]
1578
+ },
1579
+ "165c3c8cd4": {
1580
+ "person": [
1581
+ "1",
1582
+ "2",
1583
+ "3"
1584
+ ]
1585
+ },
1586
+ "165c42b41b": {
1587
+ "person": [
1588
+ "1",
1589
+ "4"
1590
+ ],
1591
+ "motorbike": [
1592
+ "2",
1593
+ "3"
1594
+ ]
1595
+ },
1596
+ "165ec9e22b": {
1597
+ "person": [
1598
+ "1",
1599
+ "2"
1600
+ ]
1601
+ },
1602
+ "1669502269": {
1603
+ "person": [
1604
+ "1"
1605
+ ]
1606
+ },
1607
+ "16763cccbb": {
1608
+ "ape": [
1609
+ "1"
1610
+ ]
1611
+ },
1612
+ "16adde065e": {
1613
+ "cat": [
1614
+ "2"
1615
+ ],
1616
+ "person": [
1617
+ "3"
1618
+ ]
1619
+ },
1620
+ "16af445362": {
1621
+ "airplane": [
1622
+ "1"
1623
+ ]
1624
+ },
1625
+ "16afd538ad": {
1626
+ "parrot": [
1627
+ "1",
1628
+ "2"
1629
+ ]
1630
+ },
1631
+ "16c3fa4d5d": {
1632
+ "sedan": [
1633
+ "1"
1634
+ ]
1635
+ },
1636
+ "16d1d65c27": {
1637
+ "monkey": [
1638
+ "1"
1639
+ ]
1640
+ },
1641
+ "16e8599e94": {
1642
+ "giant_panda": [
1643
+ "1"
1644
+ ]
1645
+ },
1646
+ "16fe9fb444": {
1647
+ "motorbike": [
1648
+ "1"
1649
+ ],
1650
+ "person": [
1651
+ "2"
1652
+ ]
1653
+ },
1654
+ "1705796b02": {
1655
+ "train": [
1656
+ "1"
1657
+ ]
1658
+ },
1659
+ "1724db7671": {
1660
+ "giant_panda": [
1661
+ "1"
1662
+ ]
1663
+ },
1664
+ "17418e81ea": {
1665
+ "shark": [
1666
+ "1"
1667
+ ]
1668
+ },
1669
+ "175169edbb": {
1670
+ "ape": [
1671
+ "1",
1672
+ "2"
1673
+ ]
1674
+ },
1675
+ "17622326fd": {
1676
+ "lizard": [
1677
+ "1"
1678
+ ]
1679
+ },
1680
+ "17656bae77": {
1681
+ "elephant": [
1682
+ "1"
1683
+ ]
1684
+ },
1685
+ "17b0d94172": {
1686
+ "airplane": [
1687
+ "1"
1688
+ ]
1689
+ },
1690
+ "17c220e4f6": {
1691
+ "giant_panda": [
1692
+ "1"
1693
+ ]
1694
+ },
1695
+ "17c7bcd146": {
1696
+ "train": [
1697
+ "1"
1698
+ ]
1699
+ },
1700
+ "17cb4afe89": {
1701
+ "tiger": [
1702
+ "1"
1703
+ ]
1704
+ },
1705
+ "17cd79a434": {
1706
+ "squirrel": [
1707
+ "1"
1708
+ ]
1709
+ },
1710
+ "17d18604c3": {
1711
+ "person": [
1712
+ "1",
1713
+ "2"
1714
+ ]
1715
+ },
1716
+ "17d8ca1a37": {
1717
+ "person": [
1718
+ "2"
1719
+ ],
1720
+ "owl": [
1721
+ "1"
1722
+ ]
1723
+ },
1724
+ "17e33f4330": {
1725
+ "monkey": [
1726
+ "1"
1727
+ ]
1728
+ },
1729
+ "17f7a6d805": {
1730
+ "snail": [
1731
+ "1"
1732
+ ]
1733
+ },
1734
+ "180abc8378": {
1735
+ "person": [
1736
+ "2"
1737
+ ],
1738
+ "owl": [
1739
+ "1"
1740
+ ]
1741
+ },
1742
+ "183ba3d652": {
1743
+ "person": [
1744
+ "2"
1745
+ ],
1746
+ "motorbike": [
1747
+ "3"
1748
+ ]
1749
+ },
1750
+ "185bf64702": {
1751
+ "zebra": [
1752
+ "1",
1753
+ "2"
1754
+ ]
1755
+ },
1756
+ "18913cc690": {
1757
+ "train": [
1758
+ "1"
1759
+ ]
1760
+ },
1761
+ "1892651815": {
1762
+ "camel": [
1763
+ "1"
1764
+ ]
1765
+ },
1766
+ "189ac8208a": {
1767
+ "giraffe": [
1768
+ "1",
1769
+ "2"
1770
+ ]
1771
+ },
1772
+ "189b44e92c": {
1773
+ "zebra": [
1774
+ "1"
1775
+ ]
1776
+ },
1777
+ "18ac264b76": {
1778
+ "person": [
1779
+ "2"
1780
+ ]
1781
+ },
1782
+ "18b245ab49": {
1783
+ "penguin": [
1784
+ "1",
1785
+ "2",
1786
+ "3",
1787
+ "4"
1788
+ ]
1789
+ },
1790
+ "18b5cebc34": {
1791
+ "mouse": [
1792
+ "1"
1793
+ ]
1794
+ },
1795
+ "18bad52083": {
1796
+ "parrot": [
1797
+ "1",
1798
+ "2"
1799
+ ]
1800
+ },
1801
+ "18bb5144d5": {
1802
+ "lizard": [
1803
+ "1"
1804
+ ]
1805
+ },
1806
+ "18c6f205c5": {
1807
+ "person": [
1808
+ "1",
1809
+ "2",
1810
+ "3"
1811
+ ]
1812
+ },
1813
+ "1903f9ea15": {
1814
+ "bird": [
1815
+ "1",
1816
+ "2",
1817
+ "3"
1818
+ ]
1819
+ },
1820
+ "1917b209f2": {
1821
+ "person": [
1822
+ "1"
1823
+ ],
1824
+ "cow": [
1825
+ "3",
1826
+ "4"
1827
+ ],
1828
+ "horse": [
1829
+ "2"
1830
+ ]
1831
+ },
1832
+ "191e74c01d": {
1833
+ "deer": [
1834
+ "1"
1835
+ ]
1836
+ },
1837
+ "19367bb94e": {
1838
+ "fish": [
1839
+ "1",
1840
+ "2",
1841
+ "3"
1842
+ ]
1843
+ },
1844
+ "193ffaa217": {
1845
+ "person": [
1846
+ "1",
1847
+ "2",
1848
+ "3"
1849
+ ]
1850
+ },
1851
+ "19696b67d3": {
1852
+ "cow": [
1853
+ "1"
1854
+ ]
1855
+ },
1856
+ "197f3ab6f3": {
1857
+ "giant_panda": [
1858
+ "1"
1859
+ ]
1860
+ },
1861
+ "1981e763cc": {
1862
+ "sheep": [
1863
+ "1",
1864
+ "2"
1865
+ ]
1866
+ },
1867
+ "198afe39ae": {
1868
+ "person": [
1869
+ "1"
1870
+ ]
1871
+ },
1872
+ "19a6e62b9b": {
1873
+ "monkey": [
1874
+ "1",
1875
+ "2"
1876
+ ]
1877
+ },
1878
+ "19b60d5335": {
1879
+ "hedgehog": [
1880
+ "1"
1881
+ ]
1882
+ },
1883
+ "19c00c11f9": {
1884
+ "person": [
1885
+ "1"
1886
+ ]
1887
+ },
1888
+ "19e061eb88": {
1889
+ "boat": [
1890
+ "1",
1891
+ "2"
1892
+ ]
1893
+ },
1894
+ "19e8bc6178": {
1895
+ "dog": [
1896
+ "1"
1897
+ ]
1898
+ },
1899
+ "19ee80dac6": {
1900
+ "person": [
1901
+ "1",
1902
+ "3",
1903
+ "4"
1904
+ ]
1905
+ },
1906
+ "1a25a9170a": {
1907
+ "person": [
1908
+ "2",
1909
+ "3"
1910
+ ],
1911
+ "cow": [
1912
+ "1"
1913
+ ]
1914
+ },
1915
+ "1a359a6c1a": {
1916
+ "sheep": [
1917
+ "1"
1918
+ ]
1919
+ },
1920
+ "1a3e87c566": {
1921
+ "frog": [
1922
+ "1"
1923
+ ]
1924
+ },
1925
+ "1a5fe06b00": {
1926
+ "bus": [
1927
+ "1"
1928
+ ]
1929
+ },
1930
+ "1a6c0fbd1e": {
1931
+ "person": [
1932
+ "1"
1933
+ ]
1934
+ },
1935
+ "1a6f3b5a4b": {
1936
+ "sedan": [
1937
+ "3"
1938
+ ]
1939
+ },
1940
+ "1a8afbad92": {
1941
+ "zebra": [
1942
+ "1",
1943
+ "2",
1944
+ "3"
1945
+ ]
1946
+ },
1947
+ "1a8bdc5842": {
1948
+ "parrot": [
1949
+ "1",
1950
+ "2"
1951
+ ]
1952
+ },
1953
+ "1a95752aca": {
1954
+ "duck": [
1955
+ "1",
1956
+ "2"
1957
+ ]
1958
+ },
1959
+ "1a9c131cb7": {
1960
+ "ape": [
1961
+ "1",
1962
+ "2",
1963
+ "3"
1964
+ ]
1965
+ },
1966
+ "1aa3da3ee3": {
1967
+ "sheep": [
1968
+ "1",
1969
+ "2",
1970
+ "3",
1971
+ "4"
1972
+ ]
1973
+ },
1974
+ "1ab27ec7ea": {
1975
+ "deer": [
1976
+ "1"
1977
+ ]
1978
+ },
1979
+ "1abf16d21d": {
1980
+ "turtle": [
1981
+ "1"
1982
+ ]
1983
+ },
1984
+ "1acd0f993b": {
1985
+ "person": [
1986
+ "3"
1987
+ ],
1988
+ "dog": [
1989
+ "1"
1990
+ ]
1991
+ },
1992
+ "1ad202e499": {
1993
+ "lizard": [
1994
+ "1",
1995
+ "2"
1996
+ ]
1997
+ },
1998
+ "1af8d2395d": {
1999
+ "person": [
2000
+ "1",
2001
+ "2"
2002
+ ],
2003
+ "airplane": [
2004
+ "4"
2005
+ ]
2006
+ },
2007
+ "1afd39a1fa": {
2008
+ "motorbike": [
2009
+ "2"
2010
+ ]
2011
+ },
2012
+ "1b2d31306f": {
2013
+ "lizard": [
2014
+ "1"
2015
+ ]
2016
+ },
2017
+ "1b3fa67f0e": {
2018
+ "airplane": [
2019
+ "1"
2020
+ ]
2021
+ },
2022
+ "1b43fa74b4": {
2023
+ "owl": [
2024
+ "1",
2025
+ "2"
2026
+ ]
2027
+ },
2028
+ "1b73ea9fc2": {
2029
+ "parrot": [
2030
+ "1"
2031
+ ]
2032
+ },
2033
+ "1b7e8bb255": {
2034
+ "person": [
2035
+ "2"
2036
+ ]
2037
+ },
2038
+ "1b8680f8cd": {
2039
+ "person": [
2040
+ "2",
2041
+ "3"
2042
+ ]
2043
+ },
2044
+ "1b883843c0": {
2045
+ "person": [
2046
+ "1",
2047
+ "2"
2048
+ ]
2049
+ },
2050
+ "1b8898785b": {
2051
+ "monkey": [
2052
+ "1",
2053
+ "2"
2054
+ ]
2055
+ },
2056
+ "1b88ba1aa4": {
2057
+ "giant_panda": [
2058
+ "1"
2059
+ ]
2060
+ },
2061
+ "1b96a498e5": {
2062
+ "ape": [
2063
+ "1"
2064
+ ]
2065
+ },
2066
+ "1bbc4c274f": {
2067
+ "fish": [
2068
+ "2"
2069
+ ]
2070
+ },
2071
+ "1bd87fe9ab": {
2072
+ "train": [
2073
+ "1"
2074
+ ]
2075
+ },
2076
+ "1c4090c75b": {
2077
+ "whale": [
2078
+ "1"
2079
+ ]
2080
+ },
2081
+ "1c41934f84": {
2082
+ "elephant": [
2083
+ "1",
2084
+ "2"
2085
+ ]
2086
+ },
2087
+ "1c72b04b56": {
2088
+ "lion": [
2089
+ "1"
2090
+ ]
2091
+ },
2092
+ "1c87955a3a": {
2093
+ "crocodile": [
2094
+ "1"
2095
+ ],
2096
+ "turtle": [
2097
+ "2"
2098
+ ]
2099
+ },
2100
+ "1c9f9eb792": {
2101
+ "person": [
2102
+ "2"
2103
+ ]
2104
+ },
2105
+ "1ca240fede": {
2106
+ "train": [
2107
+ "1"
2108
+ ]
2109
+ },
2110
+ "1ca5673803": {
2111
+ "person": [
2112
+ "1",
2113
+ "3"
2114
+ ]
2115
+ },
2116
+ "1cada35274": {
2117
+ "duck": [
2118
+ "1"
2119
+ ]
2120
+ },
2121
+ "1cb44b920d": {
2122
+ "eagle": [
2123
+ "1",
2124
+ "2"
2125
+ ]
2126
+ },
2127
+ "1cd10e62be": {
2128
+ "leopard": [
2129
+ "1"
2130
+ ]
2131
+ },
2132
+ "1d3087d5e5": {
2133
+ "fish": [
2134
+ "1",
2135
+ "2",
2136
+ "3",
2137
+ "4",
2138
+ "5"
2139
+ ]
2140
+ },
2141
+ "1d3685150a": {
2142
+ "person": [
2143
+ "1",
2144
+ "3"
2145
+ ]
2146
+ },
2147
+ "1d6ff083aa": {
2148
+ "person": [
2149
+ "1",
2150
+ "2"
2151
+ ]
2152
+ }
2153
+ }
mbench/numbered_valid_obj_ids_gpt-4o_nomask_randcap.json ADDED
@@ -0,0 +1,2153 @@
1
+ {
2
+ "003234408d": {
3
+ "penguin": [
4
+ "1",
5
+ "2",
6
+ "3",
7
+ "4",
8
+ "5"
9
+ ]
10
+ },
11
+ "0043f083b5": {
12
+ "sedan": [
13
+ "2",
14
+ "3"
15
+ ],
16
+ "bus": [
17
+ "1"
18
+ ]
19
+ },
20
+ "0044fa5fba": {
21
+ "giant_panda": [
22
+ "1"
23
+ ]
24
+ },
25
+ "005a527edd": {
26
+ "ape": [
27
+ "1",
28
+ "2"
29
+ ]
30
+ },
31
+ "0065b171f9": {
32
+ "giant_panda": [
33
+ "1"
34
+ ]
35
+ },
36
+ "00917dcfc4": {
37
+ "zebra": [
38
+ "1",
39
+ "2",
40
+ "3"
41
+ ]
42
+ },
43
+ "00a23ccf53": {
44
+ "shark": [
45
+ "1"
46
+ ]
47
+ },
48
+ "00ad5016a4": {
49
+ "airplane": [
50
+ "1"
51
+ ]
52
+ },
53
+ "01082ae388": {
54
+ "leopard": [
55
+ "1"
56
+ ]
57
+ },
58
+ "011ac0a06f": {
59
+ "ape": [
60
+ "1",
61
+ "2",
62
+ "3",
63
+ "4",
64
+ "5"
65
+ ]
66
+ },
67
+ "013099c098": {
68
+ "giant_panda": [
69
+ "1",
70
+ "2"
71
+ ]
72
+ },
73
+ "0155498c85": {
74
+ "motorbike": [
75
+ "2"
76
+ ],
77
+ "person": [
78
+ "1"
79
+ ]
80
+ },
81
+ "01694ad9c8": {
82
+ "bird": [
83
+ "1"
84
+ ]
85
+ },
86
+ "017ac35701": {
87
+ "giant_panda": [
88
+ "1"
89
+ ]
90
+ },
91
+ "01b80e8e1a": {
92
+ "zebra": [
93
+ "1",
94
+ "2"
95
+ ]
96
+ },
97
+ "01baa5a4e1": {},
98
+ "01c3111683": {
99
+ "whale": [
100
+ "1"
101
+ ]
102
+ },
103
+ "01c4cb5ffe": {
104
+ "person": [
105
+ "1",
106
+ "3"
107
+ ]
108
+ },
109
+ "01c76f0a82": {
110
+ "sedan": [
111
+ "1",
112
+ "4"
113
+ ]
114
+ },
115
+ "01c783268c": {
116
+ "ape": [
117
+ "1"
118
+ ],
119
+ "person": [
120
+ "2"
121
+ ]
122
+ },
123
+ "01e64dd36a": {
124
+ "cow": [
125
+ "1",
126
+ "2",
127
+ "3"
128
+ ]
129
+ },
130
+ "01ed275c6e": {
131
+ "giraffe": [
132
+ "1",
133
+ "2"
134
+ ]
135
+ },
136
+ "01ff60d1fa": {
137
+ "lizard": [
138
+ "1"
139
+ ]
140
+ },
141
+ "020cd28cd2": {
142
+ "person": [
143
+ "1"
144
+ ]
145
+ },
146
+ "02264db755": {
147
+ "fox": [
148
+ "1"
149
+ ]
150
+ },
151
+ "0248626d9a": {
152
+ "train": [
153
+ "1"
154
+ ]
155
+ },
156
+ "02668dbffa": {
157
+ "frog": [
158
+ "1"
159
+ ]
160
+ },
161
+ "0274193026": {
162
+ "person": [
163
+ "2"
164
+ ]
165
+ },
166
+ "02d28375aa": {
167
+ "fox": [
168
+ "1"
169
+ ]
170
+ },
171
+ "031ccc99b1": {
172
+ "person": [
173
+ "1",
174
+ "2",
175
+ "3"
176
+ ]
177
+ },
178
+ "0321b18c10": {
179
+ "elephant": [
180
+ "3"
181
+ ],
182
+ "person": [
183
+ "1",
184
+ "2"
185
+ ]
186
+ },
187
+ "0348a45bca": {
188
+ "fish": [
189
+ "1",
190
+ "2",
191
+ "3",
192
+ "4",
193
+ "5"
194
+ ]
195
+ },
196
+ "0355e92655": {
197
+ "boat": [
198
+ "3"
199
+ ],
200
+ "person": [
201
+ "2"
202
+ ]
203
+ },
204
+ "0358b938c1": {
205
+ "elephant": [
206
+ "1",
207
+ "2",
208
+ "3",
209
+ "4"
210
+ ]
211
+ },
212
+ "0368107cf1": {
213
+ "person": [
214
+ "1",
215
+ "2"
216
+ ]
217
+ },
218
+ "0379ddf557": {
219
+ "person": [
220
+ "1"
221
+ ]
222
+ },
223
+ "038b2cc71d": {
224
+ "lizard": [
225
+ "1"
226
+ ]
227
+ },
228
+ "038c15a5dd": {
229
+ "hedgehog": [
230
+ "1"
231
+ ]
232
+ },
233
+ "03a06cc98a": {
234
+ "giraffe": [
235
+ "1",
236
+ "2",
237
+ "3"
238
+ ]
239
+ },
240
+ "03a63e187f": {
241
+ "lizard": [
242
+ "1"
243
+ ]
244
+ },
245
+ "03c95b4dae": {
246
+ "elephant": [
247
+ "1",
248
+ "2",
249
+ "3"
250
+ ]
251
+ },
252
+ "03e2b57b0e": {
253
+ "lizard": [
254
+ "1"
255
+ ]
256
+ },
257
+ "04194e1248": {
258
+ "lizard": [
259
+ "1"
260
+ ]
261
+ },
262
+ "04259896e2": {
263
+ "lizard": [
264
+ "1"
265
+ ]
266
+ },
267
+ "0444918a5f": {
268
+ "truck": [
269
+ "1",
270
+ "2",
271
+ "3",
272
+ "4"
273
+ ]
274
+ },
275
+ "04460a7a52": {
276
+ "lizard": [
277
+ "1"
278
+ ]
279
+ },
280
+ "04474174a4": {
281
+ "ape": [
282
+ "1",
283
+ "2"
284
+ ]
285
+ },
286
+ "0450095513": {
287
+ "snail": [
288
+ "1"
289
+ ]
290
+ },
291
+ "045f00aed2": {
292
+ "person": [
293
+ "3"
294
+ ],
295
+ "tiger": [
296
+ "1"
297
+ ]
298
+ },
299
+ "04667fabaa": {
300
+ "parrot": [
301
+ "1"
302
+ ]
303
+ },
304
+ "04735c5030": {
305
+ "cat": [
306
+ "1",
307
+ "2"
308
+ ]
309
+ },
310
+ "04990d1915": {
311
+ "sedan": [
312
+ "1"
313
+ ],
314
+ "bus": [
315
+ "2"
316
+ ],
317
+ "truck": [
318
+ "3"
319
+ ]
320
+ },
321
+ "04d62d9d98": {
322
+ "person": [
323
+ "1"
324
+ ]
325
+ },
326
+ "04f21da964": {
327
+ "monkey": [
328
+ "1"
329
+ ]
330
+ },
331
+ "04fbad476e": {
332
+ "parrot": [
333
+ "1"
334
+ ]
335
+ },
336
+ "04fe256562": {
337
+ "motorbike": [
338
+ "1"
339
+ ],
340
+ "truck": [
341
+ "2"
342
+ ]
343
+ },
344
+ "0503bf89c9": {
345
+ "hedgehog": [
346
+ "1"
347
+ ]
348
+ },
349
+ "0536c9eed0": {
350
+ "cat": [
351
+ "1"
352
+ ]
353
+ },
354
+ "054acb238f": {
355
+ "owl": [
356
+ "1"
357
+ ]
358
+ },
359
+ "05579ca250": {
360
+ "sedan": [
361
+ "3"
362
+ ],
363
+ "person": [
364
+ "1"
365
+ ]
366
+ },
367
+ "056c200404": {},
368
+ "05774f3a2c": {
369
+ "ape": [
370
+ "1",
371
+ "2",
372
+ "3"
373
+ ]
374
+ },
375
+ "058a7592c8": {
376
+ "train": [
377
+ "1"
378
+ ]
379
+ },
380
+ "05a0a513df": {
381
+ "person": [
382
+ "1",
383
+ "2"
384
+ ]
385
+ },
386
+ "05a569d8aa": {
387
+ "mouse": [
388
+ "2"
389
+ ],
390
+ "cat": [
391
+ "1"
392
+ ]
393
+ },
394
+ "05aa652648": {
395
+ "ape": [
396
+ "1"
397
+ ]
398
+ },
399
+ "05d7715782": {},
400
+ "05e0b0f28f": {
401
+ "mouse": [
402
+ "1"
403
+ ],
404
+ "person": [
405
+ "2"
406
+ ]
407
+ },
408
+ "05fdbbdd7a": {},
409
+ "05ffcfed85": {
410
+ "monkey": [
411
+ "1",
412
+ "2"
413
+ ]
414
+ },
415
+ "0630391881": {
416
+ "person": [
417
+ "1"
418
+ ]
419
+ },
420
+ "06840b2bbe": {
421
+ "snake": [
422
+ "1"
423
+ ]
424
+ },
425
+ "068f7dce6f": {
426
+ "shark": [
427
+ "1"
428
+ ]
429
+ },
430
+ "0693719753": {
431
+ "turtle": [
432
+ "1",
433
+ "2"
434
+ ]
435
+ },
436
+ "06ce2b51fb": {
437
+ "person": [
438
+ "1",
439
+ "2"
440
+ ]
441
+ },
442
+ "06e224798e": {
443
+ "tiger": [
444
+ "1"
445
+ ]
446
+ },
447
+ "06ee361788": {
448
+ "duck": [
449
+ "1",
450
+ "2",
451
+ "3"
452
+ ]
453
+ },
454
+ "06fbb3fa2c": {
455
+ "eagle": [
456
+ "1"
457
+ ]
458
+ },
459
+ "0700264286": {
460
+ "cow": [
461
+ "1",
462
+ "2"
463
+ ]
464
+ },
465
+ "070c918ca7": {
466
+ "parrot": [
467
+ "1"
468
+ ]
469
+ },
470
+ "07129e14a4": {
471
+ "parrot": [
472
+ "1",
473
+ "2"
474
+ ],
475
+ "person": [
476
+ "3"
477
+ ]
478
+ },
479
+ "07177017e9": {
480
+ "motorbike": [
481
+ "1",
482
+ "2"
483
+ ]
484
+ },
485
+ "07238ffc58": {
486
+ "monkey": [
487
+ "1",
488
+ "2",
489
+ "3"
490
+ ]
491
+ },
492
+ "07353b2a89": {
493
+ "sheep": [
494
+ "1",
495
+ "2",
496
+ "3",
497
+ "4"
498
+ ]
499
+ },
500
+ "0738493cbf": {
501
+ "airplane": [
502
+ "1"
503
+ ]
504
+ },
505
+ "075926c651": {
506
+ "person": [
507
+ "1",
508
+ "2"
509
+ ]
510
+ },
511
+ "075c701292": {
512
+ "duck": [
513
+ "1",
514
+ "2",
515
+ "3",
516
+ "4"
517
+ ]
518
+ },
519
+ "0762ea9a30": {
520
+ "person": [
521
+ "1"
522
+ ]
523
+ },
524
+ "07652ee4af": {
525
+ "person": [
526
+ "1"
527
+ ]
528
+ },
529
+ "076f206928": {
530
+ "zebra": [
531
+ "1",
532
+ "2"
533
+ ],
534
+ "person": [
535
+ "3"
536
+ ]
537
+ },
538
+ "077d32af19": {
539
+ "train": [
540
+ "4"
541
+ ],
542
+ "person": [
543
+ "1",
544
+ "2",
545
+ "3"
546
+ ]
547
+ },
548
+ "079049275c": {
549
+ "mouse": [
550
+ "1"
551
+ ]
552
+ },
553
+ "07913cdda7": {
554
+ "train": [
555
+ "1"
556
+ ],
557
+ "person": [
558
+ "2",
559
+ "3"
560
+ ]
561
+ },
562
+ "07a11a35e8": {
563
+ "ape": [
564
+ "1",
565
+ "2"
566
+ ]
567
+ },
568
+ "07ac33b6df": {
569
+ "ape": [
570
+ "1"
571
+ ]
572
+ },
573
+ "07c62c3d11": {
574
+ "parrot": [
575
+ "1",
576
+ "2",
577
+ "3"
578
+ ]
579
+ },
580
+ "07cc1c7d74": {
581
+ "snake": [
582
+ "1"
583
+ ]
584
+ },
585
+ "080196ef01": {
586
+ "lizard": [
587
+ "1"
588
+ ]
589
+ },
590
+ "081207976e": {},
591
+ "081ae4fa44": {
592
+ "shark": [
593
+ "1",
594
+ "2"
595
+ ]
596
+ },
597
+ "081d8250cb": {
598
+ "sedan": [
599
+ "3"
600
+ ],
601
+ "person": [
602
+ "1"
603
+ ]
604
+ },
605
+ "082900c5d4": {
606
+ "duck": [
607
+ "1",
608
+ "2",
609
+ "3"
610
+ ]
611
+ },
612
+ "0860df21e2": {},
613
+ "0866d4c5e3": {
614
+ "bird": [
615
+ "1",
616
+ "2",
617
+ "3"
618
+ ]
619
+ },
620
+ "0891ac2eb6": {
621
+ "person": [
622
+ "1",
623
+ "2",
624
+ "3"
625
+ ]
626
+ },
627
+ "08931bc458": {
628
+ "person": [
629
+ "1"
630
+ ]
631
+ },
632
+ "08aa2705d5": {
633
+ "snake": [
634
+ "1"
635
+ ]
636
+ },
637
+ "08c8450db7": {},
638
+ "08d50b926c": {
639
+ "turtle": [
640
+ "1",
641
+ "2"
642
+ ]
643
+ },
644
+ "08e1e4de15": {
645
+ "monkey": [
646
+ "1",
647
+ "2",
648
+ "3",
649
+ "4"
650
+ ]
651
+ },
652
+ "08e48c1a48": {
653
+ "cow": [
654
+ "1"
655
+ ]
656
+ },
657
+ "08f561c65e": {
658
+ "person": [
659
+ "2"
660
+ ],
661
+ "giant_panda": [
662
+ "1"
663
+ ]
664
+ },
665
+ "08feb87790": {
666
+ "sheep": [
667
+ "1"
668
+ ]
669
+ },
670
+ "09049f6fe3": {
671
+ "mouse": [
672
+ "1",
673
+ "2"
674
+ ]
675
+ },
676
+ "092e4ff450": {
677
+ "snake": [
678
+ "1"
679
+ ]
680
+ },
681
+ "09338adea8": {
682
+ "whale": [
683
+ "1",
684
+ "2"
685
+ ]
686
+ },
687
+ "093c335ccc": {
688
+ "person": [
689
+ "2"
690
+ ]
691
+ },
692
+ "0970d28339": {
693
+ "ape": [
694
+ "1",
695
+ "2"
696
+ ]
697
+ },
698
+ "0974a213dc": {
699
+ "giraffe": [
700
+ "1",
701
+ "2",
702
+ "3"
703
+ ]
704
+ },
705
+ "097b471ed8": {
706
+ "cat": [
707
+ "1",
708
+ "2"
709
+ ]
710
+ },
711
+ "0990941758": {
712
+ "giant_panda": [
713
+ "1"
714
+ ]
715
+ },
716
+ "09a348f4fa": {
717
+ "lizard": [
718
+ "1"
719
+ ]
720
+ },
721
+ "09a6841288": {
722
+ "duck": [
723
+ "1",
724
+ "2"
725
+ ]
726
+ },
727
+ "09c5bad17b": {
728
+ "airplane": [
729
+ "1"
730
+ ]
731
+ },
732
+ "09c9ce80c7": {
733
+ "giant_panda": [
734
+ "1"
735
+ ]
736
+ },
737
+ "09ff54fef4": {
738
+ "fox": [
739
+ "1",
740
+ "2"
741
+ ]
742
+ },
743
+ "0a23765d15": {
744
+ "person": [
745
+ "1",
746
+ "2"
747
+ ]
748
+ },
749
+ "0a275e7f12": {
750
+ "elephant": [
751
+ "1"
752
+ ]
753
+ },
754
+ "0a2f2bd294": {
755
+ "motorbike": [
756
+ "1"
757
+ ]
758
+ },
759
+ "0a7a2514aa": {
760
+ "cat": [
761
+ "1"
762
+ ],
763
+ "lizard": [
764
+ "2"
765
+ ]
766
+ },
767
+ "0a7b27fde9": {
768
+ "parrot": [
769
+ "1",
770
+ "2"
771
+ ]
772
+ },
773
+ "0a8c467cc3": {
774
+ "fish": [
775
+ "1",
776
+ "2",
777
+ "3"
778
+ ]
779
+ },
780
+ "0ac8c560ae": {
781
+ "person": [
782
+ "2",
783
+ "3"
784
+ ]
785
+ },
786
+ "0b1627e896": {
787
+ "boat": [
788
+ "1"
789
+ ]
790
+ },
791
+ "0b285c47f6": {
792
+ "mouse": [
793
+ "1"
794
+ ]
795
+ },
796
+ "0b34ec1d55": {
797
+ "ape": [
798
+ "1"
799
+ ]
800
+ },
801
+ "0b5b5e8e5a": {
802
+ "sedan": [
803
+ "2"
804
+ ],
805
+ "person": [
806
+ "1"
807
+ ]
808
+ },
809
+ "0b68535614": {
810
+ "rabbit": [
811
+ "1"
812
+ ]
813
+ },
814
+ "0b6f9105fc": {
815
+ "rabbit": [
816
+ "1"
817
+ ]
818
+ },
819
+ "0b7dbfa3cb": {
820
+ "cow": [
821
+ "1"
822
+ ]
823
+ },
824
+ "0b9cea51ca": {
825
+ "whale": [
826
+ "1"
827
+ ]
828
+ },
829
+ "0b9d012be8": {
830
+ "camel": [
831
+ "1"
832
+ ]
833
+ },
834
+ "0bcfc4177d": {
835
+ "truck": [
836
+ "1"
837
+ ]
838
+ },
839
+ "0bd37b23c1": {
840
+ "motorbike": [
841
+ "1"
842
+ ]
843
+ },
844
+ "0bd864064c": {
845
+ "eagle": [
846
+ "1"
847
+ ]
848
+ },
849
+ "0c11c6bf7b": {
850
+ "deer": [
851
+ "1"
852
+ ]
853
+ },
854
+ "0c26bc77ac": {
855
+ "crocodile": [
856
+ "1"
857
+ ]
858
+ },
859
+ "0c3a04798c": {
860
+ "fish": [
861
+ "2"
862
+ ],
863
+ "duck": [
864
+ "1"
865
+ ]
866
+ },
867
+ "0c44a9d545": {
868
+ "tiger": [
869
+ "1"
870
+ ]
871
+ },
872
+ "0c817cc390": {
873
+ "hedgehog": [
874
+ "1"
875
+ ],
876
+ "dog": [
877
+ "2"
878
+ ]
879
+ },
880
+ "0ca839ee9a": {
881
+ "ape": [
882
+ "1",
883
+ "2"
884
+ ]
885
+ },
886
+ "0cd7ac0ac0": {
887
+ "rabbit": [
888
+ "1"
889
+ ]
890
+ },
891
+ "0ce06e0121": {
892
+ "parrot": [
893
+ "1",
894
+ "2"
895
+ ]
896
+ },
897
+ "0cfe974a89": {
898
+ "turtle": [
899
+ "1",
900
+ "2"
901
+ ]
902
+ },
903
+ "0d2fcc0dcd": {
904
+ "zebra": [
905
+ "1",
906
+ "2",
907
+ "3",
908
+ "4"
909
+ ]
910
+ },
911
+ "0d3aad05d2": {
912
+ "person": [
913
+ "1"
914
+ ]
915
+ },
916
+ "0d40b015f4": {
917
+ "person": [
918
+ "1"
919
+ ]
920
+ },
921
+ "0d97fba242": {
922
+ "dog": [
923
+ "1"
924
+ ],
925
+ "person": [
926
+ "2"
927
+ ]
928
+ },
929
+ "0d9cc80d7e": {
930
+ "person": [
931
+ "1",
932
+ "2",
933
+ "3"
934
+ ]
935
+ },
936
+ "0dab85b6d3": {
937
+ "lizard": [
938
+ "1",
939
+ "2"
940
+ ]
941
+ },
942
+ "0db5c427a5": {
943
+ "train": [
944
+ "1"
945
+ ]
946
+ },
947
+ "0dbaf284f1": {
948
+ "cat": [
949
+ "1",
950
+ "2"
951
+ ]
952
+ },
953
+ "0de4923598": {},
954
+ "0df28a9101": {
955
+ "turtle": [
956
+ "1",
957
+ "2",
958
+ "3"
959
+ ]
960
+ },
961
+ "0e04f636c4": {
962
+ "frog": [
963
+ "1"
964
+ ]
965
+ },
966
+ "0e05f0e232": {
967
+ "lizard": [
968
+ "1",
969
+ "2"
970
+ ]
971
+ },
972
+ "0e0930474b": {
973
+ "sedan": [
974
+ "1"
975
+ ],
976
+ "person": [
977
+ "2",
978
+ "3"
979
+ ]
980
+ },
981
+ "0e27472bea": {
982
+ "turtle": [
983
+ "1"
984
+ ]
985
+ },
986
+ "0e30020549": {
987
+ "parrot": [
988
+ "1"
989
+ ]
990
+ },
991
+ "0e621feb6c": {
992
+ "lizard": [
993
+ "1",
994
+ "2"
995
+ ]
996
+ },
997
+ "0e803c7d73": {},
998
+ "0e9ebe4e3c": {
999
+ "truck": [
1000
+ "1"
1001
+ ]
1002
+ },
1003
+ "0e9f2785ec": {
1004
+ "person": [
1005
+ "2"
1006
+ ]
1007
+ },
1008
+ "0ea68d418b": {
1009
+ "airplane": [
1010
+ "1"
1011
+ ]
1012
+ },
1013
+ "0eb403a222": {},
1014
+ "0ee92053d6": {
1015
+ "person": [
1016
+ "1"
1017
+ ]
1018
+ },
1019
+ "0eefca067f": {
1020
+ "giant_panda": [
1021
+ "1",
1022
+ "2"
1023
+ ]
1024
+ },
1025
+ "0f17fa6fcb": {
1026
+ "duck": [
1027
+ "1",
1028
+ "2",
1029
+ "3"
1030
+ ]
1031
+ },
1032
+ "0f1ac8e9a3": {
1033
+ "frog": [
1034
+ "1"
1035
+ ]
1036
+ },
1037
+ "0f202e9852": {
1038
+ "parrot": [
1039
+ "1"
1040
+ ]
1041
+ },
1042
+ "0f2ab8b1ff": {
1043
+ "dolphin": [
1044
+ "1",
1045
+ "2",
1046
+ "3"
1047
+ ]
1048
+ },
1049
+ "0f51a78756": {
1050
+ "sheep": [
1051
+ "1"
1052
+ ]
1053
+ },
1054
+ "0f5fbe16b0": {
1055
+ "raccoon": [
1056
+ "1",
1057
+ "2"
1058
+ ]
1059
+ },
1060
+ "0f6072077b": {
1061
+ "person": [
1062
+ "1",
1063
+ "2",
1064
+ "3"
1065
+ ]
1066
+ },
1067
+ "0f6b69b2f4": {
1068
+ "rabbit": [
1069
+ "1"
1070
+ ]
1071
+ },
1072
+ "0f6c2163de": {
1073
+ "snail": [
1074
+ "1"
1075
+ ]
1076
+ },
1077
+ "0f74ec5599": {
1078
+ "giant_panda": [
1079
+ "1"
1080
+ ]
1081
+ },
1082
+ "0f9683715b": {
1083
+ "elephant": [
1084
+ "1"
1085
+ ]
1086
+ },
1087
+ "0fa7b59356": {
1088
+ "duck": [
1089
+ "1"
1090
+ ]
1091
+ },
1092
+ "0fb173695b": {
1093
+ "person": [
1094
+ "3"
1095
+ ]
1096
+ },
1097
+ "0fc958cde2": {
1098
+ "owl": [
1099
+ "1"
1100
+ ]
1101
+ },
1102
+ "0fe7b1a621": {
1103
+ "parrot": [
1104
+ "1"
1105
+ ]
1106
+ },
1107
+ "0ffcdb491c": {
1108
+ "person": [
1109
+ "1",
1110
+ "2",
1111
+ "3"
1112
+ ]
1113
+ },
1114
+ "101caff7d4": {
1115
+ "giant_panda": [
1116
+ "1",
1117
+ "2"
1118
+ ]
1119
+ },
1120
+ "1022fe8417": {
1121
+ "person": [
1122
+ "1",
1123
+ "2",
1124
+ "3"
1125
+ ]
1126
+ },
1127
+ "1032e80b37": {
1128
+ "giraffe": [
1129
+ "1"
1130
+ ]
1131
+ },
1132
+ "103f501680": {
1133
+ "fish": [
1134
+ "1"
1135
+ ]
1136
+ },
1137
+ "104e64565f": {
1138
+ "elephant": [
1139
+ "1"
1140
+ ]
1141
+ },
1142
+ "104f1ab997": {
1143
+ "person": [
1144
+ "1",
1145
+ "2",
1146
+ "3"
1147
+ ]
1148
+ },
1149
+ "106242403f": {
1150
+ "person": [
1151
+ "1",
1152
+ "2"
1153
+ ]
1154
+ },
1155
+ "10b31f5431": {
1156
+ "person": [
1157
+ "1",
1158
+ "3",
1159
+ "4"
1160
+ ]
1161
+ },
1162
+ "10eced835e": {
1163
+ "giant_panda": [
1164
+ "1",
1165
+ "2"
1166
+ ]
1167
+ },
1168
+ "110d26fa3a": {
1169
+ "shark": [
1170
+ "1"
1171
+ ]
1172
+ },
1173
+ "1122c1d16a": {
1174
+ "parrot": [
1175
+ "1",
1176
+ "2",
1177
+ "3",
1178
+ "4",
1179
+ "5"
1180
+ ],
1181
+ "person": [
1182
+ "6"
1183
+ ]
1184
+ },
1185
+ "1145b49a5f": {
1186
+ "rabbit": [
1187
+ "1"
1188
+ ]
1189
+ },
1190
+ "11485838c2": {
1191
+ "giraffe": [
1192
+ "1",
1193
+ "2",
1194
+ "3"
1195
+ ]
1196
+ },
1197
+ "114e7676ec": {
1198
+ "person": [
1199
+ "1"
1200
+ ]
1201
+ },
1202
+ "1157472b95": {
1203
+ "parrot": [
1204
+ "1",
1205
+ "2"
1206
+ ]
1207
+ },
1208
+ "115ee1072c": {
1209
+ "cow": [
1210
+ "1"
1211
+ ]
1212
+ },
1213
+ "1171141012": {
1214
+ "turtle": [
1215
+ "1"
1216
+ ],
1217
+ "person": [
1218
+ "2"
1219
+ ]
1220
+ },
1221
+ "117757b4b8": {
1222
+ "snail": [
1223
+ "1"
1224
+ ]
1225
+ },
1226
+ "1178932d2f": {
1227
+ "motorbike": [
1228
+ "3"
1229
+ ],
1230
+ "person": [
1231
+ "1",
1232
+ "2"
1233
+ ]
1234
+ },
1235
+ "117cc76bda": {
1236
+ "whale": [
1237
+ "1"
1238
+ ]
1239
+ },
1240
+ "1180cbf814": {
1241
+ "fish": [
1242
+ "1",
1243
+ "2"
1244
+ ]
1245
+ },
1246
+ "1187bbd0e3": {
1247
+ "cat": [
1248
+ "1"
1249
+ ]
1250
+ },
1251
+ "1197e44b26": {
1252
+ "giant_panda": [
1253
+ "1"
1254
+ ]
1255
+ },
1256
+ "119cf20728": {
1257
+ "lizard": [
1258
+ "1"
1259
+ ]
1260
+ },
1261
+ "119dd54871": {
1262
+ "lion": [
1263
+ "1",
1264
+ "2"
1265
+ ]
1266
+ },
1267
+ "11a0c3b724": {
1268
+ "mouse": [
1269
+ "1",
1270
+ "2"
1271
+ ]
1272
+ },
1273
+ "11a6ba8c94": {
1274
+ "person": [
1275
+ "1",
1276
+ "2"
1277
+ ]
1278
+ },
1279
+ "11c722a456": {
1280
+ "turtle": [
1281
+ "1",
1282
+ "2"
1283
+ ]
1284
+ },
1285
+ "11cbcb0b4d": {
1286
+ "zebra": [
1287
+ "1"
1288
+ ]
1289
+ },
1290
+ "11ccf5e99d": {
1291
+ "person": [
1292
+ "2"
1293
+ ]
1294
+ },
1295
+ "11ce6f452e": {
1296
+ "person": [
1297
+ "1",
1298
+ "2",
1299
+ "3"
1300
+ ]
1301
+ },
1302
+ "11feabe596": {
1303
+ "rabbit": [
1304
+ "1"
1305
+ ]
1306
+ },
1307
+ "120cb9514d": {
1308
+ "person": [
1309
+ "1",
1310
+ "2",
1311
+ "3"
1312
+ ]
1313
+ },
1314
+ "12156b25b3": {
1315
+ "person": [
1316
+ "1"
1317
+ ]
1318
+ },
1319
+ "122896672d": {
1320
+ "person": [
1321
+ "1",
1322
+ "3"
1323
+ ]
1324
+ },
1325
+ "1233ac8596": {
1326
+ "dog": [
1327
+ "1"
1328
+ ]
1329
+ },
1330
+ "1239c87234": {
1331
+ "lizard": [
1332
+ "1"
1333
+ ]
1334
+ },
1335
+ "1250423f7c": {
1336
+ "elephant": [
1337
+ "3",
1338
+ "4"
1339
+ ],
1340
+ "person": [
1341
+ "2"
1342
+ ]
1343
+ },
1344
+ "1257a1bc67": {
1345
+ "snake": [
1346
+ "1"
1347
+ ]
1348
+ },
1349
+ "125d1b19dd": {
1350
+ "giant_panda": [
1351
+ "1",
1352
+ "2"
1353
+ ]
1354
+ },
1355
+ "126d203967": {
1356
+ "person": [
1357
+ "2"
1358
+ ]
1359
+ },
1360
+ "1295e19071": {
1361
+ "airplane": [
1362
+ "1"
1363
+ ]
1364
+ },
1365
+ "12ad198c54": {
1366
+ "person": [
1367
+ "1"
1368
+ ]
1369
+ },
1370
+ "12bddb2bcb": {
1371
+ "person": [
1372
+ "2"
1373
+ ]
1374
+ },
1375
+ "12ec9b93ee": {
1376
+ "giant_panda": [
1377
+ "1"
1378
+ ]
1379
+ },
1380
+ "12eebedc35": {
1381
+ "bird": [
1382
+ "1"
1383
+ ]
1384
+ },
1385
+ "132852e094": {
1386
+ "fox": [
1387
+ "1"
1388
+ ]
1389
+ },
1390
+ "1329409f2a": {
1391
+ "fish": [
1392
+ "1"
1393
+ ]
1394
+ },
1395
+ "13325cfa14": {
1396
+ "person": [
1397
+ "2"
1398
+ ]
1399
+ },
1400
+ "1336440745": {
1401
+ "mouse": [
1402
+ "1",
1403
+ "2"
1404
+ ]
1405
+ },
1406
+ "134d06dbf9": {
1407
+ "cat": [
1408
+ "1"
1409
+ ]
1410
+ },
1411
+ "135625b53d": {
1412
+ "parrot": [
1413
+ "1"
1414
+ ]
1415
+ },
1416
+ "13870016f9": {
1417
+ "cow": [
1418
+ "2",
1419
+ "3"
1420
+ ],
1421
+ "person": [
1422
+ "1"
1423
+ ]
1424
+ },
1425
+ "13960b3c84": {
1426
+ "giraffe": [
1427
+ "1",
1428
+ "2",
1429
+ "3"
1430
+ ]
1431
+ },
1432
+ "13adaad9d9": {
1433
+ "giant_panda": [
1434
+ "1"
1435
+ ]
1436
+ },
1437
+ "13ae097e20": {
1438
+ "giant_panda": [
1439
+ "1"
1440
+ ]
1441
+ },
1442
+ "13e3070469": {
1443
+ "zebra": [
1444
+ "1",
1445
+ "2",
1446
+ "3"
1447
+ ]
1448
+ },
1449
+ "13f6a8c20d": {
1450
+ "fish": [
1451
+ "1"
1452
+ ]
1453
+ },
1454
+ "1416925cf2": {
1455
+ "truck": [
1456
+ "1",
1457
+ "2"
1458
+ ]
1459
+ },
1460
+ "142d2621f5": {
1461
+ "motorbike": [
1462
+ "3"
1463
+ ],
1464
+ "person": [
1465
+ "1",
1466
+ "2"
1467
+ ]
1468
+ },
1469
+ "145d5d7c03": {
1470
+ "giant_panda": [
1471
+ "1"
1472
+ ]
1473
+ },
1474
+ "145fdc3ac5": {
1475
+ "lizard": [
1476
+ "1"
1477
+ ]
1478
+ },
1479
+ "1471274fa7": {
1480
+ "person": [
1481
+ "1"
1482
+ ]
1483
+ },
1484
+ "14a6b5a139": {
1485
+ "fish": [
1486
+ "1"
1487
+ ]
1488
+ },
1489
+ "14c21cea0d": {
1490
+ "monkey": [
1491
+ "1",
1492
+ "2"
1493
+ ]
1494
+ },
1495
+ "14dae0dc93": {
1496
+ "person": [
1497
+ "2"
1498
+ ]
1499
+ },
1500
+ "14f9bd22b5": {
1501
+ "tiger": [
1502
+ "1"
1503
+ ]
1504
+ },
1505
+ "14fd28ae99": {
1506
+ "parrot": [
1507
+ "1"
1508
+ ]
1509
+ },
1510
+ "15097d5d4e": {
1511
+ "parrot": [
1512
+ "1"
1513
+ ]
1514
+ },
1515
+ "150ea711f2": {
1516
+ "whale": [
1517
+ "1"
1518
+ ]
1519
+ },
1520
+ "1514e3563f": {
1521
+ "earless_seal": [
1522
+ "1",
1523
+ "2"
1524
+ ]
1525
+ },
1526
+ "152aaa3a9e": {
1527
+ "raccoon": [
1528
+ "1"
1529
+ ]
1530
+ },
1531
+ "152b7d3bd7": {
1532
+ "giant_panda": [
1533
+ "1"
1534
+ ]
1535
+ },
1536
+ "15617297cc": {
1537
+ "person": [
1538
+ "1"
1539
+ ]
1540
+ },
1541
+ "15abbe0c52": {
1542
+ "person": [
1543
+ "1"
1544
+ ]
1545
+ },
1546
+ "15d1fb3de5": {
1547
+ "owl": [
1548
+ "1"
1549
+ ],
1550
+ "cat": [
1551
+ "2"
1552
+ ]
1553
+ },
1554
+ "15f67b0fab": {
1555
+ "person": [
1556
+ "1"
1557
+ ]
1558
+ },
1559
+ "161eb59aad": {
1560
+ "giraffe": [
1561
+ "1"
1562
+ ],
1563
+ "cow": [
1564
+ "2",
1565
+ "3"
1566
+ ]
1567
+ },
1568
+ "16288ea47f": {
1569
+ "duck": [
1570
+ "1",
1571
+ "2"
1572
+ ]
1573
+ },
1574
+ "164410ce62": {
1575
+ "person": [
1576
+ "1"
1577
+ ]
1578
+ },
1579
+ "165c3c8cd4": {
1580
+ "person": [
1581
+ "1",
1582
+ "2",
1583
+ "3"
1584
+ ]
1585
+ },
1586
+ "165c42b41b": {
1587
+ "motorbike": [
1588
+ "2",
1589
+ "3"
1590
+ ],
1591
+ "person": [
1592
+ "1",
1593
+ "4"
1594
+ ]
1595
+ },
1596
+ "165ec9e22b": {
1597
+ "person": [
1598
+ "1",
1599
+ "2"
1600
+ ]
1601
+ },
1602
+ "1669502269": {
1603
+ "person": [
1604
+ "1"
1605
+ ]
1606
+ },
1607
+ "16763cccbb": {
1608
+ "ape": [
1609
+ "1"
1610
+ ]
1611
+ },
1612
+ "16adde065e": {
1613
+ "cat": [
1614
+ "2"
1615
+ ],
1616
+ "person": [
1617
+ "3"
1618
+ ]
1619
+ },
1620
+ "16af445362": {
1621
+ "airplane": [
1622
+ "1"
1623
+ ]
1624
+ },
1625
+ "16afd538ad": {
1626
+ "parrot": [
1627
+ "1",
1628
+ "2"
1629
+ ]
1630
+ },
1631
+ "16c3fa4d5d": {
1632
+ "sedan": [
1633
+ "1"
1634
+ ]
1635
+ },
1636
+ "16d1d65c27": {
1637
+ "monkey": [
1638
+ "1"
1639
+ ]
1640
+ },
1641
+ "16e8599e94": {
1642
+ "giant_panda": [
1643
+ "1"
1644
+ ]
1645
+ },
1646
+ "16fe9fb444": {
1647
+ "motorbike": [
1648
+ "1"
1649
+ ],
1650
+ "person": [
1651
+ "2"
1652
+ ]
1653
+ },
1654
+ "1705796b02": {
1655
+ "train": [
1656
+ "1"
1657
+ ]
1658
+ },
1659
+ "1724db7671": {
1660
+ "giant_panda": [
1661
+ "1"
1662
+ ]
1663
+ },
1664
+ "17418e81ea": {
1665
+ "shark": [
1666
+ "1"
1667
+ ]
1668
+ },
1669
+ "175169edbb": {
1670
+ "ape": [
1671
+ "1",
1672
+ "2"
1673
+ ]
1674
+ },
1675
+ "17622326fd": {
1676
+ "lizard": [
1677
+ "1"
1678
+ ]
1679
+ },
1680
+ "17656bae77": {
1681
+ "elephant": [
1682
+ "1"
1683
+ ]
1684
+ },
1685
+ "17b0d94172": {
1686
+ "airplane": [
1687
+ "1"
1688
+ ]
1689
+ },
1690
+ "17c220e4f6": {
1691
+ "giant_panda": [
1692
+ "1"
1693
+ ]
1694
+ },
1695
+ "17c7bcd146": {
1696
+ "train": [
1697
+ "1"
1698
+ ]
1699
+ },
1700
+ "17cb4afe89": {
1701
+ "tiger": [
1702
+ "1"
1703
+ ]
1704
+ },
1705
+ "17cd79a434": {
1706
+ "squirrel": [
1707
+ "1"
1708
+ ]
1709
+ },
1710
+ "17d18604c3": {
1711
+ "person": [
1712
+ "1",
1713
+ "2"
1714
+ ]
1715
+ },
1716
+ "17d8ca1a37": {
1717
+ "owl": [
1718
+ "1"
1719
+ ],
1720
+ "person": [
1721
+ "2"
1722
+ ]
1723
+ },
1724
+ "17e33f4330": {
1725
+ "monkey": [
1726
+ "1"
1727
+ ]
1728
+ },
1729
+ "17f7a6d805": {
1730
+ "snail": [
1731
+ "1"
1732
+ ]
1733
+ },
1734
+ "180abc8378": {
1735
+ "owl": [
1736
+ "1"
1737
+ ],
1738
+ "person": [
1739
+ "2"
1740
+ ]
1741
+ },
1742
+ "183ba3d652": {
1743
+ "motorbike": [
1744
+ "3"
1745
+ ],
1746
+ "person": [
1747
+ "2"
1748
+ ]
1749
+ },
1750
+ "185bf64702": {
1751
+ "zebra": [
1752
+ "1",
1753
+ "2"
1754
+ ]
1755
+ },
1756
+ "18913cc690": {
1757
+ "train": [
1758
+ "1"
1759
+ ]
1760
+ },
1761
+ "1892651815": {
1762
+ "camel": [
1763
+ "1"
1764
+ ]
1765
+ },
1766
+ "189ac8208a": {
1767
+ "giraffe": [
1768
+ "1",
1769
+ "2"
1770
+ ]
1771
+ },
1772
+ "189b44e92c": {
1773
+ "zebra": [
1774
+ "1"
1775
+ ]
1776
+ },
1777
+ "18ac264b76": {
1778
+ "person": [
1779
+ "2"
1780
+ ]
1781
+ },
1782
+ "18b245ab49": {
1783
+ "penguin": [
1784
+ "1",
1785
+ "2",
1786
+ "3",
1787
+ "4"
1788
+ ]
1789
+ },
1790
+ "18b5cebc34": {
1791
+ "mouse": [
1792
+ "1"
1793
+ ]
1794
+ },
1795
+ "18bad52083": {
1796
+ "parrot": [
1797
+ "1",
1798
+ "2"
1799
+ ]
1800
+ },
1801
+ "18bb5144d5": {
1802
+ "lizard": [
1803
+ "1"
1804
+ ]
1805
+ },
1806
+ "18c6f205c5": {
1807
+ "person": [
1808
+ "1",
1809
+ "2",
1810
+ "3"
1811
+ ]
1812
+ },
1813
+ "1903f9ea15": {
1814
+ "bird": [
1815
+ "1",
1816
+ "2",
1817
+ "3"
1818
+ ]
1819
+ },
1820
+ "1917b209f2": {
1821
+ "cow": [
1822
+ "3",
1823
+ "4"
1824
+ ],
1825
+ "horse": [
1826
+ "2"
1827
+ ],
1828
+ "person": [
1829
+ "1"
1830
+ ]
1831
+ },
1832
+ "191e74c01d": {
1833
+ "deer": [
1834
+ "1"
1835
+ ]
1836
+ },
1837
+ "19367bb94e": {
1838
+ "fish": [
1839
+ "1",
1840
+ "2",
1841
+ "3"
1842
+ ]
1843
+ },
1844
+ "193ffaa217": {
1845
+ "person": [
1846
+ "1",
1847
+ "2",
1848
+ "3"
1849
+ ]
1850
+ },
1851
+ "19696b67d3": {
1852
+ "cow": [
1853
+ "1"
1854
+ ]
1855
+ },
1856
+ "197f3ab6f3": {
1857
+ "giant_panda": [
1858
+ "1"
1859
+ ]
1860
+ },
1861
+ "1981e763cc": {
1862
+ "sheep": [
1863
+ "1",
1864
+ "2"
1865
+ ]
1866
+ },
1867
+ "198afe39ae": {
1868
+ "person": [
1869
+ "1"
1870
+ ]
1871
+ },
1872
+ "19a6e62b9b": {
1873
+ "monkey": [
1874
+ "1",
1875
+ "2"
1876
+ ]
1877
+ },
1878
+ "19b60d5335": {
1879
+ "hedgehog": [
1880
+ "1"
1881
+ ]
1882
+ },
1883
+ "19c00c11f9": {
1884
+ "person": [
1885
+ "1"
1886
+ ]
1887
+ },
1888
+ "19e061eb88": {
1889
+ "boat": [
1890
+ "1",
1891
+ "2"
1892
+ ]
1893
+ },
1894
+ "19e8bc6178": {
1895
+ "dog": [
1896
+ "1"
1897
+ ]
1898
+ },
1899
+ "19ee80dac6": {
1900
+ "person": [
1901
+ "1",
1902
+ "3",
1903
+ "4"
1904
+ ]
1905
+ },
1906
+ "1a25a9170a": {
1907
+ "person": [
1908
+ "2",
1909
+ "3"
1910
+ ],
1911
+ "cow": [
1912
+ "1"
1913
+ ]
1914
+ },
1915
+ "1a359a6c1a": {
1916
+ "sheep": [
1917
+ "1"
1918
+ ]
1919
+ },
1920
+ "1a3e87c566": {
1921
+ "frog": [
1922
+ "1"
1923
+ ]
1924
+ },
1925
+ "1a5fe06b00": {
1926
+ "bus": [
1927
+ "1"
1928
+ ]
1929
+ },
1930
+ "1a6c0fbd1e": {
1931
+ "person": [
1932
+ "1"
1933
+ ]
1934
+ },
1935
+ "1a6f3b5a4b": {
1936
+ "sedan": [
1937
+ "3"
1938
+ ]
1939
+ },
1940
+ "1a8afbad92": {
1941
+ "zebra": [
1942
+ "1",
1943
+ "2",
1944
+ "3"
1945
+ ]
1946
+ },
1947
+ "1a8bdc5842": {
1948
+ "parrot": [
1949
+ "1",
1950
+ "2"
1951
+ ]
1952
+ },
1953
+ "1a95752aca": {
1954
+ "duck": [
1955
+ "1",
1956
+ "2"
1957
+ ]
1958
+ },
1959
+ "1a9c131cb7": {
1960
+ "ape": [
1961
+ "1",
1962
+ "2",
1963
+ "3"
1964
+ ]
1965
+ },
1966
+ "1aa3da3ee3": {
1967
+ "sheep": [
1968
+ "1",
1969
+ "2",
1970
+ "3",
1971
+ "4"
1972
+ ]
1973
+ },
1974
+ "1ab27ec7ea": {
1975
+ "deer": [
1976
+ "1"
1977
+ ]
1978
+ },
1979
+ "1abf16d21d": {
1980
+ "turtle": [
1981
+ "1"
1982
+ ]
1983
+ },
1984
+ "1acd0f993b": {
1985
+ "dog": [
1986
+ "1"
1987
+ ],
1988
+ "person": [
1989
+ "3"
1990
+ ]
1991
+ },
1992
+ "1ad202e499": {
1993
+ "lizard": [
1994
+ "1",
1995
+ "2"
1996
+ ]
1997
+ },
1998
+ "1af8d2395d": {
1999
+ "airplane": [
2000
+ "4"
2001
+ ],
2002
+ "person": [
2003
+ "1",
2004
+ "2"
2005
+ ]
2006
+ },
2007
+ "1afd39a1fa": {
2008
+ "motorbike": [
2009
+ "2"
2010
+ ]
2011
+ },
2012
+ "1b2d31306f": {
2013
+ "lizard": [
2014
+ "1"
2015
+ ]
2016
+ },
2017
+ "1b3fa67f0e": {
2018
+ "airplane": [
2019
+ "1"
2020
+ ]
2021
+ },
2022
+ "1b43fa74b4": {
2023
+ "owl": [
2024
+ "1",
2025
+ "2"
2026
+ ]
2027
+ },
2028
+ "1b73ea9fc2": {
2029
+ "parrot": [
2030
+ "1"
2031
+ ]
2032
+ },
2033
+ "1b7e8bb255": {
2034
+ "person": [
2035
+ "2"
2036
+ ]
2037
+ },
2038
+ "1b8680f8cd": {
2039
+ "person": [
2040
+ "2",
2041
+ "3"
2042
+ ]
2043
+ },
2044
+ "1b883843c0": {
2045
+ "person": [
2046
+ "1",
2047
+ "2"
2048
+ ]
2049
+ },
2050
+ "1b8898785b": {
2051
+ "monkey": [
2052
+ "1",
2053
+ "2"
2054
+ ]
2055
+ },
2056
+ "1b88ba1aa4": {
2057
+ "giant_panda": [
2058
+ "1"
2059
+ ]
2060
+ },
2061
+ "1b96a498e5": {
2062
+ "ape": [
2063
+ "1"
2064
+ ]
2065
+ },
2066
+ "1bbc4c274f": {
2067
+ "fish": [
2068
+ "2"
2069
+ ]
2070
+ },
2071
+ "1bd87fe9ab": {
2072
+ "train": [
2073
+ "1"
2074
+ ]
2075
+ },
2076
+ "1c4090c75b": {
2077
+ "whale": [
2078
+ "1"
2079
+ ]
2080
+ },
2081
+ "1c41934f84": {
2082
+ "elephant": [
2083
+ "1",
2084
+ "2"
2085
+ ]
2086
+ },
2087
+ "1c72b04b56": {
2088
+ "lion": [
2089
+ "1"
2090
+ ]
2091
+ },
2092
+ "1c87955a3a": {
2093
+ "crocodile": [
2094
+ "1"
2095
+ ],
2096
+ "turtle": [
2097
+ "2"
2098
+ ]
2099
+ },
2100
+ "1c9f9eb792": {
2101
+ "person": [
2102
+ "2"
2103
+ ]
2104
+ },
2105
+ "1ca240fede": {
2106
+ "train": [
2107
+ "1"
2108
+ ]
2109
+ },
2110
+ "1ca5673803": {
2111
+ "person": [
2112
+ "1",
2113
+ "3"
2114
+ ]
2115
+ },
2116
+ "1cada35274": {
2117
+ "duck": [
2118
+ "1"
2119
+ ]
2120
+ },
2121
+ "1cb44b920d": {
2122
+ "eagle": [
2123
+ "1",
2124
+ "2"
2125
+ ]
2126
+ },
2127
+ "1cd10e62be": {
2128
+ "leopard": [
2129
+ "1"
2130
+ ]
2131
+ },
2132
+ "1d3087d5e5": {
2133
+ "fish": [
2134
+ "1",
2135
+ "2",
2136
+ "3",
2137
+ "4",
2138
+ "5"
2139
+ ]
2140
+ },
2141
+ "1d3685150a": {
2142
+ "person": [
2143
+ "1",
2144
+ "3"
2145
+ ]
2146
+ },
2147
+ "1d6ff083aa": {
2148
+ "person": [
2149
+ "1",
2150
+ "2"
2151
+ ]
2152
+ }
2153
+ }
mbench/numbered_valid_obj_ids_gpt-4o_randcap.json ADDED
@@ -0,0 +1,2153 @@
1
+ {
2
+ "003234408d": {
3
+ "penguin": [
4
+ "1",
5
+ "2",
6
+ "3",
7
+ "4",
8
+ "5"
9
+ ]
10
+ },
11
+ "0043f083b5": {
12
+ "bus": [
13
+ "1"
14
+ ],
15
+ "sedan": [
16
+ "2",
17
+ "3"
18
+ ]
19
+ },
20
+ "0044fa5fba": {
21
+ "giant_panda": [
22
+ "1"
23
+ ]
24
+ },
25
+ "005a527edd": {
26
+ "ape": [
27
+ "1",
28
+ "2"
29
+ ]
30
+ },
31
+ "0065b171f9": {
32
+ "giant_panda": [
33
+ "1"
34
+ ]
35
+ },
36
+ "00917dcfc4": {
37
+ "zebra": [
38
+ "1",
39
+ "2",
40
+ "3"
41
+ ]
42
+ },
43
+ "00a23ccf53": {
44
+ "shark": [
45
+ "1"
46
+ ]
47
+ },
48
+ "00ad5016a4": {
49
+ "airplane": [
50
+ "1"
51
+ ]
52
+ },
53
+ "01082ae388": {
54
+ "leopard": [
55
+ "1"
56
+ ]
57
+ },
58
+ "011ac0a06f": {
59
+ "ape": [
60
+ "1",
61
+ "2",
62
+ "3",
63
+ "4",
64
+ "5"
65
+ ]
66
+ },
67
+ "013099c098": {
68
+ "giant_panda": [
69
+ "1",
70
+ "2"
71
+ ]
72
+ },
73
+ "0155498c85": {
74
+ "motorbike": [
75
+ "2"
76
+ ],
77
+ "person": [
78
+ "1"
79
+ ]
80
+ },
81
+ "01694ad9c8": {
82
+ "bird": [
83
+ "1"
84
+ ]
85
+ },
86
+ "017ac35701": {
87
+ "giant_panda": [
88
+ "1"
89
+ ]
90
+ },
91
+ "01b80e8e1a": {
92
+ "zebra": [
93
+ "1",
94
+ "2"
95
+ ]
96
+ },
97
+ "01baa5a4e1": {},
98
+ "01c3111683": {
99
+ "whale": [
100
+ "1"
101
+ ]
102
+ },
103
+ "01c4cb5ffe": {
104
+ "person": [
105
+ "1",
106
+ "3"
107
+ ]
108
+ },
109
+ "01c76f0a82": {
110
+ "sedan": [
111
+ "1",
112
+ "4"
113
+ ]
114
+ },
115
+ "01c783268c": {
116
+ "person": [
117
+ "2"
118
+ ],
119
+ "ape": [
120
+ "1"
121
+ ]
122
+ },
123
+ "01e64dd36a": {
124
+ "cow": [
125
+ "1",
126
+ "2",
127
+ "3"
128
+ ]
129
+ },
130
+ "01ed275c6e": {
131
+ "giraffe": [
132
+ "1",
133
+ "2"
134
+ ]
135
+ },
136
+ "01ff60d1fa": {
137
+ "lizard": [
138
+ "1"
139
+ ]
140
+ },
141
+ "020cd28cd2": {
142
+ "person": [
143
+ "1"
144
+ ]
145
+ },
146
+ "02264db755": {
147
+ "fox": [
148
+ "1"
149
+ ]
150
+ },
151
+ "0248626d9a": {
152
+ "train": [
153
+ "1"
154
+ ]
155
+ },
156
+ "02668dbffa": {
157
+ "frog": [
158
+ "1"
159
+ ]
160
+ },
161
+ "0274193026": {
162
+ "person": [
163
+ "2"
164
+ ]
165
+ },
166
+ "02d28375aa": {
167
+ "fox": [
168
+ "1"
169
+ ]
170
+ },
171
+ "031ccc99b1": {
172
+ "person": [
173
+ "1",
174
+ "2",
175
+ "3"
176
+ ]
177
+ },
178
+ "0321b18c10": {
179
+ "elephant": [
180
+ "3"
181
+ ],
182
+ "person": [
183
+ "1",
184
+ "2"
185
+ ]
186
+ },
187
+ "0348a45bca": {
188
+ "fish": [
189
+ "1",
190
+ "2",
191
+ "3",
192
+ "4",
193
+ "5"
194
+ ]
195
+ },
196
+ "0355e92655": {
197
+ "boat": [
198
+ "3"
199
+ ],
200
+ "person": [
201
+ "2"
202
+ ]
203
+ },
204
+ "0358b938c1": {
205
+ "elephant": [
206
+ "1",
207
+ "2",
208
+ "3",
209
+ "4"
210
+ ]
211
+ },
212
+ "0368107cf1": {
213
+ "person": [
214
+ "1",
215
+ "2"
216
+ ]
217
+ },
218
+ "0379ddf557": {
219
+ "person": [
220
+ "1"
221
+ ]
222
+ },
223
+ "038b2cc71d": {
224
+ "lizard": [
225
+ "1"
226
+ ]
227
+ },
228
+ "038c15a5dd": {
229
+ "hedgehog": [
230
+ "1"
231
+ ]
232
+ },
233
+ "03a06cc98a": {
234
+ "giraffe": [
235
+ "1",
236
+ "2",
237
+ "3"
238
+ ]
239
+ },
240
+ "03a63e187f": {
241
+ "lizard": [
242
+ "1"
243
+ ]
244
+ },
245
+ "03c95b4dae": {
246
+ "elephant": [
247
+ "1",
248
+ "2",
249
+ "3"
250
+ ]
251
+ },
252
+ "03e2b57b0e": {
253
+ "lizard": [
254
+ "1"
255
+ ]
256
+ },
257
+ "04194e1248": {
258
+ "lizard": [
259
+ "1"
260
+ ]
261
+ },
262
+ "04259896e2": {
263
+ "lizard": [
264
+ "1"
265
+ ]
266
+ },
267
+ "0444918a5f": {
268
+ "truck": [
269
+ "1",
270
+ "2",
271
+ "3",
272
+ "4"
273
+ ]
274
+ },
275
+ "04460a7a52": {
276
+ "lizard": [
277
+ "1"
278
+ ]
279
+ },
280
+ "04474174a4": {
281
+ "ape": [
282
+ "1",
283
+ "2"
284
+ ]
285
+ },
286
+ "0450095513": {
287
+ "snail": [
288
+ "1"
289
+ ]
290
+ },
291
+ "045f00aed2": {
292
+ "person": [
293
+ "3"
294
+ ],
295
+ "tiger": [
296
+ "1"
297
+ ]
298
+ },
299
+ "04667fabaa": {
300
+ "parrot": [
301
+ "1"
302
+ ]
303
+ },
304
+ "04735c5030": {
305
+ "cat": [
306
+ "1",
307
+ "2"
308
+ ]
309
+ },
310
+ "04990d1915": {
311
+ "truck": [
312
+ "3"
313
+ ],
314
+ "bus": [
315
+ "2"
316
+ ],
317
+ "sedan": [
318
+ "1"
319
+ ]
320
+ },
321
+ "04d62d9d98": {
322
+ "person": [
323
+ "1"
324
+ ]
325
+ },
326
+ "04f21da964": {
327
+ "monkey": [
328
+ "1"
329
+ ]
330
+ },
331
+ "04fbad476e": {
332
+ "parrot": [
333
+ "1"
334
+ ]
335
+ },
336
+ "04fe256562": {
337
+ "motorbike": [
338
+ "1"
339
+ ],
340
+ "truck": [
341
+ "2"
342
+ ]
343
+ },
344
+ "0503bf89c9": {
345
+ "hedgehog": [
346
+ "1"
347
+ ]
348
+ },
349
+ "0536c9eed0": {
350
+ "cat": [
351
+ "1"
352
+ ]
353
+ },
354
+ "054acb238f": {
355
+ "owl": [
356
+ "1"
357
+ ]
358
+ },
359
+ "05579ca250": {
360
+ "person": [
361
+ "1"
362
+ ],
363
+ "sedan": [
364
+ "3"
365
+ ]
366
+ },
367
+ "056c200404": {},
368
+ "05774f3a2c": {
369
+ "ape": [
370
+ "1",
371
+ "2",
372
+ "3"
373
+ ]
374
+ },
375
+ "058a7592c8": {
376
+ "train": [
377
+ "1"
378
+ ]
379
+ },
380
+ "05a0a513df": {
381
+ "person": [
382
+ "1",
383
+ "2"
384
+ ]
385
+ },
386
+ "05a569d8aa": {
387
+ "cat": [
388
+ "1"
389
+ ],
390
+ "mouse": [
391
+ "2"
392
+ ]
393
+ },
394
+ "05aa652648": {
395
+ "ape": [
396
+ "1"
397
+ ]
398
+ },
399
+ "05d7715782": {},
400
+ "05e0b0f28f": {
401
+ "mouse": [
402
+ "1"
403
+ ],
404
+ "person": [
405
+ "2"
406
+ ]
407
+ },
408
+ "05fdbbdd7a": {},
409
+ "05ffcfed85": {
410
+ "monkey": [
411
+ "1",
412
+ "2"
413
+ ]
414
+ },
415
+ "0630391881": {
416
+ "person": [
417
+ "1"
418
+ ]
419
+ },
420
+ "06840b2bbe": {
421
+ "snake": [
422
+ "1"
423
+ ]
424
+ },
425
+ "068f7dce6f": {
426
+ "shark": [
427
+ "1"
428
+ ]
429
+ },
430
+ "0693719753": {
431
+ "turtle": [
432
+ "1",
433
+ "2"
434
+ ]
435
+ },
436
+ "06ce2b51fb": {
437
+ "person": [
438
+ "1",
439
+ "2"
440
+ ]
441
+ },
442
+ "06e224798e": {
443
+ "tiger": [
444
+ "1"
445
+ ]
446
+ },
447
+ "06ee361788": {
448
+ "duck": [
449
+ "1",
450
+ "2",
451
+ "3"
452
+ ]
453
+ },
454
+ "06fbb3fa2c": {
455
+ "eagle": [
456
+ "1"
457
+ ]
458
+ },
459
+ "0700264286": {
460
+ "cow": [
461
+ "1",
462
+ "2"
463
+ ]
464
+ },
465
+ "070c918ca7": {
466
+ "parrot": [
467
+ "1"
468
+ ]
469
+ },
470
+ "07129e14a4": {
471
+ "person": [
472
+ "3"
473
+ ],
474
+ "parrot": [
475
+ "1",
476
+ "2"
477
+ ]
478
+ },
479
+ "07177017e9": {
480
+ "motorbike": [
481
+ "1",
482
+ "2"
483
+ ]
484
+ },
485
+ "07238ffc58": {
486
+ "monkey": [
487
+ "1",
488
+ "2",
489
+ "3"
490
+ ]
491
+ },
492
+ "07353b2a89": {
493
+ "sheep": [
494
+ "1",
495
+ "2",
496
+ "3",
497
+ "4"
498
+ ]
499
+ },
500
+ "0738493cbf": {
501
+ "airplane": [
502
+ "1"
503
+ ]
504
+ },
505
+ "075926c651": {
506
+ "person": [
507
+ "1",
508
+ "2"
509
+ ]
510
+ },
511
+ "075c701292": {
512
+ "duck": [
513
+ "1",
514
+ "2",
515
+ "3",
516
+ "4"
517
+ ]
518
+ },
519
+ "0762ea9a30": {
520
+ "person": [
521
+ "1"
522
+ ]
523
+ },
524
+ "07652ee4af": {
525
+ "person": [
526
+ "1"
527
+ ]
528
+ },
529
+ "076f206928": {
530
+ "person": [
531
+ "3"
532
+ ],
533
+ "zebra": [
534
+ "1",
535
+ "2"
536
+ ]
537
+ },
538
+ "077d32af19": {
539
+ "train": [
540
+ "4"
541
+ ],
542
+ "person": [
543
+ "1",
544
+ "2",
545
+ "3"
546
+ ]
547
+ },
548
+ "079049275c": {
549
+ "mouse": [
550
+ "1"
551
+ ]
552
+ },
553
+ "07913cdda7": {
554
+ "train": [
555
+ "1"
556
+ ],
557
+ "person": [
558
+ "2",
559
+ "3"
560
+ ]
561
+ },
562
+ "07a11a35e8": {
563
+ "ape": [
564
+ "1",
565
+ "2"
566
+ ]
567
+ },
568
+ "07ac33b6df": {
569
+ "ape": [
570
+ "1"
571
+ ]
572
+ },
573
+ "07c62c3d11": {
574
+ "parrot": [
575
+ "1",
576
+ "2",
577
+ "3"
578
+ ]
579
+ },
580
+ "07cc1c7d74": {
581
+ "snake": [
582
+ "1"
583
+ ]
584
+ },
585
+ "080196ef01": {
586
+ "lizard": [
587
+ "1"
588
+ ]
589
+ },
590
+ "081207976e": {},
591
+ "081ae4fa44": {
592
+ "shark": [
593
+ "1",
594
+ "2"
595
+ ]
596
+ },
597
+ "081d8250cb": {
598
+ "person": [
599
+ "1"
600
+ ],
601
+ "sedan": [
602
+ "3"
603
+ ]
604
+ },
605
+ "082900c5d4": {
606
+ "duck": [
607
+ "1",
608
+ "2",
609
+ "3"
610
+ ]
611
+ },
612
+ "0860df21e2": {},
613
+ "0866d4c5e3": {
614
+ "bird": [
615
+ "1",
616
+ "2",
617
+ "3"
618
+ ]
619
+ },
620
+ "0891ac2eb6": {
621
+ "person": [
622
+ "1",
623
+ "2",
624
+ "3"
625
+ ]
626
+ },
627
+ "08931bc458": {
628
+ "person": [
629
+ "1"
630
+ ]
631
+ },
632
+ "08aa2705d5": {
633
+ "snake": [
634
+ "1"
635
+ ]
636
+ },
637
+ "08c8450db7": {},
638
+ "08d50b926c": {
639
+ "turtle": [
640
+ "1",
641
+ "2"
642
+ ]
643
+ },
644
+ "08e1e4de15": {
645
+ "monkey": [
646
+ "1",
647
+ "2",
648
+ "3",
649
+ "4"
650
+ ]
651
+ },
652
+ "08e48c1a48": {
653
+ "cow": [
654
+ "1"
655
+ ]
656
+ },
657
+ "08f561c65e": {
658
+ "person": [
659
+ "2"
660
+ ],
661
+ "giant_panda": [
662
+ "1"
663
+ ]
664
+ },
665
+ "08feb87790": {
666
+ "sheep": [
667
+ "1"
668
+ ]
669
+ },
670
+ "09049f6fe3": {
671
+ "mouse": [
672
+ "1",
673
+ "2"
674
+ ]
675
+ },
676
+ "092e4ff450": {
677
+ "snake": [
678
+ "1"
679
+ ]
680
+ },
681
+ "09338adea8": {
682
+ "whale": [
683
+ "1",
684
+ "2"
685
+ ]
686
+ },
687
+ "093c335ccc": {
688
+ "person": [
689
+ "2"
690
+ ]
691
+ },
692
+ "0970d28339": {
693
+ "ape": [
694
+ "1",
695
+ "2"
696
+ ]
697
+ },
698
+ "0974a213dc": {
699
+ "giraffe": [
700
+ "1",
701
+ "2",
702
+ "3"
703
+ ]
704
+ },
705
+ "097b471ed8": {
706
+ "cat": [
707
+ "1",
708
+ "2"
709
+ ]
710
+ },
711
+ "0990941758": {
712
+ "giant_panda": [
713
+ "1"
714
+ ]
715
+ },
716
+ "09a348f4fa": {
717
+ "lizard": [
718
+ "1"
719
+ ]
720
+ },
721
+ "09a6841288": {
722
+ "duck": [
723
+ "1",
724
+ "2"
725
+ ]
726
+ },
727
+ "09c5bad17b": {
728
+ "airplane": [
729
+ "1"
730
+ ]
731
+ },
732
+ "09c9ce80c7": {
733
+ "giant_panda": [
734
+ "1"
735
+ ]
736
+ },
737
+ "09ff54fef4": {
738
+ "fox": [
739
+ "1",
740
+ "2"
741
+ ]
742
+ },
743
+ "0a23765d15": {
744
+ "person": [
745
+ "1",
746
+ "2"
747
+ ]
748
+ },
749
+ "0a275e7f12": {
750
+ "elephant": [
751
+ "1"
752
+ ]
753
+ },
754
+ "0a2f2bd294": {
755
+ "motorbike": [
756
+ "1"
757
+ ]
758
+ },
759
+ "0a7a2514aa": {
760
+ "cat": [
761
+ "1"
762
+ ],
763
+ "lizard": [
764
+ "2"
765
+ ]
766
+ },
767
+ "0a7b27fde9": {
768
+ "parrot": [
769
+ "1",
770
+ "2"
771
+ ]
772
+ },
773
+ "0a8c467cc3": {
774
+ "fish": [
775
+ "1",
776
+ "2",
777
+ "3"
778
+ ]
779
+ },
780
+ "0ac8c560ae": {
781
+ "person": [
782
+ "2",
783
+ "3"
784
+ ]
785
+ },
786
+ "0b1627e896": {
787
+ "boat": [
788
+ "1"
789
+ ]
790
+ },
791
+ "0b285c47f6": {
792
+ "mouse": [
793
+ "1"
794
+ ]
795
+ },
796
+ "0b34ec1d55": {
797
+ "ape": [
798
+ "1"
799
+ ]
800
+ },
801
+ "0b5b5e8e5a": {
802
+ "person": [
803
+ "1"
804
+ ],
805
+ "sedan": [
806
+ "2"
807
+ ]
808
+ },
809
+ "0b68535614": {
810
+ "rabbit": [
811
+ "1"
812
+ ]
813
+ },
814
+ "0b6f9105fc": {
815
+ "rabbit": [
816
+ "1"
817
+ ]
818
+ },
819
+ "0b7dbfa3cb": {
820
+ "cow": [
821
+ "1"
822
+ ]
823
+ },
824
+ "0b9cea51ca": {
825
+ "whale": [
826
+ "1"
827
+ ]
828
+ },
829
+ "0b9d012be8": {
830
+ "camel": [
831
+ "1"
832
+ ]
833
+ },
834
+ "0bcfc4177d": {
835
+ "truck": [
836
+ "1"
837
+ ]
838
+ },
839
+ "0bd37b23c1": {
840
+ "motorbike": [
841
+ "1"
842
+ ]
843
+ },
844
+ "0bd864064c": {
845
+ "eagle": [
846
+ "1"
847
+ ]
848
+ },
849
+ "0c11c6bf7b": {
850
+ "deer": [
851
+ "1"
852
+ ]
853
+ },
854
+ "0c26bc77ac": {
855
+ "crocodile": [
856
+ "1"
857
+ ]
858
+ },
859
+ "0c3a04798c": {
860
+ "fish": [
861
+ "2"
862
+ ],
863
+ "duck": [
864
+ "1"
865
+ ]
866
+ },
867
+ "0c44a9d545": {
868
+ "tiger": [
869
+ "1"
870
+ ]
871
+ },
872
+ "0c817cc390": {
873
+ "hedgehog": [
874
+ "1"
875
+ ],
876
+ "dog": [
877
+ "2"
878
+ ]
879
+ },
880
+ "0ca839ee9a": {
881
+ "ape": [
882
+ "1",
883
+ "2"
884
+ ]
885
+ },
886
+ "0cd7ac0ac0": {
887
+ "rabbit": [
888
+ "1"
889
+ ]
890
+ },
891
+ "0ce06e0121": {
892
+ "parrot": [
893
+ "1",
894
+ "2"
895
+ ]
896
+ },
897
+ "0cfe974a89": {
898
+ "turtle": [
899
+ "1",
900
+ "2"
901
+ ]
902
+ },
903
+ "0d2fcc0dcd": {
904
+ "zebra": [
905
+ "1",
906
+ "2",
907
+ "3",
908
+ "4"
909
+ ]
910
+ },
911
+ "0d3aad05d2": {
912
+ "person": [
913
+ "1"
914
+ ]
915
+ },
916
+ "0d40b015f4": {
917
+ "person": [
918
+ "1"
919
+ ]
920
+ },
921
+ "0d97fba242": {
922
+ "dog": [
923
+ "1"
924
+ ],
925
+ "person": [
926
+ "2"
927
+ ]
928
+ },
929
+ "0d9cc80d7e": {
930
+ "person": [
931
+ "1",
932
+ "2",
933
+ "3"
934
+ ]
935
+ },
936
+ "0dab85b6d3": {
937
+ "lizard": [
938
+ "1",
939
+ "2"
940
+ ]
941
+ },
942
+ "0db5c427a5": {
943
+ "train": [
944
+ "1"
945
+ ]
946
+ },
947
+ "0dbaf284f1": {
948
+ "cat": [
949
+ "1",
950
+ "2"
951
+ ]
952
+ },
953
+ "0de4923598": {},
954
+ "0df28a9101": {
955
+ "turtle": [
956
+ "1",
957
+ "2",
958
+ "3"
959
+ ]
960
+ },
961
+ "0e04f636c4": {
962
+ "frog": [
963
+ "1"
964
+ ]
965
+ },
966
+ "0e05f0e232": {
967
+ "lizard": [
968
+ "1",
969
+ "2"
970
+ ]
971
+ },
972
+ "0e0930474b": {
973
+ "person": [
974
+ "2",
975
+ "3"
976
+ ],
977
+ "sedan": [
978
+ "1"
979
+ ]
980
+ },
981
+ "0e27472bea": {
982
+ "turtle": [
983
+ "1"
984
+ ]
985
+ },
986
+ "0e30020549": {
987
+ "parrot": [
988
+ "1"
989
+ ]
990
+ },
991
+ "0e621feb6c": {
992
+ "lizard": [
993
+ "1",
994
+ "2"
995
+ ]
996
+ },
997
+ "0e803c7d73": {},
998
+ "0e9ebe4e3c": {
999
+ "truck": [
1000
+ "1"
1001
+ ]
1002
+ },
1003
+ "0e9f2785ec": {
1004
+ "person": [
1005
+ "2"
1006
+ ]
1007
+ },
1008
+ "0ea68d418b": {
1009
+ "airplane": [
1010
+ "1"
1011
+ ]
1012
+ },
1013
+ "0eb403a222": {},
1014
+ "0ee92053d6": {
1015
+ "person": [
1016
+ "1"
1017
+ ]
1018
+ },
1019
+ "0eefca067f": {
1020
+ "giant_panda": [
1021
+ "1",
1022
+ "2"
1023
+ ]
1024
+ },
1025
+ "0f17fa6fcb": {
1026
+ "duck": [
1027
+ "1",
1028
+ "2",
1029
+ "3"
1030
+ ]
1031
+ },
1032
+ "0f1ac8e9a3": {
1033
+ "frog": [
1034
+ "1"
1035
+ ]
1036
+ },
1037
+ "0f202e9852": {
1038
+ "parrot": [
1039
+ "1"
1040
+ ]
1041
+ },
1042
+ "0f2ab8b1ff": {
1043
+ "dolphin": [
1044
+ "1",
1045
+ "2",
1046
+ "3"
1047
+ ]
1048
+ },
1049
+ "0f51a78756": {
1050
+ "sheep": [
1051
+ "1"
1052
+ ]
1053
+ },
1054
+ "0f5fbe16b0": {
1055
+ "raccoon": [
1056
+ "1",
1057
+ "2"
1058
+ ]
1059
+ },
1060
+ "0f6072077b": {
1061
+ "person": [
1062
+ "1",
1063
+ "2",
1064
+ "3"
1065
+ ]
1066
+ },
1067
+ "0f6b69b2f4": {
1068
+ "rabbit": [
1069
+ "1"
1070
+ ]
1071
+ },
1072
+ "0f6c2163de": {
1073
+ "snail": [
1074
+ "1"
1075
+ ]
1076
+ },
1077
+ "0f74ec5599": {
1078
+ "giant_panda": [
1079
+ "1"
1080
+ ]
1081
+ },
1082
+ "0f9683715b": {
1083
+ "elephant": [
1084
+ "1"
1085
+ ]
1086
+ },
1087
+ "0fa7b59356": {
1088
+ "duck": [
1089
+ "1"
1090
+ ]
1091
+ },
1092
+ "0fb173695b": {
1093
+ "person": [
1094
+ "3"
1095
+ ]
1096
+ },
1097
+ "0fc958cde2": {
1098
+ "owl": [
1099
+ "1"
1100
+ ]
1101
+ },
1102
+ "0fe7b1a621": {
1103
+ "parrot": [
1104
+ "1"
1105
+ ]
1106
+ },
1107
+ "0ffcdb491c": {
1108
+ "person": [
1109
+ "1",
1110
+ "2",
1111
+ "3"
1112
+ ]
1113
+ },
1114
+ "101caff7d4": {
1115
+ "giant_panda": [
1116
+ "1",
1117
+ "2"
1118
+ ]
1119
+ },
1120
+ "1022fe8417": {
1121
+ "person": [
1122
+ "1",
1123
+ "2",
1124
+ "3"
1125
+ ]
1126
+ },
1127
+ "1032e80b37": {
1128
+ "giraffe": [
1129
+ "1"
1130
+ ]
1131
+ },
1132
+ "103f501680": {
1133
+ "fish": [
1134
+ "1"
1135
+ ]
1136
+ },
1137
+ "104e64565f": {
1138
+ "elephant": [
1139
+ "1"
1140
+ ]
1141
+ },
1142
+ "104f1ab997": {
1143
+ "person": [
1144
+ "1",
1145
+ "2",
1146
+ "3"
1147
+ ]
1148
+ },
1149
+ "106242403f": {
1150
+ "person": [
1151
+ "1",
1152
+ "2"
1153
+ ]
1154
+ },
1155
+ "10b31f5431": {
1156
+ "person": [
1157
+ "1",
1158
+ "3",
1159
+ "4"
1160
+ ]
1161
+ },
1162
+ "10eced835e": {
1163
+ "giant_panda": [
1164
+ "1",
1165
+ "2"
1166
+ ]
1167
+ },
1168
+ "110d26fa3a": {
1169
+ "shark": [
1170
+ "1"
1171
+ ]
1172
+ },
1173
+ "1122c1d16a": {
1174
+ "person": [
1175
+ "6"
1176
+ ],
1177
+ "parrot": [
1178
+ "1",
1179
+ "2",
1180
+ "3",
1181
+ "4",
1182
+ "5"
1183
+ ]
1184
+ },
1185
+ "1145b49a5f": {
1186
+ "rabbit": [
1187
+ "1"
1188
+ ]
1189
+ },
1190
+ "11485838c2": {
1191
+ "giraffe": [
1192
+ "1",
1193
+ "2",
1194
+ "3"
1195
+ ]
1196
+ },
1197
+ "114e7676ec": {
1198
+ "person": [
1199
+ "1"
1200
+ ]
1201
+ },
1202
+ "1157472b95": {
1203
+ "parrot": [
1204
+ "1",
1205
+ "2"
1206
+ ]
1207
+ },
1208
+ "115ee1072c": {
1209
+ "cow": [
1210
+ "1"
1211
+ ]
1212
+ },
1213
+ "1171141012": {
1214
+ "turtle": [
1215
+ "1"
1216
+ ],
1217
+ "person": [
1218
+ "2"
1219
+ ]
1220
+ },
1221
+ "117757b4b8": {
1222
+ "snail": [
1223
+ "1"
1224
+ ]
1225
+ },
1226
+ "1178932d2f": {
1227
+ "motorbike": [
1228
+ "3"
1229
+ ],
1230
+ "person": [
1231
+ "1",
1232
+ "2"
1233
+ ]
1234
+ },
1235
+ "117cc76bda": {
1236
+ "whale": [
1237
+ "1"
1238
+ ]
1239
+ },
1240
+ "1180cbf814": {
1241
+ "fish": [
1242
+ "1",
1243
+ "2"
1244
+ ]
1245
+ },
1246
+ "1187bbd0e3": {
1247
+ "cat": [
1248
+ "1"
1249
+ ]
1250
+ },
1251
+ "1197e44b26": {
1252
+ "giant_panda": [
1253
+ "1"
1254
+ ]
1255
+ },
1256
+ "119cf20728": {
1257
+ "lizard": [
1258
+ "1"
1259
+ ]
1260
+ },
1261
+ "119dd54871": {
1262
+ "lion": [
1263
+ "1",
1264
+ "2"
1265
+ ]
1266
+ },
1267
+ "11a0c3b724": {
1268
+ "mouse": [
1269
+ "1",
1270
+ "2"
1271
+ ]
1272
+ },
1273
+ "11a6ba8c94": {
1274
+ "person": [
1275
+ "1",
1276
+ "2"
1277
+ ]
1278
+ },
1279
+ "11c722a456": {
1280
+ "turtle": [
1281
+ "1",
1282
+ "2"
1283
+ ]
1284
+ },
1285
+ "11cbcb0b4d": {
1286
+ "zebra": [
1287
+ "1"
1288
+ ]
1289
+ },
1290
+ "11ccf5e99d": {
1291
+ "person": [
1292
+ "2"
1293
+ ]
1294
+ },
1295
+ "11ce6f452e": {
1296
+ "person": [
1297
+ "1",
1298
+ "2",
1299
+ "3"
1300
+ ]
1301
+ },
1302
+ "11feabe596": {
1303
+ "rabbit": [
1304
+ "1"
1305
+ ]
1306
+ },
1307
+ "120cb9514d": {
1308
+ "person": [
1309
+ "1",
1310
+ "2",
1311
+ "3"
1312
+ ]
1313
+ },
1314
+ "12156b25b3": {
1315
+ "person": [
1316
+ "1"
1317
+ ]
1318
+ },
1319
+ "122896672d": {
1320
+ "person": [
1321
+ "1",
1322
+ "3"
1323
+ ]
1324
+ },
1325
+ "1233ac8596": {
1326
+ "dog": [
1327
+ "1"
1328
+ ]
1329
+ },
1330
+ "1239c87234": {
1331
+ "lizard": [
1332
+ "1"
1333
+ ]
1334
+ },
1335
+ "1250423f7c": {
1336
+ "elephant": [
1337
+ "3",
1338
+ "4"
1339
+ ],
1340
+ "person": [
1341
+ "2"
1342
+ ]
1343
+ },
1344
+ "1257a1bc67": {
1345
+ "snake": [
1346
+ "1"
1347
+ ]
1348
+ },
1349
+ "125d1b19dd": {
1350
+ "giant_panda": [
1351
+ "1",
1352
+ "2"
1353
+ ]
1354
+ },
1355
+ "126d203967": {
1356
+ "person": [
1357
+ "2"
1358
+ ]
1359
+ },
1360
+ "1295e19071": {
1361
+ "airplane": [
1362
+ "1"
1363
+ ]
1364
+ },
1365
+ "12ad198c54": {
1366
+ "person": [
1367
+ "1"
1368
+ ]
1369
+ },
1370
+ "12bddb2bcb": {
1371
+ "person": [
1372
+ "2"
1373
+ ]
1374
+ },
1375
+ "12ec9b93ee": {
1376
+ "giant_panda": [
1377
+ "1"
1378
+ ]
1379
+ },
1380
+ "12eebedc35": {
1381
+ "bird": [
1382
+ "1"
1383
+ ]
1384
+ },
1385
+ "132852e094": {
1386
+ "fox": [
1387
+ "1"
1388
+ ]
1389
+ },
1390
+ "1329409f2a": {
1391
+ "fish": [
1392
+ "1"
1393
+ ]
1394
+ },
1395
+ "13325cfa14": {
1396
+ "person": [
1397
+ "2"
1398
+ ]
1399
+ },
1400
+ "1336440745": {
1401
+ "mouse": [
1402
+ "1",
1403
+ "2"
1404
+ ]
1405
+ },
1406
+ "134d06dbf9": {
1407
+ "cat": [
1408
+ "1"
1409
+ ]
1410
+ },
1411
+ "135625b53d": {
1412
+ "parrot": [
1413
+ "1"
1414
+ ]
1415
+ },
1416
+ "13870016f9": {
1417
+ "cow": [
1418
+ "2",
1419
+ "3"
1420
+ ],
1421
+ "person": [
1422
+ "1"
1423
+ ]
1424
+ },
1425
+ "13960b3c84": {
1426
+ "giraffe": [
1427
+ "1",
1428
+ "2",
1429
+ "3"
1430
+ ]
1431
+ },
1432
+ "13adaad9d9": {
1433
+ "giant_panda": [
1434
+ "1"
1435
+ ]
1436
+ },
1437
+ "13ae097e20": {
1438
+ "giant_panda": [
1439
+ "1"
1440
+ ]
1441
+ },
1442
+ "13e3070469": {
1443
+ "zebra": [
1444
+ "1",
1445
+ "2",
1446
+ "3"
1447
+ ]
1448
+ },
1449
+ "13f6a8c20d": {
1450
+ "fish": [
1451
+ "1"
1452
+ ]
1453
+ },
1454
+ "1416925cf2": {
1455
+ "truck": [
1456
+ "1",
1457
+ "2"
1458
+ ]
1459
+ },
1460
+ "142d2621f5": {
1461
+ "motorbike": [
1462
+ "3"
1463
+ ],
1464
+ "person": [
1465
+ "1",
1466
+ "2"
1467
+ ]
1468
+ },
1469
+ "145d5d7c03": {
1470
+ "giant_panda": [
1471
+ "1"
1472
+ ]
1473
+ },
1474
+ "145fdc3ac5": {
1475
+ "lizard": [
1476
+ "1"
1477
+ ]
1478
+ },
1479
+ "1471274fa7": {
1480
+ "person": [
1481
+ "1"
1482
+ ]
1483
+ },
1484
+ "14a6b5a139": {
1485
+ "fish": [
1486
+ "1"
1487
+ ]
1488
+ },
1489
+ "14c21cea0d": {
1490
+ "monkey": [
1491
+ "1",
1492
+ "2"
1493
+ ]
1494
+ },
1495
+ "14dae0dc93": {
1496
+ "person": [
1497
+ "2"
1498
+ ]
1499
+ },
1500
+ "14f9bd22b5": {
1501
+ "tiger": [
1502
+ "1"
1503
+ ]
1504
+ },
1505
+ "14fd28ae99": {
1506
+ "parrot": [
1507
+ "1"
1508
+ ]
1509
+ },
1510
+ "15097d5d4e": {
1511
+ "parrot": [
1512
+ "1"
1513
+ ]
1514
+ },
1515
+ "150ea711f2": {
1516
+ "whale": [
1517
+ "1"
1518
+ ]
1519
+ },
1520
+ "1514e3563f": {
1521
+ "earless_seal": [
1522
+ "1",
1523
+ "2"
1524
+ ]
1525
+ },
1526
+ "152aaa3a9e": {
1527
+ "raccoon": [
1528
+ "1"
1529
+ ]
1530
+ },
1531
+ "152b7d3bd7": {
1532
+ "giant_panda": [
1533
+ "1"
1534
+ ]
1535
+ },
1536
+ "15617297cc": {
1537
+ "person": [
1538
+ "1"
1539
+ ]
1540
+ },
1541
+ "15abbe0c52": {
1542
+ "person": [
1543
+ "1"
1544
+ ]
1545
+ },
1546
+ "15d1fb3de5": {
1547
+ "owl": [
1548
+ "1"
1549
+ ],
1550
+ "cat": [
1551
+ "2"
1552
+ ]
1553
+ },
1554
+ "15f67b0fab": {
1555
+ "person": [
1556
+ "1"
1557
+ ]
1558
+ },
1559
+ "161eb59aad": {
1560
+ "cow": [
1561
+ "2",
1562
+ "3"
1563
+ ],
1564
+ "giraffe": [
1565
+ "1"
1566
+ ]
1567
+ },
1568
+ "16288ea47f": {
1569
+ "duck": [
1570
+ "1",
1571
+ "2"
1572
+ ]
1573
+ },
1574
+ "164410ce62": {
1575
+ "person": [
1576
+ "1"
1577
+ ]
1578
+ },
1579
+ "165c3c8cd4": {
1580
+ "person": [
1581
+ "1",
1582
+ "2",
1583
+ "3"
1584
+ ]
1585
+ },
1586
+ "165c42b41b": {
1587
+ "motorbike": [
1588
+ "2",
1589
+ "3"
1590
+ ],
1591
+ "person": [
1592
+ "1",
1593
+ "4"
1594
+ ]
1595
+ },
1596
+ "165ec9e22b": {
1597
+ "person": [
1598
+ "1",
1599
+ "2"
1600
+ ]
1601
+ },
1602
+ "1669502269": {
1603
+ "person": [
1604
+ "1"
1605
+ ]
1606
+ },
1607
+ "16763cccbb": {
1608
+ "ape": [
1609
+ "1"
1610
+ ]
1611
+ },
1612
+ "16adde065e": {
1613
+ "cat": [
1614
+ "2"
1615
+ ],
1616
+ "person": [
1617
+ "3"
1618
+ ]
1619
+ },
1620
+ "16af445362": {
1621
+ "airplane": [
1622
+ "1"
1623
+ ]
1624
+ },
1625
+ "16afd538ad": {
1626
+ "parrot": [
1627
+ "1",
1628
+ "2"
1629
+ ]
1630
+ },
1631
+ "16c3fa4d5d": {
1632
+ "sedan": [
1633
+ "1"
1634
+ ]
1635
+ },
1636
+ "16d1d65c27": {
1637
+ "monkey": [
1638
+ "1"
1639
+ ]
1640
+ },
1641
+ "16e8599e94": {
1642
+ "giant_panda": [
1643
+ "1"
1644
+ ]
1645
+ },
1646
+ "16fe9fb444": {
1647
+ "motorbike": [
1648
+ "1"
1649
+ ],
1650
+ "person": [
1651
+ "2"
1652
+ ]
1653
+ },
1654
+ "1705796b02": {
1655
+ "train": [
1656
+ "1"
1657
+ ]
1658
+ },
1659
+ "1724db7671": {
1660
+ "giant_panda": [
1661
+ "1"
1662
+ ]
1663
+ },
1664
+ "17418e81ea": {
1665
+ "shark": [
1666
+ "1"
1667
+ ]
1668
+ },
1669
+ "175169edbb": {
1670
+ "ape": [
1671
+ "1",
1672
+ "2"
1673
+ ]
1674
+ },
1675
+ "17622326fd": {
1676
+ "lizard": [
1677
+ "1"
1678
+ ]
1679
+ },
1680
+ "17656bae77": {
1681
+ "elephant": [
1682
+ "1"
1683
+ ]
1684
+ },
1685
+ "17b0d94172": {
1686
+ "airplane": [
1687
+ "1"
1688
+ ]
1689
+ },
1690
+ "17c220e4f6": {
1691
+ "giant_panda": [
1692
+ "1"
1693
+ ]
1694
+ },
1695
+ "17c7bcd146": {
1696
+ "train": [
1697
+ "1"
1698
+ ]
1699
+ },
1700
+ "17cb4afe89": {
1701
+ "tiger": [
1702
+ "1"
1703
+ ]
1704
+ },
1705
+ "17cd79a434": {
1706
+ "squirrel": [
1707
+ "1"
1708
+ ]
1709
+ },
1710
+ "17d18604c3": {
1711
+ "person": [
1712
+ "1",
1713
+ "2"
1714
+ ]
1715
+ },
1716
+ "17d8ca1a37": {
1717
+ "owl": [
1718
+ "1"
1719
+ ],
1720
+ "person": [
1721
+ "2"
1722
+ ]
1723
+ },
1724
+ "17e33f4330": {
1725
+ "monkey": [
1726
+ "1"
1727
+ ]
1728
+ },
1729
+ "17f7a6d805": {
1730
+ "snail": [
1731
+ "1"
1732
+ ]
1733
+ },
1734
+ "180abc8378": {
1735
+ "owl": [
1736
+ "1"
1737
+ ],
1738
+ "person": [
1739
+ "2"
1740
+ ]
1741
+ },
1742
+ "183ba3d652": {
1743
+ "person": [
1744
+ "2"
1745
+ ],
1746
+ "motorbike": [
1747
+ "3"
1748
+ ]
1749
+ },
1750
+ "185bf64702": {
1751
+ "zebra": [
1752
+ "1",
1753
+ "2"
1754
+ ]
1755
+ },
1756
+ "18913cc690": {
1757
+ "train": [
1758
+ "1"
1759
+ ]
1760
+ },
1761
+ "1892651815": {
1762
+ "camel": [
1763
+ "1"
1764
+ ]
1765
+ },
1766
+ "189ac8208a": {
1767
+ "giraffe": [
1768
+ "1",
1769
+ "2"
1770
+ ]
1771
+ },
1772
+ "189b44e92c": {
1773
+ "zebra": [
1774
+ "1"
1775
+ ]
1776
+ },
1777
+ "18ac264b76": {
1778
+ "person": [
1779
+ "2"
1780
+ ]
1781
+ },
1782
+ "18b245ab49": {
1783
+ "penguin": [
1784
+ "1",
1785
+ "2",
1786
+ "3",
1787
+ "4"
1788
+ ]
1789
+ },
1790
+ "18b5cebc34": {
1791
+ "mouse": [
1792
+ "1"
1793
+ ]
1794
+ },
1795
+ "18bad52083": {
1796
+ "parrot": [
1797
+ "1",
1798
+ "2"
1799
+ ]
1800
+ },
1801
+ "18bb5144d5": {
1802
+ "lizard": [
1803
+ "1"
1804
+ ]
1805
+ },
1806
+ "18c6f205c5": {
1807
+ "person": [
1808
+ "1",
1809
+ "2",
1810
+ "3"
1811
+ ]
1812
+ },
1813
+ "1903f9ea15": {
1814
+ "bird": [
1815
+ "1",
1816
+ "2",
1817
+ "3"
1818
+ ]
1819
+ },
1820
+ "1917b209f2": {
1821
+ "cow": [
1822
+ "3",
1823
+ "4"
1824
+ ],
1825
+ "person": [
1826
+ "1"
1827
+ ],
1828
+ "horse": [
1829
+ "2"
1830
+ ]
1831
+ },
1832
+ "191e74c01d": {
1833
+ "deer": [
1834
+ "1"
1835
+ ]
1836
+ },
1837
+ "19367bb94e": {
1838
+ "fish": [
1839
+ "1",
1840
+ "2",
1841
+ "3"
1842
+ ]
1843
+ },
1844
+ "193ffaa217": {
1845
+ "person": [
1846
+ "1",
1847
+ "2",
1848
+ "3"
1849
+ ]
1850
+ },
1851
+ "19696b67d3": {
1852
+ "cow": [
1853
+ "1"
1854
+ ]
1855
+ },
1856
+ "197f3ab6f3": {
1857
+ "giant_panda": [
1858
+ "1"
1859
+ ]
1860
+ },
1861
+ "1981e763cc": {
1862
+ "sheep": [
1863
+ "1",
1864
+ "2"
1865
+ ]
1866
+ },
1867
+ "198afe39ae": {
1868
+ "person": [
1869
+ "1"
1870
+ ]
1871
+ },
1872
+ "19a6e62b9b": {
1873
+ "monkey": [
1874
+ "1",
1875
+ "2"
1876
+ ]
1877
+ },
1878
+ "19b60d5335": {
1879
+ "hedgehog": [
1880
+ "1"
1881
+ ]
1882
+ },
1883
+ "19c00c11f9": {
1884
+ "person": [
1885
+ "1"
1886
+ ]
1887
+ },
1888
+ "19e061eb88": {
1889
+ "boat": [
1890
+ "1",
1891
+ "2"
1892
+ ]
1893
+ },
1894
+ "19e8bc6178": {
1895
+ "dog": [
1896
+ "1"
1897
+ ]
1898
+ },
1899
+ "19ee80dac6": {
1900
+ "person": [
1901
+ "1",
1902
+ "3",
1903
+ "4"
1904
+ ]
1905
+ },
1906
+ "1a25a9170a": {
1907
+ "cow": [
1908
+ "1"
1909
+ ],
1910
+ "person": [
1911
+ "2",
1912
+ "3"
1913
+ ]
1914
+ },
1915
+ "1a359a6c1a": {
1916
+ "sheep": [
1917
+ "1"
1918
+ ]
1919
+ },
1920
+ "1a3e87c566": {
1921
+ "frog": [
1922
+ "1"
1923
+ ]
1924
+ },
1925
+ "1a5fe06b00": {
1926
+ "bus": [
1927
+ "1"
1928
+ ]
1929
+ },
1930
+ "1a6c0fbd1e": {
1931
+ "person": [
1932
+ "1"
1933
+ ]
1934
+ },
1935
+ "1a6f3b5a4b": {
1936
+ "sedan": [
1937
+ "3"
1938
+ ]
1939
+ },
1940
+ "1a8afbad92": {
1941
+ "zebra": [
1942
+ "1",
1943
+ "2",
1944
+ "3"
1945
+ ]
1946
+ },
1947
+ "1a8bdc5842": {
1948
+ "parrot": [
1949
+ "1",
1950
+ "2"
1951
+ ]
1952
+ },
1953
+ "1a95752aca": {
1954
+ "duck": [
1955
+ "1",
1956
+ "2"
1957
+ ]
1958
+ },
1959
+ "1a9c131cb7": {
1960
+ "ape": [
1961
+ "1",
1962
+ "2",
1963
+ "3"
1964
+ ]
1965
+ },
1966
+ "1aa3da3ee3": {
1967
+ "sheep": [
1968
+ "1",
1969
+ "2",
1970
+ "3",
1971
+ "4"
1972
+ ]
1973
+ },
1974
+ "1ab27ec7ea": {
1975
+ "deer": [
1976
+ "1"
1977
+ ]
1978
+ },
1979
+ "1abf16d21d": {
1980
+ "turtle": [
1981
+ "1"
1982
+ ]
1983
+ },
1984
+ "1acd0f993b": {
1985
+ "dog": [
1986
+ "1"
1987
+ ],
1988
+ "person": [
1989
+ "3"
1990
+ ]
1991
+ },
1992
+ "1ad202e499": {
1993
+ "lizard": [
1994
+ "1",
1995
+ "2"
1996
+ ]
1997
+ },
1998
+ "1af8d2395d": {
1999
+ "person": [
2000
+ "1",
2001
+ "2"
2002
+ ],
2003
+ "airplane": [
2004
+ "4"
2005
+ ]
2006
+ },
2007
+ "1afd39a1fa": {
2008
+ "motorbike": [
2009
+ "2"
2010
+ ]
2011
+ },
2012
+ "1b2d31306f": {
2013
+ "lizard": [
2014
+ "1"
2015
+ ]
2016
+ },
2017
+ "1b3fa67f0e": {
2018
+ "airplane": [
2019
+ "1"
2020
+ ]
2021
+ },
2022
+ "1b43fa74b4": {
2023
+ "owl": [
2024
+ "1",
2025
+ "2"
2026
+ ]
2027
+ },
2028
+ "1b73ea9fc2": {
2029
+ "parrot": [
2030
+ "1"
2031
+ ]
2032
+ },
2033
+ "1b7e8bb255": {
2034
+ "person": [
2035
+ "2"
2036
+ ]
2037
+ },
2038
+ "1b8680f8cd": {
2039
+ "person": [
2040
+ "2",
2041
+ "3"
2042
+ ]
2043
+ },
2044
+ "1b883843c0": {
2045
+ "person": [
2046
+ "1",
2047
+ "2"
2048
+ ]
2049
+ },
2050
+ "1b8898785b": {
2051
+ "monkey": [
2052
+ "1",
2053
+ "2"
2054
+ ]
2055
+ },
2056
+ "1b88ba1aa4": {
2057
+ "giant_panda": [
2058
+ "1"
2059
+ ]
2060
+ },
2061
+ "1b96a498e5": {
2062
+ "ape": [
2063
+ "1"
2064
+ ]
2065
+ },
2066
+ "1bbc4c274f": {
2067
+ "fish": [
2068
+ "2"
2069
+ ]
2070
+ },
2071
+ "1bd87fe9ab": {
2072
+ "train": [
2073
+ "1"
2074
+ ]
2075
+ },
2076
+ "1c4090c75b": {
2077
+ "whale": [
2078
+ "1"
2079
+ ]
2080
+ },
2081
+ "1c41934f84": {
2082
+ "elephant": [
2083
+ "1",
2084
+ "2"
2085
+ ]
2086
+ },
2087
+ "1c72b04b56": {
2088
+ "lion": [
2089
+ "1"
2090
+ ]
2091
+ },
2092
+ "1c87955a3a": {
2093
+ "crocodile": [
2094
+ "1"
2095
+ ],
2096
+ "turtle": [
2097
+ "2"
2098
+ ]
2099
+ },
2100
+ "1c9f9eb792": {
2101
+ "person": [
2102
+ "2"
2103
+ ]
2104
+ },
2105
+ "1ca240fede": {
2106
+ "train": [
2107
+ "1"
2108
+ ]
2109
+ },
2110
+ "1ca5673803": {
2111
+ "person": [
2112
+ "1",
2113
+ "3"
2114
+ ]
2115
+ },
2116
+ "1cada35274": {
2117
+ "duck": [
2118
+ "1"
2119
+ ]
2120
+ },
2121
+ "1cb44b920d": {
2122
+ "eagle": [
2123
+ "1",
2124
+ "2"
2125
+ ]
2126
+ },
2127
+ "1cd10e62be": {
2128
+ "leopard": [
2129
+ "1"
2130
+ ]
2131
+ },
2132
+ "1d3087d5e5": {
2133
+ "fish": [
2134
+ "1",
2135
+ "2",
2136
+ "3",
2137
+ "4",
2138
+ "5"
2139
+ ]
2140
+ },
2141
+ "1d3685150a": {
2142
+ "person": [
2143
+ "1",
2144
+ "3"
2145
+ ]
2146
+ },
2147
+ "1d6ff083aa": {
2148
+ "person": [
2149
+ "1",
2150
+ "2"
2151
+ ]
2152
+ }
2153
+ }
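The added JSON above maps each video ID to its annotated object categories and, per category, the object IDs that are kept as valid (stored as strings). As a minimal sketch of how such a mapping could be consumed, the snippet below loads it and flattens it into (video_id, category, object_id) triples; the file path and variable names are assumptions for illustration, not part of the repository.

```python
import json

# Assumed path: adjust to wherever the valid-object-id JSON is stored.
with open("valid_obj_ids.json") as f:
    # Schema inferred from the diff above:
    # video_id -> {category_name -> [object_id, ...] as strings}
    valid_obj_ids = json.load(f)

# Flatten into (video_id, category, object_id) triples for easier iteration.
triples = [
    (video_id, category, obj_id)
    for video_id, categories in valid_obj_ids.items()
    for category, obj_ids in categories.items()
    for obj_id in obj_ids
]

print(len(triples))
print(triples[:3])  # each triple looks like ('1178932d2f', 'motorbike', '3')
```

Note that the object IDs are strings in this file, so comparing them against integer annotation IDs would require an explicit int(...) cast.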