dianecy committed (verified)
Commit 2c58401 · Parent(s): 3ec4928

Add files using upload-large-folder tool
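For context, a commit like this one is usually produced with the huggingface_hub large-folder uploader, which splits a big local directory into resumable, chunked commits. A minimal sketch, assuming huggingface_hub >= 0.25 and a hypothetical repo id and local path:

    from huggingface_hub import HfApi

    api = HfApi()  # token is read from HF_TOKEN or the local login cache
    # upload_large_folder shards the folder into commits and resumes if interrupted
    api.upload_large_folder(
        repo_id="dianecy/example-repo",  # hypothetical repo id
        repo_type="model",
        folder_path="./local_folder",    # hypothetical local path
    )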

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +7 -0
  2. .history/datasets/__init___20241227174300.py +37 -0
  3. .history/datasets/ytvos_ref_20250113130043.py +0 -0
  4. .history/datasets/ytvos_ref_20250116073805.py +239 -0
  5. .history/mbench/gpt_ref-ytvos-cy_20250121155719.py +428 -0
  6. .history/mbench/gpt_ref-ytvos_20250119070039.py +277 -0
  7. .history/mbench/gpt_ref-ytvos_20250119070740.py +285 -0
  8. .history/mbench/gpt_ref-ytvos_20250119071412.py +292 -0
  9. .history/mbench/gpt_ref-ytvos_20250119072601.py +292 -0
  10. .history/mbench/gpt_ref-ytvos_20250119073047.py +292 -0
  11. .history/mbench/gpt_ref-ytvos_numbered_cy_20250131124149.py +427 -0
  12. .history/mbench/gpt_ref-ytvos_numbered_cy_20250201141952.py +460 -0
  13. .history/mbench/gpt_ref-ytvos_numbered_cy_20250202183102.py +460 -0
  14. .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207172804.py +656 -0
  15. .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207173210.py +656 -0
  16. .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207173355.py +677 -0
  17. .history/mbench/make_ref-ytvos_json_20250117032501.py +104 -0
  18. .history/mbench/make_ref-ytvos_json_20250117072314.py +107 -0
  19. .history/mbench_a2d/gpt_a2d_numbered_20250206114207.py +205 -0
  20. __pycache__/opts.cpython-310.pyc +0 -0
  21. __pycache__/opts.cpython-39.pyc +0 -0
  22. __pycache__/refer.cpython-39.pyc +0 -0
  23. davis2017/davis.py +122 -0
  24. docs/davis_demo1.gif +3 -0
  25. docs/davis_demo2.gif +3 -0
  26. docs/install.md +42 -0
  27. docs/network.png +3 -0
  28. docs/ytvos_demo1.gif +3 -0
  29. docs/ytvos_demo2.gif +3 -0
  30. hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/e14a3254bf04f32056759bdc60c64736e7638f31b43957586ff2442ff393890a.lock +0 -0
  31. hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/pytorch_model-00002-of-00003.bin +3 -0
  32. make_ref-ytvos/manual_selection.ipynb +381 -0
  33. make_refcoco/refcocog_google/multi_object_data_gref_google.json +0 -0
  34. make_refcoco/refcocog_google/needrevision_refid_part4.json +506 -0
  35. make_refcoco/refcocog_umd/needrevision_refid_part4.json +498 -0
  36. mbench/__pycache__/__init__.cpython-310.pyc +0 -0
  37. mbench/__pycache__/ytvos_ref.cpython-310.pyc +0 -0
  38. mbench/check_image_numbered_cy.ipynb +0 -0
  39. mbench/check_image_numbered_cy_score.py +212 -0
  40. mbench/gpt_ref-ytvos-cy.ipynb +0 -0
  41. mbench/gpt_ref-ytvos-revised.ipynb +0 -0
  42. mbench/gpt_ref-ytvos_numbered.ipynb +3 -0
  43. mbench/gpt_ref-ytvos_numbered_cy.ipynb +0 -0
  44. mbench/numbered_captions.json +0 -0
  45. mbench/numbered_captions_gpt-4o.json +0 -0
  46. mbench/numbered_captions_gpt-4o_nomask_randcap2.json +0 -0
  47. mbench/numbered_valid_obj_ids_gpt-4o_final.json +0 -0
  48. mbench/numbered_valid_obj_ids_gpt-4o_nomask_randcap2.json +2153 -0
  49. mbench/sampled_frame.json +3 -0
  50. mbench/sampled_frame2.json +0 -0
.gitattributes CHANGED
@@ -47,3 +47,10 @@ LAVT-RIS/refer/data/refcocog/refs(google).p filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/data/refcocog/refs(umd).p filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/evaluation/tokenizer/stanford-corenlp-3.4.1.jar filter=lfs diff=lfs merge=lfs -text
 hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b filter=lfs diff=lfs merge=lfs -text
+docs/davis_demo2.gif filter=lfs diff=lfs merge=lfs -text
+mbench/gpt_ref-ytvos_numbered.ipynb filter=lfs diff=lfs merge=lfs -text
+docs/ytvos_demo2.gif filter=lfs diff=lfs merge=lfs -text
+mbench/sampled_frame.json filter=lfs diff=lfs merge=lfs -text
+docs/network.png filter=lfs diff=lfs merge=lfs -text
+docs/ytvos_demo1.gif filter=lfs diff=lfs merge=lfs -text
+docs/davis_demo1.gif filter=lfs diff=lfs merge=lfs -text
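Each line added above routes one more path through Git LFS, so the binary payload (GIFs, a PNG, a notebook, a large JSON) is stored as an LFS pointer instead of being committed into the git history directly. Entries of this form are what a command like git lfs track "docs/*.gif" appends to .gitattributes; here they appear to have been written per-file by the upload tool.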
.history/datasets/__init___20241227174300.py ADDED
@@ -0,0 +1,37 @@
+import torch.utils.data
+import torchvision
+
+from .ytvos import build as build_ytvos
+from .davis import build as build_davis
+from .a2d import build as build_a2d
+from .jhmdb import build as build_jhmdb
+from .refexp import build as build_refexp
+from .concat_dataset import build as build_joint
+
+
+def get_coco_api_from_dataset(dataset):
+    for _ in range(10):
+        # if isinstance(dataset, torchvision.datasets.CocoDetection):
+        #     break
+        if isinstance(dataset, torch.utils.data.Subset):
+            dataset = dataset.dataset
+    if isinstance(dataset, torchvision.datasets.CocoDetection):
+        return dataset.coco
+
+
+def build_dataset(dataset_file: str, image_set: str, args):
+    if dataset_file == 'ytvos':
+        return build_ytvos(image_set, args)
+    if dataset_file == 'davis':
+        return build_davis(image_set, args)
+    if dataset_file == 'a2d':
+        return build_a2d(image_set, args)
+    if dataset_file == 'jhmdb':
+        return build_jhmdb(image_set, args)
+    # for pretraining
+    if dataset_file == "refcoco" or dataset_file == "refcoco+" or dataset_file == "refcocog":
+        return build_refexp(dataset_file, image_set, args)
+    # for joint training of refcoco and ytvos
+    if dataset_file == 'joint':
+        return build_joint(image_set, args)
+    raise ValueError(f'dataset {dataset_file} not supported')
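build_dataset is a plain dispatch on dataset_file, so downstream training code can stay dataset-agnostic. A minimal usage sketch, assuming an args namespace; the exact fields (paths, num_frames, max_skip, masks) are defined in opts.py and are only illustrative here:

    from argparse import Namespace
    from datasets import build_dataset

    # hypothetical values; the real defaults live in opts.py
    args = Namespace(ytvos_path='data/ref-youtube-vos', masks=True,
                     num_frames=5, max_skip=3)
    train_ds = build_dataset('ytvos', image_set='train', args=args)
    print(len(train_ds))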
.history/datasets/ytvos_ref_20250113130043.py ADDED
File without changes
.history/datasets/ytvos_ref_20250116073805.py ADDED
@@ -0,0 +1,239 @@
+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+
+import torch
+from torch.autograd.grad_mode import F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+
+from datasets.categories import ytvos_category_dict as category_dict
+
+
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset, first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the
+    first subset is no longer publicly available; only the harder 'full-video' subset can now be downloaded
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks  # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.prepare_metas()
+
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+
+    def prepare_metas(self):
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+
+        self.metas = []
+        skip_vid_count = 0
+
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            vid_len = len(vid_frames)
+
+            if vid_len < 11:
+                # print(f"Too short video: {vid} with frame length {vid_len}")
+                skip_vid_count += 1
+                continue
+
+            # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+            start_idx, end_idx = 2, vid_len - 2
+            bin_size = (end_idx - start_idx) // 4
+
+            bins = []
+            for i in range(4):
+                bin_start = start_idx + i * bin_size
+                bin_end = bin_start + bin_size if i < 3 else end_idx
+                bins.append((bin_start, bin_end))
+
+            # Randomly sample one frame from each bin
+            sample_indx = []
+            for start_idx, end_idx in bins:
+                sample_indx.append(random.randint(start_idx, end_idx - 1))
+            sample_indx.sort()  # ensure indices are in order
+
+            meta = {
+                'video': vid,
+                'sample_indx': sample_indx,
+                'bins': bins,
+                'frames': vid_frames
+            }
+            obj_id_cat = {}
+            for exp_id, exp_dict in vid_data['expressions'].items():
+                obj_id = exp_dict['obj_id']
+                if obj_id not in obj_id_cat:
+                    obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
+            meta['obj_id_cat'] = obj_id_cat
+            self.metas.append(meta)
+
+        print(f"skipped {skip_vid_count} short videos")
+
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2
+
+    def __len__(self):
+        return len(self.metas)
+
+    def __getitem__(self, idx):
+        meta = self.metas[idx]  # dict
+
+        video, sample_indx, bins, frames, obj_id_cat = \
+            meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
+
+        # read frames and masks
+        imgs, labels, boxes, masks, valid = [], [], [], [], []
+        for frame_indx in sample_indx:
+            frame_name = frames[frame_indx]
+            img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+            mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+            img = Image.open(img_path).convert('RGB')
+            imgs.append(img)
+
+            mask = Image.open(mask_path).convert('P')
+            mask = np.array(mask)
+
+            # create the target
+            for obj_id in list(obj_id_cat.keys()):
+                obj_mask = (mask == int(obj_id)).astype(np.float32)  # 0, 1 binary
+                if (obj_mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                else:  # some frames don't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                obj_mask = torch.from_numpy(obj_mask)
+
+                # append
+                masks.append(obj_mask)
+                boxes.append(box)
+
+        # transform
+        w, h = img.size
+        boxes = torch.stack(boxes, dim=0)
+        boxes[:, 0::2].clamp_(min=0, max=w)
+        boxes[:, 1::2].clamp_(min=0, max=h)
+        masks = torch.stack(masks, dim=0)
+        target = {
+            'frames_idx': sample_indx,  # [T,]
+            'boxes': boxes,  # [T, 4], xyxy
+            'masks': masks,  # [T, H, W]
+            'valid': torch.tensor(valid),  # [T,]
+            'obj_ids': list(obj_id_cat.keys()),
+            'orig_size': torch.as_tensor([int(h), int(w)]),
+            'size': torch.as_tensor([int(h), int(w)])
+        }
+
+        # "boxes" are normalized to [0, 1] and converted from xyxy to cxcywh in self._transforms
+        if self._transforms:
+            imgs, target = self._transforms(imgs, target)
+            imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
+        else:
+            imgs = np.array(imgs)
+            imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+
+        # # FIXME: handle "valid", since some box may be removed due to random crop
+        # if torch.any(target['valid'] == 1):  # at least one instance
+        #     instance_check = True
+        # else:
+        #     idx = random.randint(0, self.__len__() - 1)
+
+        return imgs, target
+
+
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+
+    raise ValueError(f'unknown {image_set}')
+
+
+def build(image_set, args):
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    PATHS = {
+        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not actually used
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+    #                        num_frames=args.num_frames, max_skip=args.max_skip)
+    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                           num_frames=args.num_frames, max_skip=args.max_skip)
+    return dataset
+
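The sampler in prepare_metas splits the usable frame range [2, vid_len - 2) into four equal bins and draws one random frame index per bin, so every sampled clip spans the whole video rather than a local window. A standalone sketch of just that arithmetic, assuming a hypothetical 40-frame video:

    import random

    vid_len = 40  # hypothetical video length (videos shorter than 11 frames are skipped)
    start_idx, end_idx = 2, vid_len - 2
    bin_size = (end_idx - start_idx) // 4

    bins = []
    for i in range(4):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < 3 else end_idx  # last bin absorbs the remainder
        bins.append((bin_start, bin_end))

    # one index per bin, kept in temporal order
    sample_indx = sorted(random.randint(s, e - 1) for s, e in bins)
    print(bins, sample_indx)  # e.g. [(2, 11), (11, 20), (20, 29), (29, 38)]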
.history/mbench/gpt_ref-ytvos-cy_20250121155719.py ADDED
@@ -0,0 +1,428 @@
+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+from mbench.ytvos_ref import build as build_ytvos_ref
+import argparse
+import opts
+
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+from openai import OpenAI
+import base64
+
+# Function to encode the image
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+
+# Captioner
+ytvos_category_valid_list = [
+    'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+    'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+    'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+    'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+    'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+    'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+]
+def getCaption(video_id, json_data):
+    # fetch the data
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+
+    cat_names = set()
+    all_captions = dict()
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+
+    # cat_names: e.g. person, snowboard
+    # 1. ask GPT directly whether the category can be the subject of an action
+    # 2. from the category information provided by Ref-YouTube-VOS, keep only the category names we want to handle
+
+    for cat_name in list(cat_names):
+        image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+        image_captions = {}
+
+        captioner = OpenAI()
+
+        # step 0: can it be the subject of an action?
+        is_movable = False
+        if cat_name in ytvos_category_valid_list:
+            is_movable = True
+
+        # response_check = captioner.chat.completions.create(
+        #     model="gpt-4o",
+        #     messages=[
+        #         {
+        #             "role": "user",
+        #             "content": f"""
+        #             Can a {cat_name} be a subject of distinct actions or movements?
+        #             For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
+        #             However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
+        #             Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
+        #             Answer only YES or NONE.
+        #             """
+        #         }
+        #     ],
+        # )
+        # response_check_content = response_check.choices[0].message.content.strip().lower()
+        # print(f"Movable Check for {cat_name}: {response_check_content}")
+
+        # if response_check_content == "yes": is_movable = True
+
+        if not is_movable:
+            print(f"Skipping {cat_name}: Determined to be non-movable.")
+            continue
+
+        for i in range(len(image_paths)):
+            image_path = image_paths[i]
+            frame_name = frame_names[i]
+            base64_image = encode_image(image_path)
+
+            # step 1: filtering
+            # print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+            response1 = captioner.chat.completions.create(
+                model="chatgpt-4o-latest",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
+                                Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
+                                Each action should be unique and clearly associated with a specific object.
+
+                                Respond with YES if:
+                                - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
+                                - The {cat_name}s involve clear, distinguishable actions performed independently.
+
+                                Respond with NONE if:
+                                - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
+                                - Actions are ambiguous, minor, or not clearly visible.
+
+                                If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
+                                If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
+
+                                Answer only YES or NONE."""
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                ],
+            )
+            response_content = response1.choices[0].message.content
+            should_caption = True if "yes" in response_content.lower() else False
+            # print(f"are {cat_name}s distinguished by action: {response_content}")
+
+            # step 2: build the dense caption
+            if should_caption:
+                response2 = captioner.chat.completions.create(
+                    model="chatgpt-4o-latest",
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": f"""
+                                    Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
+                                    1. Focus only on clear, unique, and prominent actions that distinguish each object.
+                                    2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+                                    3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+                                    4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+                                    5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+                                    6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                                    7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+                                    8. Include interactions with objects or other entities when they are prominent and observable.
+                                    9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+                                    Output only the caption.""",
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                },
+                            ],
+                        }
+                    ],
+                )
+
+                caption = response2.choices[0].message.content
+                # print(f"{image_path} - {frame_name}: {caption}")
+            else:
+                caption = None
+
+            image_captions[frame_name] = caption
+        all_captions[cat_name] = image_captions
+
+    # finally: also prepare valid object ids
+    valid_obj_ids = []
+    valid_cat_names = list(all_captions.keys())
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat = video_data['annotations'][0][obj_id]['category_name']
+        if cat in valid_cat_names: valid_obj_ids.append(obj_id)
+
+    return all_captions, valid_obj_ids
+
+# Referring expression generator and QA filter
+def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+
+    # draw the object's bounding box on the image
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+    frame_indx = frame_names.index(frame_name)
+    obj_data = video_data['annotations'][frame_indx][obj_id]
+
+    bbox = obj_data['bbox']
+    cat_name = obj_data['category_name']
+    valid = obj_data['valid']
+
+    if valid == 0:
+        print("Object not in this frame!")
+        return {}
+
+
+    x_min, y_min, x_max, y_max = bbox
+    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+    plt.figure()
+    plt.imshow(I)
+    plt.axis('off')
+    plt.show()
+
+    # cropped object for visibility check
+    cropped_I = I[y_min:y_max, x_min:x_max]
+    pil_cropped_I = Image.fromarray(cropped_I)
+    buff_crop = BytesIO()
+    pil_cropped_I.save(buff_crop, format='JPEG')
+    base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
+
+    # entire image for referring expression generation
+    pil_I = Image.fromarray(I)
+    buff = BytesIO()
+    pil_I.save(buff, format='JPEG')
+    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+    # check whether the object is distinguishable
+    generator = OpenAI()
+    response_check = generator.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
+                        Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
+
+                        Guidelines:
+                        - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
+                        - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
+                        - If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
+
+                        Output only either YES or NONE.
+                        """
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
+                    }
+                ]
+            },
+        ]
+    )
+
+    response_check_content = response_check.choices[0].message.content.strip().lower()
+    # print(f"is object {obj_id} visible: {response_check_content}")
+
+    if "yes" not in response_check_content:
+        print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
+        return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": False}
+
+    # build the referring expression
+    # generator = OpenAI()
+    response = generator.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
+                        Guidelines for creating the referring expression:
+                        1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
+                        2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
+                        3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
+                        4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
+                        5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
+                        6. Use '{cat_name}' as the noun for the referring expressions.
+                        Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
+
+                        {caption}
+                        """
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                    # {
+                    #     "type": "image_url",
+                    #     "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
+                    # }
+                ],
+            }
+        ],
+    )
+
+    ref_exp = response.choices[0].message.content.strip()
+
+    # QA filtering
+    # QA1: does the expression describe the intended object?
+    filter = OpenAI()
+    response1 = filter.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                        {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response1_content = response1.choices[0].message.content
+    describesHighlighted = True if "yes" in response1_content.lower() else False
+
+    # QA2: does the expression avoid describing an unintended object?
+    response2 = filter.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
+                        {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response2_content = response2.choices[0].message.content
+    notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
+
+    isValid = True if describesHighlighted and notDescribesNotHighlighted else False
+
+    # print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
+    # print(f"ref exp: {ref_exp}")
+    # print("")
+
+    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
+
+if __name__ == '__main__':
+    with open('mbench/sampled_frame3.json', 'r') as file:
+        data = json.load(file)
+
+    vid_ids = list(data.keys())
+    all_ref_exps = {}
+
+    os.environ['OPENAI_API_KEY'] = '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'
+
+    # for every vid_id in the dataset
+    for i in range(1):
+        vid_id = vid_ids[i]
+
+        # ==== build captions ====
+        # print("=====================captioner========================")
+        captions, valid_obj_ids = getCaption(vid_id, data)
+        cats_in_vid = list(captions.keys())
+        # print()
+
+        # ==== build referring expressions and run QA filtering ====
+        # print("=====================referring expression generator & QA filter========================")
+        ref_expressions = {}
+
+        # for each category
+        for cat_name in cats_in_vid:
+            if cat_name not in ref_expressions:
+                ref_expressions[cat_name] = {}
+            # for each video frame
+            for frame_name in data[vid_id]['frame_names']:
+                # print(f'--------category: {cat_name}, frame_name: {frame_name}')
+
+                if frame_name not in ref_expressions[cat_name]:
+                    ref_expressions[cat_name][frame_name] = {}  # create frame-level dictionary
+                caption = captions[cat_name][frame_name]
+                if not caption: continue
+                else:
+                    # for each obj id
+                    for obj_id in valid_obj_ids:
+                        ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
+                        ref_expressions[cat_name][frame_name][obj_id] = ref_exp  # store ref_exp
+
+        all_ref_exps[vid_id] = ref_expressions
+
+
+    with open('mbench/result_revised.json', 'w') as file:
+        json.dump(all_ref_exps, file, indent=4)
+
+
+
+
+
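Every GPT call in the script above uses the same payload shape: a single user message whose content mixes a text block with a base64 data-URL image. A reduced sketch of that pattern with the openai v1 client (the model name, prompt, and frame path are placeholders, and the API key is assumed to come from the environment rather than being hardcoded as it is above):

    import base64
    from openai import OpenAI

    def encode_image(image_path):
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    b64 = encode_image("frame.jpg")  # hypothetical frame path
    resp = client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "Answer only YES or NONE: ..."},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
            ],
        }],
    )
    print(resp.choices[0].message.content)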
.history/mbench/gpt_ref-ytvos_20250119070039.py ADDED
@@ -0,0 +1,277 @@
+from datasets import build_dataset
+import argparse
+import opts
+
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+from openai import OpenAI
+import base64
+
+os.environ['OPENAI_API_KEY'] = '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'
+
+# Function to encode the image
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+
+def getCaption(video_id, json_data):
+    # fetch the data
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+
+    cat_names = set()
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+
+    if len(cat_names) == 1:
+        cat_name = next(iter(cat_names))
+    else:
+        print("more than one category")
+        return -1
+
+    image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+    image_captions = {}
+
+    captioner = OpenAI()
+    for i in range(len(image_paths)):
+        image_path = image_paths[i]
+        frame_name = frame_names[i]
+        base64_image = encode_image(image_path)
+
+        # step 1: filtering
+        response1 = captioner.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                        },
+                    ],
+                }
+            ],
+        )
+        response_content = response1.choices[0].message.content
+        should_caption = True if "yes" in response_content.lower() else False
+
+        # step 2: build the dense caption
+        if should_caption:
+            response2 = captioner.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": f"""
+                                Describe the image in detail focusing on the {cat_name}s' actions.
+                                1. Each action should be prominent, clear and unique, describing the corresponding object only.
+                                2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
+                                3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
+                                4. Do not include actions that need to be guessed or suggested.""",
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                ],
+            )
+
+            caption = response2.choices[0].message.content
+        else:
+            caption = None
+
+        image_captions[frame_name] = caption
+    return image_captions
+
+def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+    # draw the object's bounding box on the image
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+    frame_indx = frame_names.index(frame_name)
+    obj_data = video_data['annotations'][frame_indx][obj_id]
+
+    bbox = obj_data['bbox']
+    cat_name = obj_data['category_name']
+    valid = obj_data['valid']
+
+    if valid == 0:
+        print("Object not in this frame!")
+        return {}
+
+
+    x_min, y_min, x_max, y_max = bbox
+    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+    plt.figure()
+    plt.imshow(I)
+    plt.axis('off')
+    plt.show()
+    pil_I = Image.fromarray(I)
+    buff = BytesIO()
+    pil_I.save(buff, format='JPEG')
+    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+    # build the referring expression
+    generator = OpenAI()
+    response = generator.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
+                        1. The referring expression describes the action and does not contain information about appearance or location in the picture.
+                        2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
+                        3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
+                        4. The referring expression should only describe the highlighted {cat_name} and not any other.
+                        5. Use '{cat_name}' as the noun for the referring expressions.
+                        Output only the referring expression.
+                        {caption}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    ref_exp = response.choices[0].message.content
+
+    # QA filtering
+    # QA1: does the expression describe the intended object?
+    filter = OpenAI()
+    response1 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                        {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response1_content = response1.choices[0].message.content
+    describesHighlighted = True if "yes" in response1_content.lower() else False
+
+    # QA2: does the expression avoid describing an unintended object?
+    response2 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
+                        {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response2_content = response2.choices[0].message.content
+    describesNotHighlighted = True if "yes" in response2_content.lower() else False
+
+    isValid = True if describesHighlighted and not describesNotHighlighted else False
+
+    print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
+
+    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
+def createRefExp(video_id, json_data):
+    video_data = json_data[video_id]
+    obj_ids = list(video_data['annotations'][0].keys())
+    frame_names = video_data['frame_names']
+
+    captions_per_frame = getCaption(video_id, json_data)
+
+    if captions_per_frame == -1:
+        print("There is more than one category")
+        return
+
+
+    video_ref_exps = {}
+
+    for frame_name in frame_names:
+        frame_caption = captions_per_frame[frame_name]
+
+        if frame_caption == None:
+            video_ref_exps[frame_name] = None
+
+        else:
+            frame_ref_exps = {}
+            for obj_id in obj_ids:
+                exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
+                frame_ref_exps[obj_id] = exp_per_obj
+            video_ref_exps[frame_name] = frame_ref_exps
+
+    return video_ref_exps
+
+if __name__ == '__main__':
+    with open('mbench/sampled_frame3.json', 'r') as file:
+        data = json.load(file)
+
+    all_video_refs = {}
+    for i in range(3):
+        video_id = list(data.keys())[i]
+        video_ref = createRefExp(video_id, data)
+        all_video_refs[video_id] = video_ref
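The two QA prompts above reduce to one validity predicate: the expression must match the boxed object (QA1) and must not also match an unboxed one (QA2). As a small sketch, assuming the two yes/no answers have already been extracted from the model responses:

    def is_valid_expression(answer_q1: str, answer_q2: str) -> bool:
        describes_highlighted = "yes" in answer_q1.lower()      # QA1: describes the boxed object
        describes_not_highlighted = "yes" in answer_q2.lower()  # QA2: also matches a non-boxed object
        return describes_highlighted and not describes_not_highlighted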
.history/mbench/gpt_ref-ytvos_20250119070740.py ADDED
@@ -0,0 +1,285 @@
+from datasets import build_dataset
+import argparse
+import opts
+
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+from openai import OpenAI
+import base64
+
+os.environ['OPENAI_API_KEY'] = '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'
+
+# Function to encode the image
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+
+def getCaption(video_id, json_data):
+    # fetch the data
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+
+    cat_names = set()
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+
+    if len(cat_names) == 1:
+        cat_name = next(iter(cat_names))
+    else:
+        print("more than one category")
+        return -1
+
+    image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+    image_captions = {}
+
+    captioner = OpenAI()
+    for i in range(len(image_paths)):
+        image_path = image_paths[i]
+        frame_name = frame_names[i]
+        base64_image = encode_image(image_path)
+
+        # step 1: filtering
+        response1 = captioner.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                        },
+                    ],
+                }
+            ],
+        )
+        response_content = response1.choices[0].message.content
+        should_caption = True if "yes" in response_content.lower() else False
+
+        # step 2: build the dense caption
+        if should_caption:
+            response2 = captioner.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": f"""
+                                Describe the image in detail focusing on the {cat_name}s' actions.
+                                1. Each action should be prominent, clear and unique, describing the corresponding object only.
+                                2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
+                                3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
+                                4. Do not include actions that need to be guessed or suggested.""",
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                ],
+            )
+
+            caption = response2.choices[0].message.content
+        else:
+            caption = None
+
+        image_captions[frame_name] = caption
+    return image_captions
+
+def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+    # draw the object's bounding box on the image
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+    frame_indx = frame_names.index(frame_name)
+    obj_data = video_data['annotations'][frame_indx][obj_id]
+
+    bbox = obj_data['bbox']
+    cat_name = obj_data['category_name']
+    valid = obj_data['valid']
+
+    if valid == 0:
+        print("Object not in this frame!")
+        return {}
+
+
+    x_min, y_min, x_max, y_max = bbox
+    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+    plt.figure()
+    plt.imshow(I)
+    plt.axis('off')
+    plt.show()
+    pil_I = Image.fromarray(I)
+    buff = BytesIO()
+    pil_I.save(buff, format='JPEG')
+    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+    # build the referring expression
+    generator = OpenAI()
+    response = generator.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
+                        1. The referring expression describes the action and does not contain information about appearance or location in the picture.
+                        2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
+                        3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
+                        4. The referring expression should only describe the highlighted {cat_name} and not any other.
+                        5. Use '{cat_name}' as the noun for the referring expressions.
+                        Output only the referring expression.
+                        {caption}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    ref_exp = response.choices[0].message.content
+
+    # QA filtering
+    # QA1: does the expression describe the intended object?
+    filter = OpenAI()
+    response1 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                        {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response1_content = response1.choices[0].message.content
+    describesHighlighted = True if "yes" in response1_content.lower() else False
+
+    # QA2: does the expression avoid describing an unintended object?
+    response2 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
+                        {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response2_content = response2.choices[0].message.content
+    describesNotHighlighted = True if "yes" in response2_content.lower() else False
+
+    isValid = True if describesHighlighted and not describesNotHighlighted else False
+
+    print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
+
+    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
+def createRefExp(video_id, json_data):
+    video_data = json_data[video_id]
+    obj_ids = list(video_data['annotations'][0].keys())
+    frame_names = video_data['frame_names']
+
+    captions_per_frame = getCaption(video_id, json_data)
+
+    if captions_per_frame == -1:
+        print("There is more than one category")
+        return
+
+
+    video_ref_exps = {}
+
+    for frame_name in frame_names:
+        frame_caption = captions_per_frame[frame_name]
+
+        if frame_caption == None:
+            video_ref_exps[frame_name] = None
+
+        else:
+            frame_ref_exps = {}
+            for obj_id in obj_ids:
+                exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
+                frame_ref_exps[obj_id] = exp_per_obj
+            video_ref_exps[frame_name] = frame_ref_exps
+
+    return video_ref_exps
+
+if __name__ == '__main__':
+    with open('mbench/sampled_frame3.json', 'r') as file:
+        data = json.load(file)
+
+    videos = set()
+    with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
+        manual_select = list(file)
+    for frame in manual_select:
+        result = json.loads(frame)
+        videos.add(result['video'])
+
+
+    all_video_refs = {}
+    for i in range(10):
+        video_id = list(data.keys())[i]
+        video_ref = createRefExp(video_id, data)
+        all_video_refs[video_id] = video_ref
.history/mbench/gpt_ref-ytvos_20250119071412.py ADDED
@@ -0,0 +1,292 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+ from pathlib import Path
10
+ import os
11
+ import skimage
12
+ from io import BytesIO
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ from openai import OpenAI
37
+ import base64
38
+
39
+ os.environ['OPENAI_API_KEY'] = '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'
40
+
41
+ # Function to encode the image
42
+ def encode_image(image_path):
43
+ with open(image_path, "rb") as image_file:
44
+ return base64.b64encode(image_file.read()).decode("utf-8")
45
+
46
+ def getCaption(video_id, json_data):
47
+ #데이터 가져오기
48
+ video_data = json_data[video_id]
49
+ frame_names = video_data['frame_names']
50
+ video_path = video_data['video_path']
51
+
52
+ cat_names = set()
53
+ for obj_id in list(video_data['annotations'][0].keys()):
54
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
55
+
56
+ if len(cat_names) == 1:
57
+ cat_name = next(iter(cat_names))
58
+ else:
59
+ print("more than 2 categories")
60
+ return -1
61
+
62
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
63
+ image_captions = {}
64
+
65
+ captioner = OpenAI()
66
+ for i in range(len(image_paths)):
67
+ image_path = image_paths[i]
68
+ frame_name = frame_names[i]
69
+ base64_image = encode_image(image_path)
70
+
71
+ #1단계: 필터링
72
+ response1 = captioner.chat.completions.create(
73
+ model="gpt-4o-mini",
74
+ messages=[
75
+ {
76
+ "role": "user",
77
+ "content": [
78
+ {
79
+ "type": "text",
80
+ "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
81
+ },
82
+ {
83
+ "type": "image_url",
84
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
85
+ },
86
+ ],
87
+ }
88
+ ],
89
+ )
90
+ response_content = response1.choices[0].message.content
91
+ should_caption = True if "yes" in response_content.lower() else False
92
+
93
+ #2단계: dense caption 만들기
94
+ if should_caption:
95
+ response2 = captioner.chat.completions.create(
96
+ model="gpt-4o-mini",
97
+ messages=[
98
+ {
99
+ "role": "user",
100
+ "content": [
101
+ {
102
+ "type": "text",
103
+ "text": f"""
104
+ Describe the image in detail focusing on the {cat_name}s' actions.
105
+ 1. Each action should be prominent, clear and unique, describing the corresponding object only.
106
+ 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
107
+ 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
108
+ 4. Do not include actions that needs to be guessed or suggested.""",
109
+ },
110
+ {
111
+ "type": "image_url",
112
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
113
+ },
114
+ ],
115
+ }
116
+ ],
117
+ )
118
+
119
+ caption = response2.choices[0].message.content
120
+ else:
121
+ caption = None
122
+
123
+ image_captions[frame_name] = caption
124
+ return image_captions
125
+
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+     # Draw the target object's bounding box on the image
+     video_data = json_data[video_id]
+     frame_names = video_data['frame_names']
+     video_path = video_data['video_path']
+     I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+     frame_indx = frame_names.index(frame_name)
+     obj_data = video_data['annotations'][frame_indx][obj_id]
+
+     bbox = obj_data['bbox']
+     cat_name = obj_data['category_name']
+     valid = obj_data['valid']
+
+     if valid == 0:
+         print("Object not in this frame!")
+         return {}
+
+     x_min, y_min, x_max, y_max = bbox
+     x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+     cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+     plt.figure()
+     plt.imshow(I)
+     plt.axis('off')
+     plt.show()
+     pil_I = Image.fromarray(I)
+     buff = BytesIO()
+     pil_I.save(buff, format='JPEG')
+     base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+     # Generate the referring expression
+     generator = OpenAI()
+     response = generator.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
+                         1. The referring expression describes the action and does not contain information about appearance or location in the picture.
+                         2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
+                         3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
+                         4. The referring expression should only describe the highlighted {cat_name} and not any other.
+                         5. Use '{cat_name}' as the noun for the referring expressions.
+                         Output only the referring expression.
+                         {caption}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     ref_exp = response.choices[0].message.content
+
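+     # Validity check: the expression must pass two complementary QA probes.
+     # It must describe the boxed object (QA1) and must not also fit an
+     # unboxed object of the same category (QA2); isValid is the conjunction.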
+     # QA filtering
+     # QA1: does the expression describe the target object?
+     qa_filter = OpenAI()
+     response1 = qa_filter.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                         {ref_exp}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     response1_content = response1.choices[0].message.content
+     describesHighlighted = "yes" in response1_content.lower()
+
+     # QA2: does the expression avoid describing the other, unboxed objects?
+     response2 = qa_filter.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Does the given expression describe the {cat_name} not highlighted with the red box? If so, only return YES and if not, NO.
+                         {ref_exp}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     response2_content = response2.choices[0].message.content
+     describesNotHighlighted = "yes" in response2_content.lower()
+
+     isValid = describesHighlighted and not describesNotHighlighted
+
+     print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
+
+     return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
+ def createRefExp(video_id, json_data):
+     video_data = json_data[video_id]
+     obj_ids = list(video_data['annotations'][0].keys())
+     frame_names = video_data['frame_names']
+
+     captions_per_frame = getCaption(video_id, json_data)
+
+     if captions_per_frame == -1:
+         print("There is more than one category")
+         return
+
+     video_ref_exps = {}
+
+     for frame_name in frame_names:
+         frame_caption = captions_per_frame[frame_name]
+
+         if frame_caption is None:
+             video_ref_exps[frame_name] = None
+
+         else:
+             frame_ref_exps = {}
+             for obj_id in obj_ids:
+                 exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
+                 frame_ref_exps[obj_id] = exp_per_obj
+             video_ref_exps[frame_name] = frame_ref_exps
+
+     return video_ref_exps
+
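+ # Smoke-test driver: the range(1) loop below processes only the first
+ # selected video.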
+ if __name__ == '__main__':
+     with open('mbench/sampled_frame3.json', 'r') as file:
+         data = json.load(file)
+
+     videos = set()
+     with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
+         manual_select = list(file)
+     for frame in manual_select:
+         result = json.loads(frame)
+         videos.add(result['video'])
+     videos = list(videos)
+
+     all_video_refs = {}
+     for i in range(1):
+         video_id = videos[i]
+         video_ref = createRefExp(video_id, data)
+         all_video_refs[video_id] = video_ref
+
+     json_obj = json.dumps(all_video_refs, indent=4)
+     with open('mbench/result.json', 'w') as file:
+         file.write(json_obj)
.history/mbench/gpt_ref-ytvos_20250119072601.py ADDED
@@ -0,0 +1,292 @@
+ import sys
+ from os import path as osp
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+ from datasets import build_dataset
+ import argparse
+ import opts
+
+ from pathlib import Path
+ import os
+ import skimage.io  # imported explicitly: skimage.io.imread is used in getRefExp
+ from io import BytesIO
+
+ import numpy as np
+ import pandas as pd
+ import regex as re
+ import json
+
+ import cv2
+ from PIL import Image, ImageDraw
+ import torch
+ from torchvision.transforms import functional as F
+
+ from skimage import measure # (pip install scikit-image)
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ from matplotlib.collections import PatchCollection
+ from matplotlib.patches import Rectangle
+
+
+ import ipywidgets as widgets
+ from IPython.display import display, clear_output
+
+ from openai import OpenAI
+ import base64
+
+ os.environ['OPENAI_API_KEY'] = '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'
+
+ # Function to encode the image
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode("utf-8")
+
+ def getCaption(video_id, json_data):
+     # Load the data
+     video_data = json_data[video_id]
+     frame_names = video_data['frame_names']
+     video_path = video_data['video_path']
+
+     cat_names = set()
+     for obj_id in list(video_data['annotations'][0].keys()):
+         cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+
+     if len(cat_names) == 1:
+         cat_name = next(iter(cat_names))
+     else:
+         print("more than one category")
+         return -1
+
+     image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+     image_captions = {}
+
+     captioner = OpenAI()
+     for i in range(len(image_paths)):
+         image_path = image_paths[i]
+         frame_name = frame_names[i]
+         base64_image = encode_image(image_path)
+
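+         # Step 1 asks a per-frame yes/no question; only frames where multiple
+         # category instances are distinguishable by action ("YES") get a dense
+         # caption in step 2, so ambiguous frames are skipped early.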
+         # Step 1: filtering
+         response1 = captioner.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=[
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
+                         },
+                         {
+                             "type": "image_url",
+                             "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                         },
+                     ],
+                 }
+             ],
+         )
+         response_content = response1.choices[0].message.content
+         should_caption = "yes" in response_content.lower()
+
+         # Step 2: generate a dense caption
+         if should_caption:
+             response2 = captioner.chat.completions.create(
+                 model="gpt-4o-mini",
+                 messages=[
+                     {
+                         "role": "user",
+                         "content": [
+                             {
+                                 "type": "text",
+                                 "text": f"""
+                                 Describe the image in detail focusing on the {cat_name}s' actions.
+                                 1. Each action should be prominent, clear and unique, describing the corresponding object only.
+                                 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
+                                 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
+                                 4. Do not include actions that need to be guessed or suggested.""",
+                             },
+                             {
+                                 "type": "image_url",
+                                 "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                             },
+                         ],
+                     }
+                 ],
+             )
+
+             caption = response2.choices[0].message.content
+         else:
+             caption = None
+
+         image_captions[frame_name] = caption
+     return image_captions
+
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+     # Draw the target object's bounding box on the image
+     video_data = json_data[video_id]
+     frame_names = video_data['frame_names']
+     video_path = video_data['video_path']
+     I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+     frame_indx = frame_names.index(frame_name)
+     obj_data = video_data['annotations'][frame_indx][obj_id]
+
+     bbox = obj_data['bbox']
+     cat_name = obj_data['category_name']
+     valid = obj_data['valid']
+
+     if valid == 0:
+         print("Object not in this frame!")
+         return {}
+
+     x_min, y_min, x_max, y_max = bbox
+     x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+     cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+     plt.figure()
+     plt.imshow(I)
+     plt.axis('off')
+     plt.show()
+     pil_I = Image.fromarray(I)
+     buff = BytesIO()
+     pil_I.save(buff, format='JPEG')
+     base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+     # Generate the referring expression
+     generator = OpenAI()
+     response = generator.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
+                         1. The referring expression describes the action and does not contain information about appearance or location in the picture.
+                         2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
+                         3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
+                         4. The referring expression should only describe the highlighted {cat_name} and not any other.
+                         5. Use '{cat_name}' as the noun for the referring expressions.
+                         Output only the referring expression.
+                         {caption}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     ref_exp = response.choices[0].message.content
+
+     # QA filtering
+     # QA1: does the expression describe the target object?
+     qa_filter = OpenAI()
+     response1 = qa_filter.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                         {ref_exp}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     response1_content = response1.choices[0].message.content
+     describesHighlighted = "yes" in response1_content.lower()
+
+     # QA2: does the expression avoid describing the other, unboxed objects?
+     response2 = qa_filter.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Does the given expression describe the {cat_name} not highlighted with the red box? If so, only return YES and if not, NO.
+                         {ref_exp}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     response2_content = response2.choices[0].message.content
+     describesNotHighlighted = "yes" in response2_content.lower()
+
+     isValid = describesHighlighted and not describesNotHighlighted
+
+     print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
+
+     return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
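+ # createRefExp ties the pipeline together: one dense caption per frame, then
+ # one QA-filtered referring expression per (frame, object) pair. The result
+ # is shaped like {frame_name: {obj_id: {"ref_exp": ..., "isValid": ...}}},
+ # with None for frames whose filter step produced no caption.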
+ def createRefExp(video_id, json_data):
+     video_data = json_data[video_id]
+     obj_ids = list(video_data['annotations'][0].keys())
+     frame_names = video_data['frame_names']
+
+     captions_per_frame = getCaption(video_id, json_data)
+
+     if captions_per_frame == -1:
+         print("There is more than one category")
+         return None
+
+     video_ref_exps = {}
+
+     for frame_name in frame_names:
+         frame_caption = captions_per_frame[frame_name]
+
+         if frame_caption is None:
+             video_ref_exps[frame_name] = None
+
+         else:
+             frame_ref_exps = {}
+             for obj_id in obj_ids:
+                 exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
+                 frame_ref_exps[obj_id] = exp_per_obj
+             video_ref_exps[frame_name] = frame_ref_exps
+
+     return video_ref_exps
+
+ if __name__ == '__main__':
+     with open('mbench/sampled_frame3.json', 'r') as file:
+         data = json.load(file)
+
+     videos = set()
+     with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
+         manual_select = list(file)
+     for frame in manual_select:
+         result = json.loads(frame)
+         videos.add(result['video'])
+     videos = list(videos)
+
+     all_video_refs = {}
+     for i in range(1, 2):
+         video_id = videos[i]
+         video_ref = createRefExp(video_id, data)
+         all_video_refs[video_id] = video_ref
+
+     json_obj = json.dumps(all_video_refs, indent=4)
+     with open('mbench/result.json', 'w') as file:
+         file.write(json_obj)
.history/mbench/gpt_ref-ytvos_20250119073047.py ADDED
@@ -0,0 +1,292 @@
+ import sys
+ from os import path as osp
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+ from datasets import build_dataset
+ import argparse
+ import opts
+
+ from pathlib import Path
+ import os
+ import skimage.io  # imported explicitly: skimage.io.imread is used in getRefExp
+ from io import BytesIO
+
+ import numpy as np
+ import pandas as pd
+ import regex as re
+ import json
+
+ import cv2
+ from PIL import Image, ImageDraw
+ import torch
+ from torchvision.transforms import functional as F
+
+ from skimage import measure # (pip install scikit-image)
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ from matplotlib.collections import PatchCollection
+ from matplotlib.patches import Rectangle
+
+
+ import ipywidgets as widgets
+ from IPython.display import display, clear_output
+
+ from openai import OpenAI
+ import base64
+
+ os.environ['OPENAI_API_KEY'] = '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'
+
+ # Function to encode the image
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode("utf-8")
+
+ def getCaption(video_id, json_data):
+     # Load the data
+     video_data = json_data[video_id]
+     frame_names = video_data['frame_names']
+     video_path = video_data['video_path']
+
+     cat_names = set()
+     for obj_id in list(video_data['annotations'][0].keys()):
+         cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+
+     if len(cat_names) == 1:
+         cat_name = next(iter(cat_names))
+     else:
+         print("more than one category")
+         return -1
+
+     image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+     image_captions = {}
+
+     captioner = OpenAI()
+     for i in range(len(image_paths)):
+         image_path = image_paths[i]
+         frame_name = frame_names[i]
+         base64_image = encode_image(image_path)
+
+         # Step 1: filtering
+         response1 = captioner.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=[
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
+                         },
+                         {
+                             "type": "image_url",
+                             "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                         },
+                     ],
+                 }
+             ],
+         )
+         response_content = response1.choices[0].message.content
+         should_caption = "yes" in response_content.lower()
+
+         # Step 2: generate a dense caption
+         if should_caption:
+             response2 = captioner.chat.completions.create(
+                 model="gpt-4o-mini",
+                 messages=[
+                     {
+                         "role": "user",
+                         "content": [
+                             {
+                                 "type": "text",
+                                 "text": f"""
+                                 Describe the image in detail focusing on the {cat_name}s' actions.
+                                 1. Each action should be prominent, clear and unique, describing the corresponding object only.
+                                 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
+                                 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
+                                 4. Do not include actions that need to be guessed or suggested.""",
+                             },
+                             {
+                                 "type": "image_url",
+                                 "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                             },
+                         ],
+                     }
+                 ],
+             )
+
+             caption = response2.choices[0].message.content
+         else:
+             caption = None
+
+         image_captions[frame_name] = caption
+     return image_captions
+
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+     # Draw the target object's bounding box on the image
+     video_data = json_data[video_id]
+     frame_names = video_data['frame_names']
+     video_path = video_data['video_path']
+     I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+     frame_indx = frame_names.index(frame_name)
+     obj_data = video_data['annotations'][frame_indx][obj_id]
+
+     bbox = obj_data['bbox']
+     cat_name = obj_data['category_name']
+     valid = obj_data['valid']
+
+     if valid == 0:
+         print("Object not in this frame!")
+         return {}
+
+     x_min, y_min, x_max, y_max = bbox
+     x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+     cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+     plt.figure()
+     plt.imshow(I)
+     plt.axis('off')
+     plt.show()
+     pil_I = Image.fromarray(I)
+     buff = BytesIO()
+     pil_I.save(buff, format='JPEG')
+     base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+     # Generate the referring expression
+     generator = OpenAI()
+     response = generator.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
+                         1. The referring expression describes the action and does not contain information about appearance or location in the picture.
+                         2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
+                         3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
+                         4. The referring expression should only describe the highlighted {cat_name} and not any other.
+                         5. Use '{cat_name}' as the noun for the referring expressions.
+                         Output only the referring expression.
+                         {caption}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     ref_exp = response.choices[0].message.content
+
+     # QA filtering
+     # QA1: does the expression describe the target object?
+     qa_filter = OpenAI()
+     response1 = qa_filter.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                         {ref_exp}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     response1_content = response1.choices[0].message.content
+     describesHighlighted = "yes" in response1_content.lower()
+
+     # QA2: does the expression avoid describing the other, unboxed objects?
+     response2 = qa_filter.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Does the given expression describe the {cat_name} not highlighted with the red box? If so, only return YES and if not, NO.
+                         {ref_exp}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     response2_content = response2.choices[0].message.content
+     describesNotHighlighted = "yes" in response2_content.lower()
+
+     isValid = describesHighlighted and not describesNotHighlighted
+
+     print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
+
+     return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
+ def createRefExp(video_id, json_data):
+     video_data = json_data[video_id]
+     obj_ids = list(video_data['annotations'][0].keys())
+     frame_names = video_data['frame_names']
+
+     captions_per_frame = getCaption(video_id, json_data)
+
+     if captions_per_frame == -1:
+         print("There is more than one category")
+         return None
+
+     video_ref_exps = {}
+
+     for frame_name in frame_names:
+         frame_caption = captions_per_frame[frame_name]
+
+         if frame_caption is None:
+             video_ref_exps[frame_name] = None
+
+         else:
+             frame_ref_exps = {}
+             for obj_id in obj_ids:
+                 exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
+                 frame_ref_exps[obj_id] = exp_per_obj
+             video_ref_exps[frame_name] = frame_ref_exps
+
+     return video_ref_exps
+
+ if __name__ == '__main__':
+     with open('mbench/sampled_frame3.json', 'r') as file:
+         data = json.load(file)
+
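+     # selected_frames.jsonl holds one JSON object per line; collecting the
+     # 'video' field into a set de-duplicates videos that were selected
+     # through several of their frames.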
+     videos = set()
+     with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
+         manual_select = list(file)
+     for frame in manual_select:
+         result = json.loads(frame)
+         videos.add(result['video'])
+     videos = list(videos)
+
+     all_video_refs = {}
+     for i in range(10):
+         video_id = videos[i]
+         video_ref = createRefExp(video_id, data)
+         all_video_refs[video_id] = video_ref
+
+     json_obj = json.dumps(all_video_refs, indent=4)
+     with open('mbench/result.json', 'w') as file:
+         file.write(json_obj)
.history/mbench/gpt_ref-ytvos_numbered_cy_20250131124149.py ADDED
@@ -0,0 +1,427 @@
+ import os
+ import sys
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+ from os import path as osp
+ from io import BytesIO
+
+ from mbench.ytvos_ref import build as build_ytvos_ref
+ import argparse
+ import opts
+
+ from pathlib import Path
+ import skimage
+
+ import numpy as np
+ import pandas as pd
+ import regex as re
+ import json
+
+ import cv2
+ from PIL import Image, ImageDraw
+ import torch
+ from torchvision.transforms import functional as F
+
+ from skimage import measure # (pip install scikit-image)
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ from matplotlib.collections import PatchCollection
+ from matplotlib.patches import Rectangle
+ import textwrap
+
+
+ import ipywidgets as widgets
+ from IPython.display import display, clear_output
+
+ from openai import OpenAI
+ import base64
+
+ def number_objects_and_encode(idx, color_mask=False):
+     encoded_frames = {}
+     contoured_frames = {} # New dictionary for original images
+     vid_cat_cnts = {}
+
+     vid_meta = metas[idx]
+     vid_data = train_dataset[idx]
+     vid_id = vid_meta['video']
+     frame_indx = vid_meta['sample_indx']
+     cat_names = set(vid_meta['obj_id_cat'].values())
+     imgs = vid_data[0]
+
+     for cat in cat_names:
+         cat_frames = []
+         contour_frames = []
+         frame_cat_cnts = {}
+
+         for i in range(imgs.size(0)):
+             frame_name = frame_indx[i]
+             frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+             frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+
+             frame_data = vid_data[2][frame_name]
+             obj_ids = list(frame_data.keys())
+
+             cat_cnt = 0
+
+             for j in range(len(obj_ids)):
+                 obj_id = obj_ids[j]
+                 obj_data = frame_data[obj_id]
+                 obj_bbox = obj_data['bbox']
+                 obj_valid = obj_data['valid']
+                 obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                 obj_cat = obj_data['category_name']
+
+                 if obj_cat == cat and obj_valid:
+                     cat_cnt += 1
+
+                     if not color_mask:
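+                         # Centroid of each contour via image moments:
+                         #   cx = m10 / m00, cy = m01 / m00, where m00 is the contour area;
+                         # degenerate contours (m00 == 0) fall back to the first contour point.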
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 3)
+                         for contour in contours:
+                             # Compute the contour center
+                             moments = cv2.moments(contour)
+                             if moments["m00"] != 0:  # a center can be computed
+                                 cx = int(moments["m10"] / moments["m00"])
+                                 cy = int(moments["m01"] / moments["m00"])
+                             else:
+                                 cx, cy = contour[0][0]  # fallback coordinate when no center exists
+
+                             # Text background (black backdrop)
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+                             text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                             text_w, text_h = text_size
+
+                             # Draw the text background (black)
+                             cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                           (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+
+                             # Draw the text (white)
+                             cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                         font, 1, (255, 255, 255), 2)
+
+                     else:
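+                         # Alpha-blend a translucent color over the mask region:
+                         #   out = (1 - alpha) * frame + alpha * color, with a small alpha
+                         # so the underlying pixels stay clearly visible.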
+                         alpha = 0.08
+
+                         colored_obj_mask = np.zeros_like(frame)
+                         colored_obj_mask[obj_mask == 1] = colors[j]
+                         frame[obj_mask == 1] = (
+                             (1 - alpha) * frame[obj_mask == 1]
+                             + alpha * colored_obj_mask[obj_mask == 1]
+                         )
+
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 2)
+                         cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+
+                         if len(contours) > 0:
+                             largest_contour = max(contours, key=cv2.contourArea)
+                             M = cv2.moments(largest_contour)
+                             if M["m00"] != 0:
+                                 center_x = int(M["m10"] / M["m00"])
+                                 center_y = int(M["m01"] / M["m00"])
+                             else:
+                                 center_x, center_y = 0, 0
+
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+
+                             font_scale = 0.9
+                             text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                             text_x = center_x - text_size[0] // 1  # horizontal anchor of the text
+                             text_y = center_y
+                             # text_y = center_y + text_size[1] // 2  # vertical center of the text
+
+                             # Compute the text background rectangle
+                             rect_start = (text_x - 5, text_y - text_size[1] - 5)  # top-left of the backdrop
+                             # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+                             rect_end = (text_x + text_size[0] + 5, text_y)
+
+                             cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                             cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
+
+             # plt.figure(figsize=(12, 8))
+             # plt.imshow(frame)
+             # plt.title(f"frame {frame_name}")
+             # plt.tight_layout()
+             # plt.axis('off')
+             # plt.show()
+
+             buffer = BytesIO()
+             frame = Image.fromarray(frame)
+             frame.save(buffer, format='jpeg')
+             buffer.seek(0)
+             cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+             frame_cat_cnts[frame_name] = cat_cnt
+
+             buffer.seek(0)  # Reuse buffer instead of creating a new one
+             buffer.truncate()
+             frame_for_contour = Image.fromarray(frame_for_contour)
+             frame_for_contour.save(buffer, format='jpeg')
+             buffer.seek(0)
+             contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+
+         encoded_frames[cat] = cat_frames
+         contoured_frames[cat] = contour_frames
+         vid_cat_cnts[cat] = frame_cat_cnts
+
+     return encoded_frames, vid_cat_cnts, contoured_frames
+
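+ # getCaption drives the per-category pipeline: encode the numbered frames,
+ # run a YES/NONE filter per frame, then request one dense, action-only
+ # caption for each frame that passes the filter.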
+ def getCaption(idx, color_mask=True):
+     vid_meta = metas[idx]
+     vid_data = train_dataset[idx]
+     vid_id = vid_meta['video']
+     print(f"vid id: {vid_id}\n")
+
+     frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
+     cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
+     all_captions = dict()
+
+     base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
+     marked = "mask with boundary" if color_mask else "boundary"
+
+     for cat_name in list(cat_names):
+
+         is_movable = False
+         if cat_name in ytvos_category_valid_list:
+             is_movable = True
+
+         if not is_movable:
+             print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
+
+         image_captions = {}
+         captioner = OpenAI()
+         cat_base64_frames = base64_frames[cat_name]
+         cont_base64_frames = contoured_frames[cat_name]
+
+         for i in range(len(cat_base64_frames)):
+             frame_name = frame_indx[i]
+             cont_base64_image = cont_base64_frames[i]
+             base64_image = cat_base64_frames[i]
+             should_filter = False
+             frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
+
+             if frame_cat_cnts >= 2:
+                 should_filter = True
+             else:
+                 print(f"Skipping {cat_name}: There is a single object or none.", end='\n\n')
+
+             if is_movable and should_filter:
+                 # Step 1: filtering
+                 print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+                 caption_filter_text = f"""
+                 You are a visual assistant analyzing a single frame from a video.
+                 In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
+
+                 Are {cat_name}s in the image performing all different and recognizable actions or postures?
+                 Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
+                 facial expressions, and any notable interactions with objects or other {cat_name}s or people.
+
+                 Only focus on obvious, prominent actions that can be reliably identified from this single frame.
+
+                 - Respond with "YES" if:
+                 1) Most of the {cat_name}s exhibit clearly different, unique actions or poses.
+                 2) You can see clearly visible differences in action and posture that an observer can identify at a glance.
+                 3) Each action is unambiguously recognizable and distinct.
+
+                 - Respond with "NONE" if:
+                 1) The actions or poses are not clearly differentiable or are too similar.
+                 2) They show no noticeable action beyond standing or minor movements.
+
+                 Answer strictly with either "YES" or "NONE".
+                 """
+
+                 response1 = captioner.chat.completions.create(
+                     model="chatgpt-4o-latest",
+                     messages=[
+                         {
+                             "role": "user",
+                             "content": [
+                                 {
+                                     "type": "text",
+                                     "text": caption_filter_text,
+                                 },
+                                 {
+                                     "type": "image_url",
+                                     "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                 }
+                             ],
+                         }
+                     ],
+                 )
+                 response_content = response1.choices[0].message.content
+                 should_caption = "yes" in response_content.lower()
+                 print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
+
+             else:
+                 should_caption = False
+
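+             # Note: dense_caption_prompt_1 below appears to be an earlier draft kept
+             # in this history snapshot; only dense_caption_prompt is sent to the model.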
+             # Step 2: generate the dense caption
+             dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
+             In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
+             I want to use your expressions to create an action-centric referring expression dataset.
+             Therefore, your expressions for these {cat_name}s should describe the unique action of each object.
+
+             1. Focus only on clear, unique, and prominent actions that distinguish each object.
+             2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+             3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+             4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+             5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+             6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
+             7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+             8. Include interactions with objects or other entities when they are prominent and observable.
+             9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+             10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
+             11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
+             12. Do not mention object IDs.
+             13. Use '{cat_name}' as the noun for the referring expressions.
+
+             Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+             Output referring expressions for each object id.
+             """
+
+             dense_caption_prompt = f"""
+             You are a visual assistant analyzing a single frame of a video.
+             In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
+             I want to use your expressions to create an action-centric referring expression dataset.
+             Please describe each {cat_name} using **clearly observable** and **specific** actions.
+
+             ## Guidelines:
+             1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
+             2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
+             3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
+             4. Do not use vague expressions like "interacting with something" or "engaging with another object."
+                Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
+             5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
+             6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
+             7. Base your description on the following action definitions:
+                 - Facial with object manipulation
+                 - General body movement, body position or pattern
+                 - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
+                 - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").
+
+             ## Output Format:
+             - For each labeled {cat_name}, output one line in the format:
+                 ID. action-oriented description
+
+             Example:
+             1. a bear grasping the edge of a wood with its front paws
+             2. the bear pushing another bear, leaning forward
+
+             **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
+             **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
+             Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+             For each labeled {cat_name}, output referring expressions for each object id.
+             """
+             if should_caption:
+                 response2 = captioner.chat.completions.create(
+                     model="gpt-4o-mini",
+                     messages=[
+                         {
+                             "role": "user",
+                             "content": [
+                                 {
+                                     "type": "text",
+                                     "text": dense_caption_prompt,
+                                 },
+                                 {
+                                     "type": "image_url",
+                                     "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                 },
+                             ],
+                         }
+                     ],
+                 )
+
+                 caption = response2.choices[0].message.content
+                 #print(f"{image_path} - {frame_name}: {caption}")
+             else:
+                 caption = None
+
+             image_captions[frame_name] = caption
+         all_captions[cat_name] = image_captions
+
+     # Finally, also prepare the valid object ids
+     valid_obj_ids = dict()
+
+     for cat in cat_names:
+         if cat in ytvos_category_valid_list:
+             obj_id_cat = vid_meta['obj_id_cat']
+             valid_cat_ids = []
+             for obj_id in list(obj_id_cat.keys()):
+                 if obj_id_cat[obj_id] == cat:
+                     valid_cat_ids.append(obj_id)
+             valid_obj_ids[cat] = valid_cat_ids
+
+     return vid_id, all_captions, valid_obj_ids
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+     parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
+     parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
+
+     args = parser.parse_args()
+
+     #================== Load the data ===================
+     # Full dataset
+     train_dataset = build_ytvos_ref(image_set = 'train', args = args)
+
+     # Metadata for the full dataset
+     metas = train_dataset.metas
+
+     # 8 candidate colors (RGB)
+     colors = [
+         (255, 0, 0),    # Red
+         (0, 255, 0),    # Green
+         (0, 0, 255),    # Blue
+         (255, 255, 0),  # Yellow
+         (255, 0, 255),  # Magenta
+         (0, 255, 255),  # Cyan
+         (128, 0, 128),  # Purple
+         (255, 165, 0)   # Orange
+     ]
+
+     ytvos_category_valid_list = [
+         'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+         'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+         'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+         'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+         'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+         'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+     ]
+
+     #================== Run GPT ===================
+     os.environ['OPENAI_API_KEY'] = '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'
+
+     result_captions = {}
+     result_valid_obj_ids = {}
+
+     for i in range(370):
+         vid_id, all_captions, valid_obj_ids = getCaption(i, True)
+
+         if vid_id not in result_captions:
+             result_captions[vid_id] = all_captions
+         if vid_id not in result_valid_obj_ids:
+             result_valid_obj_ids[vid_id] = valid_obj_ids
+
+     print("Finished!", flush=True)
+
+     with open(args.save_caption_path, "w") as file:
+         json.dump(result_captions, file, indent=4)
+
+     with open(args.save_valid_obj_ids_path, "w") as file:
+         json.dump(result_valid_obj_ids, file, indent=4)
.history/mbench/gpt_ref-ytvos_numbered_cy_20250201141952.py ADDED
@@ -0,0 +1,460 @@
+ import os
+ import sys
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+ import time
+
+ from os import path as osp
+ from io import BytesIO
+
+ from mbench.ytvos_ref import build as build_ytvos_ref
+ import argparse
+ import opts
+
+ from pathlib import Path
+ import skimage
+
+ import numpy as np
+ import pandas as pd
+ import regex as re
+ import json
+
+ import cv2
+ from PIL import Image, ImageDraw
+ import torch
+ from torchvision.transforms import functional as F
+
+ from skimage import measure # (pip install scikit-image)
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ from matplotlib.collections import PatchCollection
+ from matplotlib.patches import Rectangle
+ import textwrap
+
+
+ import ipywidgets as widgets
+ from IPython.display import display, clear_output
+
+ from openai import OpenAI
+ import base64
+
+ def number_objects_and_encode(idx, color_mask=False):
+     encoded_frames = {}
+     contoured_frames = {} # New dictionary for original images
+     vid_cat_cnts = {}
+
+     vid_meta = metas[idx]
+     vid_data = train_dataset[idx]
+     vid_id = vid_meta['video']
+     frame_indx = vid_meta['sample_indx']
+     cat_names = set(vid_meta['obj_id_cat'].values())
+     imgs = vid_data[0]
+
+     for cat in cat_names:
+         cat_frames = []
+         contour_frames = []
+         frame_cat_cnts = {}
+
+         for i in range(imgs.size(0)):
+             frame_name = frame_indx[i]
+             frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+             frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+
+             frame_data = vid_data[2][frame_name]
+             obj_ids = list(frame_data.keys())
+
+             cat_cnt = 0
+
+             for j in range(len(obj_ids)):
+                 obj_id = obj_ids[j]
+                 obj_data = frame_data[obj_id]
+                 obj_bbox = obj_data['bbox']
+                 obj_valid = obj_data['valid']
+                 obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                 obj_cat = obj_data['category_name']
+
+                 if obj_cat == cat and obj_valid:
+                     cat_cnt += 1
+
+                     if not color_mask:
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 3)
+                         for contour in contours:
+                             # Compute the contour center
+                             moments = cv2.moments(contour)
+                             if moments["m00"] != 0:  # a center can be computed
+                                 cx = int(moments["m10"] / moments["m00"])
+                                 cy = int(moments["m01"] / moments["m00"])
+                             else:
+                                 cx, cy = contour[0][0]  # fallback coordinate when no center exists
+
+                             # Text background (black backdrop)
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+                             text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                             text_w, text_h = text_size
+
+                             # Draw the text background (black)
+                             cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                           (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+
+                             # Draw the text (white)
+                             cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                         font, 1, (255, 255, 255), 2)
+
+                     else:
+                         alpha = 0.08
+
+                         colored_obj_mask = np.zeros_like(frame)
+                         colored_obj_mask[obj_mask == 1] = colors[j]
+                         frame[obj_mask == 1] = (
+                             (1 - alpha) * frame[obj_mask == 1]
+                             + alpha * colored_obj_mask[obj_mask == 1]
+                         )
+
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 2)
+                         cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+
+                         if len(contours) > 0:
+                             largest_contour = max(contours, key=cv2.contourArea)
+                             M = cv2.moments(largest_contour)
+                             if M["m00"] != 0:
+                                 center_x = int(M["m10"] / M["m00"])
+                                 center_y = int(M["m01"] / M["m00"])
+                             else:
+                                 center_x, center_y = 0, 0
+
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+
+                             font_scale = 0.9
+                             text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                             text_x = center_x - text_size[0] // 1  # horizontal anchor of the text
+                             text_y = center_y
+                             # text_y = center_y + text_size[1] // 2  # vertical center of the text
+
+                             # Compute the text background rectangle
+                             rect_start = (text_x - 5, text_y - text_size[1] - 5)  # top-left of the backdrop
+                             # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+                             rect_end = (text_x + text_size[0] + 5, text_y)
+
+                             cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                             cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
+
+             # plt.figure(figsize=(12, 8))
+             # plt.imshow(frame)
+             # plt.title(f"frame {frame_name}")
+             # plt.tight_layout()
+             # plt.axis('off')
+             # plt.show()
+
+             buffer = BytesIO()
+             frame = Image.fromarray(frame)
+             frame.save(buffer, format='jpeg')
+             buffer.seek(0)
+             cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+             frame_cat_cnts[frame_name] = cat_cnt
+
+             buffer.seek(0)  # Reuse buffer instead of creating a new one
+             buffer.truncate()
+             frame_for_contour = Image.fromarray(frame_for_contour)
+             frame_for_contour.save(buffer, format='jpeg')
+             buffer.seek(0)
+             contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+
+         encoded_frames[cat] = cat_frames
+         contoured_frames[cat] = contour_frames
+         vid_cat_cnts[cat] = frame_cat_cnts
+
+     return encoded_frames, vid_cat_cnts, contoured_frames
+
+
+ def getCaption(idx, model='gpt-4o', color_mask=True):
+     vid_meta = metas[idx]
+     vid_data = train_dataset[idx]
+     vid_id = vid_meta['video']
+     print(f"vid id: {vid_id}\n")
+
+     frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
+     cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
+     all_captions = dict()
+
+     base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
+     #marked = "mask with boundary" if color_mask else "boundary"
+
+     for cat_name in list(cat_names):
+
+         is_movable = False
+         if cat_name in ytvos_category_valid_list:
+             is_movable = True
+
+         if not is_movable:
+             print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
+
+         image_captions = {}
+         captioner = OpenAI()
+         cat_base64_frames = base64_frames[cat_name]
+         cont_base64_frames = contoured_frames[cat_name]
+
+         for i in range(len(cat_base64_frames)):
+             frame_name = frame_indx[i]
+             cont_base64_image = cont_base64_frames[i]
+             base64_image = cat_base64_frames[i]
+             should_filter = False
+             frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
+
+             if frame_cat_cnts >= 2:
+                 should_filter = True
+             else:
+                 print(f"Skipping {cat_name}: There is a single object or none.", end='\n\n')
+
+             if is_movable and should_filter:
+                 # Step 1: filtering
+                 print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+                 caption_filter_text = f"""
+                 You are a visual assistant analyzing a single frame from a video.
+                 In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
+
+                 Are {cat_name}s in the image performing all different and recognizable actions or postures?
+                 Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
+                 facial expressions, and any notable interactions with objects or other {cat_name}s or people.
+
+                 Only focus on obvious, prominent actions that can be reliably identified from this single frame.
+
+                 - Respond with "YES" if:
+                 1) Most of the {cat_name}s exhibit clearly different, unique actions or poses.
+                 (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
+                 2) You can see clearly visible differences in action and posture that an observer can identify at a glance.
+                 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
+
+                 - Respond with "NONE" if:
+                 1) The actions or poses are not clearly differentiable or are too similar.
+                 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
+                 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion.
+
+                 Answer strictly with either "YES" or "NONE".
+                 """
+
+                 response1 = captioner.chat.completions.create(
+                     model=model,
+                     messages=[
+                         {
+                             "role": "user",
+                             "content": [
+                                 {
+                                     "type": "text",
+                                     "text": caption_filter_text,
+                                 },
+                                 {
+                                     "type": "image_url",
+                                     "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                 }
+                             ],
+                         }
+                     ],
+                 )
+                 response_content = response1.choices[0].message.content
+                 should_caption = "yes" in response_content.lower()
+                 print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
+
+             else:
+                 should_caption = False
+
274
+ #2단계: dense caption 만들기
275
+ dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
276
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
277
+ I want to use your expressions to create a action-centric referring expression dataset.
278
+ Therefore, your expressions for these {cat_name}s should describe unique action of each object.
279
+
280
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
281
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
282
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
283
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
284
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
285
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
286
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
287
+ 8. Include interactions with objects or other entities when they are prominent and observable.
288
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
289
+ 10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
290
+ 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
291
+ 12. Do not mention object IDs.
292
+ 13. Use '{cat_name}' as the noun for the referring expressions.
293
+
294
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
295
+ Output referring expressions for each object id.
296
+ """
297
+
298
+ dense_caption_prompt = f"""
299
+ You are a visual assistant analyzing a single frame of a video.
300
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
301
+
302
+ I want to use your expressions to create an **action-centric referring expression** dataset.
303
+ Please describe each {cat_name} using **clearly observable** and **specific** actions.
304
+
305
+ ---
306
+ ## Guidelines:
307
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
308
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
309
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
310
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
311
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
312
+ 6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions.
313
+ 7. Base your description on these action definitions:
314
+ - Avoid using term 'minimal' or 'slightly'.
315
+ - General body movement, body position, or a prominent pattern (e.g. "lifting head up", "facing towards", "showing its back").
316
+ - Details such as motion and intention, or facial expression combined with object manipulation.
317
+ - Movements with objects or other entities when they are prominent and observable; the expression should be specific.
318
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
319
+ ---
320
+
321
+ ## Output Format:
322
+ - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and use the following format:
323
+ object id. an action-oriented description using {cat_name} as the subject noun
324
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
325
+ - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as another object/animal).
326
+
327
+ ### Example
328
+ If the frame has 2 labeled bears, your output should look like:
329
+ 1. the bear reaching his right arm while leaning forward to capture the prey
330
+ 2. a bear standing upright facing right, touching the bike aside
331
+
332
+ ---
333
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
334
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
335
+ **Do not include markdown** in the output.
336
+ Keep in mind that you should not group the objects (e.g., "2-5. people: xxx"); be sure to describe each object separately (one by one).
337
+ For each labeled {cat_name}, output referring expressions for each object id.
338
+ """
339
+ MAX_RETRIES = 2
340
+ retry_count = 0
341
+
342
+ if should_caption:
343
+ while retry_count < MAX_RETRIES:
344
+
345
+ response2 = captioner.chat.completions.create(
346
+ model=model,
347
+ messages=[
348
+ {
349
+ "role": "user",
350
+ "content": [
351
+ {
352
+ "type": "text",
353
+ "text": dense_caption_prompt,
354
+ },
355
+ {
356
+ "type": "image_url",
357
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
358
+ },
359
+ ],
360
+ }
361
+ ],
362
+ )
363
+
364
+ # caption = response2.choices[0].message.content
365
+ #print(f"{image_path} - {frame_name}: {caption}")
366
+
367
+ caption = response2.choices[0].message.content.strip()
368
+ caption_lower = caption.lower().lstrip()
369
+
370
+ if caption_lower.startswith("1.") and not any(
371
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
372
+ ):
373
+ break
374
+
375
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
376
+ retry_count += 1
377
+ time.sleep(2)
378
+
379
+ if retry_count == MAX_RETRIES:
380
+ caption = None
381
+ print("Max retries reached. Caption generation failed.")
382
+
383
+ else:
384
+ caption = None
385
+
386
+ image_captions[frame_name] = caption
387
+ all_captions[cat_name] = image_captions
388
+
389
+ # final : also prepare valid object ids
390
+ valid_obj_ids = dict()
391
+
392
+ for cat in cat_names:
393
+ if cat in ytvos_category_valid_list:
394
+ obj_id_cat = vid_meta['obj_id_cat']
395
+ valid_cat_ids = []
396
+ for obj_id in list(obj_id_cat.keys()):
397
+ if obj_id_cat[obj_id] == cat:
398
+ valid_cat_ids.append(obj_id)
399
+ valid_obj_ids[cat] = valid_cat_ids
400
+
401
+ return vid_id, all_captions, valid_obj_ids
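+
+ # --- Editorial sketch, not part of the original script ---
+ # The prompts above request exactly one "ID. description" line per object.
+ # A hypothetical helper like parse_numbered_captions could turn a raw caption
+ # string into {object_id: expression}; it assumes the model followed the
+ # numbered format and that `re` (the regex module, as in the sibling
+ # snapshots' imports) is available.
+ def parse_numbered_captions(caption):
+     parsed = {}
+     if caption is None:
+         return parsed
+     for line in caption.splitlines():
+         m = re.match(r'\s*(\d+)\.\s*(.+)', line)  # e.g. "1. the bear is ..."
+         if m:
+             parsed[m.group(1)] = m.group(2).strip()
+     return parsed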
402
+
403
+
404
+
405
+ if __name__ == '__main__':
406
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
407
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
408
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
409
+
410
+ args = parser.parse_args()
411
+
412
+ #================== Load data ===================
413
+ # Full dataset
414
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
415
+
416
+ # Full dataset metadata
417
+ metas = train_dataset.metas
418
+
419
+ # 8 color candidates (RGB format)
420
+ colors = [
421
+ (255, 0, 0), # Red
422
+ (0, 255, 0), # Green
423
+ (0, 0, 255), # Blue
424
+ (255, 255, 0), # Yellow
425
+ (255, 0, 255), # Magenta
426
+ (0, 255, 255), # Cyan
427
+ (128, 0, 128), # Purple
428
+ (255, 165, 0) # Orange
429
+ ]
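+ # (Editorial note) number_objects_and_encode indexes this palette as colors[j],
+ # so a frame with more than 8 annotated objects would raise an IndexError;
+ # colors[j % len(colors)] is the safer pattern if that can happen.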
430
+
431
+ ytvos_category_valid_list = [
432
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
433
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
434
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
435
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
436
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
437
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
438
+ ]
439
+
440
+ #================== Run GPT ===================
441
+ os.environ.setdefault('OPENAI_API_KEY', '<YOUR_OPENAI_API_KEY>') # placeholder; read the real key from the environment instead of committing it
442
+
443
+ result_captions = {}
444
+ result_valid_obj_ids = {}
445
+
446
+ for i in range(len(metas)): # iterate over every video (was hard-coded as range(370))
447
+ vid_id, all_captions, valid_obj_ids = getCaption(i)
448
+
449
+ if vid_id not in result_captions:
450
+ result_captions[vid_id] = all_captions
451
+ if vid_id not in result_valid_obj_ids:
452
+ result_valid_obj_ids[vid_id] = valid_obj_ids
453
+
454
+ print("Finished!", flush=True)
455
+
456
+ with open(args.save_caption_path, "w") as file:
457
+ json.dump(result_captions, file, indent=4)
458
+
459
+ with open(args.save_valid_obj_ids_path, "w") as file:
460
+ json.dump(result_valid_obj_ids, file, indent=4)
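+
+ # Usage sketch (editorial): inspecting the saved outputs afterwards.
+ # Paths assume the default argument values above.
+ # with open("mbench/numbered_captions.json") as f:
+ #     captions = json.load(f)
+ # vid, per_cat = next(iter(captions.items()))
+ # print(vid, list(per_cat.keys()))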
.history/mbench/gpt_ref-ytvos_numbered_cy_20250202183102.py ADDED
@@ -0,0 +1,460 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+
9
+ from mbench.ytvos_ref import build as build_ytvos_ref
10
+ import argparse
11
+ import opts
12
+
13
+ import sys
14
+ from pathlib import Path
15
+ import os
16
+ from os import path as osp
17
+ import skimage
18
+ from io import BytesIO
19
+
20
+ import numpy as np
21
+ import pandas as pd
22
+ import regex as re
23
+ import json
24
+
25
+ import cv2
26
+ from PIL import Image, ImageDraw
27
+ import torch
28
+ from torchvision.transforms import functional as F
29
+
30
+ from skimage import measure # (pip install scikit-image)
31
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
32
+
33
+ import matplotlib.pyplot as plt
34
+ import matplotlib.patches as patches
35
+ from matplotlib.collections import PatchCollection
36
+ from matplotlib.patches import Rectangle
37
+ import textwrap
38
+
39
+
40
+ import ipywidgets as widgets
41
+ from IPython.display import display, clear_output
42
+
43
+ from openai import OpenAI
44
+ import base64
45
+ import json
46
+
47
+ def number_objects_and_encode(idx, color_mask=False):
48
+ encoded_frames = {}
49
+ contoured_frames = {} # New dictionary for original images
50
+ vid_cat_cnts = {}
51
+
52
+ vid_meta = metas[idx]
53
+ vid_data = train_dataset[idx]
54
+ vid_id = vid_meta['video']
55
+ frame_indx = vid_meta['sample_indx']
56
+ cat_names = set(vid_meta['obj_id_cat'].values())
57
+ imgs = vid_data[0]
58
+
59
+ for cat in cat_names:
60
+ cat_frames = []
61
+ contour_frames = []
62
+ frame_cat_cnts = {}
63
+
64
+ for i in range(imgs.size(0)):
65
+ frame_name = frame_indx[i]
66
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
67
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
68
+
69
+ frame_data = vid_data[2][frame_name]
70
+ obj_ids = list(frame_data.keys())
71
+
72
+ cat_cnt = 0
73
+
74
+ for j in range(len(obj_ids)):
75
+ obj_id = obj_ids[j]
76
+ obj_data = frame_data[obj_id]
77
+ obj_bbox = obj_data['bbox']
78
+ obj_valid = obj_data['valid']
79
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
80
+ obj_cat = obj_data['category_name']
81
+
82
+ if obj_cat == cat and obj_valid:
83
+ cat_cnt += 1
84
+
85
+ if color_mask == False:
86
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
87
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
88
+ for contour in contours: # avoid shadowing the outer frame index i (the enumerate index was unused)
89
+ # Compute the contour centroid
90
+ moments = cv2.moments(contour)
91
+ if moments["m00"] != 0: # 중심 계산 가능 여부 확인
92
+ cx = int(moments["m10"] / moments["m00"])
93
+ cy = int(moments["m01"] / moments["m00"])
94
+ else:
95
+ cx, cy = contour[0][0] # fall back to the first contour point when the centroid is undefined
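+ # (Editorial note) cv2.moments returns spatial moments; for a closed contour
+ # the centroid is (m10/m00, m01/m00), the area-weighted mean position, which
+ # is why m00 (the area) must be non-zero above.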
96
+
97
+ # Black background box behind the ID label
98
+ font = cv2.FONT_HERSHEY_SIMPLEX
99
+ text = obj_id
100
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
101
+ text_w, text_h = text_size
102
+
103
+ # Draw the label background (black)
104
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
105
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
106
+
107
+ # Draw the label text (white)
108
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
109
+ font, 1, (255, 255, 255), 2)
110
+
111
+ else:
112
+ alpha = 0.08
113
+
114
+ colored_obj_mask = np.zeros_like(frame)
115
+ colored_obj_mask[obj_mask == 1] = colors[j]
116
+ frame[obj_mask == 1] = (
117
+ (1 - alpha) * frame[obj_mask == 1]
118
+ + alpha * colored_obj_mask[obj_mask == 1]
119
+ )
120
+
121
+
122
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
123
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
124
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
125
+
126
+
127
+
128
+ if len(contours) > 0:
129
+ largest_contour = max(contours, key=cv2.contourArea)
130
+ M = cv2.moments(largest_contour)
131
+ if M["m00"] != 0:
132
+ center_x = int(M["m10"] / M["m00"])
133
+ center_y = int(M["m01"] / M["m00"])
134
+ else:
135
+ center_x, center_y = 0, 0
136
+
137
+ font = cv2.FONT_HERSHEY_SIMPLEX
138
+ text = obj_id
139
+
140
+ font_scale = 0.9
141
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
142
+ text_x = center_x - text_size[0] // 1 # horizontal label offset (note: // 1 is a no-op; // 2 would actually center the text)
143
+ text_y = center_y
144
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
145
+
146
+ # Compute the label background rectangle
147
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
148
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
149
+ rect_end = (text_x + text_size[0] + 5, text_y)
150
+
151
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
152
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
153
+
154
+ # plt.figure(figsize=(12, 8))
155
+ # plt.imshow(frame)
156
+ # plt.title(f"frame {frame_name}")
157
+ # plt.tight_layout()
158
+ # plt.axis('off')
159
+ # plt.show()
160
+
161
+ buffer = BytesIO()
162
+ frame = Image.fromarray(frame)
163
+ frame.save(buffer, format='jpeg')
164
+ buffer.seek(0)
165
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
166
+ frame_cat_cnts[frame_name] = cat_cnt
167
+
168
+ buffer.seek(0) # Reuse buffer instead of creating a new one
169
+ buffer.truncate()
170
+ frame_for_contour = Image.fromarray(frame_for_contour)
171
+ frame_for_contour.save(buffer, format='jpeg')
172
+ buffer.seek(0)
173
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
174
+
175
+ encoded_frames[cat] = cat_frames
176
+ contoured_frames[cat] = contour_frames
177
+ vid_cat_cnts[cat] = frame_cat_cnts
178
+
179
+ return encoded_frames, vid_cat_cnts, contoured_frames
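+
+ # Decoding sketch (editorial, not in the original script): each returned entry
+ # is a base64-encoded JPEG string, so a frame can be recovered for visual
+ # inspection. Image and BytesIO are already imported above.
+ def decode_frame(b64_string):
+     return Image.open(BytesIO(base64.b64decode(b64_string)))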
180
+
181
+
182
+ def getCaption(idx, model='gpt-4o', color_mask=True):
183
+ vid_meta = metas[idx]
184
+ vid_data = train_dataset[idx]
185
+ vid_id = vid_meta['video']
186
+ print(f"vid id: {vid_id}\n")
187
+
188
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
189
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
190
+ all_captions = dict()
191
+
192
+ base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
193
+ #marked = "mask with boundary" if color_mask else "boundary"
194
+
195
+ for cat_name in list(cat_names) :
196
+
197
+ is_movable = False
198
+ if cat_name in ytvos_category_valid_list :
199
+ is_movable = True
200
+
201
+ if not is_movable:
202
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
203
+
204
+
205
+ image_captions = {}
206
+ captioner = OpenAI()
207
+ cat_base64_frames = base64_frames[cat_name]
208
+ cont_base64_frames = contoured_frames[cat_name]
209
+
210
+ for i in range(len(cat_base64_frames)):
211
+ frame_name = frame_indx[i]
212
+ cont_base64_image = cont_base64_frames[i]
213
+ base64_image = cat_base64_frames[i]
214
+ should_filter = False
215
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
216
+
217
+ if frame_cat_cnts >= 2:
218
+ should_filter = True
219
+ else:
220
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
221
+
222
+ if is_movable and should_filter:
223
+ # Step 1: filtering
224
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
225
+ caption_filter_text = f"""
226
+ You are a visual assistant analyzing a single frame from a video.
227
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
228
+
229
+ Are the {cat_name}s in the image all performing different and recognizable actions or postures?
230
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
231
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
232
+
233
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
234
+
235
+ - Respond with "YES" if:
236
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
237
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
238
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
239
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
240
+
241
+ - Respond with "NONE" if:
242
+ 1) The actions or pose are not clearly differentiable or too similar.
243
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
244
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion.
245
+
246
+ Answer strictly with either "YES" or "NONE".
247
+ """
248
+
249
+ response1 = captioner.chat.completions.create(
250
+ model=model,
251
+ messages=[
252
+ {
253
+ "role": "user",
254
+ "content": [
255
+ {
256
+ "type": "text",
257
+ "text": caption_filter_text,
258
+ },
259
+ {
260
+ "type": "image_url",
261
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
262
+ }
263
+ ],
264
+ }
265
+ ],
266
+ )
267
+ response_content = response1.choices[0].message.content
268
+ should_caption = "yes" in response_content.lower()
269
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
270
+
271
+ else:
272
+ should_caption = False
273
+
274
+ # Step 2: generate dense captions
275
+ dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
276
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
277
+ I want to use your expressions to create an action-centric referring expression dataset.
278
+ Therefore, your expressions for these {cat_name}s should describe the unique action of each object.
279
+
280
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
281
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
282
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
283
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
284
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
285
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
286
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
287
+ 8. Include interactions with objects or other entities when they are prominent and observable.
288
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
289
+ 10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
290
+ 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
291
+ 12. Do not mention object IDs.
292
+ 13. Use '{cat_name}' as the noun for the referring expressions.
293
+
294
+ Keep in mind that you should not group the objects (e.g., "2-5. people: xxx"); be sure to describe each object separately (one by one).
295
+ Output referring expressions for each object id.
296
+ """
297
+
298
+ dense_caption_prompt = f"""
299
+ You are a visual assistant analyzing a single frame of a video.
300
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
301
+
302
+ I want to use your expressions to create an **action-centric referring expression** dataset.
303
+ Please describe each {cat_name} using **clearly observable** and **specific** actions.
304
+
305
+ ---
306
+ ## Guidelines:
307
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
308
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
309
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
310
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
311
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
312
+ 6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions.
313
+ 7. Base your description on these action definitions:
314
+ - Avoid using term 'minimal' or 'slightly'.
315
+ - General body movement, body position, or a prominent pattern (e.g. "lifting head up", "facing towards", "showing its back").
316
+ - Details such as motion and intention, or facial expression combined with object manipulation.
317
+ - Movements with objects or other entities when they are prominent and observable; the expression should be specific.
318
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
319
+ ---
320
+
321
+ ## Output Format:
322
+ - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and use the following format:
323
+ object id. an action-oriented description using {cat_name} as the subject noun
324
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
325
+ - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as another object/animal).
326
+
327
+ ### Example
328
+ If the frame has 2 labeled bears, your output should look like:
329
+ 1. the bear reaching his right arm while leaning forward to capture the prey
330
+ 2. a bear standing upright facing right, touching the bike aside
331
+
332
+ ---
333
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
334
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
335
+ **Do not include markdown** in the output.
336
+ Keep in mind that you should not group the objects (e.g., "2-5. people: xxx"); be sure to describe each object separately (one by one).
337
+ For each labeled {cat_name}, output referring expressions for each object id.
338
+ """
339
+ MAX_RETRIES = 2
340
+ retry_count = 0
341
+
342
+ if should_caption:
343
+ while retry_count < MAX_RETRIES:
344
+
345
+ response2 = captioner.chat.completions.create(
346
+ model=model,
347
+ messages=[
348
+ {
349
+ "role": "user",
350
+ "content": [
351
+ {
352
+ "type": "text",
353
+ "text": dense_caption_prompt,
354
+ },
355
+ {
356
+ "type": "image_url",
357
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
358
+ },
359
+ ],
360
+ }
361
+ ],
362
+ )
363
+
364
+ # caption = response2.choices[0].message.content
365
+ #print(f"{image_path} - {frame_name}: {caption}")
366
+
367
+ caption = response2.choices[0].message.content.strip()
368
+ caption_lower = caption.lower().lstrip()
369
+
370
+ if caption_lower.startswith("1.") and not any(
371
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
372
+ ):
373
+ break
374
+
375
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
376
+ retry_count += 1
377
+ time.sleep(2)
378
+
379
+ if retry_count == MAX_RETRIES:
380
+ caption = None
381
+ print("Max retries reached. Caption generation failed.")
382
+
383
+ else:
384
+ caption = None
385
+
386
+ image_captions[frame_name] = caption
387
+ all_captions[cat_name] = image_captions
388
+
389
+ # final : also prepare valid object ids
390
+ valid_obj_ids = dict()
391
+
392
+ for cat in cat_names:
393
+ if cat in ytvos_category_valid_list:
394
+ obj_id_cat = vid_meta['obj_id_cat']
395
+ valid_cat_ids = []
396
+ for obj_id in list(obj_id_cat.keys()):
397
+ if obj_id_cat[obj_id] == cat:
398
+ valid_cat_ids.append(obj_id)
399
+ valid_obj_ids[cat] = valid_cat_ids
400
+
401
+ return vid_id, all_captions, valid_obj_ids
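+
+ # Validation sketch (editorial): the retry loop above accepts a caption only if
+ # it starts with "1." and contains no refusal phrase. A hypothetical predicate
+ # capturing the same rule:
+ def is_valid_caption(caption):
+     lowered = caption.lower().lstrip()
+     refusals = ["i'm sorry", "please", "can't help"]
+     return lowered.startswith("1.") and not any(p in lowered for p in refusals)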
402
+
403
+
404
+
405
+ if __name__ == '__main__':
406
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
407
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
408
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
409
+
410
+ args = parser.parse_args()
411
+
412
+ #================== Load data ===================
413
+ # Full dataset
414
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
415
+
416
+ # Full dataset metadata
417
+ metas = train_dataset.metas
418
+
419
+ # 8 color candidates (RGB format)
420
+ colors = [
421
+ (255, 0, 0), # Red
422
+ (0, 255, 0), # Green
423
+ (0, 0, 255), # Blue
424
+ (255, 255, 0), # Yellow
425
+ (255, 0, 255), # Magenta
426
+ (0, 255, 255), # Cyan
427
+ (128, 0, 128), # Purple
428
+ (255, 165, 0) # Orange
429
+ ]
430
+
431
+ ytvos_category_valid_list = [
432
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
433
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
434
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
435
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
436
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
437
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
438
+ ]
439
+
440
+ #================== Run GPT ===================
441
+ os.environ.setdefault('OPENAI_API_KEY', '<YOUR_OPENAI_API_KEY>') # placeholder; read the real key from the environment instead of committing it
442
+
443
+ result_captions = {}
444
+ result_valid_obj_ids = {}
445
+
446
+ for i in range(len(metas)): # iterate over every video (was hard-coded as range(370))
447
+ vid_id, all_captions, valid_obj_ids = getCaption(i, color_mask=False)
448
+
449
+ if vid_id not in result_captions:
450
+ result_captions[vid_id] = all_captions
451
+ if vid_id not in result_valid_obj_ids:
452
+ result_valid_obj_ids[vid_id] = valid_obj_ids
453
+
454
+ print("Finished!", flush=True)
455
+
456
+ with open(args.save_caption_path, "w") as file:
457
+ json.dump(result_captions, file, indent=4)
458
+
459
+ with open(args.save_valid_obj_ids_path, "w") as file:
460
+ json.dump(result_valid_obj_ids, file, indent=4)
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207172804.py ADDED
@@ -0,0 +1,656 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+ import random
9
+
10
+ from mbench.ytvos_ref import build as build_ytvos_ref
11
+ import argparse
12
+ import opts
13
+
14
+ import sys
15
+ from pathlib import Path
16
+ import os
17
+ from os import path as osp
18
+ import skimage
19
+ from io import BytesIO
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import regex as re
24
+ import json
25
+
26
+ import cv2
27
+ from PIL import Image, ImageDraw
28
+ import torch
29
+ from torchvision.transforms import functional as F
30
+
31
+ from skimage import measure # (pip install scikit-image)
32
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
33
+
34
+ import matplotlib.pyplot as plt
35
+ import matplotlib.patches as patches
36
+ from matplotlib.collections import PatchCollection
37
+ from matplotlib.patches import Rectangle
38
+ import textwrap
39
+
40
+
41
+ import ipywidgets as widgets
42
+ from IPython.display import display, clear_output
43
+
44
+ from openai import OpenAI
45
+ import base64
46
+ import json
47
+ import requests
48
+ from openai import APIConnectionError, OpenAIError # openai>=1.0 exposes these at the top level; the openai.error module was removed
49
+
50
+ def number_objects_and_encode_old(idx, color_mask=False):
51
+ encoded_frames = {}
52
+ contoured_frames = {} # New dictionary for original images
53
+ vid_cat_cnts = {}
54
+
55
+ vid_meta = metas[idx]
56
+ vid_data = train_dataset[idx]
57
+ vid_id = vid_meta['video']
58
+ frame_indx = vid_meta['sample_indx']
59
+ cat_names = set(vid_meta['obj_id_cat'].values())
60
+ imgs = vid_data[0]
61
+
62
+ for cat in cat_names:
63
+ cat_frames = []
64
+ contour_frames = []
65
+ frame_cat_cnts = {}
66
+
67
+ for i in range(imgs.size(0)):
68
+ frame_name = frame_indx[i]
69
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
70
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
71
+
72
+ frame_data = vid_data[2][frame_name]
73
+ obj_ids = list(frame_data.keys())
74
+
75
+ cat_cnt = 0
76
+
77
+ for j in range(len(obj_ids)):
78
+ obj_id = obj_ids[j]
79
+ obj_data = frame_data[obj_id]
80
+ obj_bbox = obj_data['bbox']
81
+ obj_valid = obj_data['valid']
82
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
83
+ obj_cat = obj_data['category_name']
84
+
85
+ if obj_cat == cat and obj_valid:
86
+ cat_cnt += 1
87
+
88
+ if color_mask == False:
89
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
90
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
91
+ for contour in contours: # avoid shadowing the outer frame index i (the enumerate index was unused)
92
+ moments = cv2.moments(contour)
93
+ if moments["m00"] != 0:
94
+ cx = int(moments["m10"] / moments["m00"])
95
+ cy = int(moments["m01"] / moments["m00"])
96
+ else:
97
+ cx, cy = contour[0][0]
98
+
99
+ font = cv2.FONT_HERSHEY_SIMPLEX
100
+ text = obj_id
101
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
102
+ text_w, text_h = text_size
103
+
104
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
105
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
106
+
107
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
108
+ font, 1, (255, 255, 255), 2)
109
+
110
+ else:
111
+ alpha = 0.08
112
+
113
+ colored_obj_mask = np.zeros_like(frame)
114
+ colored_obj_mask[obj_mask == 1] = colors[j]
115
+ frame[obj_mask == 1] = (
116
+ (1 - alpha) * frame[obj_mask == 1]
117
+ + alpha * colored_obj_mask[obj_mask == 1]
118
+ )
119
+
120
+
121
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
122
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
123
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
124
+
125
+ if len(contours) > 0:
126
+ largest_contour = max(contours, key=cv2.contourArea)
127
+ M = cv2.moments(largest_contour)
128
+ if M["m00"] != 0:
129
+ center_x = int(M["m10"] / M["m00"])
130
+ center_y = int(M["m01"] / M["m00"])
131
+ else:
132
+ center_x, center_y = 0, 0
133
+
134
+ font = cv2.FONT_HERSHEY_SIMPLEX
135
+ text = obj_id
136
+
137
+ font_scale = 0.9
138
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
139
+ text_x = center_x - text_size[0] // 1
140
+ text_y = center_y
141
+
142
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
143
+ rect_end = (text_x + text_size[0] + 5, text_y)
144
+
145
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
146
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
147
+
148
+ # plt.figure(figsize=(12, 8))
149
+ # plt.imshow(frame)
150
+ # plt.title(f"frame {frame_name}")
151
+ # plt.tight_layout()
152
+ # plt.axis('off')
153
+ # plt.show()
154
+
155
+ buffer = BytesIO()
156
+ frame = Image.fromarray(frame)
157
+ frame.save(buffer, format='jpeg')
158
+ buffer.seek(0)
159
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
160
+ frame_cat_cnts[frame_name] = cat_cnt
161
+
162
+ buffer.seek(0) # Reuse buffer instead of creating a new one
163
+ buffer.truncate()
164
+ frame_for_contour = Image.fromarray(frame_for_contour)
165
+ frame_for_contour.save(buffer, format='jpeg')
166
+ buffer.seek(0)
167
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
168
+
169
+ encoded_frames[cat] = cat_frames
170
+ contoured_frames[cat] = contour_frames
171
+ vid_cat_cnts[cat] = frame_cat_cnts
172
+
173
+ return encoded_frames, contoured_frames, vid_cat_cnts
174
+
175
+
176
+ def number_objects_and_encode(idx, color_mask=False):
177
+ encoded_frames = {}
178
+ contoured_frames = {} # New dictionary for original images
179
+ vid_cat_cnts = {}
180
+
181
+ vid_meta = metas[idx]
182
+ vid_data = train_dataset[idx]
183
+ vid_id = vid_meta['video']
184
+ frame_indx = vid_meta['sample_indx']
185
+ cat_names = set(vid_meta['obj_id_cat'].values())
186
+ imgs = vid_data[0]
187
+
188
+ for cat in cat_names:
189
+ cat_frames = []
190
+ contour_frames = []
191
+ frame_cat_cnts = {}
192
+
193
+ for i in range(imgs.size(0)):
194
+ frame_name = frame_indx[i]
195
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
196
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
197
+
198
+ frame_data = vid_data[2][frame_name]
199
+ obj_ids = list(frame_data.keys())
200
+
201
+ cat_cnt = 0
202
+
203
+ for j in range(len(obj_ids)):
204
+ obj_id = obj_ids[j]
205
+ obj_data = frame_data[obj_id]
206
+ obj_bbox = obj_data['bbox']
207
+ obj_valid = obj_data['valid']
208
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
209
+ obj_cat = obj_data['category_name']
210
+
211
+ if obj_cat == cat and obj_valid:
212
+ cat_cnt += 1
213
+
214
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
215
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
216
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
217
+
218
+ if len(contours) > 0:
219
+ largest_contour = max(contours, key=cv2.contourArea)
220
+ M = cv2.moments(largest_contour)
221
+ if M["m00"] != 0:
222
+ center_x = int(M["m10"] / M["m00"])
223
+ center_y = int(M["m01"] / M["m00"])
224
+ else:
225
+ center_x, center_y = 0, 0
226
+
227
+ font = cv2.FONT_HERSHEY_SIMPLEX
228
+ text = obj_id
229
+ font_scale = 1.2
230
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
231
+ text_x = center_x - text_size[0] // 1
232
+ text_y = center_y
233
+
234
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
235
+ rect_end = (text_x + text_size[0] + 5, text_y + 3)
236
+
237
+ contour_thickness = 1
238
+ rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
239
+ rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
240
+
241
+ cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
242
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
243
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
244
+
245
+
246
+ if color_mask:
247
+ alpha = 0.08
248
+ colored_obj_mask = np.zeros_like(frame)
249
+ colored_obj_mask[obj_mask == 1] = colors[j]
250
+ frame[obj_mask == 1] = (
251
+ (1 - alpha) * frame[obj_mask == 1]
252
+ + alpha * colored_obj_mask[obj_mask == 1]
253
+ )
254
+
255
+ # plt.figure(figsize=(12, 8))
256
+ # plt.imshow(frame)
257
+ # plt.title(f"frame {frame_name}")
258
+ # plt.tight_layout()
259
+ # plt.axis('off')
260
+ # plt.show()
261
+
262
+ buffer = BytesIO()
263
+ frame = Image.fromarray(frame)
264
+ frame.save(buffer, format='jpeg')
265
+ buffer.seek(0)
266
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
267
+ frame_cat_cnts[frame_name] = cat_cnt
268
+
269
+ buffer.seek(0) # Reuse buffer instead of creating a new one
270
+ buffer.truncate()
271
+ frame_for_contour = Image.fromarray(frame_for_contour)
272
+ frame_for_contour.save(buffer, format='jpeg')
273
+ buffer.seek(0)
274
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
275
+
276
+ encoded_frames[cat] = cat_frames
277
+ contoured_frames[cat] = contour_frames
278
+ vid_cat_cnts[cat] = frame_cat_cnts
279
+
280
+ return encoded_frames, contoured_frames, vid_cat_cnts
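+
+ # (Editorial note) Unlike number_objects_and_encode_old above, this version
+ # always draws contours plus a color-keyed ID box, and applies the translucent
+ # mask overlay on top only when color_mask is True, so the label can be
+ # slightly dimmed by the overlay; the (encoded, contoured, counts) return
+ # order is unchanged.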
281
+
282
+
283
+
284
+ def getCaption(idx, model='gpt-4o'):
285
+ vid_meta = metas[idx]
286
+ vid_data = train_dataset[idx]
287
+ vid_id = vid_meta['video']
288
+ print(f"vid id: {vid_id}\n")
289
+
290
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
291
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
292
+ all_captions = dict()
293
+
294
+ # color_mask = random.choice([True, False])
295
+ color_mask = random.choices([False, True], weights=[60, 40])[0]
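+ # (Editorial note) random.choices samples with the given weights, so roughly
+ # 60% of videos get contour-only labels and 40% also get the translucent
+ # color-mask overlay.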
296
+
297
+ base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
298
+ #marked = "mask with boundary" if color_mask else "boundary"
299
+
300
+ for cat_name in list(cat_names) :
301
+
302
+ is_movable = False
303
+ if cat_name in ytvos_category_valid_list :
304
+ is_movable = True
305
+
306
+ if not is_movable:
307
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
308
+
309
+
310
+ image_captions = {}
311
+ captioner = OpenAI()
312
+ cat_base64_frames = base64_frames[cat_name]
313
+ # cont_base64_frames = contoured_frames[cat_name]
314
+
315
+ for i in range(len(cat_base64_frames)):
316
+ frame_name = frame_indx[i]
317
+ # cont_base64_image = cont_base64_frames[i]
318
+ base64_image = cat_base64_frames[i]
319
+ should_filter = False
320
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
321
+
322
+ if frame_cat_cnts >= 2:
323
+ should_filter = True
324
+ else:
325
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
326
+
327
+
328
+ if is_movable and should_filter:
329
+ #1단계: 필터링
330
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
331
+ caption_filter_text = f"""
332
+ You are a visual assistant analyzing a single frame from a video.
333
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
334
+
335
+ Are the {cat_name}s in the image all performing different and recognizable actions or postures?
336
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
337
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
338
+
339
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
340
+
341
+ - Respond with "YES" if:
342
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
343
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
344
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
345
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
346
+
347
+ - Respond with "NONE" if:
348
+ 1) The actions or pose are not clearly differentiable or too similar.
349
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
350
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion.
351
+
352
+ Answer strictly with either "YES" or "NONE".
353
+ """
354
+
355
+ response1 = captioner.chat.completions.create(
356
+ model=model,
357
+ messages=[
358
+ {
359
+ "role": "user",
360
+ "content": [
361
+ {
362
+ "type": "text",
363
+ "text": caption_filter_text,
364
+ },
365
+ {
366
+ "type": "image_url",
367
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
368
+ }
369
+ ],
370
+ }
371
+ ],
372
+ )
373
+ response_content = response1.choices[0].message.content
374
+ should_caption = "yes" in response_content.lower()
375
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
376
+
377
+ else:
378
+ should_caption = False
379
+
380
+ # Step 2: generate dense captions
381
+ dense_caption_prompt_1 = f"""
382
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects is: {cat_name}.
383
+
384
+ Please describe the labeled {cat_name}s in the image in detail, focusing on their actions and interactions.
385
+
386
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
387
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
388
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
389
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
390
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
391
+ 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
392
+ - expressions like 'seems to be', 'appears to be' are BANNED!
393
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
394
+ 8. Include interactions with objects or other entities when they are prominent and observable.
395
+ 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
396
+ 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
397
+ 11. Do not mention object IDs.
398
+ 12. Use '{cat_name}' as the noun for the referring expressions.
399
+
400
+ Note that I want to use your description to create a grounding dataset; therefore, your descriptions for different objects should be unique, i.e., if the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
401
+
402
+ - Your answer should contain details, and use the following format:
403
+ object id. action-oriented description
404
+ (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
405
+ 2. a person bending over and touching his boots to tie the shoelace.)
406
+ - for the action-oriented description, use {cat_name} as the subject noun
407
+
408
+ **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as another object/animal).
409
+ Please pay attention to the categories of these objects and don’t change them.
410
+ Keep in mind that you should not group the objects (e.g., "2-5. people: xxx"); be sure to describe each object separately (one by one).
411
+ Output referring expressions for each object id. Please start your answer:"""
412
+
413
+
414
+ dense_caption_prompt_2 = f"""
415
+ You are an advanced visual language model analyzing a video frame.
416
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
417
+
418
+ Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
419
+ Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
420
+
421
+ ---
422
+ ## Key Guidelines:
423
+ 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
424
+ - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
425
+ - Avoid: "moving slightly to the side" (**(X) Too vague**)
426
+
427
+ 2. **Do not describe appearance, color, or position**—focus purely on the action.
428
+ - (X) "A large brown bear standing on the left"
429
+ - (O) "The bear is lifting its front paws and swiping forward."
430
+
431
+ 3. **Use dynamic, action-specific verbs** rather than passive descriptions.
432
+ - (O) "The giraffe is tilting its head and sniffing the ground."
433
+ - (X) "The giraffe is near a tree and looking around."
434
+
435
+ 4. **Avoid assumptions, emotions, or speculative phrasing.**
436
+ - (X) "The person seems excited" / "The person might be preparing to jump."
437
+ - (O) "The person is pushing its front legs against the rock and leaping forward."
438
+
439
+ 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
440
+ - expressions like 'seems to be', 'appears to be' are BANNED!
441
+ 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
442
+
443
+ 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
444
+ - **Each object should have a unique, descriptive action.**
445
+ - (X) "Two dogs are running."
446
+ - (O) "1. One dog is chasing another, its legs stretched mid-air.
447
+ 2. The other dog is looking back while speeding up."
448
+
449
+ ---
450
+ ## Output Format:
451
+ - Each labeled **{cat_name}** should have exactly **one line of description**.
452
+ - Format: `ID. {cat_name} + action-based description`
453
+ - (O) Example:
454
+ ```
455
+ 1. The person is leaning forward while opening a bag with both hands.
456
+ 2. The person is holding onto a rope and pulling themselves up.
457
+ ```
458
+ - **Ensure that each object is described individually.**
459
+ - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
460
+
461
+ ---
462
+ ## Additional Instructions:
463
+ - **Do NOT** use expressions like "it appears that..." or "it seems like...".
464
+ - **Do NOT** mention object IDs in the description (only use the provided format).
465
+ - **DO NOT** include markdown formatting (no bullet points, no asterisks).
466
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
467
+
468
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
469
+ """
470
+
471
+
472
+ dense_caption_prompt = f"""
473
+ You are a visual assistant analyzing a single frame of a video.
474
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
475
+
476
+ I am building an **action-centric referring expression** dataset.
477
+ Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
478
+
479
+ ---
480
+ ## Guidelines:
481
+ 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
482
+ 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
483
+ 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
484
+ 4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
485
+ 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
486
+ 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
487
+ 7. Base your descriptions on these principles:
488
+ - **Avoid words like 'minimal' or 'slightly'.**
489
+ - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
490
+ - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
491
+ - **Specify actions with other objects or entities** only when they are clear and observable.
492
+ - (O) "pushing another person"
493
+ - (X) "interacting with another object"
494
+
495
+ ---
496
+ ## Output Format:
497
+ - Each labeled **{cat_name}** must have **exactly one line**.
498
+ - Format: `ID. {cat_name} + action-based description`
499
+ - (O) Example:
500
+ ```
501
+ 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
502
+ 2. The person is pulling a baby carriage while smiling.
503
+ ```
504
+ - **Ensure each object is described individually.**
505
+ - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
506
+
507
+ ---
508
+ ## Example:
509
+ If the frame has two labeled **bears**, your output should be:
510
+ ```
511
+ 1. The bear is reaching out its right paw while leaning forward to catch prey.
512
+ 2. A bear is standing upright, facing right, and touching the bike beside it.
513
+ ```
514
+
515
+ ---
516
+ ## Additional Instructions:
517
+ - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
518
+ - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
519
+ - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
520
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
521
+
522
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
523
+
524
+
525
+ MAX_RETRIES = 3
526
+ retry_count = 0
527
+
528
+ if should_caption:
529
+ while retry_count < MAX_RETRIES:
530
+ selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
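+ # (Editorial note) only dense_caption_prompt and dense_caption_prompt_2 are in
+ # this pool; dense_caption_prompt_1 above is defined but never selected.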
531
+
532
+ response2 = captioner.chat.completions.create(
533
+ model=model,
534
+ messages=[
535
+ {
536
+ "role": "user",
537
+ "content": [
538
+ {
539
+ "type": "text",
540
+ "text": selected_prompt,
541
+ },
542
+ {
543
+ "type": "image_url",
544
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
545
+ },
546
+ ],
547
+ }
548
+ ],
549
+ )
550
+
551
+ # caption = response2.choices[0].message.content
552
+ #print(f"{image_path} - {frame_name}: {caption}")
553
+
554
+ caption = response2.choices[0].message.content.strip()
555
+ caption_lower = caption.lower().lstrip()
556
+
557
+ if caption_lower.startswith("1.") and not any(
558
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
559
+ ):
560
+ break
561
+
562
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
563
+ retry_count += 1
564
+ time.sleep(2)
565
+
566
+ if retry_count == MAX_RETRIES:
567
+ caption = None
568
+ print("Max retries reached. Caption generation failed.")
569
+
570
+ else:
571
+ caption = None
572
+
573
+ image_captions[frame_name] = caption
574
+ all_captions[cat_name] = image_captions
575
+
576
+ # final : also prepare valid object ids
577
+ valid_obj_ids = dict()
578
+
579
+ for cat in cat_names:
580
+ if cat in ytvos_category_valid_list:
581
+ obj_id_cat = vid_meta['obj_id_cat']
582
+ valid_cat_ids = []
583
+ for obj_id in list(obj_id_cat.keys()):
584
+ if obj_id_cat[obj_id] == cat:
585
+ valid_cat_ids.append(obj_id)
586
+ valid_obj_ids[cat] = valid_cat_ids
587
+
588
+ return vid_id, all_captions, valid_obj_ids
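+
+ # Equivalent sketch (editorial): the valid_obj_ids loop above is a plain
+ # nested filter and could be written as a dict comprehension:
+ # valid_obj_ids = {cat: [o for o, c in vid_meta['obj_id_cat'].items() if c == cat]
+ #                  for cat in cat_names if cat in ytvos_category_valid_list}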
589
+
590
+
591
+ if __name__ == '__main__':
592
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
593
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
594
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
595
+
596
+ args = parser.parse_args()
597
+
598
+ #================== Load data ===================
599
+ # Full dataset
600
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
601
+
602
+ # Full dataset metadata
603
+ metas = train_dataset.metas
604
+
605
+ # 8 color candidates (RGB format)
606
+ colors = [
607
+ (255, 0, 0), # Red
608
+ (0, 255, 0), # Green
609
+ (0, 0, 255), # Blue
610
+ (255, 255, 0), # Yellow
611
+ (255, 0, 255), # Magenta
612
+ (0, 255, 255), # Cyan
613
+ (128, 0, 128), # Purple
614
+ (255, 165, 0) # Orange
615
+ ]
616
+
617
+ ytvos_category_valid_list = [
618
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
619
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
620
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
621
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
622
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
623
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
624
+ ]
625
+
626
+ #================== Run GPT ===================
627
+ os.environ.setdefault('OPENAI_API_KEY', '<YOUR_OPENAI_API_KEY>') # placeholder; read the real key from the environment instead of committing it
628
+
629
+ result_captions = {}
630
+ result_valid_obj_ids = {}
631
+
632
+ for i in range(len(metas)):
633
+ try:
634
+ vid_id, all_captions, valid_obj_ids = getCaption(i)
635
+
636
+ if vid_id not in result_captions:
637
+ result_captions[vid_id] = all_captions
638
+ if vid_id not in result_valid_obj_ids:
639
+ result_valid_obj_ids[vid_id] = valid_obj_ids
640
+
641
+ except (requests.exceptions.ConnectionError, APIConnectionError) as e:
642
+ print(f"created caption until {i-1}", flush=True)
643
+
644
+ with open(args.save_caption_path, "w") as file:
645
+ json.dump(result_captions, file, indent=4)
646
+
647
+ with open(args.save_valid_obj_ids_path, "w") as file:
648
+ json.dump(result_valid_obj_ids, file, indent=4)
649
+
650
+ print("Finished!", flush=True)
651
+
652
+ with open(args.save_caption_path, "w") as file:
653
+ json.dump(result_captions, file, indent=4)
654
+
655
+ with open(args.save_valid_obj_ids_path, "w") as file:
656
+ json.dump(result_valid_obj_ids, file, indent=4)
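+
+ # Resume sketch (editorial): because the except branch above checkpoints
+ # partial results to disk, a rerun could skip already-captioned videos:
+ # done = set(json.load(open(args.save_caption_path)).keys())
+ # for i in range(len(metas)):
+ #     if metas[i]['video'] in done:
+ #         continue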
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207173210.py ADDED
@@ -0,0 +1,656 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+ import random
9
+
10
+ from mbench.ytvos_ref import build as build_ytvos_ref
11
+ import argparse
12
+ import opts
13
+
14
+ import sys
15
+ from pathlib import Path
16
+ import os
17
+ from os import path as osp
18
+ import skimage
19
+ from io import BytesIO
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import regex as re
24
+ import json
25
+
26
+ import cv2
27
+ from PIL import Image, ImageDraw
28
+ import torch
29
+ from torchvision.transforms import functional as F
30
+
31
+ from skimage import measure # (pip install scikit-image)
32
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
33
+
34
+ import matplotlib.pyplot as plt
35
+ import matplotlib.patches as patches
36
+ from matplotlib.collections import PatchCollection
37
+ from matplotlib.patches import Rectangle
38
+ import textwrap
39
+
40
+
41
+ import ipywidgets as widgets
42
+ from IPython.display import display, clear_output
43
+
44
+ from openai import OpenAI
45
+ import base64
46
+ import json
47
+ import requests
48
+ from openai import APIConnectionError, OpenAIError # openai>=1.0 exposes these at the top level; the openai.error module was removed
49
+
50
+ def number_objects_and_encode_old(idx, color_mask=False):
51
+ encoded_frames = {}
52
+ contoured_frames = {} # New dictionary for original images
53
+ vid_cat_cnts = {}
54
+
55
+ vid_meta = metas[idx]
56
+ vid_data = train_dataset[idx]
57
+ vid_id = vid_meta['video']
58
+ frame_indx = vid_meta['sample_indx']
59
+ cat_names = set(vid_meta['obj_id_cat'].values())
60
+ imgs = vid_data[0]
61
+
62
+ for cat in cat_names:
63
+ cat_frames = []
64
+ contour_frames = []
65
+ frame_cat_cnts = {}
66
+
67
+ for i in range(imgs.size(0)):
68
+ frame_name = frame_indx[i]
69
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
70
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
71
+
72
+ frame_data = vid_data[2][frame_name]
73
+ obj_ids = list(frame_data.keys())
74
+
75
+ cat_cnt = 0
76
+
77
+ for j in range(len(obj_ids)):
78
+ obj_id = obj_ids[j]
79
+ obj_data = frame_data[obj_id]
80
+ obj_bbox = obj_data['bbox']
81
+ obj_valid = obj_data['valid']
82
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
83
+ obj_cat = obj_data['category_name']
84
+
85
+ if obj_cat == cat and obj_valid:
86
+ cat_cnt += 1
87
+
88
+ if color_mask == False:
89
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
90
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
91
+ for i, contour in enumerate(contours):
92
+ moments = cv2.moments(contour)
93
+ if moments["m00"] != 0:
94
+ cx = int(moments["m10"] / moments["m00"])
95
+ cy = int(moments["m01"] / moments["m00"])
96
+ else:
97
+ cx, cy = contour[0][0]
98
+
99
+ font = cv2.FONT_HERSHEY_SIMPLEX
100
+ text = obj_id
101
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
102
+ text_w, text_h = text_size
103
+
104
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
105
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
106
+
107
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
108
+ font, 1, (255, 255, 255), 2)
109
+
110
+ else:
111
+ alpha = 0.08
112
+
113
+ colored_obj_mask = np.zeros_like(frame)
114
+ colored_obj_mask[obj_mask == 1] = colors[j]
115
+ frame[obj_mask == 1] = (
116
+ (1 - alpha) * frame[obj_mask == 1]
117
+ + alpha * colored_obj_mask[obj_mask == 1]
118
+ )
119
+
120
+
121
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
122
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
123
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
124
+
125
+ if len(contours) > 0:
126
+ largest_contour = max(contours, key=cv2.contourArea)
127
+ M = cv2.moments(largest_contour)
128
+ if M["m00"] != 0:
129
+ center_x = int(M["m10"] / M["m00"])
130
+ center_y = int(M["m01"] / M["m00"])
131
+ else:
132
+ center_x, center_y = 0, 0
133
+
134
+ font = cv2.FONT_HERSHEY_SIMPLEX
135
+ text = obj_id
136
+
137
+ font_scale = 0.9
138
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
139
+ text_x = center_x - text_size[0] // 1
140
+ text_y = center_y
141
+
142
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
143
+ rect_end = (text_x + text_size[0] + 5, text_y)
144
+
145
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
146
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
147
+
148
+ # plt.figure(figsize=(12, 8))
149
+ # plt.imshow(frame)
150
+ # plt.title(f"frame {frame_name}")
151
+ # plt.tight_layout()
152
+ # plt.axis('off')
153
+ # plt.show()
154
+
155
+ buffer = BytesIO()
156
+ frame = Image.fromarray(frame)
157
+ frame.save(buffer, format='jpeg')
158
+ buffer.seek(0)
159
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
160
+ frame_cat_cnts[frame_name] = cat_cnt
161
+
162
+ buffer.seek(0) # Reuse buffer instead of creating a new one
163
+ buffer.truncate()
164
+ frame_for_contour = Image.fromarray(frame_for_contour)
165
+ frame_for_contour.save(buffer, format='jpeg')
166
+ buffer.seek(0)
167
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
168
+
169
+ encoded_frames[cat] = cat_frames
170
+ contoured_frames[cat] = contour_frames
171
+ vid_cat_cnts[cat] = frame_cat_cnts
172
+
173
+ return encoded_frames, contoured_frames, vid_cat_cnts
174
+
175
+
176
+ def number_objects_and_encode(idx, color_mask=False):
177
+ encoded_frames = {}
178
+ contoured_frames = {} # New dictionary for original images
179
+ vid_cat_cnts = {}
180
+
181
+ vid_meta = metas[idx]
182
+ vid_data = train_dataset[idx]
183
+ vid_id = vid_meta['video']
184
+ frame_indx = vid_meta['sample_indx']
185
+ cat_names = set(vid_meta['obj_id_cat'].values())
186
+ imgs = vid_data[0]
187
+
188
+ for cat in cat_names:
189
+ cat_frames = []
190
+ contour_frames = []
191
+ frame_cat_cnts = {}
192
+
193
+ for i in range(imgs.size(0)):
194
+ frame_name = frame_indx[i]
195
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
196
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
197
+
198
+ frame_data = vid_data[2][frame_name]
199
+ obj_ids = list(frame_data.keys())
200
+
201
+ cat_cnt = 0
202
+
203
+ for j in range(len(obj_ids)):
204
+ obj_id = obj_ids[j]
205
+ obj_data = frame_data[obj_id]
206
+ obj_bbox = obj_data['bbox']
207
+ obj_valid = obj_data['valid']
208
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
209
+ obj_cat = obj_data['category_name']
210
+
211
+ if obj_cat == cat and obj_valid:
212
+ cat_cnt += 1
213
+
214
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
215
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
216
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
217
+
218
+ if len(contours) > 0:
219
+ largest_contour = max(contours, key=cv2.contourArea)
220
+ M = cv2.moments(largest_contour)
221
+ if M["m00"] != 0:
222
+ center_x = int(M["m10"] / M["m00"])
223
+ center_y = int(M["m01"] / M["m00"])
224
+ else:
225
+ center_x, center_y = 0, 0
226
+
227
+ font = cv2.FONT_HERSHEY_SIMPLEX
228
+ text = obj_id
229
+ font_scale = 1.2
230
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
231
+ text_x = center_x - text_size[0] // 1
232
+ text_y = center_y
233
+
234
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
235
+ rect_end = (text_x + text_size[0] + 5, text_y + 3)
236
+
237
+ contour_thickness = 1
238
+ rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
239
+ rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
240
+
241
+ cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
242
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
243
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
244
+
245
+
246
+ if color_mask:
247
+ alpha = 0.08
248
+ colored_obj_mask = np.zeros_like(frame)
249
+ colored_obj_mask[obj_mask == 1] = colors[j]
250
+ frame[obj_mask == 1] = (
251
+ (1 - alpha) * frame[obj_mask == 1]
252
+ + alpha * colored_obj_mask[obj_mask == 1]
253
+ )
254
+
255
+ # plt.figure(figsize=(12, 8))
256
+ # plt.imshow(frame)
257
+ # plt.title(f"frame {frame_name}")
258
+ # plt.tight_layout()
259
+ # plt.axis('off')
260
+ # plt.show()
261
+
262
+ buffer = BytesIO()
263
+ frame = Image.fromarray(frame)
264
+ frame.save(buffer, format='jpeg')
265
+ buffer.seek(0)
266
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
267
+ frame_cat_cnts[frame_name] = cat_cnt
268
+
269
+ buffer.seek(0) # Reuse buffer instead of creating a new one
270
+ buffer.truncate()
271
+ frame_for_contour = Image.fromarray(frame_for_contour)
272
+ frame_for_contour.save(buffer, format='jpeg')
273
+ buffer.seek(0)
274
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
275
+
276
+ encoded_frames[cat] = cat_frames
277
+ contoured_frames[cat] = contour_frames
278
+ vid_cat_cnts[cat] = frame_cat_cnts
279
+
280
+ return encoded_frames, contoured_frames, vid_cat_cnts
281
+
282
+
283
+
284
+ def getCaption(idx, model='gpt-4o'):
285
+ vid_meta = metas[idx]
286
+ vid_data = train_dataset[idx]
287
+ vid_id = vid_meta['video']
288
+ print(f"vid id: {vid_id}\n")
289
+
290
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
291
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
292
+ all_captions = dict()
293
+
294
+ # color_mask = random.choice([True, False])
295
+ color_mask = random.choices([False, True], weights=[60, 40])[0]
296
+
297
+ base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
298
+ #marked = "mask with boundary" if color_mask else "boundary"
299
+
300
+ for cat_name in list(cat_names) :
301
+
302
+ is_movable = False
303
+ if cat_name in ytvos_category_valid_list :
304
+ is_movable = True
305
+
306
+ if not is_movable:
307
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
308
+
309
+
310
+ image_captions = {}
311
+ captioner = OpenAI()
312
+ cat_base64_frames = base64_frames[cat_name]
313
+ # cont_base64_frames = contoured_frames[cat_name]
314
+
315
+ for i in range(len(cat_base64_frames)):
316
+ frame_name = frame_indx[i]
317
+ # cont_base64_image = cont_base64_frames[i]
318
+ base64_image = cat_base64_frames[i]
319
+ should_filter = False
320
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
321
+
322
+ if frame_cat_cnts >= 2:
323
+ should_filter = True
324
+ else:
325
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
326
+
327
+
328
+ if is_movable and should_filter:
329
+ #1단계: 필터링
330
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
331
+ caption_filter_text = f"""
332
+ You are a visual assistant analyzing a single frame from a video.
333
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
334
+
335
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
336
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
337
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
338
+
339
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
340
+
341
+ - Respond with "YES" if:
342
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
343
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
344
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
345
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
346
+
347
+ - Respond with "NONE" if:
348
+ 1) The actions or pose are not clearly differentiable or too similar.
349
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
350
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
351
+
352
+ Answer strictly with either "YES" or "NONE".
353
+ """
354
+
355
+ response1 = captioner.chat.completions.create(
356
+ model=model,
357
+ messages=[
358
+ {
359
+ "role": "user",
360
+ "content": [
361
+ {
362
+ "type": "text",
363
+ "text": caption_filter_text,
364
+ },
365
+ {
366
+ "type": "image_url",
367
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
368
+ }
369
+ ],
370
+ }
371
+ ],
372
+ )
373
+ response_content = response1.choices[0].message.content
374
+ should_caption = True if "yes" in response_content.lower() else False
375
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
376
+
377
+ else:
378
+ should_caption = False
379
+
380
+ #2단계: dense caption 만들기
381
+ dense_caption_prompt_1 = f"""
382
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects are : {cat_name}.
383
+
384
+ Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions.
385
+
386
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
387
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
388
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
389
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
390
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
391
+ 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
392
+ - expressions like 'seems to be', 'appears to be' are BANNED!
393
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
394
+ 8. Include interactions with objects or other entities when they are prominent and observable.
395
+ 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
396
+ 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
397
+ 11. Do not mention object IDs.
398
+ 12. Use '{cat_name}' as the noun for the referring expressions.
399
+
400
+ Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
401
+
402
+ - Your answer should contain details, and follow the following format:
403
+ object id. action-oriented description
404
+ (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
405
+ 2. a person bending over and touching his boots to tie the shoelace.)
406
+ - for action-oriented description, use {cat_name} as subject noun
407
+
408
+ **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
409
+ Please pay attention to the categories of these objects and don’t change them.
410
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
411
+ Output referring expressions for each object id. Please start your answer:"""
412
+
413
+
414
+ dense_caption_prompt_2 = f"""
415
+ You are an advanced visual language model analyzing a video frame.
416
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
417
+
418
+ Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
419
+ Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
420
+
421
+ ---
422
+ ## Key Guidelines:
423
+ 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
424
+ - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
425
+ - Avoid: "moving slightly to the side" (**(X) Too vague**)
426
+
427
+ 2. **Do not describe appearance, color, or position**—focus purely on the action.
428
+ - (X) "A large brown bear standing on the left"
429
+ - (O) "The bear is lifting its front paws and swiping forward."
430
+
431
+ 3. **Use dynamic, action-specific verbs** rather than passive descriptions.
432
+ - (O) "The giraffe is tilting its head and sniffing the ground."
433
+ - (X) "The giraffe is near a tree and looking around."
434
+
435
+ 4. **Avoid assumptions, emotions, or speculative phrasing.**
436
+ - (X) "The person seems excited" / "The person might be preparing to jump."
437
+ - (O) "The person is pushing its front legs against the rock and leaping forward."
438
+
439
+ 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
440
+ - expressions like 'seems to be', 'appears to be' are BANNED!
441
+ 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
442
+
443
+ 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
444
+ - **Each object should have a unique, descriptive action.**
445
+ - (X) "Two dogs are running."
446
+ - (O) "1. One dog is chasing another, its legs stretched mid-air.
447
+ 2. The other dog is looking back while speeding up."
448
+
449
+ ---
450
+ ## Output Format:
451
+ - Each labeled **{cat_name}** should have exactly **one line of description**.
452
+ - Format: `ID. {cat_name} + action-based description`
453
+ - (O) Example:
454
+ ```
455
+ 1. The person is leaning forward while opening a bag with both hands.
456
+ 2. The person is holding onto a rope and pulling themselves up.
457
+ ```
458
+ - **Ensure that each object is described individually.**
459
+ - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
460
+
461
+ ---
462
+ ## Additional Instructions:
463
+ - **Do NOT** use expressions like "it appears that..." or "it seems like...".
464
+ - **Do NOT** mention object IDs in the description (only use the provided format).
465
+ - **DO NOT** include markdown formatting (no bullet points, no asterisks).
466
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
467
+
468
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
469
+ """
470
+
471
+
472
+ dense_caption_prompt = f"""
473
+ You are a visual assistant analyzing a single frame of a video.
474
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
475
+
476
+ I am building an **action-centric referring expression** dataset.
477
+ Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
478
+
479
+ ---
480
+ ## Guidelines:
481
+ 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
482
+ 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
483
+ 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
484
+ 4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
485
+ 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
486
+ 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
487
+ 7. Base your descriptions on these principles:
488
+ - **Avoid words like 'minimal' or 'slightly'.**
489
+ - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
490
+ - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
491
+ - **Specify actions with other objects or entities** only when they are clear and observable.
492
+ - (O) "pushing another person"
493
+ - (X) "interacting with another object"
494
+
495
+ ---
496
+ ## Output Format:
497
+ - Each labeled **{cat_name}** must have **exactly one line**.
498
+ - Format: `ID. {cat_name} + action-based description`
499
+ - (O) Example:
500
+ ```
501
+ 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
502
+ 2. The person is pulling a baby carriage while smiling.
503
+ ```
504
+ - **Ensure each object is described individually.**
505
+ - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
506
+
507
+ ---
508
+ ## Example:
509
+ If the frame has two labeled **bears**, your output should be:
510
+ ```
511
+ 1. The bear is reaching out its right paw while leaning forward to catch prey.
512
+ 2. A bear is standing upright, facing right, and touching the bike beside it.
513
+ ```
514
+
515
+ ---
516
+ ## Additional Instructions:
517
+ - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
518
+ - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
519
+ - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
520
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
521
+
522
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
523
+
524
+
525
+ MAX_RETRIES = 3
526
+ retry_count = 0
527
+
528
+ if should_caption:
529
+ while retry_count < MAX_RETRIES:
530
+ selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
531
+
532
+ response2 = captioner.chat.completions.create(
533
+ model=model,
534
+ messages=[
535
+ {
536
+ "role": "user",
537
+ "content": [
538
+ {
539
+ "type": "text",
540
+ "text": selected_prompt,
541
+ },
542
+ {
543
+ "type": "image_url",
544
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
545
+ },
546
+ ],
547
+ }
548
+ ],
549
+ )
550
+
551
+ # caption = response2.choices[0].message.content
552
+ #print(f"{image_path} - {frame_name}: {caption}")
553
+
554
+ caption = response2.choices[0].message.content.strip()
555
+ caption_lower = caption.lower().lstrip()
556
+
557
+ if caption_lower.startswith("1.") and not any(
558
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
559
+ ):
560
+ break
561
+
562
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
563
+ retry_count += 1
564
+ time.sleep(2)
565
+
566
+ if retry_count == MAX_RETRIES:
567
+ caption = None
568
+ print("Max retries reached. Caption generation failed.")
569
+
570
+ else:
571
+ caption = None
572
+
573
+ image_captions[frame_name] = caption
574
+ all_captions[cat_name] = image_captions
575
+
576
+ # final : also prepare valid object ids
577
+ valid_obj_ids = dict()
578
+
579
+ for cat in cat_names:
580
+ if cat in ytvos_category_valid_list:
581
+ obj_id_cat = vid_meta['obj_id_cat']
582
+ valid_cat_ids = []
583
+ for obj_id in list(obj_id_cat.keys()):
584
+ if obj_id_cat[obj_id] == cat:
585
+ valid_cat_ids.append(obj_id)
586
+ valid_obj_ids[cat] = valid_cat_ids
587
+
588
+ return vid_id, all_captions, valid_obj_ids
589
+
590
+
591
+ if __name__ == '__main__':
592
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
593
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
594
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
595
+
596
+ args = parser.parse_args()
597
+
598
+ #==================데이터 불러오기===================
599
+ # 전체 데이터셋
600
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
601
+
602
+ # 전체 데이터셋 메타데이터
603
+ metas = train_dataset.metas
604
+
605
+ # 색상 후보 8개 (RGB 형식)
606
+ colors = [
607
+ (255, 0, 0), # Red
608
+ (0, 255, 0), # Green
609
+ (0, 0, 255), # Blue
610
+ (255, 255, 0), # Yellow
611
+ (255, 0, 255), # Magenta
612
+ (0, 255, 255), # Cyan
613
+ (128, 0, 128), # Purple
614
+ (255, 165, 0) # Orange
615
+ ]
616
+
617
+ ytvos_category_valid_list = [
618
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
619
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
620
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
621
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
622
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
623
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
624
+ ]
625
+
626
+ #==================gpt 돌리기===================
627
+ os.environ['OPENAI_API_KEY'] = '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'
628
+
629
+ result_captions = {}
630
+ result_valid_obj_ids = {}
631
+
632
+ for i in range(len(metas)):
633
+ try:
634
+ vid_id, all_captions, valid_obj_ids = getCaption(i)
635
+
636
+ if vid_id not in result_captions:
637
+ result_captions[vid_id] = all_captions
638
+ if vid_id not in result_valid_obj_ids:
639
+ result_valid_obj_ids[vid_id] = valid_obj_ids
640
+
641
+ except (requests.exceptions.ConnectionError, APIConnectionError, OpenAIError) as e:
642
+ print(f"created caption until {i-1}", flush=True)
643
+
644
+ with open(args.save_caption_path, "w") as file:
645
+ json.dump(result_captions, file, indent=4)
646
+
647
+ with open(args.save_valid_obj_ids_path, "w") as file:
648
+ json.dump(result_valid_obj_ids, file, indent=4)
649
+
650
+ print("Finished!", flush=True)
651
+
652
+ with open(args.save_caption_path, "w") as file:
653
+ json.dump(result_captions, file, indent=4)
654
+
655
+ with open(args.save_valid_obj_ids_path, "w") as file:
656
+ json.dump(result_valid_obj_ids, file, indent=4)
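
The retry loop in `getCaption` accepts a model reply only if it starts with `1.` and contains no refusal phrase. That acceptance test could be isolated as a small predicate, sketched here with an illustrative name (`looks_like_numbered_list` is not part of the script):

    REFUSAL_PHRASES = ("i'm sorry", "please", "can't help")

    def looks_like_numbered_list(reply: str) -> bool:
        # Accept only answers in the expected "1. ..." format that are not refusals.
        text = reply.strip().lower()
        return text.startswith("1.") and not any(p in text for p in REFUSAL_PHRASES)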
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207173355.py ADDED
@@ -0,0 +1,677 @@
+ import os
+ import sys
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+ import time
+
+ from os import path as osp
+ from io import BytesIO
+ from pathlib import Path
+ import random
+
+ from mbench.ytvos_ref import build as build_ytvos_ref
+ import argparse
+ import opts
+
+ import skimage
+ import numpy as np
+ import pandas as pd
+ import regex as re
+ import json
+
+ import cv2
+ from PIL import Image, ImageDraw
+ import torch
+ from torchvision.transforms import functional as F
+
+ from skimage import measure  # (pip install scikit-image)
+ from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ from matplotlib.collections import PatchCollection
+ from matplotlib.patches import Rectangle
+ import textwrap
+
+ import ipywidgets as widgets
+ from IPython.display import display, clear_output
+
+ from openai import OpenAI
+ import base64
+ import requests
+ # The v1 openai SDK (which provides the OpenAI client used below) exposes its
+ # error classes at the package root; `openai.error` only existed in pre-1.0 SDKs.
+ from openai import APIConnectionError, OpenAIError
+
+
+ def number_objects_and_encode_old(idx, color_mask=False):
+     encoded_frames = {}
+     contoured_frames = {}  # New dictionary for original images
+     vid_cat_cnts = {}
+
+     vid_meta = metas[idx]
+     vid_data = train_dataset[idx]
+     vid_id = vid_meta['video']
+     frame_indx = vid_meta['sample_indx']
+     cat_names = set(vid_meta['obj_id_cat'].values())
+     imgs = vid_data[0]
+
+     for cat in cat_names:
+         cat_frames = []
+         contour_frames = []
+         frame_cat_cnts = {}
+
+         for i in range(imgs.size(0)):
+             frame_name = frame_indx[i]
+             frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+             frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+
+             frame_data = vid_data[2][frame_name]
+             obj_ids = list(frame_data.keys())
+
+             cat_cnt = 0
+
+             for j in range(len(obj_ids)):
+                 obj_id = obj_ids[j]
+                 obj_data = frame_data[obj_id]
+                 obj_bbox = obj_data['bbox']
+                 obj_valid = obj_data['valid']
+                 obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                 obj_cat = obj_data['category_name']
+
+                 if obj_cat == cat and obj_valid:
+                     cat_cnt += 1
+
+                     if not color_mask:
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 3)
+                         for contour in contours:  # was `for i, contour in enumerate(...)`, which shadowed the frame index `i`
+                             moments = cv2.moments(contour)
+                             if moments["m00"] != 0:
+                                 cx = int(moments["m10"] / moments["m00"])
+                                 cy = int(moments["m01"] / moments["m00"])
+                             else:
+                                 cx, cy = contour[0][0]
+
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+                             text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                             text_w, text_h = text_size
+
+                             cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                           (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+
+                             cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                         font, 1, (255, 255, 255), 2)
+                     else:
+                         alpha = 0.08
+
+                         colored_obj_mask = np.zeros_like(frame)
+                         colored_obj_mask[obj_mask == 1] = colors[j]
+                         frame[obj_mask == 1] = (
+                             (1 - alpha) * frame[obj_mask == 1]
+                             + alpha * colored_obj_mask[obj_mask == 1]
+                         )
+
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 2)
+                         cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+
+                         if len(contours) > 0:
+                             largest_contour = max(contours, key=cv2.contourArea)
+                             M = cv2.moments(largest_contour)
+                             if M["m00"] != 0:
+                                 center_x = int(M["m10"] / M["m00"])
+                                 center_y = int(M["m01"] / M["m00"])
+                             else:
+                                 center_x, center_y = 0, 0
+
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+
+                             font_scale = 0.9
+                             text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                             text_x = center_x - text_size[0] // 2  # was `// 1` (a no-op); `// 2` centers the label
+                             text_y = center_y
+
+                             rect_start = (text_x - 5, text_y - text_size[1] - 5)
+                             rect_end = (text_x + text_size[0] + 5, text_y)
+
+                             cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                             cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)  # draw at font_scale for consistency with the measured size (was hardcoded 1)
+
+             # plt.figure(figsize=(12, 8))
+             # plt.imshow(frame)
+             # plt.title(f"frame {frame_name}")
+             # plt.tight_layout()
+             # plt.axis('off')
+             # plt.show()
+
+             buffer = BytesIO()
+             frame = Image.fromarray(frame)
+             frame.save(buffer, format='jpeg')
+             buffer.seek(0)
+             cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+             frame_cat_cnts[frame_name] = cat_cnt
+
+             buffer.seek(0)  # Reuse buffer instead of creating a new one
+             buffer.truncate()
+             frame_for_contour = Image.fromarray(frame_for_contour)
+             frame_for_contour.save(buffer, format='jpeg')
+             buffer.seek(0)
+             contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+
+         encoded_frames[cat] = cat_frames
+         contoured_frames[cat] = contour_frames
+         vid_cat_cnts[cat] = frame_cat_cnts
+
+     return encoded_frames, contoured_frames, vid_cat_cnts
+
+
+ def number_objects_and_encode(idx, color_mask=False):
+     encoded_frames = {}
+     contoured_frames = {}  # New dictionary for original images
+     vid_cat_cnts = {}
+
+     vid_meta = metas[idx]
+     vid_data = train_dataset[idx]
+     vid_id = vid_meta['video']
+     frame_indx = vid_meta['sample_indx']
+     cat_names = set(vid_meta['obj_id_cat'].values())
+     imgs = vid_data[0]
+
+     for cat in cat_names:
+         cat_frames = []
+         contour_frames = []
+         frame_cat_cnts = {}
+
+         for i in range(imgs.size(0)):
+             frame_name = frame_indx[i]
+             frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+             frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+
+             frame_data = vid_data[2][frame_name]
+             obj_ids = list(frame_data.keys())
+
+             cat_cnt = 0
+
+             for j in range(len(obj_ids)):
+                 obj_id = obj_ids[j]
+                 obj_data = frame_data[obj_id]
+                 obj_bbox = obj_data['bbox']
+                 obj_valid = obj_data['valid']
+                 obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                 obj_cat = obj_data['category_name']
+
+                 if obj_cat == cat and obj_valid:
+                     cat_cnt += 1
+
+                     contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                     cv2.drawContours(frame, contours, -1, colors[j], 3)
+                     cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+
+                     if len(contours) > 0:
+                         largest_contour = max(contours, key=cv2.contourArea)
+                         M = cv2.moments(largest_contour)
+                         if M["m00"] != 0:
+                             center_x = int(M["m10"] / M["m00"])
+                             center_y = int(M["m01"] / M["m00"])
+                         else:
+                             center_x, center_y = 0, 0
+
+                         font = cv2.FONT_HERSHEY_SIMPLEX
+                         text = obj_id
+                         font_scale = 1.2
+                         text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                         text_x = center_x - text_size[0] // 2  # was `// 1` (a no-op); `// 2` centers the label
+                         text_y = center_y
+
+                         rect_start = (text_x - 5, text_y - text_size[1] - 5)
+                         rect_end = (text_x + text_size[0] + 5, text_y + 3)
+
+                         contour_thickness = 1
+                         rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
+                         rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
+
+                         cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
+                         cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                         cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)  # draw at font_scale for consistency with the measured size (was hardcoded 1)
+
+                     if color_mask:
+                         alpha = 0.08
+                         colored_obj_mask = np.zeros_like(frame)
+                         colored_obj_mask[obj_mask == 1] = colors[j]
+                         frame[obj_mask == 1] = (
+                             (1 - alpha) * frame[obj_mask == 1]
+                             + alpha * colored_obj_mask[obj_mask == 1]
+                         )
+
+             # plt.figure(figsize=(12, 8))
+             # plt.imshow(frame)
+             # plt.title(f"frame {frame_name}")
+             # plt.tight_layout()
+             # plt.axis('off')
+             # plt.show()
+
+             buffer = BytesIO()
+             frame = Image.fromarray(frame)
+             frame.save(buffer, format='jpeg')
+             buffer.seek(0)
+             cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+             frame_cat_cnts[frame_name] = cat_cnt
+
+             buffer.seek(0)  # Reuse buffer instead of creating a new one
+             buffer.truncate()
+             frame_for_contour = Image.fromarray(frame_for_contour)
+             frame_for_contour.save(buffer, format='jpeg')
+             buffer.seek(0)
+             contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+
+         encoded_frames[cat] = cat_frames
+         contoured_frames[cat] = contour_frames
+         vid_cat_cnts[cat] = frame_cat_cnts
+
+     return encoded_frames, contoured_frames, vid_cat_cnts
+
+
+ def getCaption(idx, model='gpt-4o'):
+     vid_meta = metas[idx]
+     vid_data = train_dataset[idx]
+     vid_id = vid_meta['video']
+     print(f"vid id: {vid_id}\n")
+
+     frame_indx = vid_meta['sample_indx']  # e.g. [4, 7, 9, 16]
+     cat_names = set(vid_meta['obj_id_cat'].values())  # e.g. {"person", "elephant", ...}
+     all_captions = dict()
+
+     # color_mask = random.choice([True, False])
+     color_mask = random.choices([False, True], weights=[60, 40])[0]
+
+     base64_frames, _, vid_cat_cnts = number_objects_and_encode(idx, color_mask)
+     # marked = "mask with boundary" if color_mask else "boundary"
+
+     for cat_name in list(cat_names):
+
+         is_movable = False
+         if cat_name in ytvos_category_valid_list:
+             is_movable = True
+
+         if not is_movable:
+             print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
+
+         image_captions = {}
+         captioner = OpenAI()
+         cat_base64_frames = base64_frames[cat_name]
+         # cont_base64_frames = contoured_frames[cat_name]
+
+         for i in range(len(cat_base64_frames)):
+             frame_name = frame_indx[i]
+             # cont_base64_image = cont_base64_frames[i]
+             base64_image = cat_base64_frames[i]
+             should_filter = False
+             frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
+
+             if frame_cat_cnts >= 2:
+                 should_filter = True
+             else:
+                 print(f"Skipping {cat_name}: There is a single object or none.", end='\n\n')
+
+             if is_movable and should_filter:
+                 # Step 1: filter for frames where the objects are actually distinguishable by action
+                 print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+                 caption_filter_text = f"""
+                 You are a visual assistant analyzing a single frame from a video.
+                 In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
+
+                 Are the {cat_name}s in the image performing all different and recognizable actions or postures?
+                 Consider differences in body pose (standing, sitting, holding hands up, grabbing an object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
+                 facial expressions, and any notable interactions with objects or other {cat_name}s or people.
+
+                 Only focus on obvious, prominent actions that can be reliably identified from this single frame.
+
+                 - Respond with "YES" if:
+                     1) Most of the {cat_name}s exhibit clearly different, unique actions or poses.
+                     (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
+                     2) You can see visible, significant differences in action and posture that an observer can identify at a glance.
+                     3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
+
+                 - Respond with "NONE" if:
+                     1) The actions or poses are not clearly differentiable or are too similar.
+                     2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
+                     3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion.
+
+                 Answer strictly with either "YES" or "NONE".
+                 """
+
+                 response1 = captioner.chat.completions.create(
+                     model=model,
+                     messages=[
+                         {
+                             "role": "user",
+                             "content": [
+                                 {
+                                     "type": "text",
+                                     "text": caption_filter_text,
+                                 },
+                                 {
+                                     "type": "image_url",
+                                     "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                 }
+                             ],
+                         }
+                     ],
+                 )
+                 response_content = response1.choices[0].message.content
+                 should_caption = "yes" in response_content.lower()
+                 print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
+
+             else:
+                 should_caption = False
+
+             # Step 2: build the dense captions
+             dense_caption_prompt_1 = f"""
+             In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects is: {cat_name}.
+
+             Please describe the image focusing on the labeled {cat_name}s in detail, focusing on their actions and interactions.
+
+             1. Focus only on clear, unique, and prominent actions that distinguish each object.
+             2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+             3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+             4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+             5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+             6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                 - expressions like 'seems to be', 'appears to be' are BANNED!
+             7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+             8. Include interactions with objects or other entities when they are prominent and observable.
+             9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
+             10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
+             11. Do not mention object IDs.
+             12. Use '{cat_name}' as the noun for the referring expressions.
+
+             Note that I want to use your description to create a grounding dataset; therefore, your descriptions for different objects should be unique, i.e., if the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+
+             - Your answer should contain details, and follow the following format:
+                 object id. action-oriented description
+                 (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
+                 2. a person bending over and touching his boots to tie the shoelace.)
+             - for the action-oriented description, use {cat_name} as the subject noun
+
+             **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as another object/animal).
+             Please pay attention to the categories of these objects and don’t change them.
+             Keep in mind that you should not group the objects, e.g., 2-5. people: xxx; be sure to describe each object separately (one by one).
+             Output referring expressions for each object id. Please start your answer:"""
+
+             dense_caption_prompt_2 = f"""
+             You are an advanced visual language model analyzing a video frame.
+             In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
+
+             Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
+             Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
+
+             ---
+             ## Key Guidelines:
+             1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
+                 - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
+                 - Avoid: "moving slightly to the side" (**(X) Too vague**)
+
+             2. **Do not describe appearance, color, or position**—focus purely on the action.
+                 - (X) "A large brown bear standing on the left"
+                 - (O) "The bear is lifting its front paws and swiping forward."
+
+             3. **Use dynamic, action-specific verbs** rather than passive descriptions.
+                 - (O) "The giraffe is tilting its head and sniffing the ground."
+                 - (X) "The giraffe is near a tree and looking around."
+
+             4. **Avoid assumptions, emotions, or speculative phrasing.**
+                 - (X) "The person seems excited" / "The person might be preparing to jump."
+                 - (O) "The person is pushing its front legs against the rock and leaping forward."
+
+             5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                 - expressions like 'seems to be', 'appears to be' are BANNED!
+             6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+
+             7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
+                 - **Each object should have a unique, descriptive action.**
+                 - (X) "Two dogs are running."
+                 - (O) "1. One dog is chasing another, its legs stretched mid-air.
+                     2. The other dog is looking back while speeding up."
+
+             ---
+             ## Output Format:
+             - Each labeled **{cat_name}** should have exactly **one line of description**.
+             - Format: `ID. {cat_name} + action-based description`
+             - (O) Example:
+                 ```
+                 1. The person is leaning forward while opening a bag with both hands.
+                 2. The person is holding onto a rope and pulling themselves up.
+                 ```
+             - **Ensure that each object is described individually.**
+             - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
+
+             ---
+             ## Additional Instructions:
+             - **Do NOT** use expressions like "it appears that..." or "it seems like...".
+             - **Do NOT** mention object IDs in the description (only use the provided format).
+             - **DO NOT** include markdown formatting (no bullet points, no asterisks).
+             - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
+
+             Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
+             """
+
+             dense_caption_prompt = f"""
+             You are a visual assistant analyzing a single frame of a video.
+             In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
+
+             I am building an **action-centric referring expression** dataset.
+             Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
+
+             ---
+             ## Guidelines:
+             1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
+             2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
+             3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
+             4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
+             5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
+             6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
+             7. Base your descriptions on these principles:
+                 - **Avoid words like 'minimal' or 'slightly'.**
+                 - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
+                 - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
+                 - **Specify actions with other objects or entities** only when they are clear and observable.
+                     - (O) "pushing another person"
+                     - (X) "interacting with another object"
+
+             ---
+             ## Output Format:
+             - Each labeled **{cat_name}** must have **exactly one line**.
+             - Format: `ID. {cat_name} + action-based description`
+             - (O) Example:
+                 ```
+                 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
+                 2. The person is pulling a baby carriage while smiling.
+                 ```
+             - **Ensure each object is described individually.**
+             - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
+
+             ---
+             ## Example:
+             If the frame has two labeled **bears**, your output should be:
+                 ```
+                 1. The bear is reaching out its right paw while leaning forward to catch prey.
+                 2. A bear is standing upright, facing right, and touching the bike beside it.
+                 ```
+
+             ---
+             ## Additional Instructions:
+             - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
+             - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
+             - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
+             - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
+
+             Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
+
+             MAX_RETRIES = 3
+             retry_count = 0
+
+             if should_caption:
+                 while retry_count < MAX_RETRIES:
+                     selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
+
+                     response2 = captioner.chat.completions.create(
+                         model=model,
+                         messages=[
+                             {
+                                 "role": "user",
+                                 "content": [
+                                     {
+                                         "type": "text",
+                                         "text": selected_prompt,
+                                     },
+                                     {
+                                         "type": "image_url",
+                                         "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                     },
+                                 ],
+                             }
+                         ],
+                     )
+
+                     caption = response2.choices[0].message.content.strip()
+                     caption_lower = caption.lower().lstrip()
+
+                     # Accept only answers in the expected "1. ..." format that are not refusals.
+                     if caption_lower.startswith("1.") and not any(
+                         phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
+                     ):
+                         break
+
+                     print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
+                     retry_count += 1
+                     time.sleep(2)
+
+                 if retry_count == MAX_RETRIES:
+                     caption = None
+                     print("Max retries reached. Caption generation failed.")
+
+             else:
+                 caption = None
+
+             image_captions[frame_name] = caption
+         all_captions[cat_name] = image_captions
+
+     # final: also prepare valid object ids
+     valid_obj_ids = dict()
+
+     for cat in cat_names:
+         if cat in ytvos_category_valid_list:
+             obj_id_cat = vid_meta['obj_id_cat']
+             valid_cat_ids = []
+             for obj_id in list(obj_id_cat.keys()):
+                 if obj_id_cat[obj_id] == cat:
+                     valid_cat_ids.append(obj_id)
+             valid_obj_ids[cat] = valid_cat_ids
+
+     return vid_id, all_captions, valid_obj_ids
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+     parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
+     parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
+
+     args = parser.parse_args()
+
+     # ================== Load data ==================
+     # full dataset
+     train_dataset = build_ytvos_ref(image_set='train', args=args)
+
+     # metadata for the full dataset
+     metas = train_dataset.metas
+
+     # 8 candidate colors (RGB)
+     colors = [
+         (255, 0, 0),    # Red
+         (0, 255, 0),    # Green
+         (0, 0, 255),    # Blue
+         (255, 255, 0),  # Yellow
+         (255, 0, 255),  # Magenta
+         (0, 255, 255),  # Cyan
+         (128, 0, 128),  # Purple
+         (255, 165, 0)   # Orange
+     ]
+
+     ytvos_category_valid_list = [
+         'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+         'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+         'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+         'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+         'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+         'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+     ]
+
+     # ================== Run GPT ==================
+     os.environ['OPENAI_API_KEY'] = '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'
+
+     result_captions = {}
+     result_valid_obj_ids = {}
+
+     for i in range(len(metas)):
+         try:
+             vid_id, all_captions, valid_obj_ids = getCaption(i)
+
+             if vid_id not in result_captions:
+                 result_captions[vid_id] = all_captions
+             if vid_id not in result_valid_obj_ids:
+                 result_valid_obj_ids[vid_id] = valid_obj_ids
+
+         except (requests.exceptions.ConnectionError, APIConnectionError) as e:
+             print(f"created caption until {i-1}", flush=True)
+             print("Could not process the request due to a network connection problem:", e, flush=True)
+
+             with open(args.save_caption_path, "w") as file:
+                 json.dump(result_captions, file, indent=4)
+
+             with open(args.save_valid_obj_ids_path, "w") as file:
+                 json.dump(result_valid_obj_ids, file, indent=4)
+
+         except OpenAIError as e:
+             print(f"created caption until {i-1}", flush=True)
+             print("An OpenAI API error occurred:", e, flush=True)
+
+             with open(args.save_caption_path, "w") as file:
+                 json.dump(result_captions, file, indent=4)
+
+             with open(args.save_valid_obj_ids_path, "w") as file:
+                 json.dump(result_valid_obj_ids, file, indent=4)
+
+         except Exception as e:
+             print(f"created caption until {i-1}", flush=True)
+             print("An unknown error occurred:", e, flush=True)
+
+             with open(args.save_caption_path, "w") as file:
+                 json.dump(result_captions, file, indent=4)
+
+             with open(args.save_valid_obj_ids_path, "w") as file:
+                 json.dump(result_valid_obj_ids, file, indent=4)
+
+     print("Finished!", flush=True)
+
+     with open(args.save_caption_path, "w") as file:
+         json.dump(result_captions, file, indent=4)
+
+     with open(args.save_valid_obj_ids_path, "w") as file:
+         json.dump(result_valid_obj_ids, file, indent=4)
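
The only change in this revision is the exception handling at the bottom: one handler becomes three (network errors, OpenAI API errors, everything else), each checkpointing before the loop continues. Because the bodies differ only in the message, the same behavior can be sketched with a shared error callback (assuming the v1 openai SDK, whose error classes import from the package root; `run_one` is an illustrative name):

    import requests
    from openai import APIConnectionError, OpenAIError

    def run_one(fn, on_error):
        # Execute one unit of work; classify failures and checkpoint via on_error.
        try:
            return fn()
        except (requests.exceptions.ConnectionError, APIConnectionError) as e:
            on_error(f"network connection problem: {e}")
        except OpenAIError as e:
            on_error(f"OpenAI API error: {e}")
        except Exception as e:
            on_error(f"unknown error: {e}")

Ordering matters here: `APIConnectionError` is itself an `OpenAIError`, so the more specific handler must come first, which the script gets right.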
.history/mbench/make_ref-ytvos_json_20250117032501.py ADDED
@@ -0,0 +1,104 @@
1
+ import sys
2
+ import os
3
+ from os import path as osp
4
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
5
+
6
+ from datasets import build_dataset
7
+ import argparse
8
+ import opts
9
+
10
+
11
+ from pathlib import Path
12
+ import io
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ #==================json 만들기===================
37
+ def createJson(train_dataset, metas):
38
+ entire_json = {}
39
+
40
+ #초기화
41
+ vid_idx = 0
42
+
43
+ while vid_idx < len(train_dataset):
44
+
45
+ #하나의 비디오에 대해
46
+ video_data = {}
47
+ video_train_frames, video_train_info = train_dataset[vid_idx]
48
+ video_meta = metas[vid_idx]
49
+
50
+ video_id = video_meta['video']
51
+ video_data['bins'] = video_meta['bins']
52
+ bin_nums = len(video_meta['bins'])
53
+ obj_nums = len(list(video_meta['obj_id_cat'].keys()))
54
+
55
+ annotation_data = []
56
+ frame_names = []
57
+
58
+ for i in range(bin_nums):
59
+ bin_data = {}
60
+ for j in range(obj_nums):
61
+ obj_id = str(j+1)
62
+ obj_data = {
63
+ "category_name":video_meta['obj_id_cat'][obj_id],
64
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :]
65
+ }
66
+ bin_data[obj_id] = obj_data
67
+ annotation_data.append(bin_data)
68
+
69
+ video_data['annotations'] = annotation_data
70
+
71
+
72
+ sample_indx = metas[vid_idx]['sample_indx']
73
+ frames = metas[vid_idx]['frames']
74
+ for i in sample_indx:
75
+ frame_name = frames[i]
76
+ frame_names.append(frame_name)
77
+
78
+ video_data['frame_names'] = frame_names
79
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
80
+ entire_json[video_id] = video_data
81
+
82
+ vid_idx += 1
83
+
84
+ return entire_json
85
+
86
+
87
+ if __name__ == '__main__':
88
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
89
+ args = parser.parse_args()
90
+
91
+ #================== Load the data ===================
92
+ # full dataset
93
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
94
+
95
+ # metadata for the full dataset
96
+ metas = train_dataset.metas
97
+
98
+ #================== Build the JSON ===================
99
+ entire_json_dict = createJson(train_dataset, metas)
100
+ print(type(entire_json_dict))
101
+ entire_json = json.dumps(entire_json_dict, indent=4)
102
+
103
+ with open('mbench/sampled_frame2.json', mode='w') as file:
104
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250117072314.py ADDED
@@ -0,0 +1,107 @@
1
+ import sys
2
+ import os
3
+ from os import path as osp
4
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
5
+
6
+ from datasets import build_dataset
7
+ import argparse
8
+ import opts
9
+
10
+
11
+ from pathlib import Path
12
+ import io
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ #================== Build the JSON ===================
37
+ def createJson(train_dataset, metas):
38
+ entire_json = {}
39
+
40
+ # initialize the video index
41
+ vid_idx = 0
42
+
43
+ while vid_idx < len(train_dataset):
44
+
45
+ # process a single video
46
+ video_data = {}
47
+ video_train_frames, video_train_info = train_dataset[vid_idx]
48
+ video_meta = metas[vid_idx]
49
+
50
+ video_id = video_meta['video']
51
+ video_data['bins'] = video_meta['bins']
52
+ bin_nums = len(video_meta['bins'])
53
+ obj_nums = len(list(video_meta['obj_id_cat'].keys()))
54
+
55
+ annotation_data = []
56
+ frame_names = []
57
+
58
+ for i in range(bin_nums):
59
+ bin_data = {}
60
+ for j in range(obj_nums):
61
+ try:
62
+ obj_id = str(j+1)
63
+ obj_data = {
64
+ "category_name":video_meta['obj_id_cat'][obj_id],
65
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :]
66
+ }
67
+ bin_data[obj_id] = obj_data
68
+ except:
69
+ continue
70
+ annotation_data.append(bin_data)
71
+
72
+ video_data['annotations'] = annotation_data
73
+
74
+
75
+ sample_indx = metas[vid_idx]['sample_indx']
76
+ frames = metas[vid_idx]['frames']
77
+ for i in sample_indx:
78
+ frame_name = frames[i]
79
+ frame_names.append(frame_name)
80
+
81
+ video_data['frame_names'] = frame_names
82
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
83
+ entire_json[video_id] = video_data
84
+
85
+ vid_idx += 1
86
+
87
+ return entire_json
88
+
89
+
90
+ if __name__ == '__main__':
91
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
92
+ args = parser.parse_args()
93
+
94
+ #================== Load the data ===================
95
+ # full dataset
96
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
97
+
98
+ # metadata for the full dataset
99
+ metas = train_dataset.metas
100
+
101
+ #================== Build the JSON ===================
102
+ entire_json_dict = createJson(train_dataset, metas)
103
+ print(type(entire_json_dict))
104
+ entire_json = json.dumps(entire_json_dict, indent=4)
105
+
106
+ with open('mbench/sampled_frame2.json', mode='w') as file:
107
+ file.write(entire_json)
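For reference, each entry written to `mbench/sampled_frame2.json` is keyed by video id and carries the keys built in `createJson` above (`bins`, `annotations`, `frame_names`, `video_path`). A minimal sketch for loading it back, using the same path as the script:

```
import json

with open('mbench/sampled_frame2.json') as f:
    sampled = json.load(f)

video_id, video_data = next(iter(sampled.items()))
print(video_data['frame_names'])     # sampled frame names for this video
print(video_data['annotations'][0])  # per-object category_name / bbox for the first bin
```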
.history/mbench_a2d/gpt_a2d_numbered_20250206114207.py ADDED
@@ -0,0 +1,205 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+ import time
9
+
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ import cv2
13
+ from io import BytesIO
14
+ import base64
15
+ from PIL import Image
16
+ import json
17
+
18
+ from openai import OpenAI
19
+
20
+ def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
21
+ # whether to tint the mask region
22
+ if color_mask == True:
23
+ alpha = 0.1
24
+
25
+ colored_mask = np.zeros_like(frame)
26
+ colored_mask[mask == 1] = [255, 0, 0]
27
+ frame[mask == 1] = (
28
+ (1 - alpha) * frame[mask == 1] +
29
+ alpha * colored_mask[mask == 1]
30
+ )
31
+
32
+ # draw the mask outline
33
+ contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
34
+ cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
35
+
36
+ # whether to draw the instance_id label
37
+ if label_number == True:
38
+ if len(contours) > 0:
39
+ largest_contour = max(contours, key=cv2.contourArea)
40
+ M = cv2.moments(largest_contour)
41
+ if M["m00"] != 0:
42
+ center_x = int(M["m10"] / M["m00"])
43
+ center_y = int(M["m01"] / M["m00"])
44
+ else:
45
+ center_x, center_y = 0, 0
46
+
47
+ font = cv2.FONT_HERSHEY_SIMPLEX
48
+ text = str(instance_id)
49
+ font_scale = 0.6
50
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
51
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
52
+ text_y = center_y
53
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
54
+
55
+ # compute the background rectangle for the text
56
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
57
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
58
+ rect_end = (text_x + text_size[0] + 5, text_y)
59
+
60
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
61
+ cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
62
+
63
+ # plt.figure(figsize=(6, 10))
64
+ # plt.imshow(frame)
65
+ # plt.title(text_query)
66
+ # plt.tight_layout()
67
+ # plt.axis('off')
68
+ # plt.show()
69
+
70
+ buffer = BytesIO()
71
+ frame = Image.fromarray(frame)
72
+ frame.save(buffer, format='jpeg')
73
+ buffer.seek(0)
74
+ encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
75
+
76
+ return encoded_frame
77
+
78
+ def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
79
+
80
+ base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
81
+
82
+ captioner = OpenAI()
83
+
84
+ # build the referring expression directly, without filtering
85
+ dense_caption_prompt = f"""
86
+ You are a visual assistant analyzing a single frame of a video.
87
+ In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
88
+ I also give you a text query describing the marked object.
89
+ I want to use your expression to create an **action-centric referring expression** dataset.
90
+ Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
91
+ ---
92
+ ## Guidelines:
93
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
94
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
95
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
96
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
97
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
98
+ 6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
99
+ 7. Base your description on these action definitions:
100
+ - Avoid using term 'minimal' or 'slightly'.
101
+ - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
102
+ - details such as motion and intention, facial with object manipulation
103
+ - movements with object or other entities when they are prominent and observable. expression should be specific.
104
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
105
+ --
106
+ ## Output Format:
107
+ - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
108
+ object id. action-oriented description
109
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
110
+ ### Example
111
+ If the frame has 1 labeled bear, your output should look like:
112
+ 1. the bear reaching his right arm while leaning forward to capture the prey
113
+ ---
114
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
115
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
116
+ **Do not include markdown** in the output.
117
+ Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
118
+ For each labeled object, output referring expressions for each object id.
119
+ """
120
+ prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
121
+
122
+ MAX_RETRIES = 2
123
+ retry_count = 0
124
+
125
+ while retry_count < MAX_RETRIES:
126
+ response = captioner.chat.completions.create(
127
+ model=model,
128
+ messages=[
129
+ {
130
+ "role": "user",
131
+ "content": [
132
+ {
133
+ "type": "text",
134
+ "text": prompt_with_text_query,
135
+ },
136
+ {
137
+ "type": "image_url",
138
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
139
+ },
140
+ ],
141
+ }
142
+ ],
143
+ )
144
+
145
+
146
+ caption = response.choices[0].message.content.strip()
147
+ caption_lower = caption.lower().lstrip()
148
+ if caption_lower.startswith("1.") and not any(
149
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
150
+ ):
151
+ break
152
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
153
+ retry_count += 1
154
+ time.sleep(2)
155
+
156
+ if retry_count == MAX_RETRIES:
157
+ caption = None
158
+ print("Max retries reached. Caption generation failed.")
159
+
160
+ else:
161
+ caption = None
162
+
163
+ return caption
164
+
165
+ if __name__ == "__main__":
166
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
167
+ parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
168
+ args = parser.parse_args()
169
+
170
+ train_dataset = build_dataset('a2d', image_set = 'train', args = args)
171
+ text_annotations = train_dataset.text_annotations
172
+
173
+ all_captions = {}
174
+
175
+ #os.environ['OPENAI_API_KEY'] = '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'
176
+ os.environ['OPENAI_API_KEY'] = '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'
177
+
178
+ first_text_query = ""
179
+ for idx in range(100):
180
+ imgs, target = train_dataset[idx]
181
+ frames_idx = target['frames_idx'].tolist()
182
+ text_query, vid_id, frame_id, instance_id = text_annotations[idx]
183
+
184
+ if text_query == first_text_query:
185
+ continue
186
+
187
+ print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True)
188
+
189
+ frame_id = frame_id - 1
190
+ frame_order = frames_idx.index(frame_id)
191
+
192
+ frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
193
+ mask = target['masks'].numpy().astype(np.uint8).squeeze()
194
+
195
+ caption = getCaption(frame, mask, instance_id, text_query, model='gpt-4o-mini')
196
+ if vid_id not in all_captions:
197
+ all_captions[vid_id] = {frame_id : caption}
198
+ else:
199
+ all_captions[vid_id][frame_id] = caption
200
+
201
+ print("Finished!", flush=True)
202
+
203
+ with open(args.save_caption_path, 'w') as file:
204
+ json.dump(all_captions, file, indent=4)
205
+
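Since the prompt asks the model for exactly one line of the form `1. <description>`, a consumer can strip that prefix when reading the saved captions. A minimal sketch, assuming the file produced by the script above:

```
import json
import re

with open('mbench_a2d/numbered_captions.json') as f:
    all_captions = json.load(f)

for vid_id, per_frame in all_captions.items():
    for frame_id, caption in per_frame.items():
        if caption is None:  # generation failed after MAX_RETRIES
            continue
        description = re.sub(r'^\s*1\.\s*', '', caption)
        print(vid_id, frame_id, description)
```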
__pycache__/opts.cpython-310.pyc ADDED
Binary file (5.44 kB). View file
 
__pycache__/opts.cpython-39.pyc ADDED
Binary file (5.44 kB). View file
 
__pycache__/refer.cpython-39.pyc ADDED
Binary file (10.1 kB). View file
 
davis2017/davis.py ADDED
@@ -0,0 +1,122 @@
1
+ import os
2
+ from glob import glob
3
+ from collections import defaultdict
4
+ import numpy as np
5
+ from PIL import Image
6
+
7
+
8
+ class DAVIS(object):
9
+ SUBSET_OPTIONS = ['train', 'val', 'test-dev', 'test-challenge']
10
+ TASKS = ['semi-supervised', 'unsupervised']
11
+ DATASET_WEB = 'https://davischallenge.org/davis2017/code.html'
12
+ VOID_LABEL = 255
13
+
14
+ def __init__(self, root, task='unsupervised', subset='val', sequences='all', resolution='480p', codalab=False):
15
+ """
16
+ Class to read the DAVIS dataset
17
+ :param root: Path to the DAVIS folder that contains JPEGImages, Annotations, etc. folders.
18
+ :param task: Task to load the annotations, choose between semi-supervised or unsupervised.
19
+ :param subset: Set to load the annotations
20
+ :param sequences: Sequences to consider, 'all' to use all the sequences in a set.
21
+ :param resolution: Specify the resolution to use the dataset, choose between '480' and 'Full-Resolution'
22
+ """
23
+ if subset not in self.SUBSET_OPTIONS:
24
+ raise ValueError(f'Subset should be in {self.SUBSET_OPTIONS}')
25
+ if task not in self.TASKS:
26
+ raise ValueError(f'The only tasks that are supported are {self.TASKS}')
27
+
28
+ self.task = task
29
+ self.subset = subset
30
+ self.root = root
31
+ self.img_path = os.path.join(self.root, 'JPEGImages', resolution)
32
+ annotations_folder = 'Annotations' if task == 'semi-supervised' else 'Annotations_unsupervised'
33
+ self.mask_path = os.path.join(self.root, annotations_folder, resolution)
34
+ year = '2019' if task == 'unsupervised' and (subset == 'test-dev' or subset == 'test-challenge') else '2017'
35
+ self.imagesets_path = os.path.join(self.root, 'ImageSets', year)
36
+
37
+ self._check_directories()
38
+
39
+ if sequences == 'all':
40
+ with open(os.path.join(self.imagesets_path, f'{self.subset}.txt'), 'r') as f:
41
+ tmp = f.readlines()
42
+ sequences_names = [x.strip() for x in tmp]
43
+ else:
44
+ sequences_names = sequences if isinstance(sequences, list) else [sequences]
45
+ self.sequences = defaultdict(dict)
46
+
47
+ for seq in sequences_names:
48
+ images = np.sort(glob(os.path.join(self.img_path, seq, '*.jpg'))).tolist()
49
+ if len(images) == 0 and not codalab:
50
+ raise FileNotFoundError(f'Images for sequence {seq} not found.')
51
+ self.sequences[seq]['images'] = images
52
+ masks = np.sort(glob(os.path.join(self.mask_path, seq, '*.png'))).tolist()
53
+ masks.extend([-1] * (len(images) - len(masks)))
54
+ self.sequences[seq]['masks'] = masks
55
+
56
+ def _check_directories(self):
57
+ if not os.path.exists(self.root):
58
+ raise FileNotFoundError(f'DAVIS not found in the specified directory, download it from {self.DATASET_WEB}')
59
+ if not os.path.exists(os.path.join(self.imagesets_path, f'{self.subset}.txt')):
60
+ raise FileNotFoundError(f'Subset sequences list for {self.subset} not found, download the missing subset '
61
+ f'for the {self.task} task from {self.DATASET_WEB}')
62
+ if self.subset in ['train', 'val'] and not os.path.exists(self.mask_path):
63
+ raise FileNotFoundError(f'Annotations folder for the {self.task} task not found, download it from {self.DATASET_WEB}')
64
+
65
+ def get_frames(self, sequence):
66
+ for img, msk in zip(self.sequences[sequence]['images'], self.sequences[sequence]['masks']):
67
+ image = np.array(Image.open(img))
68
+ mask = None if msk is None else np.array(Image.open(msk))
69
+ yield image, mask
70
+
71
+ def _get_all_elements(self, sequence, obj_type):
72
+ obj = np.array(Image.open(self.sequences[sequence][obj_type][0]))
73
+ all_objs = np.zeros((len(self.sequences[sequence][obj_type]), *obj.shape))
74
+ obj_id = []
75
+ for i, obj in enumerate(self.sequences[sequence][obj_type]):
76
+ all_objs[i, ...] = np.array(Image.open(obj))
77
+ obj_id.append(''.join(obj.split('/')[-1].split('.')[:-1]))
78
+ return all_objs, obj_id
79
+
80
+ def get_all_images(self, sequence):
81
+ return self._get_all_elements(sequence, 'images')
82
+
83
+ def get_all_masks(self, sequence, separate_objects_masks=False):
84
+ masks, masks_id = self._get_all_elements(sequence, 'masks')
85
+ masks_void = np.zeros_like(masks)
86
+
87
+ # Separate void and object masks
88
+ for i in range(masks.shape[0]):
89
+ masks_void[i, ...] = masks[i, ...] == 255
90
+ masks[i, masks[i, ...] == 255] = 0
91
+
92
+ if separate_objects_masks:
93
+ num_objects = int(np.max(masks[0, ...]))
94
+ tmp = np.ones((num_objects, *masks.shape))
95
+ tmp = tmp * np.arange(1, num_objects + 1)[:, None, None, None]
96
+ masks = (tmp == masks[None, ...])
97
+ masks = masks > 0
98
+ return masks, masks_void, masks_id
99
+
100
+ def get_sequences(self):
101
+ for seq in self.sequences:
102
+ yield seq
103
+
104
+
105
+ if __name__ == '__main__':
106
+ from matplotlib import pyplot as plt
107
+
108
+ only_first_frame = True
109
+ subsets = ['train', 'val']
110
+
111
+ for s in subsets:
112
+ dataset = DAVIS(root='/home/csergi/scratch2/Databases/DAVIS2017_private', subset=s)
113
+ for seq in dataset.get_sequences():
114
+ g = dataset.get_frames(seq)
115
+ img, mask = next(g)
116
+ plt.subplot(2, 1, 1)
117
+ plt.title(seq)
118
+ plt.imshow(img)
119
+ plt.subplot(2, 1, 2)
120
+ plt.imshow(mask)
121
+ plt.show(block=True)
122
+
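Beyond the demo in `__main__` above, `get_all_masks` with `separate_objects_masks=True` splits the annotation into one boolean channel per object id. A minimal usage sketch (the root path is a placeholder):

```
dataset = DAVIS(root='/path/to/DAVIS', task='unsupervised', subset='val')
seq = next(dataset.get_sequences())
masks, masks_void, masks_id = dataset.get_all_masks(seq, separate_objects_masks=True)
# masks: (num_objects, num_frames, H, W) boolean array, one channel per object id
```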
docs/davis_demo1.gif ADDED

Git LFS Details

  • SHA256: e3203f7df580fb3903bf46f23a95d1efdf23ed57497691fe673ed51c05a790df
  • Pointer size: 133 Bytes
  • Size of remote file: 15.3 MB
docs/davis_demo2.gif ADDED

Git LFS Details

  • SHA256: b9301ea2739bd30f44acfd98f99e68ceb4d9deef0ac7458a5de72b7efd2e7445
  • Pointer size: 133 Bytes
  • Size of remote file: 12.7 MB
docs/install.md ADDED
@@ -0,0 +1,42 @@
1
+ # Installation
2
+
3
+ We provide instructions for installing the dependency packages.
4
+
5
+ ## Requirements
6
+
7
+ We tested the code in the following environment; other versions may also be compatible:
8
+
9
+ - CUDA 11.1
10
+ - Python 3.7
11
+ - Pytorch 1.8.1
12
+
13
+
14
+
15
+ ## Setup
16
+
17
+ First, clone the repository locally.
18
+
19
+ ```
20
+ git clone https://github.com/wjn922/ReferFormer.git
21
+ ```
22
+
23
+ Then, install PyTorch 1.8.1 in the conda environment.
24
+ ```
25
+ conda install pytorch==1.8.1 torchvision==0.9.1 torchaudio==0.8.1 -c pytorch
26
+ ```
27
+
28
+ Install the necessary packages and pycocotools.
29
+
30
+ ```
31
+ pip install -r requirements.txt
32
+ pip install 'git+https://github.com/facebookresearch/fvcore'
33
+ pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
34
+ ```
35
+
36
+ Finally, compile the CUDA operators.
37
+
38
+ ```
39
+ cd models/ops
40
+ python setup.py build install
41
+ cd ../..
42
+ ```
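If the build finishes without errors, a quick import check can confirm the extension is visible to Python. The module name below follows the Deformable-DETR convention this repo builds on, so treat it as an assumption:

```
# should import without error once `setup.py build install` has run
import MultiScaleDeformableAttention  # assumed extension name, per Deformable-DETR
```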
docs/network.png ADDED

Git LFS Details

  • SHA256: 9b52dc182c58c0ce59086750d1c2657dcdbeb9c9771add8cca93ad8a55feba0e
  • Pointer size: 132 Bytes
  • Size of remote file: 3.48 MB
docs/ytvos_demo1.gif ADDED

Git LFS Details

  • SHA256: 073a90379317b9ddcaae5459a7f7a66ba4d722fa34600af44c3ebc0a3b6fe402
  • Pointer size: 132 Bytes
  • Size of remote file: 9.67 MB
docs/ytvos_demo2.gif ADDED

Git LFS Details

  • SHA256: 97fe7b1d10968a32bc5dac0c0f543e334c6e59c65de78029d1eed0e3f10b0486
  • Pointer size: 133 Bytes
  • Size of remote file: 14.8 MB
hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/e14a3254bf04f32056759bdc60c64736e7638f31b43957586ff2442ff393890a.lock ADDED
File without changes
hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/pytorch_model-00002-of-00003.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b
3
+ size 9999791010
make_ref-ytvos/manual_selection.ipynb ADDED
@@ -0,0 +1,381 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/data/projects/yejin/VerbCentric_RIS/ReferFormer\n"
13
+ ]
14
+ },
15
+ {
16
+ "name": "stderr",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "/home/yejin/.conda/envs/VerbCentric_RIS/lib/python3.9/site-packages/IPython/core/magics/osm.py:417: UserWarning: using dhist requires you to install the `pickleshare` library.\n",
20
+ " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
21
+ ]
22
+ }
23
+ ],
24
+ "source": [
25
+ "%cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "markdown",
30
+ "metadata": {},
31
+ "source": [
32
+ "## 1. manual 필터링 반영"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 18,
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "import pandas as pd\n",
42
+ "import re\n",
43
+ "import json"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 31,
49
+ "metadata": {},
50
+ "outputs": [],
51
+ "source": [
52
+ "selected_frames_df = pd.read_json(\"/home/yejin/data/dataset/VRIS/mbench/ytvos/selected_instances.jsonl\", lines = True)\n",
53
+ "manual_selected = pd.read_json(\"manual_selected_frames.jsonl\", lines = True)"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 32,
59
+ "metadata": {},
60
+ "outputs": [
61
+ {
62
+ "data": {
63
+ "text/html": [
64
+ "<div>\n",
65
+ "<style scoped>\n",
66
+ " .dataframe tbody tr th:only-of-type {\n",
67
+ " vertical-align: middle;\n",
68
+ " }\n",
69
+ "\n",
70
+ " .dataframe tbody tr th {\n",
71
+ " vertical-align: top;\n",
72
+ " }\n",
73
+ "\n",
74
+ " .dataframe thead th {\n",
75
+ " text-align: right;\n",
76
+ " }\n",
77
+ "</style>\n",
78
+ "<table border=\"1\" class=\"dataframe\">\n",
79
+ " <thead>\n",
80
+ " <tr style=\"text-align: right;\">\n",
81
+ " <th></th>\n",
82
+ " <th>segmentation</th>\n",
83
+ " <th>bbox</th>\n",
84
+ " <th>area</th>\n",
85
+ " <th>file_name</th>\n",
86
+ " <th>height</th>\n",
87
+ " <th>width</th>\n",
88
+ " <th>label</th>\n",
89
+ " <th>category_name</th>\n",
90
+ " <th>sentences</th>\n",
91
+ " </tr>\n",
92
+ " </thead>\n",
93
+ " <tbody>\n",
94
+ " <tr>\n",
95
+ " <th>0</th>\n",
96
+ " <td>[[1081.0, 719.5, 1051.0, 719.5, 1050.5, 716.0,...</td>\n",
97
+ " <td>[708.5, 156.5, 420.0, 563.0]</td>\n",
98
+ " <td>131357.25</td>\n",
99
+ " <td>00917dcfc4_00000.png</td>\n",
100
+ " <td>720</td>\n",
101
+ " <td>1280</td>\n",
102
+ " <td>64</td>\n",
103
+ " <td>zebra</td>\n",
104
+ " <td>{'tokens': ['a', 'zebra', 'on', 'the', 'right'...</td>\n",
105
+ " </tr>\n",
106
+ " </tbody>\n",
107
+ "</table>\n",
108
+ "</div>"
109
+ ],
110
+ "text/plain": [
111
+ " segmentation \\\n",
112
+ "0 [[1081.0, 719.5, 1051.0, 719.5, 1050.5, 716.0,... \n",
113
+ "\n",
114
+ " bbox area file_name height \\\n",
115
+ "0 [708.5, 156.5, 420.0, 563.0] 131357.25 00917dcfc4_00000.png 720 \n",
116
+ "\n",
117
+ " width label category_name \\\n",
118
+ "0 1280 64 zebra \n",
119
+ "\n",
120
+ " sentences \n",
121
+ "0 {'tokens': ['a', 'zebra', 'on', 'the', 'right'... "
122
+ ]
123
+ },
124
+ "execution_count": 32,
125
+ "metadata": {},
126
+ "output_type": "execute_result"
127
+ }
128
+ ],
129
+ "source": [
130
+ "selected_frames_df"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "for i in range(len(manual_selected)):\n",
140
+ " idx = manual_selected.loc[i, \"index\"]\n",
141
+ " new_sent = manual_selected.loc[i, 'new_sent']\n",
142
+ "\n",
143
+ " if new_sent != \"\":\n",
144
+ " new_sent_dict = {\n",
145
+ " \"tokens\" : new_sent.split(' '),\n",
146
+ " \"raw\" : new_sent,\n",
147
+ " \"sent\" : re.sub('[^A-Za-z0-9\\s]+', '', new_sent.lower())\n",
148
+ " }\n",
149
+ " selected_frames_df.at[idx, 'sentences'] = new_sent_dict"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": null,
155
+ "metadata": {},
156
+ "outputs": [],
157
+ "source": [
158
+ "manual_selected_frames = selected_frames_df.loc[manual_selected['index'].values]"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": 31,
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "manual_selected_frames.to_json(\"revised_frames.jsonl\", orient='records', lines=True)"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "markdown",
172
+ "metadata": {},
173
+ "source": [
174
+ "## 2. lmdb로 변환하기 위해 마스크 저장하기"
175
+ ]
176
+ },
177
+ {
178
+ "cell_type": "code",
179
+ "execution_count": 2,
180
+ "metadata": {},
181
+ "outputs": [],
182
+ "source": [
183
+ "import argparse\n",
184
+ "import os\n",
185
+ "import os.path as osp\n",
186
+ "import lmdb\n",
187
+ "import pyarrow as pa\n",
188
+ "import json\n",
189
+ "from tqdm import tqdm\n",
190
+ "import matplotlib.pyplot as plt\n",
191
+ "from skimage import io\n",
192
+ "import numpy as np\n",
193
+ "from shapely.geometry import Polygon, MultiPolygon\n",
194
+ "from matplotlib.collections import PatchCollection\n",
195
+ "from pycocotools import mask\n",
196
+ "import warnings\n",
197
+ "warnings.filterwarnings(\"ignore\")"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": 3,
203
+ "metadata": {},
204
+ "outputs": [],
205
+ "source": [
206
+ "#jsonl 파일을 {index: json_obj, ... }형식으로\n",
207
+ "\n",
208
+ "json_data = []\n",
209
+ "\n",
210
+ "with open('revised_frames.jsonl', 'rb') as f:\n",
211
+ " for line in f:\n",
212
+ " json_data.append(json.loads(line)) "
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": 45,
218
+ "metadata": {},
219
+ "outputs": [],
220
+ "source": [
221
+ "def getMask(ann):\n",
222
+ " # return mask, area and mask-center\n",
223
+ " if type(ann['segmentation'][0]) == list: # polygon\n",
224
+ " rle = mask.frPyObjects(ann['segmentation'], ann['height'],\n",
225
+ " ann['width'])\n",
226
+ " else:\n",
227
+ " rle = ann['segmentation']\n",
228
+ " # for i in range(len(rle['counts'])):\n",
229
+ " # print(rle)\n",
230
+ " m = mask.decode(rle)\n",
231
+ " m = np.sum(\n",
232
+ " m, axis=2\n",
233
+ " ) # sometimes there are multiple binary map (corresponding to multiple segs)\n",
234
+ " m = m.astype(np.uint8) # convert to np.uint8\n",
235
+ " # compute area\n",
236
+ " area = sum(mask.area(rle)) # should be close to ann['area']\n",
237
+ " return {'mask': m, 'area': area}\n",
238
+ " # # position\n",
239
+ " # position_x = np.mean(np.where(m==1)[1]) # [1] means columns (matlab style) -> x (c style)\n",
240
+ " # position_y = np.mean(np.where(m==1)[0]) # [0] means rows (matlab style) -> y (c style)\n",
241
+ " # # mass position (if there were multiple regions, we use the largest one.)\n",
242
+ " # label_m = label(m, connectivity=m.ndim)\n",
243
+ " # regions = regionprops(label_m)\n",
244
+ " # if len(regions) > 0:\n",
245
+ " # \tlargest_id = np.argmax(np.array([props.filled_area for props in regions]))\n",
246
+ " # \tlargest_props = regions[largest_id]\n",
247
+ " # \tmass_y, mass_x = largest_props.centroid\n",
248
+ " # else:\n",
249
+ " # \tmass_x, mass_y = position_x, position_y\n",
250
+ " # # if centroid is not in mask, we find the closest point to it from mask\n",
251
+ " # if m[mass_y, mass_x] != 1:\n",
252
+ " # \tprint 'Finding closes mask point ...'\n",
253
+ " # \tkernel = np.ones((10, 10),np.uint8)\n",
254
+ " # \tme = cv2.erode(m, kernel, iterations = 1)\n",
255
+ " # \tpoints = zip(np.where(me == 1)[0].tolist(), np.where(me == 1)[1].tolist()) # row, col style\n",
256
+ " # \tpoints = np.array(points)\n",
257
+ " # \tdist = np.sum((points - (mass_y, mass_x))**2, axis=1)\n",
258
+ " # \tid = np.argsort(dist)[0]\n",
259
+ " # \tmass_y, mass_x = points[id]\n",
260
+ " # \t# return\n",
261
+ " # return {'mask': m, 'area': area, 'position_x': position_x, 'position_y': position_y, 'mass_x': mass_x, 'mass_y': mass_y}\n",
262
+ " # # show image and mask\n",
263
+ " # I = io.imread(osp.join(self.IMAGE_DIR, image['file_name']))\n",
264
+ " # plt.figure()\n",
265
+ " # plt.imshow(I)\n",
266
+ " # ax = plt.gca()\n",
267
+ " # img = np.ones( (m.shape[0], m.shape[1], 3) )\n",
268
+ " # color_mask = np.array([2.0,166.0,101.0])/255\n",
269
+ " # for i in range(3):\n",
270
+ " # img[:,:,i] = color_mask[i]\n",
271
+ " # ax.imshow(np.dstack( (img, m*0.5) ))\n",
272
+ " # plt.show()\n",
273
+ "\n",
274
+ "def showMask(ann, image_dir, mask_dir):\n",
275
+ " \n",
276
+ " fig, ax = plt.subplots()\n",
277
+ " I = io.imread(osp.join(image_dir, ann['file_name']))\n",
278
+ " ax.imshow(I)\n",
279
+ "\n",
280
+ " M = getMask(ann)\n",
281
+ " msk = M['mask']\n",
282
+ " #msk = io.imread(osp.join(mask_dir, ann['file_name']))\n",
283
+ " \n",
284
+ " ax.imshow(msk, alpha = 0.5)\n",
285
+ " ax.set_title(ann['sentences']['sent'])\n",
286
+ " plt.show()\n",
287
+ "\n",
288
+ "\n",
289
+ "\n",
290
+ "def saveMask(ann, mask_dir, seg_id):\n",
291
+ " M = getMask(ann)\n",
292
+ " msk = M['mask']\n",
293
+ " height, width = msk.shape\n",
294
+ " \n",
295
+ " fig, ax = plt.subplots(figsize=(width / 100, height / 100), dpi=100)\n",
296
+ " ax.imshow(msk, cmap='gray', vmin=0, vmax=1)\n",
297
+ "\n",
298
+ " save_path = f'{mask_dir}/{seg_id}'\n",
299
+ " plt.axis('off')\n",
300
+ " plt.subplots_adjust(left=0, right=1, top=1, bottom=0) # Remove padding\n",
301
+ "\n",
302
+ " fig.savefig(save_path, dpi=100, bbox_inches='tight', pad_inches=0)\n",
303
+ " \n",
304
+ " plt.close(fig)"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": 46,
310
+ "metadata": {},
311
+ "outputs": [],
312
+ "source": [
313
+ "for i in range(len(json_data)):\n",
314
+ " #showMask(json_data[i], image_dir = '/home/yejin/data/dataset/VRIS/mbench/ytvos/selected_frames', mask_dir = '/home/yejin/data/dataset/VRIS/mbench/ytvos/filtered_masks')\n",
315
+ " saveMask(json_data[i], '/home/yejin/data/dataset/VRIS/mbench/ytvos/filtered_masks_segid', i)"
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "code",
320
+ "execution_count": null,
321
+ "metadata": {},
322
+ "outputs": [],
323
+ "source": [
324
+ "##############안 쓰는 함수!###################\n",
325
+ "# 마스크 저장\n",
326
+ "# annotation dictionary as input\n",
327
+ "def saveMask(annotation, mask_dir, seg_box='seg'):\n",
328
+ " image_width = annotation['width']\n",
329
+ " image_height = annotation['height']\n",
330
+ "\n",
331
+ " fig, ax = plt.subplots(figsize=(image_width / 100, image_height / 100), facecolor='black') # figsize 단위는 인치, DPI 고려\n",
332
+ " ax.set_facecolor('black')\n",
333
+ " \n",
334
+ " \n",
335
+ " if seg_box == 'seg':\n",
336
+ " polygons = []\n",
337
+ " color = (1, 1, 1)\n",
338
+ " \n",
339
+ " if type(annotation['segmentation'][0]) == list:\n",
340
+ " # polygon used for refcoco*\n",
341
+ " for seg in annotation['segmentation']:\n",
342
+ " poly = np.array(seg).reshape((int(len(seg) / 2), 2))\n",
343
+ " polygons.append(Polygon(poly))\n",
344
+ "\n",
345
+ " p = PatchCollection(polygons,\n",
346
+ " facecolors=(1, 1, 1),\n",
347
+ " linewidths=0)\n",
348
+ " ax.add_collection(p)\n",
349
+ "\n",
350
+ " # 축 범위를 이미지 크기에 맞게 설정\n",
351
+ " ax.set_xlim(0, image_width)\n",
352
+ " ax.set_ylim(0, image_height)\n",
353
+ " \n",
354
+ " # y축 방향 뒤집기 (이미지 좌표계와 일치)\n",
355
+ " ax.invert_yaxis()\n",
356
+ " \n",
357
+ " # 플롯 표시\n",
358
+ " #plt.axis('equal') # 축 비율을 동일하게 설정\n",
359
+ " #plt.show()\n",
360
+ "\n",
361
+ " #플롯 저장\n",
362
+ " plt.axis('off') # 축 숨김 (선택 사항)\n",
363
+ " save_path = f'{mask_dir}/{annotation[\"file_name\"]}'\n",
364
+ " plt.savefig(save_path, bbox_inches='tight', pad_inches=0, facecolor='black')\n",
365
+ "\n",
366
+ "for annotation in json_data:\n",
367
+ " saveMask(annotation, mask_dir='/home/yejin/data/dataset/VRIS/mbench/ytvos/filtered_masks')\n",
368
+ " "
369
+ ]
370
+ }
371
+ ],
372
+ "metadata": {
373
+ "kernelspec": {
374
+ "display_name": "VerbCentric_RIS",
375
+ "language": "python",
376
+ "name": "verbcentric_ris"
377
+ }
378
+ },
379
+ "nbformat": 4,
380
+ "nbformat_minor": 2
381
+ }
make_refcoco/refcocog_google/multi_object_data_gref_google.json ADDED
The diff for this file is too large to render. See raw diff
 
make_refcoco/refcocog_google/needrevision_refid_part4.json ADDED
@@ -0,0 +1,506 @@
1
+ {
2
+ "4859": {
3
+ "101105": "man sitting on the ground playing wii",
4
+ "101106": "man in white and light blue t - shirt"
5
+ },
6
+ "678": {
7
+ "14720": "the man crouching inside the plane",
8
+ "14721": "the man wearing white hat"
9
+ },
10
+ "162": {
11
+ "2908": "the man resting his face on his hands",
12
+ "2909": "the man with a plastic bag between his feet"
13
+ },
14
+ "3052": {
15
+ "63901": "person looking at a book",
16
+ "63902": "person wearing a hat and backpack"
17
+ },
18
+ "2355": {
19
+ "49522": "the cat sitting in the chair",
20
+ "49523": "cat on left side"
21
+ },
22
+ "3408": {
23
+ "71397": "a man bending and judging a tennis match",
24
+ "71398": "a man wearing a red shirt and black pants"
25
+ },
26
+ "834": {
27
+ "17983": "a giraffe who is eating hay out of a feeder",
28
+ "17984": "the giraffe on the right side of the pole"
29
+ },
30
+ "328": {
31
+ "6730": "person bending over",
32
+ "6731": "big person in blue cap"
33
+ },
34
+ "1646": {
35
+ "35169": "person about to hit a ball",
36
+ "35170": "person wearing shirt and pants"
37
+ },
38
+ "4400": {
39
+ "91825": "boy sitting on his skateboard and looking at another boy",
40
+ "91826": "boy wearing dark t - shirt and jeans"
41
+ },
42
+ "3683": {
43
+ "77184": "a man dishing up food",
44
+ "77185": "a man in military camo and a black hat on the right"
45
+ },
46
+ "3788": {
47
+ "79367": "a black cat sitting and starring",
48
+ "79368": "a cat with a heart shaped tag"
49
+ },
50
+ "4701": {
51
+ "97795": "person whose tie is being pulled by another person",
52
+ "97796": "person in blue shirt with a red undone tie"
53
+ },
54
+ "1211": {
55
+ "26003": "person putting arm around another person",
56
+ "26004": "person with backpack"
57
+ },
58
+ "2138": {
59
+ "45446": "a person sleeping on the top bunk",
60
+ "45447": "a person in a green shirt and brown shorts"
61
+ },
62
+ "3510": {
63
+ "73478": "personn sitting in a train compartment and reading book",
64
+ "73479": "person in striped shirt"
65
+ },
66
+ "899": {
67
+ "19308": "a man serving soup",
68
+ "19309": "a man with tattoo on his arm"
69
+ },
70
+ "293": {
71
+ "5939": "a lady laughing and looking at another lady",
72
+ "5940": "a lady with dark hair and a dark shirt"
73
+ },
74
+ "3196": {
75
+ "67017": "person holding a pen",
76
+ "67018": "person in a brown suit"
77
+ },
78
+ "1939": {
79
+ "41076": "a person sitting cross legged on the beach",
80
+ "41077": "person in khakis and a white shirt with yellow flowers"
81
+ },
82
+ "2659": {
83
+ "56121": "person helping another cross a stream",
84
+ "56122": "person in white dress"
85
+ },
86
+ "2849": {
87
+ "59798": "person looking down drinking a glass of wine",
88
+ "59799": "person on the right side not wearing glasses"
89
+ },
90
+ "756": {
91
+ "16375": "the woman about to pick up a slice of pizza",
92
+ "16376": "a woman with a flower shirt"
93
+ },
94
+ "4573": {
95
+ "95258": "person reaching for another person with the frisbee",
96
+ "95259": "person with blue and white striped shirt on"
97
+ },
98
+ "4514": {
99
+ "94061": "person running behind",
100
+ "94062": "person in dark brown top and jeans"
101
+ },
102
+ "304": {
103
+ "6165": "person resting her head in hand and crossing one's legs",
104
+ "6166": "the person in pink jacket"
105
+ },
106
+ "3465": {
107
+ "72753": "person sitting on a love seat and watching others play wii",
108
+ "72754": "person in a black shirt and white shorts"
109
+ },
110
+ "1092": {
111
+ "23796": "a bear standing up with its mouth open",
112
+ "23797": "a bear on the right"
113
+ },
114
+ "2025": {
115
+ "42838": "the person leading the horse",
116
+ "42839": "the person in gray top and jeans"
117
+ },
118
+ "1701": {
119
+ "36094": "giraffe biting off of a tree",
120
+ "36095": "tall giraffe on the right"
121
+ },
122
+ "2958": {
123
+ "62137": "person playing with dog",
124
+ "62138": "balding person wearing brown hoodie"
125
+ },
126
+ "4793": {
127
+ "99824": "the girl eating and looking at her plate",
128
+ "99825": "the girl wearing a pink shirt"
129
+ },
130
+ "1247": {
131
+ "26624": "the person holding the bat",
132
+ "26625": "the person in white t - shirt and grey pants"
133
+ },
134
+ "1841": {
135
+ "38888": "person resting hands on other people's shoulders",
136
+ "38889": "tallest person wearing bright suit"
137
+ },
138
+ "4404": {
139
+ "91907": "a elephant whose trunk pointing to the floor , may be touching",
140
+ "91908": "elephant more on the right side of the picture"
141
+ },
142
+ "4536": {
143
+ "94448": "a person reaching for the microwave looking at the camera",
144
+ "94449": "person in black t shirt"
145
+ },
146
+ "2787": {
147
+ "58740": "a giraffe snacking on the tree",
148
+ "58741": "a giraffe on the right"
149
+ },
150
+ "3377": {
151
+ "70765": "a zebra resting its head on another zebra ' s back",
152
+ "70766": "a zebra on the left"
153
+ },
154
+ "3889": {
155
+ "81051": "a man holding a basket of pastries",
156
+ "81052": "a man wearing grey hoodie"
157
+ },
158
+ "2194": {
159
+ "46507": "standing dog",
160
+ "46508": "a black and white dog with a blue collar tag"
161
+ },
162
+ "508": {
163
+ "11146": "person being held by another person",
164
+ "11147": "person dressed in a red suit and blue cap"
165
+ },
166
+ "2312": {
167
+ "48847": "a bird standing on a table",
168
+ "48848": "a bird on the left"
169
+ },
170
+ "3948": {
171
+ "82190": "the woman who is squinting in one eye",
172
+ "82191": "a blue eyed brown haired woman not wearing glasses"
173
+ },
174
+ "1388": {
175
+ "29353": "person holding another person while watching giraffe drink water",
176
+ "29354": "person in brown shirt with bag"
177
+ },
178
+ "2690": {
179
+ "56849": "a man about to kick a ball",
180
+ "56850": "a man in all white with number 23 on his chest"
181
+ },
182
+ "1109": {
183
+ "24043": "man holding the ktie",
184
+ "24044": "man on the right"
185
+ },
186
+ "1374": {
187
+ "29120": "person arranging pansts of another person",
188
+ "29121": "the person with in the black tuxedo and glasses in his head"
189
+ },
190
+ "3475": {
191
+ "72951": "woman holding the horse",
192
+ "72952": "a woman wearing spectacles with violet shirt and flourecent colour waist vest"
193
+ },
194
+ "1333": {
195
+ "28225": "a person holding another person",
196
+ "28226": "a person in a pink and orange flannel shirt"
197
+ },
198
+ "2068": {
199
+ "43909": "person standing and playing wii",
200
+ "43910": "person wearing black t - shirt"
201
+ },
202
+ "2824": {
203
+ "59394": "person standing besides a table crossing arms",
204
+ "59395": "person with glasses and long hair"
205
+ },
206
+ "2294": {
207
+ "48483": "a person sitting on bike holding another person",
208
+ "48484": "a person with a helmet on the head"
209
+ },
210
+ "2446": {
211
+ "51355": "an elephant that has it ' s trunk pointing towards the water",
212
+ "51356": "elephant on the left"
213
+ },
214
+ "2686": {
215
+ "56783": "a man staring at another man",
216
+ "56784": "a man in an orange tie"
217
+ },
218
+ "4558": {
219
+ "94950": "a zebra facing the camera",
220
+ "94951": "a small zebra beside a larger zebra"
221
+ },
222
+ "1499": {
223
+ "32051": "a man resting on a metal fence",
224
+ "32052": "a man in white shirt and polka dot tie"
225
+ },
226
+ "4303": {
227
+ "89833": "a man throwing a banana",
228
+ "89834": "a man in bike gear on the right of the picture"
229
+ },
230
+ "1376": {
231
+ "29146": "a man sitting down with his hands together",
232
+ "29147": "a man with a purple shirt and khaki pants "
233
+ },
234
+ "3544": {
235
+ "74100": "the man holding a riding crop",
236
+ "74101": "man in black shirt and slacks on the left"
237
+ },
238
+ "1858": {
239
+ "39103": "a bull standing",
240
+ "39104": "a white and brown bull on the left of the picture"
241
+ },
242
+ "434": {
243
+ "9561": "the man looking down",
244
+ "9562": "the man on the left"
245
+ },
246
+ "3024": {
247
+ "63345": "a baseball player sliding into a base",
248
+ "63346": "baseball player wearing the number 12"
249
+ },
250
+ "513": {
251
+ "11239": "a man riding on a skateboard as his picture is being taken",
252
+ "11240": "a man in a purple t - shirt and ripped jeans"
253
+ },
254
+ "693": {
255
+ "14989": "a person standing",
256
+ "14990": "a small person"
257
+ },
258
+ "2523": {
259
+ "53103": "a baseball player sliding into home plate and getting tagged by the catcher",
260
+ "53104": "a la dodgers player on the right of the picture"
261
+ },
262
+ "4987": {
263
+ "104145": "a girl punching out her arm while playing an interactive video game",
264
+ "104146": "girl wearing grey and white stripes and sweatpants"
265
+ },
266
+ "4041": {
267
+ "84159": "soccer player about to kick soccer ball",
268
+ "84160": "soccer player wearing black t - shirt and black gloves"
269
+ },
270
+ "2105": {
271
+ "44674": "a baseball player holding his arm up to catch a ball",
272
+ "44675": "a baseball player wearing helmet and vest"
273
+ },
274
+ "135": {
275
+ "2353": "dog resting it ' s head on a table",
276
+ "2354": "golden dog"
277
+ },
278
+ "3613": {
279
+ "75580": "person talking to another person while crossing legs",
280
+ "75581": "person with long sleeve shirt, jeans and cap"
281
+ },
282
+ "1722": {
283
+ "36451": "person pulling another person's tie",
284
+ "36452": "blonde person in black dress"
285
+ },
286
+ "1607": {
287
+ "34281": "a person reading a book to another person he ' s holding",
288
+ "34282": "a bald person wearing a beige t - shirt and gray jeans"
289
+ },
290
+ "2761": {
291
+ "58225": "girl propping her chin on her hand",
292
+ "58226": "girl in a pink shirt near window"
293
+ },
294
+ "2454": {
295
+ "51492": "a man looking at laptop",
296
+ "51493": "the man with glasses and painted fingernails"
297
+ },
298
+ "1603": {
299
+ "34234": "person eating a donut",
300
+ "34235": "person with the black beanie"
301
+ },
302
+ "4794": {
303
+ "99868": "a duck that is looking straight ahead",
304
+ "99869": "the duck on the right side"
305
+ },
306
+ "2485": {
307
+ "52246": "a person reaching across the net",
308
+ "52247": "tallest person in a grey shirt and shorts"
309
+ },
310
+ "3280": {
311
+ "68799": "a boy walking towards his skate board",
312
+ "68800": "a boy in a striped shirt"
313
+ },
314
+ "3336": {
315
+ "69882": "person holding a piece of chocolate cake",
316
+ "69883": "person wearing a purple dress"
317
+ },
318
+ "3118": {
319
+ "65349": "giraffe stretching its neck straight up",
320
+ "65350": "taller giraffe"
321
+ },
322
+ "4494": {
323
+ "93729": "man touching the frisbee",
324
+ "93730": "a man in a white shirt"
325
+ },
326
+ "3004": {
327
+ "62940": "person crouching to catch a ball",
328
+ "62941": "person in a red uniform and helmet"
329
+ },
330
+ "127": {
331
+ "2256": "a person holding a plate",
332
+ "2257": "the person in the purple coat"
333
+ },
334
+ "3389": {
335
+ "70905": "person waving",
336
+ "70906": "person in black sneakers"
337
+ },
338
+ "2568": {
339
+ "54256": "person looking at phone",
340
+ "54257": "blonde person on the right"
341
+ },
342
+ "2283": {
343
+ "48251": "the cook holding a plate",
344
+ "48252": "middle cook of three cooks"
345
+ },
346
+ "1530": {
347
+ "32639": "person petting the cat",
348
+ "32640": "person with sleeves rolled up"
349
+ },
350
+ "4251": {
351
+ "88833": "a person reading a book",
352
+ "88834": "person in a striped jacket "
353
+ },
354
+ "2540": {
355
+ "53539": "a man reaching out his right arm holding a controller",
356
+ "53540": "a man in red shirt and black jeans"
357
+ },
358
+ "2870": {
359
+ "60169": "a person watching horse riding",
360
+ "60170": "a person in a white jacket and beige pants"
361
+ },
362
+ "4946": {
363
+ "103092": "a man about to hit a ball",
364
+ "103093": "a man in red shirt and blue vest"
365
+ },
366
+ "113": {
367
+ "1973": "person holding phone",
368
+ "1974": "person with a black shirt and brown coat"
369
+ },
370
+ "711": {
371
+ "15398": "girl crouching and holding an umbrella",
372
+ "15399": "girl wearing light green socks on the left"
373
+ },
374
+ "3209": {
375
+ "67236": "the person that is sliding into home , getting tagged out by the catcher",
376
+ "67237": "the person in the white vest over the blue shirt"
377
+ },
378
+ "3620": {
379
+ "75711": "person petting a horse",
380
+ "75712": "a person in white t - shirt"
381
+ },
382
+ "4382": {
383
+ "91559": "horse being hugged by a person",
384
+ "91560": "white and brown horse"
385
+ },
386
+ "2861": {
387
+ "60004": "a man playing tennis",
388
+ "60005": "a man wearing a blue shirt and white shorts"
389
+ },
390
+ "3954": {
391
+ "82306": "a person putting gloves on",
392
+ "82307": "person with dark blue jumper"
393
+ },
394
+ "1984": {
395
+ "42076": "a person being held by another person",
396
+ "42077": "little person on pink skiis with yellow parka on"
397
+ },
398
+ "2069": {
399
+ "43945": "a person helping another person ski",
400
+ "43946": "a big person in white jumper and backpack"
401
+ },
402
+ "2016": {
403
+ "42686": "person putting food in the oven",
404
+ "42687": "person in green t - shirt"
405
+ },
406
+ "1153": {
407
+ "25076": "a giraffe , with head lowered , crosses in front of another giraffe",
408
+ "25077": "giraffe in the middle"
409
+ },
410
+ "3614": {
411
+ "75583": "a man in explaining something on a tablet",
412
+ "75584": "a man with a blue cap and striped shirt"
413
+ },
414
+ "198": {
415
+ "3830": "a giraffe bending down to eat grass",
416
+ "3831": "giraffe in front"
417
+ },
418
+ "3012": {
419
+ "63097": "person standing with hands on hips",
420
+ "63098": "person in a white collared shirt and jeans"
421
+ },
422
+ "4247": {
423
+ "88808": "man pointing toward another man",
424
+ "88809": "man in plaid shirt"
425
+ },
426
+ "2205": {
427
+ "46674": "person bending over",
428
+ "46675": "person in red shirt and cap"
429
+ },
430
+ "4831": {
431
+ "100694": "person holding bat in hands",
432
+ "100695": "person wearing light blue shirt and glass"
433
+ },
434
+ "4534": {
435
+ "94419": "the bird not drinking",
436
+ "94420": "the bird on the left"
437
+ },
438
+ "638": {
439
+ "13717": "person sitting on another person's lap and holding the remote controller",
440
+ "13718": "small person in red shirt"
441
+ },
442
+ "1419": {
443
+ "30082": "person squatting on the ground to catch a ball",
444
+ "30083": "person in red and white wearing glove"
445
+ },
446
+ "1992": {
447
+ "42197": "a person reaching for a cupcake",
448
+ "42198": "a person in a blue vest"
449
+ },
450
+ "542": {
451
+ "11877": "man receiving food",
452
+ "11878": "a black man in a black shirt"
453
+ },
454
+ "2223": {
455
+ "47051": "person sitting a chair holding a protest sign",
456
+ "47052": "old person in grey t - shirt and blue jeans"
457
+ },
458
+ "4865": {
459
+ "101219": "person being held by another person",
460
+ "101220": "a young person wearing a yellow shirt"
461
+ },
462
+ "751": {
463
+ "16247": "person holding a painting brush",
464
+ "16248": "person wearing white top and cap"
465
+ },
466
+ "3540": {
467
+ "74039": "a man swinging a bat",
468
+ "74040": "a man in a blue baseball shirt and white pants"
469
+ },
470
+ "3765": {
471
+ "78908": "person sitting",
472
+ "78909": "person wearing white shirt and red shoes"
473
+ },
474
+ "2879": {
475
+ "60471": "bear standing against the fence",
476
+ "60472": "a small bear on the right"
477
+ },
478
+ "4529": {
479
+ "94312": "kid holding out left arm playing wii",
480
+ "94313": "kid in a green and red sweatshirt"
481
+ },
482
+ "2131": {
483
+ "45308": "man putting both hands behind his head",
484
+ "45309": "a man with the pool noodle"
485
+ },
486
+ "1306": {
487
+ "27841": "a cow eating grass",
488
+ "27842": "the cow on the right"
489
+ },
490
+ "3508": {
491
+ "73469": "a person standing and playing a video game",
492
+ "73470": "a little person dressed in brown"
493
+ },
494
+ "4165": {
495
+ "87036": "a child holding feathers",
496
+ "87037": "a child wearing green t - shirt"
497
+ },
498
+ "4126": {
499
+ "86073": "a person standing and reading a book",
500
+ "86074": "a person in a suit"
501
+ },
502
+ "388": {
503
+ "8339": "a man holding up an umbrella in the rain for a man who is fixing a tire",
504
+ "8340": "a man wearing glasses in a red jacket"
505
+ }
506
+ }
make_refcoco/refcocog_umd/needrevision_refid_part4.json ADDED
@@ -0,0 +1,498 @@
1
+ {
2
+ "1679": {
3
+ "37582": "player holding a baseball glove",
4
+ "37583": "a blurred player"
5
+ },
6
+ "4048": {
7
+ "92810": "player hitting a ball with a baseball bat",
8
+ "92811": "player with number 18 on his back"
9
+ },
10
+ "2530": {
11
+ "57782": "man crouching ready to catch a ball",
12
+ "57783": "man with 55 on his back"
13
+ },
14
+ "4385": {
15
+ "101410": "man leaning on one leg watching the players",
16
+ "101411": "man in gray pants"
17
+ },
18
+ "5018": {
19
+ "102413": "man standing ready to swing his bat",
20
+ "102414": "man in front of the other two men"
21
+ },
22
+ "2290": {
23
+ "52302": "sheep standing in the pasture next to a sitting sheep",
24
+ "52303": "the front most sheep"
25
+ },
26
+ "2347": {
27
+ "53861": "a sheep sitting down in the grass",
28
+ "53862": "a sheep in the background"
29
+ },
30
+ "3143": {
31
+ "71854": "a horse being led by it ' s trainer",
32
+ "71855": "a horse in front of the picture"
33
+ },
34
+ "1688": {
35
+ "37818": "zebra eating grass",
36
+ "37819": "the zebra in the middle with its face near the ground"
37
+ },
38
+ "944": {
39
+ "21007": "a bird touching its neck with its right feet",
40
+ "21008": "a bird in the back"
41
+ },
42
+ "3477": {
43
+ "79163": "the bird standing and looking to the left",
44
+ "79164": "bird with both feet in the water"
45
+ },
46
+ "2497": {
47
+ "56845": "person holding a baseball bat",
48
+ "56846": "person in blue baseball cap"
49
+ },
50
+ "4110": {
51
+ "94298": "person sitting and watching children play a ballgame",
52
+ "94299": "person wearing a white shirt and black leggings"
53
+ },
54
+ "2011": {
55
+ "45909": "a woman talking on her cell phone",
56
+ "45910": "a blonde woman wearing a blue shirt and white shorts"
57
+ },
58
+ "2884": {
59
+ "65819": "a woman looking at her phone",
60
+ "65820": "a woman with black hair wearing jeans, a striped gray shirt and flip flops"
61
+ },
62
+ "1076": {
63
+ "24000": "person crossing a stream of water",
64
+ "24001": "person wearing jeans and a green vest"
65
+ },
66
+ "4803": {
67
+ "56121": "person helping the other cross a stream",
68
+ "56122": "person in white dress"
69
+ },
70
+ "3508": {
71
+ "80112": "baseball player placing his hands on his hips",
72
+ "80113": "a baseball player named datz"
73
+ },
74
+ "169": {
75
+ "4002": "person feeding a giraffe",
76
+ "4003": "a small person in light blue shirt"
77
+ },
78
+ "258": {
79
+ "5988": "person holding a child",
80
+ "5989": "person wearing glasses and navy shirt"
81
+ },
82
+ "3661": {
83
+ "83542": "person sitting on the floor",
84
+ "83543": "person in a grey shirt and dark pants"
85
+ },
86
+ "4831": {
87
+ "62137": "person sitting on couch and playing with a dog",
88
+ "62138": "bald person wearing jeans and brown hoodie"
89
+ },
90
+ "2214": {
91
+ "50208": "a woman eating a donut",
92
+ "50209": "a brown hair woman in gray sweater"
93
+ },
94
+ "2266": {
95
+ "51661": "a woman holding a purse",
96
+ "51662": "a woman with blonde hair and a black shirt"
97
+ },
98
+ "2477": {
99
+ "56429": "girl talking and looking at another girl",
100
+ "56430": "girl in black"
101
+ },
102
+ "5005": {
103
+ "99824": "girl eating and looking at her plate",
104
+ "99825": "girl wearing a pink shirt"
105
+ },
106
+ "2919": {
107
+ "66832": "person riding a bike",
108
+ "66833": "asian person wearing black jacket"
109
+ },
110
+ "1850": {
111
+ "42078": "man placing his hand on another man's shoulder",
112
+ "42079": "a man who is wearing a red color tie"
113
+ },
114
+ "3757": {
115
+ "85761": "boy holding a cell phone",
116
+ "85762": "boy in a blue hoodie"
117
+ },
118
+ "524": {
119
+ "12089": "a zebra that is not eating grass",
120
+ "12090": "a zebra on the far right"
121
+ },
122
+ "4363": {
123
+ "100914": "elephant holding up its trunk",
124
+ "100915": "an elephant in front of another"
125
+ },
126
+ "2976": {
127
+ "68306": "girl eating food from her right hand",
128
+ "68307": "a girl in a black flowered top"
129
+ },
130
+ "838": {
131
+ "18887": "man leaning on bike on boat",
132
+ "18888": "a man not wearing a hat"
133
+ },
134
+ "3044": {
135
+ "69755": "man rowing boat",
136
+ "69756": "a man on the left side of the picture"
137
+ },
138
+ "2426": {
139
+ "55424": "the baseball player facing towards the right not doing a high five",
140
+ "55425": "baseball player in catcher ' s uniform"
141
+ },
142
+ "2113": {
143
+ "47984": "person that is dancing",
144
+ "47985": "person with the thick beard, glasses and a hat"
145
+ },
146
+ "2327": {
147
+ "53376": "person bathing another person",
148
+ "53377": "person in a floral print dress and hat"
149
+ },
150
+ "4727": {
151
+ "39103": "a bull laying down",
152
+ "39104": "a white and brown bull on the right"
153
+ },
154
+ "859": {
155
+ "19350": "cat sitting on a luggage and staring at the camera",
156
+ "19351": "cat infront of another cat"
157
+ },
158
+ "935": {
159
+ "20809": "cat laying down on a bag",
160
+ "20810": "cat behind another cat"
161
+ },
162
+ "1105": {
163
+ "24654": "an elephant stepping on a large log",
164
+ "24655": "elephant on far right"
165
+ },
166
+ "395": {
167
+ "8819": "person placing her hands on one's hips",
168
+ "8820": "person on the far left"
169
+ },
170
+ "771": {
171
+ "17614": "person holding a child on one's shoulders",
172
+ "17615": "tall person on the right"
173
+ },
174
+ "2942": {
175
+ "67334": "person sitting on another person's shoulders",
176
+ "67335": "small person on the right"
177
+ },
178
+ "41": {
179
+ "961": "a lady pouring wine in a glass",
180
+ "962": "a lady in black tank top"
181
+ },
182
+ "885": {
183
+ "19926": "person feeding another person with a bottle",
184
+ "19927": "person in black blouse"
185
+ },
186
+ "4862": {
187
+ "69276": "person drinking from a bottle",
188
+ "69277": "small person in white pajamas"
189
+ },
190
+ "1246": {
191
+ "27831": "person holding a laptop",
192
+ "27832": "person with curly brown hair wearing jeans"
193
+ },
194
+ "3346": {
195
+ "76051": "person filing her nails",
196
+ "76052": "person wearing a red robe and has a towel on her head"
197
+ },
198
+ "3657": {
199
+ "83493": "person holding a bottle and listening to music",
200
+ "83494": "person wearing black in headphones"
201
+ },
202
+ "540": {
203
+ "12381": "the woman is swinging the controller",
204
+ "12382": "woman in brown top on the right"
205
+ },
206
+ "3364": {
207
+ "76757": "the woman looking at the camera and opening her mouth",
208
+ "76758": "a woman wearing a brown hooded sweatshirt on the left"
209
+ },
210
+ "1880": {
211
+ "42973": "man looking ahead at the tv",
212
+ "42974": "a man in a white shirt"
213
+ },
214
+ "1949": {
215
+ "44400": "a man looking at his phone",
216
+ "44401": "man in black t - shirt and cap"
217
+ },
218
+ "1620": {
219
+ "36248": "person playing tennis",
220
+ "36249": "person in red tank top and black shorts"
221
+ },
222
+ "2902": {
223
+ "66297": "person sitting and watching a tennis game",
224
+ "66298": "person in blue top"
225
+ },
226
+ "397": {
227
+ "8843": "giraffe bending its head down",
228
+ "8844": "giraffe on the far right"
229
+ },
230
+ "732": {
231
+ "16725": "baseball player squatting and watching closely to judge a play",
232
+ "16726": "baseball player in black top and gray pants"
233
+ },
234
+ "1173": {
235
+ "26074": "a man swinging a bat",
236
+ "26075": "a man in blue and grey"
237
+ },
238
+ "2920": {
239
+ "66854": "a man reaching out his left arm to catch a ball",
240
+ "66855": "a man in red uniform and helmet"
241
+ },
242
+ "1643": {
243
+ "36762": "a man smiling looking down at other people",
244
+ "36763": "a man in a grey suite wearing a pink tie"
245
+ },
246
+ "1454": {
247
+ "32177": "person in putting hands in one's pockets",
248
+ "32178": "person in gray shirt and jeans"
249
+ },
250
+ "1725": {
251
+ "38835": "person crossing her arms walking with another person",
252
+ "38836": "person in a black shirt and jeans"
253
+ },
254
+ "2338": {
255
+ "53733": "the person crouching and placing his hands on his knees",
256
+ "53734": "person with a black shirt and dark grey pants"
257
+ },
258
+ "4249": {
259
+ "97957": "a baseball player reaching out his arm to catch a ball",
260
+ "97958": "a baseball player in green top"
261
+ },
262
+ "3917": {
263
+ "89675": "cow looking at camera",
264
+ "89676": "a cow with an ear tag with the number 949 on it"
265
+ },
266
+ "1156": {
267
+ "25761": "man sitting on the couch using a laptop",
268
+ "25762": "a man with a hat"
269
+ },
270
+ "1998": {
271
+ "45619": "a person watching his phone",
272
+ "45620": "person wearing glasses"
273
+ },
274
+ "3571": {
275
+ "81719": "person looking at one's phone",
276
+ "81720": "mature person with blonde hair and glasses"
277
+ },
278
+ "292": {
279
+ "6707": "a zebra lying down in dirt",
280
+ "6708": "the zebra in the foreground"
281
+ },
282
+ "3367": {
283
+ "76808": "a zebra standing in the zoo",
284
+ "76809": "a zebra in the background"
285
+ },
286
+ "2069": {
287
+ "47212": "person leaning forward on skis",
288
+ "47213": "person in blue hat and jacket, black pants"
289
+ },
290
+ "4050": {
291
+ "92834": "person standing straight looking at another person",
292
+ "92835": "a small person wearing purple pants"
293
+ },
294
+ "2953": {
295
+ "67711": "person who is looking away",
296
+ "67712": "person in a suit"
297
+ },
298
+ "4280": {
299
+ "98813": "person pulling another person's tie",
300
+ "98814": "a person in a white shirt"
301
+ },
302
+ "1743": {
303
+ "39371": "a person holding and looking at another person",
304
+ "39372": "person with bald head and glasses"
305
+ },
306
+ "4598": {
307
+ "13717": "person playing with the remote controller",
308
+ "13718": "small person in red shirt"
309
+ },
310
+ "3380": {
311
+ "77052": "a person cutting a cake",
312
+ "77053": "a person in gray shirt that is not striped"
313
+ },
314
+ "3439": {
315
+ "78305": "a person holding a spatula getting readyy to have a cake",
316
+ "78306": "a person in striped shirt"
317
+ },
318
+ "3355": {
319
+ "76309": "a man swining his bat",
320
+ "76310": "a man in a baseball uniform with a brace on his left ankle"
321
+ },
322
+ "3409": {
323
+ "77608": "a man holding out his arm to catch a ball",
324
+ "77609": "a man wearing a red vest with red shin guards"
325
+ },
326
+ "711": {
327
+ "16184": "the man holding a cat in his arms",
328
+ "16185": "this is a man with thin rimmed glasses and a black scarf"
329
+ },
330
+ "3764": {
331
+ "85913": "person holding a remote and smilling",
332
+ "85914": "person in a black t - shirt and not wearing glasses"
333
+ },
334
+ "113": {
335
+ "2741": "a sheep being fed by a little girl",
336
+ "2742": "a sheep on the right"
337
+ },
338
+ "518": {
339
+ "12021": "a sheep eating grass with its head down",
340
+ "12022": "a sheep on the left"
341
+ },
342
+ "3158": {
343
+ "72128": "a boy crouching and placing both hands on his knees",
344
+ "72129": "boy wearing white baseball helmet , white baseball uniform with orange writing"
345
+ },
346
+ "3223": {
347
+ "73555": "a boy pitching the ball to a player",
348
+ "73556": "a boy with the number 4 on his blue jersey"
349
+ },
350
+ "914": {
351
+ "20478": "a person standing on a surf board , riding a wave",
352
+ "20479": "a person on the right"
353
+ },
354
+ "3568": {
355
+ "81669": "surfer laying down",
356
+ "81670": "surfer on the left"
357
+ },
358
+ "592": {
359
+ "13643": "person sits on the floor watching tv",
360
+ "13644": "person with a black hat and a beige shirt"
361
+ },
362
+ "2856": {
363
+ "65208": "person sitting on a chair watching another person play video games",
364
+ "65209": "person in black shirt and jeans"
365
+ },
366
+ "4879": {
367
+ "73469": "person playing a video game",
368
+ "73470": "blonde person dressed in brown"
369
+ },
370
+ "157": {
371
+ "3682": "a woman holding a plate and reaching for condiments",
372
+ "3683": "woman wearing grey button up sweater"
373
+ },
374
+ "1774": {
375
+ "40317": "person being held by another person",
376
+ "40318": "person with red hair, wearing a pink shirt"
377
+ },
378
+ "2354": {
379
+ "53948": "person with child , catching a frisby",
380
+ "53949": "bigger person in white t - shirt"
381
+ },
382
+ "174": {
383
+ "4179": "a lamb eating grass",
384
+ "4180": "a lamb to the left of another lamb"
385
+ },
386
+ "2369": {
387
+ "54196": "the sheep that is looking into the camera",
388
+ "54197": "a white sheep with a black head on the right"
389
+ },
390
+ "4247": {
391
+ "97897": "a woman holding an umbrella on a bench",
392
+ "97898": "woman on the right"
393
+ },
394
+ "1014": {
395
+ "22621": "man receiving an award",
396
+ "22622": "a man in an orange and white uniform with a black cap"
397
+ },
398
+ "1080": {
399
+ "24100": "a man offers a trophy to anothe man",
400
+ "24101": "a man in a suit"
401
+ },
402
+ "2272": {
403
+ "51815": "the baseball player catching a ball",
404
+ "51816": "the baseball player in dark top and helmet"
405
+ },
406
+ "2495": {
407
+ "56804": "a baseball player swinging at a ball",
408
+ "56805": "the baseball player in white uniform"
409
+ },
410
+ "3511": {
411
+ "80309": "person holding a cup",
412
+ "80310": "person wearing pink shirt"
413
+ },
414
+ "3955": {
415
+ "90542": "person holding a remote control",
416
+ "90543": "person in orange shirt"
417
+ },
418
+ "2409": {
419
+ "55054": "a man adjusting his head band",
420
+ "55055": "man in orange and gray shirt"
421
+ },
422
+ "2775": {
423
+ "63273": "a person holding a remote control",
424
+ "63274": "a tall person in white striped shirt and black pants"
425
+ },
426
+ "996": {
427
+ "22281": "a woman holding a baby",
428
+ "22282": "woman wearing a black shirt and green apron"
429
+ },
430
+ "4789": {
431
+ "52629": "a person holding skies in one's hands",
432
+ "52630": "a person with orange mirrored goggles"
433
+ },
434
+ "1028": {
435
+ "22786": "the cow standing up",
436
+ "22787": "a cow in the middle"
437
+ },
438
+ "244": {
439
+ "5666": "a man holding wine glass",
440
+ "5668": "a blonde man in a white shirt"
441
+ },
442
+ "3538": {
443
+ "80923": "the man throwing the ball from the picther ' s mound",
444
+ "80924": "the man in front"
445
+ },
446
+ "557": {
447
+ "12739": "a baseball player getting ready to swing the bat",
448
+ "12740": "a baseball player , wearing a white and blue uniform"
449
+ },
450
+ "4982": {
451
+ "95870": "cat sitting in front of television on a stand",
452
+ "95871": "orange cat on the right side of the picture"
453
+ },
454
+ "4570": {
455
+ "6638": "a woman cutting a cake",
456
+ "6639": "a woman wearing a long sleeve pink sweater"
457
+ },
458
+ "1698": {
459
+ "38093": "a baseball player swinging his bat",
460
+ "38094": "a baseball player weaing a white uniform and blue helmet"
461
+ },
462
+ "3182": {
463
+ "72616": "the baseball player playing the catcher position",
464
+ "72617": "the baseball player wearing a red and white uniform"
465
+ },
466
+ "846": {
467
+ "19100": "a man holding a toothbrush in his mouth",
468
+ "19101": "a man wearing striped shirt"
469
+ },
470
+ "671": {
471
+ "15227": "person petting a horse",
472
+ "15228": "person wearing a red jacket"
473
+ },
474
+ "3254": {
475
+ "74216": "person sitting in the chair",
476
+ "74217": "person in the tan shirt wearing glasses"
477
+ },
478
+ "3318": {
479
+ "75539": "the person who is smashing cake in his own face",
480
+ "75540": "person with a fake tie on its onesie"
481
+ },
482
+ "1424": {
483
+ "31548": "person watching another person eat",
484
+ "31549": "person in the green shirt"
485
+ },
486
+ "3926": {
487
+ "89831": "person eating a sandwich",
488
+ "89832": "person in orange top with sunglasses in one's head"
489
+ },
490
+ "862": {
491
+ "19444": "a man driving a bicycle and pulling a cart behind",
492
+ "19445": "the man is wearing a pair of khaki shorts"
493
+ },
494
+ "2932": {
495
+ "67140": "man standing on bike",
496
+ "67141": "man in blue jean shorts"
497
+ }
498
+ }
mbench/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (160 Bytes)
 
mbench/__pycache__/ytvos_ref.cpython-310.pyc ADDED
Binary file (7.81 kB)
 
mbench/check_image_numbered_cy.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/check_image_numbered_cy_score.py ADDED
@@ -0,0 +1,212 @@
+ import sys
+ import os
+ import argparse
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+ import opts
+
+ import numpy as np
+ import cv2
+ from PIL import Image
+ import json
+
+ from mbench.ytvos_ref import build as build_ytvos_ref
+ import t2v_metrics
+
+ import matplotlib.pyplot as plt
+ import textwrap
+
+
+ def scoreCaption(idx, all_captions, all_valid_obj_ids, clip_flant5_score, color_mask=False):
+     vid_meta = metas[idx]
+     vid_id = vid_meta['video']
+     frames = vid_meta['frames']
+
+     first_cat = list(all_captions[vid_id].keys())[0]
+     sampled_frames = list(all_captions[vid_id][first_cat].keys())
+     imgs = []
+     masks = []
+     for frame_indx in sampled_frames:
+         frame_name = frames[int(frame_indx)]
+         img_path = os.path.join(str(train_dataset.img_folder), 'JPEGImages', vid_id, frame_name + '.jpg')
+         mask_path = os.path.join(str(train_dataset.img_folder), 'Annotations', vid_id, frame_name + '.png')
+         img = Image.open(img_path).convert('RGB')
+         imgs.append(img)
+         mask = Image.open(mask_path).convert('P')
+         mask = np.array(mask)
+         masks.append(mask)
+
+     vid_captions = all_captions[vid_id]
+     cat_names = set(list(vid_captions.keys()))
+
+     vid_result = {}
+
+     for cat in cat_names:
+         cat_captions = vid_captions[cat]
+         cat_result = {}
+
+         for i in range(len(imgs)):
+             frame_name = sampled_frames[i]
+             frame = np.copy(np.array(imgs[i]))
+             frame_for_contour = np.copy(np.array(imgs[i]))
+
+             mask = masks[i]
+
+             all_obj_ids = np.unique(mask).astype(int)
+             all_obj_ids = [str(obj_id) for obj_id in all_obj_ids if obj_id != 0]
+
+             if cat in all_valid_obj_ids[vid_id]:
+                 valid_obj_ids = all_valid_obj_ids[vid_id][cat]
+             else:
+                 valid_obj_ids = []
+
+             for j in range(len(all_obj_ids)):
+                 obj_id = all_obj_ids[j]
+                 obj_mask = (mask == int(obj_id)).astype(np.uint8)
+
+                 if obj_id in valid_obj_ids:
+                     if color_mask == False:
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 3)
+                         for contour in contours:  # unused enumerate index dropped; it shadowed the frame loop's i
+                             # compute the contour center
+                             moments = cv2.moments(contour)
+                             if moments["m00"] != 0:  # check whether the center is computable
+                                 cx = int(moments["m10"] / moments["m00"])
+                                 cy = int(moments["m01"] / moments["m00"])
+                             else:
+                                 cx, cy = contour[0][0]  # fall back to the first contour point
+
+                             # black background box behind the object-id label
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+                             text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                             text_w, text_h = text_size
+
+                             # draw the label background (black)
+                             cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                           (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+
+                             # draw the label text (white)
+                             cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                         font, 1, (255, 255, 255), 2)
+                     else:
+                         alpha = 0.08
+                         colored_obj_mask = np.zeros_like(frame)
+                         colored_obj_mask[obj_mask == 1] = colors[j]
+                         frame[obj_mask == 1] = (
+                             (1 - alpha) * frame[obj_mask == 1]
+                             + alpha * colored_obj_mask[obj_mask == 1]
+                         )
+
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 2)
+                         cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+
+                         if len(contours) > 0:
+                             largest_contour = max(contours, key=cv2.contourArea)
+                             M = cv2.moments(largest_contour)
+                             if M["m00"] != 0:
+                                 center_x = int(M["m10"] / M["m00"])
+                                 center_y = int(M["m01"] / M["m00"])
+                             else:
+                                 center_x, center_y = 0, 0
+
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+
+                             font_scale = 0.9
+                             text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                             text_x = center_x - text_size[0] // 1  # horizontal anchor of the label
+                             text_y = center_y
+                             # text_y = center_y + text_size[1] // 2  # vertical center of the label
+
+                             # coordinates of the label background rectangle
+                             rect_start = (text_x - 5, text_y - text_size[1] - 5)  # top-left corner of the background box
+                             # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+                             rect_end = (text_x + text_size[0] + 5, text_y)
+
+                             cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                             cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
+
+             # fig, ax = plt.subplots()
+             # ax.imshow(frame)
+             # ax.axis('off')
+
+             frame_caption = cat_captions[frame_name]
+             if frame_caption:
+                 # wrapped_text = "\n".join(textwrap.wrap(frame_caption, width=60))
+                 # ax.text(0.5, -0.3, wrapped_text, ha='center', va='center', fontsize=12, transform=ax.transAxes)
+
+                 # calculate the VQA score for this frame/caption pair
+                 frame = Image.fromarray(frame)
+                 score = clip_flant5_score(images=[frame], texts=[frame_caption])
+             else:
+                 score = None
+
+             # plt.title(f"vid_id: {vid_id}, cat: {cat}, frame: {frame_name}, score: {score}")
+             # plt.tight_layout()
+             # plt.show()
+
+             cat_result[frame_name] = {
+                 "caption": frame_caption,
+                 "score": score
+             }
+
+         vid_result[cat] = cat_result
+
+     return vid_id, vid_result
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+     args = parser.parse_args()
+
+     # ================== load data ===================
+     # full dataset
+     train_dataset = build_ytvos_ref(image_set='train', args=args)
+
+     # metadata for the full dataset
+     metas = train_dataset.metas
+
+     # caption data
+     with open('mbench/numbered_captions_gpt-4o_final.json', 'r') as file:
+         all_captions = json.load(file)
+
+     # valid object id data
+     with open('mbench/numbered_valid_obj_ids_gpt-4o_final.json', 'r') as file:
+         all_valid_obj_ids = json.load(file)
+
+     # 8 candidate colors (RGB)
+     colors = [
+         (255, 0, 0),    # Red
+         (0, 255, 0),    # Green
+         (0, 0, 255),    # Blue
+         (255, 255, 0),  # Yellow
+         (255, 0, 255),  # Magenta
+         (0, 255, 255),  # Cyan
+         (128, 0, 128),  # Purple
+         (255, 165, 0)   # Orange
+     ]
+
+     # ================== load the VQA score model ===================
+     clip_flant5_score = t2v_metrics.VQAScore(model='clip-flant5-xxl')
+
+     # ================== compute VQA scores ===================
+     all_scores = {}
+     for i in range(5):
+         vid_id, vid_result = scoreCaption(i, all_captions, all_valid_obj_ids, clip_flant5_score, False)
+         all_scores[vid_id] = vid_result
+
+     with open('mbench/numbered_captions_gpt-4o_final_scores.json', 'w', encoding='utf-8') as json_file:
+         json.dump(all_scores, json_file, indent=4, ensure_ascii=False)  # bug fix: json.dump requires the file object
+
+     print("JSON file saved successfully!")
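The script above writes a scores file that mirrors the captions file, keyed video id → category → frame → {"caption", "score"}. A minimal sketch of aggregating it per video and category; it assumes the scores were serialized as plain floats (if VQAScore returns tensors, they would need converting, e.g. with float(), before json.dump succeeds):

    import json

    with open('mbench/numbered_captions_gpt-4o_final_scores.json', 'r') as f:
        all_scores = json.load(f)

    for vid_id, cats in all_scores.items():
        for cat, frames in cats.items():
            vals = [r['score'] for r in frames.values() if r['score'] is not None]
            if vals:
                print(vid_id, cat, sum(vals) / len(vals))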
mbench/gpt_ref-ytvos-cy.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/gpt_ref-ytvos-revised.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/gpt_ref-ytvos_numbered.ipynb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fd89176d8bf426500d18caf6b5983b0765f147d17a6bb59f41c4edcaf3c3158
+ size 16214561
mbench/gpt_ref-ytvos_numbered_cy.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_captions.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_captions_gpt-4o.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_captions_gpt-4o_nomask_randcap2.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_valid_obj_ids_gpt-4o_final.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_valid_obj_ids_gpt-4o_nomask_randcap2.json ADDED
@@ -0,0 +1,2153 @@
+ {
+     "003234408d": {
+         "penguin": [
+             "1",
+             "2",
+             "3",
+             "4",
+             "5"
+         ]
+     },
+     "0043f083b5": {
+         "bus": [
+             "1"
+         ],
+         "sedan": [
+             "2",
+             "3"
+         ]
+     },
+     "0044fa5fba": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "005a527edd": {
+         "ape": [
+             "1",
+             "2"
+         ]
+     },
+     "0065b171f9": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "00917dcfc4": {
+         "zebra": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "00a23ccf53": {
+         "shark": [
+             "1"
+         ]
+     },
+     "00ad5016a4": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "01082ae388": {
+         "leopard": [
+             "1"
+         ]
+     },
+     "011ac0a06f": {
+         "ape": [
+             "1",
+             "2",
+             "3",
+             "4",
+             "5"
+         ]
+     },
+     "013099c098": {
+         "giant_panda": [
+             "1",
+             "2"
+         ]
+     },
+     "0155498c85": {
+         "person": [
+             "1"
+         ],
+         "motorbike": [
+             "2"
+         ]
+     },
+     "01694ad9c8": {
+         "bird": [
+             "1"
+         ]
+     },
+     "017ac35701": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "01b80e8e1a": {
+         "zebra": [
+             "1",
+             "2"
+         ]
+     },
+     "01baa5a4e1": {},
+     "01c3111683": {
+         "whale": [
+             "1"
+         ]
+     },
+     "01c4cb5ffe": {
+         "person": [
+             "1",
+             "3"
+         ]
+     },
+     "01c76f0a82": {
+         "sedan": [
+             "1",
+             "4"
+         ]
+     },
+     "01c783268c": {
+         "person": [
+             "2"
+         ],
+         "ape": [
+             "1"
+         ]
+     },
+     "01e64dd36a": {
+         "cow": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "01ed275c6e": {
+         "giraffe": [
+             "1",
+             "2"
+         ]
+     },
+     "01ff60d1fa": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "020cd28cd2": {
+         "person": [
+             "1"
+         ]
+     },
+     "02264db755": {
+         "fox": [
+             "1"
+         ]
+     },
+     "0248626d9a": {
+         "train": [
+             "1"
+         ]
+     },
+     "02668dbffa": {
+         "frog": [
+             "1"
+         ]
+     },
+     "0274193026": {
+         "person": [
+             "2"
+         ]
+     },
+     "02d28375aa": {
+         "fox": [
+             "1"
+         ]
+     },
+     "031ccc99b1": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0321b18c10": {
+         "elephant": [
+             "3"
+         ],
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "0348a45bca": {
+         "fish": [
+             "1",
+             "2",
+             "3",
+             "4",
+             "5"
+         ]
+     },
+     "0355e92655": {
+         "boat": [
+             "3"
+         ],
+         "person": [
+             "2"
+         ]
+     },
+     "0358b938c1": {
+         "elephant": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "0368107cf1": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "0379ddf557": {
+         "person": [
+             "1"
+         ]
+     },
+     "038b2cc71d": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "038c15a5dd": {
+         "hedgehog": [
+             "1"
+         ]
+     },
+     "03a06cc98a": {
+         "giraffe": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "03a63e187f": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "03c95b4dae": {
+         "elephant": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "03e2b57b0e": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "04194e1248": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "04259896e2": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "0444918a5f": {
+         "truck": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "04460a7a52": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "04474174a4": {
+         "ape": [
+             "1",
+             "2"
+         ]
+     },
+     "0450095513": {
+         "snail": [
+             "1"
+         ]
+     },
+     "045f00aed2": {
+         "tiger": [
+             "1"
+         ],
+         "person": [
+             "3"
+         ]
+     },
+     "04667fabaa": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "04735c5030": {
+         "cat": [
+             "1",
+             "2"
+         ]
+     },
+     "04990d1915": {
+         "sedan": [
+             "1"
+         ],
+         "truck": [
+             "3"
+         ],
+         "bus": [
+             "2"
+         ]
+     },
+     "04d62d9d98": {
+         "person": [
+             "1"
+         ]
+     },
+     "04f21da964": {
+         "monkey": [
+             "1"
+         ]
+     },
+     "04fbad476e": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "04fe256562": {
+         "truck": [
+             "2"
+         ],
+         "motorbike": [
+             "1"
+         ]
+     },
+     "0503bf89c9": {
+         "hedgehog": [
+             "1"
+         ]
+     },
+     "0536c9eed0": {
+         "cat": [
+             "1"
+         ]
+     },
+     "054acb238f": {
+         "owl": [
+             "1"
+         ]
+     },
+     "05579ca250": {
+         "sedan": [
+             "3"
+         ],
+         "person": [
+             "1"
+         ]
+     },
+     "056c200404": {},
+     "05774f3a2c": {
+         "ape": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "058a7592c8": {
+         "train": [
+             "1"
+         ]
+     },
+     "05a0a513df": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "05a569d8aa": {
+         "cat": [
+             "1"
+         ],
+         "mouse": [
+             "2"
+         ]
+     },
+     "05aa652648": {
+         "ape": [
+             "1"
+         ]
+     },
+     "05d7715782": {},
+     "05e0b0f28f": {
+         "person": [
+             "2"
+         ],
+         "mouse": [
+             "1"
+         ]
+     },
+     "05fdbbdd7a": {},
+     "05ffcfed85": {
+         "monkey": [
+             "1",
+             "2"
+         ]
+     },
+     "0630391881": {
+         "person": [
+             "1"
+         ]
+     },
+     "06840b2bbe": {
+         "snake": [
+             "1"
+         ]
+     },
+     "068f7dce6f": {
+         "shark": [
+             "1"
+         ]
+     },
+     "0693719753": {
+         "turtle": [
+             "1",
+             "2"
+         ]
+     },
+     "06ce2b51fb": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "06e224798e": {
+         "tiger": [
+             "1"
+         ]
+     },
+     "06ee361788": {
+         "duck": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "06fbb3fa2c": {
+         "eagle": [
+             "1"
+         ]
+     },
+     "0700264286": {
+         "cow": [
+             "1",
+             "2"
+         ]
+     },
+     "070c918ca7": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "07129e14a4": {
+         "person": [
+             "3"
+         ],
+         "parrot": [
+             "1",
+             "2"
+         ]
+     },
+     "07177017e9": {
+         "motorbike": [
+             "1",
+             "2"
+         ]
+     },
+     "07238ffc58": {
+         "monkey": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "07353b2a89": {
+         "sheep": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "0738493cbf": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "075926c651": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "075c701292": {
+         "duck": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "0762ea9a30": {
+         "person": [
+             "1"
+         ]
+     },
+     "07652ee4af": {
+         "person": [
+             "1"
+         ]
+     },
+     "076f206928": {
+         "person": [
+             "3"
+         ],
+         "zebra": [
+             "1",
+             "2"
+         ]
+     },
+     "077d32af19": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ],
+         "train": [
+             "4"
+         ]
+     },
+     "079049275c": {
+         "mouse": [
+             "1"
+         ]
+     },
+     "07913cdda7": {
+         "person": [
+             "2",
+             "3"
+         ],
+         "train": [
+             "1"
+         ]
+     },
+     "07a11a35e8": {
+         "ape": [
+             "1",
+             "2"
+         ]
+     },
+     "07ac33b6df": {
+         "ape": [
+             "1"
+         ]
+     },
+     "07c62c3d11": {
+         "parrot": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "07cc1c7d74": {
+         "snake": [
+             "1"
+         ]
+     },
+     "080196ef01": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "081207976e": {},
+     "081ae4fa44": {
+         "shark": [
+             "1",
+             "2"
+         ]
+     },
+     "081d8250cb": {
+         "sedan": [
+             "3"
+         ],
+         "person": [
+             "1"
+         ]
+     },
+     "082900c5d4": {
+         "duck": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0860df21e2": {},
+     "0866d4c5e3": {
+         "bird": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0891ac2eb6": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "08931bc458": {
+         "person": [
+             "1"
+         ]
+     },
+     "08aa2705d5": {
+         "snake": [
+             "1"
+         ]
+     },
+     "08c8450db7": {},
+     "08d50b926c": {
+         "turtle": [
+             "1",
+             "2"
+         ]
+     },
+     "08e1e4de15": {
+         "monkey": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "08e48c1a48": {
+         "cow": [
+             "1"
+         ]
+     },
+     "08f561c65e": {
+         "person": [
+             "2"
+         ],
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "08feb87790": {
+         "sheep": [
+             "1"
+         ]
+     },
+     "09049f6fe3": {
+         "mouse": [
+             "1",
+             "2"
+         ]
+     },
+     "092e4ff450": {
+         "snake": [
+             "1"
+         ]
+     },
+     "09338adea8": {
+         "whale": [
+             "1",
+             "2"
+         ]
+     },
+     "093c335ccc": {
+         "person": [
+             "2"
+         ]
+     },
+     "0970d28339": {
+         "ape": [
+             "1",
+             "2"
+         ]
+     },
+     "0974a213dc": {
+         "giraffe": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "097b471ed8": {
+         "cat": [
+             "1",
+             "2"
+         ]
+     },
+     "0990941758": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "09a348f4fa": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "09a6841288": {
+         "duck": [
+             "1",
+             "2"
+         ]
+     },
+     "09c5bad17b": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "09c9ce80c7": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "09ff54fef4": {
+         "fox": [
+             "1",
+             "2"
+         ]
+     },
+     "0a23765d15": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "0a275e7f12": {
+         "elephant": [
+             "1"
+         ]
+     },
+     "0a2f2bd294": {
+         "motorbike": [
+             "1"
+         ]
+     },
+     "0a7a2514aa": {
+         "lizard": [
+             "2"
+         ],
+         "cat": [
+             "1"
+         ]
+     },
+     "0a7b27fde9": {
+         "parrot": [
+             "1",
+             "2"
+         ]
+     },
+     "0a8c467cc3": {
+         "fish": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0ac8c560ae": {
+         "person": [
+             "2",
+             "3"
+         ]
+     },
+     "0b1627e896": {
+         "boat": [
+             "1"
+         ]
+     },
+     "0b285c47f6": {
+         "mouse": [
+             "1"
+         ]
+     },
+     "0b34ec1d55": {
+         "ape": [
+             "1"
+         ]
+     },
+     "0b5b5e8e5a": {
+         "sedan": [
+             "2"
+         ],
+         "person": [
+             "1"
+         ]
+     },
+     "0b68535614": {
+         "rabbit": [
+             "1"
+         ]
+     },
+     "0b6f9105fc": {
+         "rabbit": [
+             "1"
+         ]
+     },
+     "0b7dbfa3cb": {
+         "cow": [
+             "1"
+         ]
+     },
+     "0b9cea51ca": {
+         "whale": [
+             "1"
+         ]
+     },
+     "0b9d012be8": {
+         "camel": [
+             "1"
+         ]
+     },
+     "0bcfc4177d": {
+         "truck": [
+             "1"
+         ]
+     },
+     "0bd37b23c1": {
+         "motorbike": [
+             "1"
+         ]
+     },
+     "0bd864064c": {
+         "eagle": [
+             "1"
+         ]
+     },
+     "0c11c6bf7b": {
+         "deer": [
+             "1"
+         ]
+     },
+     "0c26bc77ac": {
+         "crocodile": [
+             "1"
+         ]
+     },
+     "0c3a04798c": {
+         "duck": [
+             "1"
+         ],
+         "fish": [
+             "2"
+         ]
+     },
+     "0c44a9d545": {
+         "tiger": [
+             "1"
+         ]
+     },
+     "0c817cc390": {
+         "dog": [
+             "2"
+         ],
+         "hedgehog": [
+             "1"
+         ]
+     },
+     "0ca839ee9a": {
+         "ape": [
+             "1",
+             "2"
+         ]
+     },
+     "0cd7ac0ac0": {
+         "rabbit": [
+             "1"
+         ]
+     },
+     "0ce06e0121": {
+         "parrot": [
+             "1",
+             "2"
+         ]
+     },
+     "0cfe974a89": {
+         "turtle": [
+             "1",
+             "2"
+         ]
+     },
+     "0d2fcc0dcd": {
+         "zebra": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "0d3aad05d2": {
+         "person": [
+             "1"
+         ]
+     },
+     "0d40b015f4": {
+         "person": [
+             "1"
+         ]
+     },
+     "0d97fba242": {
+         "person": [
+             "2"
+         ],
+         "dog": [
+             "1"
+         ]
+     },
+     "0d9cc80d7e": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0dab85b6d3": {
+         "lizard": [
+             "1",
+             "2"
+         ]
+     },
+     "0db5c427a5": {
+         "train": [
+             "1"
+         ]
+     },
+     "0dbaf284f1": {
+         "cat": [
+             "1",
+             "2"
+         ]
+     },
+     "0de4923598": {},
+     "0df28a9101": {
+         "turtle": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0e04f636c4": {
+         "frog": [
+             "1"
+         ]
+     },
+     "0e05f0e232": {
+         "lizard": [
+             "1",
+             "2"
+         ]
+     },
+     "0e0930474b": {
+         "sedan": [
+             "1"
+         ],
+         "person": [
+             "2",
+             "3"
+         ]
+     },
+     "0e27472bea": {
+         "turtle": [
+             "1"
+         ]
+     },
+     "0e30020549": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "0e621feb6c": {
+         "lizard": [
+             "1",
+             "2"
+         ]
+     },
+     "0e803c7d73": {},
+     "0e9ebe4e3c": {
+         "truck": [
+             "1"
+         ]
+     },
+     "0e9f2785ec": {
+         "person": [
+             "2"
+         ]
+     },
+     "0ea68d418b": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "0eb403a222": {},
+     "0ee92053d6": {
+         "person": [
+             "1"
+         ]
+     },
+     "0eefca067f": {
+         "giant_panda": [
+             "1",
+             "2"
+         ]
+     },
+     "0f17fa6fcb": {
+         "duck": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0f1ac8e9a3": {
+         "frog": [
+             "1"
+         ]
+     },
+     "0f202e9852": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "0f2ab8b1ff": {
+         "dolphin": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0f51a78756": {
+         "sheep": [
+             "1"
+         ]
+     },
+     "0f5fbe16b0": {
+         "raccoon": [
+             "1",
+             "2"
+         ]
+     },
+     "0f6072077b": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0f6b69b2f4": {
+         "rabbit": [
+             "1"
+         ]
+     },
+     "0f6c2163de": {
+         "snail": [
+             "1"
+         ]
+     },
+     "0f74ec5599": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "0f9683715b": {
+         "elephant": [
+             "1"
+         ]
+     },
+     "0fa7b59356": {
+         "duck": [
+             "1"
+         ]
+     },
+     "0fb173695b": {
+         "person": [
+             "3"
+         ]
+     },
+     "0fc958cde2": {
+         "owl": [
+             "1"
+         ]
+     },
+     "0fe7b1a621": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "0ffcdb491c": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "101caff7d4": {
+         "giant_panda": [
+             "1",
+             "2"
+         ]
+     },
+     "1022fe8417": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "1032e80b37": {
+         "giraffe": [
+             "1"
+         ]
+     },
+     "103f501680": {
+         "fish": [
+             "1"
+         ]
+     },
+     "104e64565f": {
+         "elephant": [
+             "1"
+         ]
+     },
+     "104f1ab997": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "106242403f": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "10b31f5431": {
+         "person": [
+             "1",
+             "3",
+             "4"
+         ]
+     },
+     "10eced835e": {
+         "giant_panda": [
+             "1",
+             "2"
+         ]
+     },
+     "110d26fa3a": {
+         "shark": [
+             "1"
+         ]
+     },
+     "1122c1d16a": {
+         "person": [
+             "6"
+         ],
+         "parrot": [
+             "1",
+             "2",
+             "3",
+             "4",
+             "5"
+         ]
+     },
+     "1145b49a5f": {
+         "rabbit": [
+             "1"
+         ]
+     },
+     "11485838c2": {
+         "giraffe": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "114e7676ec": {
+         "person": [
+             "1"
+         ]
+     },
+     "1157472b95": {
+         "parrot": [
+             "1",
+             "2"
+         ]
+     },
+     "115ee1072c": {
+         "cow": [
+             "1"
+         ]
+     },
+     "1171141012": {
+         "person": [
+             "2"
+         ],
+         "turtle": [
+             "1"
+         ]
+     },
+     "117757b4b8": {
+         "snail": [
+             "1"
+         ]
+     },
+     "1178932d2f": {
+         "person": [
+             "1",
+             "2"
+         ],
+         "motorbike": [
+             "3"
+         ]
+     },
+     "117cc76bda": {
+         "whale": [
+             "1"
+         ]
+     },
+     "1180cbf814": {
+         "fish": [
+             "1",
+             "2"
+         ]
+     },
+     "1187bbd0e3": {
+         "cat": [
+             "1"
+         ]
+     },
+     "1197e44b26": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "119cf20728": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "119dd54871": {
+         "lion": [
+             "1",
+             "2"
+         ]
+     },
+     "11a0c3b724": {
+         "mouse": [
+             "1",
+             "2"
+         ]
+     },
+     "11a6ba8c94": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "11c722a456": {
+         "turtle": [
+             "1",
+             "2"
+         ]
+     },
+     "11cbcb0b4d": {
+         "zebra": [
+             "1"
+         ]
+     },
+     "11ccf5e99d": {
+         "person": [
+             "2"
+         ]
+     },
+     "11ce6f452e": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "11feabe596": {
+         "rabbit": [
+             "1"
+         ]
+     },
+     "120cb9514d": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "12156b25b3": {
+         "person": [
+             "1"
+         ]
+     },
+     "122896672d": {
+         "person": [
+             "1",
+             "3"
+         ]
+     },
+     "1233ac8596": {
+         "dog": [
+             "1"
+         ]
+     },
+     "1239c87234": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "1250423f7c": {
+         "elephant": [
+             "3",
+             "4"
+         ],
+         "person": [
+             "2"
+         ]
+     },
+     "1257a1bc67": {
+         "snake": [
+             "1"
+         ]
+     },
+     "125d1b19dd": {
+         "giant_panda": [
+             "1",
+             "2"
+         ]
+     },
+     "126d203967": {
+         "person": [
+             "2"
+         ]
+     },
+     "1295e19071": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "12ad198c54": {
+         "person": [
+             "1"
+         ]
+     },
+     "12bddb2bcb": {
+         "person": [
+             "2"
+         ]
+     },
+     "12ec9b93ee": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "12eebedc35": {
+         "bird": [
+             "1"
+         ]
+     },
+     "132852e094": {
+         "fox": [
+             "1"
+         ]
+     },
+     "1329409f2a": {
+         "fish": [
+             "1"
+         ]
+     },
+     "13325cfa14": {
+         "person": [
+             "2"
+         ]
+     },
+     "1336440745": {
+         "mouse": [
+             "1",
+             "2"
+         ]
+     },
+     "134d06dbf9": {
+         "cat": [
+             "1"
+         ]
+     },
+     "135625b53d": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "13870016f9": {
+         "person": [
+             "1"
+         ],
+         "cow": [
+             "2",
+             "3"
+         ]
+     },
+     "13960b3c84": {
+         "giraffe": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "13adaad9d9": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "13ae097e20": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "13e3070469": {
+         "zebra": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "13f6a8c20d": {
+         "fish": [
+             "1"
+         ]
+     },
+     "1416925cf2": {
+         "truck": [
+             "1",
+             "2"
+         ]
+     },
+     "142d2621f5": {
+         "person": [
+             "1",
+             "2"
+         ],
+         "motorbike": [
+             "3"
+         ]
+     },
+     "145d5d7c03": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "145fdc3ac5": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "1471274fa7": {
+         "person": [
+             "1"
+         ]
+     },
+     "14a6b5a139": {
+         "fish": [
+             "1"
+         ]
+     },
+     "14c21cea0d": {
+         "monkey": [
+             "1",
+             "2"
+         ]
+     },
+     "14dae0dc93": {
+         "person": [
+             "2"
+         ]
+     },
+     "14f9bd22b5": {
+         "tiger": [
+             "1"
+         ]
+     },
+     "14fd28ae99": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "15097d5d4e": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "150ea711f2": {
+         "whale": [
+             "1"
+         ]
+     },
+     "1514e3563f": {
+         "earless_seal": [
+             "1",
+             "2"
+         ]
+     },
+     "152aaa3a9e": {
+         "raccoon": [
+             "1"
+         ]
+     },
+     "152b7d3bd7": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "15617297cc": {
+         "person": [
+             "1"
+         ]
+     },
+     "15abbe0c52": {
+         "person": [
+             "1"
+         ]
+     },
+     "15d1fb3de5": {
+         "owl": [
+             "1"
+         ],
+         "cat": [
+             "2"
+         ]
+     },
+     "15f67b0fab": {
+         "person": [
+             "1"
+         ]
+     },
+     "161eb59aad": {
+         "giraffe": [
+             "1"
+         ],
+         "cow": [
+             "2",
+             "3"
+         ]
+     },
+     "16288ea47f": {
+         "duck": [
+             "1",
+             "2"
+         ]
+     },
+     "164410ce62": {
+         "person": [
+             "1"
+         ]
+     },
+     "165c3c8cd4": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "165c42b41b": {
+         "person": [
+             "1",
+             "4"
+         ],
+         "motorbike": [
+             "2",
+             "3"
+         ]
+     },
+     "165ec9e22b": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "1669502269": {
+         "person": [
+             "1"
+         ]
+     },
+     "16763cccbb": {
+         "ape": [
+             "1"
+         ]
+     },
+     "16adde065e": {
+         "person": [
+             "3"
+         ],
+         "cat": [
+             "2"
+         ]
+     },
+     "16af445362": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "16afd538ad": {
+         "parrot": [
+             "1",
+             "2"
+         ]
+     },
+     "16c3fa4d5d": {
+         "sedan": [
+             "1"
+         ]
+     },
+     "16d1d65c27": {
+         "monkey": [
+             "1"
+         ]
+     },
+     "16e8599e94": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "16fe9fb444": {
+         "person": [
+             "2"
+         ],
+         "motorbike": [
+             "1"
+         ]
+     },
+     "1705796b02": {
+         "train": [
+             "1"
+         ]
+     },
+     "1724db7671": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "17418e81ea": {
+         "shark": [
+             "1"
+         ]
+     },
+     "175169edbb": {
+         "ape": [
+             "1",
+             "2"
+         ]
+     },
+     "17622326fd": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "17656bae77": {
+         "elephant": [
+             "1"
+         ]
+     },
+     "17b0d94172": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "17c220e4f6": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "17c7bcd146": {
+         "train": [
+             "1"
+         ]
+     },
+     "17cb4afe89": {
+         "tiger": [
+             "1"
+         ]
+     },
+     "17cd79a434": {
+         "squirrel": [
+             "1"
+         ]
+     },
+     "17d18604c3": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "17d8ca1a37": {
+         "owl": [
+             "1"
+         ],
+         "person": [
+             "2"
+         ]
+     },
+     "17e33f4330": {
+         "monkey": [
+             "1"
+         ]
+     },
+     "17f7a6d805": {
+         "snail": [
+             "1"
+         ]
+     },
+     "180abc8378": {
+         "owl": [
+             "1"
+         ],
+         "person": [
+             "2"
+         ]
+     },
+     "183ba3d652": {
+         "motorbike": [
+             "3"
+         ],
+         "person": [
+             "2"
+         ]
+     },
+     "185bf64702": {
+         "zebra": [
+             "1",
+             "2"
+         ]
+     },
+     "18913cc690": {
+         "train": [
+             "1"
+         ]
+     },
+     "1892651815": {
+         "camel": [
+             "1"
+         ]
+     },
+     "189ac8208a": {
+         "giraffe": [
+             "1",
+             "2"
+         ]
+     },
+     "189b44e92c": {
+         "zebra": [
+             "1"
+         ]
+     },
+     "18ac264b76": {
+         "person": [
+             "2"
+         ]
+     },
+     "18b245ab49": {
+         "penguin": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "18b5cebc34": {
+         "mouse": [
+             "1"
+         ]
+     },
+     "18bad52083": {
+         "parrot": [
+             "1",
+             "2"
+         ]
+     },
+     "18bb5144d5": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "18c6f205c5": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "1903f9ea15": {
+         "bird": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "1917b209f2": {
+         "person": [
+             "1"
+         ],
+         "cow": [
+             "3",
+             "4"
+         ],
+         "horse": [
+             "2"
+         ]
+     },
+     "191e74c01d": {
+         "deer": [
+             "1"
+         ]
+     },
+     "19367bb94e": {
+         "fish": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "193ffaa217": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "19696b67d3": {
+         "cow": [
+             "1"
+         ]
+     },
+     "197f3ab6f3": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "1981e763cc": {
+         "sheep": [
+             "1",
+             "2"
+         ]
+     },
+     "198afe39ae": {
+         "person": [
+             "1"
+         ]
+     },
+     "19a6e62b9b": {
+         "monkey": [
+             "1",
+             "2"
+         ]
+     },
+     "19b60d5335": {
+         "hedgehog": [
+             "1"
+         ]
+     },
+     "19c00c11f9": {
+         "person": [
+             "1"
+         ]
+     },
+     "19e061eb88": {
+         "boat": [
+             "1",
+             "2"
+         ]
+     },
+     "19e8bc6178": {
+         "dog": [
+             "1"
+         ]
+     },
+     "19ee80dac6": {
+         "person": [
+             "1",
+             "3",
+             "4"
+         ]
+     },
+     "1a25a9170a": {
+         "cow": [
+             "1"
+         ],
+         "person": [
+             "2",
+             "3"
+         ]
+     },
+     "1a359a6c1a": {
+         "sheep": [
+             "1"
+         ]
+     },
+     "1a3e87c566": {
+         "frog": [
+             "1"
+         ]
+     },
+     "1a5fe06b00": {
+         "bus": [
+             "1"
+         ]
+     },
+     "1a6c0fbd1e": {
+         "person": [
+             "1"
+         ]
+     },
+     "1a6f3b5a4b": {
+         "sedan": [
+             "3"
+         ]
+     },
+     "1a8afbad92": {
+         "zebra": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "1a8bdc5842": {
+         "parrot": [
+             "1",
+             "2"
+         ]
+     },
+     "1a95752aca": {
+         "duck": [
+             "1",
+             "2"
+         ]
+     },
+     "1a9c131cb7": {
+         "ape": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "1aa3da3ee3": {
+         "sheep": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "1ab27ec7ea": {
+         "deer": [
+             "1"
+         ]
+     },
+     "1abf16d21d": {
+         "turtle": [
+             "1"
+         ]
+     },
+     "1acd0f993b": {
+         "dog": [
+             "1"
+         ],
+         "person": [
+             "3"
+         ]
+     },
+     "1ad202e499": {
+         "lizard": [
+             "1",
+             "2"
+         ]
+     },
+     "1af8d2395d": {
+         "person": [
+             "1",
+             "2"
+         ],
+         "airplane": [
+             "4"
+         ]
+     },
+     "1afd39a1fa": {
+         "motorbike": [
+             "2"
+         ]
+     },
+     "1b2d31306f": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "1b3fa67f0e": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "1b43fa74b4": {
+         "owl": [
+             "1",
+             "2"
+         ]
+     },
+     "1b73ea9fc2": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "1b7e8bb255": {
+         "person": [
+             "2"
+         ]
+     },
+     "1b8680f8cd": {
+         "person": [
+             "2",
+             "3"
+         ]
+     },
+     "1b883843c0": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "1b8898785b": {
+         "monkey": [
+             "1",
+             "2"
+         ]
+     },
+     "1b88ba1aa4": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "1b96a498e5": {
+         "ape": [
+             "1"
+         ]
+     },
+     "1bbc4c274f": {
+         "fish": [
+             "2"
+         ]
+     },
+     "1bd87fe9ab": {
+         "train": [
+             "1"
+         ]
+     },
+     "1c4090c75b": {
+         "whale": [
+             "1"
+         ]
+     },
+     "1c41934f84": {
+         "elephant": [
+             "1",
+             "2"
+         ]
+     },
+     "1c72b04b56": {
+         "lion": [
+             "1"
+         ]
+     },
+     "1c87955a3a": {
+         "crocodile": [
+             "1"
+         ],
+         "turtle": [
+             "2"
+         ]
+     },
+     "1c9f9eb792": {
+         "person": [
+             "2"
+         ]
+     },
+     "1ca240fede": {
+         "train": [
+             "1"
+         ]
+     },
+     "1ca5673803": {
+         "person": [
+             "1",
+             "3"
+         ]
+     },
+     "1cada35274": {
+         "duck": [
+             "1"
+         ]
+     },
+     "1cb44b920d": {
+         "eagle": [
+             "1",
+             "2"
+         ]
+     },
+     "1cd10e62be": {
+         "leopard": [
+             "1"
+         ]
+     },
+     "1d3087d5e5": {
+         "fish": [
+             "1",
+             "2",
+             "3",
+             "4",
+             "5"
+         ]
+     },
+     "1d3685150a": {
+         "person": [
+             "1",
+             "3"
+         ]
+     },
+     "1d6ff083aa": {
+         "person": [
+             "1",
+             "2"
+         ]
+     }
+ }
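The file above maps video id → category → the mask object ids (as strings) judged referable; empty objects mark videos with no usable instances. A minimal sketch of filtering an annotation mask with it, mirroring how scoreCaption consumes the same structure; the mask path and the example video/category are illustrative only:

    import json
    import numpy as np
    from PIL import Image

    with open('mbench/numbered_valid_obj_ids_gpt-4o_nomask_randcap2.json', 'r') as f:
        valid = json.load(f)

    vid_id, cat = '003234408d', 'penguin'  # example entry from the file above
    mask = np.array(Image.open('path/to/Annotations/frame.png').convert('P'))  # placeholder path
    keep = [int(i) for i in valid.get(vid_id, {}).get(cat, [])]
    filtered = np.where(np.isin(mask, keep), mask, 0)  # zero out ids not judged valid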
mbench/sampled_frame.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ac6df555665b2f0cc411641ce023ac10565ea7e8a5c0586c4a9e775481bca62
+ size 17415938
mbench/sampled_frame2.json ADDED
The diff for this file is too large to render. See raw diff