dianecy committed (verified)
Commit 2c58401 · Parent(s): 3ec4928

Add files using upload-large-folder tool
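For context, a commit like this one is usually produced with the huggingface_hub large-folder uploader, which splits a big local directory into resumable, chunked commits. A minimal sketch, assuming huggingface_hub >= 0.25 and a hypothetical repo id and local path:

    from huggingface_hub import HfApi

    api = HfApi()  # token is read from HF_TOKEN or the local login cache
    # upload_large_folder shards the folder into commits and resumes if interrupted
    api.upload_large_folder(
        repo_id="dianecy/example-repo",  # hypothetical repo id
        repo_type="model",
        folder_path="./local_folder",    # hypothetical local path
    )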

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +7 -0
  2. .history/datasets/__init___20241227174300.py +37 -0
  3. .history/datasets/ytvos_ref_20250113130043.py +0 -0
  4. .history/datasets/ytvos_ref_20250116073805.py +239 -0
  5. .history/mbench/gpt_ref-ytvos-cy_20250121155719.py +428 -0
  6. .history/mbench/gpt_ref-ytvos_20250119070039.py +277 -0
  7. .history/mbench/gpt_ref-ytvos_20250119070740.py +285 -0
  8. .history/mbench/gpt_ref-ytvos_20250119071412.py +292 -0
  9. .history/mbench/gpt_ref-ytvos_20250119072601.py +292 -0
  10. .history/mbench/gpt_ref-ytvos_20250119073047.py +292 -0
  11. .history/mbench/gpt_ref-ytvos_numbered_cy_20250131124149.py +427 -0
  12. .history/mbench/gpt_ref-ytvos_numbered_cy_20250201141952.py +460 -0
  13. .history/mbench/gpt_ref-ytvos_numbered_cy_20250202183102.py +460 -0
  14. .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207172804.py +656 -0
  15. .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207173210.py +656 -0
  16. .history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207173355.py +677 -0
  17. .history/mbench/make_ref-ytvos_json_20250117032501.py +104 -0
  18. .history/mbench/make_ref-ytvos_json_20250117072314.py +107 -0
  19. .history/mbench_a2d/gpt_a2d_numbered_20250206114207.py +205 -0
  20. __pycache__/opts.cpython-310.pyc +0 -0
  21. __pycache__/opts.cpython-39.pyc +0 -0
  22. __pycache__/refer.cpython-39.pyc +0 -0
  23. davis2017/davis.py +122 -0
  24. docs/davis_demo1.gif +3 -0
  25. docs/davis_demo2.gif +3 -0
  26. docs/install.md +42 -0
  27. docs/network.png +3 -0
  28. docs/ytvos_demo1.gif +3 -0
  29. docs/ytvos_demo2.gif +3 -0
  30. hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/e14a3254bf04f32056759bdc60c64736e7638f31b43957586ff2442ff393890a.lock +0 -0
  31. hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/pytorch_model-00002-of-00003.bin +3 -0
  32. make_ref-ytvos/manual_selection.ipynb +381 -0
  33. make_refcoco/refcocog_google/multi_object_data_gref_google.json +0 -0
  34. make_refcoco/refcocog_google/needrevision_refid_part4.json +506 -0
  35. make_refcoco/refcocog_umd/needrevision_refid_part4.json +498 -0
  36. mbench/__pycache__/__init__.cpython-310.pyc +0 -0
  37. mbench/__pycache__/ytvos_ref.cpython-310.pyc +0 -0
  38. mbench/check_image_numbered_cy.ipynb +0 -0
  39. mbench/check_image_numbered_cy_score.py +212 -0
  40. mbench/gpt_ref-ytvos-cy.ipynb +0 -0
  41. mbench/gpt_ref-ytvos-revised.ipynb +0 -0
  42. mbench/gpt_ref-ytvos_numbered.ipynb +3 -0
  43. mbench/gpt_ref-ytvos_numbered_cy.ipynb +0 -0
  44. mbench/numbered_captions.json +0 -0
  45. mbench/numbered_captions_gpt-4o.json +0 -0
  46. mbench/numbered_captions_gpt-4o_nomask_randcap2.json +0 -0
  47. mbench/numbered_valid_obj_ids_gpt-4o_final.json +0 -0
  48. mbench/numbered_valid_obj_ids_gpt-4o_nomask_randcap2.json +2153 -0
  49. mbench/sampled_frame.json +3 -0
  50. mbench/sampled_frame2.json +0 -0
.gitattributes CHANGED
@@ -47,3 +47,10 @@ LAVT-RIS/refer/data/refcocog/refs(google).p filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/data/refcocog/refs(umd).p filter=lfs diff=lfs merge=lfs -text
 LAVT-RIS/refer/evaluation/tokenizer/stanford-corenlp-3.4.1.jar filter=lfs diff=lfs merge=lfs -text
 hf_cache/models--zhiqiulin--clip-flant5-xxl/blobs/12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b filter=lfs diff=lfs merge=lfs -text
+docs/davis_demo2.gif filter=lfs diff=lfs merge=lfs -text
+mbench/gpt_ref-ytvos_numbered.ipynb filter=lfs diff=lfs merge=lfs -text
+docs/ytvos_demo2.gif filter=lfs diff=lfs merge=lfs -text
+mbench/sampled_frame.json filter=lfs diff=lfs merge=lfs -text
+docs/network.png filter=lfs diff=lfs merge=lfs -text
+docs/ytvos_demo1.gif filter=lfs diff=lfs merge=lfs -text
+docs/davis_demo1.gif filter=lfs diff=lfs merge=lfs -text
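Each line added above routes one more path through Git LFS, so the binary payload (GIFs, a PNG, a notebook, a large JSON) is stored as an LFS pointer instead of being committed into the git history directly. Entries of this form are what a command like git lfs track "docs/*.gif" appends to .gitattributes; here they appear to have been written per-file by the upload tool.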
.history/datasets/__init___20241227174300.py ADDED
@@ -0,0 +1,37 @@
+import torch.utils.data
+import torchvision
+
+from .ytvos import build as build_ytvos
+from .davis import build as build_davis
+from .a2d import build as build_a2d
+from .jhmdb import build as build_jhmdb
+from .refexp import build as build_refexp
+from .concat_dataset import build as build_joint
+
+
+def get_coco_api_from_dataset(dataset):
+    for _ in range(10):
+        # if isinstance(dataset, torchvision.datasets.CocoDetection):
+        #     break
+        if isinstance(dataset, torch.utils.data.Subset):
+            dataset = dataset.dataset
+    if isinstance(dataset, torchvision.datasets.CocoDetection):
+        return dataset.coco
+
+
+def build_dataset(dataset_file: str, image_set: str, args):
+    if dataset_file == 'ytvos':
+        return build_ytvos(image_set, args)
+    if dataset_file == 'davis':
+        return build_davis(image_set, args)
+    if dataset_file == 'a2d':
+        return build_a2d(image_set, args)
+    if dataset_file == 'jhmdb':
+        return build_jhmdb(image_set, args)
+    # for pretraining
+    if dataset_file == "refcoco" or dataset_file == "refcoco+" or dataset_file == "refcocog":
+        return build_refexp(dataset_file, image_set, args)
+    # for joint training of refcoco and ytvos
+    if dataset_file == 'joint':
+        return build_joint(image_set, args)
+    raise ValueError(f'dataset {dataset_file} not supported')
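build_dataset is a plain dispatch on dataset_file, so downstream training code can stay dataset-agnostic. A minimal usage sketch, assuming an args namespace; the exact fields (paths, num_frames, max_skip, masks) are defined in opts.py and are only illustrative here:

    from argparse import Namespace
    from datasets import build_dataset

    # hypothetical values; the real defaults live in opts.py
    args = Namespace(ytvos_path='data/ref-youtube-vos', masks=True,
                     num_frames=5, max_skip=3)
    train_ds = build_dataset('ytvos', image_set='train', args=args)
    print(len(train_ds))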
.history/datasets/ytvos_ref_20250113130043.py ADDED
File without changes
.history/datasets/ytvos_ref_20250116073805.py ADDED
@@ -0,0 +1,239 @@
+"""
+Ref-YoutubeVOS data loader
+"""
+from pathlib import Path
+
+import torch
+from torch.autograd.grad_mode import F
+from torch.utils.data import Dataset
+import datasets.transforms_video as T
+
+import os
+from PIL import Image
+import json
+import numpy as np
+import random
+
+from datasets.categories import ytvos_category_dict as category_dict
+
+
+class YTVOSDataset(Dataset):
+    """
+    A dataset class for the Refer-Youtube-VOS dataset, first introduced in the paper:
+    "URVOS: Unified Referring Video Object Segmentation Network with a Large-Scale Benchmark"
+    (see https://link.springer.com/content/pdf/10.1007/978-3-030-58555-6_13.pdf).
+    The original release of the dataset contained both 'first-frame' and 'full-video' expressions. However, the
+    first subset is no longer publicly available; only the harder 'full-video' subset can now be downloaded
+    through the Youtube-VOS referring video object segmentation competition page at:
+    https://competitions.codalab.org/competitions/29139
+    Furthermore, for the competition the subset's original validation set, which consists of 507 videos, was split into
+    two competition 'validation' & 'test' subsets, consisting of 202 and 305 videos respectively. Evaluation can
+    currently only be done on the competition 'validation' subset using the competition's server, as
+    annotations were publicly released only for the 'train' subset of the competition.
+    """
+    def __init__(self, img_folder: Path, ann_file: Path, transforms, return_masks: bool,
+                 num_frames: int, max_skip: int):
+        self.img_folder = img_folder
+        self.ann_file = ann_file
+        self._transforms = transforms
+        self.return_masks = return_masks  # not used
+        self.num_frames = num_frames
+        self.max_skip = max_skip
+        # create video meta data
+        self.prepare_metas()
+
+        print('\n video num: ', len(self.videos), ' clip num: ', len(self.metas))
+        print('\n')
+
+    def prepare_metas(self):
+        # read object information
+        with open(os.path.join(str(self.img_folder), 'meta.json'), 'r') as f:
+            subset_metas_by_video = json.load(f)['videos']
+
+        # read expression data
+        with open(str(self.ann_file), 'r') as f:
+            subset_expressions_by_video = json.load(f)['videos']
+        self.videos = list(subset_expressions_by_video.keys())
+
+        self.metas = []
+        skip_vid_count = 0
+
+        for vid in self.videos:
+            vid_meta = subset_metas_by_video[vid]
+            vid_data = subset_expressions_by_video[vid]
+            vid_frames = sorted(vid_data['frames'])
+            vid_len = len(vid_frames)
+
+            if vid_len < 11:
+                # print(f"Too short video: {vid} with frame length {vid_len}")
+                skip_vid_count += 1
+                continue
+
+            # Exclude start_idx (0, 1) and end_idx (vid_len-1, vid_len-2)
+            start_idx, end_idx = 2, vid_len - 2
+            bin_size = (end_idx - start_idx) // 4
+
+            bins = []
+            for i in range(4):
+                bin_start = start_idx + i * bin_size
+                bin_end = bin_start + bin_size if i < 3 else end_idx
+                bins.append((bin_start, bin_end))
+
+            # Randomly sample one frame from each bin
+            sample_indx = []
+            for start_idx, end_idx in bins:
+                sample_indx.append(random.randint(start_idx, end_idx - 1))
+            sample_indx.sort()  # ensure indices are in order
+
+            meta = {
+                'video': vid,
+                'sample_indx': sample_indx,
+                'bins': bins,
+                'frames': vid_frames
+            }
+            obj_id_cat = {}
+            for exp_id, exp_dict in vid_data['expressions'].items():
+                obj_id = exp_dict['obj_id']
+                if obj_id not in obj_id_cat:
+                    obj_id_cat[obj_id] = vid_meta['objects'][obj_id]['category']
+            meta['obj_id_cat'] = obj_id_cat
+            self.metas.append(meta)
+
+        print(f"skipped {skip_vid_count} short videos")
+
+    @staticmethod
+    def bounding_box(img):
+        rows = np.any(img, axis=1)
+        cols = np.any(img, axis=0)
+        rmin, rmax = np.where(rows)[0][[0, -1]]
+        cmin, cmax = np.where(cols)[0][[0, -1]]
+        return rmin, rmax, cmin, cmax  # y1, y2, x1, x2
+
+    def __len__(self):
+        return len(self.metas)
+
+    def __getitem__(self, idx):
+        meta = self.metas[idx]  # dict
+
+        video, sample_indx, bins, frames, obj_id_cat = \
+            meta['video'], meta['sample_indx'], meta['bins'], meta['frames'], meta['obj_id_cat']
+
+        # read frames and masks
+        imgs, labels, boxes, masks, valid = [], [], [], [], []
+        for frame_indx in sample_indx:
+            frame_name = frames[frame_indx]
+            img_path = os.path.join(str(self.img_folder), 'JPEGImages', video, frame_name + '.jpg')
+            mask_path = os.path.join(str(self.img_folder), 'Annotations', video, frame_name + '.png')
+            img = Image.open(img_path).convert('RGB')
+            imgs.append(img)
+
+            mask = Image.open(mask_path).convert('P')
+            mask = np.array(mask)
+
+            # create the target
+            for obj_id in list(obj_id_cat.keys()):
+                obj_mask = (mask == int(obj_id)).astype(np.float32)  # 0, 1 binary
+                if (obj_mask > 0).any():
+                    y1, y2, x1, x2 = self.bounding_box(mask)
+                    box = torch.tensor([x1, y1, x2, y2]).to(torch.float)
+                    valid.append(1)
+                else:  # some frames don't contain the instance
+                    box = torch.tensor([0, 0, 0, 0]).to(torch.float)
+                    valid.append(0)
+                obj_mask = torch.from_numpy(obj_mask)
+
+                # append
+                masks.append(obj_mask)
+                boxes.append(box)
+
+        # transform
+        w, h = img.size
+        boxes = torch.stack(boxes, dim=0)
+        boxes[:, 0::2].clamp_(min=0, max=w)
+        boxes[:, 1::2].clamp_(min=0, max=h)
+        masks = torch.stack(masks, dim=0)
+        target = {
+            'frames_idx': sample_indx,  # [T,]
+            'boxes': boxes,  # [T, 4], xyxy
+            'masks': masks,  # [T, H, W]
+            'valid': torch.tensor(valid),  # [T,]
+            'obj_ids': list(obj_id_cat.keys()),
+            'orig_size': torch.as_tensor([int(h), int(w)]),
+            'size': torch.as_tensor([int(h), int(w)])
+        }
+
+        # "boxes" are normalized to [0, 1] and converted from xyxy to cxcywh in self._transforms
+        if self._transforms:
+            imgs, target = self._transforms(imgs, target)
+            imgs = torch.stack(imgs, dim=0)  # [T, 3, H, W]
+        else:
+            imgs = np.array(imgs)
+            imgs = torch.tensor(imgs.transpose(0, 3, 1, 2))
+
+        # # FIXME: handle "valid", since some box may be removed due to random crop
+        # if torch.any(target['valid'] == 1):  # at least one instance
+        #     instance_check = True
+        # else:
+        #     idx = random.randint(0, self.__len__() - 1)
+
+        return imgs, target
+
+
+def make_coco_transforms(image_set, max_size=640):
+    normalize = T.Compose([
+        T.ToTensor(),
+        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    ])
+
+    scales = [288, 320, 352, 392, 416, 448, 480, 512]
+
+    if image_set == 'train':
+        return T.Compose([
+            T.RandomHorizontalFlip(),
+            T.PhotometricDistort(),
+            T.RandomSelect(
+                T.Compose([
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ]),
+                T.Compose([
+                    T.RandomResize([400, 500, 600]),
+                    T.RandomSizeCrop(384, 600),
+                    T.RandomResize(scales, max_size=max_size),
+                    T.Check(),
+                ])
+            ),
+            normalize,
+        ])
+
+    # we do not use the 'val' set since the annotations are inaccessible
+    if image_set == 'val':
+        return T.Compose([
+            T.RandomResize([360], max_size=640),
+            normalize,
+        ])
+
+    raise ValueError(f'unknown {image_set}')
+
+
+def build(image_set, args):
+    root = Path(args.ytvos_path)
+    assert root.exists(), f'provided YTVOS path {root} does not exist'
+    PATHS = {
+        "train": (root / "train", root / "meta_expressions" / "train" / "meta_expressions.json"),
+        "val": (root / "valid", root / "meta_expressions" / "valid" / "meta_expressions.json"),  # not actually used
+    }
+    img_folder, ann_file = PATHS[image_set]
+    # dataset = YTVOSDataset(img_folder, ann_file, transforms=make_coco_transforms(image_set, max_size=args.max_size), return_masks=args.masks,
+    #                        num_frames=args.num_frames, max_skip=args.max_skip)
+    dataset = YTVOSDataset(img_folder, ann_file, transforms=None, return_masks=args.masks,
+                           num_frames=args.num_frames, max_skip=args.max_skip)
+    return dataset
+
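The sampler in prepare_metas splits the usable frame range [2, vid_len - 2) into four equal bins and draws one random frame index per bin, so every sampled clip spans the whole video rather than a local window. A standalone sketch of just that arithmetic, assuming a hypothetical 40-frame video:

    import random

    vid_len = 40  # hypothetical video length (videos shorter than 11 frames are skipped)
    start_idx, end_idx = 2, vid_len - 2
    bin_size = (end_idx - start_idx) // 4

    bins = []
    for i in range(4):
        bin_start = start_idx + i * bin_size
        bin_end = bin_start + bin_size if i < 3 else end_idx  # last bin absorbs the remainder
        bins.append((bin_start, bin_end))

    # one index per bin, kept in temporal order
    sample_indx = sorted(random.randint(s, e - 1) for s, e in bins)
    print(bins, sample_indx)  # e.g. [(2, 11), (11, 20), (20, 29), (29, 38)]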
.history/mbench/gpt_ref-ytvos-cy_20250121155719.py ADDED
@@ -0,0 +1,428 @@
+import sys
+from os import path as osp
+sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+from mbench.ytvos_ref import build as build_ytvos_ref
+import argparse
+import opts
+
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+from openai import OpenAI
+import base64
+
+# Function to encode the image
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+
+# Captioner
+ytvos_category_valid_list = [
+    'airplane', 'ape', 'bear', 'bike', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+    'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+    'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+    'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+    'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+    'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+]
+def getCaption(video_id, json_data):
+    # fetch the data
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+
+    cat_names = set()
+    all_captions = dict()
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+
+    # cat_names: e.g. person, snowboard
+    # 1. ask GPT directly whether the category can be the subject of an action
+    # 2. from the category information provided by Ref-YouTube-VOS, keep only the category names we want to handle
+
+    for cat_name in list(cat_names):
+        image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+        image_captions = {}
+
+        captioner = OpenAI()
+
+        # step 0: can it be the subject of an action?
+        is_movable = False
+        if cat_name in ytvos_category_valid_list:
+            is_movable = True
+
+        # response_check = captioner.chat.completions.create(
+        #     model="gpt-4o",
+        #     messages=[
+        #         {
+        #             "role": "user",
+        #             "content": f"""
+        #             Can a {cat_name} be a subject of distinct actions or movements?
+        #             For example, if {cat_name} is a person, animal, or vehicle, it is likely an action-capable subject.
+        #             However, if it is an inanimate object like a snowboard, tree, or book, it cannot independently perform actions.
+        #             Respond with YES if {cat_name} can perform distinct actions or movements; otherwise, respond with NONE.
+        #             Answer only YES or NONE.
+        #             """
+        #         }
+        #     ],
+        # )
+        # response_check_content = response_check.choices[0].message.content.strip().lower()
+        # print(f"Movable Check for {cat_name}: {response_check_content}")
+
+        # if response_check_content == "yes": is_movable = True
+
+        if not is_movable:
+            print(f"Skipping {cat_name}: Determined to be non-movable.")
+            continue
+
+        for i in range(len(image_paths)):
+            image_path = image_paths[i]
+            frame_name = frame_names[i]
+            base64_image = encode_image(image_path)
+
+            # step 1: filtering
+            # print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+            response1 = captioner.chat.completions.create(
+                model="chatgpt-4o-latest",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": f"""Are there multiple {cat_name}s in the image, each performing distinct and recognizable actions?
+                                Focus only on clear and prominent actions, avoiding minor or ambiguous ones.
+                                Each action should be unique and clearly associated with a specific object.
+
+                                Respond with YES if:
+                                - The {cat_name}s are people, animals or vehicles, and their actions are distinct and recognizable.
+                                - The {cat_name}s involve clear, distinguishable actions performed independently.
+
+                                Respond with NONE if:
+                                - The {cat_name}s are objects (e.g., snowboard, tree, books) and do not involve direct interaction with a person.
+                                - Actions are ambiguous, minor, or not clearly visible.
+
+                                If the {cat_name} is 'snowboard' and it is not actively being used or interacted with by a person, output NONE.
+                                If the {cat_name} is 'person' and their actions are distinct and clear, output YES.
+
+                                Answer only YES or NONE."""
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                ],
+            )
+            response_content = response1.choices[0].message.content
+            should_caption = True if "yes" in response_content.lower() else False
+            # print(f"are {cat_name}s distinguished by action: {response_content}")
+
+            # step 2: build the dense caption
+            if should_caption:
+                response2 = captioner.chat.completions.create(
+                    model="chatgpt-4o-latest",
+                    messages=[
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": f"""
+                                    Generate a detailed action-centric caption describing the actions of the {cat_name}s in the image.
+                                    1. Focus only on clear, unique, and prominent actions that distinguish each object.
+                                    2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+                                    3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+                                    4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+                                    5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+                                    6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                                    7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+                                    8. Include interactions with objects or other entities when they are prominent and observable.
+                                    9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+                                    Output only the caption.""",
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                },
+                            ],
+                        }
+                    ],
+                )
+
+                caption = response2.choices[0].message.content
+                # print(f"{image_path} - {frame_name}: {caption}")
+            else:
+                caption = None
+
+            image_captions[frame_name] = caption
+        all_captions[cat_name] = image_captions
+
+    # finally: also prepare valid object ids
+    valid_obj_ids = []
+    valid_cat_names = list(all_captions.keys())
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat = video_data['annotations'][0][obj_id]['category_name']
+        if cat in valid_cat_names: valid_obj_ids.append(obj_id)
+
+    return all_captions, valid_obj_ids
+
+# Referring expression generator and QA filter
+def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+
+    # draw the object's bounding box on the image
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+    frame_indx = frame_names.index(frame_name)
+    obj_data = video_data['annotations'][frame_indx][obj_id]
+
+    bbox = obj_data['bbox']
+    cat_name = obj_data['category_name']
+    valid = obj_data['valid']
+
+    if valid == 0:
+        print("Object not in this frame!")
+        return {}
+
+
+    x_min, y_min, x_max, y_max = bbox
+    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+    plt.figure()
+    plt.imshow(I)
+    plt.axis('off')
+    plt.show()
+
+    # cropped object for visibility check
+    cropped_I = I[y_min:y_max, x_min:x_max]
+    pil_cropped_I = Image.fromarray(cropped_I)
+    buff_crop = BytesIO()
+    pil_cropped_I.save(buff_crop, format='JPEG')
+    base64_cropped_I = base64.b64encode(buff_crop.getvalue()).decode("utf-8")
+
+    # entire image for referring expression generation
+    pil_I = Image.fromarray(I)
+    buff = BytesIO()
+    pil_I.save(buff, format='JPEG')
+    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+    # check whether the object is distinguishable
+    generator = OpenAI()
+    response_check = generator.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Can the {cat_name} in the provided cropped image be clearly identified as belonging to the category {cat_name}?
+                        Focus on whether the cropped image provides enough visible features (e.g., ears, head shape, fur texture) to confirm that it is a {cat_name}, even if the full body is not visible.
+
+                        Guidelines:
+                        - If the visible features (like ears, fur texture or head shape) are sufficient to identify the {cat_name}, respond with YES.
+                        - If multiple {cat_name}s are entangled or overlapping, making it difficult to distinguish one from another, respond with NONE.
+                        - If the object is clearly visible and identifiable as a {cat_name}, respond with YES.
+
+                        Output only either YES or NONE.
+                        """
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
+                    }
+                ]
+            },
+        ]
+    )
+
+    response_check_content = response_check.choices[0].message.content.strip().lower()
+    # print(f"is object {obj_id} visible: {response_check_content}")
+
+    if "yes" not in response_check_content:
+        print(f"Referring expression not generated: {cat_name} is ambiguous in this frame.")
+        return {"ref_exp": "NONE", "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": False}
+
+    # build the referring expression
+    # generator = OpenAI()
+    response = generator.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box, corresponding to Object ID {obj_id}.
+                        Guidelines for creating the referring expression:
+                        1. The referring expression should describe the prominent actions or poses of the highlighted {cat_name} (Object ID {obj_id}).
+                        2. Focus on the behavior or pose described in the caption that is specifically associated with this {cat_name}. Do not include actions or poses of other {cat_name}s.
+                        3. If multiple {cat_name}s are present, ensure that the referring expression exclusively describes the {cat_name} corresponding to Object ID {obj_id}.
+                        4. Avoid ambiguous or subjective terms. Use specific and clear action verbs to describe the highlighted {cat_name}.
+                        5. The referring expression should only describe Object ID {obj_id} and not any other objects or entities.
+                        6. Use '{cat_name}' as the noun for the referring expressions.
+                        Output only the referring expression for the highlighted {cat_name} (Object ID {obj_id}).
+
+                        {caption}
+                        """
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                    # {
+                    #     "type": "image_url",
+                    #     "image_url": {"url": f"data:image/jpeg;base64,{base64_cropped_I}"},
+                    # }
+                ],
+            }
+        ],
+    )
+
+    ref_exp = response.choices[0].message.content.strip()
+
+    # QA filtering
+    # QA1: does the expression describe the intended object?
+    filter = OpenAI()
+    response1 = filter.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                        {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response1_content = response1.choices[0].message.content
+    describesHighlighted = True if "yes" in response1_content.lower() else False
+
+    # QA2: does the expression avoid describing an unintended object?
+    response2 = filter.chat.completions.create(
+        model="chatgpt-4o-latest",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
+                        {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response2_content = response2.choices[0].message.content
+    notDescribesNotHighlighted = False if "yes" in response2_content.lower() else True
+
+    isValid = True if describesHighlighted and notDescribesNotHighlighted else False
+
+    # print(f"describesHighlighted: {describesHighlighted}, notDescribesNotHighlighted: {notDescribesNotHighlighted}")
+    # print(f"ref exp: {ref_exp}")
+    # print("")
+
+    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
+
+if __name__ == '__main__':
+    with open('mbench/sampled_frame3.json', 'r') as file:
+        data = json.load(file)
+
+    vid_ids = list(data.keys())
+    all_ref_exps = {}
+
+    os.environ['OPENAI_API_KEY'] = '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'
+
+    # for every vid_id in the dataset
+    for i in range(1):
+        vid_id = vid_ids[i]
+
+        # ==== build captions ====
+        # print("=====================captioner========================")
+        captions, valid_obj_ids = getCaption(vid_id, data)
+        cats_in_vid = list(captions.keys())
+        # print()
+
+        # ==== build referring expressions and run QA filtering ====
+        # print("=====================referring expression generator & QA filter========================")
+        ref_expressions = {}
+
+        # for each category
+        for cat_name in cats_in_vid:
+            if cat_name not in ref_expressions:
+                ref_expressions[cat_name] = {}
+            # for each video frame
+            for frame_name in data[vid_id]['frame_names']:
+                # print(f'--------category: {cat_name}, frame_name: {frame_name}')
+
+                if frame_name not in ref_expressions[cat_name]:
+                    ref_expressions[cat_name][frame_name] = {}  # create frame-level dictionary
+                caption = captions[cat_name][frame_name]
+                if not caption: continue
+                else:
+                    # for each obj id
+                    for obj_id in valid_obj_ids:
+                        ref_exp = getRefExp(vid_id, frame_name, caption, obj_id, data)
+                        ref_expressions[cat_name][frame_name][obj_id] = ref_exp  # store ref_exp
+
+        all_ref_exps[vid_id] = ref_expressions
+
+
+    with open('mbench/result_revised.json', 'w') as file:
+        json.dump(all_ref_exps, file, indent=4)
+
+
+
+
+
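Every GPT call in the script above uses the same payload shape: a single user message whose content mixes a text block with a base64 data-URL image. A reduced sketch of that pattern with the openai v1 client (the model name, prompt, and frame path are placeholders, and the API key is assumed to come from the environment rather than being hardcoded as it is above):

    import base64
    from openai import OpenAI

    def encode_image(image_path):
        with open(image_path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    client = OpenAI()  # reads OPENAI_API_KEY from the environment
    b64 = encode_image("frame.jpg")  # hypothetical frame path
    resp = client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "Answer only YES or NONE: ..."},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
            ],
        }],
    )
    print(resp.choices[0].message.content)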
.history/mbench/gpt_ref-ytvos_20250119070039.py ADDED
@@ -0,0 +1,277 @@
+from datasets import build_dataset
+import argparse
+import opts
+
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+from openai import OpenAI
+import base64
+
+os.environ['OPENAI_API_KEY'] = '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'
+
+# Function to encode the image
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+
+def getCaption(video_id, json_data):
+    # fetch the data
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+
+    cat_names = set()
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+
+    if len(cat_names) == 1:
+        cat_name = next(iter(cat_names))
+    else:
+        print("more than one category")
+        return -1
+
+    image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+    image_captions = {}
+
+    captioner = OpenAI()
+    for i in range(len(image_paths)):
+        image_path = image_paths[i]
+        frame_name = frame_names[i]
+        base64_image = encode_image(image_path)
+
+        # step 1: filtering
+        response1 = captioner.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                        },
+                    ],
+                }
+            ],
+        )
+        response_content = response1.choices[0].message.content
+        should_caption = True if "yes" in response_content.lower() else False
+
+        # step 2: build the dense caption
+        if should_caption:
+            response2 = captioner.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": f"""
+                                Describe the image in detail focusing on the {cat_name}s' actions.
+                                1. Each action should be prominent, clear and unique, describing the corresponding object only.
+                                2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
+                                3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
+                                4. Do not include actions that need to be guessed or suggested.""",
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                ],
+            )
+
+            caption = response2.choices[0].message.content
+        else:
+            caption = None
+
+        image_captions[frame_name] = caption
+    return image_captions
+
+def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+    # draw the object's bounding box on the image
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+    frame_indx = frame_names.index(frame_name)
+    obj_data = video_data['annotations'][frame_indx][obj_id]
+
+    bbox = obj_data['bbox']
+    cat_name = obj_data['category_name']
+    valid = obj_data['valid']
+
+    if valid == 0:
+        print("Object not in this frame!")
+        return {}
+
+
+    x_min, y_min, x_max, y_max = bbox
+    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+    plt.figure()
+    plt.imshow(I)
+    plt.axis('off')
+    plt.show()
+    pil_I = Image.fromarray(I)
+    buff = BytesIO()
+    pil_I.save(buff, format='JPEG')
+    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+    # build the referring expression
+    generator = OpenAI()
+    response = generator.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
+                        1. The referring expression describes the action and does not contain information about appearance or location in the picture.
+                        2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
+                        3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
+                        4. The referring expression should only describe the highlighted {cat_name} and not any other.
+                        5. Use '{cat_name}' as the noun for the referring expressions.
+                        Output only the referring expression.
+                        {caption}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    ref_exp = response.choices[0].message.content
+
+    # QA filtering
+    # QA1: does the expression describe the intended object?
+    filter = OpenAI()
+    response1 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                        {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response1_content = response1.choices[0].message.content
+    describesHighlighted = True if "yes" in response1_content.lower() else False
+
+    # QA2: does the expression avoid describing an unintended object?
+    response2 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
+                        {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response2_content = response2.choices[0].message.content
+    describesNotHighlighted = True if "yes" in response2_content.lower() else False
+
+    isValid = True if describesHighlighted and not describesNotHighlighted else False
+
+    print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
+
+    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
+def createRefExp(video_id, json_data):
+    video_data = json_data[video_id]
+    obj_ids = list(video_data['annotations'][0].keys())
+    frame_names = video_data['frame_names']
+
+    captions_per_frame = getCaption(video_id, json_data)
+
+    if captions_per_frame == -1:
+        print("There is more than one category")
+        return
+
+
+    video_ref_exps = {}
+
+    for frame_name in frame_names:
+        frame_caption = captions_per_frame[frame_name]
+
+        if frame_caption == None:
+            video_ref_exps[frame_name] = None
+
+        else:
+            frame_ref_exps = {}
+            for obj_id in obj_ids:
+                exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
+                frame_ref_exps[obj_id] = exp_per_obj
+            video_ref_exps[frame_name] = frame_ref_exps
+
+    return video_ref_exps
+
+if __name__ == '__main__':
+    with open('mbench/sampled_frame3.json', 'r') as file:
+        data = json.load(file)
+
+    all_video_refs = {}
+    for i in range(3):
+        video_id = list(data.keys())[i]
+        video_ref = createRefExp(video_id, data)
+        all_video_refs[video_id] = video_ref
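The two QA prompts above reduce to one validity predicate: the expression must match the boxed object (QA1) and must not also match an unboxed one (QA2). As a small sketch, assuming the two yes/no answers have already been extracted from the model responses:

    def is_valid_expression(answer_q1: str, answer_q2: str) -> bool:
        describes_highlighted = "yes" in answer_q1.lower()      # QA1: describes the boxed object
        describes_not_highlighted = "yes" in answer_q2.lower()  # QA2: also matches a non-boxed object
        return describes_highlighted and not describes_not_highlighted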
.history/mbench/gpt_ref-ytvos_20250119070740.py ADDED
@@ -0,0 +1,285 @@
+from datasets import build_dataset
+import argparse
+import opts
+
+import sys
+from pathlib import Path
+import os
+from os import path as osp
+import skimage
+from io import BytesIO
+
+import numpy as np
+import pandas as pd
+import regex as re
+import json
+
+import cv2
+from PIL import Image, ImageDraw
+import torch
+from torchvision.transforms import functional as F
+
+from skimage import measure  # (pip install scikit-image)
+from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+import matplotlib.pyplot as plt
+import matplotlib.patches as patches
+from matplotlib.collections import PatchCollection
+from matplotlib.patches import Rectangle
+
+
+import ipywidgets as widgets
+from IPython.display import display, clear_output
+
+from openai import OpenAI
+import base64
+
+os.environ['OPENAI_API_KEY'] = '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'
+
+# Function to encode the image
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode("utf-8")
+
+def getCaption(video_id, json_data):
+    # fetch the data
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+
+    cat_names = set()
+    for obj_id in list(video_data['annotations'][0].keys()):
+        cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+
+    if len(cat_names) == 1:
+        cat_name = next(iter(cat_names))
+    else:
+        print("more than one category")
+        return -1
+
+    image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+    image_captions = {}
+
+    captioner = OpenAI()
+    for i in range(len(image_paths)):
+        image_path = image_paths[i]
+        frame_name = frame_names[i]
+        base64_image = encode_image(image_path)
+
+        # step 1: filtering
+        response1 = captioner.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                        },
+                    ],
+                }
+            ],
+        )
+        response_content = response1.choices[0].message.content
+        should_caption = True if "yes" in response_content.lower() else False
+
+        # step 2: build the dense caption
+        if should_caption:
+            response2 = captioner.chat.completions.create(
+                model="gpt-4o-mini",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": f"""
+                                Describe the image in detail focusing on the {cat_name}s' actions.
+                                1. Each action should be prominent, clear and unique, describing the corresponding object only.
+                                2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
+                                3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
+                                4. Do not include actions that need to be guessed or suggested.""",
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                            },
+                        ],
+                    }
+                ],
+            )
+
+            caption = response2.choices[0].message.content
+        else:
+            caption = None
+
+        image_captions[frame_name] = caption
+    return image_captions
+
+def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+    # draw the object's bounding box on the image
+    video_data = json_data[video_id]
+    frame_names = video_data['frame_names']
+    video_path = video_data['video_path']
+    I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+    frame_indx = frame_names.index(frame_name)
+    obj_data = video_data['annotations'][frame_indx][obj_id]
+
+    bbox = obj_data['bbox']
+    cat_name = obj_data['category_name']
+    valid = obj_data['valid']
+
+    if valid == 0:
+        print("Object not in this frame!")
+        return {}
+
+
+    x_min, y_min, x_max, y_max = bbox
+    x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+    cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+    plt.figure()
+    plt.imshow(I)
+    plt.axis('off')
+    plt.show()
+    pil_I = Image.fromarray(I)
+    buff = BytesIO()
+    pil_I.save(buff, format='JPEG')
+    base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+    # build the referring expression
+    generator = OpenAI()
+    response = generator.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
+                        1. The referring expression describes the action and does not contain information about appearance or location in the picture.
+                        2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
+                        3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
+                        4. The referring expression should only describe the highlighted {cat_name} and not any other.
+                        5. Use '{cat_name}' as the noun for the referring expressions.
+                        Output only the referring expression.
+                        {caption}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    ref_exp = response.choices[0].message.content
+
+    # QA filtering
+    # QA1: does the expression describe the intended object?
+    filter = OpenAI()
+    response1 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                        {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response1_content = response1.choices[0].message.content
+    describesHighlighted = True if "yes" in response1_content.lower() else False
+
+    # QA2: does the expression avoid describing an unintended object?
+    response2 = filter.chat.completions.create(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": f"""Does the given expression describe the person not highlighted with the red box? If so, only return YES and if not, NO.
+                        {ref_exp}""",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                    },
+                ],
+            }
+        ],
+    )
+
+    response2_content = response2.choices[0].message.content
+    describesNotHighlighted = True if "yes" in response2_content.lower() else False
+
+    isValid = True if describesHighlighted and not describesNotHighlighted else False
+
+    print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
+
+    return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
+def createRefExp(video_id, json_data):
+    video_data = json_data[video_id]
+    obj_ids = list(video_data['annotations'][0].keys())
+    frame_names = video_data['frame_names']
+
+    captions_per_frame = getCaption(video_id, json_data)
+
+    if captions_per_frame == -1:
+        print("There is more than one category")
+        return
+
+
+    video_ref_exps = {}
+
+    for frame_name in frame_names:
+        frame_caption = captions_per_frame[frame_name]
+
+        if frame_caption == None:
+            video_ref_exps[frame_name] = None
+
+        else:
+            frame_ref_exps = {}
+            for obj_id in obj_ids:
+                exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
+                frame_ref_exps[obj_id] = exp_per_obj
+            video_ref_exps[frame_name] = frame_ref_exps
+
+    return video_ref_exps
+
+if __name__ == '__main__':
+    with open('mbench/sampled_frame3.json', 'r') as file:
+        data = json.load(file)
+
+    videos = set()
+    with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
+        manual_select = list(file)
+    for frame in manual_select:
+        result = json.loads(frame)
+        videos.add(result['video'])
+
+
+    all_video_refs = {}
+    for i in range(10):
+        video_id = list(data.keys())[i]
+        video_ref = createRefExp(video_id, data)
+        all_video_refs[video_id] = video_ref
.history/mbench/gpt_ref-ytvos_20250119071412.py ADDED
@@ -0,0 +1,292 @@
1
+ import sys
2
+ from os import path as osp
3
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+
9
+ from pathlib import Path
10
+ import os
11
+ import skimage
12
+ from io import BytesIO
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ from openai import OpenAI
37
+ import base64
38
+
39
+ os.environ['OPENAI_API_KEY'] = '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'
40
+
41
+ # Function to encode the image
42
+ def encode_image(image_path):
43
+ with open(image_path, "rb") as image_file:
44
+ return base64.b64encode(image_file.read()).decode("utf-8")
45
+
46
+ def getCaption(video_id, json_data):
47
+ #데이터 가져오기
48
+ video_data = json_data[video_id]
49
+ frame_names = video_data['frame_names']
50
+ video_path = video_data['video_path']
51
+
52
+ cat_names = set()
53
+ for obj_id in list(video_data['annotations'][0].keys()):
54
+ cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
55
+
56
+ if len(cat_names) == 1:
57
+ cat_name = next(iter(cat_names))
58
+ else:
59
+ print("more than 2 categories")
60
+ return -1
61
+
62
+ image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
63
+ image_captions = {}
64
+
65
+ captioner = OpenAI()
66
+ for i in range(len(image_paths)):
67
+ image_path = image_paths[i]
68
+ frame_name = frame_names[i]
69
+ base64_image = encode_image(image_path)
70
+
71
+ #1단계: 필터링
72
+ response1 = captioner.chat.completions.create(
73
+ model="gpt-4o-mini",
74
+ messages=[
75
+ {
76
+ "role": "user",
77
+ "content": [
78
+ {
79
+ "type": "text",
80
+ "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
81
+ },
82
+ {
83
+ "type": "image_url",
84
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
85
+ },
86
+ ],
87
+ }
88
+ ],
89
+ )
90
+ response_content = response1.choices[0].message.content
91
+ should_caption = True if "yes" in response_content.lower() else False
92
+
93
+ #2단계: dense caption 만들기
94
+ if should_caption:
95
+ response2 = captioner.chat.completions.create(
96
+ model="gpt-4o-mini",
97
+ messages=[
98
+ {
99
+ "role": "user",
100
+ "content": [
101
+ {
102
+ "type": "text",
103
+ "text": f"""
104
+ Describe the image in detail focusing on the {cat_name}s' actions.
105
+ 1. Each action should be prominent, clear and unique, describing the corresponding object only.
106
+ 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
107
+ 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
108
+ 4. Do not include actions that needs to be guessed or suggested.""",
109
+ },
110
+ {
111
+ "type": "image_url",
112
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
113
+ },
114
+ ],
115
+ }
116
+ ],
117
+ )
118
+
119
+ caption = response2.choices[0].message.content
120
+ else:
121
+ caption = None
122
+
123
+ image_captions[frame_name] = caption
124
+ return image_captions
125
+
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+     # Draw the target object's bounding box on the image
+     video_data = json_data[video_id]
+     frame_names = video_data['frame_names']
+     video_path = video_data['video_path']
+     I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+     frame_indx = frame_names.index(frame_name)
+     obj_data = video_data['annotations'][frame_indx][obj_id]
+
+     bbox = obj_data['bbox']
+     cat_name = obj_data['category_name']
+     valid = obj_data['valid']
+
+     if valid == 0:
+         print("Object not in this frame!")
+         return {}
+
+     x_min, y_min, x_max, y_max = bbox
+     x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+     cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+     plt.figure()
+     plt.imshow(I)
+     plt.axis('off')
+     plt.show()
+     pil_I = Image.fromarray(I)
+     buff = BytesIO()
+     pil_I.save(buff, format='JPEG')
+     base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+     # Generate the referring expression
+     generator = OpenAI()
+     response = generator.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
+                         1. The referring expression describes the action and does not contain information about appearance or location in the picture.
+                         2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
+                         3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
+                         4. The referring expression should only describe the highlighted {cat_name} and not any other.
+                         5. Use '{cat_name}' as the noun for the referring expressions.
+                         Output only the referring expression.
+                         {caption}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     ref_exp = response.choices[0].message.content
+
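+     # Validity check: the expression must pass two complementary QA probes.
+     # It must describe the boxed object (QA1) and must not also fit an
+     # unboxed object of the same category (QA2); isValid is the conjunction.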
+     # QA filtering
+     # QA1: does the expression describe the target object?
+     qa_filter = OpenAI()
+     response1 = qa_filter.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                         {ref_exp}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     response1_content = response1.choices[0].message.content
+     describesHighlighted = "yes" in response1_content.lower()
+
+     # QA2: does the expression avoid describing the other, unboxed objects?
+     response2 = qa_filter.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Does the given expression describe the {cat_name} not highlighted with the red box? If so, only return YES and if not, NO.
+                         {ref_exp}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     response2_content = response2.choices[0].message.content
+     describesNotHighlighted = "yes" in response2_content.lower()
+
+     isValid = describesHighlighted and not describesNotHighlighted
+
+     print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
+
+     return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
+ def createRefExp(video_id, json_data):
+     video_data = json_data[video_id]
+     obj_ids = list(video_data['annotations'][0].keys())
+     frame_names = video_data['frame_names']
+
+     captions_per_frame = getCaption(video_id, json_data)
+
+     if captions_per_frame == -1:
+         print("There is more than one category")
+         return
+
+     video_ref_exps = {}
+
+     for frame_name in frame_names:
+         frame_caption = captions_per_frame[frame_name]
+
+         if frame_caption is None:
+             video_ref_exps[frame_name] = None
+
+         else:
+             frame_ref_exps = {}
+             for obj_id in obj_ids:
+                 exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
+                 frame_ref_exps[obj_id] = exp_per_obj
+             video_ref_exps[frame_name] = frame_ref_exps
+
+     return video_ref_exps
+
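+ # Smoke-test driver: the range(1) loop below processes only the first
+ # selected video.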
+ if __name__ == '__main__':
+     with open('mbench/sampled_frame3.json', 'r') as file:
+         data = json.load(file)
+
+     videos = set()
+     with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
+         manual_select = list(file)
+     for frame in manual_select:
+         result = json.loads(frame)
+         videos.add(result['video'])
+     videos = list(videos)
+
+     all_video_refs = {}
+     for i in range(1):
+         video_id = videos[i]
+         video_ref = createRefExp(video_id, data)
+         all_video_refs[video_id] = video_ref
+
+     json_obj = json.dumps(all_video_refs, indent=4)
+     with open('mbench/result.json', 'w') as file:
+         file.write(json_obj)
.history/mbench/gpt_ref-ytvos_20250119072601.py ADDED
@@ -0,0 +1,292 @@
+ import sys
+ from os import path as osp
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+ from datasets import build_dataset
+ import argparse
+ import opts
+
+ from pathlib import Path
+ import os
+ import skimage.io  # imported explicitly: skimage.io.imread is used in getRefExp
+ from io import BytesIO
+
+ import numpy as np
+ import pandas as pd
+ import regex as re
+ import json
+
+ import cv2
+ from PIL import Image, ImageDraw
+ import torch
+ from torchvision.transforms import functional as F
+
+ from skimage import measure # (pip install scikit-image)
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ from matplotlib.collections import PatchCollection
+ from matplotlib.patches import Rectangle
+
+
+ import ipywidgets as widgets
+ from IPython.display import display, clear_output
+
+ from openai import OpenAI
+ import base64
+
+ os.environ['OPENAI_API_KEY'] = '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'
+
+ # Function to encode the image
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode("utf-8")
+
+ def getCaption(video_id, json_data):
+     # Load the data
+     video_data = json_data[video_id]
+     frame_names = video_data['frame_names']
+     video_path = video_data['video_path']
+
+     cat_names = set()
+     for obj_id in list(video_data['annotations'][0].keys()):
+         cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+
+     if len(cat_names) == 1:
+         cat_name = next(iter(cat_names))
+     else:
+         print("more than one category")
+         return -1
+
+     image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+     image_captions = {}
+
+     captioner = OpenAI()
+     for i in range(len(image_paths)):
+         image_path = image_paths[i]
+         frame_name = frame_names[i]
+         base64_image = encode_image(image_path)
+
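+         # Step 1 asks a per-frame yes/no question; only frames where multiple
+         # category instances are distinguishable by action ("YES") get a dense
+         # caption in step 2, so ambiguous frames are skipped early.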
+         # Step 1: filtering
+         response1 = captioner.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=[
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
+                         },
+                         {
+                             "type": "image_url",
+                             "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                         },
+                     ],
+                 }
+             ],
+         )
+         response_content = response1.choices[0].message.content
+         should_caption = "yes" in response_content.lower()
+
+         # Step 2: generate a dense caption
+         if should_caption:
+             response2 = captioner.chat.completions.create(
+                 model="gpt-4o-mini",
+                 messages=[
+                     {
+                         "role": "user",
+                         "content": [
+                             {
+                                 "type": "text",
+                                 "text": f"""
+                                 Describe the image in detail focusing on the {cat_name}s' actions.
+                                 1. Each action should be prominent, clear and unique, describing the corresponding object only.
+                                 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
+                                 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
+                                 4. Do not include actions that need to be guessed or suggested.""",
+                             },
+                             {
+                                 "type": "image_url",
+                                 "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                             },
+                         ],
+                     }
+                 ],
+             )
+
+             caption = response2.choices[0].message.content
+         else:
+             caption = None
+
+         image_captions[frame_name] = caption
+     return image_captions
+
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+     # Draw the target object's bounding box on the image
+     video_data = json_data[video_id]
+     frame_names = video_data['frame_names']
+     video_path = video_data['video_path']
+     I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+     frame_indx = frame_names.index(frame_name)
+     obj_data = video_data['annotations'][frame_indx][obj_id]
+
+     bbox = obj_data['bbox']
+     cat_name = obj_data['category_name']
+     valid = obj_data['valid']
+
+     if valid == 0:
+         print("Object not in this frame!")
+         return {}
+
+     x_min, y_min, x_max, y_max = bbox
+     x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+     cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+     plt.figure()
+     plt.imshow(I)
+     plt.axis('off')
+     plt.show()
+     pil_I = Image.fromarray(I)
+     buff = BytesIO()
+     pil_I.save(buff, format='JPEG')
+     base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+     # Generate the referring expression
+     generator = OpenAI()
+     response = generator.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
+                         1. The referring expression describes the action and does not contain information about appearance or location in the picture.
+                         2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
+                         3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
+                         4. The referring expression should only describe the highlighted {cat_name} and not any other.
+                         5. Use '{cat_name}' as the noun for the referring expressions.
+                         Output only the referring expression.
+                         {caption}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     ref_exp = response.choices[0].message.content
+
+     # QA filtering
+     # QA1: does the expression describe the target object?
+     qa_filter = OpenAI()
+     response1 = qa_filter.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                         {ref_exp}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     response1_content = response1.choices[0].message.content
+     describesHighlighted = "yes" in response1_content.lower()
+
+     # QA2: does the expression avoid describing the other, unboxed objects?
+     response2 = qa_filter.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Does the given expression describe the {cat_name} not highlighted with the red box? If so, only return YES and if not, NO.
+                         {ref_exp}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     response2_content = response2.choices[0].message.content
+     describesNotHighlighted = "yes" in response2_content.lower()
+
+     isValid = describesHighlighted and not describesNotHighlighted
+
+     print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
+
+     return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
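+ # createRefExp ties the pipeline together: one dense caption per frame, then
+ # one QA-filtered referring expression per (frame, object) pair. The result
+ # is shaped like {frame_name: {obj_id: {"ref_exp": ..., "isValid": ...}}},
+ # with None for frames whose filter step produced no caption.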
+ def createRefExp(video_id, json_data):
+     video_data = json_data[video_id]
+     obj_ids = list(video_data['annotations'][0].keys())
+     frame_names = video_data['frame_names']
+
+     captions_per_frame = getCaption(video_id, json_data)
+
+     if captions_per_frame == -1:
+         print("There is more than one category")
+         return None
+
+     video_ref_exps = {}
+
+     for frame_name in frame_names:
+         frame_caption = captions_per_frame[frame_name]
+
+         if frame_caption is None:
+             video_ref_exps[frame_name] = None
+
+         else:
+             frame_ref_exps = {}
+             for obj_id in obj_ids:
+                 exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
+                 frame_ref_exps[obj_id] = exp_per_obj
+             video_ref_exps[frame_name] = frame_ref_exps
+
+     return video_ref_exps
+
+ if __name__ == '__main__':
+     with open('mbench/sampled_frame3.json', 'r') as file:
+         data = json.load(file)
+
+     videos = set()
+     with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
+         manual_select = list(file)
+     for frame in manual_select:
+         result = json.loads(frame)
+         videos.add(result['video'])
+     videos = list(videos)
+
+     all_video_refs = {}
+     for i in range(1, 2):
+         video_id = videos[i]
+         video_ref = createRefExp(video_id, data)
+         all_video_refs[video_id] = video_ref
+
+     json_obj = json.dumps(all_video_refs, indent=4)
+     with open('mbench/result.json', 'w') as file:
+         file.write(json_obj)
.history/mbench/gpt_ref-ytvos_20250119073047.py ADDED
@@ -0,0 +1,292 @@
+ import sys
+ from os import path as osp
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
+
+ from datasets import build_dataset
+ import argparse
+ import opts
+
+ from pathlib import Path
+ import os
+ import skimage.io  # imported explicitly: skimage.io.imread is used in getRefExp
+ from io import BytesIO
+
+ import numpy as np
+ import pandas as pd
+ import regex as re
+ import json
+
+ import cv2
+ from PIL import Image, ImageDraw
+ import torch
+ from torchvision.transforms import functional as F
+
+ from skimage import measure # (pip install scikit-image)
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ from matplotlib.collections import PatchCollection
+ from matplotlib.patches import Rectangle
+
+
+ import ipywidgets as widgets
+ from IPython.display import display, clear_output
+
+ from openai import OpenAI
+ import base64
+
+ os.environ['OPENAI_API_KEY'] = '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'
+
+ # Function to encode the image
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode("utf-8")
+
+ def getCaption(video_id, json_data):
+     # Load the data
+     video_data = json_data[video_id]
+     frame_names = video_data['frame_names']
+     video_path = video_data['video_path']
+
+     cat_names = set()
+     for obj_id in list(video_data['annotations'][0].keys()):
+         cat_names.add(video_data['annotations'][0][obj_id]['category_name'])
+
+     if len(cat_names) == 1:
+         cat_name = next(iter(cat_names))
+     else:
+         print("more than one category")
+         return -1
+
+     image_paths = [os.path.join(video_path, frame_name + '.jpg') for frame_name in frame_names]
+     image_captions = {}
+
+     captioner = OpenAI()
+     for i in range(len(image_paths)):
+         image_path = image_paths[i]
+         frame_name = frame_names[i]
+         base64_image = encode_image(image_path)
+
+         # Step 1: filtering
+         response1 = captioner.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=[
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "text",
+                             "text": f"Are there multiple {cat_name}s that can be distinguished by action? Each action should be prominent and describe the corresponding object only. If so, only output YES. If not, only output None",
+                         },
+                         {
+                             "type": "image_url",
+                             "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                         },
+                     ],
+                 }
+             ],
+         )
+         response_content = response1.choices[0].message.content
+         should_caption = "yes" in response_content.lower()
+
+         # Step 2: generate a dense caption
+         if should_caption:
+             response2 = captioner.chat.completions.create(
+                 model="gpt-4o-mini",
+                 messages=[
+                     {
+                         "role": "user",
+                         "content": [
+                             {
+                                 "type": "text",
+                                 "text": f"""
+                                 Describe the image in detail focusing on the {cat_name}s' actions.
+                                 1. Each action should be prominent, clear and unique, describing the corresponding object only.
+                                 2. Avoid overly detailed or indeterminate details such as ‘in anticipation’.
+                                 3. Avoid subjective descriptions such as ‘soft’, ‘controlled’, ‘attentive’, ‘skilled’, ‘casual atmosphere’ and descriptions of the setting.
+                                 4. Do not include actions that need to be guessed or suggested.""",
+                             },
+                             {
+                                 "type": "image_url",
+                                 "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                             },
+                         ],
+                     }
+                 ],
+             )
+
+             caption = response2.choices[0].message.content
+         else:
+             caption = None
+
+         image_captions[frame_name] = caption
+     return image_captions
+
+ def getRefExp(video_id, frame_name, caption, obj_id, json_data):
+     # Draw the target object's bounding box on the image
+     video_data = json_data[video_id]
+     frame_names = video_data['frame_names']
+     video_path = video_data['video_path']
+     I = skimage.io.imread(osp.join(video_path, frame_name + '.jpg'))
+     frame_indx = frame_names.index(frame_name)
+     obj_data = video_data['annotations'][frame_indx][obj_id]
+
+     bbox = obj_data['bbox']
+     cat_name = obj_data['category_name']
+     valid = obj_data['valid']
+
+     if valid == 0:
+         print("Object not in this frame!")
+         return {}
+
+     x_min, y_min, x_max, y_max = bbox
+     x_min, y_min, x_max, y_max = int(x_min), int(y_min), int(x_max), int(y_max)
+     cv2.rectangle(I, (x_min, y_min), (x_max, y_max), (225, 0, 0), 2)
+     plt.figure()
+     plt.imshow(I)
+     plt.axis('off')
+     plt.show()
+     pil_I = Image.fromarray(I)
+     buff = BytesIO()
+     pil_I.save(buff, format='JPEG')
+     base64_I = base64.b64encode(buff.getvalue()).decode("utf-8")
+
+     # Generate the referring expression
+     generator = OpenAI()
+     response = generator.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Based on the dense caption, create a referring expression for the {cat_name} highlighted with the red box.
+                         1. The referring expression describes the action and does not contain information about appearance or location in the picture.
+                         2. Focus only on prominent actions and avoid overly detailed or indeterminate details.
+                         3. Avoid subjective terms describing emotion such as ‘in anticipation’, ‘attentively’ or ‘relaxed’ and professional, difficult words.
+                         4. The referring expression should only describe the highlighted {cat_name} and not any other.
+                         5. Use '{cat_name}' as the noun for the referring expressions.
+                         Output only the referring expression.
+                         {caption}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     ref_exp = response.choices[0].message.content
+
+     # QA filtering
+     # QA1: does the expression describe the target object?
+     qa_filter = OpenAI()
+     response1 = qa_filter.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Does the given expression describe the {cat_name} highlighted with the red box? If so, only return YES and if not, NO.
+                         {ref_exp}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     response1_content = response1.choices[0].message.content
+     describesHighlighted = "yes" in response1_content.lower()
+
+     # QA2: does the expression avoid describing the other, unboxed objects?
+     response2 = qa_filter.chat.completions.create(
+         model="gpt-4o-mini",
+         messages=[
+             {
+                 "role": "user",
+                 "content": [
+                     {
+                         "type": "text",
+                         "text": f"""Does the given expression describe the {cat_name} not highlighted with the red box? If so, only return YES and if not, NO.
+                         {ref_exp}""",
+                     },
+                     {
+                         "type": "image_url",
+                         "image_url": {"url": f"data:image/jpeg;base64,{base64_I}"},
+                     },
+                 ],
+             }
+         ],
+     )
+
+     response2_content = response2.choices[0].message.content
+     describesNotHighlighted = "yes" in response2_content.lower()
+
+     isValid = describesHighlighted and not describesNotHighlighted
+
+     print(f"describesHighlighted: {describesHighlighted}, describesNotHighlighted: {describesNotHighlighted}")
+
+     return {"ref_exp": ref_exp, "caption": caption, "cat_name": cat_name, "file_name": frame_name, "isValid": isValid}
+
+ def createRefExp(video_id, json_data):
+     video_data = json_data[video_id]
+     obj_ids = list(video_data['annotations'][0].keys())
+     frame_names = video_data['frame_names']
+
+     captions_per_frame = getCaption(video_id, json_data)
+
+     if captions_per_frame == -1:
+         print("There is more than one category")
+         return None
+
+     video_ref_exps = {}
+
+     for frame_name in frame_names:
+         frame_caption = captions_per_frame[frame_name]
+
+         if frame_caption is None:
+             video_ref_exps[frame_name] = None
+
+         else:
+             frame_ref_exps = {}
+             for obj_id in obj_ids:
+                 exp_per_obj = getRefExp(video_id, frame_name, frame_caption, obj_id, json_data)
+                 frame_ref_exps[obj_id] = exp_per_obj
+             video_ref_exps[frame_name] = frame_ref_exps
+
+     return video_ref_exps
+
+ if __name__ == '__main__':
+     with open('mbench/sampled_frame3.json', 'r') as file:
+         data = json.load(file)
+
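+     # selected_frames.jsonl holds one JSON object per line; collecting the
+     # 'video' field into a set de-duplicates videos that were selected
+     # through several of their frames.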
+     videos = set()
+     with open('make_ref-ytvos/selected_frames.jsonl', 'r') as file:
+         manual_select = list(file)
+     for frame in manual_select:
+         result = json.loads(frame)
+         videos.add(result['video'])
+     videos = list(videos)
+
+     all_video_refs = {}
+     for i in range(10):
+         video_id = videos[i]
+         video_ref = createRefExp(video_id, data)
+         all_video_refs[video_id] = video_ref
+
+     json_obj = json.dumps(all_video_refs, indent=4)
+     with open('mbench/result.json', 'w') as file:
+         file.write(json_obj)
.history/mbench/gpt_ref-ytvos_numbered_cy_20250131124149.py ADDED
@@ -0,0 +1,427 @@
+ import os
+ import sys
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+ from os import path as osp
+ from io import BytesIO
+
+ from mbench.ytvos_ref import build as build_ytvos_ref
+ import argparse
+ import opts
+
+ from pathlib import Path
+ import skimage
+
+ import numpy as np
+ import pandas as pd
+ import regex as re
+ import json
+
+ import cv2
+ from PIL import Image, ImageDraw
+ import torch
+ from torchvision.transforms import functional as F
+
+ from skimage import measure # (pip install scikit-image)
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ from matplotlib.collections import PatchCollection
+ from matplotlib.patches import Rectangle
+ import textwrap
+
+
+ import ipywidgets as widgets
+ from IPython.display import display, clear_output
+
+ from openai import OpenAI
+ import base64
+
+ def number_objects_and_encode(idx, color_mask=False):
+     encoded_frames = {}
+     contoured_frames = {} # New dictionary for original images
+     vid_cat_cnts = {}
+
+     vid_meta = metas[idx]
+     vid_data = train_dataset[idx]
+     vid_id = vid_meta['video']
+     frame_indx = vid_meta['sample_indx']
+     cat_names = set(vid_meta['obj_id_cat'].values())
+     imgs = vid_data[0]
+
+     for cat in cat_names:
+         cat_frames = []
+         contour_frames = []
+         frame_cat_cnts = {}
+
+         for i in range(imgs.size(0)):
+             frame_name = frame_indx[i]
+             frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+             frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+
+             frame_data = vid_data[2][frame_name]
+             obj_ids = list(frame_data.keys())
+
+             cat_cnt = 0
+
+             for j in range(len(obj_ids)):
+                 obj_id = obj_ids[j]
+                 obj_data = frame_data[obj_id]
+                 obj_bbox = obj_data['bbox']
+                 obj_valid = obj_data['valid']
+                 obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                 obj_cat = obj_data['category_name']
+
+                 if obj_cat == cat and obj_valid:
+                     cat_cnt += 1
+
+                     if not color_mask:
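+                         # Centroid of each contour via image moments:
+                         #   cx = m10 / m00, cy = m01 / m00, where m00 is the contour area;
+                         # degenerate contours (m00 == 0) fall back to the first contour point.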
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 3)
+                         for contour in contours:
+                             # Compute the contour center
+                             moments = cv2.moments(contour)
+                             if moments["m00"] != 0:  # a center can be computed
+                                 cx = int(moments["m10"] / moments["m00"])
+                                 cy = int(moments["m01"] / moments["m00"])
+                             else:
+                                 cx, cy = contour[0][0]  # fallback coordinate when no center exists
+
+                             # Text background (black backdrop)
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+                             text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                             text_w, text_h = text_size
+
+                             # Draw the text background (black)
+                             cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                           (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+
+                             # Draw the text (white)
+                             cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                         font, 1, (255, 255, 255), 2)
+
+                     else:
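+                         # Alpha-blend a translucent color over the mask region:
+                         #   out = (1 - alpha) * frame + alpha * color, with a small alpha
+                         # so the underlying pixels stay clearly visible.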
+                         alpha = 0.08
+
+                         colored_obj_mask = np.zeros_like(frame)
+                         colored_obj_mask[obj_mask == 1] = colors[j]
+                         frame[obj_mask == 1] = (
+                             (1 - alpha) * frame[obj_mask == 1]
+                             + alpha * colored_obj_mask[obj_mask == 1]
+                         )
+
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 2)
+                         cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+
+                         if len(contours) > 0:
+                             largest_contour = max(contours, key=cv2.contourArea)
+                             M = cv2.moments(largest_contour)
+                             if M["m00"] != 0:
+                                 center_x = int(M["m10"] / M["m00"])
+                                 center_y = int(M["m01"] / M["m00"])
+                             else:
+                                 center_x, center_y = 0, 0
+
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+
+                             font_scale = 0.9
+                             text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                             text_x = center_x - text_size[0] // 1  # horizontal anchor of the text
+                             text_y = center_y
+                             # text_y = center_y + text_size[1] // 2  # vertical center of the text
+
+                             # Compute the text background rectangle
+                             rect_start = (text_x - 5, text_y - text_size[1] - 5)  # top-left of the backdrop
+                             # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+                             rect_end = (text_x + text_size[0] + 5, text_y)
+
+                             cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                             cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
+
+             # plt.figure(figsize=(12, 8))
+             # plt.imshow(frame)
+             # plt.title(f"frame {frame_name}")
+             # plt.tight_layout()
+             # plt.axis('off')
+             # plt.show()
+
+             buffer = BytesIO()
+             frame = Image.fromarray(frame)
+             frame.save(buffer, format='jpeg')
+             buffer.seek(0)
+             cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+             frame_cat_cnts[frame_name] = cat_cnt
+
+             buffer.seek(0)  # Reuse buffer instead of creating a new one
+             buffer.truncate()
+             frame_for_contour = Image.fromarray(frame_for_contour)
+             frame_for_contour.save(buffer, format='jpeg')
+             buffer.seek(0)
+             contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+
+         encoded_frames[cat] = cat_frames
+         contoured_frames[cat] = contour_frames
+         vid_cat_cnts[cat] = frame_cat_cnts
+
+     return encoded_frames, vid_cat_cnts, contoured_frames
+
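+ # getCaption drives the per-category pipeline: encode the numbered frames,
+ # run a YES/NONE filter per frame, then request one dense, action-only
+ # caption for each frame that passes the filter.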
+ def getCaption(idx, color_mask=True):
+     vid_meta = metas[idx]
+     vid_data = train_dataset[idx]
+     vid_id = vid_meta['video']
+     print(f"vid id: {vid_id}\n")
+
+     frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
+     cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
+     all_captions = dict()
+
+     base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
+     marked = "mask with boundary" if color_mask else "boundary"
+
+     for cat_name in list(cat_names):
+
+         is_movable = False
+         if cat_name in ytvos_category_valid_list:
+             is_movable = True
+
+         if not is_movable:
+             print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
+
+         image_captions = {}
+         captioner = OpenAI()
+         cat_base64_frames = base64_frames[cat_name]
+         cont_base64_frames = contoured_frames[cat_name]
+
+         for i in range(len(cat_base64_frames)):
+             frame_name = frame_indx[i]
+             cont_base64_image = cont_base64_frames[i]
+             base64_image = cat_base64_frames[i]
+             should_filter = False
+             frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
+
+             if frame_cat_cnts >= 2:
+                 should_filter = True
+             else:
+                 print(f"Skipping {cat_name}: There is a single object or none.", end='\n\n')
+
+             if is_movable and should_filter:
+                 # Step 1: filtering
+                 print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+                 caption_filter_text = f"""
+                 You are a visual assistant analyzing a single frame from a video.
+                 In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
+
+                 Are {cat_name}s in the image performing all different and recognizable actions or postures?
+                 Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing towards, walking...), motion cues (inferred from the momentary stance or position),
+                 facial expressions, and any notable interactions with objects or other {cat_name}s or people.
+
+                 Only focus on obvious, prominent actions that can be reliably identified from this single frame.
+
+                 - Respond with "YES" if:
+                 1) Most of the {cat_name}s exhibit clearly different, unique actions or poses.
+                 2) You can see clearly visible differences in action and posture that an observer can identify at a glance.
+                 3) Each action is unambiguously recognizable and distinct.
+
+                 - Respond with "NONE" if:
+                 1) The actions or poses are not clearly differentiable or are too similar.
+                 2) They show no noticeable action beyond standing or minor movements.
+
+                 Answer strictly with either "YES" or "NONE".
+                 """
+
+                 response1 = captioner.chat.completions.create(
+                     model="chatgpt-4o-latest",
+                     messages=[
+                         {
+                             "role": "user",
+                             "content": [
+                                 {
+                                     "type": "text",
+                                     "text": caption_filter_text,
+                                 },
+                                 {
+                                     "type": "image_url",
+                                     "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                 }
+                             ],
+                         }
+                     ],
+                 )
+                 response_content = response1.choices[0].message.content
+                 should_caption = "yes" in response_content.lower()
+                 print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
+
+             else:
+                 should_caption = False
+
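+             # Note: dense_caption_prompt_1 below appears to be an earlier draft kept
+             # in this history snapshot; only dense_caption_prompt is sent to the model.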
+             # Step 2: generate the dense caption
+             dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
+             In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
+             I want to use your expressions to create an action-centric referring expression dataset.
+             Therefore, your expressions for these {cat_name}s should describe the unique action of each object.
+
+             1. Focus only on clear, unique, and prominent actions that distinguish each object.
+             2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+             3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+             4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+             5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+             6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
+             7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+             8. Include interactions with objects or other entities when they are prominent and observable.
+             9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+             10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
+             11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
+             12. Do not mention object IDs.
+             13. Use '{cat_name}' as the noun for the referring expressions.
+
+             Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+             Output referring expressions for each object id.
+             """
+
+             dense_caption_prompt = f"""
+             You are a visual assistant analyzing a single frame of a video.
+             In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
+             I want to use your expressions to create an action-centric referring expression dataset.
+             Please describe each {cat_name} using **clearly observable** and **specific** actions.
+
+             ## Guidelines:
+             1. Focus on visible, prominent actions only (e.g., running, pushing, grasping an object).
+             2. Avoid describing minor or ambiguous actions (e.g., slightly moving a paw).
+             3. Do not include subjective or speculative descriptions (e.g., “it seems excited” or “it might be preparing to jump”).
+             4. Do not use vague expressions like "interacting with something" or "engaging with another object."
+                Instead, specify the interaction in detail (e.g., "grabbing a stick," "pressing a button").
+             5. Use dynamic action verbs (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
+             6. If multiple {cat_name}s appear, ensure each description is detailed enough to differentiate their actions.
+             7. Base your description on the following action definitions:
+                 - Facial with object manipulation
+                 - General body movement, body position or pattern
+                 - Movements when interacting with a specific, named object (e.g., "kicking a ball" instead of "interacting with an object").
+                 - Body movements in person or animal interaction (e.g., "pushing another person" instead of "engaging with someone").
+
+             ## Output Format:
+             - For each labeled {cat_name}, output one line in the format:
+                 ID. action-oriented description
+
+             Example:
+             1. a bear grasping the edge of a wood with its front paws
+             2. the bear pushing another bear, leaning forward
+
+             **Do not include** appearance details (e.g., color, size, shape) or relative positioning (e.g., “on the left/right”).
+             **Do not mention object IDs** in the text of your sentence—just use them as labels for your output lines.
+             Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
+             For each labeled {cat_name}, output referring expressions for each object id.
+             """
+             if should_caption:
+                 response2 = captioner.chat.completions.create(
+                     model="gpt-4o-mini",
+                     messages=[
+                         {
+                             "role": "user",
+                             "content": [
+                                 {
+                                     "type": "text",
+                                     "text": dense_caption_prompt,
+                                 },
+                                 {
+                                     "type": "image_url",
+                                     "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                 },
+                             ],
+                         }
+                     ],
+                 )
+
+                 caption = response2.choices[0].message.content
+                 #print(f"{image_path} - {frame_name}: {caption}")
+             else:
+                 caption = None
+
+             image_captions[frame_name] = caption
+         all_captions[cat_name] = image_captions
+
+     # Finally, also prepare the valid object ids
+     valid_obj_ids = dict()
+
+     for cat in cat_names:
+         if cat in ytvos_category_valid_list:
+             obj_id_cat = vid_meta['obj_id_cat']
+             valid_cat_ids = []
+             for obj_id in list(obj_id_cat.keys()):
+                 if obj_id_cat[obj_id] == cat:
+                     valid_cat_ids.append(obj_id)
+             valid_obj_ids[cat] = valid_cat_ids
+
+     return vid_id, all_captions, valid_obj_ids
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+     parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
+     parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
+
+     args = parser.parse_args()
+
+     #================== Load the data ===================
+     # Full dataset
+     train_dataset = build_ytvos_ref(image_set = 'train', args = args)
+
+     # Metadata for the full dataset
+     metas = train_dataset.metas
+
+     # 8 candidate colors (RGB)
+     colors = [
+         (255, 0, 0),    # Red
+         (0, 255, 0),    # Green
+         (0, 0, 255),    # Blue
+         (255, 255, 0),  # Yellow
+         (255, 0, 255),  # Magenta
+         (0, 255, 255),  # Cyan
+         (128, 0, 128),  # Purple
+         (255, 165, 0)   # Orange
+     ]
+
+     ytvos_category_valid_list = [
+         'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+         'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+         'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+         'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+         'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+         'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+     ]
+
+     #================== Run GPT ===================
+     os.environ['OPENAI_API_KEY'] = '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'
+
+     result_captions = {}
+     result_valid_obj_ids = {}
+
+     for i in range(370):
+         vid_id, all_captions, valid_obj_ids = getCaption(i, True)
+
+         if vid_id not in result_captions:
+             result_captions[vid_id] = all_captions
+         if vid_id not in result_valid_obj_ids:
+             result_valid_obj_ids[vid_id] = valid_obj_ids
+
+     print("Finished!", flush=True)
+
+     with open(args.save_caption_path, "w") as file:
+         json.dump(result_captions, file, indent=4)
+
+     with open(args.save_valid_obj_ids_path, "w") as file:
+         json.dump(result_valid_obj_ids, file, indent=4)
.history/mbench/gpt_ref-ytvos_numbered_cy_20250201141952.py ADDED
@@ -0,0 +1,460 @@
+ import os
+ import sys
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+ import time
+
+ from os import path as osp
+ from io import BytesIO
+
+ from mbench.ytvos_ref import build as build_ytvos_ref
+ import argparse
+ import opts
+
+ from pathlib import Path
+ import skimage
+
+ import numpy as np
+ import pandas as pd
+ import regex as re
+ import json
+
+ import cv2
+ from PIL import Image, ImageDraw
+ import torch
+ from torchvision.transforms import functional as F
+
+ from skimage import measure # (pip install scikit-image)
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
+
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ from matplotlib.collections import PatchCollection
+ from matplotlib.patches import Rectangle
+ import textwrap
+
+
+ import ipywidgets as widgets
+ from IPython.display import display, clear_output
+
+ from openai import OpenAI
+ import base64
+
+ def number_objects_and_encode(idx, color_mask=False):
+     encoded_frames = {}
+     contoured_frames = {} # New dictionary for original images
+     vid_cat_cnts = {}
+
+     vid_meta = metas[idx]
+     vid_data = train_dataset[idx]
+     vid_id = vid_meta['video']
+     frame_indx = vid_meta['sample_indx']
+     cat_names = set(vid_meta['obj_id_cat'].values())
+     imgs = vid_data[0]
+
+     for cat in cat_names:
+         cat_frames = []
+         contour_frames = []
+         frame_cat_cnts = {}
+
+         for i in range(imgs.size(0)):
+             frame_name = frame_indx[i]
+             frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+             frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+
+             frame_data = vid_data[2][frame_name]
+             obj_ids = list(frame_data.keys())
+
+             cat_cnt = 0
+
+             for j in range(len(obj_ids)):
+                 obj_id = obj_ids[j]
+                 obj_data = frame_data[obj_id]
+                 obj_bbox = obj_data['bbox']
+                 obj_valid = obj_data['valid']
+                 obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                 obj_cat = obj_data['category_name']
+
+                 if obj_cat == cat and obj_valid:
+                     cat_cnt += 1
+
+                     if not color_mask:
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 3)
+                         for contour in contours:
+                             # Compute the contour center
+                             moments = cv2.moments(contour)
+                             if moments["m00"] != 0:  # a center can be computed
+                                 cx = int(moments["m10"] / moments["m00"])
+                                 cy = int(moments["m01"] / moments["m00"])
+                             else:
+                                 cx, cy = contour[0][0]  # fallback coordinate when no center exists
+
+                             # Text background (black backdrop)
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+                             text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                             text_w, text_h = text_size
+
+                             # Draw the text background (black)
+                             cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                           (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+
+                             # Draw the text (white)
+                             cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                         font, 1, (255, 255, 255), 2)
+
+                     else:
+                         alpha = 0.08
+
+                         colored_obj_mask = np.zeros_like(frame)
+                         colored_obj_mask[obj_mask == 1] = colors[j]
+                         frame[obj_mask == 1] = (
+                             (1 - alpha) * frame[obj_mask == 1]
+                             + alpha * colored_obj_mask[obj_mask == 1]
+                         )
+
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 2)
+                         cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+
+                         if len(contours) > 0:
+                             largest_contour = max(contours, key=cv2.contourArea)
+                             M = cv2.moments(largest_contour)
+                             if M["m00"] != 0:
+                                 center_x = int(M["m10"] / M["m00"])
+                                 center_y = int(M["m01"] / M["m00"])
+                             else:
+                                 center_x, center_y = 0, 0
+
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+
+                             font_scale = 0.9
+                             text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                             text_x = center_x - text_size[0] // 1  # horizontal anchor of the text
+                             text_y = center_y
+                             # text_y = center_y + text_size[1] // 2  # vertical center of the text
+
+                             # Compute the text background rectangle
+                             rect_start = (text_x - 5, text_y - text_size[1] - 5)  # top-left of the backdrop
+                             # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+                             rect_end = (text_x + text_size[0] + 5, text_y)
+
+                             cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                             cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
+
+             # plt.figure(figsize=(12, 8))
+             # plt.imshow(frame)
+             # plt.title(f"frame {frame_name}")
+             # plt.tight_layout()
+             # plt.axis('off')
+             # plt.show()
+
+             buffer = BytesIO()
+             frame = Image.fromarray(frame)
+             frame.save(buffer, format='jpeg')
+             buffer.seek(0)
+             cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+             frame_cat_cnts[frame_name] = cat_cnt
+
+             buffer.seek(0)  # Reuse buffer instead of creating a new one
+             buffer.truncate()
+             frame_for_contour = Image.fromarray(frame_for_contour)
+             frame_for_contour.save(buffer, format='jpeg')
+             buffer.seek(0)
+             contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+
+         encoded_frames[cat] = cat_frames
+         contoured_frames[cat] = contour_frames
+         vid_cat_cnts[cat] = frame_cat_cnts
+
+     return encoded_frames, vid_cat_cnts, contoured_frames
+
+
+ def getCaption(idx, model='gpt-4o', color_mask=True):
+     vid_meta = metas[idx]
+     vid_data = train_dataset[idx]
+     vid_id = vid_meta['video']
+     print(f"vid id: {vid_id}\n")
+
+     frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
+     cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
+     all_captions = dict()
+
+     base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
+     #marked = "mask with boundary" if color_mask else "boundary"
+
+     for cat_name in list(cat_names):
+
+         is_movable = False
+         if cat_name in ytvos_category_valid_list:
+             is_movable = True
+
+         if not is_movable:
+             print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
+
+         image_captions = {}
+         captioner = OpenAI()
+         cat_base64_frames = base64_frames[cat_name]
+         cont_base64_frames = contoured_frames[cat_name]
+
+         for i in range(len(cat_base64_frames)):
+             frame_name = frame_indx[i]
+             cont_base64_image = cont_base64_frames[i]
+             base64_image = cat_base64_frames[i]
+             should_filter = False
+             frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
+
+             if frame_cat_cnts >= 2:
+                 should_filter = True
+             else:
+                 print(f"Skipping {cat_name}: There is a single object or none.", end='\n\n')
+
+             if is_movable and should_filter:
+                 # Step 1: filtering
+                 print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+                 caption_filter_text = f"""
+                 You are a visual assistant analyzing a single frame from a video.
+                 In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
+
+                 Are {cat_name}s in the image performing all different and recognizable actions or postures?
+                 Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
+                 facial expressions, and any notable interactions with objects or other {cat_name}s or people.
+
+                 Only focus on obvious, prominent actions that can be reliably identified from this single frame.
+
+                 - Respond with "YES" if:
+                 1) Most of the {cat_name}s exhibit clearly different, unique actions or poses.
+                 (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
+                 2) You can see clearly visible differences in action and posture that an observer can identify at a glance.
+                 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
+
+                 - Respond with "NONE" if:
+                 1) The actions or poses are not clearly differentiable or are too similar.
+                 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
+                 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion.
+
+                 Answer strictly with either "YES" or "NONE".
+                 """
+
+                 response1 = captioner.chat.completions.create(
+                     model=model,
+                     messages=[
+                         {
+                             "role": "user",
+                             "content": [
+                                 {
+                                     "type": "text",
+                                     "text": caption_filter_text,
+                                 },
+                                 {
+                                     "type": "image_url",
+                                     "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                 }
+                             ],
+                         }
+                     ],
+                 )
+                 response_content = response1.choices[0].message.content
+                 should_caption = "yes" in response_content.lower()
+                 print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
+
+             else:
+                 should_caption = False
+
274
+ #2단계: dense caption 만들기
275
+ dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
276
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
277
+ I want to use your expressions to create a action-centric referring expression dataset.
278
+ Therefore, your expressions for these {cat_name}s should describe unique action of each object.
279
+
280
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
281
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
282
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
283
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
284
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
285
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
286
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
287
+ 8. Include interactions with objects or other entities when they are prominent and observable.
288
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
289
+ 10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
290
+ 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
291
+ 12. Do not mention object IDs.
292
+ 13. Use '{cat_name}' as the noun for the referring expressions.
293
+
294
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
295
+ Output referring expressions for each object id.
296
+ """
297
+
298
+ dense_caption_prompt = f"""
299
+ You are a visual assistant analyzing a single frame of a video.
300
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
301
+
302
+ I want to use your expressions to create an **action-centric referring expression** dataset.
303
+ Please describe each {cat_name} using **clearly observable** and **specific** actions.
304
+
305
+ ---
306
+ ## Guidelines:
307
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
308
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
309
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
310
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
311
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
312
+ 6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions.
313
+ 7. Base your description on these action definitions:
314
+ - Avoid using term 'minimal' or 'slightly'.
315
+ - General body movement, body position, or a prominent pattern (e.g. "lifting head up", "facing towards", "showing its back").
316
+ - Details such as motion and intention, or facial expression combined with object manipulation.
317
+ - Movements with objects or other entities when they are prominent and observable; the expression should be specific.
318
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
319
+ ---
320
+
321
+ ## Output Format:
322
+ - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and use the following format:
323
+ object id. an action-oriented description using {cat_name} as the subject noun
324
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
325
+ - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as another object/animal).
326
+
327
+ ### Example
328
+ If the frame has 2 labeled bears, your output should look like:
329
+ 1. the bear reaching his right arm while leaning forward to capture the prey
330
+ 2. a bear standing upright facing right, touching the bike aside
331
+
332
+ ---
333
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
334
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
335
+ **Do not include markdown** in the output.
336
+ Keep in mind that you should not group the objects (e.g., "2-5. people: xxx"); be sure to describe each object separately (one by one).
337
+ For each labeled {cat_name}, output referring expressions for each object id.
338
+ """
339
+ MAX_RETRIES = 2
340
+ retry_count = 0
341
+
342
+ if should_caption:
343
+ while retry_count < MAX_RETRIES:
344
+
345
+ response2 = captioner.chat.completions.create(
346
+ model=model,
347
+ messages=[
348
+ {
349
+ "role": "user",
350
+ "content": [
351
+ {
352
+ "type": "text",
353
+ "text": dense_caption_prompt,
354
+ },
355
+ {
356
+ "type": "image_url",
357
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
358
+ },
359
+ ],
360
+ }
361
+ ],
362
+ )
363
+
364
+ # caption = response2.choices[0].message.content
365
+ #print(f"{image_path} - {frame_name}: {caption}")
366
+
367
+ caption = response2.choices[0].message.content.strip()
368
+ caption_lower = caption.lower().lstrip()
369
+
370
+ if caption_lower.startswith("1.") and not any(
371
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
372
+ ):
373
+ break
374
+
375
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
376
+ retry_count += 1
377
+ time.sleep(2)
378
+
379
+ if retry_count == MAX_RETRIES:
380
+ caption = None
381
+ print("Max retries reached. Caption generation failed.")
382
+
383
+ else:
384
+ caption = None
385
+
386
+ image_captions[frame_name] = caption
387
+ all_captions[cat_name] = image_captions
388
+
389
+ # final : also prepare valid object ids
390
+ valid_obj_ids = dict()
391
+
392
+ for cat in cat_names:
393
+ if cat in ytvos_category_valid_list:
394
+ obj_id_cat = vid_meta['obj_id_cat']
395
+ valid_cat_ids = []
396
+ for obj_id in list(obj_id_cat.keys()):
397
+ if obj_id_cat[obj_id] == cat:
398
+ valid_cat_ids.append(obj_id)
399
+ valid_obj_ids[cat] = valid_cat_ids
400
+
401
+ return vid_id, all_captions, valid_obj_ids
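+
+ # --- Editorial sketch, not part of the original script ---
+ # The prompts above request exactly one "ID. description" line per object.
+ # A hypothetical helper like parse_numbered_captions could turn a raw caption
+ # string into {object_id: expression}; it assumes the model followed the
+ # numbered format and that `re` (the regex module, as in the sibling
+ # snapshots' imports) is available.
+ def parse_numbered_captions(caption):
+     parsed = {}
+     if caption is None:
+         return parsed
+     for line in caption.splitlines():
+         m = re.match(r'\s*(\d+)\.\s*(.+)', line)  # e.g. "1. the bear is ..."
+         if m:
+             parsed[m.group(1)] = m.group(2).strip()
+     return parsed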
402
+
403
+
404
+
405
+ if __name__ == '__main__':
406
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
407
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
408
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
409
+
410
+ args = parser.parse_args()
411
+
412
+ #================== Load data ===================
413
+ # Full dataset
414
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
415
+
416
+ # Full dataset metadata
417
+ metas = train_dataset.metas
418
+
419
+ # 8 color candidates (RGB format)
420
+ colors = [
421
+ (255, 0, 0), # Red
422
+ (0, 255, 0), # Green
423
+ (0, 0, 255), # Blue
424
+ (255, 255, 0), # Yellow
425
+ (255, 0, 255), # Magenta
426
+ (0, 255, 255), # Cyan
427
+ (128, 0, 128), # Purple
428
+ (255, 165, 0) # Orange
429
+ ]
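+ # (Editorial note) number_objects_and_encode indexes this palette as colors[j],
+ # so a frame with more than 8 annotated objects would raise an IndexError;
+ # colors[j % len(colors)] is the safer pattern if that can happen.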
430
+
431
+ ytvos_category_valid_list = [
432
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
433
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
434
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
435
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
436
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
437
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
438
+ ]
439
+
440
+ #================== Run GPT ===================
441
+ os.environ.setdefault('OPENAI_API_KEY', '<YOUR_OPENAI_API_KEY>') # placeholder; read the real key from the environment instead of committing it
442
+
443
+ result_captions = {}
444
+ result_valid_obj_ids = {}
445
+
446
+ for i in range(len(metas)): # iterate over every video (was hard-coded as range(370))
447
+ vid_id, all_captions, valid_obj_ids = getCaption(i)
448
+
449
+ if vid_id not in result_captions:
450
+ result_captions[vid_id] = all_captions
451
+ if vid_id not in result_valid_obj_ids:
452
+ result_valid_obj_ids[vid_id] = valid_obj_ids
453
+
454
+ print("Finished!", flush=True)
455
+
456
+ with open(args.save_caption_path, "w") as file:
457
+ json.dump(result_captions, file, indent=4)
458
+
459
+ with open(args.save_valid_obj_ids_path, "w") as file:
460
+ json.dump(result_valid_obj_ids, file, indent=4)
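+
+ # Usage sketch (editorial): inspecting the saved outputs afterwards.
+ # Paths assume the default argument values above.
+ # with open("mbench/numbered_captions.json") as f:
+ #     captions = json.load(f)
+ # vid, per_cat = next(iter(captions.items()))
+ # print(vid, list(per_cat.keys()))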
.history/mbench/gpt_ref-ytvos_numbered_cy_20250202183102.py ADDED
@@ -0,0 +1,460 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+
9
+ from mbench.ytvos_ref import build as build_ytvos_ref
10
+ import argparse
11
+ import opts
12
+
13
+ import sys
14
+ from pathlib import Path
15
+ import os
16
+ from os import path as osp
17
+ import skimage
18
+ from io import BytesIO
19
+
20
+ import numpy as np
21
+ import pandas as pd
22
+ import regex as re
23
+ import json
24
+
25
+ import cv2
26
+ from PIL import Image, ImageDraw
27
+ import torch
28
+ from torchvision.transforms import functional as F
29
+
30
+ from skimage import measure # (pip install scikit-image)
31
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
32
+
33
+ import matplotlib.pyplot as plt
34
+ import matplotlib.patches as patches
35
+ from matplotlib.collections import PatchCollection
36
+ from matplotlib.patches import Rectangle
37
+ import textwrap
38
+
39
+
40
+ import ipywidgets as widgets
41
+ from IPython.display import display, clear_output
42
+
43
+ from openai import OpenAI
44
+ import base64
45
+ import json
46
+
47
+ def number_objects_and_encode(idx, color_mask=False):
48
+ encoded_frames = {}
49
+ contoured_frames = {} # New dictionary for original images
50
+ vid_cat_cnts = {}
51
+
52
+ vid_meta = metas[idx]
53
+ vid_data = train_dataset[idx]
54
+ vid_id = vid_meta['video']
55
+ frame_indx = vid_meta['sample_indx']
56
+ cat_names = set(vid_meta['obj_id_cat'].values())
57
+ imgs = vid_data[0]
58
+
59
+ for cat in cat_names:
60
+ cat_frames = []
61
+ contour_frames = []
62
+ frame_cat_cnts = {}
63
+
64
+ for i in range(imgs.size(0)):
65
+ frame_name = frame_indx[i]
66
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
67
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
68
+
69
+ frame_data = vid_data[2][frame_name]
70
+ obj_ids = list(frame_data.keys())
71
+
72
+ cat_cnt = 0
73
+
74
+ for j in range(len(obj_ids)):
75
+ obj_id = obj_ids[j]
76
+ obj_data = frame_data[obj_id]
77
+ obj_bbox = obj_data['bbox']
78
+ obj_valid = obj_data['valid']
79
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
80
+ obj_cat = obj_data['category_name']
81
+
82
+ if obj_cat == cat and obj_valid:
83
+ cat_cnt += 1
84
+
85
+ if color_mask == False:
86
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
87
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
88
+ for contour in contours: # avoid shadowing the outer frame index i (the enumerate index was unused)
89
+ # Compute the contour centroid
90
+ moments = cv2.moments(contour)
91
+ if moments["m00"] != 0: # 중심 계산 가능 여부 확인
92
+ cx = int(moments["m10"] / moments["m00"])
93
+ cy = int(moments["m01"] / moments["m00"])
94
+ else:
95
+ cx, cy = contour[0][0] # fall back to the first contour point when the centroid is undefined
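+ # (Editorial note) cv2.moments returns spatial moments; for a closed contour
+ # the centroid is (m10/m00, m01/m00), the area-weighted mean position, which
+ # is why m00 (the area) must be non-zero above.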
96
+
97
+ # Black background box behind the ID label
98
+ font = cv2.FONT_HERSHEY_SIMPLEX
99
+ text = obj_id
100
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
101
+ text_w, text_h = text_size
102
+
103
+ # Draw the label background (black)
104
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
105
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
106
+
107
+ # Draw the label text (white)
108
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
109
+ font, 1, (255, 255, 255), 2)
110
+
111
+ else:
112
+ alpha = 0.08
113
+
114
+ colored_obj_mask = np.zeros_like(frame)
115
+ colored_obj_mask[obj_mask == 1] = colors[j]
116
+ frame[obj_mask == 1] = (
117
+ (1 - alpha) * frame[obj_mask == 1]
118
+ + alpha * colored_obj_mask[obj_mask == 1]
119
+ )
120
+
121
+
122
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
123
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
124
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
125
+
126
+
127
+
128
+ if len(contours) > 0:
129
+ largest_contour = max(contours, key=cv2.contourArea)
130
+ M = cv2.moments(largest_contour)
131
+ if M["m00"] != 0:
132
+ center_x = int(M["m10"] / M["m00"])
133
+ center_y = int(M["m01"] / M["m00"])
134
+ else:
135
+ center_x, center_y = 0, 0
136
+
137
+ font = cv2.FONT_HERSHEY_SIMPLEX
138
+ text = obj_id
139
+
140
+ font_scale = 0.9
141
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
142
+ text_x = center_x - text_size[0] // 1 # horizontal label offset (note: // 1 is a no-op; // 2 would actually center the text)
143
+ text_y = center_y
144
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
145
+
146
+ # Compute the label background rectangle
147
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
148
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
149
+ rect_end = (text_x + text_size[0] + 5, text_y)
150
+
151
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
152
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
153
+
154
+ # plt.figure(figsize=(12, 8))
155
+ # plt.imshow(frame)
156
+ # plt.title(f"frame {frame_name}")
157
+ # plt.tight_layout()
158
+ # plt.axis('off')
159
+ # plt.show()
160
+
161
+ buffer = BytesIO()
162
+ frame = Image.fromarray(frame)
163
+ frame.save(buffer, format='jpeg')
164
+ buffer.seek(0)
165
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
166
+ frame_cat_cnts[frame_name] = cat_cnt
167
+
168
+ buffer.seek(0) # Reuse buffer instead of creating a new one
169
+ buffer.truncate()
170
+ frame_for_contour = Image.fromarray(frame_for_contour)
171
+ frame_for_contour.save(buffer, format='jpeg')
172
+ buffer.seek(0)
173
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
174
+
175
+ encoded_frames[cat] = cat_frames
176
+ contoured_frames[cat] = contour_frames
177
+ vid_cat_cnts[cat] = frame_cat_cnts
178
+
179
+ return encoded_frames, vid_cat_cnts, contoured_frames
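+
+ # Decoding sketch (editorial, not in the original script): each returned entry
+ # is a base64-encoded JPEG string, so a frame can be recovered for visual
+ # inspection. Image and BytesIO are already imported above.
+ def decode_frame(b64_string):
+     return Image.open(BytesIO(base64.b64decode(b64_string)))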
180
+
181
+
182
+ def getCaption(idx, model='gpt-4o', color_mask=True):
183
+ vid_meta = metas[idx]
184
+ vid_data = train_dataset[idx]
185
+ vid_id = vid_meta['video']
186
+ print(f"vid id: {vid_id}\n")
187
+
188
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
189
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
190
+ all_captions = dict()
191
+
192
+ base64_frames, vid_cat_cnts, contoured_frames = number_objects_and_encode(idx, color_mask)
193
+ #marked = "mask with boundary" if color_mask else "boundary"
194
+
195
+ for cat_name in list(cat_names) :
196
+
197
+ is_movable = False
198
+ if cat_name in ytvos_category_valid_list :
199
+ is_movable = True
200
+
201
+ if not is_movable:
202
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
203
+
204
+
205
+ image_captions = {}
206
+ captioner = OpenAI()
207
+ cat_base64_frames = base64_frames[cat_name]
208
+ cont_base64_frames = contoured_frames[cat_name]
209
+
210
+ for i in range(len(cat_base64_frames)):
211
+ frame_name = frame_indx[i]
212
+ cont_base64_image = cont_base64_frames[i]
213
+ base64_image = cat_base64_frames[i]
214
+ should_filter = False
215
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
216
+
217
+ if frame_cat_cnts >= 2:
218
+ should_filter = True
219
+ else:
220
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
221
+
222
+ if is_movable and should_filter:
223
+ # Step 1: filtering
224
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
225
+ caption_filter_text = f"""
226
+ You are a visual assistant analyzing a single frame from a video.
227
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
228
+
229
+ Are the {cat_name}s in the image all performing different and recognizable actions or postures?
230
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
231
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
232
+
233
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
234
+
235
+ - Respond with "YES" if:
236
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
237
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
238
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
239
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
240
+
241
+ - Respond with "NONE" if:
242
+ 1) The actions or pose are not clearly differentiable or too similar.
243
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
244
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion.
245
+
246
+ Answer strictly with either "YES" or "NONE".
247
+ """
248
+
249
+ response1 = captioner.chat.completions.create(
250
+ model=model,
251
+ messages=[
252
+ {
253
+ "role": "user",
254
+ "content": [
255
+ {
256
+ "type": "text",
257
+ "text": caption_filter_text,
258
+ },
259
+ {
260
+ "type": "image_url",
261
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
262
+ }
263
+ ],
264
+ }
265
+ ],
266
+ )
267
+ response_content = response1.choices[0].message.content
268
+ should_caption = "yes" in response_content.lower()
269
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
270
+
271
+ else:
272
+ should_caption = False
273
+
274
+ # Step 2: generate dense captions
275
+ dense_caption_prompt_1 = f"""You are a visual assistant that can analyze a single frame of a video and create referring expressions for each object.
276
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
277
+ I want to use your expressions to create an action-centric referring expression dataset.
278
+ Therefore, your expressions for these {cat_name}s should describe the unique action of each object.
279
+
280
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
281
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
282
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
283
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
284
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
285
+ 6. Avoid overly detailed or speculative descriptions such as 'slightly moving its mouth' or 'appears to be anticipating'.
286
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
287
+ 8. Include interactions with objects or other entities when they are prominent and observable.
288
+ 9. If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
289
+ 10. Do not include descriptions of appearance such as clothes, color, size, shape etc.
290
+ 11. Do not include relative position between objects such as 'the left elephant' because left/right can be ambiguous.
291
+ 12. Do not mention object IDs.
292
+ 13. Use '{cat_name}' as the noun for the referring expressions.
293
+
294
+ Keep in mind that you should not group the objects (e.g., "2-5. people: xxx"); be sure to describe each object separately (one by one).
295
+ Output referring expressions for each object id.
296
+ """
297
+
298
+ dense_caption_prompt = f"""
299
+ You are a visual assistant analyzing a single frame of a video.
300
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary.
301
+
302
+ I want to use your expressions to create an **action-centric referring expression** dataset.
303
+ Please describe each {cat_name} using **clearly observable** and **specific** actions.
304
+
305
+ ---
306
+ ## Guidelines:
307
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
308
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
309
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
310
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
311
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
312
+ 6. If multiple {cat_name}s appear, ensure each description **differentiates** their actions.
313
+ 7. Base your description on these action definitions:
314
+ - Avoid using term 'minimal' or 'slightly'.
315
+ - General body movement, body position, or a prominent pattern (e.g. "lifting head up", "facing towards", "showing its back").
316
+ - Details such as motion and intention, or facial expression combined with object manipulation.
317
+ - Movements with objects or other entities when they are prominent and observable; the expression should be specific.
318
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
319
+ ---
320
+
321
+ ## Output Format:
322
+ - For each labeled {cat_name}, output **exactly one line**. Your answer should contain details and use the following format:
323
+ object id. an action-oriented description using {cat_name} as the subject noun
324
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
325
+ - **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as another object/animal).
326
+
327
+ ### Example
328
+ If the frame has 2 labeled bears, your output should look like:
329
+ 1. the bear reaching his right arm while leaning forward to capture the prey
330
+ 2. a bear standing upright facing right, touching the bike aside
331
+
332
+ ---
333
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
334
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
335
+ **Do not include markdown** in the output.
336
+ Keep in mind that you should not group the objects (e.g., "2-5. people: xxx"); be sure to describe each object separately (one by one).
337
+ For each labeled {cat_name}, output referring expressions for each object id.
338
+ """
339
+ MAX_RETRIES = 2
340
+ retry_count = 0
341
+
342
+ if should_caption:
343
+ while retry_count < MAX_RETRIES:
344
+
345
+ response2 = captioner.chat.completions.create(
346
+ model=model,
347
+ messages=[
348
+ {
349
+ "role": "user",
350
+ "content": [
351
+ {
352
+ "type": "text",
353
+ "text": dense_caption_prompt,
354
+ },
355
+ {
356
+ "type": "image_url",
357
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
358
+ },
359
+ ],
360
+ }
361
+ ],
362
+ )
363
+
364
+ # caption = response2.choices[0].message.content
365
+ #print(f"{image_path} - {frame_name}: {caption}")
366
+
367
+ caption = response2.choices[0].message.content.strip()
368
+ caption_lower = caption.lower().lstrip()
369
+
370
+ if caption_lower.startswith("1.") and not any(
371
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
372
+ ):
373
+ break
374
+
375
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
376
+ retry_count += 1
377
+ time.sleep(2)
378
+
379
+ if retry_count == MAX_RETRIES:
380
+ caption = None
381
+ print("Max retries reached. Caption generation failed.")
382
+
383
+ else:
384
+ caption = None
385
+
386
+ image_captions[frame_name] = caption
387
+ all_captions[cat_name] = image_captions
388
+
389
+ # final : also prepare valid object ids
390
+ valid_obj_ids = dict()
391
+
392
+ for cat in cat_names:
393
+ if cat in ytvos_category_valid_list:
394
+ obj_id_cat = vid_meta['obj_id_cat']
395
+ valid_cat_ids = []
396
+ for obj_id in list(obj_id_cat.keys()):
397
+ if obj_id_cat[obj_id] == cat:
398
+ valid_cat_ids.append(obj_id)
399
+ valid_obj_ids[cat] = valid_cat_ids
400
+
401
+ return vid_id, all_captions, valid_obj_ids
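+
+ # Validation sketch (editorial): the retry loop above accepts a caption only if
+ # it starts with "1." and contains no refusal phrase. A hypothetical predicate
+ # capturing the same rule:
+ def is_valid_caption(caption):
+     lowered = caption.lower().lstrip()
+     refusals = ["i'm sorry", "please", "can't help"]
+     return lowered.startswith("1.") and not any(p in lowered for p in refusals)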
402
+
403
+
404
+
405
+ if __name__ == '__main__':
406
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
407
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions.json")
408
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids.json")
409
+
410
+ args = parser.parse_args()
411
+
412
+ #================== Load data ===================
413
+ # Full dataset
414
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
415
+
416
+ # Full dataset metadata
417
+ metas = train_dataset.metas
418
+
419
+ # 8 color candidates (RGB format)
420
+ colors = [
421
+ (255, 0, 0), # Red
422
+ (0, 255, 0), # Green
423
+ (0, 0, 255), # Blue
424
+ (255, 255, 0), # Yellow
425
+ (255, 0, 255), # Magenta
426
+ (0, 255, 255), # Cyan
427
+ (128, 0, 128), # Purple
428
+ (255, 165, 0) # Orange
429
+ ]
430
+
431
+ ytvos_category_valid_list = [
432
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
433
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
434
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
435
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
436
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
437
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
438
+ ]
439
+
440
+ #================== Run GPT ===================
441
+ os.environ.setdefault('OPENAI_API_KEY', '<YOUR_OPENAI_API_KEY>') # placeholder; read the real key from the environment instead of committing it
442
+
443
+ result_captions = {}
444
+ result_valid_obj_ids = {}
445
+
446
+ for i in range(len(metas)): # iterate over every video (was hard-coded as range(370))
447
+ vid_id, all_captions, valid_obj_ids = getCaption(i, color_mask=False)
448
+
449
+ if vid_id not in result_captions:
450
+ result_captions[vid_id] = all_captions
451
+ if vid_id not in result_valid_obj_ids:
452
+ result_valid_obj_ids[vid_id] = valid_obj_ids
453
+
454
+ print("Finished!", flush=True)
455
+
456
+ with open(args.save_caption_path, "w") as file:
457
+ json.dump(result_captions, file, indent=4)
458
+
459
+ with open(args.save_valid_obj_ids_path, "w") as file:
460
+ json.dump(result_valid_obj_ids, file, indent=4)
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207172804.py ADDED
@@ -0,0 +1,656 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+ import random
9
+
10
+ from mbench.ytvos_ref import build as build_ytvos_ref
11
+ import argparse
12
+ import opts
13
+
14
+ import sys
15
+ from pathlib import Path
16
+ import os
17
+ from os import path as osp
18
+ import skimage
19
+ from io import BytesIO
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import regex as re
24
+ import json
25
+
26
+ import cv2
27
+ from PIL import Image, ImageDraw
28
+ import torch
29
+ from torchvision.transforms import functional as F
30
+
31
+ from skimage import measure # (pip install scikit-image)
32
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
33
+
34
+ import matplotlib.pyplot as plt
35
+ import matplotlib.patches as patches
36
+ from matplotlib.collections import PatchCollection
37
+ from matplotlib.patches import Rectangle
38
+ import textwrap
39
+
40
+
41
+ import ipywidgets as widgets
42
+ from IPython.display import display, clear_output
43
+
44
+ from openai import OpenAI
45
+ import base64
46
+ import json
47
+ import requests
48
+ from openai import APIConnectionError, OpenAIError # openai>=1.0 exposes these at the top level; the openai.error module was removed
49
+
50
+ def number_objects_and_encode_old(idx, color_mask=False):
51
+ encoded_frames = {}
52
+ contoured_frames = {} # New dictionary for original images
53
+ vid_cat_cnts = {}
54
+
55
+ vid_meta = metas[idx]
56
+ vid_data = train_dataset[idx]
57
+ vid_id = vid_meta['video']
58
+ frame_indx = vid_meta['sample_indx']
59
+ cat_names = set(vid_meta['obj_id_cat'].values())
60
+ imgs = vid_data[0]
61
+
62
+ for cat in cat_names:
63
+ cat_frames = []
64
+ contour_frames = []
65
+ frame_cat_cnts = {}
66
+
67
+ for i in range(imgs.size(0)):
68
+ frame_name = frame_indx[i]
69
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
70
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
71
+
72
+ frame_data = vid_data[2][frame_name]
73
+ obj_ids = list(frame_data.keys())
74
+
75
+ cat_cnt = 0
76
+
77
+ for j in range(len(obj_ids)):
78
+ obj_id = obj_ids[j]
79
+ obj_data = frame_data[obj_id]
80
+ obj_bbox = obj_data['bbox']
81
+ obj_valid = obj_data['valid']
82
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
83
+ obj_cat = obj_data['category_name']
84
+
85
+ if obj_cat == cat and obj_valid:
86
+ cat_cnt += 1
87
+
88
+ if color_mask == False:
89
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
90
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
91
+ for contour in contours: # avoid shadowing the outer frame index i (the enumerate index was unused)
92
+ moments = cv2.moments(contour)
93
+ if moments["m00"] != 0:
94
+ cx = int(moments["m10"] / moments["m00"])
95
+ cy = int(moments["m01"] / moments["m00"])
96
+ else:
97
+ cx, cy = contour[0][0]
98
+
99
+ font = cv2.FONT_HERSHEY_SIMPLEX
100
+ text = obj_id
101
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
102
+ text_w, text_h = text_size
103
+
104
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
105
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
106
+
107
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
108
+ font, 1, (255, 255, 255), 2)
109
+
110
+ else:
111
+ alpha = 0.08
112
+
113
+ colored_obj_mask = np.zeros_like(frame)
114
+ colored_obj_mask[obj_mask == 1] = colors[j]
115
+ frame[obj_mask == 1] = (
116
+ (1 - alpha) * frame[obj_mask == 1]
117
+ + alpha * colored_obj_mask[obj_mask == 1]
118
+ )
119
+
120
+
121
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
122
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
123
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
124
+
125
+ if len(contours) > 0:
126
+ largest_contour = max(contours, key=cv2.contourArea)
127
+ M = cv2.moments(largest_contour)
128
+ if M["m00"] != 0:
129
+ center_x = int(M["m10"] / M["m00"])
130
+ center_y = int(M["m01"] / M["m00"])
131
+ else:
132
+ center_x, center_y = 0, 0
133
+
134
+ font = cv2.FONT_HERSHEY_SIMPLEX
135
+ text = obj_id
136
+
137
+ font_scale = 0.9
138
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
139
+ text_x = center_x - text_size[0] // 1
140
+ text_y = center_y
141
+
142
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
143
+ rect_end = (text_x + text_size[0] + 5, text_y)
144
+
145
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
146
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
147
+
148
+ # plt.figure(figsize=(12, 8))
149
+ # plt.imshow(frame)
150
+ # plt.title(f"frame {frame_name}")
151
+ # plt.tight_layout()
152
+ # plt.axis('off')
153
+ # plt.show()
154
+
155
+ buffer = BytesIO()
156
+ frame = Image.fromarray(frame)
157
+ frame.save(buffer, format='jpeg')
158
+ buffer.seek(0)
159
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
160
+ frame_cat_cnts[frame_name] = cat_cnt
161
+
162
+ buffer.seek(0) # Reuse buffer instead of creating a new one
163
+ buffer.truncate()
164
+ frame_for_contour = Image.fromarray(frame_for_contour)
165
+ frame_for_contour.save(buffer, format='jpeg')
166
+ buffer.seek(0)
167
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
168
+
169
+ encoded_frames[cat] = cat_frames
170
+ contoured_frames[cat] = contour_frames
171
+ vid_cat_cnts[cat] = frame_cat_cnts
172
+
173
+ return encoded_frames, contoured_frames, vid_cat_cnts
174
+
175
+
176
+ def number_objects_and_encode(idx, color_mask=False):
177
+ encoded_frames = {}
178
+ contoured_frames = {} # New dictionary for original images
179
+ vid_cat_cnts = {}
180
+
181
+ vid_meta = metas[idx]
182
+ vid_data = train_dataset[idx]
183
+ vid_id = vid_meta['video']
184
+ frame_indx = vid_meta['sample_indx']
185
+ cat_names = set(vid_meta['obj_id_cat'].values())
186
+ imgs = vid_data[0]
187
+
188
+ for cat in cat_names:
189
+ cat_frames = []
190
+ contour_frames = []
191
+ frame_cat_cnts = {}
192
+
193
+ for i in range(imgs.size(0)):
194
+ frame_name = frame_indx[i]
195
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
196
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
197
+
198
+ frame_data = vid_data[2][frame_name]
199
+ obj_ids = list(frame_data.keys())
200
+
201
+ cat_cnt = 0
202
+
203
+ for j in range(len(obj_ids)):
204
+ obj_id = obj_ids[j]
205
+ obj_data = frame_data[obj_id]
206
+ obj_bbox = obj_data['bbox']
207
+ obj_valid = obj_data['valid']
208
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
209
+ obj_cat = obj_data['category_name']
210
+
211
+ if obj_cat == cat and obj_valid:
212
+ cat_cnt += 1
213
+
214
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
215
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
216
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
217
+
218
+ if len(contours) > 0:
219
+ largest_contour = max(contours, key=cv2.contourArea)
220
+ M = cv2.moments(largest_contour)
221
+ if M["m00"] != 0:
222
+ center_x = int(M["m10"] / M["m00"])
223
+ center_y = int(M["m01"] / M["m00"])
224
+ else:
225
+ center_x, center_y = 0, 0
226
+
227
+ font = cv2.FONT_HERSHEY_SIMPLEX
228
+ text = obj_id
229
+ font_scale = 1.2
230
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
231
+ text_x = center_x - text_size[0] // 1
232
+ text_y = center_y
233
+
234
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
235
+ rect_end = (text_x + text_size[0] + 5, text_y + 3)
236
+
237
+ contour_thickness = 1
238
+ rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
239
+ rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
240
+
241
+ cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
242
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
243
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
244
+
245
+
246
+ if color_mask:
247
+ alpha = 0.08
248
+ colored_obj_mask = np.zeros_like(frame)
249
+ colored_obj_mask[obj_mask == 1] = colors[j]
250
+ frame[obj_mask == 1] = (
251
+ (1 - alpha) * frame[obj_mask == 1]
252
+ + alpha * colored_obj_mask[obj_mask == 1]
253
+ )
254
+
255
+ # plt.figure(figsize=(12, 8))
256
+ # plt.imshow(frame)
257
+ # plt.title(f"frame {frame_name}")
258
+ # plt.tight_layout()
259
+ # plt.axis('off')
260
+ # plt.show()
261
+
262
+ buffer = BytesIO()
263
+ frame = Image.fromarray(frame)
264
+ frame.save(buffer, format='jpeg')
265
+ buffer.seek(0)
266
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
267
+ frame_cat_cnts[frame_name] = cat_cnt
268
+
269
+ buffer.seek(0) # Reuse buffer instead of creating a new one
270
+ buffer.truncate()
271
+ frame_for_contour = Image.fromarray(frame_for_contour)
272
+ frame_for_contour.save(buffer, format='jpeg')
273
+ buffer.seek(0)
274
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
275
+
276
+ encoded_frames[cat] = cat_frames
277
+ contoured_frames[cat] = contour_frames
278
+ vid_cat_cnts[cat] = frame_cat_cnts
279
+
280
+ return encoded_frames, contoured_frames, vid_cat_cnts
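+
+ # (Editorial note) Unlike number_objects_and_encode_old above, this version
+ # always draws contours plus a color-keyed ID box, and applies the translucent
+ # mask overlay on top only when color_mask is True, so the label can be
+ # slightly dimmed by the overlay; the (encoded, contoured, counts) return
+ # order is unchanged.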
281
+
282
+
283
+
284
+ def getCaption(idx, model='gpt-4o'):
285
+ vid_meta = metas[idx]
286
+ vid_data = train_dataset[idx]
287
+ vid_id = vid_meta['video']
288
+ print(f"vid id: {vid_id}\n")
289
+
290
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
291
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
292
+ all_captions = dict()
293
+
294
+ # color_mask = random.choice([True, False])
295
+ color_mask = random.choices([False, True], weights=[60, 40])[0]
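+ # (Editorial note) random.choices samples with the given weights, so roughly
+ # 60% of videos get contour-only labels and 40% also get the translucent
+ # color-mask overlay.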
296
+
297
+ base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
298
+ #marked = "mask with boundary" if color_mask else "boundary"
299
+
300
+ for cat_name in list(cat_names) :
301
+
302
+ is_movable = False
303
+ if cat_name in ytvos_category_valid_list :
304
+ is_movable = True
305
+
306
+ if not is_movable:
307
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
308
+
309
+
310
+ image_captions = {}
311
+ captioner = OpenAI()
312
+ cat_base64_frames = base64_frames[cat_name]
313
+ # cont_base64_frames = contoured_frames[cat_name]
314
+
315
+ for i in range(len(cat_base64_frames)):
316
+ frame_name = frame_indx[i]
317
+ # cont_base64_image = cont_base64_frames[i]
318
+ base64_image = cat_base64_frames[i]
319
+ should_filter = False
320
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
321
+
322
+ if frame_cat_cnts >= 2:
323
+ should_filter = True
324
+ else:
325
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
326
+
327
+
328
+ if is_movable and should_filter:
329
+ #1단계: 필터링
330
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
331
+ caption_filter_text = f"""
332
+ You are a visual assistant analyzing a single frame from a video.
333
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
334
+
335
+ Are the {cat_name}s in the image all performing different and recognizable actions or postures?
336
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
337
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
338
+
339
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
340
+
341
+ - Respond with "YES" if:
342
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
343
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
344
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
345
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
346
+
347
+ - Respond with "NONE" if:
348
+ 1) The actions or pose are not clearly differentiable or too similar.
349
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
350
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion.
351
+
352
+ Answer strictly with either "YES" or "NONE".
353
+ """
354
+
355
+ response1 = captioner.chat.completions.create(
356
+ model=model,
357
+ messages=[
358
+ {
359
+ "role": "user",
360
+ "content": [
361
+ {
362
+ "type": "text",
363
+ "text": caption_filter_text,
364
+ },
365
+ {
366
+ "type": "image_url",
367
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
368
+ }
369
+ ],
370
+ }
371
+ ],
372
+ )
373
+ response_content = response1.choices[0].message.content
374
+ should_caption = "yes" in response_content.lower()
375
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
376
+
377
+ else:
378
+ should_caption = False
379
+
380
+ # Step 2: generate dense captions
381
+ dense_caption_prompt_1 = f"""
382
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects is: {cat_name}.
383
+
384
+ Please describe the labeled {cat_name}s in the image in detail, focusing on their actions and interactions.
385
+
386
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
387
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
388
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
389
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
390
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
391
+ 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
392
+ - expressions like 'seems to be', 'appears to be' are BANNED!
393
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
394
+ 8. Include interactions with objects or other entities when they are prominent and observable.
395
+ 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
396
+ 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
397
+ 11. Do not mention object IDs.
398
+ 12. Use '{cat_name}' as the noun for the referring expressions.
399
+
400
+ Note that I want to use your description to create a grounding dataset; therefore, your descriptions for different objects should be unique, i.e., if the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
401
+
402
+ - Your answer should contain details, and use the following format:
403
+ object id. action-oriented description
404
+ (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
405
+ 2. a person bending over and touching his boots to tie the shoelace.)
406
+ - for the action-oriented description, use {cat_name} as the subject noun
407
+
408
+ **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as another object/animal).
409
+ Please pay attention to the categories of these objects and don’t change them.
410
+ Keep in mind that you should not group the objects (e.g., "2-5. people: xxx"); be sure to describe each object separately (one by one).
411
+ Output referring expressions for each object id. Please start your answer:"""
412
+
413
+
414
+ dense_caption_prompt_2 = f"""
415
+ You are an advanced visual language model analyzing a video frame.
416
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
417
+
418
+ Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
419
+ Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
420
+
421
+ ---
422
+ ## Key Guidelines:
423
+ 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
424
+ - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
425
+ - Avoid: "moving slightly to the side" (**(X) Too vague**)
426
+
427
+ 2. **Do not describe appearance, color, or position**—focus purely on the action.
428
+ - (X) "A large brown bear standing on the left"
429
+ - (O) "The bear is lifting its front paws and swiping forward."
430
+
431
+ 3. **Use dynamic, action-specific verbs** rather than passive descriptions.
432
+ - (O) "The giraffe is tilting its head and sniffing the ground."
433
+ - (X) "The giraffe is near a tree and looking around."
434
+
435
+ 4. **Avoid assumptions, emotions, or speculative phrasing.**
436
+ - (X) "The person seems excited" / "The person might be preparing to jump."
437
+ - (O) "The person is pushing its front legs against the rock and leaping forward."
438
+
439
+ 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
440
+ - expressions like 'seems to be', 'appears to be' are BANNED!
441
+ 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
442
+
443
+ 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
444
+ - **Each object should have a unique, descriptive action.**
445
+ - (X) "Two dogs are running."
446
+ - (O) "1. One dog is chasing another, its legs stretched mid-air.
447
+ 2. The other dog is looking back while speeding up."
448
+
449
+ ---
450
+ ## Output Format:
451
+ - Each labeled **{cat_name}** should have exactly **one line of description**.
452
+ - Format: `ID. {cat_name} + action-based description`
453
+ - (O) Example:
454
+ ```
455
+ 1. The person is leaning forward while opening a bag with both hands.
456
+ 2. The person is holding onto a rope and pulling themselves up.
457
+ ```
458
+ - **Ensure that each object is described individually.**
459
+ - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
460
+
461
+ ---
462
+ ## Additional Instructions:
463
+ - **Do NOT** use expressions like "it appears that..." or "it seems like...".
464
+ - **Do NOT** mention object IDs in the description (only use the provided format).
465
+ - **DO NOT** include markdown formatting (no bullet points, no asterisks).
466
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
467
+
468
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
469
+ """
470
+
471
+
472
+ dense_caption_prompt = f"""
473
+ You are a visual assistant analyzing a single frame of a video.
474
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
475
+
476
+ I am building an **action-centric referring expression** dataset.
477
+ Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
478
+
479
+ ---
480
+ ## Guidelines:
481
+ 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
482
+ 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
483
+ 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
484
+ 4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
485
+ 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
486
+ 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
487
+ 7. Base your descriptions on these principles:
488
+ - **Avoid words like 'minimal' or 'slightly'.**
489
+ - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
490
+ - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
491
+ - **Specify actions with other objects or entities** only when they are clear and observable.
492
+ - (O) "pushing another person"
493
+ - (X) "interacting with another object"
494
+
495
+ ---
496
+ ## Output Format:
497
+ - Each labeled **{cat_name}** must have **exactly one line**.
498
+ - Format: `ID. {cat_name} + action-based description`
499
+ - (O) Example:
500
+ ```
501
+ 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
502
+ 2. The person is pulling a baby carriage while smiling.
503
+ ```
504
+ - **Ensure each object is described individually.**
505
+ - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
506
+
507
+ ---
508
+ ## Example:
509
+ If the frame has two labeled **bears**, your output should be:
510
+ ```
511
+ 1. The bear is reaching out its right paw while leaning forward to catch prey.
512
+ 2. A bear is standing upright, facing right, and touching the bike beside it.
513
+ ```
514
+
515
+ ---
516
+ ## Additional Instructions:
517
+ - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
518
+ - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
519
+ - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
520
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
521
+
522
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
523
+
524
+
525
+ MAX_RETRIES = 3
526
+ retry_count = 0
527
+
528
+ if should_caption:
529
+ while retry_count < MAX_RETRIES:
530
+ selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
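+ # (Editorial note) only dense_caption_prompt and dense_caption_prompt_2 are in
+ # this pool; dense_caption_prompt_1 above is defined but never selected.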
531
+
532
+ response2 = captioner.chat.completions.create(
533
+ model=model,
534
+ messages=[
535
+ {
536
+ "role": "user",
537
+ "content": [
538
+ {
539
+ "type": "text",
540
+ "text": selected_prompt,
541
+ },
542
+ {
543
+ "type": "image_url",
544
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
545
+ },
546
+ ],
547
+ }
548
+ ],
549
+ )
550
+
551
+ # caption = response2.choices[0].message.content
552
+ #print(f"{image_path} - {frame_name}: {caption}")
553
+
554
+ caption = response2.choices[0].message.content.strip()
555
+ caption_lower = caption.lower().lstrip()
556
+
557
+ if caption_lower.startswith("1.") and not any(
558
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
559
+ ):
560
+ break
561
+
562
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
563
+ retry_count += 1
564
+ time.sleep(2)
565
+
566
+ if retry_count == MAX_RETRIES:
567
+ caption = None
568
+ print("Max retries reached. Caption generation failed.")
569
+
570
+ else:
571
+ caption = None
572
+
573
+ image_captions[frame_name] = caption
574
+ all_captions[cat_name] = image_captions
575
+
576
+ # final : also prepare valid object ids
577
+ valid_obj_ids = dict()
578
+
579
+ for cat in cat_names:
580
+ if cat in ytvos_category_valid_list:
581
+ obj_id_cat = vid_meta['obj_id_cat']
582
+ valid_cat_ids = []
583
+ for obj_id in list(obj_id_cat.keys()):
584
+ if obj_id_cat[obj_id] == cat:
585
+ valid_cat_ids.append(obj_id)
586
+ valid_obj_ids[cat] = valid_cat_ids
587
+
588
+ return vid_id, all_captions, valid_obj_ids
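+
+ # Equivalent sketch (editorial): the valid_obj_ids loop above is a plain
+ # nested filter and could be written as a dict comprehension:
+ # valid_obj_ids = {cat: [o for o, c in vid_meta['obj_id_cat'].items() if c == cat]
+ #                  for cat in cat_names if cat in ytvos_category_valid_list}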
589
+
590
+
591
+ if __name__ == '__main__':
592
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
593
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
594
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
595
+
596
+ args = parser.parse_args()
597
+
598
+ #================== Load data ===================
599
+ # Full dataset
600
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
601
+
602
+ # Full dataset metadata
603
+ metas = train_dataset.metas
604
+
605
+ # 8 color candidates (RGB format)
606
+ colors = [
607
+ (255, 0, 0), # Red
608
+ (0, 255, 0), # Green
609
+ (0, 0, 255), # Blue
610
+ (255, 255, 0), # Yellow
611
+ (255, 0, 255), # Magenta
612
+ (0, 255, 255), # Cyan
613
+ (128, 0, 128), # Purple
614
+ (255, 165, 0) # Orange
615
+ ]
616
+
617
+ ytvos_category_valid_list = [
618
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
619
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
620
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
621
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
622
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
623
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
624
+ ]
625
+
626
+ #================== Run GPT ===================
627
+ os.environ.setdefault('OPENAI_API_KEY', '<YOUR_OPENAI_API_KEY>') # placeholder; read the real key from the environment instead of committing it
628
+
629
+ result_captions = {}
630
+ result_valid_obj_ids = {}
631
+
632
+ for i in range(len(metas)):
633
+ try:
634
+ vid_id, all_captions, valid_obj_ids = getCaption(i)
635
+
636
+ if vid_id not in result_captions:
637
+ result_captions[vid_id] = all_captions
638
+ if vid_id not in result_valid_obj_ids:
639
+ result_valid_obj_ids[vid_id] = valid_obj_ids
640
+
641
+ except (requests.exceptions.ConnectionError, APIConnectionError) as e:
642
+ print(f"created caption until {i-1}", flush=True)
643
+
644
+ with open(args.save_caption_path, "w") as file:
645
+ json.dump(result_captions, file, indent=4)
646
+
647
+ with open(args.save_valid_obj_ids_path, "w") as file:
648
+ json.dump(result_valid_obj_ids, file, indent=4)
649
+
650
+ print("Finished!", flush=True)
651
+
652
+ with open(args.save_caption_path, "w") as file:
653
+ json.dump(result_captions, file, indent=4)
654
+
655
+ with open(args.save_valid_obj_ids_path, "w") as file:
656
+ json.dump(result_valid_obj_ids, file, indent=4)
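+
+ # Resume sketch (editorial): because the except branch above checkpoints
+ # partial results to disk, a rerun could skip already-captioned videos:
+ # done = set(json.load(open(args.save_caption_path)).keys())
+ # for i in range(len(metas)):
+ #     if metas[i]['video'] in done:
+ #         continue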
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207173210.py ADDED
@@ -0,0 +1,656 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+ import time
5
+
6
+ from os import path as osp
7
+ from io import BytesIO
8
+ import random
9
+
10
+ from mbench.ytvos_ref import build as build_ytvos_ref
11
+ import argparse
12
+ import opts
13
+
14
+ import sys
15
+ from pathlib import Path
16
+ import os
17
+ from os import path as osp
18
+ import skimage
19
+ from io import BytesIO
20
+
21
+ import numpy as np
22
+ import pandas as pd
23
+ import regex as re
24
+ import json
25
+
26
+ import cv2
27
+ from PIL import Image, ImageDraw
28
+ import torch
29
+ from torchvision.transforms import functional as F
30
+
31
+ from skimage import measure # (pip install scikit-image)
32
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
33
+
34
+ import matplotlib.pyplot as plt
35
+ import matplotlib.patches as patches
36
+ from matplotlib.collections import PatchCollection
37
+ from matplotlib.patches import Rectangle
38
+ import textwrap
39
+
40
+
41
+ import ipywidgets as widgets
42
+ from IPython.display import display, clear_output
43
+
44
+ from openai import OpenAI
45
+ import base64
46
+ import json
47
+ import requests
48
+ from openai import APIConnectionError, OpenAIError # openai>=1.0 exposes these at the top level; the openai.error module was removed
49
+
50
+ def number_objects_and_encode_old(idx, color_mask=False):
51
+ encoded_frames = {}
52
+ contoured_frames = {} # New dictionary for original images
53
+ vid_cat_cnts = {}
54
+
55
+ vid_meta = metas[idx]
56
+ vid_data = train_dataset[idx]
57
+ vid_id = vid_meta['video']
58
+ frame_indx = vid_meta['sample_indx']
59
+ cat_names = set(vid_meta['obj_id_cat'].values())
60
+ imgs = vid_data[0]
61
+
62
+ for cat in cat_names:
63
+ cat_frames = []
64
+ contour_frames = []
65
+ frame_cat_cnts = {}
66
+
67
+ for i in range(imgs.size(0)):
68
+ frame_name = frame_indx[i]
69
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
70
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
71
+
72
+ frame_data = vid_data[2][frame_name]
73
+ obj_ids = list(frame_data.keys())
74
+
75
+ cat_cnt = 0
76
+
77
+ for j in range(len(obj_ids)):
78
+ obj_id = obj_ids[j]
79
+ obj_data = frame_data[obj_id]
80
+ obj_bbox = obj_data['bbox']
81
+ obj_valid = obj_data['valid']
82
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
83
+ obj_cat = obj_data['category_name']
84
+
85
+ if obj_cat == cat and obj_valid:
86
+ cat_cnt += 1
87
+
88
+ if color_mask == False:
89
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
90
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
91
+ for i, contour in enumerate(contours):
92
+ moments = cv2.moments(contour)
93
+ if moments["m00"] != 0:
94
+ cx = int(moments["m10"] / moments["m00"])
95
+ cy = int(moments["m01"] / moments["m00"])
96
+ else:
97
+ cx, cy = contour[0][0]
98
+
99
+ font = cv2.FONT_HERSHEY_SIMPLEX
100
+ text = obj_id
101
+ text_size = cv2.getTextSize(text, font, 1, 2)[0]
102
+ text_w, text_h = text_size
103
+
104
+ cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
105
+ (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
106
+
107
+ cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
108
+ font, 1, (255, 255, 255), 2)
109
+
110
+ else:
111
+ alpha = 0.08
112
+
113
+ colored_obj_mask = np.zeros_like(frame)
114
+ colored_obj_mask[obj_mask == 1] = colors[j]
115
+ frame[obj_mask == 1] = (
116
+ (1 - alpha) * frame[obj_mask == 1]
117
+ + alpha * colored_obj_mask[obj_mask == 1]
118
+ )
119
+
120
+
121
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
122
+ cv2.drawContours(frame, contours, -1, colors[j], 2)
123
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
124
+
125
+ if len(contours) > 0:
126
+ largest_contour = max(contours, key=cv2.contourArea)
127
+ M = cv2.moments(largest_contour)
128
+ if M["m00"] != 0:
129
+ center_x = int(M["m10"] / M["m00"])
130
+ center_y = int(M["m01"] / M["m00"])
131
+ else:
132
+ center_x, center_y = 0, 0
133
+
134
+ font = cv2.FONT_HERSHEY_SIMPLEX
135
+ text = obj_id
136
+
137
+ font_scale = 0.9
138
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
139
+ text_x = center_x - text_size[0] // 1
140
+ text_y = center_y
141
+
142
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
143
+ rect_end = (text_x + text_size[0] + 5, text_y)
144
+
145
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
146
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
147
+
148
+ # plt.figure(figsize=(12, 8))
149
+ # plt.imshow(frame)
150
+ # plt.title(f"frame {frame_name}")
151
+ # plt.tight_layout()
152
+ # plt.axis('off')
153
+ # plt.show()
154
+
155
+ buffer = BytesIO()
156
+ frame = Image.fromarray(frame)
157
+ frame.save(buffer, format='jpeg')
158
+ buffer.seek(0)
159
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
160
+ frame_cat_cnts[frame_name] = cat_cnt
161
+
162
+ buffer.seek(0) # Reuse buffer instead of creating a new one
163
+ buffer.truncate()
164
+ frame_for_contour = Image.fromarray(frame_for_contour)
165
+ frame_for_contour.save(buffer, format='jpeg')
166
+ buffer.seek(0)
167
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
168
+
169
+ encoded_frames[cat] = cat_frames
170
+ contoured_frames[cat] = contour_frames
171
+ vid_cat_cnts[cat] = frame_cat_cnts
172
+
173
+ return encoded_frames, contoured_frames, vid_cat_cnts
174
+
175
+
176
+ def number_objects_and_encode(idx, color_mask=False):
177
+ encoded_frames = {}
178
+ contoured_frames = {} # New dictionary for original images
179
+ vid_cat_cnts = {}
180
+
181
+ vid_meta = metas[idx]
182
+ vid_data = train_dataset[idx]
183
+ vid_id = vid_meta['video']
184
+ frame_indx = vid_meta['sample_indx']
185
+ cat_names = set(vid_meta['obj_id_cat'].values())
186
+ imgs = vid_data[0]
187
+
188
+ for cat in cat_names:
189
+ cat_frames = []
190
+ contour_frames = []
191
+ frame_cat_cnts = {}
192
+
193
+ for i in range(imgs.size(0)):
194
+ frame_name = frame_indx[i]
195
+ frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
196
+ frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
197
+
198
+ frame_data = vid_data[2][frame_name]
199
+ obj_ids = list(frame_data.keys())
200
+
201
+ cat_cnt = 0
202
+
203
+ for j in range(len(obj_ids)):
204
+ obj_id = obj_ids[j]
205
+ obj_data = frame_data[obj_id]
206
+ obj_bbox = obj_data['bbox']
207
+ obj_valid = obj_data['valid']
208
+ obj_mask = obj_data['mask'].numpy().astype(np.uint8)
209
+ obj_cat = obj_data['category_name']
210
+
211
+ if obj_cat == cat and obj_valid:
212
+ cat_cnt += 1
213
+
214
+ contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
215
+ cv2.drawContours(frame, contours, -1, colors[j], 3)
216
+ cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
217
+
218
+ if len(contours) > 0:
219
+ largest_contour = max(contours, key=cv2.contourArea)
220
+ M = cv2.moments(largest_contour)
221
+ if M["m00"] != 0:
222
+ center_x = int(M["m10"] / M["m00"])
223
+ center_y = int(M["m01"] / M["m00"])
224
+ else:
225
+ center_x, center_y = 0, 0
226
+
227
+ font = cv2.FONT_HERSHEY_SIMPLEX
228
+ text = obj_id
229
+ font_scale = 1.2
230
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
231
+ text_x = center_x - text_size[0] // 1
232
+ text_y = center_y
233
+
234
+ rect_start = (text_x - 5, text_y - text_size[1] - 5)
235
+ rect_end = (text_x + text_size[0] + 5, text_y + 3)
236
+
237
+ contour_thickness = 1
238
+ rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
239
+ rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
240
+
241
+ cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
242
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
243
+ cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
244
+
245
+
246
+ if color_mask:
247
+ alpha = 0.08
248
+ colored_obj_mask = np.zeros_like(frame)
249
+ colored_obj_mask[obj_mask == 1] = colors[j]
250
+ frame[obj_mask == 1] = (
251
+ (1 - alpha) * frame[obj_mask == 1]
252
+ + alpha * colored_obj_mask[obj_mask == 1]
253
+ )
254
+
255
+ # plt.figure(figsize=(12, 8))
256
+ # plt.imshow(frame)
257
+ # plt.title(f"frame {frame_name}")
258
+ # plt.tight_layout()
259
+ # plt.axis('off')
260
+ # plt.show()
261
+
262
+ buffer = BytesIO()
263
+ frame = Image.fromarray(frame)
264
+ frame.save(buffer, format='jpeg')
265
+ buffer.seek(0)
266
+ cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
267
+ frame_cat_cnts[frame_name] = cat_cnt
268
+
269
+ buffer.seek(0) # Reuse buffer instead of creating a new one
270
+ buffer.truncate()
271
+ frame_for_contour = Image.fromarray(frame_for_contour)
272
+ frame_for_contour.save(buffer, format='jpeg')
273
+ buffer.seek(0)
274
+ contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
275
+
276
+ encoded_frames[cat] = cat_frames
277
+ contoured_frames[cat] = contour_frames
278
+ vid_cat_cnts[cat] = frame_cat_cnts
279
+
280
+ return encoded_frames, contoured_frames, vid_cat_cnts
281
+
282
+
283
+
284
+ def getCaption(idx, model='gpt-4o'):
285
+ vid_meta = metas[idx]
286
+ vid_data = train_dataset[idx]
287
+ vid_id = vid_meta['video']
288
+ print(f"vid id: {vid_id}\n")
289
+
290
+ frame_indx = vid_meta['sample_indx'] # e.g. [4, 7, 9, 16]
291
+ cat_names = set(vid_meta['obj_id_cat'].values()) # e.g. {"person", "elephant", ...}
292
+ all_captions = dict()
293
+
294
+ # color_mask = random.choice([True, False])
295
+ color_mask = random.choices([False, True], weights=[60, 40])[0]
296
+
297
+ base64_frames, _ , vid_cat_cnts = number_objects_and_encode(idx, color_mask)
298
+ #marked = "mask with boundary" if color_mask else "boundary"
299
+
300
+ for cat_name in list(cat_names) :
301
+
302
+ is_movable = False
303
+ if cat_name in ytvos_category_valid_list :
304
+ is_movable = True
305
+
306
+ if not is_movable:
307
+ print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
308
+
309
+
310
+ image_captions = {}
311
+ captioner = OpenAI()
312
+ cat_base64_frames = base64_frames[cat_name]
313
+ # cont_base64_frames = contoured_frames[cat_name]
314
+
315
+ for i in range(len(cat_base64_frames)):
316
+ frame_name = frame_indx[i]
317
+ # cont_base64_image = cont_base64_frames[i]
318
+ base64_image = cat_base64_frames[i]
319
+ should_filter = False
320
+ frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
321
+
322
+ if frame_cat_cnts >= 2:
323
+ should_filter = True
324
+ else:
325
+ print(f"Skipping {cat_name}: There is single or no object.", end='\n\n')
326
+
327
+
328
+ if is_movable and should_filter:
329
+ #1단계: 필터링
330
+ print(f"-----------category name: {cat_name}, frame name: {frame_name}")
331
+ caption_filter_text = f"""
332
+ You are a visual assistant analyzing a single frame from a video.
333
+ In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
334
+
335
+ Are {cat_name}s in the image performing all different and recognizable actions or postures?
336
+ Consider differences in body pose (standing, sitting, holding hands up, grabbing object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
337
+ facial expressions, and any notable interactions with objects or other {cat_name}s or people.
338
+
339
+ Only focus on obvious, prominent actions that can be reliably identified from this single frame.
340
+
341
+ - Respond with "YES" if:
342
+ 1) Most of {cat_name}s exhibit clearly different, unique actions or poses.
343
+ (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
344
+ 2) You can see visible significant differences in action and posture, that an observer can identify at a glance.
345
+ 3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
346
+
347
+ - Respond with "NONE" if:
348
+ 1) The actions or pose are not clearly differentiable or too similar.
349
+ 2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
350
+ 3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion
351
+
352
+ Answer strictly with either "YES" or "NONE".
353
+ """
354
+
355
+ response1 = captioner.chat.completions.create(
356
+ model=model,
357
+ messages=[
358
+ {
359
+ "role": "user",
360
+ "content": [
361
+ {
362
+ "type": "text",
363
+ "text": caption_filter_text,
364
+ },
365
+ {
366
+ "type": "image_url",
367
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
368
+ }
369
+ ],
370
+ }
371
+ ],
372
+ )
373
+ response_content = response1.choices[0].message.content
374
+ should_caption = True if "yes" in response_content.lower() else False
375
+ print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
376
+
377
+ else:
378
+ should_caption = False
379
+
380
+ #2단계: dense caption 만들기
381
+ dense_caption_prompt_1 = f"""
382
+ In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects are : {cat_name}.
383
+
384
+ Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions.
385
+
386
+ 1. Focus only on clear, unique, and prominent actions that distinguish each object.
387
+ 2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
388
+ 3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
389
+ 4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
390
+ 5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
391
+ 6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
392
+ - expressions like 'seems to be', 'appears to be' are BANNED!
393
+ 7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
394
+ 8. Include interactions with objects or other entities when they are prominent and observable.
395
+ 9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
396
+ 10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
397
+ 11. Do not mention object IDs.
398
+ 12. Use '{cat_name}' as the noun for the referring expressions.
399
+
400
+ Note that I want to use your description to create a grounding dataset, therefore, your descriptions for different objects should be unique, i.e., If the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
401
+
402
+ - Your answer should contain details, and follow the following format:
403
+ object id. action-oriented description
404
+ (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
405
+ 2. a person bending over and touching his boots to tie the shoelace.)
406
+ - for action-oriented description, use {cat_name} as subject noun
407
+
408
+ **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as other object/animal).
409
+ Please pay attention to the categories of these objects and don’t change them.
410
+ Keep in mind that you should not group the objects, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
411
+ Output referring expressions for each object id. Please start your answer:"""
412
+
413
+
414
+ dense_caption_prompt_2 = f"""
415
+ You are an advanced visual language model analyzing a video frame.
416
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
417
+
418
+ Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
419
+ Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
420
+
421
+ ---
422
+ ## Key Guidelines:
423
+ 1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
424
+ - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
425
+ - Avoid: "moving slightly to the side" (**(X) Too vague**)
426
+
427
+ 2. **Do not describe appearance, color, or position**—focus purely on the action.
428
+ - (X) "A large brown bear standing on the left"
429
+ - (O) "The bear is lifting its front paws and swiping forward."
430
+
431
+ 3. **Use dynamic, action-specific verbs** rather than passive descriptions.
432
+ - (O) "The giraffe is tilting its head and sniffing the ground."
433
+ - (X) "The giraffe is near a tree and looking around."
434
+
435
+ 4. **Avoid assumptions, emotions, or speculative phrasing.**
436
+ - (X) "The person seems excited" / "The person might be preparing to jump."
437
+ - (O) "The person is pushing its front legs against the rock and leaping forward."
438
+
439
+ 5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
440
+ - expressions like 'seems to be', 'appears to be' are BANNED!
441
+ 6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
442
+
443
+ 7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
444
+ - **Each object should have a unique, descriptive action.**
445
+ - (X) "Two dogs are running."
446
+ - (O) "1. One dog is chasing another, its legs stretched mid-air.
447
+ 2. The other dog is looking back while speeding up."
448
+
449
+ ---
450
+ ## Output Format:
451
+ - Each labeled **{cat_name}** should have exactly **one line of description**.
452
+ - Format: `ID. {cat_name} + action-based description`
453
+ - (O) Example:
454
+ ```
455
+ 1. The person is leaning forward while opening a bag with both hands.
456
+ 2. The person is holding onto a rope and pulling themselves up.
457
+ ```
458
+ - **Ensure that each object is described individually.**
459
+ - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
460
+
461
+ ---
462
+ ## Additional Instructions:
463
+ - **Do NOT** use expressions like "it appears that..." or "it seems like...".
464
+ - **Do NOT** mention object IDs in the description (only use the provided format).
465
+ - **DO NOT** include markdown formatting (no bullet points, no asterisks).
466
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
467
+
468
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
469
+ """
470
+
471
+
472
+ dense_caption_prompt = f"""
473
+ You are a visual assistant analyzing a single frame of a video.
474
+ In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
475
+
476
+ I am building an **action-centric referring expression** dataset.
477
+ Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
478
+
479
+ ---
480
+ ## Guidelines:
481
+ 1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
482
+ 2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
483
+ 3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
484
+ 4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
485
+ 5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
486
+ 6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
487
+ 7. Base your descriptions on these principles:
488
+ - **Avoid words like 'minimal' or 'slightly'.**
489
+ - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
490
+ - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
491
+ - **Specify actions with other objects or entities** only when they are clear and observable.
492
+ - (O) "pushing another person"
493
+ - (X) "interacting with another object"
494
+
495
+ ---
496
+ ## Output Format:
497
+ - Each labeled **{cat_name}** must have **exactly one line**.
498
+ - Format: `ID. {cat_name} + action-based description`
499
+ - (O) Example:
500
+ ```
501
+ 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
502
+ 2. The person is pulling a baby carriage while smiling.
503
+ ```
504
+ - **Ensure each object is described individually.**
505
+ - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
506
+
507
+ ---
508
+ ## Example:
509
+ If the frame has two labeled **bears**, your output should be:
510
+ ```
511
+ 1. The bear is reaching out its right paw while leaning forward to catch prey.
512
+ 2. A bear is standing upright, facing right, and touching the bike beside it.
513
+ ```
514
+
515
+ ---
516
+ ## Additional Instructions:
517
+ - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
518
+ - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
519
+ - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
520
+ - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
521
+
522
+ Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
523
+
524
+
525
+ MAX_RETRIES = 3
526
+ retry_count = 0
527
+
528
+ if should_caption:
529
+ while retry_count < MAX_RETRIES:
530
+ selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
531
+
532
+ response2 = captioner.chat.completions.create(
533
+ model=model,
534
+ messages=[
535
+ {
536
+ "role": "user",
537
+ "content": [
538
+ {
539
+ "type": "text",
540
+ "text": selected_prompt,
541
+ },
542
+ {
543
+ "type": "image_url",
544
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
545
+ },
546
+ ],
547
+ }
548
+ ],
549
+ )
550
+
551
+ # caption = response2.choices[0].message.content
552
+ #print(f"{image_path} - {frame_name}: {caption}")
553
+
554
+ caption = response2.choices[0].message.content.strip()
555
+ caption_lower = caption.lower().lstrip()
556
+
557
+ if caption_lower.startswith("1.") and not any(
558
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
559
+ ):
560
+ break
561
+
562
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
563
+ retry_count += 1
564
+ time.sleep(2)
565
+
566
+ if retry_count == MAX_RETRIES:
567
+ caption = None
568
+ print("Max retries reached. Caption generation failed.")
569
+
570
+ else:
571
+ caption = None
572
+
573
+ image_captions[frame_name] = caption
574
+ all_captions[cat_name] = image_captions
575
+
576
+ # final : also prepare valid object ids
577
+ valid_obj_ids = dict()
578
+
579
+ for cat in cat_names:
580
+ if cat in ytvos_category_valid_list:
581
+ obj_id_cat = vid_meta['obj_id_cat']
582
+ valid_cat_ids = []
583
+ for obj_id in list(obj_id_cat.keys()):
584
+ if obj_id_cat[obj_id] == cat:
585
+ valid_cat_ids.append(obj_id)
586
+ valid_obj_ids[cat] = valid_cat_ids
587
+
588
+ return vid_id, all_captions, valid_obj_ids
589
+
590
+
591
+ if __name__ == '__main__':
592
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
593
+ parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
594
+ parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
595
+
596
+ args = parser.parse_args()
597
+
598
+ #==================데이터 불러오기===================
599
+ # 전체 데이터셋
600
+ train_dataset = build_ytvos_ref(image_set = 'train', args = args)
601
+
602
+ # 전체 데이터셋 메타데이터
603
+ metas = train_dataset.metas
604
+
605
+ # 색상 후보 8개 (RGB 형식)
606
+ colors = [
607
+ (255, 0, 0), # Red
608
+ (0, 255, 0), # Green
609
+ (0, 0, 255), # Blue
610
+ (255, 255, 0), # Yellow
611
+ (255, 0, 255), # Magenta
612
+ (0, 255, 255), # Cyan
613
+ (128, 0, 128), # Purple
614
+ (255, 165, 0) # Orange
615
+ ]
616
+
617
+ ytvos_category_valid_list = [
618
+ 'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
619
+ 'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
620
+ 'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
621
+ 'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
622
+ 'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
623
+ 'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
624
+ ]
625
+
626
+ #==================gpt 돌리기===================
627
+ os.environ['OPENAI_API_KEY'] = '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'
628
+
629
+ result_captions = {}
630
+ result_valid_obj_ids = {}
631
+
632
+ for i in range(len(metas)):
633
+ try:
634
+ vid_id, all_captions, valid_obj_ids = getCaption(i)
635
+
636
+ if vid_id not in result_captions:
637
+ result_captions[vid_id] = all_captions
638
+ if vid_id not in result_valid_obj_ids:
639
+ result_valid_obj_ids[vid_id] = valid_obj_ids
640
+
641
+ except (requests.exceptions.ConnectionError, APIConnectionError, OpenAIError) as e:
642
+ print(f"created caption until {i-1}", flush=True)
643
+
644
+ with open(args.save_caption_path, "w") as file:
645
+ json.dump(result_captions, file, indent=4)
646
+
647
+ with open(args.save_valid_obj_ids_path, "w") as file:
648
+ json.dump(result_valid_obj_ids, file, indent=4)
649
+
650
+ print("Finished!", flush=True)
651
+
652
+ with open(args.save_caption_path, "w") as file:
653
+ json.dump(result_captions, file, indent=4)
654
+
655
+ with open(args.save_valid_obj_ids_path, "w") as file:
656
+ json.dump(result_valid_obj_ids, file, indent=4)
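
The retry loop in `getCaption` accepts a model reply only if it starts with `1.` and contains no refusal phrase. That acceptance test could be isolated as a small predicate, sketched here with an illustrative name (`looks_like_numbered_list` is not part of the script):

    REFUSAL_PHRASES = ("i'm sorry", "please", "can't help")

    def looks_like_numbered_list(reply: str) -> bool:
        # Accept only answers in the expected "1. ..." format that are not refusals.
        text = reply.strip().lower()
        return text.startswith("1.") and not any(p in text for p in REFUSAL_PHRASES)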
.history/mbench/gpt_ref-ytvos_numbered_cy_sanity_2_20250207173355.py ADDED
@@ -0,0 +1,677 @@
+ import os
+ import sys
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+ import time
+
+ from os import path as osp
+ from io import BytesIO
+ from pathlib import Path
+ import random
+
+ from mbench.ytvos_ref import build as build_ytvos_ref
+ import argparse
+ import opts
+
+ import skimage
+ import numpy as np
+ import pandas as pd
+ import regex as re
+ import json
+
+ import cv2
+ from PIL import Image, ImageDraw
+ import torch
+ from torchvision.transforms import functional as F
+
+ from skimage import measure  # (pip install scikit-image)
+ from shapely.geometry import Polygon, MultiPolygon  # (pip install Shapely)
+
+ import matplotlib.pyplot as plt
+ import matplotlib.patches as patches
+ from matplotlib.collections import PatchCollection
+ from matplotlib.patches import Rectangle
+ import textwrap
+
+ import ipywidgets as widgets
+ from IPython.display import display, clear_output
+
+ from openai import OpenAI
+ import base64
+ import requests
+ # The v1 openai SDK (which provides the OpenAI client used below) exposes its
+ # error classes at the package root; `openai.error` only existed in pre-1.0 SDKs.
+ from openai import APIConnectionError, OpenAIError
+
+
+ def number_objects_and_encode_old(idx, color_mask=False):
+     encoded_frames = {}
+     contoured_frames = {}  # New dictionary for original images
+     vid_cat_cnts = {}
+
+     vid_meta = metas[idx]
+     vid_data = train_dataset[idx]
+     vid_id = vid_meta['video']
+     frame_indx = vid_meta['sample_indx']
+     cat_names = set(vid_meta['obj_id_cat'].values())
+     imgs = vid_data[0]
+
+     for cat in cat_names:
+         cat_frames = []
+         contour_frames = []
+         frame_cat_cnts = {}
+
+         for i in range(imgs.size(0)):
+             frame_name = frame_indx[i]
+             frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+             frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+
+             frame_data = vid_data[2][frame_name]
+             obj_ids = list(frame_data.keys())
+
+             cat_cnt = 0
+
+             for j in range(len(obj_ids)):
+                 obj_id = obj_ids[j]
+                 obj_data = frame_data[obj_id]
+                 obj_bbox = obj_data['bbox']
+                 obj_valid = obj_data['valid']
+                 obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                 obj_cat = obj_data['category_name']
+
+                 if obj_cat == cat and obj_valid:
+                     cat_cnt += 1
+
+                     if not color_mask:
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 3)
+                         for contour in contours:  # was `for i, contour in enumerate(...)`, which shadowed the frame index `i`
+                             moments = cv2.moments(contour)
+                             if moments["m00"] != 0:
+                                 cx = int(moments["m10"] / moments["m00"])
+                                 cy = int(moments["m01"] / moments["m00"])
+                             else:
+                                 cx, cy = contour[0][0]
+
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+                             text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                             text_w, text_h = text_size
+
+                             cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                           (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+
+                             cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                         font, 1, (255, 255, 255), 2)
+                     else:
+                         alpha = 0.08
+
+                         colored_obj_mask = np.zeros_like(frame)
+                         colored_obj_mask[obj_mask == 1] = colors[j]
+                         frame[obj_mask == 1] = (
+                             (1 - alpha) * frame[obj_mask == 1]
+                             + alpha * colored_obj_mask[obj_mask == 1]
+                         )
+
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 2)
+                         cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+
+                         if len(contours) > 0:
+                             largest_contour = max(contours, key=cv2.contourArea)
+                             M = cv2.moments(largest_contour)
+                             if M["m00"] != 0:
+                                 center_x = int(M["m10"] / M["m00"])
+                                 center_y = int(M["m01"] / M["m00"])
+                             else:
+                                 center_x, center_y = 0, 0
+
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+
+                             font_scale = 0.9
+                             text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                             text_x = center_x - text_size[0] // 2  # was `// 1` (a no-op); `// 2` centers the label
+                             text_y = center_y
+
+                             rect_start = (text_x - 5, text_y - text_size[1] - 5)
+                             rect_end = (text_x + text_size[0] + 5, text_y)
+
+                             cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                             cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)  # draw at font_scale for consistency with the measured size (was hardcoded 1)
+
+             # plt.figure(figsize=(12, 8))
+             # plt.imshow(frame)
+             # plt.title(f"frame {frame_name}")
+             # plt.tight_layout()
+             # plt.axis('off')
+             # plt.show()
+
+             buffer = BytesIO()
+             frame = Image.fromarray(frame)
+             frame.save(buffer, format='jpeg')
+             buffer.seek(0)
+             cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+             frame_cat_cnts[frame_name] = cat_cnt
+
+             buffer.seek(0)  # Reuse buffer instead of creating a new one
+             buffer.truncate()
+             frame_for_contour = Image.fromarray(frame_for_contour)
+             frame_for_contour.save(buffer, format='jpeg')
+             buffer.seek(0)
+             contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+
+         encoded_frames[cat] = cat_frames
+         contoured_frames[cat] = contour_frames
+         vid_cat_cnts[cat] = frame_cat_cnts
+
+     return encoded_frames, contoured_frames, vid_cat_cnts
+
+
+ def number_objects_and_encode(idx, color_mask=False):
+     encoded_frames = {}
+     contoured_frames = {}  # New dictionary for original images
+     vid_cat_cnts = {}
+
+     vid_meta = metas[idx]
+     vid_data = train_dataset[idx]
+     vid_id = vid_meta['video']
+     frame_indx = vid_meta['sample_indx']
+     cat_names = set(vid_meta['obj_id_cat'].values())
+     imgs = vid_data[0]
+
+     for cat in cat_names:
+         cat_frames = []
+         contour_frames = []
+         frame_cat_cnts = {}
+
+         for i in range(imgs.size(0)):
+             frame_name = frame_indx[i]
+             frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
+             frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())
+
+             frame_data = vid_data[2][frame_name]
+             obj_ids = list(frame_data.keys())
+
+             cat_cnt = 0
+
+             for j in range(len(obj_ids)):
+                 obj_id = obj_ids[j]
+                 obj_data = frame_data[obj_id]
+                 obj_bbox = obj_data['bbox']
+                 obj_valid = obj_data['valid']
+                 obj_mask = obj_data['mask'].numpy().astype(np.uint8)
+                 obj_cat = obj_data['category_name']
+
+                 if obj_cat == cat and obj_valid:
+                     cat_cnt += 1
+
+                     contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                     cv2.drawContours(frame, contours, -1, colors[j], 3)
+                     cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+
+                     if len(contours) > 0:
+                         largest_contour = max(contours, key=cv2.contourArea)
+                         M = cv2.moments(largest_contour)
+                         if M["m00"] != 0:
+                             center_x = int(M["m10"] / M["m00"])
+                             center_y = int(M["m01"] / M["m00"])
+                         else:
+                             center_x, center_y = 0, 0
+
+                         font = cv2.FONT_HERSHEY_SIMPLEX
+                         text = obj_id
+                         font_scale = 1.2
+                         text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                         text_x = center_x - text_size[0] // 2  # was `// 1` (a no-op); `// 2` centers the label
+                         text_y = center_y
+
+                         rect_start = (text_x - 5, text_y - text_size[1] - 5)
+                         rect_end = (text_x + text_size[0] + 5, text_y + 3)
+
+                         contour_thickness = 1
+                         rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
+                         rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)
+
+                         cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
+                         cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                         cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)  # draw at font_scale for consistency with the measured size (was hardcoded 1)
+
+                     if color_mask:
+                         alpha = 0.08
+                         colored_obj_mask = np.zeros_like(frame)
+                         colored_obj_mask[obj_mask == 1] = colors[j]
+                         frame[obj_mask == 1] = (
+                             (1 - alpha) * frame[obj_mask == 1]
+                             + alpha * colored_obj_mask[obj_mask == 1]
+                         )
+
+             # plt.figure(figsize=(12, 8))
+             # plt.imshow(frame)
+             # plt.title(f"frame {frame_name}")
+             # plt.tight_layout()
+             # plt.axis('off')
+             # plt.show()
+
+             buffer = BytesIO()
+             frame = Image.fromarray(frame)
+             frame.save(buffer, format='jpeg')
+             buffer.seek(0)
+             cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+             frame_cat_cnts[frame_name] = cat_cnt
+
+             buffer.seek(0)  # Reuse buffer instead of creating a new one
+             buffer.truncate()
+             frame_for_contour = Image.fromarray(frame_for_contour)
+             frame_for_contour.save(buffer, format='jpeg')
+             buffer.seek(0)
+             contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
+
+         encoded_frames[cat] = cat_frames
+         contoured_frames[cat] = contour_frames
+         vid_cat_cnts[cat] = frame_cat_cnts
+
+     return encoded_frames, contoured_frames, vid_cat_cnts
+
+
+ def getCaption(idx, model='gpt-4o'):
+     vid_meta = metas[idx]
+     vid_data = train_dataset[idx]
+     vid_id = vid_meta['video']
+     print(f"vid id: {vid_id}\n")
+
+     frame_indx = vid_meta['sample_indx']  # e.g. [4, 7, 9, 16]
+     cat_names = set(vid_meta['obj_id_cat'].values())  # e.g. {"person", "elephant", ...}
+     all_captions = dict()
+
+     # color_mask = random.choice([True, False])
+     color_mask = random.choices([False, True], weights=[60, 40])[0]
+
+     base64_frames, _, vid_cat_cnts = number_objects_and_encode(idx, color_mask)
+     # marked = "mask with boundary" if color_mask else "boundary"
+
+     for cat_name in list(cat_names):
+
+         is_movable = False
+         if cat_name in ytvos_category_valid_list:
+             is_movable = True
+
+         if not is_movable:
+             print(f"Skipping {cat_name}: Determined to be non-movable.", end='\n\n')
+
+         image_captions = {}
+         captioner = OpenAI()
+         cat_base64_frames = base64_frames[cat_name]
+         # cont_base64_frames = contoured_frames[cat_name]
+
+         for i in range(len(cat_base64_frames)):
+             frame_name = frame_indx[i]
+             # cont_base64_image = cont_base64_frames[i]
+             base64_image = cat_base64_frames[i]
+             should_filter = False
+             frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]
+
+             if frame_cat_cnts >= 2:
+                 should_filter = True
+             else:
+                 print(f"Skipping {cat_name}: There is a single object or none.", end='\n\n')
+
+             if is_movable and should_filter:
+                 # Step 1: filter for frames where the objects are actually distinguishable by action
+                 print(f"-----------category name: {cat_name}, frame name: {frame_name}")
+                 caption_filter_text = f"""
+                 You are a visual assistant analyzing a single frame from a video.
+                 In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.
+
+                 Are the {cat_name}s in the image performing all different and recognizable actions or postures?
+                 Consider differences in body pose (standing, sitting, holding hands up, grabbing an object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
+                 facial expressions, and any notable interactions with objects or other {cat_name}s or people.
+
+                 Only focus on obvious, prominent actions that can be reliably identified from this single frame.
+
+                 - Respond with "YES" if:
+                     1) Most of the {cat_name}s exhibit clearly different, unique actions or poses.
+                     (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
+                     2) You can see visible, significant differences in action and posture that an observer can identify at a glance.
+                     3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.
+
+                 - Respond with "NONE" if:
+                     1) The actions or poses are not clearly differentiable or are too similar.
+                     2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
+                     3) Passive or Neutral Poses: If multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion.
+
+                 Answer strictly with either "YES" or "NONE".
+                 """
+
+                 response1 = captioner.chat.completions.create(
+                     model=model,
+                     messages=[
+                         {
+                             "role": "user",
+                             "content": [
+                                 {
+                                     "type": "text",
+                                     "text": caption_filter_text,
+                                 },
+                                 {
+                                     "type": "image_url",
+                                     "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                 }
+                             ],
+                         }
+                     ],
+                 )
+                 response_content = response1.choices[0].message.content
+                 should_caption = "yes" in response_content.lower()
+                 print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')
+
+             else:
+                 should_caption = False
+
+             # Step 2: build the dense captions
+             dense_caption_prompt_1 = f"""
+             In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects is: {cat_name}.
+
+             Please describe the image focusing on the labeled {cat_name}s in detail, focusing on their actions and interactions.
+
+             1. Focus only on clear, unique, and prominent actions that distinguish each object.
+             2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
+             3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
+             4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
+             5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
+             6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                 - expressions like 'seems to be', 'appears to be' are BANNED!
+             7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+             8. Include interactions with objects or other entities when they are prominent and observable.
+             9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
+             10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
+             11. Do not mention object IDs.
+             12. Use '{cat_name}' as the noun for the referring expressions.
+
+             Note that I want to use your description to create a grounding dataset; therefore, your descriptions for different objects should be unique, i.e., if the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.
+
+             - Your answer should contain details, and follow the following format:
+                 object id. action-oriented description
+                 (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
+                 2. a person bending over and touching his boots to tie the shoelace.)
+             - for the action-oriented description, use {cat_name} as the subject noun
+
+             **Only include the currently labeled category** in each line (e.g., if it’s a person, do not suddenly label it as another object/animal).
+             Please pay attention to the categories of these objects and don’t change them.
+             Keep in mind that you should not group the objects, e.g., 2-5. people: xxx; be sure to describe each object separately (one by one).
+             Output referring expressions for each object id. Please start your answer:"""
+
+             dense_caption_prompt_2 = f"""
+             You are an advanced visual language model analyzing a video frame.
+             In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.
+
+             Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
+             Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.
+
+             ---
+             ## Key Guidelines:
+             1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
+                 - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
+                 - Avoid: "moving slightly to the side" (**(X) Too vague**)
+
+             2. **Do not describe appearance, color, or position**—focus purely on the action.
+                 - (X) "A large brown bear standing on the left"
+                 - (O) "The bear is lifting its front paws and swiping forward."
+
+             3. **Use dynamic, action-specific verbs** rather than passive descriptions.
+                 - (O) "The giraffe is tilting its head and sniffing the ground."
+                 - (X) "The giraffe is near a tree and looking around."
+
+             4. **Avoid assumptions, emotions, or speculative phrasing.**
+                 - (X) "The person seems excited" / "The person might be preparing to jump."
+                 - (O) "The person is pushing its front legs against the rock and leaping forward."
+
+             5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
+                 - expressions like 'seems to be', 'appears to be' are BANNED!
+             6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
+
+             7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
+                 - **Each object should have a unique, descriptive action.**
+                 - (X) "Two dogs are running."
+                 - (O) "1. One dog is chasing another, its legs stretched mid-air.
+                     2. The other dog is looking back while speeding up."
+
+             ---
+             ## Output Format:
+             - Each labeled **{cat_name}** should have exactly **one line of description**.
+             - Format: `ID. {cat_name} + action-based description`
+             - (O) Example:
+                 ```
+                 1. The person is leaning forward while opening a bag with both hands.
+                 2. The person is holding onto a rope and pulling themselves up.
+                 ```
+             - **Ensure that each object is described individually.**
+             - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
+
+             ---
+             ## Additional Instructions:
+             - **Do NOT** use expressions like "it appears that..." or "it seems like...".
+             - **Do NOT** mention object IDs in the description (only use the provided format).
+             - **DO NOT** include markdown formatting (no bullet points, no asterisks).
+             - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
+
+             Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
+             """
+
+             dense_caption_prompt = f"""
+             You are a visual assistant analyzing a single frame of a video.
+             In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.
+
+             I am building an **action-centric referring expression** dataset.
+             Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.
+
+             ---
+             ## Guidelines:
+             1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
+             2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
+             3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
+             4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
+             5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
+             6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
+             7. Base your descriptions on these principles:
+                 - **Avoid words like 'minimal' or 'slightly'.**
+                 - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
+                 - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
+                 - **Specify actions with other objects or entities** only when they are clear and observable.
+                     - (O) "pushing another person"
+                     - (X) "interacting with another object"
+
+             ---
+             ## Output Format:
+             - Each labeled **{cat_name}** must have **exactly one line**.
+             - Format: `ID. {cat_name} + action-based description`
+             - (O) Example:
+                 ```
+                 1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
+                 2. The person is pulling a baby carriage while smiling.
+                 ```
+             - **Ensure each object is described individually.**
+             - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).
+
+             ---
+             ## Example:
+             If the frame has two labeled **bears**, your output should be:
+                 ```
+                 1. The bear is reaching out its right paw while leaning forward to catch prey.
+                 2. A bear is standing upright, facing right, and touching the bike beside it.
+                 ```
+
+             ---
+             ## Additional Instructions:
+             - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
+             - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
+             - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
+             - **Only describe actions of the labeled {cat_name} objects**—do not introduce unrelated categories.
+
+             Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
+
+             MAX_RETRIES = 3
+             retry_count = 0
+
+             if should_caption:
+                 while retry_count < MAX_RETRIES:
+                     selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])
+
+                     response2 = captioner.chat.completions.create(
+                         model=model,
+                         messages=[
+                             {
+                                 "role": "user",
+                                 "content": [
+                                     {
+                                         "type": "text",
+                                         "text": selected_prompt,
+                                     },
+                                     {
+                                         "type": "image_url",
+                                         "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
+                                     },
+                                 ],
+                             }
+                         ],
+                     )
+
+                     caption = response2.choices[0].message.content.strip()
+                     caption_lower = caption.lower().lstrip()
+
+                     # Accept only answers in the expected "1. ..." format that are not refusals.
+                     if caption_lower.startswith("1.") and not any(
+                         phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
+                     ):
+                         break
+
+                     print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
+                     retry_count += 1
+                     time.sleep(2)
+
+                 if retry_count == MAX_RETRIES:
+                     caption = None
+                     print("Max retries reached. Caption generation failed.")
+
+             else:
+                 caption = None
+
+             image_captions[frame_name] = caption
+         all_captions[cat_name] = image_captions
+
+     # final: also prepare valid object ids
+     valid_obj_ids = dict()
+
+     for cat in cat_names:
+         if cat in ytvos_category_valid_list:
+             obj_id_cat = vid_meta['obj_id_cat']
+             valid_cat_ids = []
+             for obj_id in list(obj_id_cat.keys()):
+                 if obj_id_cat[obj_id] == cat:
+                     valid_cat_ids.append(obj_id)
+             valid_obj_ids[cat] = valid_cat_ids
+
+     return vid_id, all_captions, valid_obj_ids
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+     parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
+     parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
+
+     args = parser.parse_args()
+
+     # ================== Load data ==================
+     # full dataset
+     train_dataset = build_ytvos_ref(image_set='train', args=args)
+
+     # metadata for the full dataset
+     metas = train_dataset.metas
+
+     # 8 candidate colors (RGB)
+     colors = [
+         (255, 0, 0),    # Red
+         (0, 255, 0),    # Green
+         (0, 0, 255),    # Blue
+         (255, 255, 0),  # Yellow
+         (255, 0, 255),  # Magenta
+         (0, 255, 255),  # Cyan
+         (128, 0, 128),  # Purple
+         (255, 165, 0)   # Orange
+     ]
+
+     ytvos_category_valid_list = [
+         'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
+         'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
+         'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
+         'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
+         'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
+         'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
+     ]
+
+     # ================== Run GPT ==================
+     os.environ['OPENAI_API_KEY'] = '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'
+
+     result_captions = {}
+     result_valid_obj_ids = {}
+
+     for i in range(len(metas)):
+         try:
+             vid_id, all_captions, valid_obj_ids = getCaption(i)
+
+             if vid_id not in result_captions:
+                 result_captions[vid_id] = all_captions
+             if vid_id not in result_valid_obj_ids:
+                 result_valid_obj_ids[vid_id] = valid_obj_ids
+
+         except (requests.exceptions.ConnectionError, APIConnectionError) as e:
+             print(f"created caption until {i-1}", flush=True)
+             print("Could not process the request due to a network connection problem:", e, flush=True)
+
+             with open(args.save_caption_path, "w") as file:
+                 json.dump(result_captions, file, indent=4)
+
+             with open(args.save_valid_obj_ids_path, "w") as file:
+                 json.dump(result_valid_obj_ids, file, indent=4)
+
+         except OpenAIError as e:
+             print(f"created caption until {i-1}", flush=True)
+             print("An OpenAI API error occurred:", e, flush=True)
+
+             with open(args.save_caption_path, "w") as file:
+                 json.dump(result_captions, file, indent=4)
+
+             with open(args.save_valid_obj_ids_path, "w") as file:
+                 json.dump(result_valid_obj_ids, file, indent=4)
+
+         except Exception as e:
+             print(f"created caption until {i-1}", flush=True)
+             print("An unknown error occurred:", e, flush=True)
+
+             with open(args.save_caption_path, "w") as file:
+                 json.dump(result_captions, file, indent=4)
+
+             with open(args.save_valid_obj_ids_path, "w") as file:
+                 json.dump(result_valid_obj_ids, file, indent=4)
+
+     print("Finished!", flush=True)
+
+     with open(args.save_caption_path, "w") as file:
+         json.dump(result_captions, file, indent=4)
+
+     with open(args.save_valid_obj_ids_path, "w") as file:
+         json.dump(result_valid_obj_ids, file, indent=4)
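
The only change in this revision is the exception handling at the bottom: one handler becomes three (network errors, OpenAI API errors, everything else), each checkpointing before the loop continues. Because the bodies differ only in the message, the same behavior can be sketched with a shared error callback (assuming the v1 openai SDK, whose error classes import from the package root; `run_one` is an illustrative name):

    import requests
    from openai import APIConnectionError, OpenAIError

    def run_one(fn, on_error):
        # Execute one unit of work; classify failures and checkpoint via on_error.
        try:
            return fn()
        except (requests.exceptions.ConnectionError, APIConnectionError) as e:
            on_error(f"network connection problem: {e}")
        except OpenAIError as e:
            on_error(f"OpenAI API error: {e}")
        except Exception as e:
            on_error(f"unknown error: {e}")

Ordering matters here: `APIConnectionError` is itself an `OpenAIError`, so the more specific handler must come first, which the script gets right.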
.history/mbench/make_ref-ytvos_json_20250117032501.py ADDED
@@ -0,0 +1,104 @@
1
+ import sys
2
+ import os
3
+ from os import path as osp
4
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
5
+
6
+ from datasets import build_dataset
7
+ import argparse
8
+ import opts
9
+
10
+
11
+ from pathlib import Path
12
+ import io
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ #==================json 만들기===================
37
+ def createJson(train_dataset, metas):
38
+ entire_json = {}
39
+
40
+ #초기화
41
+ vid_idx = 0
42
+
43
+ while vid_idx < len(train_dataset):
44
+
45
+ #하나의 비디오에 대해
46
+ video_data = {}
47
+ video_train_frames, video_train_info = train_dataset[vid_idx]
48
+ video_meta = metas[vid_idx]
49
+
50
+ video_id = video_meta['video']
51
+ video_data['bins'] = video_meta['bins']
52
+ bin_nums = len(video_meta['bins'])
53
+ obj_nums = len(list(video_meta['obj_id_cat'].keys()))
54
+
55
+ annotation_data = []
56
+ frame_names = []
57
+
58
+ for i in range(bin_nums):
59
+ bin_data = {}
60
+ for j in range(obj_nums):
61
+ obj_id = str(j+1)
62
+ obj_data = {
63
+ "category_name":video_meta['obj_id_cat'][obj_id],
64
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :]
65
+ }
66
+ bin_data[obj_id] = obj_data
67
+ annotation_data.append(bin_data)
68
+
69
+ video_data['annotations'] = annotation_data
70
+
71
+
72
+ sample_indx = metas[vid_idx]['sample_indx']
73
+ frames = metas[vid_idx]['frames']
74
+ for i in sample_indx:
75
+ frame_name = frames[i]
76
+ frame_names.append(frame_name)
77
+
78
+ video_data['frame_names'] = frame_names
79
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
80
+ entire_json[video_id] = video_data
81
+
82
+ vid_idx += 1
83
+
84
+ return entire_json
85
+
86
+
87
+ if __name__ == '__main__':
88
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
89
+ args = parser.parse_args()
90
+
91
+ #================== Load the data ===================
92
+ # full dataset
93
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
94
+
95
+ # metadata for the full dataset
96
+ metas = train_dataset.metas
97
+
98
+ #================== Build the JSON ===================
99
+ entire_json_dict = createJson(train_dataset, metas)
100
+ print(type(entire_json_dict))
101
+ entire_json = json.dumps(entire_json_dict, indent=4)
102
+
103
+ with open('mbench/sampled_frame2.json', mode='w') as file:
104
+ file.write(entire_json)
.history/mbench/make_ref-ytvos_json_20250117072314.py ADDED
@@ -0,0 +1,107 @@
1
+ import sys
2
+ import os
3
+ from os import path as osp
4
+ sys.path.append(osp.abspath(osp.join(osp.dirname(__file__), '..')))
5
+
6
+ from datasets import build_dataset
7
+ import argparse
8
+ import opts
9
+
10
+
11
+ from pathlib import Path
12
+ import io
13
+
14
+ import numpy as np
15
+ import pandas as pd
16
+ import regex as re
17
+ import json
18
+
19
+ import cv2
20
+ from PIL import Image, ImageDraw
21
+ import torch
22
+ from torchvision.transforms import functional as F
23
+
24
+ from skimage import measure # (pip install scikit-image)
25
+ from shapely.geometry import Polygon, MultiPolygon # (pip install Shapely)
26
+
27
+ import matplotlib.pyplot as plt
28
+ import matplotlib.patches as patches
29
+ from matplotlib.collections import PatchCollection
30
+ from matplotlib.patches import Rectangle
31
+
32
+
33
+ import ipywidgets as widgets
34
+ from IPython.display import display, clear_output
35
+
36
+ #================== Build the JSON ===================
37
+ def createJson(train_dataset, metas):
38
+ entire_json = {}
39
+
40
+ # initialize the video index
41
+ vid_idx = 0
42
+
43
+ while vid_idx < len(train_dataset):
44
+
45
+ # process a single video
46
+ video_data = {}
47
+ video_train_frames, video_train_info = train_dataset[vid_idx]
48
+ video_meta = metas[vid_idx]
49
+
50
+ video_id = video_meta['video']
51
+ video_data['bins'] = video_meta['bins']
52
+ bin_nums = len(video_meta['bins'])
53
+ obj_nums = len(list(video_meta['obj_id_cat'].keys()))
54
+
55
+ annotation_data = []
56
+ frame_names = []
57
+
58
+ for i in range(bin_nums):
59
+ bin_data = {}
60
+ for j in range(obj_nums):
61
+ try:
62
+ obj_id = str(j+1)
63
+ obj_data = {
64
+ "category_name":video_meta['obj_id_cat'][obj_id],
65
+ "bbox":video_train_info['boxes'][i*obj_nums+j, :]
66
+ }
67
+ bin_data[obj_id] = obj_data
68
+ except:
69
+ continue
70
+ annotation_data.append(bin_data)
71
+
72
+ video_data['annotations'] = annotation_data
73
+
74
+
75
+ sample_indx = metas[vid_idx]['sample_indx']
76
+ frames = metas[vid_idx]['frames']
77
+ for i in sample_indx:
78
+ frame_name = frames[i]
79
+ frame_names.append(frame_name)
80
+
81
+ video_data['frame_names'] = frame_names
82
+ video_data['video_path'] = os.path.join(str(train_dataset.img_folder), 'JPEGImages', video_id)
83
+ entire_json[video_id] = video_data
84
+
85
+ vid_idx += 1
86
+
87
+ return entire_json
88
+
89
+
90
+ if __name__ == '__main__':
91
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
92
+ args = parser.parse_args()
93
+
94
+ #================== Load the data ===================
95
+ # full dataset
96
+ train_dataset = build_dataset('ytvos_ref', image_set = 'train', args = args)
97
+
98
+ # metadata for the full dataset
99
+ metas = train_dataset.metas
100
+
101
+ #================== Build the JSON ===================
102
+ entire_json_dict = createJson(train_dataset, metas)
103
+ print(type(entire_json_dict))
104
+ entire_json = json.dumps(entire_json_dict, indent=4)
105
+
106
+ with open('mbench/sampled_frame2.json', mode='w') as file:
107
+ file.write(entire_json)
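For reference, each entry written to `mbench/sampled_frame2.json` is keyed by video id and carries the keys built in `createJson` above (`bins`, `annotations`, `frame_names`, `video_path`). A minimal sketch for loading it back, using the same path as the script:

```
import json

with open('mbench/sampled_frame2.json') as f:
    sampled = json.load(f)

video_id, video_data = next(iter(sampled.items()))
print(video_data['frame_names'])     # sampled frame names for this video
print(video_data['annotations'][0])  # per-object category_name / bbox for the first bin
```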
.history/mbench_a2d/gpt_a2d_numbered_20250206114207.py ADDED
@@ -0,0 +1,205 @@
1
+ import os
2
+ import sys
3
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
4
+
5
+ from datasets import build_dataset
6
+ import argparse
7
+ import opts
8
+ import time
9
+
10
+ import numpy as np
11
+ import matplotlib.pyplot as plt
12
+ import cv2
13
+ from io import BytesIO
14
+ import base64
15
+ from PIL import Image
16
+ import json
17
+
18
+ from openai import OpenAI
19
+
20
+ def mark_object_and_encode(frame, mask, instance_id, text_query, color_mask=False, label_number=False):
21
+ # whether to tint the mask region
22
+ if color_mask == True:
23
+ alpha = 0.1
24
+
25
+ colored_mask = np.zeros_like(frame)
26
+ colored_mask[mask == 1] = [255, 0, 0]
27
+ frame[mask == 1] = (
28
+ (1 - alpha) * frame[mask == 1] +
29
+ alpha * colored_mask[mask == 1]
30
+ )
31
+
32
+ # draw the mask outline
33
+ contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
34
+ cv2.drawContours(frame, contours, -1, [255, 0, 0], 2)
35
+
36
+ # whether to draw the instance_id label
37
+ if label_number == True:
38
+ if len(contours) > 0:
39
+ largest_contour = max(contours, key=cv2.contourArea)
40
+ M = cv2.moments(largest_contour)
41
+ if M["m00"] != 0:
42
+ center_x = int(M["m10"] / M["m00"])
43
+ center_y = int(M["m01"] / M["m00"])
44
+ else:
45
+ center_x, center_y = 0, 0
46
+
47
+ font = cv2.FONT_HERSHEY_SIMPLEX
48
+ text = str(instance_id)
49
+ font_scale = 0.6
50
+ text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
51
+ text_x = center_x - text_size[0] // 1 # horizontal center of the text
52
+ text_y = center_y
53
+ # text_y = center_y + text_size[1] // 2 # vertical center of the text
54
+
55
+ # compute the background rectangle for the text
56
+ rect_start = (text_x - 5, text_y - text_size[1] - 5) # top-left corner of the background rectangle
57
+ # rect_end = (text_x + text_size[0] + 5, text_y + 5)
58
+ rect_end = (text_x + text_size[0] + 5, text_y)
59
+
60
+ cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
61
+ cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)
62
+
63
+ # plt.figure(figsize=(6, 10))
64
+ # plt.imshow(frame)
65
+ # plt.title(text_query)
66
+ # plt.tight_layout()
67
+ # plt.axis('off')
68
+ # plt.show()
69
+
70
+ buffer = BytesIO()
71
+ frame = Image.fromarray(frame)
72
+ frame.save(buffer, format='jpeg')
73
+ buffer.seek(0)
74
+ encoded_frame = base64.b64encode(buffer.read()).decode("utf-8")
75
+
76
+ return encoded_frame
77
+
78
+ def getCaption(frame, mask, instance_id, text_query, model='gpt-4o', color_mask=False, label_number=True):
79
+
80
+ base64_image = mark_object_and_encode(frame, mask, instance_id, text_query, color_mask, label_number)
81
+
82
+ captioner = OpenAI()
83
+
84
+ # build the referring expression directly, without filtering
85
+ dense_caption_prompt = f"""
86
+ You are a visual assistant analyzing a single frame of a video.
87
+ In the given frame, I labeled 1 object by marking each with a bright numeric ID at the center and its boundary.
88
+ I also give you a text query describing the marked object.
89
+ I want to use your expression to create an **action-centric referring expression** dataset.
90
+ Based on the frame and text query, please describe the marked object using **clearly observable** and **specific** actions
91
+ ---
92
+ ## Guidelines:
93
+ 1. **Focus on visible, prominent actions** only (e.g., running, pushing, grasping an object).
94
+ 2. **Avoid describing minor or ambiguous actions** (e.g., "slightly moving a paw", "slightly tilting head").
95
+ 3. **Do not include subjective or speculative descriptions** (e.g., “it seems excited” or “it might be preparing to jump”).
96
+ 4. **Avoid vague expressions** like "interacting with something" or "engaging with another object." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
97
+ 5. **Use dynamic action verbs** (holding, throwing, inspecting, leaning, pressing) to highlight body movement or object/animal interaction.
98
+ 6. If there are multiple objects, ensure the description for the marked object **differentiates** its action.
99
+ 7. Base your description on these action definitions:
100
+ - Avoid using term 'minimal' or 'slightly'.
101
+ - General body movement, body position, or pattern which is prominent. (e.g. "lifting head up", "facing towards", "showing its back")
102
+ - details such as motion and intention, facial with object manipulation
103
+ - movements with object or other entities when they are prominent and observable. expression should be specific.
104
+ (e.g., "pushing another person" (O), "engaging with someone" (X) "interacting with another person" (X))
105
+ --
106
+ ## Output Format:
107
+ - For each labeled object, output **exactly one line**. Your answer should contain details and follow the following format :
108
+ object id. action-oriented description
109
+ (e.g. 1. the person is holding ski poles and skiing on a snow mountain, with his two legs bent forward.)
110
+ ### Example
111
+ If the frame has 1 labeled bear, your output should look like:
112
+ 1. the bear reaching his right arm while leaning forward to capture the prey
113
+ ---
114
+ **Do not include** appearance details (e.g., color, size, texture) or relative positioning (e.g., “on the left/right”).
115
+ **Do not include object IDs** or reference them (e.g., "Person 1" or "object 2" is not allowed).
116
+ **Do not include markdown** in the output.
117
+ Keep in mind that you should not group the object, e.g., 2-5. people: xxx, be sure to describe each object separately (one by one).
118
+ For each labeled object, output referring expressions for each object id.
119
+ """
120
+ prompt_with_text_query = f"prompt: {dense_caption_prompt}\n text query: {text_query}"
121
+
122
+ MAX_RETRIES = 2
123
+ retry_count = 0
124
+
125
+ while retry_count < MAX_RETRIES:
126
+ response = captioner.chat.completions.create(
127
+ model=model,
128
+ messages=[
129
+ {
130
+ "role": "user",
131
+ "content": [
132
+ {
133
+ "type": "text",
134
+ "text": prompt_with_text_query,
135
+ },
136
+ {
137
+ "type": "image_url",
138
+ "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
139
+ },
140
+ ],
141
+ }
142
+ ],
143
+ )
144
+
145
+
146
+ caption = response.choices[0].message.content.strip()
147
+ caption_lower = caption.lower().lstrip()
148
+ if caption_lower.startswith("1.") and not any(
149
+ phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
150
+ ):
151
+ break
152
+ print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
153
+ retry_count += 1
154
+ time.sleep(2)
155
+
156
+ if retry_count == MAX_RETRIES:
157
+ caption = None
158
+ print("Max retries reached. Caption generation failed.")
159
+
160
+ else:
161
+ caption = None
162
+
163
+ return caption
164
+
165
+ if __name__ == "__main__":
166
+ parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
167
+ parser.add_argument('--save_caption_path', type=str, default='mbench_a2d/numbered_captions.json')
168
+ args = parser.parse_args()
169
+
170
+ train_dataset = build_dataset('a2d', image_set = 'train', args = args)
171
+ text_annotations = train_dataset.text_annotations
172
+
173
+ all_captions = {}
174
+
175
+ #os.environ['OPENAI_API_KEY'] = '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'
176
+ os.environ['OPENAI_API_KEY'] = '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'
177
+
178
+ first_text_query = ""
179
+ for idx in range(100):
180
+ imgs, target = train_dataset[idx]
181
+ frames_idx = target['frames_idx'].tolist()
182
+ text_query, vid_id, frame_id, instance_id = text_annotations[idx]
183
+
184
+ if text_query == first_text_query:
185
+ continue
186
+
187
+ print(f"------------vid id: {vid_id}, frame id: {frame_id}", flush=True)
188
+
189
+ frame_id = frame_id - 1
190
+ frame_order = frames_idx.index(frame_id)
191
+
192
+ frame = imgs[frame_order, :, :, :].permute(1, 2, 0).numpy()
193
+ mask = target['masks'].numpy().astype(np.uint8).squeeze()
194
+
195
+ caption = getCaption(frame, mask, instance_id, text_query, model='gpt-4o-mini')
196
+ if vid_id not in all_captions:
197
+ all_captions[vid_id] = {frame_id : caption}
198
+ else:
199
+ all_captions[vid_id][frame_id] = caption
200
+
201
+ print("Finished!", flush=True)
202
+
203
+ with open(args.save_caption_path, 'w') as file:
204
+ json.dump(all_captions, file, indent=4)
205
+
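Since the prompt asks the model for exactly one line of the form `1. <description>`, a consumer can strip that prefix when reading the saved captions. A minimal sketch, assuming the file produced by the script above:

```
import json
import re

with open('mbench_a2d/numbered_captions.json') as f:
    all_captions = json.load(f)

for vid_id, per_frame in all_captions.items():
    for frame_id, caption in per_frame.items():
        if caption is None:  # generation failed after MAX_RETRIES
            continue
        description = re.sub(r'^\s*1\.\s*', '', caption)
        print(vid_id, frame_id, description)
```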
__pycache__/opts.cpython-310.pyc ADDED
Binary file (5.44 kB). View file
 
__pycache__/opts.cpython-39.pyc ADDED
Binary file (5.44 kB). View file
 
__pycache__/refer.cpython-39.pyc ADDED
Binary file (10.1 kB). View file
 
davis2017/davis.py ADDED
@@ -0,0 +1,122 @@
1
+ import os
2
+ from glob import glob
3
+ from collections import defaultdict
4
+ import numpy as np
5
+ from PIL import Image
6
+
7
+
8
+ class DAVIS(object):
9
+ SUBSET_OPTIONS = ['train', 'val', 'test-dev', 'test-challenge']
10
+ TASKS = ['semi-supervised', 'unsupervised']
11
+ DATASET_WEB = 'https://davischallenge.org/davis2017/code.html'
12
+ VOID_LABEL = 255
13
+
14
+ def __init__(self, root, task='unsupervised', subset='val', sequences='all', resolution='480p', codalab=False):
15
+ """
16
+ Class to read the DAVIS dataset
17
+ :param root: Path to the DAVIS folder that contains JPEGImages, Annotations, etc. folders.
18
+ :param task: Task to load the annotations, choose between semi-supervised or unsupervised.
19
+ :param subset: Set to load the annotations
20
+ :param sequences: Sequences to consider, 'all' to use all the sequences in a set.
21
+ :param resolution: Specify the resolution to use the dataset, choose between '480' and 'Full-Resolution'
22
+ """
23
+ if subset not in self.SUBSET_OPTIONS:
24
+ raise ValueError(f'Subset should be in {self.SUBSET_OPTIONS}')
25
+ if task not in self.TASKS:
26
+ raise ValueError(f'The only tasks that are supported are {self.TASKS}')
27
+
28
+ self.task = task
29
+ self.subset = subset
30
+ self.root = root
31
+ self.img_path = os.path.join(self.root, 'JPEGImages', resolution)
32
+ annotations_folder = 'Annotations' if task == 'semi-supervised' else 'Annotations_unsupervised'
33
+ self.mask_path = os.path.join(self.root, annotations_folder, resolution)
34
+ year = '2019' if task == 'unsupervised' and (subset == 'test-dev' or subset == 'test-challenge') else '2017'
35
+ self.imagesets_path = os.path.join(self.root, 'ImageSets', year)
36
+
37
+ self._check_directories()
38
+
39
+ if sequences == 'all':
40
+ with open(os.path.join(self.imagesets_path, f'{self.subset}.txt'), 'r') as f:
41
+ tmp = f.readlines()
42
+ sequences_names = [x.strip() for x in tmp]
43
+ else:
44
+ sequences_names = sequences if isinstance(sequences, list) else [sequences]
45
+ self.sequences = defaultdict(dict)
46
+
47
+ for seq in sequences_names:
48
+ images = np.sort(glob(os.path.join(self.img_path, seq, '*.jpg'))).tolist()
49
+ if len(images) == 0 and not codalab:
50
+ raise FileNotFoundError(f'Images for sequence {seq} not found.')
51
+ self.sequences[seq]['images'] = images
52
+ masks = np.sort(glob(os.path.join(self.mask_path, seq, '*.png'))).tolist()
53
+ masks.extend([-1] * (len(images) - len(masks)))
54
+ self.sequences[seq]['masks'] = masks
55
+
56
+ def _check_directories(self):
57
+ if not os.path.exists(self.root):
58
+ raise FileNotFoundError(f'DAVIS not found in the specified directory, download it from {self.DATASET_WEB}')
59
+ if not os.path.exists(os.path.join(self.imagesets_path, f'{self.subset}.txt')):
60
+ raise FileNotFoundError(f'Subset sequences list for {self.subset} not found, download the missing subset '
61
+ f'for the {self.task} task from {self.DATASET_WEB}')
62
+ if self.subset in ['train', 'val'] and not os.path.exists(self.mask_path):
63
+ raise FileNotFoundError(f'Annotations folder for the {self.task} task not found, download it from {self.DATASET_WEB}')
64
+
65
+ def get_frames(self, sequence):
66
+ for img, msk in zip(self.sequences[sequence]['images'], self.sequences[sequence]['masks']):
67
+ image = np.array(Image.open(img))
68
+ mask = None if msk is None else np.array(Image.open(msk))
69
+ yield image, mask
70
+
71
+ def _get_all_elements(self, sequence, obj_type):
72
+ obj = np.array(Image.open(self.sequences[sequence][obj_type][0]))
73
+ all_objs = np.zeros((len(self.sequences[sequence][obj_type]), *obj.shape))
74
+ obj_id = []
75
+ for i, obj in enumerate(self.sequences[sequence][obj_type]):
76
+ all_objs[i, ...] = np.array(Image.open(obj))
77
+ obj_id.append(''.join(obj.split('/')[-1].split('.')[:-1]))
78
+ return all_objs, obj_id
79
+
80
+ def get_all_images(self, sequence):
81
+ return self._get_all_elements(sequence, 'images')
82
+
83
+ def get_all_masks(self, sequence, separate_objects_masks=False):
84
+ masks, masks_id = self._get_all_elements(sequence, 'masks')
85
+ masks_void = np.zeros_like(masks)
86
+
87
+ # Separate void and object masks
88
+ for i in range(masks.shape[0]):
89
+ masks_void[i, ...] = masks[i, ...] == 255
90
+ masks[i, masks[i, ...] == 255] = 0
91
+
92
+ if separate_objects_masks:
93
+ num_objects = int(np.max(masks[0, ...]))
94
+ tmp = np.ones((num_objects, *masks.shape))
95
+ tmp = tmp * np.arange(1, num_objects + 1)[:, None, None, None]
96
+ masks = (tmp == masks[None, ...])
97
+ masks = masks > 0
98
+ return masks, masks_void, masks_id
99
+
100
+ def get_sequences(self):
101
+ for seq in self.sequences:
102
+ yield seq
103
+
104
+
105
+ if __name__ == '__main__':
106
+ from matplotlib import pyplot as plt
107
+
108
+ only_first_frame = True
109
+ subsets = ['train', 'val']
110
+
111
+ for s in subsets:
112
+ dataset = DAVIS(root='/home/csergi/scratch2/Databases/DAVIS2017_private', subset=s)
113
+ for seq in dataset.get_sequences():
114
+ g = dataset.get_frames(seq)
115
+ img, mask = next(g)
116
+ plt.subplot(2, 1, 1)
117
+ plt.title(seq)
118
+ plt.imshow(img)
119
+ plt.subplot(2, 1, 2)
120
+ plt.imshow(mask)
121
+ plt.show(block=True)
122
+
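Beyond the demo in `__main__` above, `get_all_masks` with `separate_objects_masks=True` splits the annotation into one boolean channel per object id. A minimal usage sketch (the root path is a placeholder):

```
dataset = DAVIS(root='/path/to/DAVIS', task='unsupervised', subset='val')
seq = next(dataset.get_sequences())
masks, masks_void, masks_id = dataset.get_all_masks(seq, separate_objects_masks=True)
# masks: (num_objects, num_frames, H, W) boolean array, one channel per object id
```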
docs/davis_demo1.gif ADDED

Git LFS Details

  • SHA256: e3203f7df580fb3903bf46f23a95d1efdf23ed57497691fe673ed51c05a790df
  • Pointer size: 133 Bytes
  • Size of remote file: 15.3 MB
docs/davis_demo2.gif ADDED

Git LFS Details

  • SHA256: b9301ea2739bd30f44acfd98f99e68ceb4d9deef0ac7458a5de72b7efd2e7445
  • Pointer size: 133 Bytes
  • Size of remote file: 12.7 MB
docs/install.md ADDED
@@ -0,0 +1,42 @@
1
+ # Installation
2
+
3
+ We provide instructions for installing the dependency packages.
4
+
5
+ ## Requirements
6
+
7
+ We tested the code in the following environment; other versions may also be compatible:
8
+
9
+ - CUDA 11.1
10
+ - Python 3.7
11
+ - Pytorch 1.8.1
12
+
13
+
14
+
15
+ ## Setup
16
+
17
+ First, clone the repository locally.
18
+
19
+ ```
20
+ git clone https://github.com/wjn922/ReferFormer.git
21
+ ```
22
+
23
+ Then, install PyTorch 1.8.1 in the conda environment.
24
+ ```
25
+ conda install pytorch==1.8.1 torchvision==0.9.1 torchaudio==0.8.1 -c pytorch
26
+ ```
27
+
28
+ Install the necessary packages and pycocotools.
29
+
30
+ ```
31
+ pip install -r requirements.txt
32
+ pip install 'git+https://github.com/facebookresearch/fvcore'
33
+ pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
34
+ ```
35
+
36
+ Finally, compile the CUDA operators.
37
+
38
+ ```
39
+ cd models/ops
40
+ python setup.py build install
41
+ cd ../..
42
+ ```
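If the build finishes without errors, a quick import check can confirm the extension is visible to Python. The module name below follows the Deformable-DETR convention this repo builds on, so treat it as an assumption:

```
# should import without error once `setup.py build install` has run
import MultiScaleDeformableAttention  # assumed extension name, per Deformable-DETR
```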
docs/network.png ADDED

Git LFS Details

  • SHA256: 9b52dc182c58c0ce59086750d1c2657dcdbeb9c9771add8cca93ad8a55feba0e
  • Pointer size: 132 Bytes
  • Size of remote file: 3.48 MB
docs/ytvos_demo1.gif ADDED

Git LFS Details

  • SHA256: 073a90379317b9ddcaae5459a7f7a66ba4d722fa34600af44c3ebc0a3b6fe402
  • Pointer size: 132 Bytes
  • Size of remote file: 9.67 MB
docs/ytvos_demo2.gif ADDED

Git LFS Details

  • SHA256: 97fe7b1d10968a32bc5dac0c0f543e334c6e59c65de78029d1eed0e3f10b0486
  • Pointer size: 133 Bytes
  • Size of remote file: 14.8 MB
hf_cache/.locks/models--zhiqiulin--clip-flant5-xxl/e14a3254bf04f32056759bdc60c64736e7638f31b43957586ff2442ff393890a.lock ADDED
File without changes
hf_cache/models--zhiqiulin--clip-flant5-xxl/snapshots/89bad6fffe1126b24d4360c1e1f69145eb6103aa/pytorch_model-00002-of-00003.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12acb5074c883dcab3e166d86d20130615ff83b0d26736ee046f4184202ebd3b
3
+ size 9999791010
make_ref-ytvos/manual_selection.ipynb ADDED
@@ -0,0 +1,381 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "/data/projects/yejin/VerbCentric_RIS/ReferFormer\n"
13
+ ]
14
+ },
15
+ {
16
+ "name": "stderr",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "/home/yejin/.conda/envs/VerbCentric_RIS/lib/python3.9/site-packages/IPython/core/magics/osm.py:417: UserWarning: using dhist requires you to install the `pickleshare` library.\n",
20
+ " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
21
+ ]
22
+ }
23
+ ],
24
+ "source": [
25
+ "%cd /home/yejin/data/projects/yejin/VerbCentric_RIS/ReferFormer"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "markdown",
30
+ "metadata": {},
31
+ "source": [
32
+ "## 1. manual 필터링 반영"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": 18,
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "import pandas as pd\n",
42
+ "import re\n",
43
+ "import json"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 31,
49
+ "metadata": {},
50
+ "outputs": [],
51
+ "source": [
52
+ "selected_frames_df = pd.read_json(\"/home/yejin/data/dataset/VRIS/mbench/ytvos/selected_instances.jsonl\", lines = True)\n",
53
+ "manual_selected = pd.read_json(\"manual_selected_frames.jsonl\", lines = True)"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 32,
59
+ "metadata": {},
60
+ "outputs": [
61
+ {
62
+ "data": {
63
+ "text/html": [
64
+ "<div>\n",
65
+ "<style scoped>\n",
66
+ " .dataframe tbody tr th:only-of-type {\n",
67
+ " vertical-align: middle;\n",
68
+ " }\n",
69
+ "\n",
70
+ " .dataframe tbody tr th {\n",
71
+ " vertical-align: top;\n",
72
+ " }\n",
73
+ "\n",
74
+ " .dataframe thead th {\n",
75
+ " text-align: right;\n",
76
+ " }\n",
77
+ "</style>\n",
78
+ "<table border=\"1\" class=\"dataframe\">\n",
79
+ " <thead>\n",
80
+ " <tr style=\"text-align: right;\">\n",
81
+ " <th></th>\n",
82
+ " <th>segmentation</th>\n",
83
+ " <th>bbox</th>\n",
84
+ " <th>area</th>\n",
85
+ " <th>file_name</th>\n",
86
+ " <th>height</th>\n",
87
+ " <th>width</th>\n",
88
+ " <th>label</th>\n",
89
+ " <th>category_name</th>\n",
90
+ " <th>sentences</th>\n",
91
+ " </tr>\n",
92
+ " </thead>\n",
93
+ " <tbody>\n",
94
+ " <tr>\n",
95
+ " <th>0</th>\n",
96
+ " <td>[[1081.0, 719.5, 1051.0, 719.5, 1050.5, 716.0,...</td>\n",
97
+ " <td>[708.5, 156.5, 420.0, 563.0]</td>\n",
98
+ " <td>131357.25</td>\n",
99
+ " <td>00917dcfc4_00000.png</td>\n",
100
+ " <td>720</td>\n",
101
+ " <td>1280</td>\n",
102
+ " <td>64</td>\n",
103
+ " <td>zebra</td>\n",
104
+ " <td>{'tokens': ['a', 'zebra', 'on', 'the', 'right'...</td>\n",
105
+ " </tr>\n",
106
+ " </tbody>\n",
107
+ "</table>\n",
108
+ "</div>"
109
+ ],
110
+ "text/plain": [
111
+ " segmentation \\\n",
112
+ "0 [[1081.0, 719.5, 1051.0, 719.5, 1050.5, 716.0,... \n",
113
+ "\n",
114
+ " bbox area file_name height \\\n",
115
+ "0 [708.5, 156.5, 420.0, 563.0] 131357.25 00917dcfc4_00000.png 720 \n",
116
+ "\n",
117
+ " width label category_name \\\n",
118
+ "0 1280 64 zebra \n",
119
+ "\n",
120
+ " sentences \n",
121
+ "0 {'tokens': ['a', 'zebra', 'on', 'the', 'right'... "
122
+ ]
123
+ },
124
+ "execution_count": 32,
125
+ "metadata": {},
126
+ "output_type": "execute_result"
127
+ }
128
+ ],
129
+ "source": [
130
+ "selected_frames_df"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "for i in range(len(manual_selected)):\n",
140
+ " idx = manual_selected.loc[i, \"index\"]\n",
141
+ " new_sent = manual_selected.loc[i, 'new_sent']\n",
142
+ "\n",
143
+ " if new_sent != \"\":\n",
144
+ " new_sent_dict = {\n",
145
+ " \"tokens\" : new_sent.split(' '),\n",
146
+ " \"raw\" : new_sent,\n",
147
+ " \"sent\" : re.sub('[^A-Za-z0-9\\s]+', '', new_sent.lower())\n",
148
+ " }\n",
149
+ " selected_frames_df.at[idx, 'sentences'] = new_sent_dict"
150
+ ]
151
+ },
152
+ {
153
+ "cell_type": "code",
154
+ "execution_count": null,
155
+ "metadata": {},
156
+ "outputs": [],
157
+ "source": [
158
+ "manual_selected_frames = selected_frames_df.loc[manual_selected['index'].values]"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": 31,
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "manual_selected_frames.to_json(\"revised_frames.jsonl\", orient='records', lines=True)"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "markdown",
172
+ "metadata": {},
173
+ "source": [
174
+ "## 2. lmdb로 변환하기 위해 마스크 저장하기"
175
+ ]
176
+ },
177
+ {
178
+ "cell_type": "code",
179
+ "execution_count": 2,
180
+ "metadata": {},
181
+ "outputs": [],
182
+ "source": [
183
+ "import argparse\n",
184
+ "import os\n",
185
+ "import os.path as osp\n",
186
+ "import lmdb\n",
187
+ "import pyarrow as pa\n",
188
+ "import json\n",
189
+ "from tqdm import tqdm\n",
190
+ "import matplotlib.pyplot as plt\n",
191
+ "from skimage import io\n",
192
+ "import numpy as np\n",
193
+ "from shapely.geometry import Polygon, MultiPolygon\n",
194
+ "from matplotlib.collections import PatchCollection\n",
195
+ "from pycocotools import mask\n",
196
+ "import warnings\n",
197
+ "warnings.filterwarnings(\"ignore\")"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": 3,
203
+ "metadata": {},
204
+ "outputs": [],
205
+ "source": [
206
+ "#jsonl 파일을 {index: json_obj, ... }형식으로\n",
207
+ "\n",
208
+ "json_data = []\n",
209
+ "\n",
210
+ "with open('revised_frames.jsonl', 'rb') as f:\n",
211
+ " for line in f:\n",
212
+ " json_data.append(json.loads(line)) "
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": 45,
218
+ "metadata": {},
219
+ "outputs": [],
220
+ "source": [
221
+ "def getMask(ann):\n",
222
+ " # return mask, area and mask-center\n",
223
+ " if type(ann['segmentation'][0]) == list: # polygon\n",
224
+ " rle = mask.frPyObjects(ann['segmentation'], ann['height'],\n",
225
+ " ann['width'])\n",
226
+ " else:\n",
227
+ " rle = ann['segmentation']\n",
228
+ " # for i in range(len(rle['counts'])):\n",
229
+ " # print(rle)\n",
230
+ " m = mask.decode(rle)\n",
231
+ " m = np.sum(\n",
232
+ " m, axis=2\n",
233
+ " ) # sometimes there are multiple binary map (corresponding to multiple segs)\n",
234
+ " m = m.astype(np.uint8) # convert to np.uint8\n",
235
+ " # compute area\n",
236
+ " area = sum(mask.area(rle)) # should be close to ann['area']\n",
237
+ " return {'mask': m, 'area': area}\n",
238
+ " # # position\n",
239
+ " # position_x = np.mean(np.where(m==1)[1]) # [1] means columns (matlab style) -> x (c style)\n",
240
+ " # position_y = np.mean(np.where(m==1)[0]) # [0] means rows (matlab style) -> y (c style)\n",
241
+ " # # mass position (if there were multiple regions, we use the largest one.)\n",
242
+ " # label_m = label(m, connectivity=m.ndim)\n",
243
+ " # regions = regionprops(label_m)\n",
244
+ " # if len(regions) > 0:\n",
245
+ " # \tlargest_id = np.argmax(np.array([props.filled_area for props in regions]))\n",
246
+ " # \tlargest_props = regions[largest_id]\n",
247
+ " # \tmass_y, mass_x = largest_props.centroid\n",
248
+ " # else:\n",
249
+ " # \tmass_x, mass_y = position_x, position_y\n",
250
+ " # # if centroid is not in mask, we find the closest point to it from mask\n",
251
+ " # if m[mass_y, mass_x] != 1:\n",
252
+ " # \tprint 'Finding closes mask point ...'\n",
253
+ " # \tkernel = np.ones((10, 10),np.uint8)\n",
254
+ " # \tme = cv2.erode(m, kernel, iterations = 1)\n",
255
+ " # \tpoints = zip(np.where(me == 1)[0].tolist(), np.where(me == 1)[1].tolist()) # row, col style\n",
256
+ " # \tpoints = np.array(points)\n",
257
+ " # \tdist = np.sum((points - (mass_y, mass_x))**2, axis=1)\n",
258
+ " # \tid = np.argsort(dist)[0]\n",
259
+ " # \tmass_y, mass_x = points[id]\n",
260
+ " # \t# return\n",
261
+ " # return {'mask': m, 'area': area, 'position_x': position_x, 'position_y': position_y, 'mass_x': mass_x, 'mass_y': mass_y}\n",
262
+ " # # show image and mask\n",
263
+ " # I = io.imread(osp.join(self.IMAGE_DIR, image['file_name']))\n",
264
+ " # plt.figure()\n",
265
+ " # plt.imshow(I)\n",
266
+ " # ax = plt.gca()\n",
267
+ " # img = np.ones( (m.shape[0], m.shape[1], 3) )\n",
268
+ " # color_mask = np.array([2.0,166.0,101.0])/255\n",
269
+ " # for i in range(3):\n",
270
+ " # img[:,:,i] = color_mask[i]\n",
271
+ " # ax.imshow(np.dstack( (img, m*0.5) ))\n",
272
+ " # plt.show()\n",
273
+ "\n",
274
+ "def showMask(ann, image_dir, mask_dir):\n",
275
+ " \n",
276
+ " fig, ax = plt.subplots()\n",
277
+ " I = io.imread(osp.join(image_dir, ann['file_name']))\n",
278
+ " ax.imshow(I)\n",
279
+ "\n",
280
+ " M = getMask(ann)\n",
281
+ " msk = M['mask']\n",
282
+ " #msk = io.imread(osp.join(mask_dir, ann['file_name']))\n",
283
+ " \n",
284
+ " ax.imshow(msk, alpha = 0.5)\n",
285
+ " ax.set_title(ann['sentences']['sent'])\n",
286
+ " plt.show()\n",
287
+ "\n",
288
+ "\n",
289
+ "\n",
290
+ "def saveMask(ann, mask_dir, seg_id):\n",
291
+ " M = getMask(ann)\n",
292
+ " msk = M['mask']\n",
293
+ " height, width = msk.shape\n",
294
+ " \n",
295
+ " fig, ax = plt.subplots(figsize=(width / 100, height / 100), dpi=100)\n",
296
+ " ax.imshow(msk, cmap='gray', vmin=0, vmax=1)\n",
297
+ "\n",
298
+ " save_path = f'{mask_dir}/{seg_id}'\n",
299
+ " plt.axis('off')\n",
300
+ " plt.subplots_adjust(left=0, right=1, top=1, bottom=0) # Remove padding\n",
301
+ "\n",
302
+ " fig.savefig(save_path, dpi=100, bbox_inches='tight', pad_inches=0)\n",
303
+ " \n",
304
+ " plt.close(fig)"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": 46,
310
+ "metadata": {},
311
+ "outputs": [],
312
+ "source": [
313
+ "for i in range(len(json_data)):\n",
314
+ " #showMask(json_data[i], image_dir = '/home/yejin/data/dataset/VRIS/mbench/ytvos/selected_frames', mask_dir = '/home/yejin/data/dataset/VRIS/mbench/ytvos/filtered_masks')\n",
315
+ " saveMask(json_data[i], '/home/yejin/data/dataset/VRIS/mbench/ytvos/filtered_masks_segid', i)"
316
+ ]
317
+ },
318
+ {
319
+ "cell_type": "code",
320
+ "execution_count": null,
321
+ "metadata": {},
322
+ "outputs": [],
323
+ "source": [
324
+ "##############안 쓰는 함수!###################\n",
325
+ "# 마스크 저장\n",
326
+ "# annotation dictionary as input\n",
327
+ "def saveMask(annotation, mask_dir, seg_box='seg'):\n",
328
+ " image_width = annotation['width']\n",
329
+ " image_height = annotation['height']\n",
330
+ "\n",
331
+ " fig, ax = plt.subplots(figsize=(image_width / 100, image_height / 100), facecolor='black') # figsize 단위는 인치, DPI 고려\n",
332
+ " ax.set_facecolor('black')\n",
333
+ " \n",
334
+ " \n",
335
+ " if seg_box == 'seg':\n",
336
+ " polygons = []\n",
337
+ " color = (1, 1, 1)\n",
338
+ " \n",
339
+ " if type(annotation['segmentation'][0]) == list:\n",
340
+ " # polygon used for refcoco*\n",
341
+ " for seg in annotation['segmentation']:\n",
342
+ " poly = np.array(seg).reshape((int(len(seg) / 2), 2))\n",
343
+ " polygons.append(Polygon(poly))\n",
344
+ "\n",
345
+ " p = PatchCollection(polygons,\n",
346
+ " facecolors=(1, 1, 1),\n",
347
+ " linewidths=0)\n",
348
+ " ax.add_collection(p)\n",
349
+ "\n",
350
+ " # 축 범위를 이미지 크기에 맞게 설정\n",
351
+ " ax.set_xlim(0, image_width)\n",
352
+ " ax.set_ylim(0, image_height)\n",
353
+ " \n",
354
+ " # y축 방향 뒤집기 (이미지 좌표계와 일치)\n",
355
+ " ax.invert_yaxis()\n",
356
+ " \n",
357
+ " # 플롯 표시\n",
358
+ " #plt.axis('equal') # 축 비율을 동일하게 설정\n",
359
+ " #plt.show()\n",
360
+ "\n",
361
+ " #플롯 저장\n",
362
+ " plt.axis('off') # 축 숨김 (선택 사항)\n",
363
+ " save_path = f'{mask_dir}/{annotation[\"file_name\"]}'\n",
364
+ " plt.savefig(save_path, bbox_inches='tight', pad_inches=0, facecolor='black')\n",
365
+ "\n",
366
+ "for annotation in json_data:\n",
367
+ " saveMask(annotation, mask_dir='/home/yejin/data/dataset/VRIS/mbench/ytvos/filtered_masks')\n",
368
+ " "
369
+ ]
370
+ }
371
+ ],
372
+ "metadata": {
373
+ "kernelspec": {
374
+ "display_name": "VerbCentric_RIS",
375
+ "language": "python",
376
+ "name": "verbcentric_ris"
377
+ }
378
+ },
379
+ "nbformat": 4,
380
+ "nbformat_minor": 2
381
+ }
make_refcoco/refcocog_google/multi_object_data_gref_google.json ADDED
The diff for this file is too large to render. See raw diff
 
make_refcoco/refcocog_google/needrevision_refid_part4.json ADDED
@@ -0,0 +1,506 @@
1
+ {
2
+ "4859": {
3
+ "101105": "man sitting on the ground playing wii",
4
+ "101106": "man in white and light blue t - shirt"
5
+ },
6
+ "678": {
7
+ "14720": "the man crouching inside the plane",
8
+ "14721": "the man wearing white hat"
9
+ },
10
+ "162": {
11
+ "2908": "the man resting his face on his hands",
12
+ "2909": "the man with a plastic bag between his feet"
13
+ },
14
+ "3052": {
15
+ "63901": "person looking at a book",
16
+ "63902": "person wearing a hat and backpack"
17
+ },
18
+ "2355": {
19
+ "49522": "the cat sitting in the chair",
20
+ "49523": "cat on left side"
21
+ },
22
+ "3408": {
23
+ "71397": "a man bending and judging a tennis match",
24
+ "71398": "a man wearing a red shirt and black pants"
25
+ },
26
+ "834": {
27
+ "17983": "a giraffe who is eating hay out of a feeder",
28
+ "17984": "the giraffe on the right side of the pole"
29
+ },
30
+ "328": {
31
+ "6730": "person bending over",
32
+ "6731": "big person in blue cap"
33
+ },
34
+ "1646": {
35
+ "35169": "person about to hit a ball",
36
+ "35170": "person wearing shirt and pants"
37
+ },
38
+ "4400": {
39
+ "91825": "boy sitting on his skateboard and looking at another boy",
40
+ "91826": "boy wearing dark t - shirt and jeans"
41
+ },
42
+ "3683": {
43
+ "77184": "a man dishing up food",
44
+ "77185": "a man in military camo and a black hat on the right"
45
+ },
46
+ "3788": {
47
+ "79367": "a black cat sitting and starring",
48
+ "79368": "a cat with a heart shaped tag"
49
+ },
50
+ "4701": {
51
+ "97795": "person whose tie is being pulled by another person",
52
+ "97796": "person in blue shirt with a red undone tie"
53
+ },
54
+ "1211": {
55
+ "26003": "person putting arm around another person",
56
+ "26004": "person with backpack"
57
+ },
58
+ "2138": {
59
+ "45446": "a person sleeping on the top bunk",
60
+ "45447": "a person in a green shirt and brown shorts"
61
+ },
62
+ "3510": {
63
+ "73478": "personn sitting in a train compartment and reading book",
64
+ "73479": "person in striped shirt"
65
+ },
66
+ "899": {
67
+ "19308": "a man serving soup",
68
+ "19309": "a man with tattoo on his arm"
69
+ },
70
+ "293": {
71
+ "5939": "a lady laughing and looking at another lady",
72
+ "5940": "a lady with dark hair and a dark shirt"
73
+ },
74
+ "3196": {
75
+ "67017": "person holding a pen",
76
+ "67018": "person in a brown suit"
77
+ },
78
+ "1939": {
79
+ "41076": "a person sitting cross legged on the beach",
80
+ "41077": "person in khakis and a white shirt with yellow flowers"
81
+ },
82
+ "2659": {
83
+ "56121": "person helping another cross a stream",
84
+ "56122": "person in white dress"
85
+ },
86
+ "2849": {
87
+ "59798": "person looking down drinking a glass of wine",
88
+ "59799": "person on the right side not wearing glasses"
89
+ },
90
+ "756": {
91
+ "16375": "the woman about to pick up a slice of pizza",
92
+ "16376": "a woman with a flower shirt"
93
+ },
94
+ "4573": {
95
+ "95258": "person reaching for another person with the frisbee",
96
+ "95259": "person with blue and white striped shirt on"
97
+ },
98
+ "4514": {
99
+ "94061": "person running behind",
100
+ "94062": "person in dark brown top and jeans"
101
+ },
102
+ "304": {
103
+ "6165": "person resting her head in hand and crossing one's legs",
104
+ "6166": "the person in pink jacket"
105
+ },
106
+ "3465": {
107
+ "72753": "person sitting on a love seat and watching others play wii",
108
+ "72754": "person in a black shirt and white shorts"
109
+ },
110
+ "1092": {
111
+ "23796": "a bear standing up with its mouth open",
112
+ "23797": "a bear on the right"
113
+ },
114
+ "2025": {
115
+ "42838": "the person leading the horse",
116
+ "42839": "the person in gray top and jeans"
117
+ },
118
+ "1701": {
119
+ "36094": "giraffe biting off of a tree",
120
+ "36095": "tall giraffe on the right"
121
+ },
122
+ "2958": {
123
+ "62137": "person playing with dog",
124
+ "62138": "balding person wearing brown hoodie"
125
+ },
126
+ "4793": {
127
+ "99824": "the girl eating and looking at her plate",
128
+ "99825": "the girl wearing a pink shirt"
129
+ },
130
+ "1247": {
131
+ "26624": "the person holding the bat",
132
+ "26625": "the person in white t - shirt and grey pants"
133
+ },
134
+ "1841": {
135
+ "38888": "person resting hands on other people's shoulders",
136
+ "38889": "tallest person wearing bright suit"
137
+ },
138
+ "4404": {
139
+ "91907": "a elephant whose trunk pointing to the floor , may be touching",
140
+ "91908": "elephant more on the right side of the picture"
141
+ },
142
+ "4536": {
143
+ "94448": "a person reaching for the microwave looking at the camera",
144
+ "94449": "person in black t shirt"
145
+ },
146
+ "2787": {
147
+ "58740": "a giraffe snacking on the tree",
148
+ "58741": "a giraffe on the right"
149
+ },
150
+ "3377": {
151
+ "70765": "a zebra resting its head on another zebra ' s back",
152
+ "70766": "a zebra on the left"
153
+ },
154
+ "3889": {
155
+ "81051": "a man holding a basket of pastries",
156
+ "81052": "a man wearing grey hoodie"
157
+ },
158
+ "2194": {
159
+ "46507": "standing dog",
160
+ "46508": "a black and white dog with a blue collar tag"
161
+ },
162
+ "508": {
163
+ "11146": "person being held by another person",
164
+ "11147": "person dressed in a red suit and blue cap"
165
+ },
166
+ "2312": {
167
+ "48847": "a bird standing on a table",
168
+ "48848": "a bird on the left"
169
+ },
170
+ "3948": {
171
+ "82190": "the woman who is squinting in one eye",
172
+ "82191": "a blue eyed brown haired woman not wearing glasses"
173
+ },
174
+ "1388": {
175
+ "29353": "person holding another person while watching giraffe drink water",
176
+ "29354": "person in brown shirt with bag"
177
+ },
178
+ "2690": {
179
+ "56849": "a man about to kick a ball",
180
+ "56850": "a man in all white with number 23 on his chest"
181
+ },
182
+ "1109": {
183
+ "24043": "man holding the ktie",
184
+ "24044": "man on the right"
185
+ },
186
+ "1374": {
187
+ "29120": "person arranging pansts of another person",
188
+ "29121": "the person with in the black tuxedo and glasses in his head"
189
+ },
190
+ "3475": {
191
+ "72951": "woman holding the horse",
192
+ "72952": "a woman wearing spectacles with violet shirt and flourecent colour waist vest"
193
+ },
194
+ "1333": {
195
+ "28225": "a person holding another person",
196
+ "28226": "a person in a pink and orange flannel shirt"
197
+ },
198
+ "2068": {
199
+ "43909": "person standing and playing wii",
200
+ "43910": "person wearing black t - shirt"
201
+ },
202
+ "2824": {
203
+ "59394": "person standing besides a table crossing arms",
204
+ "59395": "person with glasses and long hair"
205
+ },
206
+ "2294": {
207
+ "48483": "a person sitting on bike holding another person",
208
+ "48484": "a person with a helmet on the head"
209
+ },
210
+ "2446": {
211
+ "51355": "an elephant that has it ' s trunk pointing towards the water",
212
+ "51356": "elephant on the left"
213
+ },
214
+ "2686": {
215
+ "56783": "a man staring at another man",
216
+ "56784": "a man in an orange tie"
217
+ },
218
+ "4558": {
219
+ "94950": "a zebra facing the camera",
220
+ "94951": "a small zebra beside a larger zebra"
221
+ },
222
+ "1499": {
223
+ "32051": "a man resting on a metal fence",
224
+ "32052": "a man in white shirt and polka dot tie"
225
+ },
226
+ "4303": {
227
+ "89833": "a man throwing a banana",
228
+ "89834": "a man in bike gear on the right of the picture"
229
+ },
230
+ "1376": {
231
+ "29146": "a man sitting down with his hands together",
232
+ "29147": "a man with a purple shirt and khaki pants "
233
+ },
234
+ "3544": {
235
+ "74100": "the man holding a riding crop",
236
+ "74101": "man in black shirt and slacks on the left"
237
+ },
238
+ "1858": {
239
+ "39103": "a bull standing",
240
+ "39104": "a white and brown bull on the left of the picture"
241
+ },
242
+ "434": {
243
+ "9561": "the man looking down",
244
+ "9562": "the man on the left"
245
+ },
246
+ "3024": {
247
+ "63345": "a baseball player sliding into a base",
248
+ "63346": "baseball player wearing the number 12"
249
+ },
250
+ "513": {
251
+ "11239": "a man riding on a skateboard as his picture is being taken",
252
+ "11240": "a man in a purple t - shirt and ripped jeans"
253
+ },
254
+ "693": {
255
+ "14989": "a person standing",
256
+ "14990": "a small person"
257
+ },
258
+ "2523": {
259
+ "53103": "a baseball player sliding into home plate and getting tagged by the catcher",
260
+ "53104": "a la dodgers player on the right of the picture"
261
+ },
262
+ "4987": {
263
+ "104145": "a girl punching out her arm while playing an interactive video game",
264
+ "104146": "girl wearing grey and white stripes and sweatpants"
265
+ },
266
+ "4041": {
267
+ "84159": "soccer player about to kick soccer ball",
268
+ "84160": "soccer player wearing black t - shirt and black gloves"
269
+ },
270
+ "2105": {
271
+ "44674": "a baseball player holding his arm up to catch a ball",
272
+ "44675": "a baseball player wearing helmet and vest"
273
+ },
274
+ "135": {
275
+ "2353": "dog resting it ' s head on a table",
276
+ "2354": "golden dog"
277
+ },
278
+ "3613": {
279
+ "75580": "person talking to another person while crossing legs",
280
+ "75581": "person with long sleeve shirt, jeans and cap"
281
+ },
282
+ "1722": {
283
+ "36451": "person pulling another person's tie",
284
+ "36452": "blonde person in black dress"
285
+ },
286
+ "1607": {
287
+ "34281": "a person reading a book to another person he ' s holding",
288
+ "34282": "a bald person wearing a beige t - shirt and gray jeans"
289
+ },
290
+ "2761": {
291
+ "58225": "girl propping her chin on her hand",
292
+ "58226": "girl in a pink shirt near window"
293
+ },
294
+ "2454": {
295
+ "51492": "a man looking at laptop",
296
+ "51493": "the man with glasses and painted fingernails"
297
+ },
298
+ "1603": {
299
+ "34234": "person eating a donut",
300
+ "34235": "person with the black beanie"
301
+ },
302
+ "4794": {
303
+ "99868": "a duck that is looking straight ahead",
304
+ "99869": "the duck on the right side"
305
+ },
306
+ "2485": {
307
+ "52246": "a person reaching across the net",
308
+ "52247": "tallest person in a grey shirt and shorts"
309
+ },
310
+ "3280": {
311
+ "68799": "a boy walking towards his skate board",
312
+ "68800": "a boy in a striped shirt"
313
+ },
314
+ "3336": {
315
+ "69882": "person holding a piece of chocolate cake",
316
+ "69883": "person wearing a purple dress"
317
+ },
318
+ "3118": {
319
+ "65349": "giraffe stretching its neck straight up",
320
+ "65350": "taller giraffe"
321
+ },
322
+ "4494": {
323
+ "93729": "man touching the frisbee",
324
+ "93730": "a man in a white shirt"
325
+ },
326
+ "3004": {
327
+ "62940": "person crouching to catch a ball",
328
+ "62941": "person in a red uniform and helmet"
329
+ },
330
+ "127": {
331
+ "2256": "a person holding a plate",
332
+ "2257": "the person in the purple coat"
333
+ },
334
+ "3389": {
335
+ "70905": "person waving",
336
+ "70906": "person in black sneakers"
337
+ },
338
+ "2568": {
339
+ "54256": "person looking at phone",
340
+ "54257": "blonde person on the right"
341
+ },
342
+ "2283": {
343
+ "48251": "the cook holding a plate",
344
+ "48252": "middle cook of three cooks"
345
+ },
346
+ "1530": {
347
+ "32639": "person petting the cat",
348
+ "32640": "person with sleeves rolled up"
349
+ },
350
+ "4251": {
351
+ "88833": "a person reading a book",
352
+ "88834": "person in a striped jacket "
353
+ },
354
+ "2540": {
355
+ "53539": "a man reaching out his right arm holding a controller",
356
+ "53540": "a man in red shirt and black jeans"
357
+ },
358
+ "2870": {
359
+ "60169": "a person watching horse riding",
360
+ "60170": "a person in a white jacket and beige pants"
361
+ },
362
+ "4946": {
363
+ "103092": "a man about to hit a ball",
364
+ "103093": "a man in red shirt and blue vest"
365
+ },
366
+ "113": {
367
+ "1973": "person holding phone",
368
+ "1974": "person with a black shirt and brown coat"
369
+ },
370
+ "711": {
371
+ "15398": "girl crouching and holding an umbrella",
372
+ "15399": "girl wearing light green socks on the left"
373
+ },
374
+ "3209": {
375
+ "67236": "the person that is sliding into home , getting tagged out by the catcher",
376
+ "67237": "the person in the white vest over the blue shirt"
377
+ },
378
+ "3620": {
379
+ "75711": "person petting a horse",
380
+ "75712": "a person in white t - shirt"
381
+ },
382
+ "4382": {
383
+ "91559": "horse being hugged by a person",
384
+ "91560": "white and brown horse"
385
+ },
386
+ "2861": {
387
+ "60004": "a man playing tennis",
388
+ "60005": "a man wearing a blue shirt and white shorts"
389
+ },
390
+ "3954": {
391
+ "82306": "a person putting gloves on",
392
+ "82307": "person with dark blue jumper"
393
+ },
394
+ "1984": {
395
+ "42076": "a person being held by another person",
396
+ "42077": "little person on pink skiis with yellow parka on"
397
+ },
398
+ "2069": {
399
+ "43945": "a person helping another person ski",
400
+ "43946": "a big person in white jumper and backpack"
401
+ },
402
+ "2016": {
403
+ "42686": "person putting food in the oven",
404
+ "42687": "person in green t - shirt"
405
+ },
406
+ "1153": {
407
+ "25076": "a giraffe , with head lowered , crosses in front of another giraffe",
408
+ "25077": "giraffe in the middle"
409
+ },
410
+ "3614": {
411
+ "75583": "a man in explaining something on a tablet",
412
+ "75584": "a man with a blue cap and striped shirt"
413
+ },
414
+ "198": {
415
+ "3830": "a giraffe bending down to eat grass",
416
+ "3831": "giraffe in front"
417
+ },
418
+ "3012": {
419
+ "63097": "person standing with hands on hips",
420
+ "63098": "person in a white collared shirt and jeans"
421
+ },
422
+ "4247": {
423
+ "88808": "man pointing toward another man",
424
+ "88809": "man in plaid shirt"
425
+ },
426
+ "2205": {
427
+ "46674": "person bending over",
428
+ "46675": "person in red shirt and cap"
429
+ },
430
+ "4831": {
431
+ "100694": "person holding bat in hands",
432
+ "100695": "person wearing light blue shirt and glass"
433
+ },
434
+ "4534": {
435
+ "94419": "the bird not drinking",
436
+ "94420": "the bird on the left"
437
+ },
438
+ "638": {
439
+ "13717": "person sitting on another person's lap and holding the remote controller",
440
+ "13718": "small person in red shirt"
441
+ },
442
+ "1419": {
443
+ "30082": "person squatting on the ground to catch a ball",
444
+ "30083": "person in red and white wearing glove"
445
+ },
446
+ "1992": {
447
+ "42197": "a person reaching for a cupcake",
448
+ "42198": "a person in a blue vest"
449
+ },
450
+ "542": {
451
+ "11877": "man receiving food",
452
+ "11878": "a black man in a black shirt"
453
+ },
454
+ "2223": {
455
+ "47051": "person sitting a chair holding a protest sign",
456
+ "47052": "old person in grey t - shirt and blue jeans"
457
+ },
458
+ "4865": {
459
+ "101219": "person being held by another person",
460
+ "101220": "a young person wearing a yellow shirt"
461
+ },
462
+ "751": {
463
+ "16247": "person holding a painting brush",
464
+ "16248": "person wearing white top and cap"
465
+ },
466
+ "3540": {
467
+ "74039": "a man swinging a bat",
468
+ "74040": "a man in a blue baseball shirt and white pants"
469
+ },
470
+ "3765": {
471
+ "78908": "person sitting",
472
+ "78909": "person wearing white shirt and red shoes"
473
+ },
474
+ "2879": {
475
+ "60471": "bear standing against the fence",
476
+ "60472": "a small bear on the right"
477
+ },
478
+ "4529": {
479
+ "94312": "kid holding out left arm playing wii",
480
+ "94313": "kid in a green and red sweatshirt"
481
+ },
482
+ "2131": {
483
+ "45308": "man putting both hands behind his head",
484
+ "45309": "a man with the pool noodle"
485
+ },
486
+ "1306": {
487
+ "27841": "a cow eating grass",
488
+ "27842": "the cow on the right"
489
+ },
490
+ "3508": {
491
+ "73469": "a person standing and playing a video game",
492
+ "73470": "a little person dressed in brown"
493
+ },
494
+ "4165": {
495
+ "87036": "a child holding feathers",
496
+ "87037": "a child wearing green t - shirt"
497
+ },
498
+ "4126": {
499
+ "86073": "a person standing and reading a book",
500
+ "86074": "a person in a suit"
501
+ },
502
+ "388": {
503
+ "8339": "a man holding up an umbrella in the rain for a man who is fixing a tire",
504
+ "8340": "a man wearing glasses in a red jacket"
505
+ }
506
+ }
make_refcoco/refcocog_umd/needrevision_refid_part4.json ADDED
@@ -0,0 +1,498 @@
1
+ {
2
+ "1679": {
3
+ "37582": "player holding a baseball glove",
4
+ "37583": "a blurred player"
5
+ },
6
+ "4048": {
7
+ "92810": "player hitting a ball with a baseball bat",
8
+ "92811": "player with number 18 on his back"
9
+ },
10
+ "2530": {
11
+ "57782": "man crouching ready to catch a ball",
12
+ "57783": "man with 55 on his back"
13
+ },
14
+ "4385": {
15
+ "101410": "man leaning on one leg watching the players",
16
+ "101411": "man in gray pants"
17
+ },
18
+ "5018": {
19
+ "102413": "man standing ready to swing his bat",
20
+ "102414": "man in front of the other two men"
21
+ },
22
+ "2290": {
23
+ "52302": "sheep standing in the pasture next to a sitting sheep",
24
+ "52303": "the front most sheep"
25
+ },
26
+ "2347": {
27
+ "53861": "a sheep sitting down in the grass",
28
+ "53862": "a sheep in the background"
29
+ },
30
+ "3143": {
31
+ "71854": "a horse being led by it ' s trainer",
32
+ "71855": "a horse in front of the picture"
33
+ },
34
+ "1688": {
35
+ "37818": "zebra eating grass",
36
+ "37819": "the zebra in the middle with its face near the ground"
37
+ },
38
+ "944": {
39
+ "21007": "a bird touching its neck with its right feet",
40
+ "21008": "a bird in the back"
41
+ },
42
+ "3477": {
43
+ "79163": "the bird standing and looking to the left",
44
+ "79164": "bird with both feet in the water"
45
+ },
46
+ "2497": {
47
+ "56845": "person holding a baseball bat",
48
+ "56846": "person in blue baseball cap"
49
+ },
50
+ "4110": {
51
+ "94298": "person sitting and watching children play a ballgame",
52
+ "94299": "person wearing a white shirt and black leggings"
53
+ },
54
+ "2011": {
55
+ "45909": "a woman talking on her cell phone",
56
+ "45910": "a blonde woman wearing a blue shirt and white shorts"
57
+ },
58
+ "2884": {
59
+ "65819": "a woman looking at her phone",
60
+ "65820": "a woman with black hair wearing jeans, a striped gray shirt and flip flops"
61
+ },
62
+ "1076": {
63
+ "24000": "person crossing a stream of water",
64
+ "24001": "person wearing jeans and a green vest"
65
+ },
66
+ "4803": {
67
+ "56121": "person helping the other cross a stream",
68
+ "56122": "person in white dress"
69
+ },
70
+ "3508": {
71
+ "80112": "baseball player placing his hands on his hips",
72
+ "80113": "a baseball player named datz"
73
+ },
74
+ "169": {
75
+ "4002": "person feeding a giraffe",
76
+ "4003": "a small person in light blue shirt"
77
+ },
78
+ "258": {
79
+ "5988": "person holding a child",
80
+ "5989": "person wearing glasses and navy shirt"
81
+ },
82
+ "3661": {
83
+ "83542": "person sitting on the floor",
84
+ "83543": "person in a grey shirt and dark pants"
85
+ },
86
+ "4831": {
87
+ "62137": "person sitting on couch and playing with a dog",
88
+ "62138": "bald person wearing jeans and brown hoodie"
89
+ },
90
+ "2214": {
91
+ "50208": "a woman eating a donut",
92
+ "50209": "a brown hair woman in gray sweater"
93
+ },
94
+ "2266": {
95
+ "51661": "a woman holding a purse",
96
+ "51662": "a woman with blonde hair and a black shirt"
97
+ },
98
+ "2477": {
99
+ "56429": "girl talking and looking at another girl",
100
+ "56430": "girl in black"
101
+ },
102
+ "5005": {
103
+ "99824": "girl eating and looking at her plate",
104
+ "99825": "girl wearing a pink shirt"
105
+ },
106
+ "2919": {
107
+ "66832": "person riding a bike",
108
+ "66833": "asian person wearing black jacket"
109
+ },
110
+ "1850": {
111
+ "42078": "man placing his hand on another man's shoulder",
112
+ "42079": "a man who is wearing a red color tie"
113
+ },
114
+ "3757": {
115
+ "85761": "boy holding a cell phone",
116
+ "85762": "boy in a blue hoodie"
117
+ },
118
+ "524": {
119
+ "12089": "a zebra that is not eating grass",
120
+ "12090": "a zebra on the far right"
121
+ },
122
+ "4363": {
123
+ "100914": "elephant holding up its trunk",
124
+ "100915": "an elephant in front of another"
125
+ },
126
+ "2976": {
127
+ "68306": "girl eating food from her right hand",
128
+ "68307": "a girl in a black flowered top"
129
+ },
130
+ "838": {
131
+ "18887": "man leaning on bike on boat",
132
+ "18888": "a man not wearing a hat"
133
+ },
134
+ "3044": {
135
+ "69755": "man rowing boat",
136
+ "69756": "a man on the left side of the picture"
137
+ },
138
+ "2426": {
139
+ "55424": "the baseball player facing towards the right not doing a high five",
140
+ "55425": "baseball player in catcher ' s uniform"
141
+ },
142
+ "2113": {
143
+ "47984": "person that is dancing",
144
+ "47985": "person with the thick beard, glasses and a hat"
145
+ },
146
+ "2327": {
147
+ "53376": "person bathing another person",
148
+ "53377": "person in a floral print dress and hat"
149
+ },
150
+ "4727": {
151
+ "39103": "a bull laying down",
152
+ "39104": "a white and brown bull on the right"
153
+ },
154
+ "859": {
155
+ "19350": "cat sitting on a luggage and staring at the camera",
156
+ "19351": "cat infront of another cat"
157
+ },
158
+ "935": {
159
+ "20809": "cat laying down on a bag",
160
+ "20810": "cat behind another cat"
161
+ },
162
+ "1105": {
163
+ "24654": "an elephant stepping on a large log",
164
+ "24655": "elephant on far right"
165
+ },
166
+ "395": {
167
+ "8819": "person placing her hands on one's hips",
168
+ "8820": "person on the far left"
169
+ },
170
+ "771": {
171
+ "17614": "person holding a child on one's shoulders",
172
+ "17615": "tall person on the right"
173
+ },
174
+ "2942": {
175
+ "67334": "person sitting on another person's shoulders",
176
+ "67335": "small person on the right"
177
+ },
178
+ "41": {
179
+ "961": "a lady pouring wine in a glass",
180
+ "962": "a lady in black tank top"
181
+ },
182
+ "885": {
183
+ "19926": "person feeding another person with a bottle",
184
+ "19927": "person in black blouse"
185
+ },
186
+ "4862": {
187
+ "69276": "person drinking from a bottle",
188
+ "69277": "small person in white pajamas"
189
+ },
190
+ "1246": {
191
+ "27831": "person holding a laptop",
192
+ "27832": "person with curly brown hair wearing jeans"
193
+ },
194
+ "3346": {
195
+ "76051": "person filing her nails",
196
+ "76052": "person wearing a red robe and has a towel on her head"
197
+ },
198
+ "3657": {
199
+ "83493": "person holding a bottle and listening to music",
200
+ "83494": "person wearing black in headphones"
201
+ },
202
+ "540": {
203
+ "12381": "the woman is swinging the controller",
204
+ "12382": "woman in brown top on the right"
205
+ },
206
+ "3364": {
207
+ "76757": "the woman looking at the camera and opening her mouth",
208
+ "76758": "a woman wearing a brown hooded sweatshirt on the left"
209
+ },
210
+ "1880": {
211
+ "42973": "man looking ahead at the tv",
212
+ "42974": "a man in a white shirt"
213
+ },
214
+ "1949": {
215
+ "44400": "a man looking at his phone",
216
+ "44401": "man in black t - shirt and cap"
217
+ },
218
+ "1620": {
219
+ "36248": "person playing tennis",
220
+ "36249": "person in red tank top and black shorts"
221
+ },
222
+ "2902": {
223
+ "66297": "person sitting and watching a tennis game",
224
+ "66298": "person in blue top"
225
+ },
226
+ "397": {
227
+ "8843": "giraffe bending its head down",
228
+ "8844": "giraffe on the far right"
229
+ },
230
+ "732": {
231
+ "16725": "baseball player squatting and watching closely to judge a play",
232
+ "16726": "baseball player in black top and gray pants"
233
+ },
234
+ "1173": {
235
+ "26074": "a man swinging a bat",
236
+ "26075": "a man in blue and grey"
237
+ },
238
+ "2920": {
239
+ "66854": "a man reaching out his left arm to catch a ball",
240
+ "66855": "a man in red uniform and helmet"
241
+ },
242
+ "1643": {
243
+ "36762": "a man smiling looking down at other people",
244
+ "36763": "a man in a grey suite wearing a pink tie"
245
+ },
246
+ "1454": {
247
+ "32177": "person in putting hands in one's pockets",
248
+ "32178": "person in gray shirt and jeans"
249
+ },
250
+ "1725": {
251
+ "38835": "person crossing her arms walking with another person",
252
+ "38836": "person in a black shirt and jeans"
253
+ },
254
+ "2338": {
255
+ "53733": "the person crouching and placing his hands on his knees",
256
+ "53734": "person with a black shirt and dark grey pants"
257
+ },
258
+ "4249": {
259
+ "97957": "a baseball player reaching out his arm to catch a ball",
260
+ "97958": "a baseball player in green top"
261
+ },
262
+ "3917": {
263
+ "89675": "cow looking at camera",
264
+ "89676": "a cow with an ear tag with the number 949 on it"
265
+ },
266
+ "1156": {
267
+ "25761": "man sitting on the couch using a laptop",
268
+ "25762": "a man with a hat"
269
+ },
270
+ "1998": {
271
+ "45619": "a person watching his phone",
272
+ "45620": "person wearing glasses"
273
+ },
274
+ "3571": {
275
+ "81719": "person looking at one's phone",
276
+ "81720": "mature person with blonde hair and glasses"
277
+ },
278
+ "292": {
279
+ "6707": "a zebra lying down in dirt",
280
+ "6708": "the zebra in the foreground"
281
+ },
282
+ "3367": {
283
+ "76808": "a zebra standing in the zoo",
284
+ "76809": "a zebra in the background"
285
+ },
286
+ "2069": {
287
+ "47212": "person leaning forward on skis",
288
+ "47213": "person in blue hat and jacket, black pants"
289
+ },
290
+ "4050": {
291
+ "92834": "person standing straight looking at another person",
292
+ "92835": "a small person wearing purple pants"
293
+ },
294
+ "2953": {
295
+ "67711": "person who is looking away",
296
+ "67712": "person in a suit"
297
+ },
298
+ "4280": {
299
+ "98813": "person pulling another person's tie",
300
+ "98814": "a person in a white shirt"
301
+ },
302
+ "1743": {
303
+ "39371": "a person holding and looking at another person",
304
+ "39372": "person with bald head and glasses"
305
+ },
306
+ "4598": {
307
+ "13717": "person playing with the remote controller",
308
+ "13718": "small person in red shirt"
309
+ },
310
+ "3380": {
311
+ "77052": "a person cutting a cake",
312
+ "77053": "a person in gray shirt that is not striped"
313
+ },
314
+ "3439": {
315
+ "78305": "a person holding a spatula getting readyy to have a cake",
316
+ "78306": "a person in striped shirt"
317
+ },
318
+ "3355": {
319
+ "76309": "a man swining his bat",
320
+ "76310": "a man in a baseball uniform with a brace on his left ankle"
321
+ },
322
+ "3409": {
323
+ "77608": "a man holding out his arm to catch a ball",
324
+ "77609": "a man wearing a red vest with red shin guards"
325
+ },
326
+ "711": {
327
+ "16184": "the man holding a cat in his arms",
328
+ "16185": "this is a man with thin rimmed glasses and a black scarf"
329
+ },
330
+ "3764": {
331
+ "85913": "person holding a remote and smilling",
332
+ "85914": "person in a black t - shirt and not wearing glasses"
333
+ },
334
+ "113": {
335
+ "2741": "a sheep being fed by a little girl",
336
+ "2742": "a sheep on the right"
337
+ },
338
+ "518": {
339
+ "12021": "a sheep eating grass with its head down",
340
+ "12022": "a sheep on the left"
341
+ },
342
+ "3158": {
343
+ "72128": "a boy crouching and placing both hands on his knees",
344
+ "72129": "boy wearing white baseball helmet , white baseball uniform with orange writing"
345
+ },
346
+ "3223": {
347
+ "73555": "a boy pitching the ball to a player",
348
+ "73556": "a boy with the number 4 on his blue jersey"
349
+ },
350
+ "914": {
351
+ "20478": "a person standing on a surf board , riding a wave",
352
+ "20479": "a person on the right"
353
+ },
354
+ "3568": {
355
+ "81669": "surfer laying down",
356
+ "81670": "surfer on the left"
357
+ },
358
+ "592": {
359
+ "13643": "person sits on the floor watching tv",
360
+ "13644": "person with a black hat and a beige shirt"
361
+ },
362
+ "2856": {
363
+ "65208": "person sitting on a chair watching another person play video games",
364
+ "65209": "person in black shirt and jeans"
365
+ },
366
+ "4879": {
367
+ "73469": "person playing a video game",
368
+ "73470": "blonde person dressed in brown"
369
+ },
370
+ "157": {
371
+ "3682": "a woman holding a plate and reaching for condiments",
372
+ "3683": "woman wearing grey button up sweater"
373
+ },
374
+ "1774": {
375
+ "40317": "person being held by another person",
376
+ "40318": "person with red hair, wearing a pink shirt"
377
+ },
378
+ "2354": {
379
+ "53948": "person with child , catching a frisby",
380
+ "53949": "bigger person in white t - shirt"
381
+ },
382
+ "174": {
383
+ "4179": "a lamb eating grass",
384
+ "4180": "a lamb to the left of another lamb"
385
+ },
386
+ "2369": {
387
+ "54196": "the sheep that is looking into the camera",
388
+ "54197": "a white sheep with a black head on the right"
389
+ },
390
+ "4247": {
391
+ "97897": "a woman holding an umbrella on a bench",
392
+ "97898": "woman on the right"
393
+ },
394
+ "1014": {
395
+ "22621": "man receiving an award",
396
+ "22622": "a man in an orange and white uniform with a black cap"
397
+ },
398
+ "1080": {
399
+ "24100": "a man offers a trophy to anothe man",
400
+ "24101": "a man in a suit"
401
+ },
402
+ "2272": {
403
+ "51815": "the baseball player catching a ball",
404
+ "51816": "the baseball player in dark top and helmet"
405
+ },
406
+ "2495": {
407
+ "56804": "a baseball player swinging at a ball",
408
+ "56805": "the baseball player in white uniform"
409
+ },
410
+ "3511": {
411
+ "80309": "person holding a cup",
412
+ "80310": "person wearing pink shirt"
413
+ },
414
+ "3955": {
415
+ "90542": "person holding a remote control",
416
+ "90543": "person in orange shirt"
417
+ },
418
+ "2409": {
419
+ "55054": "a man adjusting his head band",
420
+ "55055": "man in orange and gray shirt"
421
+ },
422
+ "2775": {
423
+ "63273": "a person holding a remote control",
424
+ "63274": "a tall person in white striped shirt and black pants"
425
+ },
426
+ "996": {
427
+ "22281": "a woman holding a baby",
428
+ "22282": "woman wearing a black shirt and green apron"
429
+ },
430
+ "4789": {
431
+ "52629": "a person holding skies in one's hands",
432
+ "52630": "a person with orange mirrored goggles"
433
+ },
434
+ "1028": {
435
+ "22786": "the cow standing up",
436
+ "22787": "a cow in the middle"
437
+ },
438
+ "244": {
439
+ "5666": "a man holding wine glass",
440
+ "5668": "a blonde man in a white shirt"
441
+ },
442
+ "3538": {
443
+ "80923": "the man throwing the ball from the picther ' s mound",
444
+ "80924": "the man in front"
445
+ },
446
+ "557": {
447
+ "12739": "a baseball player getting ready to swing the bat",
448
+ "12740": "a baseball player , wearing a white and blue uniform"
449
+ },
450
+ "4982": {
451
+ "95870": "cat sitting in front of television on a stand",
452
+ "95871": "orange cat on the right side of the picture"
453
+ },
454
+ "4570": {
455
+ "6638": "a woman cutting a cake",
456
+ "6639": "a woman wearing a long sleeve pink sweater"
457
+ },
458
+ "1698": {
459
+ "38093": "a baseball player swinging his bat",
460
+ "38094": "a baseball player weaing a white uniform and blue helmet"
461
+ },
462
+ "3182": {
463
+ "72616": "the baseball player playing the catcher position",
464
+ "72617": "the baseball player wearing a red and white uniform"
465
+ },
466
+ "846": {
467
+ "19100": "a man holding a toothbrush in his mouth",
468
+ "19101": "a man wearing striped shirt"
469
+ },
470
+ "671": {
471
+ "15227": "person petting a horse",
472
+ "15228": "person wearing a red jacket"
473
+ },
474
+ "3254": {
475
+ "74216": "person sitting in the chair",
476
+ "74217": "person in the tan shirt wearing glasses"
477
+ },
478
+ "3318": {
479
+ "75539": "the person who is smashing cake in his own face",
480
+ "75540": "person with a fake tie on its onesie"
481
+ },
482
+ "1424": {
483
+ "31548": "person watching another person eat",
484
+ "31549": "person in the green shirt"
485
+ },
486
+ "3926": {
487
+ "89831": "person eating a sandwich",
488
+ "89832": "person in orange top with sunglasses in one's head"
489
+ },
490
+ "862": {
491
+ "19444": "a man driving a bicycle and pulling a cart behind",
492
+ "19445": "the man is wearing a pair of khaki shorts"
493
+ },
494
+ "2932": {
495
+ "67140": "man standing on bike",
496
+ "67141": "man in blue jean shorts"
497
+ }
498
+ }
mbench/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (160 Bytes)
 
mbench/__pycache__/ytvos_ref.cpython-310.pyc ADDED
Binary file (7.81 kB)
 
mbench/check_image_numbered_cy.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/check_image_numbered_cy_score.py ADDED
@@ -0,0 +1,212 @@
+ import sys
+ import os
+ import argparse
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+ import opts
+
+ import numpy as np
+ import cv2
+ from PIL import Image
+ import json
+
+ from mbench.ytvos_ref import build as build_ytvos_ref
+ import t2v_metrics
+
+ import matplotlib.pyplot as plt
+ import textwrap
+
+
+ def scoreCaption(idx, all_captions, all_valid_obj_ids, clip_flant5_score, color_mask=False):
+     vid_meta = metas[idx]
+     vid_id = vid_meta['video']
+     frames = vid_meta['frames']
+
+     first_cat = list(all_captions[vid_id].keys())[0]
+     sampled_frames = list(all_captions[vid_id][first_cat].keys())
+     imgs = []
+     masks = []
+     for frame_indx in sampled_frames:
+         frame_name = frames[int(frame_indx)]
+         img_path = os.path.join(str(train_dataset.img_folder), 'JPEGImages', vid_id, frame_name + '.jpg')
+         mask_path = os.path.join(str(train_dataset.img_folder), 'Annotations', vid_id, frame_name + '.png')
+         img = Image.open(img_path).convert('RGB')
+         imgs.append(img)
+         mask = Image.open(mask_path).convert('P')
+         mask = np.array(mask)
+         masks.append(mask)
+
+     vid_captions = all_captions[vid_id]
+     cat_names = set(list(vid_captions.keys()))
+
+     vid_result = {}
+
+     for cat in cat_names:
+         cat_captions = vid_captions[cat]
+         cat_result = {}
+
+         for i in range(len(imgs)):
+             frame_name = sampled_frames[i]
+             frame = np.copy(np.array(imgs[i]))
+             frame_for_contour = np.copy(np.array(imgs[i]))
+
+             mask = masks[i]
+
+             all_obj_ids = np.unique(mask).astype(int)
+             all_obj_ids = [str(obj_id) for obj_id in all_obj_ids if obj_id != 0]
+
+             if cat in all_valid_obj_ids[vid_id]:
+                 valid_obj_ids = all_valid_obj_ids[vid_id][cat]
+             else:
+                 valid_obj_ids = []
+
+             for j in range(len(all_obj_ids)):
+                 obj_id = all_obj_ids[j]
+                 obj_mask = (mask == int(obj_id)).astype(np.uint8)
+
+                 if obj_id in valid_obj_ids:
+                     if color_mask == False:
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 3)
+                         for contour in contours:  # unused enumerate index dropped; it shadowed the frame loop's i
+                             # compute the contour center
+                             moments = cv2.moments(contour)
+                             if moments["m00"] != 0:  # check whether the center is computable
+                                 cx = int(moments["m10"] / moments["m00"])
+                                 cy = int(moments["m01"] / moments["m00"])
+                             else:
+                                 cx, cy = contour[0][0]  # fall back to the first contour point
+
+                             # black background box behind the object-id label
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+                             text_size = cv2.getTextSize(text, font, 1, 2)[0]
+                             text_w, text_h = text_size
+
+                             # draw the label background (black)
+                             cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
+                                           (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)
+
+                             # draw the label text (white)
+                             cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
+                                         font, 1, (255, 255, 255), 2)
+                     else:
+                         alpha = 0.08
+                         colored_obj_mask = np.zeros_like(frame)
+                         colored_obj_mask[obj_mask == 1] = colors[j]
+                         frame[obj_mask == 1] = (
+                             (1 - alpha) * frame[obj_mask == 1]
+                             + alpha * colored_obj_mask[obj_mask == 1]
+                         )
+
+                         contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+                         cv2.drawContours(frame, contours, -1, colors[j], 2)
+                         cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)
+
+                         if len(contours) > 0:
+                             largest_contour = max(contours, key=cv2.contourArea)
+                             M = cv2.moments(largest_contour)
+                             if M["m00"] != 0:
+                                 center_x = int(M["m10"] / M["m00"])
+                                 center_y = int(M["m01"] / M["m00"])
+                             else:
+                                 center_x, center_y = 0, 0
+
+                             font = cv2.FONT_HERSHEY_SIMPLEX
+                             text = obj_id
+
+                             font_scale = 0.9
+                             text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
+                             text_x = center_x - text_size[0] // 1  # horizontal anchor of the label
+                             text_y = center_y
+                             # text_y = center_y + text_size[1] // 2  # vertical center of the label
+
+                             # coordinates of the label background rectangle
+                             rect_start = (text_x - 5, text_y - text_size[1] - 5)  # top-left corner of the background box
+                             # rect_end = (text_x + text_size[0] + 5, text_y + 5)
+                             rect_end = (text_x + text_size[0] + 5, text_y)
+
+                             cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
+                             cv2.putText(frame, text, (text_x, text_y), font, 1, (255, 255, 255), 2)
+
+             # fig, ax = plt.subplots()
+             # ax.imshow(frame)
+             # ax.axis('off')
+
+             frame_caption = cat_captions[frame_name]
+             if frame_caption:
+                 # wrapped_text = "\n".join(textwrap.wrap(frame_caption, width=60))
+                 # ax.text(0.5, -0.3, wrapped_text, ha='center', va='center', fontsize=12, transform=ax.transAxes)
+
+                 # calculate the VQA score for this frame/caption pair
+                 frame = Image.fromarray(frame)
+                 score = clip_flant5_score(images=[frame], texts=[frame_caption])
+             else:
+                 score = None
+
+             # plt.title(f"vid_id: {vid_id}, cat: {cat}, frame: {frame_name}, score: {score}")
+             # plt.tight_layout()
+             # plt.show()
+
+             cat_result[frame_name] = {
+                 "caption": frame_caption,
+                 "score": score
+             }
+
+         vid_result[cat] = cat_result
+
+     return vid_id, vid_result
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
+     args = parser.parse_args()
+
+     # ================== load data ===================
+     # full dataset
+     train_dataset = build_ytvos_ref(image_set='train', args=args)
+
+     # metadata for the full dataset
+     metas = train_dataset.metas
+
+     # caption data
+     with open('mbench/numbered_captions_gpt-4o_final.json', 'r') as file:
+         all_captions = json.load(file)
+
+     # valid object id data
+     with open('mbench/numbered_valid_obj_ids_gpt-4o_final.json', 'r') as file:
+         all_valid_obj_ids = json.load(file)
+
+     # 8 candidate colors (RGB)
+     colors = [
+         (255, 0, 0),    # Red
+         (0, 255, 0),    # Green
+         (0, 0, 255),    # Blue
+         (255, 255, 0),  # Yellow
+         (255, 0, 255),  # Magenta
+         (0, 255, 255),  # Cyan
+         (128, 0, 128),  # Purple
+         (255, 165, 0)   # Orange
+     ]
+
+     # ================== load the VQA score model ===================
+     clip_flant5_score = t2v_metrics.VQAScore(model='clip-flant5-xxl')
+
+     # ================== compute VQA scores ===================
+     all_scores = {}
+     for i in range(5):
+         vid_id, vid_result = scoreCaption(i, all_captions, all_valid_obj_ids, clip_flant5_score, False)
+         all_scores[vid_id] = vid_result
+
+     with open('mbench/numbered_captions_gpt-4o_final_scores.json', 'w', encoding='utf-8') as json_file:
+         json.dump(all_scores, json_file, indent=4, ensure_ascii=False)  # bug fix: json.dump requires the file object
+
+     print("JSON file saved successfully!")
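The script above writes a scores file that mirrors the captions file, keyed video id → category → frame → {"caption", "score"}. A minimal sketch of aggregating it per video and category; it assumes the scores were serialized as plain floats (if VQAScore returns tensors, they would need converting, e.g. with float(), before json.dump succeeds):

    import json

    with open('mbench/numbered_captions_gpt-4o_final_scores.json', 'r') as f:
        all_scores = json.load(f)

    for vid_id, cats in all_scores.items():
        for cat, frames in cats.items():
            vals = [r['score'] for r in frames.values() if r['score'] is not None]
            if vals:
                print(vid_id, cat, sum(vals) / len(vals))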
mbench/gpt_ref-ytvos-cy.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/gpt_ref-ytvos-revised.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/gpt_ref-ytvos_numbered.ipynb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5fd89176d8bf426500d18caf6b5983b0765f147d17a6bb59f41c4edcaf3c3158
+ size 16214561
mbench/gpt_ref-ytvos_numbered_cy.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_captions.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_captions_gpt-4o.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_captions_gpt-4o_nomask_randcap2.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_valid_obj_ids_gpt-4o_final.json ADDED
The diff for this file is too large to render. See raw diff
 
mbench/numbered_valid_obj_ids_gpt-4o_nomask_randcap2.json ADDED
@@ -0,0 +1,2153 @@
+ {
+     "003234408d": {
+         "penguin": [
+             "1",
+             "2",
+             "3",
+             "4",
+             "5"
+         ]
+     },
+     "0043f083b5": {
+         "bus": [
+             "1"
+         ],
+         "sedan": [
+             "2",
+             "3"
+         ]
+     },
+     "0044fa5fba": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "005a527edd": {
+         "ape": [
+             "1",
+             "2"
+         ]
+     },
+     "0065b171f9": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "00917dcfc4": {
+         "zebra": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "00a23ccf53": {
+         "shark": [
+             "1"
+         ]
+     },
+     "00ad5016a4": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "01082ae388": {
+         "leopard": [
+             "1"
+         ]
+     },
+     "011ac0a06f": {
+         "ape": [
+             "1",
+             "2",
+             "3",
+             "4",
+             "5"
+         ]
+     },
+     "013099c098": {
+         "giant_panda": [
+             "1",
+             "2"
+         ]
+     },
+     "0155498c85": {
+         "person": [
+             "1"
+         ],
+         "motorbike": [
+             "2"
+         ]
+     },
+     "01694ad9c8": {
+         "bird": [
+             "1"
+         ]
+     },
+     "017ac35701": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "01b80e8e1a": {
+         "zebra": [
+             "1",
+             "2"
+         ]
+     },
+     "01baa5a4e1": {},
+     "01c3111683": {
+         "whale": [
+             "1"
+         ]
+     },
+     "01c4cb5ffe": {
+         "person": [
+             "1",
+             "3"
+         ]
+     },
+     "01c76f0a82": {
+         "sedan": [
+             "1",
+             "4"
+         ]
+     },
+     "01c783268c": {
+         "person": [
+             "2"
+         ],
+         "ape": [
+             "1"
+         ]
+     },
+     "01e64dd36a": {
+         "cow": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "01ed275c6e": {
+         "giraffe": [
+             "1",
+             "2"
+         ]
+     },
+     "01ff60d1fa": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "020cd28cd2": {
+         "person": [
+             "1"
+         ]
+     },
+     "02264db755": {
+         "fox": [
+             "1"
+         ]
+     },
+     "0248626d9a": {
+         "train": [
+             "1"
+         ]
+     },
+     "02668dbffa": {
+         "frog": [
+             "1"
+         ]
+     },
+     "0274193026": {
+         "person": [
+             "2"
+         ]
+     },
+     "02d28375aa": {
+         "fox": [
+             "1"
+         ]
+     },
+     "031ccc99b1": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0321b18c10": {
+         "elephant": [
+             "3"
+         ],
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "0348a45bca": {
+         "fish": [
+             "1",
+             "2",
+             "3",
+             "4",
+             "5"
+         ]
+     },
+     "0355e92655": {
+         "boat": [
+             "3"
+         ],
+         "person": [
+             "2"
+         ]
+     },
+     "0358b938c1": {
+         "elephant": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "0368107cf1": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "0379ddf557": {
+         "person": [
+             "1"
+         ]
+     },
+     "038b2cc71d": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "038c15a5dd": {
+         "hedgehog": [
+             "1"
+         ]
+     },
+     "03a06cc98a": {
+         "giraffe": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "03a63e187f": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "03c95b4dae": {
+         "elephant": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "03e2b57b0e": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "04194e1248": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "04259896e2": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "0444918a5f": {
+         "truck": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "04460a7a52": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "04474174a4": {
+         "ape": [
+             "1",
+             "2"
+         ]
+     },
+     "0450095513": {
+         "snail": [
+             "1"
+         ]
+     },
+     "045f00aed2": {
+         "tiger": [
+             "1"
+         ],
+         "person": [
+             "3"
+         ]
+     },
+     "04667fabaa": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "04735c5030": {
+         "cat": [
+             "1",
+             "2"
+         ]
+     },
+     "04990d1915": {
+         "sedan": [
+             "1"
+         ],
+         "truck": [
+             "3"
+         ],
+         "bus": [
+             "2"
+         ]
+     },
+     "04d62d9d98": {
+         "person": [
+             "1"
+         ]
+     },
+     "04f21da964": {
+         "monkey": [
+             "1"
+         ]
+     },
+     "04fbad476e": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "04fe256562": {
+         "truck": [
+             "2"
+         ],
+         "motorbike": [
+             "1"
+         ]
+     },
+     "0503bf89c9": {
+         "hedgehog": [
+             "1"
+         ]
+     },
+     "0536c9eed0": {
+         "cat": [
+             "1"
+         ]
+     },
+     "054acb238f": {
+         "owl": [
+             "1"
+         ]
+     },
+     "05579ca250": {
+         "sedan": [
+             "3"
+         ],
+         "person": [
+             "1"
+         ]
+     },
+     "056c200404": {},
+     "05774f3a2c": {
+         "ape": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "058a7592c8": {
+         "train": [
+             "1"
+         ]
+     },
+     "05a0a513df": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "05a569d8aa": {
+         "cat": [
+             "1"
+         ],
+         "mouse": [
+             "2"
+         ]
+     },
+     "05aa652648": {
+         "ape": [
+             "1"
+         ]
+     },
+     "05d7715782": {},
+     "05e0b0f28f": {
+         "person": [
+             "2"
+         ],
+         "mouse": [
+             "1"
+         ]
+     },
+     "05fdbbdd7a": {},
+     "05ffcfed85": {
+         "monkey": [
+             "1",
+             "2"
+         ]
+     },
+     "0630391881": {
+         "person": [
+             "1"
+         ]
+     },
+     "06840b2bbe": {
+         "snake": [
+             "1"
+         ]
+     },
+     "068f7dce6f": {
+         "shark": [
+             "1"
+         ]
+     },
+     "0693719753": {
+         "turtle": [
+             "1",
+             "2"
+         ]
+     },
+     "06ce2b51fb": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "06e224798e": {
+         "tiger": [
+             "1"
+         ]
+     },
+     "06ee361788": {
+         "duck": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "06fbb3fa2c": {
+         "eagle": [
+             "1"
+         ]
+     },
+     "0700264286": {
+         "cow": [
+             "1",
+             "2"
+         ]
+     },
+     "070c918ca7": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "07129e14a4": {
+         "person": [
+             "3"
+         ],
+         "parrot": [
+             "1",
+             "2"
+         ]
+     },
+     "07177017e9": {
+         "motorbike": [
+             "1",
+             "2"
+         ]
+     },
+     "07238ffc58": {
+         "monkey": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "07353b2a89": {
+         "sheep": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "0738493cbf": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "075926c651": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "075c701292": {
+         "duck": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "0762ea9a30": {
+         "person": [
+             "1"
+         ]
+     },
+     "07652ee4af": {
+         "person": [
+             "1"
+         ]
+     },
+     "076f206928": {
+         "person": [
+             "3"
+         ],
+         "zebra": [
+             "1",
+             "2"
+         ]
+     },
+     "077d32af19": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ],
+         "train": [
+             "4"
+         ]
+     },
+     "079049275c": {
+         "mouse": [
+             "1"
+         ]
+     },
+     "07913cdda7": {
+         "person": [
+             "2",
+             "3"
+         ],
+         "train": [
+             "1"
+         ]
+     },
+     "07a11a35e8": {
+         "ape": [
+             "1",
+             "2"
+         ]
+     },
+     "07ac33b6df": {
+         "ape": [
+             "1"
+         ]
+     },
+     "07c62c3d11": {
+         "parrot": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "07cc1c7d74": {
+         "snake": [
+             "1"
+         ]
+     },
+     "080196ef01": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "081207976e": {},
+     "081ae4fa44": {
+         "shark": [
+             "1",
+             "2"
+         ]
+     },
+     "081d8250cb": {
+         "sedan": [
+             "3"
+         ],
+         "person": [
+             "1"
+         ]
+     },
+     "082900c5d4": {
+         "duck": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0860df21e2": {},
+     "0866d4c5e3": {
+         "bird": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0891ac2eb6": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "08931bc458": {
+         "person": [
+             "1"
+         ]
+     },
+     "08aa2705d5": {
+         "snake": [
+             "1"
+         ]
+     },
+     "08c8450db7": {},
+     "08d50b926c": {
+         "turtle": [
+             "1",
+             "2"
+         ]
+     },
+     "08e1e4de15": {
+         "monkey": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "08e48c1a48": {
+         "cow": [
+             "1"
+         ]
+     },
+     "08f561c65e": {
+         "person": [
+             "2"
+         ],
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "08feb87790": {
+         "sheep": [
+             "1"
+         ]
+     },
+     "09049f6fe3": {
+         "mouse": [
+             "1",
+             "2"
+         ]
+     },
+     "092e4ff450": {
+         "snake": [
+             "1"
+         ]
+     },
+     "09338adea8": {
+         "whale": [
+             "1",
+             "2"
+         ]
+     },
+     "093c335ccc": {
+         "person": [
+             "2"
+         ]
+     },
+     "0970d28339": {
+         "ape": [
+             "1",
+             "2"
+         ]
+     },
+     "0974a213dc": {
+         "giraffe": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "097b471ed8": {
+         "cat": [
+             "1",
+             "2"
+         ]
+     },
+     "0990941758": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "09a348f4fa": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "09a6841288": {
+         "duck": [
+             "1",
+             "2"
+         ]
+     },
+     "09c5bad17b": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "09c9ce80c7": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "09ff54fef4": {
+         "fox": [
+             "1",
+             "2"
+         ]
+     },
+     "0a23765d15": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "0a275e7f12": {
+         "elephant": [
+             "1"
+         ]
+     },
+     "0a2f2bd294": {
+         "motorbike": [
+             "1"
+         ]
+     },
+     "0a7a2514aa": {
+         "lizard": [
+             "2"
+         ],
+         "cat": [
+             "1"
+         ]
+     },
+     "0a7b27fde9": {
+         "parrot": [
+             "1",
+             "2"
+         ]
+     },
+     "0a8c467cc3": {
+         "fish": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0ac8c560ae": {
+         "person": [
+             "2",
+             "3"
+         ]
+     },
+     "0b1627e896": {
+         "boat": [
+             "1"
+         ]
+     },
+     "0b285c47f6": {
+         "mouse": [
+             "1"
+         ]
+     },
+     "0b34ec1d55": {
+         "ape": [
+             "1"
+         ]
+     },
+     "0b5b5e8e5a": {
+         "sedan": [
+             "2"
+         ],
+         "person": [
+             "1"
+         ]
+     },
+     "0b68535614": {
+         "rabbit": [
+             "1"
+         ]
+     },
+     "0b6f9105fc": {
+         "rabbit": [
+             "1"
+         ]
+     },
+     "0b7dbfa3cb": {
+         "cow": [
+             "1"
+         ]
+     },
+     "0b9cea51ca": {
+         "whale": [
+             "1"
+         ]
+     },
+     "0b9d012be8": {
+         "camel": [
+             "1"
+         ]
+     },
+     "0bcfc4177d": {
+         "truck": [
+             "1"
+         ]
+     },
+     "0bd37b23c1": {
+         "motorbike": [
+             "1"
+         ]
+     },
+     "0bd864064c": {
+         "eagle": [
+             "1"
+         ]
+     },
+     "0c11c6bf7b": {
+         "deer": [
+             "1"
+         ]
+     },
+     "0c26bc77ac": {
+         "crocodile": [
+             "1"
+         ]
+     },
+     "0c3a04798c": {
+         "duck": [
+             "1"
+         ],
+         "fish": [
+             "2"
+         ]
+     },
+     "0c44a9d545": {
+         "tiger": [
+             "1"
+         ]
+     },
+     "0c817cc390": {
+         "dog": [
+             "2"
+         ],
+         "hedgehog": [
+             "1"
+         ]
+     },
+     "0ca839ee9a": {
+         "ape": [
+             "1",
+             "2"
+         ]
+     },
+     "0cd7ac0ac0": {
+         "rabbit": [
+             "1"
+         ]
+     },
+     "0ce06e0121": {
+         "parrot": [
+             "1",
+             "2"
+         ]
+     },
+     "0cfe974a89": {
+         "turtle": [
+             "1",
+             "2"
+         ]
+     },
+     "0d2fcc0dcd": {
+         "zebra": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "0d3aad05d2": {
+         "person": [
+             "1"
+         ]
+     },
+     "0d40b015f4": {
+         "person": [
+             "1"
+         ]
+     },
+     "0d97fba242": {
+         "person": [
+             "2"
+         ],
+         "dog": [
+             "1"
+         ]
+     },
+     "0d9cc80d7e": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0dab85b6d3": {
+         "lizard": [
+             "1",
+             "2"
+         ]
+     },
+     "0db5c427a5": {
+         "train": [
+             "1"
+         ]
+     },
+     "0dbaf284f1": {
+         "cat": [
+             "1",
+             "2"
+         ]
+     },
+     "0de4923598": {},
+     "0df28a9101": {
+         "turtle": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0e04f636c4": {
+         "frog": [
+             "1"
+         ]
+     },
+     "0e05f0e232": {
+         "lizard": [
+             "1",
+             "2"
+         ]
+     },
+     "0e0930474b": {
+         "sedan": [
+             "1"
+         ],
+         "person": [
+             "2",
+             "3"
+         ]
+     },
+     "0e27472bea": {
+         "turtle": [
+             "1"
+         ]
+     },
+     "0e30020549": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "0e621feb6c": {
+         "lizard": [
+             "1",
+             "2"
+         ]
+     },
+     "0e803c7d73": {},
+     "0e9ebe4e3c": {
+         "truck": [
+             "1"
+         ]
+     },
+     "0e9f2785ec": {
+         "person": [
+             "2"
+         ]
+     },
+     "0ea68d418b": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "0eb403a222": {},
+     "0ee92053d6": {
+         "person": [
+             "1"
+         ]
+     },
+     "0eefca067f": {
+         "giant_panda": [
+             "1",
+             "2"
+         ]
+     },
+     "0f17fa6fcb": {
+         "duck": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0f1ac8e9a3": {
+         "frog": [
+             "1"
+         ]
+     },
+     "0f202e9852": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "0f2ab8b1ff": {
+         "dolphin": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0f51a78756": {
+         "sheep": [
+             "1"
+         ]
+     },
+     "0f5fbe16b0": {
+         "raccoon": [
+             "1",
+             "2"
+         ]
+     },
+     "0f6072077b": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "0f6b69b2f4": {
+         "rabbit": [
+             "1"
+         ]
+     },
+     "0f6c2163de": {
+         "snail": [
+             "1"
+         ]
+     },
+     "0f74ec5599": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "0f9683715b": {
+         "elephant": [
+             "1"
+         ]
+     },
+     "0fa7b59356": {
+         "duck": [
+             "1"
+         ]
+     },
+     "0fb173695b": {
+         "person": [
+             "3"
+         ]
+     },
+     "0fc958cde2": {
+         "owl": [
+             "1"
+         ]
+     },
+     "0fe7b1a621": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "0ffcdb491c": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "101caff7d4": {
+         "giant_panda": [
+             "1",
+             "2"
+         ]
+     },
+     "1022fe8417": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "1032e80b37": {
+         "giraffe": [
+             "1"
+         ]
+     },
+     "103f501680": {
+         "fish": [
+             "1"
+         ]
+     },
+     "104e64565f": {
+         "elephant": [
+             "1"
+         ]
+     },
+     "104f1ab997": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "106242403f": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "10b31f5431": {
+         "person": [
+             "1",
+             "3",
+             "4"
+         ]
+     },
+     "10eced835e": {
+         "giant_panda": [
+             "1",
+             "2"
+         ]
+     },
+     "110d26fa3a": {
+         "shark": [
+             "1"
+         ]
+     },
+     "1122c1d16a": {
+         "person": [
+             "6"
+         ],
+         "parrot": [
+             "1",
+             "2",
+             "3",
+             "4",
+             "5"
+         ]
+     },
+     "1145b49a5f": {
+         "rabbit": [
+             "1"
+         ]
+     },
+     "11485838c2": {
+         "giraffe": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "114e7676ec": {
+         "person": [
+             "1"
+         ]
+     },
+     "1157472b95": {
+         "parrot": [
+             "1",
+             "2"
+         ]
+     },
+     "115ee1072c": {
+         "cow": [
+             "1"
+         ]
+     },
+     "1171141012": {
+         "person": [
+             "2"
+         ],
+         "turtle": [
+             "1"
+         ]
+     },
+     "117757b4b8": {
+         "snail": [
+             "1"
+         ]
+     },
+     "1178932d2f": {
+         "person": [
+             "1",
+             "2"
+         ],
+         "motorbike": [
+             "3"
+         ]
+     },
+     "117cc76bda": {
+         "whale": [
+             "1"
+         ]
+     },
+     "1180cbf814": {
+         "fish": [
+             "1",
+             "2"
+         ]
+     },
+     "1187bbd0e3": {
+         "cat": [
+             "1"
+         ]
+     },
+     "1197e44b26": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "119cf20728": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "119dd54871": {
+         "lion": [
+             "1",
+             "2"
+         ]
+     },
+     "11a0c3b724": {
+         "mouse": [
+             "1",
+             "2"
+         ]
+     },
+     "11a6ba8c94": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "11c722a456": {
+         "turtle": [
+             "1",
+             "2"
+         ]
+     },
+     "11cbcb0b4d": {
+         "zebra": [
+             "1"
+         ]
+     },
+     "11ccf5e99d": {
+         "person": [
+             "2"
+         ]
+     },
+     "11ce6f452e": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "11feabe596": {
+         "rabbit": [
+             "1"
+         ]
+     },
+     "120cb9514d": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "12156b25b3": {
+         "person": [
+             "1"
+         ]
+     },
+     "122896672d": {
+         "person": [
+             "1",
+             "3"
+         ]
+     },
+     "1233ac8596": {
+         "dog": [
+             "1"
+         ]
+     },
+     "1239c87234": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "1250423f7c": {
+         "elephant": [
+             "3",
+             "4"
+         ],
+         "person": [
+             "2"
+         ]
+     },
+     "1257a1bc67": {
+         "snake": [
+             "1"
+         ]
+     },
+     "125d1b19dd": {
+         "giant_panda": [
+             "1",
+             "2"
+         ]
+     },
+     "126d203967": {
+         "person": [
+             "2"
+         ]
+     },
+     "1295e19071": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "12ad198c54": {
+         "person": [
+             "1"
+         ]
+     },
+     "12bddb2bcb": {
+         "person": [
+             "2"
+         ]
+     },
+     "12ec9b93ee": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "12eebedc35": {
+         "bird": [
+             "1"
+         ]
+     },
+     "132852e094": {
+         "fox": [
+             "1"
+         ]
+     },
+     "1329409f2a": {
+         "fish": [
+             "1"
+         ]
+     },
+     "13325cfa14": {
+         "person": [
+             "2"
+         ]
+     },
+     "1336440745": {
+         "mouse": [
+             "1",
+             "2"
+         ]
+     },
+     "134d06dbf9": {
+         "cat": [
+             "1"
+         ]
+     },
+     "135625b53d": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "13870016f9": {
+         "person": [
+             "1"
+         ],
+         "cow": [
+             "2",
+             "3"
+         ]
+     },
+     "13960b3c84": {
+         "giraffe": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "13adaad9d9": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "13ae097e20": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "13e3070469": {
+         "zebra": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "13f6a8c20d": {
+         "fish": [
+             "1"
+         ]
+     },
+     "1416925cf2": {
+         "truck": [
+             "1",
+             "2"
+         ]
+     },
+     "142d2621f5": {
+         "person": [
+             "1",
+             "2"
+         ],
+         "motorbike": [
+             "3"
+         ]
+     },
+     "145d5d7c03": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "145fdc3ac5": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "1471274fa7": {
+         "person": [
+             "1"
+         ]
+     },
+     "14a6b5a139": {
+         "fish": [
+             "1"
+         ]
+     },
+     "14c21cea0d": {
+         "monkey": [
+             "1",
+             "2"
+         ]
+     },
+     "14dae0dc93": {
+         "person": [
+             "2"
+         ]
+     },
+     "14f9bd22b5": {
+         "tiger": [
+             "1"
+         ]
+     },
+     "14fd28ae99": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "15097d5d4e": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "150ea711f2": {
+         "whale": [
+             "1"
+         ]
+     },
+     "1514e3563f": {
+         "earless_seal": [
+             "1",
+             "2"
+         ]
+     },
+     "152aaa3a9e": {
+         "raccoon": [
+             "1"
+         ]
+     },
+     "152b7d3bd7": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "15617297cc": {
+         "person": [
+             "1"
+         ]
+     },
+     "15abbe0c52": {
+         "person": [
+             "1"
+         ]
+     },
+     "15d1fb3de5": {
+         "owl": [
+             "1"
+         ],
+         "cat": [
+             "2"
+         ]
+     },
+     "15f67b0fab": {
+         "person": [
+             "1"
+         ]
+     },
+     "161eb59aad": {
+         "giraffe": [
+             "1"
+         ],
+         "cow": [
+             "2",
+             "3"
+         ]
+     },
+     "16288ea47f": {
+         "duck": [
+             "1",
+             "2"
+         ]
+     },
+     "164410ce62": {
+         "person": [
+             "1"
+         ]
+     },
+     "165c3c8cd4": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "165c42b41b": {
+         "person": [
+             "1",
+             "4"
+         ],
+         "motorbike": [
+             "2",
+             "3"
+         ]
+     },
+     "165ec9e22b": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "1669502269": {
+         "person": [
+             "1"
+         ]
+     },
+     "16763cccbb": {
+         "ape": [
+             "1"
+         ]
+     },
+     "16adde065e": {
+         "person": [
+             "3"
+         ],
+         "cat": [
+             "2"
+         ]
+     },
+     "16af445362": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "16afd538ad": {
+         "parrot": [
+             "1",
+             "2"
+         ]
+     },
+     "16c3fa4d5d": {
+         "sedan": [
+             "1"
+         ]
+     },
+     "16d1d65c27": {
+         "monkey": [
+             "1"
+         ]
+     },
+     "16e8599e94": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "16fe9fb444": {
+         "person": [
+             "2"
+         ],
+         "motorbike": [
+             "1"
+         ]
+     },
+     "1705796b02": {
+         "train": [
+             "1"
+         ]
+     },
+     "1724db7671": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "17418e81ea": {
+         "shark": [
+             "1"
+         ]
+     },
+     "175169edbb": {
+         "ape": [
+             "1",
+             "2"
+         ]
+     },
+     "17622326fd": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "17656bae77": {
+         "elephant": [
+             "1"
+         ]
+     },
+     "17b0d94172": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "17c220e4f6": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "17c7bcd146": {
+         "train": [
+             "1"
+         ]
+     },
+     "17cb4afe89": {
+         "tiger": [
+             "1"
+         ]
+     },
+     "17cd79a434": {
+         "squirrel": [
+             "1"
+         ]
+     },
+     "17d18604c3": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "17d8ca1a37": {
+         "owl": [
+             "1"
+         ],
+         "person": [
+             "2"
+         ]
+     },
+     "17e33f4330": {
+         "monkey": [
+             "1"
+         ]
+     },
+     "17f7a6d805": {
+         "snail": [
+             "1"
+         ]
+     },
+     "180abc8378": {
+         "owl": [
+             "1"
+         ],
+         "person": [
+             "2"
+         ]
+     },
+     "183ba3d652": {
+         "motorbike": [
+             "3"
+         ],
+         "person": [
+             "2"
+         ]
+     },
+     "185bf64702": {
+         "zebra": [
+             "1",
+             "2"
+         ]
+     },
+     "18913cc690": {
+         "train": [
+             "1"
+         ]
+     },
+     "1892651815": {
+         "camel": [
+             "1"
+         ]
+     },
+     "189ac8208a": {
+         "giraffe": [
+             "1",
+             "2"
+         ]
+     },
+     "189b44e92c": {
+         "zebra": [
+             "1"
+         ]
+     },
+     "18ac264b76": {
+         "person": [
+             "2"
+         ]
+     },
+     "18b245ab49": {
+         "penguin": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "18b5cebc34": {
+         "mouse": [
+             "1"
+         ]
+     },
+     "18bad52083": {
+         "parrot": [
+             "1",
+             "2"
+         ]
+     },
+     "18bb5144d5": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "18c6f205c5": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "1903f9ea15": {
+         "bird": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "1917b209f2": {
+         "person": [
+             "1"
+         ],
+         "cow": [
+             "3",
+             "4"
+         ],
+         "horse": [
+             "2"
+         ]
+     },
+     "191e74c01d": {
+         "deer": [
+             "1"
+         ]
+     },
+     "19367bb94e": {
+         "fish": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "193ffaa217": {
+         "person": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "19696b67d3": {
+         "cow": [
+             "1"
+         ]
+     },
+     "197f3ab6f3": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "1981e763cc": {
+         "sheep": [
+             "1",
+             "2"
+         ]
+     },
+     "198afe39ae": {
+         "person": [
+             "1"
+         ]
+     },
+     "19a6e62b9b": {
+         "monkey": [
+             "1",
+             "2"
+         ]
+     },
+     "19b60d5335": {
+         "hedgehog": [
+             "1"
+         ]
+     },
+     "19c00c11f9": {
+         "person": [
+             "1"
+         ]
+     },
+     "19e061eb88": {
+         "boat": [
+             "1",
+             "2"
+         ]
+     },
+     "19e8bc6178": {
+         "dog": [
+             "1"
+         ]
+     },
+     "19ee80dac6": {
+         "person": [
+             "1",
+             "3",
+             "4"
+         ]
+     },
+     "1a25a9170a": {
+         "cow": [
+             "1"
+         ],
+         "person": [
+             "2",
+             "3"
+         ]
+     },
+     "1a359a6c1a": {
+         "sheep": [
+             "1"
+         ]
+     },
+     "1a3e87c566": {
+         "frog": [
+             "1"
+         ]
+     },
+     "1a5fe06b00": {
+         "bus": [
+             "1"
+         ]
+     },
+     "1a6c0fbd1e": {
+         "person": [
+             "1"
+         ]
+     },
+     "1a6f3b5a4b": {
+         "sedan": [
+             "3"
+         ]
+     },
+     "1a8afbad92": {
+         "zebra": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "1a8bdc5842": {
+         "parrot": [
+             "1",
+             "2"
+         ]
+     },
+     "1a95752aca": {
+         "duck": [
+             "1",
+             "2"
+         ]
+     },
+     "1a9c131cb7": {
+         "ape": [
+             "1",
+             "2",
+             "3"
+         ]
+     },
+     "1aa3da3ee3": {
+         "sheep": [
+             "1",
+             "2",
+             "3",
+             "4"
+         ]
+     },
+     "1ab27ec7ea": {
+         "deer": [
+             "1"
+         ]
+     },
+     "1abf16d21d": {
+         "turtle": [
+             "1"
+         ]
+     },
+     "1acd0f993b": {
+         "dog": [
+             "1"
+         ],
+         "person": [
+             "3"
+         ]
+     },
+     "1ad202e499": {
+         "lizard": [
+             "1",
+             "2"
+         ]
+     },
+     "1af8d2395d": {
+         "person": [
+             "1",
+             "2"
+         ],
+         "airplane": [
+             "4"
+         ]
+     },
+     "1afd39a1fa": {
+         "motorbike": [
+             "2"
+         ]
+     },
+     "1b2d31306f": {
+         "lizard": [
+             "1"
+         ]
+     },
+     "1b3fa67f0e": {
+         "airplane": [
+             "1"
+         ]
+     },
+     "1b43fa74b4": {
+         "owl": [
+             "1",
+             "2"
+         ]
+     },
+     "1b73ea9fc2": {
+         "parrot": [
+             "1"
+         ]
+     },
+     "1b7e8bb255": {
+         "person": [
+             "2"
+         ]
+     },
+     "1b8680f8cd": {
+         "person": [
+             "2",
+             "3"
+         ]
+     },
+     "1b883843c0": {
+         "person": [
+             "1",
+             "2"
+         ]
+     },
+     "1b8898785b": {
+         "monkey": [
+             "1",
+             "2"
+         ]
+     },
+     "1b88ba1aa4": {
+         "giant_panda": [
+             "1"
+         ]
+     },
+     "1b96a498e5": {
+         "ape": [
+             "1"
+         ]
+     },
+     "1bbc4c274f": {
+         "fish": [
+             "2"
+         ]
+     },
+     "1bd87fe9ab": {
+         "train": [
+             "1"
+         ]
+     },
+     "1c4090c75b": {
+         "whale": [
+             "1"
+         ]
+     },
+     "1c41934f84": {
+         "elephant": [
+             "1",
+             "2"
+         ]
+     },
+     "1c72b04b56": {
+         "lion": [
+             "1"
+         ]
+     },
+     "1c87955a3a": {
+         "crocodile": [
+             "1"
+         ],
+         "turtle": [
+             "2"
+         ]
+     },
+     "1c9f9eb792": {
+         "person": [
+             "2"
+         ]
+     },
+     "1ca240fede": {
+         "train": [
+             "1"
+         ]
+     },
+     "1ca5673803": {
+         "person": [
+             "1",
+             "3"
+         ]
+     },
+     "1cada35274": {
+         "duck": [
+             "1"
+         ]
+     },
+     "1cb44b920d": {
+         "eagle": [
+             "1",
+             "2"
+         ]
+     },
+     "1cd10e62be": {
+         "leopard": [
+             "1"
+         ]
+     },
+     "1d3087d5e5": {
+         "fish": [
+             "1",
+             "2",
+             "3",
+             "4",
+             "5"
+         ]
+     },
+     "1d3685150a": {
+         "person": [
+             "1",
+             "3"
+         ]
+     },
+     "1d6ff083aa": {
+         "person": [
+             "1",
+             "2"
+         ]
+     }
+ }
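The file above maps video id → category → the mask object ids (as strings) judged referable; empty objects mark videos with no usable instances. A minimal sketch of filtering an annotation mask with it, mirroring how scoreCaption consumes the same structure; the mask path and the example video/category are illustrative only:

    import json
    import numpy as np
    from PIL import Image

    with open('mbench/numbered_valid_obj_ids_gpt-4o_nomask_randcap2.json', 'r') as f:
        valid = json.load(f)

    vid_id, cat = '003234408d', 'penguin'  # example entry from the file above
    mask = np.array(Image.open('path/to/Annotations/frame.png').convert('P'))  # placeholder path
    keep = [int(i) for i in valid.get(vid_id, {}).get(cat, [])]
    filtered = np.where(np.isin(mask, keep), mask, 0)  # zero out ids not judged valid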
mbench/sampled_frame.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ac6df555665b2f0cc411641ce023ac10565ea7e8a5c0586c4a9e775481bca62
+ size 17415938
mbench/sampled_frame2.json ADDED
The diff for this file is too large to render. See raw diff