import os
import sys

# Make the project root importable (needed for the mbench and opts modules below).
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import time
import random
import argparse
import base64
import json
import textwrap

from os import path as osp
from pathlib import Path
from io import BytesIO

import numpy as np
import pandas as pd
import regex as re
import requests

import cv2
import skimage
from skimage import measure
from shapely.geometry import Polygon, MultiPolygon
from PIL import Image, ImageDraw

import torch
from torchvision.transforms import functional as F

import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle

import ipywidgets as widgets
from IPython.display import display, clear_output

from openai import OpenAI, APIConnectionError, OpenAIError

from mbench.ytvos_ref import build as build_ytvos_ref
import opts

def number_objects_and_encode_old(idx, color_mask=False):
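    """Annotate every sampled frame of video `idx` with numbered object markers, per category.

    Earlier variant kept for reference: with ``color_mask=False`` it draws contours and a numeric
    ID box per object; with ``color_mask=True`` it overlays a translucent mask instead. Returns
    (encoded_frames, contoured_frames, vid_cat_cnts), where the first two map category name ->
    list of base64-encoded JPEG frames and the last maps category name -> per-frame object counts.
    (Summary inferred from the code below; relies on the module-level `metas`, `train_dataset`,
    and `colors` defined in the __main__ block.)
    """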
    encoded_frames = {}
    contoured_frames = {}
    vid_cat_cnts = {}

    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    frame_indx = vid_meta['sample_indx']
    cat_names = set(vid_meta['obj_id_cat'].values())
    imgs = vid_data[0]

    for cat in cat_names:
        cat_frames = []
        contour_frames = []
        frame_cat_cnts = {}

        for i in range(imgs.size(0)):
            frame_name = frame_indx[i]
            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())

            frame_data = vid_data[2][frame_name]
            obj_ids = list(frame_data.keys())

            cat_cnt = 0

            for j in range(len(obj_ids)):
                obj_id = obj_ids[j]
                obj_data = frame_data[obj_id]
                obj_bbox = obj_data['bbox']
                obj_valid = obj_data['valid']
                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
                obj_cat = obj_data['category_name']

                if obj_cat == cat and obj_valid:
                    cat_cnt += 1

                    if not color_mask:
                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 3)
                        # Iterate without enumerate so the outer frame index `i` is not shadowed.
                        for contour in contours:
                            moments = cv2.moments(contour)
                            if moments["m00"] != 0:
                                cx = int(moments["m10"] / moments["m00"])
                                cy = int(moments["m01"] / moments["m00"])
                            else:
                                cx, cy = contour[0][0]

                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id
                            text_size = cv2.getTextSize(text, font, 1, 2)[0]
                            text_w, text_h = text_size

                            cv2.rectangle(frame, (cx - text_w // 2 - 5, cy - text_h // 2 - 5),
                                          (cx + text_w // 2 + 5, cy + text_h // 2 + 5), (0, 0, 0), -1)

                            cv2.putText(frame, text, (cx - text_w // 2, cy + text_h // 2),
                                        font, 1, (255, 255, 255), 2)
                    else:
                        alpha = 0.08

                        colored_obj_mask = np.zeros_like(frame)
                        colored_obj_mask[obj_mask == 1] = colors[j]
                        frame[obj_mask == 1] = (
                            (1 - alpha) * frame[obj_mask == 1]
                            + alpha * colored_obj_mask[obj_mask == 1]
                        )

                        contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                        cv2.drawContours(frame, contours, -1, colors[j], 2)
                        cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)

                        if len(contours) > 0:
                            largest_contour = max(contours, key=cv2.contourArea)
                            M = cv2.moments(largest_contour)
                            if M["m00"] != 0:
                                center_x = int(M["m10"] / M["m00"])
                                center_y = int(M["m01"] / M["m00"])
                            else:
                                center_x, center_y = 0, 0

                            font = cv2.FONT_HERSHEY_SIMPLEX
                            text = obj_id

                            font_scale = 0.9
                            text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
                            text_x = center_x - text_size[0]
                            text_y = center_y

                            rect_start = (text_x - 5, text_y - text_size[1] - 5)
                            rect_end = (text_x + text_size[0] + 5, text_y)

                            cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
                            # Draw at the same scale used to measure the text box.
                            cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)

            buffer = BytesIO()
            frame = Image.fromarray(frame)
            frame.save(buffer, format='jpeg')
            buffer.seek(0)
            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
            frame_cat_cnts[frame_name] = cat_cnt

            buffer.seek(0)
            buffer.truncate()
            frame_for_contour = Image.fromarray(frame_for_contour)
            frame_for_contour.save(buffer, format='jpeg')
            buffer.seek(0)
            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))

        encoded_frames[cat] = cat_frames
        contoured_frames[cat] = contour_frames
        vid_cat_cnts[cat] = frame_cat_cnts

    return encoded_frames, contoured_frames, vid_cat_cnts

def number_objects_and_encode(idx, color_mask=False):
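    """Annotate every sampled frame of video `idx` with numbered object markers, per category.

    For each category present in the video, draws every valid object's contour plus a black ID box
    (and, if ``color_mask`` is True, a translucent colour overlay), then JPEG/base64-encodes both
    the annotated frame and a contour-only copy. Returns (encoded_frames, contoured_frames,
    vid_cat_cnts) keyed by category name. (Summary inferred from the code below; depends on the
    module-level `metas`, `train_dataset`, and `colors`.)
    """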
    encoded_frames = {}
    contoured_frames = {}
    vid_cat_cnts = {}

    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    frame_indx = vid_meta['sample_indx']
    cat_names = set(vid_meta['obj_id_cat'].values())
    imgs = vid_data[0]

    for cat in cat_names:
        cat_frames = []
        contour_frames = []
        frame_cat_cnts = {}

        for i in range(imgs.size(0)):
            frame_name = frame_indx[i]
            frame = np.copy(imgs[i].permute(1, 2, 0).numpy())
            frame_for_contour = np.copy(imgs[i].permute(1, 2, 0).numpy())

            frame_data = vid_data[2][frame_name]
            obj_ids = list(frame_data.keys())

            cat_cnt = 0

            for j in range(len(obj_ids)):
                obj_id = obj_ids[j]
                obj_data = frame_data[obj_id]
                obj_bbox = obj_data['bbox']
                obj_valid = obj_data['valid']
                obj_mask = obj_data['mask'].numpy().astype(np.uint8)
                obj_cat = obj_data['category_name']

                if obj_cat == cat and obj_valid:
                    cat_cnt += 1

                    contours, _ = cv2.findContours(obj_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                    cv2.drawContours(frame, contours, -1, colors[j], 3)
                    cv2.drawContours(frame_for_contour, contours, -1, colors[j], 2)

                    if len(contours) > 0:
                        largest_contour = max(contours, key=cv2.contourArea)
                        M = cv2.moments(largest_contour)
                        if M["m00"] != 0:
                            center_x = int(M["m10"] / M["m00"])
                            center_y = int(M["m01"] / M["m00"])
                        else:
                            center_x, center_y = 0, 0

                        font = cv2.FONT_HERSHEY_SIMPLEX
                        text = obj_id
                        font_scale = 1.2
                        text_size = cv2.getTextSize(text, font, font_scale, 2)[0]
                        text_x = center_x - text_size[0]
                        text_y = center_y

                        rect_start = (text_x - 5, text_y - text_size[1] - 5)
                        rect_end = (text_x + text_size[0] + 5, text_y + 3)

                        contour_thickness = 1
                        rect_start_contour = (rect_start[0] - contour_thickness, rect_start[1] - contour_thickness)
                        rect_end_contour = (rect_end[0] + contour_thickness, rect_end[1] + contour_thickness)

                        cv2.rectangle(frame, rect_start_contour, rect_end_contour, colors[j], contour_thickness)
                        cv2.rectangle(frame, rect_start, rect_end, (0, 0, 0), -1)
                        # Draw at the same scale used to measure the text box.
                        cv2.putText(frame, text, (text_x, text_y), font, font_scale, (255, 255, 255), 2)

                    if color_mask:
                        alpha = 0.08
                        colored_obj_mask = np.zeros_like(frame)
                        colored_obj_mask[obj_mask == 1] = colors[j]
                        frame[obj_mask == 1] = (
                            (1 - alpha) * frame[obj_mask == 1]
                            + alpha * colored_obj_mask[obj_mask == 1]
                        )

            buffer = BytesIO()
            frame = Image.fromarray(frame)
            frame.save(buffer, format='jpeg')
            buffer.seek(0)
            cat_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))
            frame_cat_cnts[frame_name] = cat_cnt

            buffer.seek(0)
            buffer.truncate()
            frame_for_contour = Image.fromarray(frame_for_contour)
            frame_for_contour.save(buffer, format='jpeg')
            buffer.seek(0)
            contour_frames.append(base64.b64encode(buffer.read()).decode("utf-8"))

        encoded_frames[cat] = cat_frames
        contoured_frames[cat] = contour_frames
        vid_cat_cnts[cat] = frame_cat_cnts

    return encoded_frames, contoured_frames, vid_cat_cnts

def getCaption(idx, model='gpt-4o'):
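    """Generate action-centric referring-expression captions for video `idx` via the OpenAI API.

    For each movable category in the video, every sampled frame containing at least two labelled
    objects is first screened with a YES/NONE prompt asking whether the objects show clearly
    distinguishable actions; frames that pass are captioned with one of the dense-captioning
    prompts below (retried up to MAX_RETRIES times). Returns (vid_id, all_captions, valid_obj_ids).
    (Summary inferred from the code below; requires OPENAI_API_KEY in the environment and the
    module-level `metas`, `train_dataset`, and `ytvos_category_valid_list`.)
    """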
    vid_meta = metas[idx]
    vid_data = train_dataset[idx]
    vid_id = vid_meta['video']
    print(f"vid id: {vid_id}\n")

    frame_indx = vid_meta['sample_indx']
    cat_names = set(vid_meta['obj_id_cat'].values())
    all_captions = dict()

    color_mask = random.choices([False, True], weights=[60, 40])[0]

    base64_frames, _, vid_cat_cnts = number_objects_and_encode(idx, color_mask)

    for cat_name in list(cat_names):

        is_movable = False
        if cat_name in ytvos_category_valid_list:
            is_movable = True

        if not is_movable:
            print(f"Skipping {cat_name}: determined to be non-movable.", end='\n\n')

        image_captions = {}
        captioner = OpenAI()
        cat_base64_frames = base64_frames[cat_name]

        for i in range(len(cat_base64_frames)):
            frame_name = frame_indx[i]

            base64_image = cat_base64_frames[i]
            should_filter = False
            frame_cat_cnts = vid_cat_cnts[cat_name][frame_name]

            if frame_cat_cnts >= 2:
                should_filter = True
            else:
                print(f"Skipping {cat_name}: only a single object or none in this frame.", end='\n\n')

            if is_movable and should_filter:

                print(f"-----------category name: {cat_name}, frame name: {frame_name}")
                caption_filter_text = f"""
                You are a visual assistant analyzing a single frame from a video.
                In this frame, I have labeled {frame_cat_cnts} {cat_name}(s), each with a bright numeric ID at its center and a visible marker.

                Are the {cat_name}s in the image performing clearly different and recognizable actions or postures?
                Consider differences in body pose (standing, sitting, holding hands up, grabbing an object, facing the camera, stretching, walking...), motion cues (inferred from the momentary stance or position),
                facial expressions, and any notable interactions with objects or other {cat_name}s or people.

                Only focus on obvious, prominent actions that can be reliably identified from this single frame.

                - Respond with "YES" if:
                1) Most of the {cat_name}s exhibit clearly different, unique actions or poses.
                (e.g. standing, sitting, bending, stretching, showing its back, or turning toward the camera.)
                2) You can see significant differences in action and posture that an observer can identify at a glance.
                3) Interaction Variability: Each {cat_name} is engaged in a different type of action, such as one grasping an object while another is observing.

                - Respond with "NONE" if:
                1) The actions or poses are not clearly differentiable or are too similar.
                2) Minimal or Ambiguous Motion: The frame does not provide clear evidence of distinct movement beyond subtle shifts in stance.
                3) Passive or Neutral Poses: Multiple {cat_name}(s) are simply standing or sitting without an obvious difference in orientation or motion.

                Answer strictly with either "YES" or "NONE".
                """
                response1 = captioner.chat.completions.create(
                    model=model,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": caption_filter_text,
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                }
                            ],
                        }
                    ],
                )
                response_content = response1.choices[0].message.content
                should_caption = "yes" in response_content.lower()
                print(f"are {cat_name}s distinguished by action: {response_content}", end='\n\n')

            else:
                should_caption = False

            dense_caption_prompt_1 = f"""
            In the given frame, I labeled {frame_cat_cnts} {cat_name}s by marking each with a bright numeric ID at the center and its boundary. The category name of these objects is: {cat_name}.

            Please describe the image focusing on labeled {cat_name}s in detail, focusing on their actions and interactions.

            1. Focus only on clear, unique, and prominent actions that distinguish each object.
            2. Avoid describing actions that are too minor, ambiguous, or not visible from the image.
            3. Avoid subjective terms such as 'skilled', 'controlled', or 'focused'. Only describe observable actions.
            4. Do not include common-sense or overly general descriptions like 'the elephant walks'.
            5. Use dynamic action verbs (e.g., holding, throwing, jumping, inspecting) to describe interactions, poses, or movements.
            6. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
                - expressions like 'seems to be', 'appears to be' are BANNED!
            7. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.
            8. Include interactions with objects or other entities when they are prominent and observable.
            9. **Do not include descriptions of appearance** such as clothes, color, size, shape etc.
            10. **Do not include relative position** between objects such as 'the left elephant' because left/right can be ambiguous.
            11. Do not mention object IDs.
            12. Use '{cat_name}' as the noun for the referring expressions.

            Note that I want to use your description to create a grounding dataset; therefore, your descriptions for different objects should be unique, i.e., if the image contains multiple {cat_name}s, describe the actions of each individually and ensure the descriptions are non-overlapping and specific.

            - Your answer should contain details, and follow the following format:
            object id. action-oriented description
            (e.g. 1. the person is holding bananas on two hands and opening his mouth, turning the head right.
            2. a person bending over and touching his boots to tie the shoelace.)
            - for action-oriented description, use {cat_name} as subject noun

            **Only include the currently labeled category** in each line (e.g., if it's a person, do not suddenly label it as another object/animal).
            Please pay attention to the categories of these objects and don't change them.
            Keep in mind that you should not group the objects, e.g., 2-5. people: xxx; be sure to describe each object separately (one by one).
            Output referring expressions for each object id. Please start your answer:"""

            dense_caption_prompt_2 = f"""
            You are an advanced visual language model analyzing a video frame.
            In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been distinctly labeled with bright numerical IDs at their center and boundary.

            Your task is to generate **action-oriented descriptions** for each labeled {cat_name}.
            Your descriptions should capture their **observable actions and interactions**, making sure to highlight movement, gestures, and dynamic behaviors.

            ---
            ## Key Guidelines:
            1. **Describe only clear and visible actions** that uniquely define what the {cat_name} is doing.
                - Example: "grabbing a branch and pulling it down" (**(O) Specific**)
                - Avoid: "moving slightly to the side" (**(X) Too vague**)

            2. **Do not describe appearance, color, or position** - focus purely on the action.
                - (X) "A large brown bear standing on the left"
                - (O) "The bear is lifting its front paws and swiping forward."

            3. **Use dynamic, action-specific verbs** rather than passive descriptions.
                - (O) "The giraffe is tilting its head and sniffing the ground."
                - (X) "The giraffe is near a tree and looking around."

            4. **Avoid assumptions, emotions, or speculative phrasing.**
                - (X) "The person seems excited" / "The person might be preparing to jump."
                - (O) "The person is pushing their front legs against the rock and leaping forward."

            5. **Avoid overly detailed or speculative descriptions** such as 'slightly moving its mouth' or 'appears to be anticipating'.
                - expressions like 'seems to be', 'appears to be' are BANNED!
            6. Pretend you are observing the scene directly, avoiding phrases like 'it seems' or 'based on the description'.

            7. If multiple {cat_name}s are present, make sure their descriptions are **distinct and non-overlapping**.
                - **Each object should have a unique, descriptive action.**
                - (X) "Two dogs are running."
                - (O) "1. One dog is chasing another, its legs stretched mid-air.
                        2. The other dog is looking back while speeding up."

            ---
            ## Output Format:
            - Each labeled **{cat_name}** should have exactly **one line of description**.
            - Format: `ID. {cat_name} + action-based description`
            - (O) Example:
                ```
                1. The person is leaning forward while opening a bag with both hands.
                2. The person is holding onto a rope and pulling themselves up.
                ```
            - **Ensure that each object is described individually.**
            - **Do not group objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).

            ---
            ## Additional Instructions:
            - **Do NOT** use expressions like "it appears that..." or "it seems like...".
            - **Do NOT** mention object IDs in the description (only use the provided format).
            - **DO NOT** include markdown formatting (no bullet points, no asterisks).
            - **Only describe actions of the labeled {cat_name} objects** - do not introduce unrelated categories.

            Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:
            """

            dense_caption_prompt = f"""
            You are a visual assistant analyzing a single frame of a video.
            In this frame, {frame_cat_cnts} objects belonging to the category **{cat_name}** have been labeled with bright numeric IDs at their center and boundary.

            I am building an **action-centric referring expression** dataset.
            Your task is to describe each labeled {cat_name} based on **clearly observable and specific actions**.

            ---
            ## Guidelines:
            1. **Focus only on visible and prominent actions** (e.g., running, pushing, grasping an object).
            2. **Avoid describing minor or ambiguous movements** (e.g., "slightly moving a paw," "tilting head a bit").
            3. **Do not include subjective or speculative descriptions** (e.g., "it seems excited" or "it might be preparing to jump").
            4. **Avoid vague expressions** like "engaging with something." Instead, specify the action (e.g., "grabbing a stick," "pressing a button").
            5. **Use dynamic action verbs** (e.g., holding, throwing, inspecting, leaning, pressing) to highlight motion and interaction.
            6. If multiple {cat_name}s appear, ensure each description is **distinct and non-overlapping**.
            7. Base your descriptions on these principles:
                - **Avoid words like 'minimal' or 'slightly'.**
                - Emphasize **body movement, posture, and motion patterns** (e.g., "lifting its head," "facing forward," "showing its back").
                - Describe **facial expressions and interactions with objects** (e.g., "opening its mouth wide," "smiling while holding an item").
                - **Specify actions with other objects or entities** only when they are clear and observable.
                    - (O) "pushing another person"
                    - (X) "interacting with another object"

            ---
            ## Output Format:
            - Each labeled **{cat_name}** must have **exactly one line**.
            - Format: `ID. {cat_name} + action-based description`
            - (O) Example:
                ```
                1. The person is holding ski poles and skiing down a snowy mountain with bent knees.
                2. The person is pulling a baby carriage while smiling.
                ```
            - **Ensure each object is described individually.**
            - **Do not group multiple objects into a single sentence** (e.g., "2-5. people: xxx" is NOT allowed).

            ---
            ## Example:
            If the frame has two labeled **bears**, your output should be:
            ```
            1. The bear is reaching out its right paw while leaning forward to catch prey.
            2. A bear is standing upright, facing right, and touching the bike beside it.
            ```

            ---
            ## Additional Instructions:
            - **Do NOT** describe appearance (e.g., color, size, texture) or relative positioning (e.g., "on the left/right").
            - **Do NOT** reference object IDs explicitly (e.g., "Person 1" or "Object 2" is NOT allowed).
            - **Do NOT** include markdown formatting (no bullet points, asterisks, or extra symbols).
            - **Only describe actions of the labeled {cat_name} objects** - do not introduce unrelated categories.

            Please generate the action-oriented descriptions for each labeled {cat_name} and start your answer:"""
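            # Second pass: request the dense captions, retrying a few times when the reply is a
            # refusal or does not start with the numbered list the prompt asks for.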
            MAX_RETRIES = 3
            retry_count = 0

            if should_caption:
                while retry_count < MAX_RETRIES:
                    selected_prompt = random.choice([dense_caption_prompt, dense_caption_prompt_2])

                    response2 = captioner.chat.completions.create(
                        model=model,
                        messages=[
                            {
                                "role": "user",
                                "content": [
                                    {
                                        "type": "text",
                                        "text": selected_prompt,
                                    },
                                    {
                                        "type": "image_url",
                                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                                    },
                                ],
                            }
                        ],
                    )

                    caption = response2.choices[0].message.content.strip()
                    caption_lower = caption.lower().lstrip()

                    if caption_lower.startswith("1.") and not any(
                        phrase in caption_lower for phrase in ["i'm sorry", "please", "can't help"]
                    ):
                        break

                    print(f"Retrying caption generation... ({retry_count + 1}/{MAX_RETRIES})")
                    retry_count += 1
                    time.sleep(2)

                if retry_count == MAX_RETRIES:
                    caption = None
                    print("Max retries reached. Caption generation failed.")

            else:
                caption = None

            image_captions[frame_name] = caption
        all_captions[cat_name] = image_captions

    valid_obj_ids = dict()

    for cat in cat_names:
        if cat in ytvos_category_valid_list:
            obj_id_cat = vid_meta['obj_id_cat']
            valid_cat_ids = []
            for obj_id in list(obj_id_cat.keys()):
                if obj_id_cat[obj_id] == cat:
                    valid_cat_ids.append(obj_id)
            valid_obj_ids[cat] = valid_cat_ids

    return vid_id, all_captions, valid_obj_ids

if __name__ == '__main__':
    parser = argparse.ArgumentParser('ReferFormer training and evaluation script', parents=[opts.get_args_parser()])
    parser.add_argument('--save_caption_path', type=str, default="mbench/numbered_captions_gpt-4o_randcap.json")
    parser.add_argument('--save_valid_obj_ids_path', type=str, default="mbench/numbered_valid_obj_ids_gpt-4o_randcap.json")
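    # Example invocation (a sketch; the remaining flags come from opts.get_args_parser(), assumed
    # here to be the ReferFormer argument parser that supplies the dataset paths):
    #   python <this_script>.py --save_caption_path mbench/numbered_captions_gpt-4o_randcap.json \
    #       --save_valid_obj_ids_path mbench/numbered_valid_obj_ids_gpt-4o_randcap.json
    # OPENAI_API_KEY must be set in the environment before running.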

    args = parser.parse_args()

    train_dataset = build_ytvos_ref(image_set='train', args=args)

    metas = train_dataset.metas
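    # Each entry of `metas` is expected to provide at least 'video', 'sample_indx', and
    # 'obj_id_cat' (an object-id -> category-name mapping), as consumed by getCaption above.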

    colors = [
        (255, 0, 0),
        (0, 255, 0),
        (0, 0, 255),
        (255, 255, 0),
        (255, 0, 255),
        (0, 255, 255),
        (128, 0, 128),
        (255, 165, 0)
    ]
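    # Colours are indexed by an object's position within a frame's annotation dict, so a frame
    # with more than eight labelled objects would run past this palette (an assumption about the
    # dataset; the code does not guard against it).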

    ytvos_category_valid_list = [
        'airplane', 'ape', 'bear', 'bird', 'boat', 'bus', 'camel', 'cat', 'cow', 'crocodile',
        'deer', 'dog', 'dolphin', 'duck', 'eagle', 'earless_seal', 'elephant', 'fish', 'fox', 'frog',
        'giant_panda', 'giraffe', 'hedgehog', 'horse', 'leopard', 'lion', 'lizard',
        'monkey', 'motorbike', 'mouse', 'owl', 'parrot', 'penguin', 'person',
        'rabbit', 'raccoon', 'sedan', 'shark', 'sheep', 'snail', 'snake',
        'squirrel', 'tiger', 'train', 'truck', 'turtle', 'whale', 'zebra'
    ]

    # Do not hard-code the OpenAI API key here; provide it via the environment
    # (e.g. `export OPENAI_API_KEY=...`) so the OpenAI() client can pick it up.
    assert 'OPENAI_API_KEY' in os.environ, "Please set the OPENAI_API_KEY environment variable."

    result_captions = {}
    result_valid_obj_ids = {}

    for i in range(len(metas)):
        try:
            vid_id, all_captions, valid_obj_ids = getCaption(i)

            if vid_id not in result_captions:
                result_captions[vid_id] = all_captions
            if vid_id not in result_valid_obj_ids:
                result_valid_obj_ids[vid_id] = valid_obj_ids

        except (requests.exceptions.ConnectionError, APIConnectionError) as e:
            print(f"created captions up to video {i-1}", flush=True)
            print("The request could not be processed due to an internet connection problem:", e, flush=True)

            with open(args.save_caption_path, "w") as file:
                json.dump(result_captions, file, indent=4)

            with open(args.save_valid_obj_ids_path, "w") as file:
                json.dump(result_valid_obj_ids, file, indent=4)

        except OpenAIError as e:
            print(f"created captions up to video {i-1}", flush=True)
            print("An OpenAI API error occurred:", e, flush=True)

            with open(args.save_caption_path, "w") as file:
                json.dump(result_captions, file, indent=4)

            with open(args.save_valid_obj_ids_path, "w") as file:
                json.dump(result_valid_obj_ids, file, indent=4)

        except Exception as e:
            print(f"created captions up to video {i-1}", flush=True)
            print("An unknown error occurred:", e, flush=True)

            with open(args.save_caption_path, "w") as file:
                json.dump(result_captions, file, indent=4)

            with open(args.save_valid_obj_ids_path, "w") as file:
                json.dump(result_valid_obj_ids, file, indent=4)

    print("Finished!", flush=True)

    with open(args.save_caption_path, "w") as file:
        json.dump(result_captions, file, indent=4)

    with open(args.save_valid_obj_ids_path, "w") as file:
        json.dump(result_valid_obj_ids, file, indent=4)