peekaboo-demo / data /get_bbox.py
Anshul Nasery
Demo commit
44f2ca8
#@title Get bounding boxes for the subject
from transformers import pipeline
from moviepy.editor import VideoFileClip
from PIL import Image
import os
import numpy as np
import matplotlib.pyplot as plt
import tqdm
import pickle
import torch
checkpoint = "google/owlvit-large-patch14"
detector = pipeline(model=checkpoint, task="zero-shot-object-detection", device='cuda:0')
def get_bounding_boxes(clip_path, subject):
# Read video from the path
clip = VideoFileClip(clip_path)
all_bboxes = []
bbox_present = []
num_bb = 0
for fidx,frame in enumerate(clip.iter_frames()):
frame = Image.fromarray(frame)
predictions = detector(
frame,
candidate_labels=[subject,],
)
try:
bbox = predictions[0]["box"]
bbox = (bbox["xmin"], bbox["ymin"], bbox["xmax"], bbox["ymax"])
# Get a zeros array of the same size as the frame
canvas = np.zeros(frame.size[::-1])
# Draw the bounding box on the canvas
canvas[bbox[1]:bbox[3], bbox[0]:bbox[2]] = 1
# Add the canvas to the list of bounding boxes
all_bboxes.append(canvas)
bbox_present.append(True)
num_bb += 1
except Exception as e:
# Append an empty canvas, we will interpolate later
all_bboxes.append(np.zeros(frame.size[::-1]))
bbox_present.append(False)
continue
return all_bboxes, num_bb
import pickle as pkl
dir_path = '/your/result/path'
video_filename = '2_of_40_2.mp4'
output_bbox = []
with open("/ssv2dataset/path.pkl", "rb") as f:
data = pkl.load(f)
dataset_size = len(data)
failed_cnt = 0
for i, d in tqdm.tqdm(enumerate(data)):
try:
# print(f"{d['subject']} || {d['caption']} || {d['video']}")
filename = d['video'].split('.')[0]
video_path = os.path.join(dir_path, filename, video_filename)
fg_object = d['subject']
masks, num_bb = get_bounding_boxes(video_path, fg_object)
output_bbox.append({
'caption': d['caption'],
'video': d['video'],
'subject': d['subject'],
'mask': masks,
'num_bb': num_bb
})
# print(num_bb)
except:
print(f"Missed #{i} with Caption: {d['caption']}")
failed_cnt += 1
with open(f"/output/path/iou_eval/ssv2_modelscope_{video_filename.split('.')[0]}_bbox-v2.pkl", "wb") as f:
pkl.dump(output_bbox, f)
print(f"Failed: {failed_cnt} out of {dataset_size}")