hma / datasets /extern /
LeroyWaa's picture
history blame
7.95 kB
# --------------------------------------------------------
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
from typing import Iterable
import numpy as np
from tqdm import tqdm
from collections import OrderedDict
import os
import numpy as np
from pathlib import Path
CURRENT_DIR = os.path.dirname(__file__)
import cv2
from os.path import expanduser
import json
import matplotlib.pyplot as plt
# Frame resolution constant; presumably (width, height) — TODO confirm.
# NOTE(review): not referenced by any code visible in this part of the file.
RESOLUTION = (480, 480)
# Home directory of the current user.
home = expanduser("~")
# Adjust these to wherever your detections and frames are stored.
ROOT = "/datasets01/ego4d_track2/"
# JSON annotation file; it is loaded to enumerate the videos and their annotated intervals.
LABEL_ROOT = ROOT + "v2_1/annotations/fho_main.json"
# Directory containing the videos; files are looked up as "<video_uid>.mp4".
VIDEO_PATH = ROOT + "v2_1/full_scale/"
# from epic_kitchens.hoa import load_detections
# labels = json.load(open("/datasets01/ego4d_track2/v2_1/annotations/fho_main.json"))
# videos = /datasets01/ego4d_track2/v2_1/clips
def parse_video_frame(video_path, frame_id):
    """Decode a single frame of a video with OpenCV.

    Args:
        video_path: Path of the video file to open.
        frame_id: Frame number to fetch. It is treated as 1-based: the
            capture is positioned at index ``frame_id - 1`` before reading.

    Returns:
        The decoded frame, or ``None`` when OpenCV could not read it
        (``cv2.VideoCapture.read`` reports failure that way).
    """
    cap = cv2.VideoCapture(video_path)
    try:
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id - 1)
        ret, frame = cap.read()
    finally:
        # Always free the decoder; this function is called once per frame,
        # so unreleased captures would pile up quickly.
        cap.release()
    return frame if ret else None
def parse_raw_video(video_path):
    """Decode every frame of a video into a list.

    Args:
        video_path: Path of the video file to open.

    Returns:
        A list with the decoded frames in playback order. The list is empty
        when the video cannot be opened or contains no readable frame.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # End of stream (or a decode error): stop reading.
                break
            frames.append(frame)
    finally:
        cap.release()
    return frames
def _hand_center(boxes, object_type):
    """Return the pixel center ``[x, y]`` of the last box labelled *object_type*."""
    center = None
    for box in boxes:
        if box['object_type'] == object_type:
            bbox = box['bbox']
            center = [bbox['x'] + bbox['width'] / 2, bbox['y'] + bbox['height'] / 2]
    if center is None:
        raise ValueError(f"no '{object_type}' bounding box in frame annotation")
    return center


def compute_state_and_actions(image, curr_frame, next_frame, frame_idx, save=False):
    """Derive a hand-position state and a hand-displacement action.

    Args:
        image: Image array of the current frame; only ``image.shape[0]``
            (height) and ``image.shape[1]`` (width) are read, unless
            ``save`` is set, in which case markers are drawn onto it in place.
        curr_frame: List of bounding-box labels of the current frame. Each
            label is a dict with an ``'object_type'`` entry and a ``'bbox'``
            dict holding ``'x'``, ``'y'``, ``'width'`` and ``'height'``.
        next_frame: Same structure, for the following frame.
        frame_idx: Index used only to name the debug image.
        save: When true, draw the current (green) and next (red) hand centers
            on ``image`` and write it to the label-check directory.

    Returns:
        ``(state, action)``: ``state`` is the 4-vector of the normalized
        left- and right-hand centers of the current frame; ``action`` is the
        4-vector of the normalized center displacements towards the next
        frame, in the same left/right order.

    Raises:
        ValueError: If either frame lacks a ``'left_hand'`` or
            ``'right_hand'`` box.
    """
    img_width, img_height = image.shape[1], image.shape[0]
    # Dividing by (width, height) maps pixel coordinates to the unit square.
    scale = np.array([img_width, img_height])

    curr_hand1_center = np.array(_hand_center(curr_frame, 'left_hand')) / scale
    curr_hand2_center = np.array(_hand_center(curr_frame, 'right_hand')) / scale
    next_hand1_center = np.array(_hand_center(next_frame, 'left_hand')) / scale
    next_hand2_center = np.array(_hand_center(next_frame, 'right_hand')) / scale

    state = np.concatenate((curr_hand1_center, curr_hand2_center))
    action = np.concatenate(
        (
            next_hand1_center - curr_hand1_center,
            next_hand2_center - curr_hand2_center,
        )
    )

    if save:
        # Draw the hand centers: green for the current frame, red for the next.
        markers = (
            (curr_hand1_center, (0, 255, 0)),
            (curr_hand2_center, (0, 255, 0)),
            (next_hand1_center, (0, 0, 255)),
            (next_hand2_center, (0, 0, 255)),
        )
        for center, color in markers:
            pixel = (int(center[0] * img_width), int(center[1] * img_height))
            cv2.circle(image, pixel, 10, color, -1)
        # save the image
        cv2.imwrite(f"/private/home/xinleic/LR/hpt_video/data/ego4d_video_label_check/img_{frame_idx}.png", image)
    return state, action
def parse_raw_video(video_path):
    """Decode every frame of a video into a list.

    This definition shares its name with an earlier one in the module and,
    being later, is the one that stays bound at import time.

    Args:
        video_path: Path of the video file to open.

    Returns:
        A list with the decoded frames in playback order. The list is empty
        when the video cannot be opened or contains no readable frame.
    """
    import cv2

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # End of stream (or a decode error): stop reading.
                break
            frames.append(frame)
    finally:
        cap.release()
    return frames
def chunk_actions_and_concatenate(actions, chunk_size=4):
    """Group consecutive actions and merge the frames of each group.

    Args:
        actions: Sequence of action dicts. Each must have a ``'frames'``
            entry that is either an iterable of frames or ``None``.
        chunk_size: Number of consecutive actions merged into one group.
            Defaults to 4.

    Returns:
        A list with one entry per kept group; each entry is the flat list of
        frames of the group's actions, in order. Actions whose ``'frames'``
        is ``None`` contribute nothing.

        NOTE: the last group is always discarded, even when it holds a full
        ``chunk_size`` actions, so fewer than ``chunk_size + 1`` actions
        produce an empty result.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = [actions[start:start + chunk_size] for start in range(0, len(actions), chunk_size)]
    concatenated_frames = []
    # Skip the trailing chunk (see the NOTE in the docstring).
    for chunk in chunks[:-1]:
        merged = []
        for action in chunk:
            frames = action['frames']
            if frames is not None:
                merged.extend(frames)
        concatenated_frames.append(merged)
    return concatenated_frames
def ego4d_dataset_size() -> int:
    """Return the number of trajectories in the dataset (~1725 for Ego4D).

    The count is the length of the ``'videos'`` list in the annotation file
    at ``LABEL_ROOT``.
    """
    # Use a context manager so the annotation file handle is closed promptly.
    with open(LABEL_ROOT) as label_file:
        labels = json.load(label_file)
    return len(labels['videos'])
# define your own dataset conversion
def ego4d_dataset_generator(example_inds: Iterable[int] = None):
    """Generator yielding data from Ego4D.

    Args:
        example_inds: if specified, will only yield data from these indices
            of the ``'videos'`` list in the annotation file. Otherwise, will
            default to yielding the entire dataset.

    Yields:
        ``{"steps": steps}`` dicts, where ``steps`` is a list of at least 16
        step dicts with the keys ``"observation"`` (``"image"`` and
        ``"state"``), ``"action"`` and ``"language_instruction"``.

    Videos missing on disk, intervals with fewer than three valid narrated
    actions, and frame windows that produce fewer than 16 steps are skipped.
    """
    # convert to a list of episodes that can be added to replay buffer
    with open(LABEL_ROOT) as label_file:
        labels = json.load(label_file)
    if example_inds is None:
        example_inds = range(len(labels['videos']))

    for example_ind in example_inds:
        label = labels['videos'][example_ind]
        video_path = VIDEO_PATH + label['video_uid'] + ".mp4"
        if not os.path.exists(video_path):
            print("skip", video_path)
            continue

        print("video_path:", video_path)
        print("len label detections", len(labels))

        # action extractions over bounding boxes subtractions of both hands.
        for interval in label['annotated_intervals']:
            lang = "use human hands to do some tasks"  # dummies
            print(f"Interval [{interval['start_sec']} - {interval['end_sec']}]")
            actions = [
                narrated
                for narrated in interval['narrated_actions']
                if not (narrated['is_invalid_annotation'] or narrated['is_rejected'])
                and narrated['stage'] is not None
            ]
            print(f"Actions: {len(actions)}")

            # because we need to concatenate
            if len(actions) < 3:
                continue

            # the number of frames is usually 7 and it also does not follow strict 2hz
            chunk_actions = chunk_actions_and_concatenate(actions)
            for frame_idx, frames in enumerate(chunk_actions):
                steps = []
                # Pair every frame with its successor; the last frame has none.
                for idx, frame in enumerate(frames[:-1]):
                    frame_id = frame['frame_number']
                    next_frame = frames[idx + 1]
                    image = parse_video_frame(video_path, frame_id)

                    # Frames with too few boxes in either annotation are skipped silently.
                    if not (len(frame['boxes']) > 2 and len(next_frame['boxes']) > 2):
                        continue

                    try:
                        s, a = compute_state_and_actions(image, frame['boxes'], next_frame['boxes'], idx, save=False)
                    except Exception:
                        # Best effort: a frame with a missing hand box or an
                        # unreadable image is dropped instead of aborting the
                        # whole conversion.
                        print(f'compute action failed idx {idx} frame idx {frame_idx}')
                        continue

                    # break into step dict
                    step = {
                        "observation": {"image": image, "state": s},
                        "action": a,
                        "language_instruction": lang,
                    }
                    steps.append(OrderedDict(step))

                if len(steps) < 16:
                    print("skip this traj because frame window length < 16")
                    continue
                data_dict = {"steps": steps}
                yield data_dict