# --------------------------------------------------------
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import json
import os
from collections import OrderedDict
from os.path import expanduser
from typing import Iterable

import cv2
import numpy as np

RESOLUTION = (480, 480)
home = expanduser("~")

# Adjust these to wherever your Ego4D annotations and videos are stored.
ROOT = "/datasets01/ego4d_track2/"
LABEL_ROOT = ROOT + "v2_1/annotations/fho_main.json"
VIDEO_PATH = ROOT + "v2_1/full_scale/"
def parse_video_frame(video_path, frame_id):
    """Decode and return a single frame (1-indexed) from a video file."""
    cap = cv2.VideoCapture(video_path)
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id - 1)
    ret, frame = cap.read()
    cap.release()
    return frame
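

def _example_parse_single_frame():
    """Usage sketch (not called anywhere): decode one frame and dump it to disk.

    "some_video_uid" is a placeholder, not a real Ego4D uid; substitute any uid
    present in your local copy of the dataset.
    """
    frame = parse_video_frame(VIDEO_PATH + "some_video_uid.mp4", frame_id=100)
    if frame is not None:
        cv2.imwrite("frame_check.png", frame)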
def parse_raw_video(video_path):
    """Decode every frame of a video into a list of BGR numpy arrays."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames
def compute_state_and_actions(image, curr_frame, next_frame, frame_idx, save=False):
    """Build a (state, action) pair from the hand boxes of two consecutive frames.

    curr_frame and next_frame are lists of bounding-box annotations. The state is the
    normalized (x, y) centers of both hands in the current frame; the action is the
    normalized centers of both hands in the next frame.
    """
    img_width, img_height = image.shape[1], image.shape[0]
    for box in curr_frame:
        if box['object_type'] == 'left_hand':
            curr_hand1_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]
        if box['object_type'] == 'right_hand':
            curr_hand2_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]
    for box in next_frame:
        if box['object_type'] == 'left_hand':
            next_hand1_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]
        if box['object_type'] == 'right_hand':
            next_hand2_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]

    # Normalize the current-frame hand centers to [0, 1].
    curr_hand1_center = np.array([curr_hand1_center[0] / img_width, curr_hand1_center[1] / img_height])
    curr_hand2_center = np.array([curr_hand2_center[0] / img_width, curr_hand2_center[1] / img_height])

    # Normalize the next-frame hand centers to [0, 1].
    next_hand1_center = np.array([next_hand1_center[0] / img_width, next_hand1_center[1] / img_height])
    next_hand2_center = np.array([next_hand2_center[0] / img_width, next_hand2_center[1] / img_height])

    state = np.concatenate((curr_hand1_center, curr_hand2_center))
    action = np.concatenate((next_hand1_center, next_hand2_center))

    if save:
        # Draw the current (green) and next (red) hand centers for visual inspection.
        cv2.circle(image, (int(curr_hand1_center[0] * img_width), int(curr_hand1_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(curr_hand2_center[0] * img_width), int(curr_hand2_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(next_hand1_center[0] * img_width), int(next_hand1_center[1] * img_height)), 10, (0, 0, 255), -1)
        cv2.circle(image, (int(next_hand2_center[0] * img_width), int(next_hand2_center[1] * img_height)), 10, (0, 0, 255), -1)
        # Save the annotated frame for debugging.
        cv2.imwrite(f"/private/home/xinleic/LR/hpt_video/data/ego4d_video_label_check/img_{frame_idx}.png", image)

    return state, action
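

def _example_compute_state_and_actions():
    """Usage sketch with synthetic annotations (not called anywhere).

    Illustrates the box format compute_state_and_actions expects: a list of dicts
    with 'object_type' and a pixel-space 'bbox'. The image here is a blank canvas.
    """
    image = np.zeros((RESOLUTION[1], RESOLUTION[0], 3), dtype=np.uint8)
    curr_boxes = [
        {'object_type': 'left_hand', 'bbox': {'x': 100, 'y': 200, 'width': 50, 'height': 50}},
        {'object_type': 'right_hand', 'bbox': {'x': 300, 'y': 200, 'width': 50, 'height': 50}},
    ]
    next_boxes = [
        {'object_type': 'left_hand', 'bbox': {'x': 110, 'y': 210, 'width': 50, 'height': 50}},
        {'object_type': 'right_hand', 'bbox': {'x': 290, 'y': 190, 'width': 50, 'height': 50}},
    ]
    state, action = compute_state_and_actions(image, curr_boxes, next_boxes, frame_idx=0)
    # state and action are both 4-dim: (left_x, left_y, right_x, right_y), normalized to [0, 1].
    return state, action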
def chunk_actions_and_concatenate(actions):
    """Group narrated actions into chunks of 4 and merge their frame lists.

    The final chunk (which may be incomplete) is dropped.
    """
    chunk_size = 4
    chunked_actions = [actions[i:i + chunk_size] for i in range(0, len(actions), chunk_size)][:-1]
    concatenated_frames = []
    for chunk in chunked_actions:
        frames_to_concat = []
        for action in chunk:
            frames = action['frames']  # 'frames' is a list of per-frame annotations, possibly None
            if frames is not None:
                frames_to_concat.extend(frames)  # collect frames from each action
        concatenated_frames.append(frames_to_concat)  # store the concatenated frames for this chunk
    return concatenated_frames
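

def _example_chunk_actions():
    """Usage sketch with synthetic narrated actions (not called anywhere).

    Nine actions with 2 frames each yield two chunks of 4 actions (8 frames per
    chunk); the trailing incomplete chunk is dropped.
    """
    actions = [{'frames': [{'frame_number': 2 * i}, {'frame_number': 2 * i + 1}]} for i in range(9)]
    chunks = chunk_actions_and_concatenate(actions)
    assert len(chunks) == 2 and len(chunks[0]) == 8
    return chunks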
def ego4d_dataset_size() -> int:
    """Returns the number of trajectories in the dataset. ~1725 for Ego4D."""
    labels = json.load(open(LABEL_ROOT))
    return len(labels['videos'])
# Define your own dataset conversion below.
def ego4d_dataset_generator(example_inds: Iterable[int] = None):
    """
    Generator yielding data from Ego4D.

    Args:
        example_inds: if specified, will only yield data from these indices.
            Otherwise, will default to yielding the entire dataset.
    """
    # Convert to a list of episodes that can be added to the replay buffer.
    labels = json.load(open(LABEL_ROOT))
    if example_inds is None:
        example_inds = range(len(labels['videos']))

    for example_ind in example_inds:
        label = labels['videos'][example_ind]
        video_path = VIDEO_PATH + label['video_uid'] + ".mp4"
        if not os.path.exists(video_path):
            print("skip", video_path)
            continue
        print("video_path:", video_path)

        # Actions are extracted from the hand bounding boxes of consecutive frames.
        for interval in label['annotated_intervals']:
            lang = "use human hands to do some tasks"  # dummy language instruction
            print(f"Interval [{interval['start_sec']} - {interval['end_sec']}]")
            actions = list(
                filter(
                    lambda x: not (x['is_invalid_annotation'] or x['is_rejected']) and x['stage'] is not None,
                    interval['narrated_actions'],
                )
            )
            print(f"Actions: {len(actions)}")

            # Skip intervals with too few actions to form a chunk for concatenation.
            if len(actions) < 3:
                continue

            # The number of frames per action is usually 7 and does not follow a strict 2 Hz rate.
            chunk_actions = chunk_actions_and_concatenate(actions)
            for frame_idx, frames in enumerate(chunk_actions):
                # lang = frame['narration_text']
                steps = []
                # need to use dummy actions to expand from 6 frames to 16 frames
                for idx, frame in enumerate(frames[:-1]):
                    frame_id = frame['frame_number']
                    next_frame = frames[idx + 1]
                    image = parse_video_frame(video_path, frame_id)
                    # Require more than two boxes in both frames (both hands plus at least one object box).
                    if len(frame['boxes']) > 2 and len(next_frame['boxes']) > 2:
                        try:
                            s, a = compute_state_and_actions(image, frame['boxes'], next_frame['boxes'], idx, save=False)
                        except Exception:
                            print(f'compute action failed idx {idx} frame idx {frame_idx}')
                            continue

                        # Break into a step dict.
                        step = {
                            "observation": {"image": image, "state": s},
                            "action": a,
                            "language_instruction": lang,
                        }
                        steps.append(OrderedDict(step))

                if len(steps) < 16:
                    print("skip this traj because frame window length < 16")
                    continue

                data_dict = {"steps": steps}
                yield data_dict
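

def _example_iterate_dataset():
    """Usage sketch (not called anywhere): pull the first trajectory and inspect its steps.

    Assumes LABEL_ROOT and VIDEO_PATH point at a local Ego4D v2.1 copy.
    """
    for data_dict in ego4d_dataset_generator(example_inds=[0]):
        steps = data_dict["steps"]
        print("num steps:", len(steps))
        print("state shape:", steps[0]["observation"]["state"].shape)
        print("action shape:", steps[0]["action"].shape)
        break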