Spaces:
Sleeping
Sleeping
File size: 4,244 Bytes
2d9a728 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import logging
import os
import random
# petrel_client is an optional dependency: when it cannot be imported, Client
# is set to None and BaseDataset leaves its `client` attribute as None.
try:
    from petrel_client.client import Client
except Exception:
    # Deliberately broad so any import-time failure of the optional package
    # still falls back to None, but (unlike a bare `except:`) this does not
    # swallow KeyboardInterrupt or SystemExit.
    Client = None
from torch.utils.data import Dataset
from .utils import load_image_from_path
from .av_utils import lazy_load_s3video
logger = logging.getLogger(__name__)
class BaseDataset(Dataset):
    """Base class that implements the image and video loading methods.

    ``__getitem__`` and ``__len__`` are left to subclasses. The loaders below
    also read ``self.num_frames`` and ``self.sample_type`` (and, optionally,
    ``self.max_num_frames``), none of which are initialised in ``__init__``.
    The "audio" and "audio_video" loaders that
    ``load_and_transform_media_data`` dispatches to are not defined in this
    class either -- presumably all of these are supplied by subclasses
    (TODO confirm).
    """

    media_type = "video"

    def __init__(self):
        assert self.media_type in ["audio", "image", "video", "audio_video"], (
            f"unsupported media_type: {self.media_type!r}"
        )
        # Root directory joined in front of each annotation path by get_anno;
        # None disables the path rewriting.
        self.data_root = None
        # String prepended to the joined path in get_anno.
        self.data_root_prefix = ""
        self.anno_list = (
            None  # list(dict), each dict contains {"image": str, # image or video path}
        )
        # Callable applied to the loaded image / frames before returning them.
        self.transform = None
        # The three audio settings and num_tries are only initialised here;
        # no method of this class reads them.
        self.audio_reader_type = None
        self.audio_sample_rate = None
        self.max_audio_length = None
        # Callable used by load_and_transform_media_data_video for plain paths.
        self.video_reader = None
        self.num_tries = None
        # Storage client handed to the loaders; None when petrel_client could
        # not be imported.
        self.client = Client('~/petreloss.conf') if Client is not None else None
        # Forwarded to video_reader; not supported on the lazy_load_s3video path.
        self.trimmed30 = False

    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError

    def get_anno(self, index):  # NOTE used for most ret_dataset
        """Obtain the annotation for one media item (audio, video or image).

        When ``data_root`` is set, the media path is resolved as
        ``data_root_prefix + os.path.join(data_root, path)`` and returned in a
        shallow copy, so the entry stored in ``anno_list`` is never modified
        and repeated calls for the same index give the same result.

        Args:
            index (int): The media index.

        Returns: dict.
            - "image": the filename, video also uses "image"
              ("audio" is used instead when ``media_type == "audio"``).
            - any other keys of the stored annotation (e.g. "caption"),
              unchanged.
        """
        anno = self.anno_list[index]
        if self.data_root is None:
            return anno

        path_key = "audio" if self.media_type == "audio" else "image"
        # Copy before rewriting: mutating the stored dict would re-apply the
        # root/prefix every time the same index is requested again.
        resolved = dict(anno)
        resolved[path_key] = self.data_root_prefix + os.path.join(
            self.data_root, anno[path_key]
        )
        return resolved

    def load_and_transform_media_data(self, index, data_path):
        """Dispatch to the loader matching ``self.media_type``.

        Args:
            index: passed through to the selected loader.
            data_path: media location (or descriptor dict) passed through to
                the selected loader.

        Returns:
            Whatever the selected loader returns.

        Raises:
            NotImplementedError: if ``media_type`` is not one of the four
                handled values.
            Exception: any error raised by the selected loader is logged and
                re-raised unchanged.
        """
        try:
            if self.media_type == "image":
                return self.load_and_transform_media_data_image(index, data_path)
            elif self.media_type == "audio":
                return self.load_and_transform_media_data_audio(index, data_path)
            elif self.media_type == "video":
                return self.load_and_transform_media_data_video(index, data_path)
            elif self.media_type == "audio_video":
                return self.load_and_transform_media_data_audio_video(index, data_path)
            else:
                raise NotImplementedError(self.media_type)
        except Exception:
            logger.info("Something wrong when read %s", data_path)
            # Bare raise keeps the original exception and traceback intact.
            raise

    def load_and_transform_media_data_image(self, index, data_path):
        """Load one image, optionally crop it, and apply ``self.transform``.

        Args:
            index: returned unchanged alongside the image.
            data_path: either a path accepted by ``load_image_from_path`` or a
                dict with an "image" path and an optional "crop_bbox" whose
                first four entries are read as ``x0, y0, x1, y1``.

        Returns:
            tuple: ``(transformed_image, index)``.
        """
        if isinstance(data_path, dict):
            image = load_image_from_path(data_path["image"], client=self.client)
            if "crop_bbox" in data_path:
                bbox = data_path["crop_bbox"]
                x0, y0, x1, y1 = (int(bbox[i]) for i in range(4))
                # Four subscripts: the crop is applied to the last two axes
                # (rows y0:y1, columns x0:x1) -- assumes the loader returns a
                # 4-D array with height/width last; TODO confirm.
                image = image[:, :, y0:y1, x0:x1]
        else:
            image = load_image_from_path(data_path, client=self.client)

        image = self.transform(image)
        return image, index

    def load_and_transform_media_data_video(self, index, data_path):
        """Load video frames and apply ``self.transform`` to them.

        Args:
            index: returned unchanged alongside the frames.
            data_path: either a path handed to ``self.video_reader`` or a dict
                describing a clip, with keys "read_clip_from_video", "video",
                "video_start_frame" and "video_end_frame".

        Returns:
            tuple: ``(transformed_frames, index)``.

        Raises:
            NotImplementedError: for a dict whose "read_clip_from_video" is
                falsy, or when ``self.trimmed30`` is set for a dict input.
        """
        if isinstance(data_path, dict):
            if not data_path['read_clip_from_video']:
                raise NotImplementedError(data_path)
            if self.trimmed30:
                raise NotImplementedError("lazy_load_s3video does not support trimmed30")
            frames = lazy_load_s3video(
                data_path['video'],
                self.num_frames,
                data_path['video_start_frame'],
                data_path['video_end_frame'],
                self.client,
            )
        else:
            # -1 is passed when no max_num_frames attribute has been set.
            max_num_frames = getattr(self, "max_num_frames", -1)
            # Only the frames are used; the reader's other two return values
            # (frame indices and video duration) are discarded.
            frames, _frame_indices, _video_duration = self.video_reader(
                data_path, self.num_frames, self.sample_type,
                max_num_frames=max_num_frames, client=self.client,
                trimmed30=self.trimmed30
            )

        # NOTE shared aug for video frames
        frames = self.transform(frames)
        return frames, index
|