Spaces:
Runtime error
Runtime error
File size: 6,826 Bytes
4121bec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
# Copyright (c) Facebook, Inc. and its affiliates.
import copy
import itertools
import logging
import numpy as np
import pickle
import random
import torch.utils.data as data
from torch.utils.data.sampler import Sampler
from detectron2.utils.serialize import PicklableWrapper
__all__ = ["MapDataset", "DatasetFromList", "AspectRatioGroupedDataset", "ToIterableDataset"]
class MapDataset(data.Dataset):
    """
    Map a function over the elements in a dataset.

    Args:
        dataset: a dataset where map function is applied.
        map_func: a callable which maps the element in dataset. map_func is
            responsible for error handling, when error happens, it needs to
            return None so the MapDataset will randomly use other
            elements from the dataset.
    """

    def __init__(self, dataset, map_func):
        self._dataset = dataset
        self._map_func = PicklableWrapper(map_func)  # wrap so that a lambda will work

        # Fixed seed so the sequence of fallback draws is reproducible.
        self._rng = random.Random(42)
        # Indices that are still eligible as a replacement when map_func fails.
        self._fallback_candidates = set(range(len(dataset)))

    def __len__(self):
        return len(self._dataset)

    def __getitem__(self, idx):
        """
        Return the mapped element at ``idx``.

        If ``map_func`` returns None for the requested element, the index is
        removed from the fallback pool and a random index from the pool is
        tried instead, until one maps successfully.

        Raises:
            ValueError: propagated from ``random.sample`` when the fallback
                pool has become empty (every tried element failed).
        """
        retry_count = 0
        cur_idx = int(idx)

        while True:
            mapped = self._map_func(self._dataset[cur_idx])
            if mapped is not None:
                # A success (re-)qualifies this index as a fallback candidate.
                self._fallback_candidates.add(cur_idx)
                return mapped

            # _map_func fails for this idx, use a random new index from the pool
            retry_count += 1
            self._fallback_candidates.discard(cur_idx)
            # random.sample() no longer accepts a set (deprecated in Python 3.9,
            # TypeError since 3.11), so hand it a sorted list. Sorting also makes
            # the draw independent of set iteration order.
            cur_idx = self._rng.sample(sorted(self._fallback_candidates), k=1)[0]

            if retry_count >= 3:
                logger = logging.getLogger(__name__)
                logger.warning(
                    "Failed to apply `_map_func` for idx: {}, retry count: {}".format(
                        idx, retry_count
                    )
                )
class DatasetFromList(data.Dataset):
    """
    Wrap a list to a torch Dataset. It produces elements of the list as data.
    """

    def __init__(self, lst: list, copy: bool = True, serialize: bool = True):
        """
        Args:
            lst (list): a list which contains elements to produce.
            copy (bool): whether to deepcopy the element when producing it,
                so that the result can be modified in place without affecting the
                source in the list. Only consulted when ``serialize`` is False;
                a serialized element is always rebuilt by unpickling.
            serialize (bool): whether to hold memory using serialized objects, when
                enabled, data loader workers can use shared RAM from master
                process instead of making a copy.
        """
        self._lst = lst
        self._copy = copy
        self._serialize = serialize

        def _serialize(obj):
            # Pickle with the highest available protocol and expose the bytes
            # as a uint8 array so all elements can be concatenated.
            buffer = pickle.dumps(obj, protocol=-1)
            return np.frombuffer(buffer, dtype=np.uint8)

        if self._serialize:
            logger = logging.getLogger(__name__)
            logger.info(
                "Serializing {} elements to byte tensors and concatenating them all ...".format(
                    len(self._lst)
                )
            )
            chunks = [_serialize(x) for x in self._lst]
            # _addr[i] is the end offset of element i inside the flat buffer.
            self._addr = np.cumsum(np.asarray([len(x) for x in chunks], dtype=np.int64))
            # np.concatenate() rejects an empty sequence, so an empty input list
            # is stored as an empty buffer instead.
            self._lst = np.concatenate(chunks) if chunks else np.empty(0, dtype=np.uint8)
            logger.info("Serialized dataset takes {:.2f} MiB".format(len(self._lst) / 1024 ** 2))

    def __len__(self):
        if self._serialize:
            return len(self._addr)
        else:
            return len(self._lst)

    def __getitem__(self, idx):
        """
        Return the element at ``idx``.

        Negative indices count from the end in serialized mode, matching the
        behaviour of the plain-list mode.

        Raises:
            IndexError: in serialized mode, when ``idx`` is out of range.
        """
        if self._serialize:
            num_items = len(self._addr)
            if idx < 0:
                # Rebind instead of `+=` so a mutable index object passed by the
                # caller is never modified in place.
                idx = idx + num_items
            if not 0 <= idx < num_items:
                raise IndexError("DatasetFromList index out of range")
            start_addr = 0 if idx == 0 else self._addr[idx - 1].item()
            end_addr = self._addr[idx].item()
            payload = memoryview(self._lst[start_addr:end_addr])
            return pickle.loads(payload)
        elif self._copy:
            return copy.deepcopy(self._lst[idx])
        else:
            return self._lst[idx]
class ToIterableDataset(data.IterableDataset):
    """
    Adapt a map-style (index-based) dataset plus a sampler into an
    iterable-style dataset.
    """

    def __init__(self, dataset, sampler):
        """
        Args:
            dataset (torch.utils.data.Dataset): a map-style dataset supporting ``__getitem__``.
            sampler (torch.utils.data.sampler.Sampler): an inexpensive iterable yielding the
                indices to look up in ``dataset``.
        """
        assert not isinstance(dataset, data.IterableDataset), dataset
        assert isinstance(sampler, Sampler), sampler
        self.dataset = dataset
        self.sampler = sampler

    def __iter__(self):
        info = data.get_worker_info()
        if info is not None and info.num_workers != 1:
            # A map-style `DataLoader(dataset, sampler)` runs the sampler only in the
            # main process, whereas `DataLoader(ToIterableDataset(dataset, sampler))`
            # runs it inside each of the N workers. Every worker therefore keeps just
            # its own 1/N stride of the indices and drops the rest; this relies on
            # the sampler being cheap to iterate.
            indices = itertools.islice(self.sampler, info.id, None, info.num_workers)
        else:
            # No worker processes, or a single worker: consume every index.
            indices = self.sampler

        for index in indices:
            yield self.dataset[index]
class AspectRatioGroupedDataset(data.IterableDataset):
    """
    Group elements with a similar aspect ratio into the same batch.

    Two groups are used: elements with width > height, and all others
    (width <= height). Batching this way reduces the padding needed to
    stack images, which speeds up training.

    The wrapped dataset must yield dicts containing "width" and "height".
    Each produced item is a list of ``batch_size`` of those original dicts,
    all taken from the same group.
    """

    def __init__(self, dataset, batch_size):
        """
        Args:
            dataset: an iterable whose elements are dicts with the keys
                "width" and "height"; these decide the group of each element.
            batch_size (int): number of elements per produced batch.
        """
        self.dataset = dataset
        self.batch_size = batch_size
        # Exactly two aspect-ratio groups are hard-coded (index 0: w > h,
        # index 1: everything else). More groups could be supported, but
        # that does not seem useful.
        self._buckets = [[], []]

    def __iter__(self):
        for item in self.dataset:
            width, height = item["width"], item["height"]
            group = self._buckets[0] if width > height else self._buckets[1]
            group.append(item)
            if len(group) != self.batch_size:
                continue
            # Hand out a copy, then empty the bucket in place so the list stored
            # in self._buckets is reused. Partially filled buckets stay in
            # self._buckets and are not yielded.
            yield list(group)
            group.clear()
|