Update `--cache disk` deprecate `*_npy/` dirs (#6876)
* Updates
* Updates
* Updates
* Updates
* Updates
* Updates
* Updates
* Updates
* Updates
* Updates
* Cleanup
* Cleanup
- utils/datasets.py +38 -38
- utils/loggers/wandb/wandb_utils.py +1 -1
- val.py +1 -1
utils/datasets.py
CHANGED
@@ -407,19 +407,19 @@ class LoadImagesAndLabels(Dataset):
                         # f += [p.parent / x.lstrip(os.sep) for x in t]  # local to global path (pathlib)
                 else:
                     raise Exception(f'{prefix}{p} does not exist')
-            self.img_files = sorted(x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in IMG_FORMATS)
+            self.im_files = sorted(x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in IMG_FORMATS)
             # self.img_files = sorted([x for x in f if x.suffix[1:].lower() in IMG_FORMATS])  # pathlib
-            assert self.img_files, f'{prefix}No images found'
+            assert self.im_files, f'{prefix}No images found'
         except Exception as e:
             raise Exception(f'{prefix}Error loading data from {path}: {e}\nSee {HELP_URL}')

         # Check cache
-        self.label_files = img2label_paths(self.img_files)  # labels
+        self.label_files = img2label_paths(self.im_files)  # labels
         cache_path = (p if p.is_file() else Path(self.label_files[0]).parent).with_suffix('.cache')
         try:
             cache, exists = np.load(cache_path, allow_pickle=True).item(), True  # load dict
             assert cache['version'] == self.cache_version  # same version
-            assert cache['hash'] == get_hash(self.label_files + self.img_files)  # same hash
+            assert cache['hash'] == get_hash(self.label_files + self.im_files)  # same hash
         except Exception:
             cache, exists = self.cache_labels(cache_path, prefix), False  # cache

@@ -437,7 +437,7 @@ class LoadImagesAndLabels(Dataset):
         labels, shapes, self.segments = zip(*cache.values())
         self.labels = list(labels)
         self.shapes = np.array(shapes, dtype=np.float64)
-        self.img_files = list(cache.keys())  # update
+        self.im_files = list(cache.keys())  # update
         self.label_files = img2label_paths(cache.keys())  # update
         n = len(shapes)  # number of images
         bi = np.floor(np.arange(n) / batch_size).astype(np.int)  # batch index
@@ -466,7 +466,7 @@ class LoadImagesAndLabels(Dataset):
             s = self.shapes  # wh
             ar = s[:, 1] / s[:, 0]  # aspect ratio
             irect = ar.argsort()
-            self.img_files = [self.img_files[i] for i in irect]
+            self.im_files = [self.im_files[i] for i in irect]
             self.label_files = [self.label_files[i] for i in irect]
             self.labels = [self.labels[i] for i in irect]
             self.shapes = s[irect]  # wh
@@ -485,24 +485,20 @@ class LoadImagesAndLabels(Dataset):
             self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(np.int) * stride

         # Cache images into RAM/disk for faster training (WARNING: large datasets may exceed system resources)
-        self.imgs, self.img_npy = [None] * n, [None] * n
+        self.ims = [None] * n
+        self.npy_files = [Path(f).with_suffix('.npy') for f in self.im_files]
         if cache_images:
-            if cache_images == 'disk':
-                self.im_cache_dir = Path(Path(self.img_files[0]).parent.as_posix() + '_npy')
-                self.img_npy = [self.im_cache_dir / Path(f).with_suffix('.npy').name for f in self.img_files]
-                self.im_cache_dir.mkdir(parents=True, exist_ok=True)
             gb = 0  # Gigabytes of cached images
-            self.img_hw0, self.img_hw = [None] * n, [None] * n
-            results = ThreadPool(NUM_THREADS).imap(self.load_image, range(n))
+            self.im_hw0, self.im_hw = [None] * n, [None] * n
+            fcn = self.cache_images_to_disk if cache_images == 'disk' else self.load_image
+            results = ThreadPool(NUM_THREADS).imap(fcn, range(n))
             pbar = tqdm(enumerate(results), total=n)
             for i, x in pbar:
                 if cache_images == 'disk':
-                    if not self.img_npy[i].exists():
-                        np.save(self.img_npy[i].as_posix(), x[0])
-                    gb += self.img_npy[i].stat().st_size
+                    gb += self.npy_files[i].stat().st_size
                 else:  # 'ram'
-                    self.imgs[i], self.img_hw0[i], self.img_hw[i] = x  # im, hw_orig, hw_resized = load_image(self, i)
-                    gb += self.imgs[i].nbytes
+                    self.ims[i], self.im_hw0[i], self.im_hw[i] = x  # im, hw_orig, hw_resized = load_image(self, i)
+                    gb += self.ims[i].nbytes
                 pbar.desc = f'{prefix}Caching images ({gb / 1E9:.1f}GB {cache_images})'
             pbar.close()

@@ -512,8 +508,8 @@ class LoadImagesAndLabels(Dataset):
         nm, nf, ne, nc, msgs = 0, 0, 0, 0, []  # number missing, found, empty, corrupt, messages
         desc = f"{prefix}Scanning '{path.parent / path.stem}' images and labels..."
         with Pool(NUM_THREADS) as pool:
-            pbar = tqdm(pool.imap(verify_image_label, zip(self.img_files, self.label_files, repeat(prefix))),
-                        desc=desc, total=len(self.img_files))
+            pbar = tqdm(pool.imap(verify_image_label, zip(self.im_files, self.label_files, repeat(prefix))),
+                        desc=desc, total=len(self.im_files))
             for im_file, lb, shape, segments, nm_f, nf_f, ne_f, nc_f, msg in pbar:
                 nm += nm_f
                 nf += nf_f
@@ -530,8 +526,8 @@ class LoadImagesAndLabels(Dataset):
             LOGGER.info('\n'.join(msgs))
         if nf == 0:
             LOGGER.warning(f'{prefix}WARNING: No labels found in {path}. See {HELP_URL}')
-        x['hash'] = get_hash(self.label_files + self.img_files)
-        x['results'] = nf, nm, ne, nc, len(self.img_files)
+        x['hash'] = get_hash(self.label_files + self.im_files)
+        x['results'] = nf, nm, ne, nc, len(self.im_files)
         x['msgs'] = msgs  # warnings
         x['version'] = self.cache_version  # cache version
         try:
@@ -543,7 +539,7 @@ class LoadImagesAndLabels(Dataset):
         return x

     def __len__(self):
-        return len(self.img_files)
+        return len(self.im_files)

     # def __iter__(self):
     #     self.count = -1
@@ -622,17 +618,15 @@ class LoadImagesAndLabels(Dataset):
         img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
         img = np.ascontiguousarray(img)

-        return torch.from_numpy(img), labels_out, self.img_files[index], shapes
+        return torch.from_numpy(img), labels_out, self.im_files[index], shapes

     def load_image(self, i):
-        # loads 1 image from dataset index 'i', returns im, original hw, resized hw
-        im = self.imgs[i]
+        # Loads 1 image from dataset index 'i', returns (im, original hw, resized hw)
+        im, f, fn = self.ims[i], self.im_files[i], self.npy_files[i],
         if im is None:  # not cached in RAM
-            npy = self.img_npy[i]
-            if npy and npy.exists():  # load npy
-                im = np.load(npy)
+            if fn.exists():  # load npy
+                im = np.load(fn)
             else:  # read image
-                f = self.img_files[i]
                 im = cv2.imread(f)  # BGR
             assert im is not None, f'Image Not Found {f}'
             h0, w0 = im.shape[:2]  # orig hw
@@ -643,7 +637,13 @@ class LoadImagesAndLabels(Dataset):
                                 interpolation=cv2.INTER_LINEAR if (self.augment or r > 1) else cv2.INTER_AREA)
             return im, (h0, w0), im.shape[:2]  # im, hw_original, hw_resized
         else:
-            return self.imgs[i], self.img_hw0[i], self.img_hw[i]  # im, hw_original, hw_resized
+            return self.ims[i], self.im_hw0[i], self.im_hw[i]  # im, hw_original, hw_resized
+
+    def cache_images_to_disk(self, i):
+        # Saves an image as an *.npy file for faster loading
+        f = self.npy_files[i]
+        if not f.exists():
+            np.save(f.as_posix(), cv2.imread(self.im_files[i]))

     def load_mosaic(self, index):
         # YOLOv5 4-mosaic loader. Loads 1 image + 3 random images into a 4-image mosaic
@@ -777,16 +777,16 @@ class LoadImagesAndLabels(Dataset):

     @staticmethod
     def collate_fn(batch):
-        img, label, path, shapes = zip(*batch)  # transposed
+        im, label, path, shapes = zip(*batch)  # transposed
         for i, lb in enumerate(label):
             lb[:, 0] = i  # add target image index for build_targets()
-        return torch.stack(img, 0), torch.cat(label, 0), path, shapes
+        return torch.stack(im, 0), torch.cat(label, 0), path, shapes

     @staticmethod
     def collate_fn4(batch):
         img, label, path, shapes = zip(*batch)  # transposed
         n = len(shapes) // 4
-        img4, label4, path4, shapes4 = [], [], path[:n], shapes[:n]
+        im4, label4, path4, shapes4 = [], [], path[:n], shapes[:n]

         ho = torch.tensor([[0.0, 0, 0, 1, 0, 0]])
         wo = torch.tensor([[0.0, 0, 1, 0, 0, 0]])
@@ -800,13 +800,13 @@ class LoadImagesAndLabels(Dataset):
             else:
                 im = torch.cat((torch.cat((img[i], img[i + 1]), 1), torch.cat((img[i + 2], img[i + 3]), 1)), 2)
                 lb = torch.cat((label[i], label[i + 1] + ho, label[i + 2] + wo, label[i + 3] + ho + wo), 0) * s
-            img4.append(im)
+            im4.append(im)
             label4.append(lb)

         for i, lb in enumerate(label4):
             lb[:, 0] = i  # add target image index for build_targets()

-        return torch.stack(img4, 0), torch.cat(label4, 0), path4, shapes4
+        return torch.stack(im4, 0), torch.cat(label4, 0), path4, shapes4


 # Ancillary functions --------------------------------------------------------------------------------------------------
@@ -999,12 +999,12 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil
                         'image_stats': {'total': dataset.n, 'unlabelled': int(np.all(x == 0, 1).sum()),
                                         'per_class': (x > 0).sum(0).tolist()},
                         'labels': [{str(Path(k).name): round_labels(v.tolist())} for k, v in
-                                   zip(dataset.img_files, dataset.labels)]}
+                                   zip(dataset.im_files, dataset.labels)]}

     if hub:
         im_dir = hub_dir / 'images'
         im_dir.mkdir(parents=True, exist_ok=True)
-        for _ in tqdm(ThreadPool(NUM_THREADS).imap(hub_ops, dataset.img_files), total=dataset.n, desc='HUB Ops'):
+        for _ in tqdm(ThreadPool(NUM_THREADS).imap(hub_ops, dataset.im_files), total=dataset.n, desc='HUB Ops'):
             pass

     # Profile
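The behavioral change is concentrated in the `@@ -485,24 +485,20 @@` and `@@ -643,7 +637,13 @@` hunks: with `--cache disk`, each image is decoded once by the new `cache_images_to_disk()` method and written as a sibling `<image>.npy` file (built with `Path(f).with_suffix('.npy')`), and `load_image()` prefers that file over re-decoding the original; the old mirrored `*_npy/` cache directory is no longer created. Below is a minimal standalone sketch of the same pattern, for illustration only — the helper names and the example dataset path are assumptions, not part of this PR:

    # Sketch of the sibling-.npy disk cache used by --cache disk (illustrative helpers, not YOLOv5 API)
    from pathlib import Path

    import cv2
    import numpy as np


    def cache_image_to_disk(im_file):
        # Decode the image once and save it as <image>.npy next to the original (mirrors cache_images_to_disk)
        f = Path(im_file).with_suffix('.npy')
        if not f.exists():
            im = cv2.imread(str(im_file))  # BGR ndarray
            assert im is not None, f'Image Not Found {im_file}'
            np.save(f.as_posix(), im)
        return f


    def load_cached_image(im_file):
        # Prefer the pre-decoded .npy copy, fall back to cv2.imread (mirrors the new load_image fallback)
        fn = Path(im_file).with_suffix('.npy')
        im = np.load(fn) if fn.exists() else cv2.imread(str(im_file))
        assert im is not None, f'Image Not Found {im_file}'
        return im


    if __name__ == '__main__':
        for im_file in sorted(Path('datasets/coco128/images/train2017').glob('*.jpg')):  # example path
            cache_image_to_disk(im_file)

The trade-off is the usual one for this kind of cache: the uncompressed `.npy` copies are larger than the JPEGs but skip decoding on every epoch, and because they sit next to the source images there is no separate `*_npy/` directory to drift out of sync with the dataset.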
utils/loggers/wandb/wandb_utils.py
CHANGED
@@ -403,7 +403,7 @@ class WandbLogger():
         # TODO: Explore multiprocessing to slpit this loop parallely| This is essential for speeding up the the logging
         artifact = wandb.Artifact(name=name, type="dataset")
         img_files = tqdm([dataset.path]) if isinstance(dataset.path, str) and Path(dataset.path).is_dir() else None
-        img_files = tqdm(dataset.img_files) if not img_files else img_files
+        img_files = tqdm(dataset.im_files) if not img_files else img_files
         for img_file in img_files:
             if Path(img_file).is_dir():
                 artifact.add_dir(img_file, name='data/images')
val.py
CHANGED
@@ -297,7 +297,7 @@ def run(data,
             pred = anno.loadRes(pred_json)  # init predictions api
             eval = COCOeval(anno, pred, 'bbox')
             if is_coco:
-                eval.params.imgIds = [int(Path(x).stem) for x in dataloader.dataset.img_files]  # image IDs to evaluate
+                eval.params.imgIds = [int(Path(x).stem) for x in dataloader.dataset.im_files]  # image IDs to evaluate
             eval.evaluate()
             eval.accumulate()
             eval.summarize()
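The val.py change is only the attribute rename, but the line it touches is worth a note: when evaluating on COCO, pycocotools is restricted to the images actually seen by the dataloader by parsing the numeric COCO image id from each filename stem. A tiny illustration (the paths below follow the standard COCO val2017 naming and are examples, not taken from this PR):

    from pathlib import Path

    im_files = ['images/val2017/000000000139.jpg', 'images/val2017/000000000285.jpg']  # example COCO-style paths
    img_ids = [int(Path(x).stem) for x in im_files]  # zero-padded stem -> integer image id
    print(img_ids)  # [139, 285]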