glenn-jocher committed
Commit
4728840
1 Parent(s): 8a66eba

Update `--cache disk`, deprecate `*_npy/` dirs (#6876)


* Updates

* Updates

* Updates

* Updates

* Updates

* Updates

* Updates

* Updates

* Updates

* Updates

* Cleanup

* Cleanup

Files changed (3)
  1. utils/datasets.py +38 -38
  2. utils/loggers/wandb/wandb_utils.py +1 -1
  3. val.py +1 -1
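
What the change does: with `--cache disk`, images were previously cached as resized arrays inside a sibling `<images_dir>_npy/` directory; this commit deprecates those directories and instead writes one full-size `.npy` file next to each source image, created by a new `cache_images_to_disk()` helper. The commit also renames `self.img_files`/`self.imgs` to `self.im_files`/`self.ims` throughout. A minimal sketch of the two path schemes, derived from the diff below (the example path is hypothetical):

from pathlib import Path

im_file = Path('datasets/coco128/images/train2017/000000000009.jpg')  # hypothetical image

# Old scheme (deprecated here): resized arrays collected in a sibling *_npy/ directory
old_npy = Path(im_file.parent.as_posix() + '_npy') / im_file.with_suffix('.npy').name
# -> datasets/coco128/images/train2017_npy/000000000009.npy

# New scheme: a full-size .npy cache sitting beside the source image
new_npy = im_file.with_suffix('.npy')
# -> datasets/coco128/images/train2017/000000000009.npy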
utils/datasets.py CHANGED
@@ -407,19 +407,19 @@ class LoadImagesAndLabels(Dataset):
                         # f += [p.parent / x.lstrip(os.sep) for x in t]  # local to global path (pathlib)
                 else:
                     raise Exception(f'{prefix}{p} does not exist')
-            self.img_files = sorted(x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in IMG_FORMATS)
+            self.im_files = sorted(x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in IMG_FORMATS)
             # self.img_files = sorted([x for x in f if x.suffix[1:].lower() in IMG_FORMATS])  # pathlib
-            assert self.img_files, f'{prefix}No images found'
+            assert self.im_files, f'{prefix}No images found'
         except Exception as e:
             raise Exception(f'{prefix}Error loading data from {path}: {e}\nSee {HELP_URL}')

         # Check cache
-        self.label_files = img2label_paths(self.img_files)  # labels
+        self.label_files = img2label_paths(self.im_files)  # labels
         cache_path = (p if p.is_file() else Path(self.label_files[0]).parent).with_suffix('.cache')
         try:
             cache, exists = np.load(cache_path, allow_pickle=True).item(), True  # load dict
             assert cache['version'] == self.cache_version  # same version
-            assert cache['hash'] == get_hash(self.label_files + self.img_files)  # same hash
+            assert cache['hash'] == get_hash(self.label_files + self.im_files)  # same hash
         except Exception:
             cache, exists = self.cache_labels(cache_path, prefix), False  # cache

@@ -437,7 +437,7 @@ class LoadImagesAndLabels(Dataset):
         labels, shapes, self.segments = zip(*cache.values())
         self.labels = list(labels)
         self.shapes = np.array(shapes, dtype=np.float64)
-        self.img_files = list(cache.keys())  # update
+        self.im_files = list(cache.keys())  # update
         self.label_files = img2label_paths(cache.keys())  # update
         n = len(shapes)  # number of images
         bi = np.floor(np.arange(n) / batch_size).astype(np.int)  # batch index
@@ -466,7 +466,7 @@ class LoadImagesAndLabels(Dataset):
             s = self.shapes  # wh
             ar = s[:, 1] / s[:, 0]  # aspect ratio
             irect = ar.argsort()
-            self.img_files = [self.img_files[i] for i in irect]
+            self.im_files = [self.im_files[i] for i in irect]
             self.label_files = [self.label_files[i] for i in irect]
             self.labels = [self.labels[i] for i in irect]
             self.shapes = s[irect]  # wh
@@ -485,24 +485,20 @@ class LoadImagesAndLabels(Dataset):
             self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(np.int) * stride

         # Cache images into RAM/disk for faster training (WARNING: large datasets may exceed system resources)
-        self.imgs, self.img_npy = [None] * n, [None] * n
+        self.ims = [None] * n
+        self.npy_files = [Path(f).with_suffix('.npy') for f in self.im_files]
         if cache_images:
-            if cache_images == 'disk':
-                self.im_cache_dir = Path(Path(self.img_files[0]).parent.as_posix() + '_npy')
-                self.img_npy = [self.im_cache_dir / Path(f).with_suffix('.npy').name for f in self.img_files]
-                self.im_cache_dir.mkdir(parents=True, exist_ok=True)
             gb = 0  # Gigabytes of cached images
-            self.img_hw0, self.img_hw = [None] * n, [None] * n
-            results = ThreadPool(NUM_THREADS).imap(self.load_image, range(n))
+            self.im_hw0, self.im_hw = [None] * n, [None] * n
+            fcn = self.cache_images_to_disk if cache_images == 'disk' else self.load_image
+            results = ThreadPool(NUM_THREADS).imap(fcn, range(n))
             pbar = tqdm(enumerate(results), total=n)
             for i, x in pbar:
                 if cache_images == 'disk':
-                    if not self.img_npy[i].exists():
-                        np.save(self.img_npy[i].as_posix(), x[0])
-                    gb += self.img_npy[i].stat().st_size
+                    gb += self.npy_files[i].stat().st_size
                 else:  # 'ram'
-                    self.imgs[i], self.img_hw0[i], self.img_hw[i] = x  # im, hw_orig, hw_resized = load_image(self, i)
-                    gb += self.imgs[i].nbytes
+                    self.ims[i], self.im_hw0[i], self.im_hw[i] = x  # im, hw_orig, hw_resized = load_image(self, i)
+                    gb += self.ims[i].nbytes
                 pbar.desc = f'{prefix}Caching images ({gb / 1E9:.1f}GB {cache_images})'
             pbar.close()

@@ -512,8 +508,8 @@ class LoadImagesAndLabels(Dataset):
         nm, nf, ne, nc, msgs = 0, 0, 0, 0, []  # number missing, found, empty, corrupt, messages
         desc = f"{prefix}Scanning '{path.parent / path.stem}' images and labels..."
         with Pool(NUM_THREADS) as pool:
-            pbar = tqdm(pool.imap(verify_image_label, zip(self.img_files, self.label_files, repeat(prefix))),
-                        desc=desc, total=len(self.img_files))
+            pbar = tqdm(pool.imap(verify_image_label, zip(self.im_files, self.label_files, repeat(prefix))),
+                        desc=desc, total=len(self.im_files))
             for im_file, lb, shape, segments, nm_f, nf_f, ne_f, nc_f, msg in pbar:
                 nm += nm_f
                 nf += nf_f
@@ -530,8 +526,8 @@ class LoadImagesAndLabels(Dataset):
             LOGGER.info('\n'.join(msgs))
         if nf == 0:
             LOGGER.warning(f'{prefix}WARNING: No labels found in {path}. See {HELP_URL}')
-        x['hash'] = get_hash(self.label_files + self.img_files)
-        x['results'] = nf, nm, ne, nc, len(self.img_files)
+        x['hash'] = get_hash(self.label_files + self.im_files)
+        x['results'] = nf, nm, ne, nc, len(self.im_files)
         x['msgs'] = msgs  # warnings
         x['version'] = self.cache_version  # cache version
         try:
@@ -543,7 +539,7 @@ class LoadImagesAndLabels(Dataset):
         return x

     def __len__(self):
-        return len(self.img_files)
+        return len(self.im_files)

     # def __iter__(self):
     #     self.count = -1
@@ -622,17 +618,15 @@ class LoadImagesAndLabels(Dataset):
         img = img.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
         img = np.ascontiguousarray(img)

-        return torch.from_numpy(img), labels_out, self.img_files[index], shapes
+        return torch.from_numpy(img), labels_out, self.im_files[index], shapes

     def load_image(self, i):
-        # loads 1 image from dataset index 'i', returns (im, original hw, resized hw)
-        im = self.imgs[i]
+        # Loads 1 image from dataset index 'i', returns (im, original hw, resized hw)
+        im, f, fn = self.ims[i], self.im_files[i], self.npy_files[i],
         if im is None:  # not cached in RAM
-            npy = self.img_npy[i]
-            if npy and npy.exists():  # load npy
-                im = np.load(npy)
+            if fn.exists():  # load npy
+                im = np.load(fn)
             else:  # read image
-                f = self.img_files[i]
                 im = cv2.imread(f)  # BGR
                 assert im is not None, f'Image Not Found {f}'
             h0, w0 = im.shape[:2]  # orig hw
@@ -643,7 +637,13 @@ class LoadImagesAndLabels(Dataset):
                                 interpolation=cv2.INTER_LINEAR if (self.augment or r > 1) else cv2.INTER_AREA)
             return im, (h0, w0), im.shape[:2]  # im, hw_original, hw_resized
         else:
-            return self.imgs[i], self.img_hw0[i], self.img_hw[i]  # im, hw_original, hw_resized
+            return self.ims[i], self.im_hw0[i], self.im_hw[i]  # im, hw_original, hw_resized
+
+    def cache_images_to_disk(self, i):
+        # Saves an image as an *.npy file for faster loading
+        f = self.npy_files[i]
+        if not f.exists():
+            np.save(f.as_posix(), cv2.imread(self.im_files[i]))

     def load_mosaic(self, index):
         # YOLOv5 4-mosaic loader. Loads 1 image + 3 random images into a 4-image mosaic
@@ -777,16 +777,16 @@ class LoadImagesAndLabels(Dataset):

     @staticmethod
     def collate_fn(batch):
-        img, label, path, shapes = zip(*batch)  # transposed
+        im, label, path, shapes = zip(*batch)  # transposed
         for i, lb in enumerate(label):
             lb[:, 0] = i  # add target image index for build_targets()
-        return torch.stack(img, 0), torch.cat(label, 0), path, shapes
+        return torch.stack(im, 0), torch.cat(label, 0), path, shapes

     @staticmethod
     def collate_fn4(batch):
         img, label, path, shapes = zip(*batch)  # transposed
         n = len(shapes) // 4
-        img4, label4, path4, shapes4 = [], [], path[:n], shapes[:n]
+        im4, label4, path4, shapes4 = [], [], path[:n], shapes[:n]

         ho = torch.tensor([[0.0, 0, 0, 1, 0, 0]])
         wo = torch.tensor([[0.0, 0, 1, 0, 0, 0]])
@@ -800,13 +800,13 @@ class LoadImagesAndLabels(Dataset):
             else:
                 im = torch.cat((torch.cat((img[i], img[i + 1]), 1), torch.cat((img[i + 2], img[i + 3]), 1)), 2)
                 lb = torch.cat((label[i], label[i + 1] + ho, label[i + 2] + wo, label[i + 3] + ho + wo), 0) * s
-            img4.append(im)
+            im4.append(im)
             label4.append(lb)

         for i, lb in enumerate(label4):
             lb[:, 0] = i  # add target image index for build_targets()

-        return torch.stack(img4, 0), torch.cat(label4, 0), path4, shapes4
+        return torch.stack(im4, 0), torch.cat(label4, 0), path4, shapes4


 # Ancillary functions --------------------------------------------------------------------------------------------------
@@ -999,12 +999,12 @@ def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False, profil
                 'image_stats': {'total': dataset.n, 'unlabelled': int(np.all(x == 0, 1).sum()),
                                 'per_class': (x > 0).sum(0).tolist()},
                 'labels': [{str(Path(k).name): round_labels(v.tolist())} for k, v in
-                           zip(dataset.img_files, dataset.labels)]}
+                           zip(dataset.im_files, dataset.labels)]}

     if hub:
         im_dir = hub_dir / 'images'
         im_dir.mkdir(parents=True, exist_ok=True)
-        for _ in tqdm(ThreadPool(NUM_THREADS).imap(hub_ops, dataset.img_files), total=dataset.n, desc='HUB Ops'):
+        for _ in tqdm(ThreadPool(NUM_THREADS).imap(hub_ops, dataset.im_files), total=dataset.n, desc='HUB Ops'):
             pass

     # Profile
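
Taken together, the datasets.py changes route disk caching through the new `cache_images_to_disk()` helper, while `load_image()` transparently prefers an existing `.npy` neighbor over decoding the source image. Note the new helper stores the original-resolution `cv2.imread()` output, whereas the old path saved the already-resized array returned by `load_image()`, so resizing now happens at load time for disk caches too. A self-contained sketch of the new flow, assuming OpenCV and NumPy are installed; the file names and thread count are illustrative, not from the diff:

from multiprocessing.pool import ThreadPool
from pathlib import Path

import cv2
import numpy as np

im_files = ['im0.jpg', 'im1.jpg']  # hypothetical dataset images
npy_files = [Path(f).with_suffix('.npy') for f in im_files]  # cache lives beside each image

def cache_to_disk(i):
    # Mirrors cache_images_to_disk(): save the full-size BGR array once, reuse thereafter
    f = npy_files[i]
    if not f.exists():
        im = cv2.imread(im_files[i])  # BGR; None if the path is missing
        assert im is not None, f'Image Not Found {im_files[i]}'
        np.save(f.as_posix(), im)

def load(i):
    # Mirrors load_image()'s lookup order: .npy cache first, image decode second
    fn = npy_files[i]
    return np.load(fn) if fn.exists() else cv2.imread(im_files[i])

# Populate the cache in parallel, as the dataset __init__ does when cache_images == 'disk'
with ThreadPool(4) as pool:
    list(pool.imap(cache_to_disk, range(len(im_files))))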
utils/loggers/wandb/wandb_utils.py CHANGED
@@ -403,7 +403,7 @@ class WandbLogger():
         # TODO: Explore multiprocessing to slpit this loop parallely| This is essential for speeding up the the logging
         artifact = wandb.Artifact(name=name, type="dataset")
         img_files = tqdm([dataset.path]) if isinstance(dataset.path, str) and Path(dataset.path).is_dir() else None
-        img_files = tqdm(dataset.img_files) if not img_files else img_files
+        img_files = tqdm(dataset.im_files) if not img_files else img_files
         for img_file in img_files:
             if Path(img_file).is_dir():
                 artifact.add_dir(img_file, name='data/images')
val.py CHANGED
@@ -297,7 +297,7 @@ def run(data,
             pred = anno.loadRes(pred_json)  # init predictions api
             eval = COCOeval(anno, pred, 'bbox')
             if is_coco:
-                eval.params.imgIds = [int(Path(x).stem) for x in dataloader.dataset.img_files]  # image IDs to evaluate
+                eval.params.imgIds = [int(Path(x).stem) for x in dataloader.dataset.im_files]  # image IDs to evaluate
             eval.evaluate()
             eval.accumulate()
             eval.summarize()
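
One practical consequence: caches written by earlier versions under `*_npy/` directories are no longer read, so they can be deleted once a run with `--cache disk` has produced the new per-image `.npy` files. A hedged cleanup sketch; the dataset root is a hypothetical example, and you should verify the matches before removing anything:

import shutil
from pathlib import Path

root = Path('datasets')  # hypothetical dataset root
for d in root.rglob('*_npy'):
    if d.is_dir():
        print(f'removing deprecated cache dir {d}')
        shutil.rmtree(d)  # only safe once the old cache is no longer needed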