glenn-jocher committed on
Commit
f527704
1 Parent(s): 2296f15

Cache v0.3: improved corrupt image/label reporting (#3676)

Browse files

* Cache v0.3: improved corrupt image/label reporting

Fix for https://github.com/ultralytics/yolov5/issues/3656#issuecomment-863660899

* cleanup

Files changed (1) hide show
  1. utils/datasets.py +15 -9
utils/datasets.py CHANGED
@@ -390,7 +390,7 @@ class LoadImagesAndLabels(Dataset): # for training/testing
390
  cache_path = (p if p.is_file() else Path(self.label_files[0]).parent).with_suffix('.cache') # cached labels
391
  if cache_path.is_file():
392
  cache, exists = torch.load(cache_path), True # load
393
- if cache['hash'] != get_hash(self.label_files + self.img_files): # changed
394
  cache, exists = self.cache_labels(cache_path, prefix), False # re-cache
395
  else:
396
  cache, exists = self.cache_labels(cache_path, prefix), False # cache
@@ -400,11 +400,12 @@ class LoadImagesAndLabels(Dataset): # for training/testing
400
  if exists:
401
  d = f"Scanning '{cache_path}' images and labels... {nf} found, {nm} missing, {ne} empty, {nc} corrupted"
402
  tqdm(None, desc=prefix + d, total=n, initial=n) # display cache results
 
 
403
  assert nf > 0 or not augment, f'{prefix}No labels in {cache_path}. Can not train without labels. See {help_url}'
404
 
405
  # Read cache
406
- cache.pop('hash') # remove hash
407
- cache.pop('version') # remove version
408
  labels, shapes, self.segments = zip(*cache.values())
409
  self.labels = list(labels)
410
  self.shapes = np.array(shapes, dtype=np.float64)
@@ -461,26 +462,31 @@ class LoadImagesAndLabels(Dataset): # for training/testing
461
  def cache_labels(self, path=Path('./labels.cache'), prefix=''):
462
  # Cache dataset labels, check images and read shapes
463
  x = {} # dict
464
- nm, nf, ne, nc = 0, 0, 0, 0 # number missing, found, empty, corrupt
465
  desc = f"{prefix}Scanning '{path.parent / path.stem}' images and labels..."
466
  with Pool(num_threads) as pool:
467
  pbar = tqdm(pool.imap_unordered(verify_image_label, zip(self.img_files, self.label_files, repeat(prefix))),
468
  desc=desc, total=len(self.img_files))
469
- for im_file, l, shape, segments, nm_f, nf_f, ne_f, nc_f in pbar:
470
  nm += nm_f
471
  nf += nf_f
472
  ne += ne_f
473
  nc += nc_f
474
  if im_file:
475
  x[im_file] = [l, shape, segments]
 
 
476
  pbar.desc = f"{desc}{nf} found, {nm} missing, {ne} empty, {nc} corrupted"
477
 
478
  pbar.close()
 
 
479
  if nf == 0:
480
  logging.info(f'{prefix}WARNING: No labels found in {path}. See {help_url}')
481
  x['hash'] = get_hash(self.label_files + self.img_files)
482
  x['results'] = nf, nm, ne, nc, len(self.img_files)
483
- x['version'] = 0.2 # cache version
 
484
  try:
485
  torch.save(x, path) # save cache for next time
486
  logging.info(f'{prefix}New cache created: {path}')
@@ -1084,11 +1090,11 @@ def verify_image_label(args):
1084
  else:
1085
  nm = 1 # label missing
1086
  l = np.zeros((0, 5), dtype=np.float32)
1087
- return im_file, l, shape, segments, nm, nf, ne, nc
1088
  except Exception as e:
1089
  nc = 1
1090
- logging.info(f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}')
1091
- return [None, None, None, None, nm, nf, ne, nc]
1092
 
1093
 
1094
  def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False):
 
390
  cache_path = (p if p.is_file() else Path(self.label_files[0]).parent).with_suffix('.cache') # cached labels
391
  if cache_path.is_file():
392
  cache, exists = torch.load(cache_path), True # load
393
+ if cache['hash'] != get_hash(self.label_files + self.img_files) or cache['version'] != 0.3:
394
  cache, exists = self.cache_labels(cache_path, prefix), False # re-cache
395
  else:
396
  cache, exists = self.cache_labels(cache_path, prefix), False # cache
 
400
  if exists:
401
  d = f"Scanning '{cache_path}' images and labels... {nf} found, {nm} missing, {ne} empty, {nc} corrupted"
402
  tqdm(None, desc=prefix + d, total=n, initial=n) # display cache results
403
+ if cache['msgs']:
404
+ logging.info('\n'.join(cache['msgs'])) # display warnings
405
  assert nf > 0 or not augment, f'{prefix}No labels in {cache_path}. Can not train without labels. See {help_url}'
406
 
407
  # Read cache
408
+ [cache.pop(k) for k in ('hash', 'version', 'msgs')] # remove items
 
409
  labels, shapes, self.segments = zip(*cache.values())
410
  self.labels = list(labels)
411
  self.shapes = np.array(shapes, dtype=np.float64)
 
462
  def cache_labels(self, path=Path('./labels.cache'), prefix=''):
463
  # Cache dataset labels, check images and read shapes
464
  x = {} # dict
465
+ nm, nf, ne, nc, msgs = 0, 0, 0, 0, [] # number missing, found, empty, corrupt, messages
466
  desc = f"{prefix}Scanning '{path.parent / path.stem}' images and labels..."
467
  with Pool(num_threads) as pool:
468
  pbar = tqdm(pool.imap_unordered(verify_image_label, zip(self.img_files, self.label_files, repeat(prefix))),
469
  desc=desc, total=len(self.img_files))
470
+ for im_file, l, shape, segments, nm_f, nf_f, ne_f, nc_f, msg in pbar:
471
  nm += nm_f
472
  nf += nf_f
473
  ne += ne_f
474
  nc += nc_f
475
  if im_file:
476
  x[im_file] = [l, shape, segments]
477
+ if msg:
478
+ msgs.append(msg)
479
  pbar.desc = f"{desc}{nf} found, {nm} missing, {ne} empty, {nc} corrupted"
480
 
481
  pbar.close()
482
+ if msgs:
483
+ logging.info('\n'.join(msgs))
484
  if nf == 0:
485
  logging.info(f'{prefix}WARNING: No labels found in {path}. See {help_url}')
486
  x['hash'] = get_hash(self.label_files + self.img_files)
487
  x['results'] = nf, nm, ne, nc, len(self.img_files)
488
+ x['msgs'] = msgs # warnings
489
+ x['version'] = 0.3 # cache version
490
  try:
491
  torch.save(x, path) # save cache for next time
492
  logging.info(f'{prefix}New cache created: {path}')
 
1090
  else:
1091
  nm = 1 # label missing
1092
  l = np.zeros((0, 5), dtype=np.float32)
1093
+ return im_file, l, shape, segments, nm, nf, ne, nc, ''
1094
  except Exception as e:
1095
  nc = 1
1096
+ msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'
1097
+ return [None, None, None, None, nm, nf, ne, nc, msg]
1098
 
1099
 
1100
  def dataset_stats(path='coco128.yaml', autodownload=False, verbose=False):