glenn-jocher committed on
Commit
11f85e7
1 Parent(s): 2da6444

Auto-fix corrupt JPEGs (#4548)

Browse files

* Autofix corrupt JPEGs

This PR automatically re-saves corrupt JPEGs and trains with the re-saved images. WARNING: this overwrites the existing corrupt JPEGs in a dataset, replacing them with valid JPEGs. The file size may increase, and the image contents may not be exactly the same because JPEG compression is lossy. Results may vary by JPEG decoder and hardware.

The current behavior is to exclude corrupt JPEGs from training and warn the user, but many users have complained that large parts of their datasets are being excluded from training.

* Clarify re-save reason

Files changed (1) hide show
  1. utils/datasets.py +7 -6
utils/datasets.py CHANGED
@@ -314,7 +314,7 @@ class LoadStreams: # multiple IP or RTSP cameras
314
  print('') # newline
315
 
316
  # check for common shapes
317
- s = np.stack([letterbox(x, self.img_size, stride=self.stride, auto=self.auto)[0].shape for x in self.imgs], 0) # shapes
318
  self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal
319
  if not self.rect:
320
  print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.')
@@ -568,7 +568,7 @@ class LoadImagesAndLabels(Dataset): # for training/testing
568
  if self.augment:
569
  # Albumentations
570
  img, labels = self.albumentations(img, labels)
571
- nl = len(labels) # update after albumentations
572
 
573
  # HSV color-space
574
  augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v'])
@@ -861,7 +861,7 @@ def autosplit(path='../datasets/coco128/images', weights=(0.9, 0.1, 0.0), annota
861
  def verify_image_label(args):
862
  # Verify one image-label pair
863
  im_file, lb_file, prefix = args
864
- nm, nf, ne, nc = 0, 0, 0, 0 # number missing, found, empty, corrupt
865
  try:
866
  # verify images
867
  im = Image.open(im_file)
@@ -872,10 +872,11 @@ def verify_image_label(args):
872
  if im.format.lower() in ('jpg', 'jpeg'):
873
  with open(im_file, 'rb') as f:
874
  f.seek(-2, 2)
875
- assert f.read() == b'\xff\xd9', 'corrupted JPEG'
 
 
876
 
877
  # verify labels
878
- segments = [] # instance segments
879
  if os.path.isfile(lb_file):
880
  nf = 1 # label found
881
  with open(lb_file, 'r') as f:
@@ -896,7 +897,7 @@ def verify_image_label(args):
896
  else:
897
  nm = 1 # label missing
898
  l = np.zeros((0, 5), dtype=np.float32)
899
- return im_file, l, shape, segments, nm, nf, ne, nc, ''
900
  except Exception as e:
901
  nc = 1
902
  msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'
 
314
  print('') # newline
315
 
316
  # check for common shapes
317
+ s = np.stack([letterbox(x, self.img_size, stride=self.stride, auto=self.auto)[0].shape for x in self.imgs])
318
  self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal
319
  if not self.rect:
320
  print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.')
 
568
  if self.augment:
569
  # Albumentations
570
  img, labels = self.albumentations(img, labels)
571
+ nl = len(labels) # update after albumentations
572
 
573
  # HSV color-space
574
  augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v'])
 
861
  def verify_image_label(args):
862
  # Verify one image-label pair
863
  im_file, lb_file, prefix = args
864
+ nm, nf, ne, nc, msg, segments = 0, 0, 0, 0, '', [] # number (missing, found, empty, corrupt), message, segments
865
  try:
866
  # verify images
867
  im = Image.open(im_file)
 
872
  if im.format.lower() in ('jpg', 'jpeg'):
873
  with open(im_file, 'rb') as f:
874
  f.seek(-2, 2)
875
+ if f.read() != b'\xff\xd9': # corrupt JPEG
876
+ im.save(im_file, format='JPEG', subsampling=0, quality=100) # re-save image
877
+ msg = f'{prefix}WARNING: corrupt JPEG restored and saved {im_file}'
878
 
879
  # verify labels
 
880
  if os.path.isfile(lb_file):
881
  nf = 1 # label found
882
  with open(lb_file, 'r') as f:
 
897
  else:
898
  nm = 1 # label missing
899
  l = np.zeros((0, 5), dtype=np.float32)
900
+ return im_file, l, shape, segments, nm, nf, ne, nc, msg
901
  except Exception as e:
902
  nc = 1
903
  msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'