glenn-jocher
committed on
Commit
•
11f85e7
1
Parent(s):
2da6444
Auto-fix corrupt JPEGs (#4548)
Browse files
* Autofix corrupt JPEGs
This PR automatically re-saves corrupt JPEGs and trains with the re-saved images. WARNING: this will overwrite the existing corrupt JPEGs in a dataset and replace them with correct JPEGs, though the file size may increase and the image contents may not be exactly the same due to lossy JPEG compression schemes. Results may vary by JPEG decoder and hardware.
Current behavior is to exclude corrupt JPEGs from training with a warning to the user, but many users have been complaining about large parts of their dataset being excluded from training.
* Clarify re-save reason
- utils/datasets.py +7 -6
utils/datasets.py
CHANGED
@@ -314,7 +314,7 @@ class LoadStreams: # multiple IP or RTSP cameras
|
|
314 |
print('') # newline
|
315 |
|
316 |
# check for common shapes
|
317 |
-
s = np.stack([letterbox(x, self.img_size, stride=self.stride, auto=self.auto)[0].shape for x in self.imgs]
|
318 |
self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal
|
319 |
if not self.rect:
|
320 |
print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.')
|
@@ -568,7 +568,7 @@ class LoadImagesAndLabels(Dataset): # for training/testing
|
|
568 |
if self.augment:
|
569 |
# Albumentations
|
570 |
img, labels = self.albumentations(img, labels)
|
571 |
-
nl = len(labels)
|
572 |
|
573 |
# HSV color-space
|
574 |
augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v'])
|
@@ -861,7 +861,7 @@ def autosplit(path='../datasets/coco128/images', weights=(0.9, 0.1, 0.0), annota
|
|
861 |
def verify_image_label(args):
|
862 |
# Verify one image-label pair
|
863 |
im_file, lb_file, prefix = args
|
864 |
-
nm, nf, ne, nc = 0, 0, 0, 0 # number missing, found, empty, corrupt
|
865 |
try:
|
866 |
# verify images
|
867 |
im = Image.open(im_file)
|
@@ -872,10 +872,11 @@ def verify_image_label(args):
|
|
872 |
if im.format.lower() in ('jpg', 'jpeg'):
|
873 |
with open(im_file, 'rb') as f:
|
874 |
f.seek(-2, 2)
|
875 |
-
|
|
|
|
|
876 |
|
877 |
# verify labels
|
878 |
-
segments = [] # instance segments
|
879 |
if os.path.isfile(lb_file):
|
880 |
nf = 1 # label found
|
881 |
with open(lb_file, 'r') as f:
|
@@ -896,7 +897,7 @@ def verify_image_label(args):
|
|
896 |
else:
|
897 |
nm = 1 # label missing
|
898 |
l = np.zeros((0, 5), dtype=np.float32)
|
899 |
-
return im_file, l, shape, segments, nm, nf, ne, nc,
|
900 |
except Exception as e:
|
901 |
nc = 1
|
902 |
msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'
|
|
|
314 |
print('') # newline
|
315 |
|
316 |
# check for common shapes
|
317 |
+
s = np.stack([letterbox(x, self.img_size, stride=self.stride, auto=self.auto)[0].shape for x in self.imgs])
|
318 |
self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal
|
319 |
if not self.rect:
|
320 |
print('WARNING: Different stream shapes detected. For optimal performance supply similarly-shaped streams.')
|
|
|
568 |
if self.augment:
|
569 |
# Albumentations
|
570 |
img, labels = self.albumentations(img, labels)
|
571 |
+
nl = len(labels) # update after albumentations
|
572 |
|
573 |
# HSV color-space
|
574 |
augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v'])
|
|
|
861 |
def verify_image_label(args):
|
862 |
# Verify one image-label pair
|
863 |
im_file, lb_file, prefix = args
|
864 |
+
nm, nf, ne, nc, msg, segments = 0, 0, 0, 0, '', [] # number (missing, found, empty, corrupt), message, segments
|
865 |
try:
|
866 |
# verify images
|
867 |
im = Image.open(im_file)
|
|
|
872 |
if im.format.lower() in ('jpg', 'jpeg'):
|
873 |
with open(im_file, 'rb') as f:
|
874 |
f.seek(-2, 2)
|
875 |
+
if f.read() != b'\xff\xd9': # corrupt JPEG
|
876 |
+
im.save(im_file, format='JPEG', subsampling=0, quality=100) # re-save image
|
877 |
+
msg = f'{prefix}WARNING: corrupt JPEG restored and saved {im_file}'
|
878 |
|
879 |
# verify labels
|
|
|
880 |
if os.path.isfile(lb_file):
|
881 |
nf = 1 # label found
|
882 |
with open(lb_file, 'r') as f:
|
|
|
897 |
else:
|
898 |
nm = 1 # label missing
|
899 |
l = np.zeros((0, 5), dtype=np.float32)
|
900 |
+
return im_file, l, shape, segments, nm, nf, ne, nc, msg
|
901 |
except Exception as e:
|
902 |
nc = 1
|
903 |
msg = f'{prefix}WARNING: Ignoring corrupted image and/or label {im_file}: {e}'
|