glenn-jocher committed on
Commit
41523e2
1 Parent(s): 3d8ed0a

Dataset autodownload feature addition (#685)

Browse files

* initial commit

* move download scripts into data/scripts

* new check_dataset() function in general.py

* move check_dataset() out of with context

* Update general.py

* DDP update

* Update general.py

data/coco.yaml CHANGED
@@ -1,5 +1,4 @@
1
  # COCO 2017 dataset http://cocodataset.org
2
- # Download command: bash yolov5/data/get_coco2017.sh
3
  # Train command: python train.py --data coco.yaml
4
  # Default dataset location is next to /yolov5:
5
  # /parent_folder
@@ -7,6 +6,9 @@
7
  # /yolov5
8
 
9
 
 
 
 
10
  # train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/]
11
  train: ../coco/train2017.txt # 118287 images
12
  val: ../coco/val2017.txt # 5000 images
 
1
  # COCO 2017 dataset http://cocodataset.org
 
2
  # Train command: python train.py --data coco.yaml
3
  # Default dataset location is next to /yolov5:
4
  # /parent_folder
 
6
  # /yolov5
7
 
8
 
9
+ # download command/URL (optional)
10
+ download: bash data/scripts/get_coco.sh
11
+
12
  # train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/]
13
  train: ../coco/train2017.txt # 118287 images
14
  val: ../coco/val2017.txt # 5000 images
data/coco128.yaml CHANGED
@@ -1,5 +1,4 @@
1
  # COCO 2017 dataset http://cocodataset.org - first 128 training images
2
- # Download command: python -c "from yolov5.utils.google_utils import *; gdrive_download('1n_oKgR81BJtqk75b00eAjdv03qVCQn2f', 'coco128.zip')"
3
  # Train command: python train.py --data coco128.yaml
4
  # Default dataset location is next to /yolov5:
5
  # /parent_folder
@@ -7,6 +6,9 @@
7
  # /yolov5
8
 
9
 
 
 
 
10
  # train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/]
11
  train: ../coco128/images/train2017/ # 128 images
12
  val: ../coco128/images/train2017/ # 128 images
 
1
  # COCO 2017 dataset http://cocodataset.org - first 128 training images
 
2
  # Train command: python train.py --data coco128.yaml
3
  # Default dataset location is next to /yolov5:
4
  # /parent_folder
 
6
  # /yolov5
7
 
8
 
9
+ # download command/URL (optional)
10
+ download: https://github.com/ultralytics/yolov5/releases/download/v1.0/coco128.zip
11
+
12
  # train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/]
13
  train: ../coco128/images/train2017/ # 128 images
14
  val: ../coco128/images/train2017/ # 128 images
data/get_coco2017.sh DELETED
@@ -1,30 +0,0 @@
1
- #!/bin/bash
2
- # COCO 2017 dataset http://cocodataset.org
3
- # Download command: bash yolov5/data/get_coco2017.sh
4
- # Train command: python train.py --data coco.yaml
5
- # Default dataset location is next to /yolov5:
6
- # /parent_folder
7
- # /coco
8
- # /yolov5
9
-
10
-
11
- # Download labels from Google Drive, accepting presented query
12
- filename="coco2017labels.zip"
13
- fileid="1cXZR_ckHki6nddOmcysCuuJFM--T-Q6L"
14
- curl -c ./cookie -s -L "https://drive.google.com/uc?export=download&id=${fileid}" > /dev/null
15
- curl -Lb ./cookie "https://drive.google.com/uc?export=download&confirm=`awk '/download/ {print $NF}' ./cookie`&id=${fileid}" -o ${filename}
16
- rm ./cookie
17
-
18
- # Unzip labels
19
- unzip -q ${filename} # for coco.zip
20
- # tar -xzf ${filename} # for coco.tar.gz
21
- rm ${filename}
22
-
23
- # Download and unzip images
24
- cd coco/images
25
- f="train2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f # 19G, 118k images
26
- f="val2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f # 1G, 5k images
27
- # f="test2017.zip" && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f && rm $f # 7G, 41k images
28
-
29
- # cd out
30
- cd ../..
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/scripts/get_coco.sh ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # COCO 2017 dataset http://cocodataset.org
3
+ # Download command: bash data/scripts/get_coco.sh
4
+ # Train command: python train.py --data coco.yaml
5
+ # Default dataset location is next to /yolov5:
6
+ # /parent_folder
7
+ # /coco
8
+ # /yolov5
9
+
10
+ # Download/unzip labels
11
+ echo 'Downloading COCO 2017 labels ...'
12
+ d='../' # unzip directory
13
+ f='coco2017labels.zip' && curl -L https://github.com/ultralytics/yolov5/releases/download/v1.0/$f -o $f
14
+ unzip -q $f -d $d && rm $f
15
+
16
+ # Download/unzip images
17
+ echo 'Downloading COCO 2017 images ...'
18
+ d='../coco/images' # unzip directory
19
+ f='train2017.zip' && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f -d $d && rm $f # 19G, 118k images
20
+ f='val2017.zip' && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f -d $d && rm $f # 1G, 5k images
21
+ # f='test2017.zip' && curl http://images.cocodataset.org/zips/$f -o $f && unzip -q $f -d $d && rm $f # 7G, 41k images
data/{get_voc.sh → scripts/get_voc.sh} RENAMED
@@ -1,33 +1,32 @@
 
1
  # PASCAL VOC dataset http://host.robots.ox.ac.uk/pascal/VOC/
2
- # Download command: bash ./data/get_voc.sh
3
  # Train command: python train.py --data voc.yaml
4
  # Default dataset location is next to /yolov5:
5
  # /parent_folder
6
  # /VOC
7
  # /yolov5
8
 
9
-
10
- start=`date +%s`
11
 
12
  # handle optional download dir
13
- if [ -z "$1" ]
14
- then
15
- # navigate to ~/tmp
16
- echo "navigating to ../tmp/ ..."
17
- mkdir -p ../tmp
18
- cd ../tmp/
19
- else
20
- # check if is valid directory
21
- if [ ! -d $1 ]; then
22
- echo $1 "is not a valid directory"
23
- exit 0
24
- fi
25
- echo "navigating to" $1 "..."
26
- cd $1
27
  fi
28
 
29
  echo "Downloading VOC2007 trainval ..."
30
- # Download the data.
31
  curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
32
  echo "Downloading VOC2007 test data ..."
33
  curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
@@ -42,44 +41,42 @@ echo "removing tars ..."
42
  rm VOCtrainval_06-Nov-2007.tar
43
  rm VOCtest_06-Nov-2007.tar
44
 
45
- end=`date +%s`
46
- runtime=$((end-start))
47
 
48
  echo "Completed in" $runtime "seconds"
49
 
50
- start=`date +%s`
51
 
52
  # handle optional download dir
53
- if [ -z "$1" ]
54
- then
55
- # navigate to ~/tmp
56
- echo "navigating to ../tmp/ ..."
57
- mkdir -p ../tmp
58
- cd ../tmp/
59
- else
60
- # check if is valid directory
61
- if [ ! -d $1 ]; then
62
- echo $1 "is not a valid directory"
63
- exit 0
64
- fi
65
- echo "navigating to" $1 "..."
66
- cd $1
67
  fi
68
 
69
  echo "Downloading VOC2012 trainval ..."
70
- # Download the data.
71
  curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
72
  echo "Done downloading."
73
 
74
-
75
  # Extract data
76
  echo "Extracting trainval ..."
77
  tar -xf VOCtrainval_11-May-2012.tar
78
  echo "removing tar ..."
79
  rm VOCtrainval_11-May-2012.tar
80
 
81
- end=`date +%s`
82
- runtime=$((end-start))
83
 
84
  echo "Completed in" $runtime "seconds"
85
 
@@ -144,8 +141,8 @@ for year, image_set in sets:
144
 
145
  END
146
 
147
- cat 2007_train.txt 2007_val.txt 2012_train.txt 2012_val.txt > train.txt
148
- cat 2007_train.txt 2007_val.txt 2007_test.txt 2012_train.txt 2012_val.txt > train.all.txt
149
 
150
  python3 - "$@" <<END
151
 
@@ -211,5 +208,5 @@ for line in lines:
211
 
212
  END
213
 
214
- rm -rf ../tmp # remove temporary directory
215
  echo "VOC download done."
 
1
+ #!/bin/bash
2
  # PASCAL VOC dataset http://host.robots.ox.ac.uk/pascal/VOC/
3
+ # Download command: bash data/scripts/get_voc.sh
4
  # Train command: python train.py --data voc.yaml
5
  # Default dataset location is next to /yolov5:
6
  # /parent_folder
7
  # /VOC
8
  # /yolov5
9
 
10
+ start=$(date +%s)
 
11
 
12
  # handle optional download dir
13
+ if [ -z "$1" ]; then
14
+ # navigate to ../tmp
15
+ echo "navigating to ../tmp/ ..."
16
+ mkdir -p ../tmp
17
+ cd ../tmp/
18
+ else
19
+ # check if is valid directory
20
+ if [ ! -d $1 ]; then
21
+ echo $1 "is not a valid directory"
22
+ exit 0
23
+ fi
24
+ echo "navigating to" $1 "..."
25
+ cd $1
 
26
  fi
27
 
28
  echo "Downloading VOC2007 trainval ..."
29
+ # Download data
30
  curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
31
  echo "Downloading VOC2007 test data ..."
32
  curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
 
41
  rm VOCtrainval_06-Nov-2007.tar
42
  rm VOCtest_06-Nov-2007.tar
43
 
44
+ end=$(date +%s)
45
+ runtime=$((end - start))
46
 
47
  echo "Completed in" $runtime "seconds"
48
 
49
+ start=$(date +%s)
50
 
51
  # handle optional download dir
52
+ if [ -z "$1" ]; then
53
+ # navigate to ../tmp
54
+ echo "navigating to ../tmp/ ..."
55
+ mkdir -p ../tmp
56
+ cd ../tmp/
57
+ else
58
+ # check if is valid directory
59
+ if [ ! -d $1 ]; then
60
+ echo $1 "is not a valid directory"
61
+ exit 0
62
+ fi
63
+ echo "navigating to" $1 "..."
64
+ cd $1
 
65
  fi
66
 
67
  echo "Downloading VOC2012 trainval ..."
68
+ # Download data
69
  curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
70
  echo "Done downloading."
71
 
 
72
  # Extract data
73
  echo "Extracting trainval ..."
74
  tar -xf VOCtrainval_11-May-2012.tar
75
  echo "removing tar ..."
76
  rm VOCtrainval_11-May-2012.tar
77
 
78
+ end=$(date +%s)
79
+ runtime=$((end - start))
80
 
81
  echo "Completed in" $runtime "seconds"
82
 
 
141
 
142
  END
143
 
144
+ cat 2007_train.txt 2007_val.txt 2012_train.txt 2012_val.txt >train.txt
145
+ cat 2007_train.txt 2007_val.txt 2007_test.txt 2012_train.txt 2012_val.txt >train.all.txt
146
 
147
  python3 - "$@" <<END
148
 
 
208
 
209
  END
210
 
211
+ rm -rf ../tmp # remove temporary directory
212
  echo "VOC download done."
data/voc.yaml CHANGED
@@ -1,5 +1,4 @@
1
  # PASCAL VOC dataset http://host.robots.ox.ac.uk/pascal/VOC/
2
- # Download command: bash ./data/get_voc.sh
3
  # Train command: python train.py --data voc.yaml
4
  # Default dataset location is next to /yolov5:
5
  # /parent_folder
@@ -7,6 +6,9 @@
7
  # /yolov5
8
 
9
 
 
 
 
10
  # train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/]
11
  train: ../VOC/images/train/ # 16551 images
12
  val: ../VOC/images/val/ # 4952 images
 
1
  # PASCAL VOC dataset http://host.robots.ox.ac.uk/pascal/VOC/
 
2
  # Train command: python train.py --data voc.yaml
3
  # Default dataset location is next to /yolov5:
4
  # /parent_folder
 
6
  # /yolov5
7
 
8
 
9
+ # download command/URL (optional)
10
+ download: bash data/scripts/get_voc.sh
11
+
12
  # train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/]
13
  train: ../VOC/images/train/ # 16551 images
14
  val: ../VOC/images/val/ # 4952 images
test.py CHANGED
@@ -13,7 +13,7 @@ from tqdm import tqdm
13
  from models.experimental import attempt_load
14
  from utils.datasets import create_dataloader
15
  from utils.general import (
16
- coco80_to_coco91_class, check_file, check_img_size, compute_loss, non_max_suppression,
17
  scale_coords, xyxy2xywh, clip_coords, plot_images, xywh2xyxy, box_iou, output_to_target, ap_per_class)
18
  from utils.torch_utils import select_device, time_synchronized
19
 
@@ -68,6 +68,7 @@ def test(data,
68
  model.eval()
69
  with open(data) as f:
70
  data = yaml.load(f, Loader=yaml.FullLoader) # model dict
 
71
  nc = 1 if single_cls else int(data['nc']) # number of classes
72
  iouv = torch.linspace(0.5, 0.95, 10).to(device) # iou vector for mAP@0.5:0.95
73
  niou = iouv.numel()
 
13
  from models.experimental import attempt_load
14
  from utils.datasets import create_dataloader
15
  from utils.general import (
16
+ coco80_to_coco91_class, check_dataset, check_file, check_img_size, compute_loss, non_max_suppression,
17
  scale_coords, xyxy2xywh, clip_coords, plot_images, xywh2xyxy, box_iou, output_to_target, ap_per_class)
18
  from utils.torch_utils import select_device, time_synchronized
19
 
 
68
  model.eval()
69
  with open(data) as f:
70
  data = yaml.load(f, Loader=yaml.FullLoader) # model dict
71
+ check_dataset(data) # check
72
  nc = 1 if single_cls else int(data['nc']) # number of classes
73
  iouv = torch.linspace(0.5, 0.95, 10).to(device) # iou vector for mAP@0.5:0.95
74
  niou = iouv.numel()
train.py CHANGED
@@ -21,9 +21,9 @@ import test # import test.py to get mAP after each epoch
21
  from models.yolo import Model
22
  from utils.datasets import create_dataloader
23
  from utils.general import (
24
- check_img_size, torch_distributed_zero_first, labels_to_class_weights, plot_labels, check_anchors,
25
- labels_to_image_weights, compute_loss, plot_images, fitness, strip_optimizer, plot_results,
26
- get_latest_run, check_git_status, check_file, increment_dir, print_mutation, plot_evolution)
27
  from utils.google_utils import attempt_download
28
  from utils.torch_utils import init_seeds, ModelEMA, select_device, intersect_dicts
29
 
@@ -51,6 +51,8 @@ def train(hyp, opt, device, tb_writer=None):
51
  init_seeds(2 + rank)
52
  with open(opt.data) as f:
53
  data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict
 
 
54
  train_path = data_dict['train']
55
  test_path = data_dict['val']
56
  nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names']) # number classes, names
 
21
  from models.yolo import Model
22
  from utils.datasets import create_dataloader
23
  from utils.general import (
24
+ torch_distributed_zero_first, labels_to_class_weights, plot_labels, check_anchors, labels_to_image_weights,
25
+ compute_loss, plot_images, fitness, strip_optimizer, plot_results, get_latest_run, check_dataset, check_file,
26
+ check_git_status, check_img_size, increment_dir, print_mutation, plot_evolution)
27
  from utils.google_utils import attempt_download
28
  from utils.torch_utils import init_seeds, ModelEMA, select_device, intersect_dicts
29
 
 
51
  init_seeds(2 + rank)
52
  with open(opt.data) as f:
53
  data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict
54
+ with torch_distributed_zero_first(rank):
55
+ check_dataset(data_dict) # check
56
  train_path = data_dict['train']
57
  test_path = data_dict['val']
58
  nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names']) # number classes, names
utils/general.py CHANGED
@@ -128,6 +128,25 @@ def check_file(file):
128
  return files[0] # return first file if multiple found
129
 
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  def make_divisible(x, divisor):
132
  # Returns x evenly divisible by divisor
133
  return math.ceil(x / divisor) * divisor
 
128
  return files[0] # return first file if multiple found
129
 
130
 
131
def check_dataset(dict):
    """Verify that the dataset named in a data config exists; try to download it if not.

    Args:
        dict: Parsed dataset YAML. Reads the 'train' and 'val' entries as
            filesystem paths and, when either path is missing, the optional
            'download' entry (a URL ending in '.zip', or a shell command).
            The parameter name shadows the builtin but is kept unchanged so
            existing callers are unaffected.

    Raises:
        Exception: If the dataset paths do not exist and the config provides
            no 'download' entry.
    """
    train, val = os.path.abspath(dict['train']), os.path.abspath(dict['val'])  # data paths
    if os.path.exists(train) and os.path.exists(val):
        return  # dataset already in place, nothing to do

    print('\nWARNING: Dataset not found, nonexistant paths: %s' % [train, val])
    if 'download' not in dict:
        # The exception used to be constructed but never raised, so a missing
        # dataset passed this check silently and only failed later on.
        raise Exception('Dataset autodownload unavailable.')

    s = dict['download']
    print('Attempting autodownload from: %s' % s)
    if s.startswith('http') and s.endswith('.zip'):  # URL
        f = Path(s).name  # filename
        torch.hub.download_url_to_file(s, f)
        # Unpack into the parent directory, then remove the archive.
        r = os.system('unzip -q %s -d ../ && rm %s' % (f, f))
    else:  # bash script
        r = os.system(s)
    print('Dataset autodownload %s\n' % ('success' if r == 0 else 'failure'))  # analyze return value
148
+
149
+
150
  def make_divisible(x, divisor):
151
  # Returns x evenly divisible by divisor
152
  return math.ceil(x / divisor) * divisor