glenn-jocher committed
Commit 63dd65e
1 Parent(s): 264be1a

Update train.py (#4136)


* Refactor train.py

* Update imports

* Update imports

* Update optimizer

* cleanup

Files changed (3)
  1. train.py +47 -57
  2. utils/general.py +1 -1
  3. utils/loss.py +1 -1
train.py CHANGED
@@ -17,15 +17,13 @@ from threading import Thread
 
 import math
 import numpy as np
+import torch
 import torch.distributed as dist
 import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-import torch.optim.lr_scheduler as lr_scheduler
-import torch.utils.data
 import yaml
 from torch.cuda import amp
 from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.optim import Adam, SGD, lr_scheduler
 from torch.utils.tensorboard import SummaryWriter
 from tqdm import tqdm
 
@@ -58,16 +56,13 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
           device,
           ):
     save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, = \
-        opt.save_dir, opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
+        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
         opt.resume, opt.noval, opt.nosave, opt.workers
 
     # Directories
-    save_dir = Path(save_dir)
-    wdir = save_dir / 'weights'
-    wdir.mkdir(parents=True, exist_ok=True)  # make dir
-    last = wdir / 'last.pt'
-    best = wdir / 'best.pt'
-    results_file = save_dir / 'results.txt'
+    w = save_dir / 'weights'  # weights dir
+    w.mkdir(parents=True, exist_ok=True)  # make dir
+    last, best, results_file = w / 'last.pt', w / 'best.pt', save_dir / 'results.txt'
 
     # Hyperparameters
     if isinstance(hyp, str):
@@ -92,7 +87,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
     loggers = {'wandb': None, 'tb': None}  # loggers dict
     if RANK in [-1, 0]:
         # TensorBoard
-        if not evolve:
+        if plots:
             prefix = colorstr('tensorboard: ')
             LOGGER.info(f"{prefix}Start with 'tensorboard --logdir {opt.project}', view at http://localhost:6006/")
             loggers['tb'] = SummaryWriter(str(save_dir))
@@ -105,11 +100,11 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
         loggers['wandb'] = wandb_logger.wandb
         if loggers['wandb']:
             data_dict = wandb_logger.data_dict
-            weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp  # may update weights, epochs if resuming
+            weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp  # may update values if resuming
 
     nc = 1 if single_cls else int(data_dict['nc'])  # number of classes
     names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names
-    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, data)  # check
+    assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}'  # check
     is_coco = data.endswith('coco.yaml') and nc == 80  # COCO dataset
 
     # Model
@@ -120,23 +115,22 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
         ckpt = torch.load(weights, map_location=device)  # load checkpoint
         model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create
         exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else []  # exclude keys
-        state_dict = ckpt['model'].float().state_dict()  # to FP32
-        state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
-        model.load_state_dict(state_dict, strict=False)  # load
-        LOGGER.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
+        csd = ckpt['model'].float().state_dict()  # checkpoint state_dict as FP32
+        csd = intersect_dicts(csd, model.state_dict(), exclude=exclude)  # intersect
+        model.load_state_dict(csd, strict=False)  # load
+        LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}')  # report
     else:
         model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create
     with torch_distributed_zero_first(RANK):
         check_dataset(data_dict)  # check
-    train_path = data_dict['train']
-    val_path = data_dict['val']
+    train_path, val_path = data_dict['train'], data_dict['val']
 
     # Freeze
     freeze = []  # parameter names to freeze (full or partial)
     for k, v in model.named_parameters():
         v.requires_grad = True  # train all layers
         if any(x in k for x in freeze):
-            print('freezing %s' % k)
+            print(f'freezing {k}')
             v.requires_grad = False
 
     # Optimizer
@@ -145,33 +139,32 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
     hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay
     LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}")
 
-    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
-    for k, v in model.named_modules():
-        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
-            pg2.append(v.bias)  # biases
-        if isinstance(v, nn.BatchNorm2d):
-            pg0.append(v.weight)  # no decay
-        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
-            pg1.append(v.weight)  # apply decay
+    g0, g1, g2 = [], [], []  # optimizer parameter groups
+    for v in model.modules():
+        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):  # bias
+            g2.append(v.bias)
+        if isinstance(v, nn.BatchNorm2d):  # weight (no decay)
+            g0.append(v.weight)
+        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):  # weight (with decay)
+            g1.append(v.weight)
 
     if opt.adam:
-        optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
+        optimizer = Adam(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
     else:
-        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
+        optimizer = SGD(g0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
 
-    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
-    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
-    LOGGER.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
-    del pg0, pg1, pg2
+    optimizer.add_param_group({'params': g1, 'weight_decay': hyp['weight_decay']})  # add g1 with weight_decay
+    optimizer.add_param_group({'params': g2})  # add g2 (biases)
+    LOGGER.info(f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups "
+                f"{len(g0)} weight (no decay), {len(g1)} weight (with decay), {len(g2)} bias")
+    del g0, g1, g2
 
-    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
-    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
+    # Scheduler
     if opt.linear_lr:
         lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp['lrf']  # linear
     else:
         lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
-    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
-    # plot_lr_scheduler(optimizer, scheduler, epochs)
+    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # plot_lr_scheduler(optimizer, scheduler, epochs)
 
     # EMA
     ema = ModelEMA(model) if RANK in [-1, 0] else None
@@ -196,13 +189,12 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
         # Epochs
         start_epoch = ckpt['epoch'] + 1
         if resume:
-            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
+            assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.'
         if epochs < start_epoch:
-            LOGGER.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
-                        (weights, ckpt['epoch'], epochs))
+            LOGGER.info(f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs.")
             epochs += ckpt['epoch']  # finetune additional epochs
 
-        del ckpt, state_dict
+        del ckpt, csd
 
     # Image sizes
     gs = max(int(model.stride.max()), 32)  # grid size (max stride)
@@ -217,7 +209,6 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
 
     # SyncBatchNorm
     if opt.sync_bn and cuda and RANK != -1:
-        raise Exception('can not train with --sync-bn, known issue https://github.com/ultralytics/yolov5/issues/3998')
         model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
         LOGGER.info('Using SyncBatchNorm()')
 
@@ -228,7 +219,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
                                               prefix=colorstr('train: '))
     mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
     nb = len(train_loader)  # number of batches
-    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, data, nc - 1)
+    assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}'
 
     # Process 0
     if RANK in [-1, 0]:
@@ -261,7 +252,6 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
     hyp['label_smoothing'] = opt.label_smoothing
     model.nc = nc  # attach number of classes to model
     model.hyp = hyp  # attach hyperparameters to model
-    model.gr = 1.0  # iou loss ratio (obj_loss = 1.0 or iou)
     model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights
     model.names = names
 
@@ -315,7 +305,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
             # Warmup
             if ni <= nw:
                 xi = [0, nw]  # x interp
-                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
+                # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                 accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())
                 for j, x in enumerate(optimizer.param_groups):
                     # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
@@ -329,7 +319,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
                 sf = sz / max(imgs.shape[2:])  # scale factor
                 if sf != 1:
                     ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
-                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
+                    imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
 
             # Forward
             with amp.autocast(enabled=cuda):
@@ -355,7 +345,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
             # Print
             if RANK in [-1, 0]:
                 mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
-                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
+                mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'  # (GB)
                 s = ('%10s' * 2 + '%10.4g' * 6) % (
                     f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1])
                 pbar.set_description(s)
@@ -381,7 +371,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
         # DDP process 0 or single-GPU
         if RANK in [-1, 0]:
             # mAP
-            ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights'])
+            ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights'])
            final_epoch = epoch + 1 == epochs
             if not noval or final_epoch:  # Calculate mAP
                 wandb_logger.current_epoch = epoch + 1
@@ -457,6 +447,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
                                             batch_size=batch_size // WORLD_SIZE * 2,
                                             imgsz=imgsz,
                                             model=attempt_load(m, device).half(),
+                                            iou_thres=0.7,  # NMS IoU threshold for best pycocotools results
                                             single_cls=single_cls,
                                             dataloader=val_loader,
                                             save_dir=save_dir,
@@ -525,8 +516,7 @@ def main(opt):
     check_requirements(exclude=['thop'])
 
     # Resume
-    wandb_run = check_wandb_resume(opt)
-    if opt.resume and not wandb_run:  # resume an interrupted run
+    if opt.resume and not check_wandb_resume(opt):  # resume an interrupted run
         ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run()  # specified or most recent path
         assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
         with open(Path(ckpt).parent.parent / 'opt.yaml') as f:
@@ -534,7 +524,6 @@ def main(opt):
         opt.cfg, opt.weights, opt.resume = '', ckpt, True  # reinstate
         LOGGER.info(f'Resuming training from {ckpt}')
     else:
-        # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml')
         opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp)  # check files
         assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
         opt.name = 'evolve' if opt.evolve else opt.name
@@ -545,11 +534,13 @@ def main(opt):
     if LOCAL_RANK != -1:
         from datetime import timedelta
         assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command'
+        assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
+        assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
+        assert not opt.evolve, '--evolve argument is not compatible with DDP training'
+        assert not opt.sync_bn, '--sync-bn known training issue, see https://github.com/ultralytics/yolov5/issues/3998'
         torch.cuda.set_device(LOCAL_RANK)
         device = torch.device('cuda', LOCAL_RANK)
         dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60))
-        assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
-        assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
 
     # Train
     if not opt.evolve:
@@ -594,7 +585,6 @@ def main(opt):
             hyp = yaml.safe_load(f)  # load hyps dict
             if 'anchors' not in hyp:  # anchors commented in hyp.yaml
                 hyp['anchors'] = 3
-        assert LOCAL_RANK == -1, 'DDP mode not implemented for --evolve'
        opt.noval, opt.nosave = True, True  # only val/save final epoch
         # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
         yaml_file = Path(opt.save_dir) / 'hyp_evolved.yaml'  # save best result here
@@ -646,7 +636,7 @@ def main(opt):
 
 
 def run(**kwargs):
-    # Usage: import train; train.run(imgsz=320, weights='yolov5m.pt')
+    # Usage: import train; train.run(data='coco128.yaml', imgsz=320, weights='yolov5m.pt')
     opt = parse_opt(True)
     for k, v in kwargs.items():
         setattr(opt, k, v)
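The parameter-group refactor above is the heart of the "Update optimizer" change: BatchNorm scales go in a base group with no weight decay, other weights get decay, and biases get their own group. A minimal standalone sketch of that split on a toy model; the model and the lr/momentum/decay values here are illustrative assumptions, not part of the commit:

import torch.nn as nn
from torch.optim import SGD

model = nn.Sequential(nn.Conv2d(3, 16, 3), nn.BatchNorm2d(16), nn.ReLU())

g0, g1, g2 = [], [], []  # BN weights (no decay), other weights (decay), biases
for v in model.modules():
    if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
        g2.append(v.bias)  # conv and BN biases
    if isinstance(v, nn.BatchNorm2d):
        g0.append(v.weight)  # BN scale: never decayed
    elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
        g1.append(v.weight)  # conv weight: decayed

optimizer = SGD(g0, lr=0.01, momentum=0.937, nesterov=True)  # base group, no decay
optimizer.add_param_group({'params': g1, 'weight_decay': 5e-4})  # decay applies only to g1
optimizer.add_param_group({'params': g2})  # biases, no decay
print([len(g['params']) for g in optimizer.param_groups])  # [1, 1, 2]

Decaying BatchNorm scales and biases tends to hurt accuracy, which is why only the plain weights receive weight_decay.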
 
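The warmup hunk (@@ -315,7 +305,7 @@) keeps using np.interp to ramp gradient accumulation from 1 up to the nominal-batch ratio over the first nw iterations. A quick sketch with assumed values for nbs, batch_size and nw:

import numpy as np

nbs, batch_size, nw = 64, 16, 1000  # nominal batch, actual batch, warmup iterations (assumed)
for ni in (0, 500, 1000):  # integrated batch counter
    accumulate = max(1, np.interp(ni, [0, nw], [1, nbs / batch_size]).round())
    print(ni, accumulate)  # 0 -> 1.0, 500 -> 2.0, 1000 -> 4.0 accumulation steps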
utils/general.py CHANGED
@@ -301,7 +301,7 @@ def clean_str(s):
 
 
 def one_cycle(y1=0.0, y2=1.0, steps=100):
-    # lambda function for sinusoidal ramp from y1 to y2
+    # lambda function for sinusoidal ramp from y1 to y2 https://arxiv.org/pdf/1812.01187.pdf
     return lambda x: ((1 - math.cos(x * math.pi / steps)) / 2) * (y2 - y1) + y1
 
 
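The added link points at the "Bag of Tricks" paper that motivates the cosine ramp. A quick sanity check of the lambda, assuming hyp['lrf'] = 0.1 and 100 epochs as example values:

import math

def one_cycle(y1=0.0, y2=1.0, steps=100):
    # lambda function for sinusoidal ramp from y1 to y2
    return lambda x: ((1 - math.cos(x * math.pi / steps)) / 2) * (y2 - y1) + y1

lf = one_cycle(1, 0.1, 100)  # cosine 1 -> 0.1, as train.py builds it from hyp['lrf'] and epochs
print(lf(0), lf(50), lf(100))  # ~1.0, ~0.55, ~0.1: slow start and finish, fast middle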
utils/loss.py CHANGED
@@ -108,7 +108,7 @@ class ComputeLoss:
         det = model.module.model[-1] if is_parallel(model) else model.model[-1]  # Detect() module
         self.balance = {3: [4.0, 1.0, 0.4]}.get(det.nl, [4.0, 1.0, 0.25, 0.06, .02])  # P3-P7
         self.ssi = list(det.stride).index(16) if autobalance else 0  # stride 16 index
-        self.BCEcls, self.BCEobj, self.gr, self.hyp, self.autobalance = BCEcls, BCEobj, model.gr, h, autobalance
+        self.BCEcls, self.BCEobj, self.gr, self.hyp, self.autobalance = BCEcls, BCEobj, 1.0, h, autobalance
         for k in 'na', 'nc', 'nl', 'anchors':
             setattr(self, k, getattr(det, k))
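For context on why gr can move off the model: inside ComputeLoss the objectness target blends a constant 1.0 with the predicted box IoU, weighted by gr (a line of the form (1.0 - self.gr) + self.gr * iou, paraphrased from utils/loss.py). With gr now hard-coded to 1.0 here instead of read from model.gr, the target is the IoU itself. A toy sketch with made-up IoU values:

import torch

iou = torch.tensor([0.9, 0.5, 0.1])  # per-target box IoUs (made up)
for gr in (0.0, 0.5, 1.0):
    tobj = (1.0 - gr) + gr * iou.clamp(0)  # objectness target blend
    print(gr, [round(v, 2) for v in tobj.tolist()])
# gr=0.0 -> [1.0, 1.0, 1.0] (pure objectness), gr=1.0 -> [0.9, 0.5, 0.1] (pure IoU)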