glenn-jocher committed on
Commit 16f6834
Parent: 1b9e28e

update train.py and experimental.py

Files changed (2):
  1. models/experimental.py +4 -1
  2. train.py +10 -12
models/experimental.py CHANGED
@@ -119,7 +119,10 @@ class Ensemble(nn.ModuleList):
         y = []
         for module in self:
             y.append(module(x, augment)[0])
-        return torch.cat(y, 1), None  # ensembled inference output, train output
+        # y = torch.stack(y).max(0)[0]  # max ensemble
+        # y = torch.cat(y, 1)  # nms ensemble
+        y = torch.stack(y).mean(0)  # mean ensemble
+        return y, None  # inference, train output


 def attempt_load(weights, map_location=None):
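
Note on the Ensemble change above: the forward pass now averages the stacked per-model outputs instead of concatenating candidates for a later NMS pass. A minimal sketch of the three strategies named in the comments, using dummy tensors; the (batch, candidates, outputs) shapes are assumptions for illustration, not values taken from this repository:

    import torch

    # Stand-ins for per-model YOLO inference outputs, each shaped (batch, candidates, outputs)
    y = [torch.rand(1, 100, 85) for _ in range(3)]

    mean_ensemble = torch.stack(y).mean(0)   # average across models          -> (1, 100, 85)
    max_ensemble = torch.stack(y).max(0)[0]  # element-wise max across models -> (1, 100, 85)
    nms_ensemble = torch.cat(y, 1)           # append candidates, merge later with NMS -> (1, 300, 85)

    print(mean_ensemble.shape, max_ensemble.shape, nms_ensemble.shape)

Mean and max ensembling keep the candidate count fixed, while the cat variant grows it and defers the merging to NMS.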
train.py CHANGED
@@ -101,11 +101,13 @@ def train(hyp):
         optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
     optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
     optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
+    print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
+    del pg0, pg1, pg2
+
     # Scheduler https://arxiv.org/pdf/1812.01187.pdf
     lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.9 + 0.1  # cosine
     scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
-    print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
-    del pg0, pg1, pg2
+    # plot_lr_scheduler(optimizer, scheduler, epochs)

     # Load Model
     google_utils.attempt_download(weights)
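
Note on the scheduler block above: the cosine lambda maps epoch x to a multiplier that decays from 1.0 at the start of training to 0.1 at the end, and LambdaLR multiplies each param group's base lr by it. A small sketch of that factor, with an assumed epochs value purely for illustration:

    import math

    epochs = 300  # assumed run length, for illustration only
    lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.9 + 0.1  # cosine 1.0 -> 0.1

    for e in (0, epochs // 2, epochs - 1):
        print(e, round(lf(e), 3))  # 0 -> 1.0, mid-run -> 0.55, last epoch -> ~0.1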
@@ -147,12 +149,7 @@ def train(hyp):
     if mixed_precision:
         model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

-
-    scheduler.last_epoch = start_epoch - 1  # do not move
-    # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
-    # plot_lr_scheduler(optimizer, scheduler, epochs)
-
-    # Initialize distributed training
+    # Distributed training
     if device.type != 'cpu' and torch.cuda.device_count() > 1 and torch.distributed.is_available():
         dist.init_process_group(backend='nccl',  # distributed backend
                                 init_method='tcp://127.0.0.1:9999',  # init method
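
Note on the hunk above: only the comment changes ('Initialize distributed training' becomes 'Distributed training'); the single-node init_process_group call itself is untouched. For readers unfamiliar with that call, a hedged sketch of a single-machine setup; the helper name and the world_size/rank values are assumptions for a two-GPU example, not code from train.py:

    import torch
    import torch.distributed as dist

    def init_distributed(rank, world_size):  # hypothetical helper for a 2-GPU, single-node run
        dist.init_process_group(backend='nccl',                       # same backend as train.py
                                init_method='tcp://127.0.0.1:9999',   # same TCP rendezvous address
                                world_size=world_size,                # total number of processes
                                rank=rank)                            # this process's index
        torch.cuda.set_device(rank)                                   # one GPU per process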
@@ -198,9 +195,10 @@ def train(hyp):
     # Start training
     t0 = time.time()
     nb = len(dataloader)  # number of batches
-    n_burn = max(3 * nb, 1e3)  # burn-in iterations, max(3 epochs, 1k iterations)
+    nw = max(3 * nb, 1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
     maps = np.zeros(nc)  # mAP per class
     results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
+    scheduler.last_epoch = start_epoch - 1  # do not move
     print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
     print('Using %g dataloader workers' % dataloader.num_workers)
     print('Starting training for %g epochs...' % epochs)
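
Note on scheduler.last_epoch above: when resuming from a checkpoint, realigning last_epoch just before the epoch loop makes the next scheduler.step() compute the learning rate for start_epoch instead of restarting the cosine schedule from zero. A minimal sketch with a toy optimizer and assumed epoch numbers, for illustration only:

    import math
    import torch
    from torch import optim
    from torch.optim import lr_scheduler

    epochs, start_epoch = 300, 100  # assumed values for a resumed run
    lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.9 + 0.1

    opt = optim.SGD([torch.zeros(1, requires_grad=True)], lr=0.01)
    sched = lr_scheduler.LambdaLR(opt, lr_lambda=lf)

    sched.last_epoch = start_epoch - 1  # realign before the resumed epoch loop
    sched.step()                        # applies lf(start_epoch) rather than lf(1)
    print(opt.param_groups[0]['lr'])    # 0.01 * lf(100)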
@@ -225,9 +223,9 @@ def train(hyp):
             ni = i + nb * epoch  # number integrated batches (since train start)
             imgs = imgs.to(device).float() / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0

-            # Burn-in
-            if ni <= n_burn:
-                xi = [0, n_burn]  # x interp
+            # Warmup
+            if ni <= nw:
+                xi = [0, nw]  # x interp
                 # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                 accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())
                 for j, x in enumerate(optimizer.param_groups):
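
Note on the warmup block above: during the first nw iterations, np.interp linearly ramps the gradient-accumulation factor (and, in the loop over optimizer.param_groups that follows, the per-group settings) toward their nominal values. A small sketch of the accumulate ramp with assumed nb, nbs and batch_size values, for illustration only:

    import numpy as np

    nb = 500                   # assumed batches per epoch
    nw = max(3 * nb, 1e3)      # warmup iterations: at least 3 epochs or 1k iterations
    nbs, batch_size = 64, 16   # assumed nominal vs. actual batch size

    for ni in (0, int(nw) // 2, int(nw)):
        accumulate = max(1, np.interp(ni, [0, nw], [1, nbs / batch_size]).round())
        print(ni, accumulate)  # ramps from 1 toward nbs / batch_size == 4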
 