glenn-jocher
committed on
Commit
•
c687d5c
1
Parent(s):
bc1fd13
reorganize train initialization steps
Browse files
train.py
CHANGED
@@ -161,7 +161,7 @@ def train(hyp, opt, device, tb_writer=None):
|
|
161 |
|
162 |
# DDP mode
|
163 |
if cuda and rank != -1:
|
164 |
-
model = DDP(model, device_ids=[opt.local_rank], output_device=
|
165 |
|
166 |
# Trainloader
|
167 |
dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt,
|
@@ -171,12 +171,26 @@ def train(hyp, opt, device, tb_writer=None):
|
|
171 |
nb = len(dataloader) # number of batches
|
172 |
assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)
|
173 |
|
174 |
-
#
|
175 |
if rank in [-1, 0]:
|
176 |
ema.updates = start_epoch * nb // accumulate # set EMA updates
|
177 |
testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt,
|
178 |
hyp=hyp, augment=False, cache=opt.cache_images, rect=True, rank=-1,
|
179 |
-
world_size=opt.world_size, workers=opt.workers)[0] #
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
|
181 |
# Model parameters
|
182 |
hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset
|
@@ -186,21 +200,6 @@ def train(hyp, opt, device, tb_writer=None):
|
|
186 |
model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights
|
187 |
model.names = names
|
188 |
|
189 |
-
# Classes and Anchors
|
190 |
-
if rank in [-1, 0] and not opt.resume:
|
191 |
-
labels = np.concatenate(dataset.labels, 0)
|
192 |
-
c = torch.tensor(labels[:, 0]) # classes
|
193 |
-
# cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency
|
194 |
-
# model._initialize_biases(cf.to(device))
|
195 |
-
plot_labels(labels, save_dir=log_dir)
|
196 |
-
if tb_writer:
|
197 |
-
# tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384
|
198 |
-
tb_writer.add_histogram('classes', c, 0)
|
199 |
-
|
200 |
-
# Anchors
|
201 |
-
if not opt.noautoanchor:
|
202 |
-
check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
|
203 |
-
|
204 |
# Start training
|
205 |
t0 = time.time()
|
206 |
nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations)
|
@@ -209,10 +208,8 @@ def train(hyp, opt, device, tb_writer=None):
|
|
209 |
results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
|
210 |
scheduler.last_epoch = start_epoch - 1 # do not move
|
211 |
scaler = amp.GradScaler(enabled=cuda)
|
212 |
-
logger.info('Image sizes %g train, %g test' % (imgsz, imgsz_test))
|
213 |
-
|
214 |
-
logger.info('Starting training for %g epochs...' % epochs)
|
215 |
-
# torch.autograd.set_detect_anomaly(True)
|
216 |
for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------
|
217 |
model.train()
|
218 |
|
|
|
161 |
|
162 |
# DDP mode
|
163 |
if cuda and rank != -1:
|
164 |
+
model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank)
|
165 |
|
166 |
# Trainloader
|
167 |
dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt,
|
|
|
171 |
nb = len(dataloader) # number of batches
|
172 |
assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1)
|
173 |
|
174 |
+
# Process 0
|
175 |
if rank in [-1, 0]:
|
176 |
ema.updates = start_epoch * nb // accumulate # set EMA updates
|
177 |
testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt,
|
178 |
hyp=hyp, augment=False, cache=opt.cache_images, rect=True, rank=-1,
|
179 |
+
world_size=opt.world_size, workers=opt.workers)[0] # testloader
|
180 |
+
|
181 |
+
if not opt.resume:
|
182 |
+
labels = np.concatenate(dataset.labels, 0)
|
183 |
+
c = torch.tensor(labels[:, 0]) # classes
|
184 |
+
# cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency
|
185 |
+
# model._initialize_biases(cf.to(device))
|
186 |
+
plot_labels(labels, save_dir=log_dir)
|
187 |
+
if tb_writer:
|
188 |
+
# tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384
|
189 |
+
tb_writer.add_histogram('classes', c, 0)
|
190 |
+
|
191 |
+
# Anchors
|
192 |
+
if not opt.noautoanchor:
|
193 |
+
check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz)
|
194 |
|
195 |
# Model parameters
|
196 |
hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset
|
|
|
200 |
model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights
|
201 |
model.names = names
|
202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
# Start training
|
204 |
t0 = time.time()
|
205 |
nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations)
|
|
|
208 |
results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
|
209 |
scheduler.last_epoch = start_epoch - 1 # do not move
|
210 |
scaler = amp.GradScaler(enabled=cuda)
|
211 |
+
logger.info('Image sizes %g train, %g test\nUsing %g dataloader workers\nLogging results to %s\n'
|
212 |
+
'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs))
|
|
|
|
|
213 |
for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------
|
214 |
model.train()
|
215 |
|