Update train.py (#4136)
* Refactor train.py
* Update imports
* Update imports
* Update optimizer
* cleanup
- train.py +47 -57
- utils/general.py +1 -1
- utils/loss.py +1 -1
train.py
CHANGED
@@ -17,15 +17,13 @@ from threading import Thread
 
 import math
 import numpy as np
+import torch
 import torch.distributed as dist
 import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-import torch.optim.lr_scheduler as lr_scheduler
-import torch.utils.data
 import yaml
 from torch.cuda import amp
 from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.optim import Adam, SGD, lr_scheduler
 from torch.utils.tensorboard import SummaryWriter
 from tqdm import tqdm
 
@@ -58,16 +56,13 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
           device,
           ):
     save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, = \
-        opt.save_dir, opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
+        Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \
         opt.resume, opt.noval, opt.nosave, opt.workers
 
     # Directories
-    save_dir = Path(save_dir)
-    wdir = save_dir / 'weights'
-    wdir.mkdir(parents=True, exist_ok=True)  # make dir
-    last = wdir / 'last.pt'
-    best = wdir / 'best.pt'
-    results_file = save_dir / 'results.txt'
+    w = save_dir / 'weights'  # weights dir
+    w.mkdir(parents=True, exist_ok=True)  # make dir
+    last, best, results_file = w / 'last.pt', w / 'best.pt', save_dir / 'results.txt'
 
     # Hyperparameters
     if isinstance(hyp, str):
@@ -92,7 +87,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
     loggers = {'wandb': None, 'tb': None}  # loggers dict
     if RANK in [-1, 0]:
         # TensorBoard
-        if not evolve:
+        if plots:
             prefix = colorstr('tensorboard: ')
             LOGGER.info(f"{prefix}Start with 'tensorboard --logdir {opt.project}', view at http://localhost:6006/")
             loggers['tb'] = SummaryWriter(str(save_dir))
@@ -105,11 +100,11 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
         loggers['wandb'] = wandb_logger.wandb
         if loggers['wandb']:
             data_dict = wandb_logger.data_dict
-            weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp  # may update
+            weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp  # may update values if resuming
 
     nc = 1 if single_cls else int(data_dict['nc'])  # number of classes
     names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names']  # class names
-    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, data)  # check
+    assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}'  # check
    is_coco = data.endswith('coco.yaml') and nc == 80  # COCO dataset
 
     # Model
@@ -120,23 +115,22 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
         ckpt = torch.load(weights, map_location=device)  # load checkpoint
         model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create
         exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else []  # exclude keys
-        state_dict = ckpt['model'].float().state_dict()  # to FP32
-        state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude)  # intersect
-        model.load_state_dict(state_dict, strict=False)  # load
-        LOGGER.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
+        csd = ckpt['model'].float().state_dict()  # checkpoint state_dict as FP32
+        csd = intersect_dicts(csd, model.state_dict(), exclude=exclude)  # intersect
+        model.load_state_dict(csd, strict=False)  # load
+        LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}')  # report
     else:
         model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device)  # create
     with torch_distributed_zero_first(RANK):
         check_dataset(data_dict)  # check
-    train_path = data_dict['train']
-    val_path = data_dict['val']
+    train_path, val_path = data_dict['train'], data_dict['val']
 
     # Freeze
     freeze = []  # parameter names to freeze (full or partial)
     for k, v in model.named_parameters():
         v.requires_grad = True  # train all layers
         if any(x in k for x in freeze):
-            print('freezing %s' % k)
+            print(f'freezing {k}')
             v.requires_grad = False
 
     # Optimizer
@@ -145,33 +139,32 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
     hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay
     LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}")
 
-    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
-    for k, v in model.named_modules():
-        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
-            pg2.append(v.bias)  # biases
-        if isinstance(v, nn.BatchNorm2d):
-            pg0.append(v.weight)  # no decay
-        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
-            pg1.append(v.weight)  # apply decay
+    g0, g1, g2 = [], [], []  # optimizer parameter groups
+    for v in model.modules():
+        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):  # bias
+            g2.append(v.bias)
+        if isinstance(v, nn.BatchNorm2d):  # BatchNorm2d weight (no decay)
+            g0.append(v.weight)
+        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):  # weight (with decay)
+            g1.append(v.weight)
 
     if opt.adam:
-        optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
+        optimizer = Adam(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999))  # adjust beta1 to momentum
     else:
-        optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
+        optimizer = SGD(g0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
 
-    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
-    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
-    LOGGER.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
-    del pg0, pg1, pg2
+    optimizer.add_param_group({'params': g1, 'weight_decay': hyp['weight_decay']})  # add g1 with weight_decay
+    optimizer.add_param_group({'params': g2})  # add g2 (biases)
+    LOGGER.info(f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups "
+                f"{len(g0)} weight (no decay), {len(g1)} weight, {len(g2)} bias")
+    del g0, g1, g2
 
-    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
-    # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR
+    # Scheduler
     if opt.linear_lr:
         lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp['lrf']  # linear
     else:
         lf = one_cycle(1, hyp['lrf'], epochs)  # cosine 1->hyp['lrf']
-    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
-    # plot_lr_scheduler(optimizer, scheduler, epochs)
+    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)  # plot_lr_scheduler(optimizer, scheduler, epochs)
 
     # EMA
     ema = ModelEMA(model) if RANK in [-1, 0] else None
@@ -196,13 +189,12 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
         # Epochs
         start_epoch = ckpt['epoch'] + 1
         if resume:
-            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs)
+            assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.'
         if epochs < start_epoch:
-            LOGGER.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
-                        (weights, ckpt['epoch'], epochs))
+            LOGGER.info(f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs.")
             epochs += ckpt['epoch']  # finetune additional epochs
 
-        del ckpt, state_dict
+        del ckpt, csd
 
     # Image sizes
     gs = max(int(model.stride.max()), 32)  # grid size (max stride)
@@ -217,7 +209,6 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
 
     # SyncBatchNorm
     if opt.sync_bn and cuda and RANK != -1:
-        raise Exception('can not train with --sync-bn, known issue https://github.com/ultralytics/yolov5/issues/3998')
         model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
         LOGGER.info('Using SyncBatchNorm()')
 
@@ -228,7 +219,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
                                               prefix=colorstr('train: '))
     mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
     nb = len(train_loader)  # number of batches
-    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, data, nc - 1)
+    assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}'
 
     # Process 0
     if RANK in [-1, 0]:
@@ -261,7 +252,6 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
     hyp['label_smoothing'] = opt.label_smoothing
     model.nc = nc  # attach number of classes to model
     model.hyp = hyp  # attach hyperparameters to model
-    model.gr = 1.0  # iou loss ratio (obj_loss = 1.0 or iou)
     model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc  # attach class weights
     model.names = names
 
@@ -315,7 +305,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
             # Warmup
            if ni <= nw:
                 xi = [0, nw]  # x interp
-                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
+                # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0])  # iou loss ratio (obj_loss = 1.0 or iou)
                 accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())
                 for j, x in enumerate(optimizer.param_groups):
                     # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
@@ -329,7 +319,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
                 sf = sz / max(imgs.shape[2:])  # scale factor
                 if sf != 1:
                     ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]]  # new shape (stretched to gs-multiple)
-                    imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
+                    imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)
 
             # Forward
             with amp.autocast(enabled=cuda):
@@ -355,7 +345,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
             # Print
             if RANK in [-1, 0]:
                 mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
-                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
+                mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G'  # (GB)
                 s = ('%10s' * 2 + '%10.4g' * 6) % (
                     f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1])
                 pbar.set_description(s)
@@ -381,7 +371,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
         # DDP process 0 or single-GPU
         if RANK in [-1, 0]:
             # mAP
-            ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights'])
+            ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights'])
             final_epoch = epoch + 1 == epochs
             if not noval or final_epoch:  # Calculate mAP
                 wandb_logger.current_epoch = epoch + 1
@@ -457,6 +447,7 @@ def train(hyp,  # path/to/hyp.yaml or hyp dictionary
                                            batch_size=batch_size // WORLD_SIZE * 2,
                                            imgsz=imgsz,
                                            model=attempt_load(m, device).half(),
+                                           iou_thres=0.7,  # NMS IoU threshold for best pycocotools results
                                            single_cls=single_cls,
                                            dataloader=val_loader,
                                            save_dir=save_dir,
@@ -525,8 +516,7 @@ def main(opt):
     check_requirements(exclude=['thop'])
 
     # Resume
-    wandb_run = check_wandb_resume(opt)
-    if opt.resume and not wandb_run:  # resume an interrupted run
+    if opt.resume and not check_wandb_resume(opt):  # resume an interrupted run
         ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run()  # specified or most recent path
         assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist'
         with open(Path(ckpt).parent.parent / 'opt.yaml') as f:
@@ -534,7 +524,6 @@ def main(opt):
         opt.cfg, opt.weights, opt.resume = '', ckpt, True  # reinstate
         LOGGER.info(f'Resuming training from {ckpt}')
     else:
-        # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml')
         opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp)  # check files
         assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified'
         opt.name = 'evolve' if opt.evolve else opt.name
@@ -545,11 +534,13 @@ def main(opt):
     if LOCAL_RANK != -1:
         from datetime import timedelta
         assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command'
+        assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
+        assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
+        assert not opt.evolve, '--evolve argument is not compatible with DDP training'
+        assert not opt.sync_bn, '--sync-bn known training issue, see https://github.com/ultralytics/yolov5/issues/3998'
         torch.cuda.set_device(LOCAL_RANK)
         device = torch.device('cuda', LOCAL_RANK)
         dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60))
-        assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count'
-        assert not opt.image_weights, '--image-weights argument is not compatible with DDP training'
 
     # Train
     if not opt.evolve:
@@ -594,7 +585,6 @@ def main(opt):
            hyp = yaml.safe_load(f)  # load hyps dict
            if 'anchors' not in hyp:  # anchors commented in hyp.yaml
                hyp['anchors'] = 3
-        assert LOCAL_RANK == -1, 'DDP mode not implemented for --evolve'
        opt.noval, opt.nosave = True, True  # only val/save final epoch
        # ei = [isinstance(x, (int, float)) for x in hyp.values()]  # evolvable indices
        yaml_file = Path(opt.save_dir) / 'hyp_evolved.yaml'  # save best result here
@@ -646,7 +636,7 @@ def main(opt):
 
 
 def run(**kwargs):
-    # Usage: import train; train.run(imgsz=320, weights='yolov5m.pt')
+    # Usage: import train; train.run(data='coco128.yaml', imgsz=320, weights='yolov5m.pt')
     opt = parse_opt(True)
     for k, v in kwargs.items():
         setattr(opt, k, v)
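Note on the optimizer refactor above: parameters are now collected into three groups before the optimizer is built, g0 (BatchNorm weights, no decay), g1 (all other weights, decayed via add_param_group) and g2 (biases, no decay), and Adam/SGD are imported directly from torch.optim instead of through optim.*. Below is a minimal, standalone sketch of the same grouping pattern on a generic nn.Module; build_optimizer and its default hyperparameter values are illustrative and not part of this PR.

import torch.nn as nn
from torch.optim import SGD


def build_optimizer(model, lr=0.01, momentum=0.937, weight_decay=0.0005):
    g0, g1, g2 = [], [], []  # BatchNorm weights, other weights, biases
    for v in model.modules():
        if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter):
            g2.append(v.bias)  # biases -> no decay
        if isinstance(v, nn.BatchNorm2d):
            g0.append(v.weight)  # BatchNorm weights -> no decay
        elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter):
            g1.append(v.weight)  # conv/linear weights -> weight decay applied
    optimizer = SGD(g0, lr=lr, momentum=momentum, nesterov=True)              # param_groups[0]: no decay
    optimizer.add_param_group({'params': g1, 'weight_decay': weight_decay})   # param_groups[1]: decayed
    optimizer.add_param_group({'params': g2})                                 # param_groups[2]: biases, no decay
    return optimizer


# e.g. a toy module with one conv and one BatchNorm layer
opt_ = build_optimizer(nn.Sequential(nn.Conv2d(3, 16, 3), nn.BatchNorm2d(16)))
print([len(g['params']) for g in opt_.param_groups])  # [1, 1, 2]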
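The updated run() docstring above documents the programmatic entry point. A hedged usage sketch follows; it assumes the script is run from the repository root so that import train resolves, and that every extra keyword maps onto an option defined in parse_opt().

import train

# kwargs mirror the updated docstring; epochs is assumed to be a parse_opt() option
train.run(data='coco128.yaml', imgsz=320, weights='yolov5m.pt', epochs=3)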
utils/general.py
CHANGED
@@ -301,7 +301,7 @@ def clean_str(s):
 
 
 def one_cycle(y1=0.0, y2=1.0, steps=100):
-    # lambda function for sinusoidal ramp from y1 to y2
+    # lambda function for sinusoidal ramp from y1 to y2 https://arxiv.org/pdf/1812.01187.pdf
     return lambda x: ((1 - math.cos(x * math.pi / steps)) / 2) * (y2 - y1) + y1
 
 
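one_cycle() returns a multiplier lambda that ramps sinusoidally from y1 to y2 over steps; train.py passes one_cycle(1, hyp['lrf'], epochs) to lr_scheduler.LambdaLR, so the learning rate decays from lr0 toward lr0 * lrf along a cosine curve. A self-contained sketch, with illustrative values rather than values taken from a hyp file:

import math

import torch
from torch.optim import SGD, lr_scheduler


def one_cycle(y1=0.0, y2=1.0, steps=100):
    # lambda function for sinusoidal ramp from y1 to y2 (same as utils/general.py)
    return lambda x: ((1 - math.cos(x * math.pi / steps)) / 2) * (y2 - y1) + y1


epochs, lr0, lrf = 10, 0.01, 0.2  # illustrative values
optimizer = SGD([torch.zeros(1, requires_grad=True)], lr=lr0)
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=one_cycle(1, lrf, epochs))

for epoch in range(epochs):
    optimizer.step()    # placeholder step; real training code would optimize here
    scheduler.step()
    print(epoch, round(optimizer.param_groups[0]['lr'], 5))  # cosine decay from lr0 toward lr0 * lrf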
utils/loss.py
CHANGED
@@ -108,7 +108,7 @@ class ComputeLoss:
         det = model.module.model[-1] if is_parallel(model) else model.model[-1]  # Detect() module
         self.balance = {3: [4.0, 1.0, 0.4]}.get(det.nl, [4.0, 1.0, 0.25, 0.06, .02])  # P3-P7
         self.ssi = list(det.stride).index(16) if autobalance else 0  # stride 16 index
-        self.BCEcls, self.BCEobj, self.gr, self.hyp, self.autobalance = BCEcls, BCEobj, model.gr, h, autobalance
+        self.BCEcls, self.BCEobj, self.gr, self.hyp, self.autobalance = BCEcls, BCEobj, 1.0, h, autobalance
         for k in 'na', 'nc', 'nl', 'anchors':
             setattr(self, k, getattr(det, k))
 
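With model.gr = 1.0 removed from train.py, the IoU ratio now defaults to 1.0 inside ComputeLoss itself (the change above). For reference, a small sketch of how that gr term shapes the objectness target; it assumes the (1 - gr) + gr * iou blend used in ComputeLoss.__call__ and is written here as a standalone function purely for illustration.

def objectness_target(iou, gr=1.0):
    # gr = 1.0 (the new in-class default): target equals the predicted box IoU
    # gr = 0.0: target is a hard 1.0 label regardless of IoU
    return (1.0 - gr) + gr * iou


print(objectness_target(0.8))          # 0.8 -> pure IoU target
print(objectness_target(0.8, gr=0.5))  # 0.9 -> blend of 1.0 and IoU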