glenn-jocher
committed on
Commit
•
efe60b5
1
Parent(s):
63dd65e
Refactor train.py and val.py `loggers` (#4137)
Browse files
* Update loggers
* Config
* Update val.py
* cleanup
* fix1
* fix2
* fix3 and reformat
* format sweep.py
* Logger() class
* cleanup
* cleanup2
* wandb package import fix
* wandb package import fix2
* txt fix
* fix4
* fix5
* fix6
* drop wandb into utils/loggers
* fix 7
* rename loggers/wandb_logging to loggers/wandb
* Update message
* Update message
* Update message
* cleanup
* Fix x axis bug
* fix rank 0 issue
* cleanup
- train.py +21 -66
- utils/loggers/__init__.py +129 -0
- utils/{wandb_logging → loggers/wandb}/__init__.py +0 -0
- utils/{wandb_logging → loggers/wandb}/log_dataset.py +0 -0
- utils/{wandb_logging → loggers/wandb}/sweep.py +1 -1
- utils/{wandb_logging → loggers/wandb}/sweep.yaml +1 -1
- utils/{wandb_logging → loggers/wandb}/wandb_utils.py +14 -14
- utils/plots.py +2 -3
- val.py +4 -6
train.py
CHANGED
@@ -10,7 +10,6 @@ import os
|
|
10 |
import random
|
11 |
import sys
|
12 |
import time
|
13 |
-
import warnings
|
14 |
from copy import deepcopy
|
15 |
from pathlib import Path
|
16 |
from threading import Thread
|
@@ -24,7 +23,6 @@ import yaml
|
|
24 |
from torch.cuda import amp
|
25 |
from torch.nn.parallel import DistributedDataParallel as DDP
|
26 |
from torch.optim import Adam, SGD, lr_scheduler
|
27 |
-
from torch.utils.tensorboard import SummaryWriter
|
28 |
from tqdm import tqdm
|
29 |
|
30 |
FILE = Path(__file__).absolute()
|
@@ -42,8 +40,9 @@ from utils.google_utils import attempt_download
|
|
42 |
from utils.loss import ComputeLoss
|
43 |
from utils.plots import plot_images, plot_labels, plot_results, plot_evolution
|
44 |
from utils.torch_utils import ModelEMA, select_device, intersect_dicts, torch_distributed_zero_first, de_parallel
|
45 |
-
from utils.
|
46 |
from utils.metrics import fitness
|
|
|
47 |
|
48 |
LOGGER = logging.getLogger(__name__)
|
49 |
LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) # https://pytorch.org/docs/stable/elastic/run.html
|
@@ -76,37 +75,23 @@ def train(hyp, # path/to/hyp.yaml or hyp dictionary
|
|
76 |
with open(save_dir / 'opt.yaml', 'w') as f:
|
77 |
yaml.safe_dump(vars(opt), f, sort_keys=False)
|
78 |
|
79 |
-
#
|
80 |
plots = not evolve # create plots
|
81 |
cuda = device.type != 'cpu'
|
82 |
init_seeds(1 + RANK)
|
83 |
with open(data) as f:
|
84 |
data_dict = yaml.safe_load(f) # data dict
|
85 |
-
|
86 |
-
# Loggers
|
87 |
-
loggers = {'wandb': None, 'tb': None} # loggers dict
|
88 |
-
if RANK in [-1, 0]:
|
89 |
-
# TensorBoard
|
90 |
-
if plots:
|
91 |
-
prefix = colorstr('tensorboard: ')
|
92 |
-
LOGGER.info(f"{prefix}Start with 'tensorboard --logdir {opt.project}', view at http://localhost:6006/")
|
93 |
-
loggers['tb'] = SummaryWriter(str(save_dir))
|
94 |
-
|
95 |
-
# W&B
|
96 |
-
opt.hyp = hyp # add hyperparameters
|
97 |
-
run_id = torch.load(weights).get('wandb_id') if weights.endswith('.pt') and os.path.isfile(weights) else None
|
98 |
-
run_id = run_id if opt.resume else None # start fresh run if transfer learning
|
99 |
-
wandb_logger = WandbLogger(opt, save_dir.stem, run_id, data_dict)
|
100 |
-
loggers['wandb'] = wandb_logger.wandb
|
101 |
-
if loggers['wandb']:
|
102 |
-
data_dict = wandb_logger.data_dict
|
103 |
-
weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp # may update values if resuming
|
104 |
-
|
105 |
nc = 1 if single_cls else int(data_dict['nc']) # number of classes
|
106 |
names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names'] # class names
|
107 |
assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}' # check
|
108 |
is_coco = data.endswith('coco.yaml') and nc == 80 # COCO dataset
|
109 |
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
# Model
|
111 |
pretrained = weights.endswith('.pt')
|
112 |
if pretrained:
|
@@ -351,16 +336,11 @@ def train(hyp, # path/to/hyp.yaml or hyp dictionary
|
|
351 |
pbar.set_description(s)
|
352 |
|
353 |
# Plot
|
354 |
-
if plots
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
warnings.simplefilter('ignore') # suppress jit trace warning
|
360 |
-
loggers['tb'].add_graph(torch.jit.trace(de_parallel(model), imgs[0:1], strict=False), [])
|
361 |
-
elif plots and ni == 10 and loggers['wandb']:
|
362 |
-
wandb_logger.log({'Mosaics': [loggers['wandb'].Image(str(x), caption=x.name) for x in
|
363 |
-
save_dir.glob('train*.jpg') if x.exists()]})
|
364 |
|
365 |
# end batch ------------------------------------------------------------------------------------------------
|
366 |
|
@@ -368,13 +348,12 @@ def train(hyp, # path/to/hyp.yaml or hyp dictionary
|
|
368 |
lr = [x['lr'] for x in optimizer.param_groups] # for loggers
|
369 |
scheduler.step()
|
370 |
|
371 |
-
# DDP process 0 or single-GPU
|
372 |
if RANK in [-1, 0]:
|
373 |
# mAP
|
|
|
374 |
ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights'])
|
375 |
final_epoch = epoch + 1 == epochs
|
376 |
if not noval or final_epoch: # Calculate mAP
|
377 |
-
wandb_logger.current_epoch = epoch + 1
|
378 |
results, maps, _ = val.run(data_dict,
|
379 |
batch_size=batch_size // WORLD_SIZE * 2,
|
380 |
imgsz=imgsz,
|
@@ -385,29 +364,14 @@ def train(hyp, # path/to/hyp.yaml or hyp dictionary
|
|
385 |
save_json=is_coco and final_epoch,
|
386 |
verbose=nc < 50 and final_epoch,
|
387 |
plots=plots and final_epoch,
|
388 |
-
|
389 |
compute_loss=compute_loss)
|
390 |
|
391 |
-
# Write
|
392 |
-
with open(results_file, 'a') as f:
|
393 |
-
f.write(s + '%10.4g' * 7 % results + '\n') # append metrics, val_loss
|
394 |
-
|
395 |
-
# Log
|
396 |
-
tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss
|
397 |
-
'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
|
398 |
-
'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss
|
399 |
-
'x/lr0', 'x/lr1', 'x/lr2'] # params
|
400 |
-
for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags):
|
401 |
-
if loggers['tb']:
|
402 |
-
loggers['tb'].add_scalar(tag, x, epoch) # TensorBoard
|
403 |
-
if loggers['wandb']:
|
404 |
-
wandb_logger.log({tag: x}) # W&B
|
405 |
-
|
406 |
# Update best mAP
|
407 |
fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
|
408 |
if fi > best_fitness:
|
409 |
best_fitness = fi
|
410 |
-
|
411 |
|
412 |
# Save model
|
413 |
if (not nosave) or (final_epoch and not evolve): # if save
|
@@ -418,16 +382,14 @@ def train(hyp, # path/to/hyp.yaml or hyp dictionary
|
|
418 |
'ema': deepcopy(ema.ema).half(),
|
419 |
'updates': ema.updates,
|
420 |
'optimizer': optimizer.state_dict(),
|
421 |
-
'wandb_id':
|
422 |
|
423 |
# Save last, best and delete
|
424 |
torch.save(ckpt, last)
|
425 |
if best_fitness == fi:
|
426 |
torch.save(ckpt, best)
|
427 |
-
if loggers['wandb']:
|
428 |
-
if ((epoch + 1) % opt.save_period == 0 and not final_epoch) and opt.save_period != -1:
|
429 |
-
wandb_logger.log_model(last.parent, opt, epoch, fi, best_model=best_fitness == fi)
|
430 |
del ckpt
|
|
|
431 |
|
432 |
# end epoch ----------------------------------------------------------------------------------------------------
|
433 |
# end training -----------------------------------------------------------------------------------------------------
|
@@ -435,10 +397,6 @@ def train(hyp, # path/to/hyp.yaml or hyp dictionary
|
|
435 |
LOGGER.info(f'{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.\n')
|
436 |
if plots:
|
437 |
plot_results(save_dir=save_dir) # save as results.png
|
438 |
-
if loggers['wandb']:
|
439 |
-
files = ['results.png', 'confusion_matrix.png', *[f'{x}_curve.png' for x in ('F1', 'PR', 'P', 'R')]]
|
440 |
-
wandb_logger.log({"Results": [loggers['wandb'].Image(str(save_dir / f), caption=f) for f in files
|
441 |
-
if (save_dir / f).exists()]})
|
442 |
|
443 |
if not evolve:
|
444 |
if is_coco: # COCO dataset
|
@@ -458,11 +416,8 @@ def train(hyp, # path/to/hyp.yaml or hyp dictionary
|
|
458 |
for f in last, best:
|
459 |
if f.exists():
|
460 |
strip_optimizer(f) # strip optimizers
|
461 |
-
|
462 |
-
|
463 |
-
name='run_' + wandb_logger.wandb_run.id + '_model',
|
464 |
-
aliases=['latest', 'best', 'stripped'])
|
465 |
-
wandb_logger.finish_run()
|
466 |
|
467 |
torch.cuda.empty_cache()
|
468 |
return results
|
|
|
10 |
import random
|
11 |
import sys
|
12 |
import time
|
|
|
13 |
from copy import deepcopy
|
14 |
from pathlib import Path
|
15 |
from threading import Thread
|
|
|
23 |
from torch.cuda import amp
|
24 |
from torch.nn.parallel import DistributedDataParallel as DDP
|
25 |
from torch.optim import Adam, SGD, lr_scheduler
|
|
|
26 |
from tqdm import tqdm
|
27 |
|
28 |
FILE = Path(__file__).absolute()
|
|
|
40 |
from utils.loss import ComputeLoss
|
41 |
from utils.plots import plot_images, plot_labels, plot_results, plot_evolution
|
42 |
from utils.torch_utils import ModelEMA, select_device, intersect_dicts, torch_distributed_zero_first, de_parallel
|
43 |
+
from utils.loggers.wandb.wandb_utils import check_wandb_resume
|
44 |
from utils.metrics import fitness
|
45 |
+
from utils.loggers import Loggers
|
46 |
|
47 |
LOGGER = logging.getLogger(__name__)
|
48 |
LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) # https://pytorch.org/docs/stable/elastic/run.html
|
|
|
75 |
with open(save_dir / 'opt.yaml', 'w') as f:
|
76 |
yaml.safe_dump(vars(opt), f, sort_keys=False)
|
77 |
|
78 |
+
# Config
|
79 |
plots = not evolve # create plots
|
80 |
cuda = device.type != 'cpu'
|
81 |
init_seeds(1 + RANK)
|
82 |
with open(data) as f:
|
83 |
data_dict = yaml.safe_load(f) # data dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
nc = 1 if single_cls else int(data_dict['nc']) # number of classes
|
85 |
names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names'] # class names
|
86 |
assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}' # check
|
87 |
is_coco = data.endswith('coco.yaml') and nc == 80 # COCO dataset
|
88 |
|
89 |
+
# Loggers
|
90 |
+
if RANK in [-1, 0]:
|
91 |
+
loggers = Loggers(save_dir, results_file, weights, opt, hyp, data_dict, LOGGER).start() # loggers dict
|
92 |
+
if loggers.wandb and resume:
|
93 |
+
weights, epochs, hyp, data_dict = opt.weights, opt.epochs, opt.hyp, loggers.wandb.data_dict
|
94 |
+
|
95 |
# Model
|
96 |
pretrained = weights.endswith('.pt')
|
97 |
if pretrained:
|
|
|
336 |
pbar.set_description(s)
|
337 |
|
338 |
# Plot
|
339 |
+
if plots:
|
340 |
+
if ni < 3:
|
341 |
+
f = save_dir / f'train_batch{ni}.jpg' # filename
|
342 |
+
Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start()
|
343 |
+
loggers.on_train_batch_end(ni, model, imgs)
|
|
|
|
|
|
|
|
|
|
|
344 |
|
345 |
# end batch ------------------------------------------------------------------------------------------------
|
346 |
|
|
|
348 |
lr = [x['lr'] for x in optimizer.param_groups] # for loggers
|
349 |
scheduler.step()
|
350 |
|
|
|
351 |
if RANK in [-1, 0]:
|
352 |
# mAP
|
353 |
+
loggers.on_train_epoch_end(epoch)
|
354 |
ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights'])
|
355 |
final_epoch = epoch + 1 == epochs
|
356 |
if not noval or final_epoch: # Calculate mAP
|
|
|
357 |
results, maps, _ = val.run(data_dict,
|
358 |
batch_size=batch_size // WORLD_SIZE * 2,
|
359 |
imgsz=imgsz,
|
|
|
364 |
save_json=is_coco and final_epoch,
|
365 |
verbose=nc < 50 and final_epoch,
|
366 |
plots=plots and final_epoch,
|
367 |
+
loggers=loggers,
|
368 |
compute_loss=compute_loss)
|
369 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
370 |
# Update best mAP
|
371 |
fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, mAP@.5, mAP@.5-.95]
|
372 |
if fi > best_fitness:
|
373 |
best_fitness = fi
|
374 |
+
loggers.on_train_val_end(mloss, results, lr, epoch, s, best_fitness, fi)
|
375 |
|
376 |
# Save model
|
377 |
if (not nosave) or (final_epoch and not evolve): # if save
|
|
|
382 |
'ema': deepcopy(ema.ema).half(),
|
383 |
'updates': ema.updates,
|
384 |
'optimizer': optimizer.state_dict(),
|
385 |
+
'wandb_id': loggers.wandb.wandb_run.id if loggers.wandb else None}
|
386 |
|
387 |
# Save last, best and delete
|
388 |
torch.save(ckpt, last)
|
389 |
if best_fitness == fi:
|
390 |
torch.save(ckpt, best)
|
|
|
|
|
|
|
391 |
del ckpt
|
392 |
+
loggers.on_model_save(last, epoch, final_epoch, best_fitness, fi)
|
393 |
|
394 |
# end epoch ----------------------------------------------------------------------------------------------------
|
395 |
# end training -----------------------------------------------------------------------------------------------------
|
|
|
397 |
LOGGER.info(f'{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.\n')
|
398 |
if plots:
|
399 |
plot_results(save_dir=save_dir) # save as results.png
|
|
|
|
|
|
|
|
|
400 |
|
401 |
if not evolve:
|
402 |
if is_coco: # COCO dataset
|
|
|
416 |
for f in last, best:
|
417 |
if f.exists():
|
418 |
strip_optimizer(f) # strip optimizers
|
419 |
+
|
420 |
+
loggers.on_train_end(last, best)
|
|
|
|
|
|
|
421 |
|
422 |
torch.cuda.empty_cache()
|
423 |
return results
|
utils/loggers/__init__.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# YOLOv5 experiment logging utils
|
2 |
+
|
3 |
+
import warnings
|
4 |
+
|
5 |
+
import torch
|
6 |
+
from torch.utils.tensorboard import SummaryWriter
|
7 |
+
|
8 |
+
from utils.general import colorstr, emojis
|
9 |
+
from utils.loggers.wandb.wandb_utils import WandbLogger
|
10 |
+
from utils.torch_utils import de_parallel
|
11 |
+
|
12 |
+
LOGGERS = ('txt', 'tb', 'wandb')  # text-file, TensorBoard, Weights & Biases

try:
    import wandb

    assert hasattr(wandb, '__version__')  # verify package import not local dir
except (ImportError, AssertionError):
    wandb = None


class Loggers():
    """YOLOv5 Loggers class.

    Bundles the three logging backends named in ``LOGGERS`` behind a set of
    ``on_*`` callbacks. Every backend attribute (``txt``, ``tb``, ``wandb``)
    is ``None`` after construction; ``start()`` enables the ones listed in
    ``include``. All callbacks are no-ops for a backend that is not enabled,
    so a default-constructed ``Loggers()`` can be passed around safely.
    """

    def __init__(self, save_dir=None, results_file=None, weights=None, opt=None, hyp=None,
                 data_dict=None, logger=None, include=LOGGERS):
        """Store the run configuration; no backend is started here.

        Args:
            save_dir: run directory (used with ``/``, ``.glob``, ``.parent`` and ``.stem``).
            results_file: path of the text results file appended to by ``on_train_val_end``.
            weights: checkpoint path read by ``start()`` to recover a W&B run id when resuming.
            opt: options namespace; ``evolve``, ``resume`` and ``save_period`` are read from it.
            hyp: hyperparameters, copied onto ``opt.hyp`` before the W&B logger is created.
            data_dict: dataset dictionary forwarded to ``WandbLogger``.
            logger: console logger used for informational messages.
            include: names of the backends ``start()`` is allowed to enable.
        """
        self.save_dir = save_dir
        self.results_file = results_file
        self.weights = weights
        self.opt = opt
        self.hyp = hyp
        self.data_dict = data_dict
        self.logger = logger  # for printing results to console
        self.include = include
        for k in LOGGERS:
            setattr(self, k, None)  # every backend starts disabled

    def start(self):
        """Enable the requested backends and return ``self`` so the call can be chained."""
        self.txt = True  # always log to txt

        # Message. Use the module-level ``wandb`` handle (None when the package is unavailable):
        # an ``import wandb`` here would bind a function-local name and shadow that handle
        # for the rest of this method.
        if not wandb:
            prefix = colorstr('Weights & Biases: ')
            s = f"{prefix}run 'pip install wandb' to automatically track and visualize YOLOv5 🚀 runs (RECOMMENDED)"
            print(emojis(s))

        # TensorBoard
        save_dir = self.save_dir
        if 'tb' in self.include and not self.opt.evolve:
            prefix = colorstr('TensorBoard: ')
            self.logger.info(
                f"{prefix}Start with 'tensorboard --logdir {save_dir.parent}', view at http://localhost:6006/")
            self.tb = SummaryWriter(str(save_dir))

        # W&B
        self.wandb = None
        if wandb and 'wandb' in self.include:
            try:
                run_id = torch.load(self.weights).get('wandb_id') if self.opt.resume else None
                self.opt.hyp = self.hyp  # add hyperparameters
                self.wandb = WandbLogger(self.opt, save_dir.stem, run_id, self.data_dict)
            except Exception as e:
                # W&B stays best-effort (training continues without it), but the reason is
                # reported instead of being silently swallowed, and KeyboardInterrupt /
                # SystemExit are no longer caught.
                self.wandb = None
                msg = f"{colorstr('Weights & Biases: ')}logging disabled, initialisation failed: {e}"
                if self.logger:
                    self.logger.warning(msg)
                else:
                    print(msg)

        return self

    def on_train_batch_end(self, ni, model, imgs):
        """Callback runs on train batch end.

        Adds the traced model graph to TensorBoard on batch 0 and uploads the
        ``train*.jpg`` files found in ``save_dir`` to W&B on batch 10.
        """
        if self.tb and ni == 0:  # guard: TensorBoard may be disabled
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')  # suppress jit trace warning
                self.tb.add_graph(torch.jit.trace(de_parallel(model), imgs[0:1], strict=False), [])
        if self.wandb and ni == 10:
            files = sorted(self.save_dir.glob('train*.jpg'))
            self.wandb.log({'Mosaics': [wandb.Image(str(f), caption=f.name) for f in files if f.exists()]})

    def on_train_epoch_end(self, epoch):
        """Callback runs on train epoch end; stores the 1-based epoch on the W&B logger."""
        if self.wandb:
            self.wandb.current_epoch = epoch + 1

    def on_val_batch_end(self, pred, predn, path, names, im):
        """Callback runs on val batch end; forwards one image's predictions to W&B."""
        if self.wandb:
            self.wandb.val_one_image(pred, predn, path, names, im)

    def on_val_end(self):
        """Callback runs on val end; uploads the ``val*.jpg`` files in ``save_dir`` to W&B."""
        if self.wandb:
            files = sorted(self.save_dir.glob('val*.jpg'))
            self.wandb.log({"Validation": [wandb.Image(str(f), caption=f.name) for f in files]})

    def on_train_val_end(self, mloss, results, lr, epoch, s, best_fitness, fi):
        """Callback runs on validation end during training.

        Appends ``s`` plus the seven values of ``results`` to the text results file, and
        logs the tagged scalars (all but the last entry of ``mloss``, then ``results``,
        then ``lr``) to TensorBoard and W&B when those backends are enabled.
        """
        vals = list(mloss[:-1]) + list(results) + lr
        tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss',  # train loss
                'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95',
                'val/box_loss', 'val/obj_loss', 'val/cls_loss',  # val loss
                'x/lr0', 'x/lr1', 'x/lr2']  # params
        if self.txt:
            with open(self.results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results + '\n')  # append metrics, val_loss
        if self.tb:
            for x, tag in zip(vals, tags):
                self.tb.add_scalar(tag, x, epoch)  # TensorBoard
        if self.wandb:
            self.wandb.log(dict(zip(tags, vals)))
            self.wandb.end_epoch(best_result=best_fitness == fi)

    def on_model_save(self, last, epoch, final_epoch, best_fitness, fi):
        """Callback runs on model save event.

        Uploads the checkpoint directory to W&B every ``opt.save_period`` epochs, except
        on the final epoch. ``save_period == -1`` disables periodic uploads.
        """
        if self.wandb:
            period = self.opt.save_period
            # Check the sentinel before the modulo; a period of 0 would otherwise divide by zero.
            if period != -1 and period != 0 and (epoch + 1) % period == 0 and not final_epoch:
                self.wandb.log_model(last.parent, self.opt, epoch, fi, best_model=best_fitness == fi)

    def on_train_end(self, last, best):
        """Callback runs on training end.

        Uploads the result plots that exist in ``save_dir`` and the final checkpoint
        (``best`` if it exists, otherwise ``last``) to W&B, then finishes the run.
        """
        if not self.wandb:
            return  # nothing to upload; also avoids touching save_dir when it is None
        files = ['results.png', 'confusion_matrix.png', *[f'{x}_curve.png' for x in ('F1', 'PR', 'P', 'R')]]
        files = [(self.save_dir / f) for f in files if (self.save_dir / f).exists()]  # filter
        wandb.log({"Results": [wandb.Image(str(f), caption=f.name) for f in files]})
        wandb.log_artifact(str(best if best.exists() else last), type='model',
                           name='run_' + self.wandb.wandb_run.id + '_model',
                           aliases=['latest', 'best', 'stripped'])
        self.wandb.finish_run()

    def log_images(self, paths):
        """Log the image files in ``paths`` to W&B under the ``Labels`` key."""
        if self.wandb:
            self.wandb.log({"Labels": [wandb.Image(str(x), caption=x.name) for x in paths]})
|
utils/{wandb_logging → loggers/wandb}/__init__.py
RENAMED
File without changes
|
utils/{wandb_logging → loggers/wandb}/log_dataset.py
RENAMED
File without changes
|
utils/{wandb_logging → loggers/wandb}/sweep.py
RENAMED
@@ -1,12 +1,12 @@
|
|
1 |
import sys
|
2 |
from pathlib import Path
|
|
|
3 |
import wandb
|
4 |
|
5 |
FILE = Path(__file__).absolute()
|
6 |
sys.path.append(FILE.parents[2].as_posix()) # add utils/ to path
|
7 |
|
8 |
from train import train, parse_opt
|
9 |
-
import test
|
10 |
from utils.general import increment_path
|
11 |
from utils.torch_utils import select_device
|
12 |
|
|
|
1 |
import sys
|
2 |
from pathlib import Path
|
3 |
+
|
4 |
import wandb
|
5 |
|
6 |
FILE = Path(__file__).absolute()
|
7 |
sys.path.append(FILE.parents[2].as_posix()) # add utils/ to path
|
8 |
|
9 |
from train import train, parse_opt
|
|
|
10 |
from utils.general import increment_path
|
11 |
from utils.torch_utils import select_device
|
12 |
|
utils/{wandb_logging → loggers/wandb}/sweep.yaml
RENAMED
@@ -14,7 +14,7 @@
|
|
14 |
# You can use grid, bayesian and hyperopt search strategy
|
15 |
# For more info on configuring sweeps visit - https://docs.wandb.ai/guides/sweeps/configuration
|
16 |
|
17 |
-
program: utils/
|
18 |
method: random
|
19 |
metric:
|
20 |
name: metrics/mAP_0.5
|
|
|
14 |
# You can use grid, bayesian and hyperopt search strategy
|
15 |
# For more info on configuring sweeps visit - https://docs.wandb.ai/guides/sweeps/configuration
|
16 |
|
17 |
+
program: utils/loggers/wandb/sweep.py
|
18 |
method: random
|
19 |
metric:
|
20 |
name: metrics/mAP_0.5
|
utils/{wandb_logging → loggers/wandb}/wandb_utils.py
RENAMED
@@ -1,4 +1,5 @@
|
|
1 |
"""Utilities and tools for tracking runs with Weights & Biases."""
|
|
|
2 |
import logging
|
3 |
import os
|
4 |
import sys
|
@@ -8,15 +9,18 @@ from pathlib import Path
|
|
8 |
import yaml
|
9 |
from tqdm import tqdm
|
10 |
|
11 |
-
|
|
|
|
|
12 |
from utils.datasets import LoadImagesAndLabels
|
13 |
from utils.datasets import img2label_paths
|
14 |
-
from utils.general import
|
15 |
|
16 |
try:
|
17 |
import wandb
|
18 |
-
|
19 |
-
|
|
|
20 |
wandb = None
|
21 |
|
22 |
RANK = int(os.getenv('RANK', -1))
|
@@ -106,7 +110,7 @@ class WandbLogger():
|
|
106 |
self.data_dict = data_dict
|
107 |
self.bbox_media_panel_images = []
|
108 |
self.val_table_path_map = None
|
109 |
-
self.max_imgs_to_log = 16
|
110 |
# It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call
|
111 |
if isinstance(opt.resume, str): # checks resume from artifact
|
112 |
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
|
@@ -134,13 +138,11 @@ class WandbLogger():
|
|
134 |
if not opt.resume:
|
135 |
wandb_data_dict = self.check_and_upload_dataset(opt) if opt.upload_dataset else data_dict
|
136 |
# Info useful for resuming from artifacts
|
137 |
-
self.wandb_run.config.update({'opt': vars(opt), 'data_dict': wandb_data_dict},
|
|
|
138 |
self.data_dict = self.setup_training(opt, data_dict)
|
139 |
if self.job_type == 'Dataset Creation':
|
140 |
self.data_dict = self.check_and_upload_dataset(opt)
|
141 |
-
else:
|
142 |
-
prefix = colorstr('wandb: ')
|
143 |
-
print(f"{prefix}Install Weights & Biases for YOLOv5 logging with 'pip install wandb' (recommended)")
|
144 |
|
145 |
def check_and_upload_dataset(self, opt):
|
146 |
assert wandb, 'Install wandb to upload dataset'
|
@@ -169,7 +171,7 @@ class WandbLogger():
|
|
169 |
opt.artifact_alias)
|
170 |
self.val_artifact_path, self.val_artifact = self.download_dataset_artifact(data_dict.get('val'),
|
171 |
opt.artifact_alias)
|
172 |
-
|
173 |
if self.train_artifact_path is not None:
|
174 |
train_path = Path(self.train_artifact_path) / 'data/images/'
|
175 |
data_dict['train'] = str(train_path)
|
@@ -177,7 +179,6 @@ class WandbLogger():
|
|
177 |
val_path = Path(self.val_artifact_path) / 'data/images/'
|
178 |
data_dict['val'] = str(val_path)
|
179 |
|
180 |
-
|
181 |
if self.val_artifact is not None:
|
182 |
self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation")
|
183 |
self.result_table = wandb.Table(["epoch", "id", "ground truth", "prediction", "avg_confidence"])
|
@@ -315,9 +316,9 @@ class WandbLogger():
|
|
315 |
)
|
316 |
|
317 |
def val_one_image(self, pred, predn, path, names, im):
|
318 |
-
if self.val_table and self.result_table:
|
319 |
self.log_training_progress(predn, path, names)
|
320 |
-
else:
|
321 |
if len(self.bbox_media_panel_images) < self.max_imgs_to_log and self.current_epoch > 0:
|
322 |
if self.current_epoch % self.bbox_interval == 0:
|
323 |
box_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]},
|
@@ -328,7 +329,6 @@ class WandbLogger():
|
|
328 |
boxes = {"predictions": {"box_data": box_data, "class_labels": names}} # inference-space
|
329 |
self.bbox_media_panel_images.append(wandb.Image(im, boxes=boxes, caption=path.name))
|
330 |
|
331 |
-
|
332 |
def log(self, log_dict):
|
333 |
if self.wandb_run:
|
334 |
for key, value in log_dict.items():
|
|
|
1 |
"""Utilities and tools for tracking runs with Weights & Biases."""
|
2 |
+
|
3 |
import logging
|
4 |
import os
|
5 |
import sys
|
|
|
9 |
import yaml
|
10 |
from tqdm import tqdm
|
11 |
|
12 |
+
FILE = Path(__file__).absolute()
|
13 |
+
sys.path.append(FILE.parents[3].as_posix()) # add yolov5/ to path
|
14 |
+
|
15 |
from utils.datasets import LoadImagesAndLabels
|
16 |
from utils.datasets import img2label_paths
|
17 |
+
from utils.general import check_dataset, check_file
|
18 |
|
19 |
try:
|
20 |
import wandb
|
21 |
+
|
22 |
+
assert hasattr(wandb, '__version__') # verify package import not local dir
|
23 |
+
except (ImportError, AssertionError):
|
24 |
wandb = None
|
25 |
|
26 |
RANK = int(os.getenv('RANK', -1))
|
|
|
110 |
self.data_dict = data_dict
|
111 |
self.bbox_media_panel_images = []
|
112 |
self.val_table_path_map = None
|
113 |
+
self.max_imgs_to_log = 16
|
114 |
# It's more elegant to stick to 1 wandb.init call, but useful config data is overwritten in the WandbLogger's wandb.init call
|
115 |
if isinstance(opt.resume, str): # checks resume from artifact
|
116 |
if opt.resume.startswith(WANDB_ARTIFACT_PREFIX):
|
|
|
138 |
if not opt.resume:
|
139 |
wandb_data_dict = self.check_and_upload_dataset(opt) if opt.upload_dataset else data_dict
|
140 |
# Info useful for resuming from artifacts
|
141 |
+
self.wandb_run.config.update({'opt': vars(opt), 'data_dict': wandb_data_dict},
|
142 |
+
allow_val_change=True)
|
143 |
self.data_dict = self.setup_training(opt, data_dict)
|
144 |
if self.job_type == 'Dataset Creation':
|
145 |
self.data_dict = self.check_and_upload_dataset(opt)
|
|
|
|
|
|
|
146 |
|
147 |
def check_and_upload_dataset(self, opt):
|
148 |
assert wandb, 'Install wandb to upload dataset'
|
|
|
171 |
opt.artifact_alias)
|
172 |
self.val_artifact_path, self.val_artifact = self.download_dataset_artifact(data_dict.get('val'),
|
173 |
opt.artifact_alias)
|
174 |
+
|
175 |
if self.train_artifact_path is not None:
|
176 |
train_path = Path(self.train_artifact_path) / 'data/images/'
|
177 |
data_dict['train'] = str(train_path)
|
|
|
179 |
val_path = Path(self.val_artifact_path) / 'data/images/'
|
180 |
data_dict['val'] = str(val_path)
|
181 |
|
|
|
182 |
if self.val_artifact is not None:
|
183 |
self.result_artifact = wandb.Artifact("run_" + wandb.run.id + "_progress", "evaluation")
|
184 |
self.result_table = wandb.Table(["epoch", "id", "ground truth", "prediction", "avg_confidence"])
|
|
|
316 |
)
|
317 |
|
318 |
def val_one_image(self, pred, predn, path, names, im):
|
319 |
+
if self.val_table and self.result_table: # Log Table if Val dataset is uploaded as artifact
|
320 |
self.log_training_progress(predn, path, names)
|
321 |
+
else: # Default to bbox media panel if Val artifact not found
|
322 |
if len(self.bbox_media_panel_images) < self.max_imgs_to_log and self.current_epoch > 0:
|
323 |
if self.current_epoch % self.bbox_interval == 0:
|
324 |
box_data = [{"position": {"minX": xyxy[0], "minY": xyxy[1], "maxX": xyxy[2], "maxY": xyxy[3]},
|
|
|
329 |
boxes = {"predictions": {"box_data": box_data, "class_labels": names}} # inference-space
|
330 |
self.bbox_media_panel_images.append(wandb.Image(im, boxes=boxes, caption=path.name))
|
331 |
|
|
|
332 |
def log(self, log_dict):
|
333 |
if self.wandb_run:
|
334 |
for key, value in log_dict.items():
|
utils/plots.py
CHANGED
@@ -327,9 +327,8 @@ def plot_labels(labels, names=(), save_dir=Path(''), loggers=None):
|
|
327 |
plt.close()
|
328 |
|
329 |
# loggers
|
330 |
-
|
331 |
-
|
332 |
-
v.log({"Labels": [v.Image(str(x), caption=x.name) for x in save_dir.glob('*labels*.jpg')]}, commit=False)
|
333 |
|
334 |
|
335 |
def plot_evolution(yaml_file='data/hyp.finetune.yaml'): # from utils.plots import *; plot_evolution()
|
|
|
327 |
plt.close()
|
328 |
|
329 |
# loggers
|
330 |
+
if loggers:
|
331 |
+
loggers.log_images(save_dir.glob('*labels*.jpg'))
|
|
|
332 |
|
333 |
|
334 |
def plot_evolution(yaml_file='data/hyp.finetune.yaml'): # from utils.plots import *; plot_evolution()
|
val.py
CHANGED
@@ -26,6 +26,7 @@ from utils.general import coco80_to_coco91_class, check_dataset, check_file, che
|
|
26 |
from utils.metrics import ap_per_class, ConfusionMatrix
|
27 |
from utils.plots import plot_images, output_to_target, plot_study_txt
|
28 |
from utils.torch_utils import select_device, time_sync
|
|
|
29 |
|
30 |
|
31 |
def save_one_txt(predn, save_conf, shape, file):
|
@@ -97,7 +98,7 @@ def run(data,
|
|
97 |
dataloader=None,
|
98 |
save_dir=Path(''),
|
99 |
plots=True,
|
100 |
-
|
101 |
compute_loss=None,
|
102 |
):
|
103 |
# Initialize/load model and set device
|
@@ -215,8 +216,7 @@ def run(data,
|
|
215 |
save_one_txt(predn, save_conf, shape, file=save_dir / 'labels' / (path.stem + '.txt'))
|
216 |
if save_json:
|
217 |
save_one_json(predn, jdict, path, class_map) # append to COCO-JSON dictionary
|
218 |
-
|
219 |
-
wandb_logger.val_one_image(pred, predn, path, names, img[si])
|
220 |
|
221 |
# Plot images
|
222 |
if plots and batch_i < 3:
|
@@ -253,9 +253,7 @@ def run(data,
|
|
253 |
# Plots
|
254 |
if plots:
|
255 |
confusion_matrix.plot(save_dir=save_dir, names=list(names.values()))
|
256 |
-
|
257 |
-
val_batches = [wandb_logger.wandb.Image(str(f), caption=f.name) for f in sorted(save_dir.glob('val*.jpg'))]
|
258 |
-
wandb_logger.log({"Validation": val_batches})
|
259 |
|
260 |
# Save JSON
|
261 |
if save_json and len(jdict):
|
|
|
26 |
from utils.metrics import ap_per_class, ConfusionMatrix
|
27 |
from utils.plots import plot_images, output_to_target, plot_study_txt
|
28 |
from utils.torch_utils import select_device, time_sync
|
29 |
+
from utils.loggers import Loggers
|
30 |
|
31 |
|
32 |
def save_one_txt(predn, save_conf, shape, file):
|
|
|
98 |
dataloader=None,
|
99 |
save_dir=Path(''),
|
100 |
plots=True,
|
101 |
+
loggers=Loggers(),
|
102 |
compute_loss=None,
|
103 |
):
|
104 |
# Initialize/load model and set device
|
|
|
216 |
save_one_txt(predn, save_conf, shape, file=save_dir / 'labels' / (path.stem + '.txt'))
|
217 |
if save_json:
|
218 |
save_one_json(predn, jdict, path, class_map) # append to COCO-JSON dictionary
|
219 |
+
loggers.on_val_batch_end(pred, predn, path, names, img[si])
|
|
|
220 |
|
221 |
# Plot images
|
222 |
if plots and batch_i < 3:
|
|
|
253 |
# Plots
|
254 |
if plots:
|
255 |
confusion_matrix.plot(save_dir=save_dir, names=list(names.values()))
|
256 |
+
loggers.on_val_end()
|
|
|
|
|
257 |
|
258 |
# Save JSON
|
259 |
if save_json and len(jdict):
|