import argparse, time
import os, random, shutil
import numpy as np
import torch
import torch.nn as nn
import torch.optim
import torch.utils.data
from torch.utils.tensorboard import SummaryWriter
from dataset import get_pretraining_set
from model.loss import *
from tools import AverageMeter, sum_para_cnt
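# similarity(), v_ac() and cal_xc() used in train() below come in through the
# star import from model.loss; AverageMeter is expected to expose
# reset()/update()/get_str() as used later in this file.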
parser = argparse.ArgumentParser(description='PyTorch skeleton self-supervised pretraining')
parser.add_argument('--epochs', default=300, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.0005, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--schedule', default=[351], nargs='*', type=int,
                    help='learning rate schedule (when to drop lr by 10x)')
parser.add_argument('--wd', '--weight-decay', default=1e-5, type=float,
                    metavar='W', help='weight decay (default: 1e-5)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training')
parser.add_argument('--checkpoint-path', default='./checkpoint', type=str)
parser.add_argument('--pre-dataset', default='ntu60', type=str,
                    help='which dataset to use for self-supervised training '
                         '(ntu60, ntu120 or pku_v2)')
parser.add_argument('--protocol', default='cross_subject', type=str,
                    help='training protocol cross_view/cross_subject/cross_setup')
parser.add_argument('--moda', default='joint', type=str,
                    help='joint, motion, or bone')
parser.add_argument('--backbone', default='DSTE', type=str,
                    help='DSTE or STTR')
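# Example invocation (assuming this script is saved as, e.g., pretrain.py;
# flags as defined by the parser above):
#   python pretrain.py --pre-dataset ntu60 --protocol cross_subject \
#       --backbone DSTE --batch-size 256 --lr 0.0005 --checkpoint-path ./checkpoint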
def main():
    args = parser.parse_args()
    # pretraining dataset and protocol
    from options import options_pretraining as options
    if args.pre_dataset == 'pku_v2' and args.protocol == 'cross_subject':
        opts = options.opts_pku_v2_xsub()
    elif args.pre_dataset == 'ntu60' and args.protocol == 'cross_view':
        opts = options.opts_ntu_60_cross_view()
    elif args.pre_dataset == 'ntu60' and args.protocol == 'cross_subject':
        opts = options.opts_ntu_60_cross_subject()
    elif args.pre_dataset == 'ntu120' and args.protocol == 'cross_setup':
        opts = options.opts_ntu_120_cross_setup()
    elif args.pre_dataset == 'ntu120' and args.protocol == 'cross_subject':
        opts = options.opts_ntu_120_cross_subject()
    else:
        raise ValueError('unsupported dataset/protocol combination: '
                         '{} / {}'.format(args.pre_dataset, args.protocol))
    # create model
    if args.backbone == 'DSTE':
        from model.DSTE import USDRL
        model = USDRL(**opts.encoder_args)
    elif args.backbone == 'STTR':
        from model.STTR import USDRL
        model = USDRL(**opts.encoder_args)
    else:
        raise ValueError('backbone must be DSTE or STTR')
    print("options", opts.train_feeder_args)
    print("options", opts.encoder_args)
    print(model)
    print(args)
    print('para count:', sum_para_cnt(model) / 1e6, 'M')
    model = torch.nn.DataParallel(model)
    model = model.cuda()
    criterion = nn.MSELoss().cuda()
    optimizer = torch.optim.Adam(model.parameters(), args.lr, weight_decay=args.weight_decay)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if torch.cuda.is_available():
                checkpoint = torch.load(args.resume)
            else:
                raise RuntimeError('CUDA Error: torch.cuda.is_available() == False')
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'], strict=False)
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
            del checkpoint
            torch.cuda.empty_cache()
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    ## Data loading code
    train_dataset = get_pretraining_set(opts)
    trainloader_params = {
        'batch_size': args.batch_size,
        'shuffle': True,
        'num_workers': 8,
        'pin_memory': True,
        'prefetch_factor': 4,
        'persistent_workers': True
    }
    train_loader = torch.utils.data.DataLoader(train_dataset, **trainloader_params)
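    # pin_memory speeds up host-to-GPU copies, persistent_workers keeps the
    # 8 worker processes alive across epochs, and prefetch_factor=4 lets each
    # worker stage batches ahead of the training loop.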
    writer = SummaryWriter(args.checkpoint_path)
    scaler = torch.cuda.amp.GradScaler()
    print(ws)
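    # Mixed precision: train() wraps the forward pass in torch.cuda.amp.autocast(),
    # and GradScaler scales the loss before backward() so small fp16 gradients
    # do not underflow, unscaling them again before the optimizer step.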
    for epoch in range(args.start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch, args)
        # train for one epoch
        st_epoch = time.time()
        loss = train(scaler, train_loader, model, criterion, optimizer, epoch, args)
        print('epoch ' + str(epoch) + ' time:', time.time() - st_epoch, '\n')
        writer.add_scalar('train_loss', loss.avg, global_step=epoch)
        # checkpoint every 50 epochs
        if epoch % 50 == 0:
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, is_best=False, filename=args.checkpoint_path + '/checkpoint_{:04d}.pth.tar'.format(epoch))
def train(scaler, train_loader, model, criterion, optimizer, epoch, args):
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, losses],
        prefix="Epoch: [{}] Lr_rate [{}]".format(epoch, optimizer.param_groups[0]['lr']))
    # switch to train mode
    model.train()
    end = time.time()
    for i, (data_v1, data_v2, data_v3, data_v4) in enumerate(train_loader):
        for k in loss_rcd.keys():
            loss_rcd[k].reset()
        # measure data loading time
        data_time.update(time.time() - end)
        if torch.cuda.is_available():
            data_v1 = data_v1.float().cuda()
            data_v2 = data_v2.float().cuda()
            data_v3 = data_v3.float().cuda()
            data_v4 = data_v4.float().cuda()
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            z_t_list, z_s_list, z_i_list = model(data_v1, data_v2, data_v3, data_v4)  # [512, 4096]
            # Similarity
            sim_t = similarity(z_t_list, criterion)
            sim_s = similarity(z_s_list, criterion)
            sim_i = similarity(z_i_list, criterion)
            sim = sim_i + 0.5 * (sim_t + sim_s)
            B, _ = z_t_list[0].shape
            # Variance & AutoCov
            vac_t = sum([v_ac(x) for x in z_t_list])
            vac_s = sum([v_ac(x) for x in z_s_list])
            vac_i = sum([v_ac(x) for x in z_i_list])
            vac = vac_i + 0.5 * (vac_t + vac_s)
            # cross correlation = Invariance + Reduce Redundancy
            xcorr_t = cal_xc(z_t_list)
            xcorr_s = cal_xc(z_s_list)
            xcorr_i = cal_xc(z_i_list)
            xcorr = xcorr_i + 0.5 * (xcorr_s + xcorr_t)
            # Total loss, Multi-Grained Feature Decorrelation
            loss = sim * ws['sim'] + vac * ws['vac'] + xcorr * ws['xcorr']
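        # The weights come from the module-level ws dict set in __main__
        # (sim=5, vac=1.0, xcorr=1e-3): the similarity term dominates while
        # the cross-correlation term acts as a light decorrelation regularizer.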
        losses.update(loss.item(), B)
        loss_rcd['vac'].update(vac.item(), B)
        loss_rcd['sim'].update(sim.item(), B)
        loss_rcd['xcorr'].update(xcorr.item(), B)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        batch_time.update(time.time() - end)
        end = time.time()
        if i + 1 == len(train_loader):
            progress.display(i)
            s = str(epoch) + '\t'
            for k in loss_rcd.keys():
                s += loss_rcd[k].get_str() + ' '
            print(s)
    return losses
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')
class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries), flush=True)

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
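# A displayed line looks roughly like
#   Epoch: [12] Lr_rate [0.0005][196/196]    Time 0.412 (0.455)    Loss 1.2345e+00
# (the exact meter text depends on AverageMeter.__str__ in tools).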
def adjust_learning_rate(optimizer, epoch, args):
    """Decay the learning rate based on schedule"""
    lr = args.lr
    for milestone in args.schedule:
        lr *= 0.1 if epoch >= milestone else 1.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
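# Note: with the defaults (--schedule 351, --epochs 300) no milestone is ever
# reached, so the learning rate stays at its initial value for the whole run;
# pass earlier milestones, e.g. --schedule 150 250, to enable the 10x decay.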
if __name__ == '__main__':
    seed = 0  # fixed seed; note the --seed flag is parsed but not used here
    random.seed(seed)        # seed Python's random module
    np.random.seed(seed)     # seed NumPy's RNG
    torch.manual_seed(seed)  # seed PyTorch's CPU RNG
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # covers every GPU when using multi-GPU
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    # loss term weights used in train()
    ws = {
        'sim': 5,
        'vac': 1.,
        'xcorr': 1e-3
    }
    args = parser.parse_args()
    loss_rcd = {}
    for k in ws.keys():
        loss_rcd[k] = AverageMeter(k, '.5e')
    main()