|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch |
|
|
import torch.nn as nn |
|
|
from torch.optim import Adam |
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
from tensorboardX import SummaryWriter |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
|
|
|
from model import * |
|
|
|
|
|
|
|
|
|
|
|
import sys |
|
|
import os |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Default checkpoint path.
# NOTE(review): model_dir appears unused in this file — main() takes the
# model path from argv instead; confirm before relying on it.
model_dir = './model/semantic_cnn_model.pth'

# Expected number of command-line arguments: MDL_PATH TRAIN_PATH DEV_PATH.
NUM_ARGS = 3

# Total number of training epochs.
NUM_EPOCHS = 4000

# Mini-batch size shared by the train and dev data loaders.
BATCH_SIZE = 64

# Keyword names for the Adam optimizer constructor (used to build opt_params
# in main()).
LEARNING_RATE = "lr"

BETAS = "betas"

EPS = "eps"

WEIGHT_DECAY = "weight_decay"

# Seed the RNGs for reproducibility.
# NOTE(review): set_seed and SEED1 are provided by `from model import *`
# (model.py is not visible here) — confirm their definitions.
set_seed(SEED1)
|
|
|
|
|
|
|
|
|
|
|
def adjust_learning_rate(optimizer, epoch):
    """Apply the step-decay learning-rate schedule for the given epoch.

    The base rate drops at fixed epoch milestones; beyond epoch 48000 an
    additional power-of-ten factor (one decade per 110000 epochs) is
    applied.  Every param group of `optimizer` receives the same rate.
    """
    # Milestones as (threshold, rate): the rate of the highest exceeded
    # threshold wins; below all thresholds the base rate applies.
    milestones = (
        (40, 2e-4),
        (2000, 2e-5),
        (21000, 1e-5),
        (32984, 1e-6),
    )
    rate = 1e-3
    for threshold, milestone_rate in milestones:
        if epoch > threshold:
            rate = milestone_rate

    if epoch > 48000:
        # epoch // 110000 is 0 until epoch 110000, so the factor is 1 there.
        rate = rate * (0.1 ** (epoch // 110000))

    for group in optimizer.param_groups:
        group['lr'] = rate
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def train(model, dataloader, dataset, device, optimizer, criterion, epoch, epochs):
    """Run one training epoch and return the average per-sample loss.

    Args:
        model: network called as model(scan_maps, semantic_maps, sub_goals).
        dataloader: yields batch dicts with keys 'scan_map', 'semantic_map',
            'sub_goal' and 'velocity'.
        dataset: underlying dataset; its length normalizes the summed loss.
        device: torch device the batch tensors are moved to.
        optimizer: optimizer stepped once per batch.
        criterion: loss function (nn.MSELoss(reduction='sum') in main()).
        epoch: current epoch number (for progress printing only).
        epochs: total number of epochs (for progress printing only).

    Returns:
        Accumulated loss divided by len(dataset).
    """
    model.train()

    running_loss = 0
    num_batches = int(len(dataset) / dataloader.batch_size)

    for i, batch in tqdm(enumerate(dataloader), total=num_batches):
        scan_maps = batch['scan_map'].to(device)
        semantic_maps = batch['semantic_map'].to(device)
        sub_goals = batch['sub_goal'].to(device)
        velocities = batch['velocity'].to(device)

        optimizer.zero_grad()

        output = model(scan_maps, semantic_maps, sub_goals)

        # Keep only samples whose target velocity is not all-zero
        # (presumably stationary/padding frames — TODO confirm).
        mask = (velocities != 0).any(dim=1)
        if mask.sum() == 0:
            # No valid samples: a zero loss that still depends on `output`
            # keeps the autograd graph intact so backward() stays valid.
            loss = output.sum() * 0
        else:
            loss = criterion(output[mask], velocities[mask])

        # ones_like covers the case where `loss` is not a scalar (e.g. a
        # per-replica loss under data parallelism); for a scalar it is
        # equivalent to plain backward().
        loss.backward(torch.ones_like(loss))
        optimizer.step()

        if torch.cuda.device_count() > 1:
            # Collapse a per-replica loss to a scalar before logging.
            loss = loss.mean()

        running_loss += loss.item()

        # Periodic progress line.
        if (i % 1280 == 0):
            print('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}'
                  .format(epoch, epochs, i + 1, num_batches, loss.item()))

    # Criterion sums over samples, so dividing by the dataset size yields
    # the mean per-sample loss.
    train_loss = running_loss / len(dataset)

    return train_loss
|
|
|
|
|
|
|
|
def validate(model, dataloader, dataset, device, criterion):
    """Evaluate `model` on a dataset and return the average per-sample loss.

    Mirrors train() — same batch keys, same zero-velocity masking, same
    sum/len normalization — but performs no optimizer updates, and runs
    under torch.no_grad() so no autograd graph is built (saves memory and
    time; the computed loss values are unchanged).

    Args:
        model: network called as model(scan_maps, semantic_maps, sub_goals).
        dataloader: yields batch dicts with keys 'scan_map', 'semantic_map',
            'sub_goal' and 'velocity'.
        dataset: underlying dataset; its length normalizes the summed loss.
        device: torch device the batch tensors are moved to.
        criterion: loss function (nn.MSELoss(reduction='sum') in main()).

    Returns:
        Accumulated loss divided by len(dataset).
    """
    model.eval()

    running_loss = 0
    num_batches = int(len(dataset) / dataloader.batch_size)

    # Validation needs no gradients.
    with torch.no_grad():
        for i, batch in tqdm(enumerate(dataloader), total=num_batches):
            scan_maps = batch['scan_map'].to(device)
            semantic_maps = batch['semantic_map'].to(device)
            sub_goals = batch['sub_goal'].to(device)
            velocities = batch['velocity'].to(device)

            output = model(scan_maps, semantic_maps, sub_goals)

            # Skip samples whose target velocity is all-zero, matching train().
            mask = (velocities != 0).any(dim=1)
            if mask.sum() == 0:
                loss = output.sum() * 0
            else:
                loss = criterion(output[mask], velocities[mask])

            if torch.cuda.device_count() > 1:
                # Collapse a per-replica loss to a scalar before logging.
                loss = loss.mean()

            running_loss += loss.item()

    # Mean per-sample loss (criterion sums over samples).
    val_loss = running_loss / len(dataset)

    return val_loss
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(argv):
    """Entry point: train SemanticCNN from scratch or resume from a checkpoint.

    Args:
        argv: command-line arguments without the program name:
            MDL_PATH  - checkpoint file to load/save,
            TRAIN_PATH - training data location,
            DEV_PATH   - validation (dev) data location.

    Returns:
        True on completion.
    """
    # Require exactly NUM_ARGS (3) positional arguments.
    if(len(argv) != NUM_ARGS):
        print("usage: python nedc_train_mdl.py [MDL_PATH] [TRAIN_PATH] [DEV_PATH]")
        exit(-1)

    mdl_path = argv[0]
    pTrain = argv[1]
    pDev = argv[2]

    # Ensure the checkpoint's output directory exists.
    odir = os.path.dirname(mdl_path)

    if not os.path.exists(odir):
        os.makedirs(odir)

    # Prefer GPU when available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print('...Start reading data...')

    # NavDataset comes from `from model import *` (model.py not visible here).
    train_dataset = NavDataset(pTrain, 'train')
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, \
                            shuffle=True, drop_last=True, pin_memory=True)

    dev_dataset = NavDataset(pDev, 'dev')
    dev_dataloader = torch.utils.data.DataLoader(dev_dataset, batch_size=BATCH_SIZE, \
                            shuffle=True, drop_last=True, pin_memory=True)

    print('...Finish reading data...')

    # SemanticCNN / Bottleneck come from `from model import *`.
    model = SemanticCNN(Bottleneck, [2, 1, 1])

    model.to(device)

    # Adam hyper-parameters, keyed by the string constants defined above.
    opt_params = { LEARNING_RATE: 0.001,
                   BETAS: (.9,0.999),
                   EPS: 1e-08,
                   WEIGHT_DECAY: .001 }

    # Sum-reduced MSE: train()/validate() divide by the dataset size.
    criterion = nn.MSELoss(reduction='sum')
    criterion.to(device)

    optimizer = Adam(model.parameters(), **opt_params)

    epochs = NUM_EPOCHS

    # Resume from an existing checkpoint at mdl_path, if any.
    # Note: the state dict is loaded BEFORE the optional DataParallel wrap,
    # matching how checkpoints are saved (model.module.state_dict()).
    if os.path.exists(mdl_path):
        checkpoint = torch.load(mdl_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('Load epoch {} success'.format(start_epoch))
    else:
        start_epoch = 0
        print('No trained models, restart training')

    # Wrap in DataParallel when several GPUs are present.
    # NOTE(review): message says "2 of total" but DataParallel uses all
    # visible GPUs by default — confirm intent.
    if torch.cuda.device_count() > 1:
        print("Let's use 2 of total", torch.cuda.device_count(), "GPUs!")

        model = nn.DataParallel(model)

    model.to(device)

    # TensorBoard logging under ./runs.
    writer = SummaryWriter('runs')

    epoch_num = 0
    for epoch in range(start_epoch+1, epochs):

        # Step-decay schedule (see adjust_learning_rate).
        adjust_learning_rate(optimizer, epoch)

        train_epoch_loss = train(
            model, train_dataloader, train_dataset, device, optimizer, criterion, epoch, epochs
        )

        valid_epoch_loss = validate(
            model, dev_dataloader, dev_dataset, device, criterion
        )

        writer.add_scalar('training loss',
                          train_epoch_loss,
                          epoch)
        writer.add_scalar('validation loss',
                          valid_epoch_loss,
                          epoch)

        print('Train set: Average loss: {:.4f}'.format(train_epoch_loss))
        print('Validation set: Average loss: {:.4f}'.format(valid_epoch_loss))

        # Periodic checkpoint every 50 epochs.
        # NOTE(review): path is hard-coded to ./model/ rather than odir
        # (derived from mdl_path) — confirm this is intentional.
        if(epoch % 50 == 0):
            if torch.cuda.device_count() > 1:
                # Unwrap DataParallel so the state dict loads on a bare model.
                state = {'model':model.module.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch}
            else:
                state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch}
            path='./model/model' + str(epoch) +'.pth'
            torch.save(state, path)

        epoch_num = epoch

    # Final checkpoint written to the user-supplied mdl_path.
    if torch.cuda.device_count() > 1:
        state = {'model':model.module.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch_num}
    else:
        state = {'model':model.state_dict(), 'optimizer':optimizer.state_dict(), 'epoch':epoch_num}
    torch.save(state, mdl_path)

    return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: strip the program name and hand the rest to main().
if __name__ == '__main__':
    main(sys.argv[1:])
|
|
|
|
|
|
|
|
|