#!/usr/bin/env python
#
# file: $ISIP_EXP/SOGMP/scripts/train.py
#
# revision history: xzt
# 20220824 (TE): first version
#
# usage:
# python train.py mdir train_data val_data
#
# arguments:
# mdir: the directory where the output model is stored
# train_data: the directory of training data
# val_data: the directory of validation data
#
# This script trains a Semantic CNN model
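#
# example invocation (the paths below are illustrative only; note that the
# script treats the first argument as the output model file path):
#
#   python train.py ./model/semantic_cnn_model.pth ./train_data ./val_data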
#------------------------------------------------------------------------------
# import pytorch modules
#
import torch
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm
# visualize:
from tensorboardX import SummaryWriter
import numpy as np
# import the model and all of its variables/functions
#
from model import *
# import modules
#
import sys
import os
#-----------------------------------------------------------------------------
#
# global variables are listed here
#
#-----------------------------------------------------------------------------
# general global values
#
model_dir = './model/semantic_cnn_model.pth' # the path of model storage
NUM_ARGS = 3
NUM_EPOCHS = 4000
BATCH_SIZE = 64
LEARNING_RATE = "lr"
BETAS = "betas"
EPS = "eps"
WEIGHT_DECAY = "weight_decay"
# for reproducibility, we seed the rng
#
set_seed(SEED1)
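# note: set_seed() and SEED1 are not defined in this file; they are expected to
# be provided by model.py via the wildcard import above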
# adjust_learning_rate
# 
def adjust_learning_rate(optimizer, epoch):
    lr = 1e-3
    if epoch > 40:
        lr = 2e-4
    if epoch > 2000:
        lr = 2e-5
    if epoch > 21000:
        lr = 1e-5
    if epoch > 32984:
        lr = 1e-6
    if epoch > 48000:
        # lr = 5e-8
        lr = lr * (0.1 ** (epoch // 110000))
    # if epoch > 8300:
    #     lr = 1e-9
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
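# note: with NUM_EPOCHS = 4000 the training loop never exceeds epoch 3999, so
# only the first two thresholds above (40 and 2000) can take effect; the later
# branches appear to be leftovers from longer training runs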
# train function:
def train(model, dataloader, dataset, device, optimizer, criterion, epoch, epochs):
    ################################## Train #####################################
    # Set model to training mode
    model.train()
    # for each batch in increments of batch size
    #
    running_loss = 0
    counter = 0
    # get the number of batches (floor of len(dataset)/batch_size, since drop_last=True):
    num_batches = int(len(dataset)/dataloader.batch_size)
    for i, batch in tqdm(enumerate(dataloader), total=num_batches):
    #for i, batch in enumerate(dataloader, 0):
        counter += 1
        # collect the samples as a batch:
        scan_maps = batch['scan_map']
        scan_maps = scan_maps.to(device)
        semantic_maps = batch['semantic_map']
        semantic_maps = semantic_maps.to(device)
        sub_goals = batch['sub_goal']
        sub_goals = sub_goals.to(device)
        velocities = batch['velocity']
        velocities = velocities.to(device)
        # set all gradients to 0:
        optimizer.zero_grad()
        # feed the network the batch
        #
        output = model(scan_maps, semantic_maps, sub_goals)
        #writer.add_graph(model,[batch_ped_pos_t, batch_scan_t, batch_goal_t])
        # get the loss
        #
        # loss = criterion(output, velocities)
        # ---------------------------
        # Mask zero-velocity samples
        # ---------------------------
        # keep only samples with a nonzero target velocity; if a batch has none,
        # build a zero loss that still carries a graph so backward() stays safe
        mask = (velocities != 0).any(dim=1)  # (B,)
        if mask.sum() == 0:
            loss = output.sum() * 0  # safe zero loss
        else:
            loss = criterion(output[mask], velocities[mask])
        # perform back propagation:
        loss.backward(torch.ones_like(loss))
        optimizer.step()
        # get the loss:
        # multiple GPUs:
        if torch.cuda.device_count() > 1:
            loss = loss.mean()
        running_loss += loss.item()
        # display informational message
        #
        if (i % 1280 == 0):
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch, epochs, i + 1, num_batches, loss.item()))
    # mean per-sample loss (criterion uses reduction='sum'):
    train_loss = running_loss / len(dataset)  #counter
    return train_loss
# validate function:
def validate(model, dataloader, dataset, device, criterion):
    ################################## Validate ##################################
    # set model to evaluation mode:
    model.eval()
    # for each batch in increments of batch size
    #
    running_loss = 0
    counter = 0
    # get the number of batches (floor of len(dataset)/batch_size, since drop_last=True):
    num_batches = int(len(dataset)/dataloader.batch_size)
    for i, batch in tqdm(enumerate(dataloader), total=num_batches):
    #for i, batch in enumerate(dataloader, 0):
        counter += 1
        # collect the samples as a batch:
        scan_maps = batch['scan_map']
        scan_maps = scan_maps.to(device)
        semantic_maps = batch['semantic_map']
        semantic_maps = semantic_maps.to(device)
        sub_goals = batch['sub_goal']
        sub_goals = sub_goals.to(device)
        velocities = batch['velocity']
        velocities = velocities.to(device)
        # feed the network the batch
        #
        output = model(scan_maps, semantic_maps, sub_goals)
        #writer.add_graph(model,[batch_ped_pos_t, batch_scan_t, batch_goal_t])
        # get the loss
        #
        # loss = criterion(output, velocities)
        # ---------------------------
        # Mask zero-velocity samples (same as in train)
        # ---------------------------
        mask = (velocities != 0).any(dim=1)  # (B,)
        if mask.sum() == 0:
            loss = output.sum() * 0  # safe zero loss
        else:
            loss = criterion(output[mask], velocities[mask])
        # get the loss:
        # multiple GPUs:
        if torch.cuda.device_count() > 1:
            loss = loss.mean()
        running_loss += loss.item()
    # mean per-sample loss (criterion uses reduction='sum'):
    val_loss = running_loss / len(dataset)  #counter
    return val_loss
#------------------------------------------------------------------------------
#
# the main program starts here
#
#------------------------------------------------------------------------------
# function: main
#
# arguments: none
#
# return: none
#
# This method is the main function.
#
def main(argv):
    # ensure we have the correct number of arguments
    #
    #global cur_batch_win
    if (len(argv) != NUM_ARGS):
        print("usage: python train.py [MDL_PATH] [TRAIN_PATH] [DEV_PATH]")
        exit(-1)
    # define local variables
    #
    mdl_path = argv[0]
    pTrain = argv[1]
    pDev = argv[2]
    # get the output directory name
    #
    odir = os.path.dirname(mdl_path)
    # if the odir doesn't exist, we make it
    #
    if not os.path.exists(odir):
        os.makedirs(odir)
    # set the device to use GPU if available
    #
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    ### train:
    print('...Start reading data...')
    # get array of the data
    # data: [[0, 1, ... 26], [27, 28, ...] ...]
    # labels: [0, 0, 1, ...]
    #
    #[ped_pos_t, scan_t, goal_t, vel_t] = get_data(pTrain)
    train_dataset = NavDataset(pTrain, 'train')
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, \
                                                   shuffle=True, drop_last=True, pin_memory=True)
    #train_data = train_data - np.mean(train_data, axis=0)
    ### dev:
    # get array of the data
    # data: [[0, 1, ... 26], [27, 28, ...] ...]
    # labels: [0, 0, 1, ...]
    #
    #[ped_pos_d, scan_d, goal_d, vel_d] = get_data(pDev)
    dev_dataset = NavDataset(pDev, 'dev')
    dev_dataloader = torch.utils.data.DataLoader(dev_dataset, batch_size=BATCH_SIZE, \
                                                 shuffle=True, drop_last=True, pin_memory=True)
    #dev_data = dev_data - np.mean(dev_data, axis=0)
    print('...Finish reading data...')
    # instantiate a model
    #
    model = SemanticCNN(Bottleneck, [2, 1, 1])
    # move the model to the selected device (cpu or gpu)
    #
    model.to(device)
    # set the adam optimizer parameters
    # (the keys are the string constants defined at the top, so **opt_params
    #  maps directly onto Adam's keyword arguments)
    #
    opt_params = { LEARNING_RATE: 0.001,
                   BETAS: (0.9, 0.999),
                   EPS: 1e-08,
                   WEIGHT_DECAY: 0.001 }
    # set the loss and optimizer
    #
    criterion = nn.MSELoss(reduction='sum')
    criterion.to(device)
    # create an optimizer, and pass the model params to it
    #
    optimizer = Adam(model.parameters(), **opt_params)
    # get the number of epochs to train on
    #
    epochs = NUM_EPOCHS
    # if there is a trained model, continue training from it:
    if os.path.exists(mdl_path):
        checkpoint = torch.load(mdl_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('Load epoch {} success'.format(start_epoch))
    else:
        start_epoch = 0
        print('No trained model, restart training')
    # multiple GPUs:
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        model = nn.DataParallel(model)  #, device_ids=[0, 1])
    # move the (possibly DataParallel-wrapped) model to the selected device
    #
    model.to(device)
    # tensorboard writer:
    writer = SummaryWriter('runs')
    # for each epoch
    #
    #loss_train = []
    #loss_vector = []
    epoch_num = 0
    for epoch in range(start_epoch+1, epochs):
        # adjust learning rate:
        adjust_learning_rate(optimizer, epoch)
        ################################## Train #####################################
        # for each batch in increments of batch size
        #
        train_epoch_loss = train(
            model, train_dataloader, train_dataset, device, optimizer, criterion, epoch, epochs
        )
        ################################## Validate ##################################
        valid_epoch_loss = validate(
            model, dev_dataloader, dev_dataset, device, criterion
        )
        # log the epoch losses
        writer.add_scalar('training loss',
                          train_epoch_loss,
                          epoch)
        writer.add_scalar('validation loss',
                          valid_epoch_loss,
                          epoch)
        print('Train set: Average loss: {:.4f}'.format(train_epoch_loss))
        print('Validation set: Average loss: {:.4f}'.format(valid_epoch_loss))
        # save a checkpoint every 50 epochs
        #
        if (epoch % 50 == 0):
            if torch.cuda.device_count() > 1:  # multiple GPUs:
                state = {'model': model.module.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
            else:
                state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
            path = './model/model' + str(epoch) + '.pth'
            torch.save(state, path)
        epoch_num = epoch
    # save the final model
    if torch.cuda.device_count() > 1:  # multiple GPUs:
        state = {'model': model.module.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch_num}
    else:
        state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch_num}
    torch.save(state, mdl_path)
    # exit gracefully
    #
    return True
#
# end of function
# begin gracefully
#
if __name__ == '__main__':
    main(sys.argv[1:])
#
# end of file