#!/usr/bin/env python
#
# file: $ISIP_EXP/SOGMP/scripts/train.py
#
# revision history: xzt
# 20220824 (TE): first version
#
# usage:
# python train.py mdir train_data val_data
#
# arguments:
# mdir: the directory where the output model is stored
# train_data: the directory of training data
# val_data: the directory of validation data
#
# This script trains a Semantic CNN model
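#
# example invocation (the paths below are illustrative only; note that the
# script treats the first argument as the output model file path):
#
#   python train.py ./model/semantic_cnn_model.pth ./train_data ./val_data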
#------------------------------------------------------------------------------
# import pytorch modules
#
import torch
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm
# visualize:
from tensorboardX import SummaryWriter
import numpy as np
# import the model and all of its variables/functions
#
from model import *
# import modules
#
import sys
import os
#-----------------------------------------------------------------------------
#
# global variables are listed here
#
#-----------------------------------------------------------------------------
# general global values
#
model_dir = './model/semantic_cnn_model.pth' # the path of model storage
NUM_ARGS = 3
NUM_EPOCHS = 4000
BATCH_SIZE = 64
LEARNING_RATE = "lr"
BETAS = "betas"
EPS = "eps"
WEIGHT_DECAY = "weight_decay"
# for reproducibility, we seed the rng
#
set_seed(SEED1)
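# note: set_seed() and SEED1 are not defined in this file; they are expected to
# be provided by model.py via the wildcard import above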
# adjust_learning_rate
# 
def adjust_learning_rate(optimizer, epoch):
    lr = 1e-3
    if epoch > 40:
        lr = 2e-4
    if epoch > 2000:
        lr = 2e-5
    if epoch > 21000:
        lr = 1e-5
    if epoch > 32984:
        lr = 1e-6
    if epoch > 48000:
        # lr = 5e-8
        lr = lr * (0.1 ** (epoch // 110000))
    # if epoch > 8300:
    #     lr = 1e-9
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
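# note: with NUM_EPOCHS = 4000 the training loop never exceeds epoch 3999, so
# only the first two thresholds above (40 and 2000) can take effect; the later
# branches appear to be leftovers from longer training runs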
# train function:
def train(model, dataloader, dataset, device, optimizer, criterion, epoch, epochs):
    ################################## Train #####################################
    # Set model to training mode
    model.train()
    # for each batch in increments of batch size
    #
    running_loss = 0
    counter = 0
    # get the number of batches (floor of len(dataset)/batch_size, since drop_last=True):
    num_batches = int(len(dataset)/dataloader.batch_size)
    for i, batch in tqdm(enumerate(dataloader), total=num_batches):
    #for i, batch in enumerate(dataloader, 0):
        counter += 1
        # collect the samples as a batch:
        scan_maps = batch['scan_map']
        scan_maps = scan_maps.to(device)
        semantic_maps = batch['semantic_map']
        semantic_maps = semantic_maps.to(device)
        sub_goals = batch['sub_goal']
        sub_goals = sub_goals.to(device)
        velocities = batch['velocity']
        velocities = velocities.to(device)
        # set all gradients to 0:
        optimizer.zero_grad()
        # feed the network the batch
        #
        output = model(scan_maps, semantic_maps, sub_goals)
        #writer.add_graph(model,[batch_ped_pos_t, batch_scan_t, batch_goal_t])
        # get the loss
        #
        # loss = criterion(output, velocities)
        # ---------------------------
        # Mask zero-velocity samples
        # ---------------------------
        # keep only samples with a nonzero target velocity; if a batch has none,
        # build a zero loss that still carries a graph so backward() stays safe
        mask = (velocities != 0).any(dim=1)  # (B,)
        if mask.sum() == 0:
            loss = output.sum() * 0  # safe zero loss
        else:
            loss = criterion(output[mask], velocities[mask])
        # perform back propagation:
        loss.backward(torch.ones_like(loss))
        optimizer.step()
        # get the loss:
        # multiple GPUs:
        if torch.cuda.device_count() > 1:
            loss = loss.mean()
        running_loss += loss.item()
        # display informational message
        #
        if (i % 1280 == 0):
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch, epochs, i + 1, num_batches, loss.item()))
    # mean per-sample loss (criterion uses reduction='sum'):
    train_loss = running_loss / len(dataset)  #counter
    return train_loss
# validate function:
def validate(model, dataloader, dataset, device, criterion):
    ################################## Validate ##################################
    # set model to evaluation mode:
    model.eval()
    # for each batch in increments of batch size
    #
    running_loss = 0
    counter = 0
    # get the number of batches (floor of len(dataset)/batch_size, since drop_last=True):
    num_batches = int(len(dataset)/dataloader.batch_size)
    for i, batch in tqdm(enumerate(dataloader), total=num_batches):
    #for i, batch in enumerate(dataloader, 0):
        counter += 1
        # collect the samples as a batch:
        scan_maps = batch['scan_map']
        scan_maps = scan_maps.to(device)
        semantic_maps = batch['semantic_map']
        semantic_maps = semantic_maps.to(device)
        sub_goals = batch['sub_goal']
        sub_goals = sub_goals.to(device)
        velocities = batch['velocity']
        velocities = velocities.to(device)
        # feed the network the batch
        #
        output = model(scan_maps, semantic_maps, sub_goals)
        #writer.add_graph(model,[batch_ped_pos_t, batch_scan_t, batch_goal_t])
        # get the loss
        #
        # loss = criterion(output, velocities)
        # ---------------------------
        # Mask zero-velocity samples (same as in train)
        # ---------------------------
        mask = (velocities != 0).any(dim=1)  # (B,)
        if mask.sum() == 0:
            loss = output.sum() * 0  # safe zero loss
        else:
            loss = criterion(output[mask], velocities[mask])
        # get the loss:
        # multiple GPUs:
        if torch.cuda.device_count() > 1:
            loss = loss.mean()
        running_loss += loss.item()
    # mean per-sample loss (criterion uses reduction='sum'):
    val_loss = running_loss / len(dataset)  #counter
    return val_loss
#------------------------------------------------------------------------------
#
# the main program starts here
#
#------------------------------------------------------------------------------
# function: main
#
# arguments: none
#
# return: none
#
# This method is the main function.
#
def main(argv):
    # ensure we have the correct number of arguments
    #
    #global cur_batch_win
    if (len(argv) != NUM_ARGS):
        print("usage: python train.py [MDL_PATH] [TRAIN_PATH] [DEV_PATH]")
        exit(-1)
    # define local variables
    #
    mdl_path = argv[0]
    pTrain = argv[1]
    pDev = argv[2]
    # get the output directory name
    #
    odir = os.path.dirname(mdl_path)
    # if the odir doesn't exist, we make it
    #
    if not os.path.exists(odir):
        os.makedirs(odir)
    # set the device to use GPU if available
    #
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    ### train:
    print('...Start reading data...')
    # get array of the data
    # data: [[0, 1, ... 26], [27, 28, ...] ...]
    # labels: [0, 0, 1, ...]
    #
    #[ped_pos_t, scan_t, goal_t, vel_t] = get_data(pTrain)
    train_dataset = NavDataset(pTrain, 'train')
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, \
                                                   shuffle=True, drop_last=True, pin_memory=True)
    #train_data = train_data - np.mean(train_data, axis=0)
    ### dev:
    # get array of the data
    # data: [[0, 1, ... 26], [27, 28, ...] ...]
    # labels: [0, 0, 1, ...]
    #
    #[ped_pos_d, scan_d, goal_d, vel_d] = get_data(pDev)
    dev_dataset = NavDataset(pDev, 'dev')
    dev_dataloader = torch.utils.data.DataLoader(dev_dataset, batch_size=BATCH_SIZE, \
                                                 shuffle=True, drop_last=True, pin_memory=True)
    #dev_data = dev_data - np.mean(dev_data, axis=0)
    print('...Finish reading data...')
    # instantiate a model
    #
    model = SemanticCNN(Bottleneck, [2, 1, 1])
    # move the model to the selected device (cpu or gpu)
    #
    model.to(device)
    # set the adam optimizer parameters
    # (the keys are the string constants defined at the top, so **opt_params
    #  maps directly onto Adam's keyword arguments)
    #
    opt_params = { LEARNING_RATE: 0.001,
                   BETAS: (0.9, 0.999),
                   EPS: 1e-08,
                   WEIGHT_DECAY: 0.001 }
    # set the loss and optimizer
    #
    criterion = nn.MSELoss(reduction='sum')
    criterion.to(device)
    # create an optimizer, and pass the model params to it
    #
    optimizer = Adam(model.parameters(), **opt_params)
    # get the number of epochs to train on
    #
    epochs = NUM_EPOCHS
    # if there is a trained model, continue training from it:
    if os.path.exists(mdl_path):
        checkpoint = torch.load(mdl_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch']
        print('Load epoch {} success'.format(start_epoch))
    else:
        start_epoch = 0
        print('No trained model, restart training')
    # multiple GPUs:
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        model = nn.DataParallel(model)  #, device_ids=[0, 1])
    # move the (possibly DataParallel-wrapped) model to the selected device
    #
    model.to(device)
    # tensorboard writer:
    writer = SummaryWriter('runs')
    # for each epoch
    #
    #loss_train = []
    #loss_vector = []
    epoch_num = 0
    for epoch in range(start_epoch+1, epochs):
        # adjust learning rate:
        adjust_learning_rate(optimizer, epoch)
        ################################## Train #####################################
        # for each batch in increments of batch size
        #
        train_epoch_loss = train(
            model, train_dataloader, train_dataset, device, optimizer, criterion, epoch, epochs
        )
        ################################## Validate ##################################
        valid_epoch_loss = validate(
            model, dev_dataloader, dev_dataset, device, criterion
        )
        # log the epoch losses
        writer.add_scalar('training loss',
                          train_epoch_loss,
                          epoch)
        writer.add_scalar('validation loss',
                          valid_epoch_loss,
                          epoch)
        print('Train set: Average loss: {:.4f}'.format(train_epoch_loss))
        print('Validation set: Average loss: {:.4f}'.format(valid_epoch_loss))
        # save a checkpoint every 50 epochs
        #
        if (epoch % 50 == 0):
            if torch.cuda.device_count() > 1:  # multiple GPUs:
                state = {'model': model.module.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
            else:
                state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch}
            path = './model/model' + str(epoch) + '.pth'
            torch.save(state, path)
        epoch_num = epoch
    # save the final model
    if torch.cuda.device_count() > 1:  # multiple GPUs:
        state = {'model': model.module.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch_num}
    else:
        state = {'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': epoch_num}
    torch.save(state, mdl_path)
    # exit gracefully
    #
    return True
#
# end of function
# begin gracefully
#
if __name__ == '__main__':
    main(sys.argv[1:])
#
# end of file