Spaces:

dongyi
/

MMFS

Sleeping

MMFS / models /base_model.py

limoran

add basic files

7e2a2a5 about 1 year ago

15.9 kB

	import os
	import torch
	from collections import OrderedDict
	from abc import ABC, abstractmethod
	from models.modules import networks
	from utils.util import check_path
	from utils.net_size import calc_computation


	class BaseModel(ABC):
	"""This class is an abstract base class (ABC) for models.
	To create a subclass, you need to implement the following five functions:
	-- <__init__>: initialize the class; first call BaseModel.__init__(self, opt).
	-- <set_input>: unpack data from dataset and apply preprocessing.
	-- <forward>: produce intermediate results.
	-- <optimize_parameters>: calculate losses, gradients, and update network weights.
	-- <modify_commandline_options>: (optionally) add model-specific options and set default options.
	"""

	def __init__(self, config, DDP_device=None):
	"""Initialize the BaseModel class.

	Parameters:
	opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions

	When creating your custom class, you need to implement your own initialization.
	In this function, you should first call <BaseModel.__init__(self, opt)>
	Then, you need to define four lists:
	-- self.loss_names (str list): specify the training losses that you want to plot and save.
	-- self.model_names (str list): define networks used in our training.
	-- self.visual_names (str list): specify the images that you want to display and save.
	-- self.optimizers (optimizer list): define and initialize optimizers. You can define one optimizer for each network. If two networks are updated at the same time, you can use itertools.chain to group them. See cycle_gan_model.py for an example.
	"""
	self.config = config
	self.gpu_ids = config['common']['gpu_ids']
	self.isTrain = config['common']['phase'] == 'train'
	if DDP_device is None:
	self.device = torch.device('cuda:{}'.format(self.gpu_ids[0])) if self.gpu_ids else torch.device('cpu') # get device name: CPU or GPU
	self.DDP_device = None
	self.on_cpu = (self.device.type == 'cpu')
	else:
	self.device = DDP_device
	self.DDP_device = DDP_device
	self.on_cpu = False
	self.save_dir = os.path.join(config['training']['checkpoints_dir'], config['common']['name']) # save all the checkpoints to save_dir
	if config['dataset']['preprocess'] != 'scale_width': # with [scale_width], input images might have different sizes, which hurts the performance of cudnn.benchmark.
	torch.backends.cudnn.benchmark = True
	self.loss_names = []
	self.model_names = []
	self.visual_names = []
	self.optimizers = []
	self.image_paths = []
	self.metric = 0 # used for learning rate policy 'plateau'
	self.curr_epoch = 0
	self.total_iters = 0
	self.best_val_loss = 999999

	@abstractmethod
	def set_input(self, input):
	"""Unpack input data from the dataloader and perform necessary pre-processing steps.

	Parameters:
	input (dict): includes the data itself and its metadata information.
	"""
	pass

	@abstractmethod
	def forward(self):
	"""Run forward pass; called by both functions <configimize_parameters> and <test>."""
	pass

	@abstractmethod
	def trace_jit(self, input):
	"""trace torchscript model for C++. Called by <trace_jit.py>"""
	pass

	@abstractmethod
	def optimize_parameters(self):
	"""Calculate losses, gradients, and update network weights; called in every training iteration"""
	pass

	@abstractmethod
	def eval_step(self):
	"""Forward and backward pass but without upgrading weights; called in every validation iteration"""
	pass

	def setup(self, config, DDP_device=None):
	"""Load and print networks; create schedulers

	Parameters:
	opt (Option class) -- stores all the experiment flags; needs to be a subclass of BaseOptions
	"""
	if self.isTrain:
	self.schedulers = [networks.get_scheduler(optimizer, config) for optimizer in self.optimizers]
	if not self.isTrain:
	load_suffix = '{}'.format(config['testing']['which_epoch'])
	self.load_networks(load_suffix)
	elif config['training']['continue_train']:
	load_suffix = '{}'.format(config['training']['which_epoch'])
	self.load_networks(load_suffix)
	self.print_networks(config['common']['verbose'], DDP_device=DDP_device)

	def eval(self):
	"""Make models eval mode during test time"""
	for name in self.model_names:
	if isinstance(name, str):
	net = getattr(self, 'net' + name)
	net.eval()

	def train(self):
	"""Make models train mode during train time"""
	for name in self.model_names:
	if isinstance(name, str):
	net = getattr(self, 'net' + name)
	net.train()

	def test(self):
	"""Forward function used in test time.

	This function wraps <forward> function in no_grad() so we don't save intermediate steps for backprop
	It also calls <compute_visuals> to produce additional visualization results
	"""
	with torch.no_grad():
	self.forward()
	self.compute_visuals()

	def compute_visuals(self):
	"""Calculate additional output images for visdom and HTML visualization"""
	pass

	def get_image_paths(self):
	""" Return image paths that are used to load current data"""
	return self.image_paths

	def update_learning_rate(self):
	"""Update learning rates for all the networks; called at the end of every epoch"""
	for scheduler in self.schedulers:
	if self.config['training']['lr_policy'] == 'plateau':
	scheduler.step(self.metric, epoch=self.curr_epoch)
	else:
	scheduler.step(epoch=self.curr_epoch)

	# lr = self.optimizers[0].param_groups[0]['lr']
	# print('learning rate = %.7f' % lr)

	def get_current_visuals(self):
	"""Return visualization images. train.py will display these images with visdom, and save the images to a HTML"""
	if not self.isTrain and len(self.config['testing']['visual_names']) > 0:
	visual_names = list(set(self.visual_names).intersection(set(self.config['testing']['visual_names'])))
	else:
	visual_names = self.visual_names
	visual_ret = OrderedDict()
	for name in visual_names:
	if isinstance(name, str):
	visual_ret[name] = getattr(self, name)
	return visual_ret

	def get_current_losses(self):
	"""Return traning losses / errors. train.py will print out these errors on console, and save them to a file"""
	errors_ret = OrderedDict()
	for name in self.loss_names:
	if isinstance(name, str):
	errors_ret[name] = float(getattr(self, 'loss_' + name)) # float(...) works for both scalar tensor and float number
	return errors_ret

	def save_networks(self, epoch, val_loss=None):
	"""Save all the networks to the disk.

	Parameters:
	epoch (int) -- current epoch; used in the file name '%s_net_%s.pth' % (epoch, name)
	"""
	check_path(self.save_dir)
	save_filename = 'epoch_%s.pth' % epoch if val_loss is None else 'best_val_epoch.pth'
	checkpoint = {}
	# save all the models
	for name in self.model_names:
	if isinstance(name, str):
	net = getattr(self, 'net' + name)
	if len(self.gpu_ids) > 0 and torch.cuda.is_available():
	# if use DDP, save only on rank 0. If using dataparallel, second condition meets.
	if self.DDP_device == 0 or self.DDP_device is None:
	checkpoint[name+'_model'] = net.module.state_dict()
	else:
	checkpoint[name+'_model'] = net.state_dict()

	# save all the optimizers
	optimizer_index = 0
	for optimizer in self.optimizers:
	checkpoint['optimizer_'+str(optimizer_index)] = optimizer.state_dict()
	optimizer_index += 1

	# save all the schedulers
	scheduler_index = 0
	for scheduler in self.schedulers:
	checkpoint['scheduler_' + str(scheduler_index)] = scheduler.state_dict()
	scheduler_index += 1

	# save other information
	checkpoint['epoch'] = self.curr_epoch
	checkpoint['total_iters'] = self.total_iters
	checkpoint['metric'] = self.metric
	if val_loss is not None:
	checkpoint['best_val_loss'] = val_loss

	torch.save(checkpoint, os.path.join(self.save_dir, save_filename))

	def __patch_instance_norm_state_dict(self, state_dict, module, keys, i=0):
	"""Fix InstanceNorm checkpoints incompatibility (prior to 0.4)"""
	key = keys[i]
	if i + 1 == len(keys): # at the end, pointing to a parameter/buffer
	if module.__class__.__name__.startswith('InstanceNorm') and \
	(key == 'running_mean' or key == 'running_var'):
	if getattr(module, key) is None:
	state_dict.pop('.'.join(keys))
	if module.__class__.__name__.startswith('InstanceNorm') and \
	(key == 'num_batches_tracked'):
	state_dict.pop('.'.join(keys))
	else:
	self.__patch_instance_norm_state_dict(state_dict, getattr(module, key), keys, i + 1)

	def load_networks(self, epoch, ckpt=None):
	"""Load all the networks from the disk.

	Parameters:
	epoch (str) -- current epoch; used in the file name 'epoch_%s.pth' % epoch. Models in the old format
	with the names '%s_net_%s.pth' % (epoch, name) are also supported. Models in the new format takes priority.
	"""
	load_filename = 'epoch_%s.pth' % epoch
	if ckpt is None:
	final_load_path = os.path.join(self.save_dir, load_filename)
	else:
	final_load_path = ckpt
	if os.path.exists(final_load_path):
	# new checkpoint format.
	print('loading the model in new format from %s' % final_load_path)
	if self.DDP_device is not None:
	# unpack the tensors on GPU 0, then transfer to whatever device it needs to be on
	map_location = {'cuda:%d' % 0: 'cuda:%d' % self.DDP_device}
	checkpoint = torch.load(final_load_path, map_location=map_location)
	else:
	checkpoint = torch.load(final_load_path)
	for k, v in checkpoint.items():
	# load models
	if 'model' in k:
	name = k.split('_model')[0]
	if not self.isTrain and 'D' in name: # does not load discriminator when not training
	continue
	if not hasattr(self, 'net' + name):
	continue
	net = getattr(self, 'net' + name)
	if isinstance(net, torch.nn.DataParallel) or isinstance(net, torch.nn.parallel.DistributedDataParallel):
	net = net.module

	# if you are using PyTorch newer than 0.4 (e.g., built from
	# GitHub source), you can remove str() on self.device
	if hasattr(v, '_metadata'):
	del v._metadata

	# patch InstanceNorm checkpoints prior to 0.4
	for key in list(v.keys()): # need to copy keys here because we mutate in loop
	self.__patch_instance_norm_state_dict(v, net, key.split('.'))
	net.load_state_dict(v)
	# load optimizers
	elif 'optimizer' in k:
	if not self.isTrain:
	continue
	index = int(k.split('_')[-1])
	self.optimizers[index].load_state_dict(v)
	# load schedulers
	elif 'scheduler' in k:
	if not self.isTrain:
	continue
	index = int(k.split('_')[-1])
	self.schedulers[index].load_state_dict(v)
	# load other stuffs
	elif k == 'epoch':
	self.curr_epoch = int(v) + 1
	elif k == 'total_iters':
	self.total_iters = int(v)
	elif k == 'metric':
	self.metric = float(v)
	elif k == 'best_val_loss':
	self.best_val_loss = float(v)
	else:
	print('Checkpoint load error. Unrecognized parameter saved in checkpoint: ', k)
	return

	# old checkpoint format.
	for name in self.model_names:
	if isinstance(name, str):
	load_filename = '%s_net_%s.pth' % (epoch, name)
	load_path = os.path.join(self.save_dir, load_filename)
	net = getattr(self, 'net' + name)
	if isinstance(net, torch.nn.DataParallel) or isinstance(net, torch.nn.parallel.DistributedDataParallel):
	net = net.module
	print('loading the model from %s' % load_path)
	# if you are using PyTorch newer than 0.4 (e.g., built from
	# GitHub source), you can remove str() on self.device
	state_dict = torch.load(load_path, map_location=str(self.device))
	if hasattr(state_dict, '_metadata'):
	del state_dict._metadata

	# patch InstanceNorm checkpoints prior to 0.4
	for key in list(state_dict.keys()): # need to copy keys here because we mutate in loop
	self.__patch_instance_norm_state_dict(state_dict, net, key.split('.'))
	net.load_state_dict(state_dict)

	def print_networks(self, verbose, DDP_device=None):
	"""Print the total number of parameters in the network and (if verbose) network architecture

	Parameters:
	verbose (bool) -- if verbose: print the network architecture
	"""
	if DDP_device is None or DDP_device == 0:
	print('---------- Networks initialized -------------')
	for name in self.model_names:
	if isinstance(name, str):
	net = getattr(self, 'net' + name)
	num_params = 0
	for param in net.parameters():
	num_params += param.numel()
	if verbose:
	print(net)
	print('[Network %s] Total number of parameters : %.3f M' % (name, num_params / 1e6))
	if 'G' in name:
	calc_computation(net, self.config['model']['input_nc'], self.config['dataset']['crop_size'],self.config['dataset']['crop_size'], DDP_device=DDP_device)
	print('-----------------------------------------------')

	def set_requires_grad(self, nets, requires_grad=False):
	"""Set requies_grad=Fasle for all the networks to avoid unnecessary computations
	Parameters:
	nets (network list) -- a list of networks
	requires_grad (bool) -- whether the networks require gradients or not
	"""
	if not isinstance(nets, list):
	nets = [nets]
	for net in nets:
	if net is not None:
	for param in net.parameters():
	param.requires_grad = requires_grad