class ScheduledOptim:
    '''A simple wrapper class for learning rate scheduling'''

    def __init__(self, optimizer, d_model, init_lr, n_warmup_steps):
        assert n_warmup_steps > 0, 'n_warmup_steps must be greater than 0'
        self._optimizer = optimizer
        self.init_lr = init_lr
        self.d_model = d_model
        self.n_warmup_steps = n_warmup_steps
        self.n_steps = 0

    def step(self):
        "Step with the inner optimizer"
        self._update_learning_rate()
        self._optimizer.step()

    def zero_grad(self):
        "Zero out the gradients with the inner optimizer"
        self._optimizer.zero_grad()

    def _get_lr_scale(self):
        # Transformer ("Noam") schedule: linear warmup for n_warmup_steps,
        # then inverse-square-root decay, scaled by d_model ** -0.5.
        d_model = self.d_model
        n_steps, n_warmup_steps = self.n_steps, self.n_warmup_steps
        return (d_model ** -0.5) * min(n_steps ** (-0.5), n_steps * n_warmup_steps ** (-1.5))

    def state_dict(self):
        optimizer_state_dict = {
            'init_lr': self.init_lr,
            'd_model': self.d_model,
            'n_warmup_steps': self.n_warmup_steps,
            'n_steps': self.n_steps,
            '_optimizer': self._optimizer.state_dict(),
        }
        return optimizer_state_dict

    def load_state_dict(self, state_dict):
        self.init_lr = state_dict['init_lr']
        self.d_model = state_dict['d_model']
        self.n_warmup_steps = state_dict['n_warmup_steps']
        self.n_steps = state_dict['n_steps']
        self._optimizer.load_state_dict(state_dict['_optimizer'])

    def _update_learning_rate(self):
        '''Learning rate scheduling per step'''
        self.n_steps += 1
        # Same learning rate for every param group, so compute it once per step.
        lr = self.init_lr * self._get_lr_scale()
        self.lr = lr
        for param_group in self._optimizer.param_groups:
            param_group['lr'] = lr
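
# Minimal usage sketch (not part of the original module): wraps a standard
# torch.optim.Adam in ScheduledOptim and runs a few dummy steps to show the
# warmup behaviour. The toy nn.Linear model and the values d_model=512,
# init_lr=2.0, n_warmup_steps=4000, betas=(0.9, 0.98), eps=1e-9 are
# illustrative assumptions, not values taken from this file.
if __name__ == "__main__":
    import torch
    import torch.nn as nn
    import torch.optim as optim

    d_model = 512
    model = nn.Linear(d_model, d_model)  # stand-in for a real Transformer
    sched_optim = ScheduledOptim(
        optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-9),
        d_model=d_model,
        init_lr=2.0,
        n_warmup_steps=4000,
    )

    for step in range(1, 6):
        x = torch.randn(8, d_model)
        loss = model(x).pow(2).mean()

        sched_optim.zero_grad()
        loss.backward()
        sched_optim.step()  # updates the lr, then steps the inner Adam

        print(f"step {step}: lr = {sched_optim.lr:.3e}")

    # The wrapper (including the inner optimizer) can be checkpointed and restored:
    checkpoint = sched_optim.state_dict()
    sched_optim.load_state_dict(checkpoint)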