# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from torch.optim.optimizer import Optimizer

__all__ = ['Novograd']


def _check_valid_opt_params(lr, eps, betas):
    if lr < 0:
        raise ValueError(f"Invalid learning rate: {lr}")
    if eps < 0:
        raise ValueError(f"Invalid epsilon value: {eps}")
    if not (0.0 <= betas[0] < 1.0 and 0.0 <= betas[1] < 1.0):
        raise ValueError(f"Betas have to be between 0 and 1: {betas}")


class Novograd(Optimizer):
    """Implements the Novograd algorithm.

    It has been proposed in "Stochastic Gradient Methods with Layer-wise
    Adaptive Moments for Training of Deep Networks"
    (https://arxiv.org/abs/1905.11286).

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.95, 0.98))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        grad_averaging (bool, optional): whether to scale the normalized
            gradient by (1 - beta1) before adding it to the first-moment
            average (default: False)
        amsgrad (bool, optional): whether to use the AMSGrad variant of this
            algorithm from the paper "On the Convergence of Adam and Beyond"
            (default: False)
        luc (bool, optional): whether to clip each update so that its norm is
            at most luc_trust times the parameter norm (default: False)
        luc_trust (float, optional): trust coefficient used when luc is
            enabled (default: 1e-3)
        luc_eps (float, optional): term added to the update norm for numerical
            stability when luc is enabled (default: 1e-8)
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.95, 0.98),
        eps=1e-8,
        weight_decay=0,
        grad_averaging=False,
        amsgrad=False,
        luc=False,
        luc_trust=1e-3,
        luc_eps=1e-8,
    ):
        _check_valid_opt_params(lr, eps, betas)
        defaults = dict(
            lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, grad_averaging=grad_averaging, amsgrad=amsgrad,
        )
        self.luc = luc
        self.luc_trust = luc_trust
        self.luc_eps = luc_eps
        super(Novograd, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Novograd, self).__setstate__(state)
        for group in self.param_groups:
            group.setdefault("amsgrad", False)

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError("Sparse gradients are not supported.")
                amsgrad = group["amsgrad"]
                state = self.state[p]

                # State initialization
                if not state:
                    state["step"] = 0
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros([]).to(state["exp_avg"].device)
                    if amsgrad:
                        # Maintains max of all exp moving avg of squared grad
                        state["max_exp_avg_sq"] = torch.zeros([]).to(state["exp_avg"].device)

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
                if amsgrad:
                    max_exp_avg_sq = state["max_exp_avg_sq"]
                beta1, beta2 = group["betas"]

                state["step"] += 1
                norm = grad.norm().pow(2)

                if exp_avg_sq == 0:
                    exp_avg_sq.copy_(norm)
                else:
                    exp_avg_sq.mul_(beta2).add_(norm, alpha=1.0 - beta2)

                if amsgrad:
                    # Maintains max of all 2nd moment running avg till now
                    torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
                    # Use the max for normalizing running avg. of gradient
                    denom = max_exp_avg_sq.sqrt().add_(group["eps"])
                else:
                    denom = exp_avg_sq.sqrt().add_(group["eps"])

                grad.div_(denom)
                if group["weight_decay"] != 0:
                    grad.add_(p.data, alpha=group["weight_decay"])
                if group["grad_averaging"]:
                    grad.mul_(1 - beta1)
                exp_avg.mul_(beta1).add_(grad)

                if self.luc:
                    # Clip update so that updates are less than eta*weights
                    data_norm = torch.norm(p.data)
                    grad_norm = torch.norm(exp_avg.data)
                    luc_factor = self.luc_trust * data_norm / (grad_norm + self.luc_eps)
                    luc_factor = min(luc_factor, group["lr"])
                    p.data.add_(exp_avg, alpha=-luc_factor)
                else:
                    p.data.add_(exp_avg, alpha=-group["lr"])

        return loss
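

# Minimal usage sketch (illustrative). It exercises only the standard
# torch.optim.Optimizer interface: build Novograd over a model's parameters,
# then call step() after backward(). The tiny linear model, loss, and random
# data below are placeholders chosen for the example, not anything this
# module requires.
if __name__ == "__main__":
    model = torch.nn.Linear(4, 1)
    optimizer = Novograd(
        model.parameters(), lr=1e-3, betas=(0.95, 0.98), weight_decay=1e-4, amsgrad=False,
    )
    criterion = torch.nn.MSELoss()

    inputs = torch.randn(8, 4)
    targets = torch.randn(8, 1)

    for _ in range(3):
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()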