# gradio_deploy/aot/utils/learning.py

import math


def adjust_learning_rate(optimizer,
                         base_lr,
                         p,
                         itr,
                         max_itr,
                         restart=1,
                         warm_up_steps=1000,
                         is_cosine_decay=False,
                         min_lr=1e-5,
                         encoder_lr_ratio=1.0,
                         freeze_params=[]):
    """Update the learning rate of every param group in `optimizer`.

    The schedule is a linear warm-up from `min_lr` to `base_lr` over
    `warm_up_steps`, followed by either cosine decay or polynomial decay
    with exponent `p` back towards `min_lr`. With `restart > 1`, the
    schedule is repeated `restart` times within `max_itr`. When
    `encoder_lr_ratio != 1.0`, groups whose "name" contains "encoder."
    have their learning rate above `min_lr` scaled by that ratio. Groups
    whose "name" matches an entry in `freeze_params` get zero learning
    rate and weight decay. Returns the current base rate `now_lr`.
    """
    if restart > 1:
        each_max_itr = int(math.ceil(float(max_itr) / restart))
        itr = itr % each_max_itr
        warm_up_steps /= restart
        max_itr = each_max_itr

    # Linear warm-up from min_lr to base_lr.
    if itr < warm_up_steps:
        now_lr = min_lr + (base_lr - min_lr) * itr / warm_up_steps
    else:
        # Decay phase: cosine or polynomial annealing towards min_lr.
        itr = itr - warm_up_steps
        max_itr = max_itr - warm_up_steps
        if is_cosine_decay:
            now_lr = min_lr + (base_lr - min_lr) * (math.cos(math.pi * itr /
                                                             (max_itr + 1)) +
                                                    1.) * 0.5
        else:
            now_lr = min_lr + (base_lr - min_lr) * (1 - itr / (max_itr + 1))**p

    for param_group in optimizer.param_groups:
        if encoder_lr_ratio != 1.0 and "encoder." in param_group["name"]:
            param_group['lr'] = (now_lr - min_lr) * encoder_lr_ratio + min_lr
        else:
            param_group['lr'] = now_lr

        # Frozen groups keep zero learning rate and weight decay.
        for freeze_param in freeze_params:
            if freeze_param in param_group["name"]:
                param_group['lr'] = 0
                param_group['weight_decay'] = 0
                break

    return now_lr
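

# A minimal usage sketch (not part of the original module), assuming the
# optimizer was built from the named param groups returned by
# get_trainable_params below; the hyper-parameter values are placeholders:
#
#     for step in range(total_steps):
#         now_lr = adjust_learning_rate(optimizer,
#                                       base_lr=2e-4,
#                                       p=0.9,
#                                       itr=step,
#                                       max_itr=total_steps,
#                                       warm_up_steps=1000,
#                                       is_cosine_decay=True)
#         # ... forward / backward / optimizer.step()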


def get_trainable_params(model,
                         base_lr,
                         weight_decay,
                         use_frozen_bn=False,
                         exclusive_wd_dict={},
                         no_wd_keys=[]):
    """Build one optimizer param group per trainable parameter of `model`.

    Each group carries `base_lr`, the parameter's name (used by
    adjust_learning_rate above), and a weight-decay value: overridden by
    `exclusive_wd_dict` when the key matches, zeroed for biases and other
    1-D (normalization) parameters unless `use_frozen_bn` keeps decay on
    frozen BNs inside the encoder, and zeroed for multi-dimensional
    parameters whose key matches any entry in `no_wd_keys`. Also prints
    the total parameter count in millions.
    """
    params = []
    memo = set()
    total_param = 0
    for key, value in model.named_parameters():
        if value in memo:
            continue
        total_param += value.numel()
        if not value.requires_grad:
            continue
        memo.add(value)

        wd = weight_decay
        for exclusive_key in exclusive_wd_dict.keys():
            if exclusive_key in key:
                wd = exclusive_wd_dict[exclusive_key]
                break

        if len(value.shape) == 1:  # normalization layers
            if 'bias' in key:  # bias requires no weight decay
                wd = 0.
            elif not use_frozen_bn:  # if not using frozen BN, apply zero weight decay
                wd = 0.
            elif 'encoder.' not in key:  # if using frozen BN, apply weight decay only to frozen BNs in the encoder
                wd = 0.
        else:
            for no_wd_key in no_wd_keys:
                if no_wd_key in key:
                    wd = 0.
                    break

        params += [{
            "params": [value],
            "lr": base_lr,
            "weight_decay": wd,
            "name": key
        }]

    print('Total Param: {:.2f}M'.format(total_param / 1e6))
    return params
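

# A minimal usage sketch (assumption, not from the original repo): the
# returned groups can be passed directly to a torch optimizer, which keeps
# per-group "lr"/"weight_decay" and tolerates the extra "name" key:
#
#     import torch
#     params = get_trainable_params(model, base_lr=2e-4, weight_decay=0.07)
#     optimizer = torch.optim.AdamW(params, lr=2e-4, weight_decay=0.07)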


def freeze_params(module):
    """Disable gradients for every parameter of `module`."""
    for p in module.parameters():
        p.requires_grad = False
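

# A minimal usage sketch (assumption): freeze a sub-module before collecting
# param groups, so its parameters are skipped by get_trainable_params (which
# filters on requires_grad); `model.encoder` is a hypothetical attribute:
#
#     freeze_params(model.encoder)
#     params = get_trainable_params(model, base_lr=2e-4, weight_decay=0.07)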


def calculate_params(state_dict):
    """Print the total number of parameters (in millions) in `state_dict`."""
    memo = set()
    total_param = 0
    for key, value in state_dict.items():
        if value in memo:
            continue
        memo.add(value)
        total_param += value.numel()
    print('Total Param: {:.2f}M'.format(total_param / 1e6))
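

# A minimal usage sketch (assumption): counting the parameters stored in a
# checkpoint; the file name is a placeholder:
#
#     import torch
#     state_dict = torch.load('checkpoint.pth', map_location='cpu')
#     calculate_params(state_dict)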