gradio_deploy / aot /utils /checkpoint.py
Schrodingers's picture
Upload folder using huggingface_hub
ffbe0b4
import torch
import os
import shutil
import numpy as np
def load_network_and_optimizer(net, opt, pretrained_dir, gpu, scaler=None):
    """Restore model, optimizer and optional scaler state from a checkpoint.

    The checkpoint is expected to be a dict with a ``'state_dict'`` entry
    (model weights), an ``'optimizer'`` entry and, optionally, a
    ``'scaler'`` entry.

    Args:
        net: Model exposing ``state_dict()``, ``load_state_dict()`` and
            ``cuda()``.
        opt: Optimizer exposing ``load_state_dict()``.
        pretrained_dir: Path of the checkpoint file (despite the name, this
            is passed straight to ``torch.load``).
        gpu: CUDA device index; the checkpoint is mapped to ``cuda:<gpu>``
            and the model is moved there.
        scaler: Optional object exposing ``load_state_dict()``; restored
            only when the checkpoint contains a ``'scaler'`` entry.

    Returns:
        Tuple ``(net, opt, pretrained_dict_remove)`` where the last item
        lists every checkpoint key that could not be matched to the model
        and was therefore not loaded.
    """
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    pretrained = torch.load(pretrained_dir,
                            map_location=torch.device("cuda:" + str(gpu)))

    pretrained_dict = pretrained['state_dict']
    model_dict = net.state_dict()
    pretrained_dict_update = {}
    pretrained_dict_remove = []
    for k, v in pretrained_dict.items():
        if k in model_dict:
            pretrained_dict_update[k] = v
        elif k.startswith('module.') and k[7:] in model_dict:
            # Accept keys saved with a leading 'module.' prefix.
            pretrained_dict_update[k[7:]] = v
        else:
            # Report every key that is not loaded, including prefixed keys
            # whose stripped name is unknown to the model.
            pretrained_dict_remove.append(k)

    # Start from the model's own state so keys missing from the checkpoint
    # keep their current values.
    model_dict.update(pretrained_dict_update)
    net.load_state_dict(model_dict)

    opt.load_state_dict(pretrained['optimizer'])
    if scaler is not None and 'scaler' in pretrained:
        scaler.load_state_dict(pretrained['scaler'])

    # Drop the reference to the loaded checkpoint before returning.
    del pretrained
    return net.cuda(gpu), opt, pretrained_dict_remove
def load_network_and_optimizer_v2(net, opt, pretrained_dir, gpu, scaler=None):
    """Restore model, optimizer and optional scaler state, matching
    optimizer parameter groups by their ``'name'`` entry.

    Unlike a plain ``opt.load_state_dict``, saved parameter groups are
    matched to the current optimizer's groups through ``group['name']``;
    the first parameter id of each matched group is remapped to the id used
    by the current optimizer. Saved groups whose name is unknown to the
    current optimizer are skipped.

    Args:
        net: Model exposing ``state_dict()``, ``load_state_dict()`` and
            ``cuda()``.
        opt: Optimizer exposing ``state_dict()`` / ``load_state_dict()``;
            every parameter group must carry a ``'name'`` entry.
        pretrained_dir: Path of the checkpoint file passed to ``torch.load``.
        gpu: CUDA device index; the checkpoint is mapped to ``cuda:<gpu>``
            and the model is moved there.
        scaler: Optional object exposing ``load_state_dict()``; restored
            only when the checkpoint contains a ``'scaler'`` entry.

    Returns:
        Tuple ``(net, opt, pretrained_dict_remove)`` where the last item
        lists every checkpoint key that could not be matched to the model
        and was therefore not loaded.
    """
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    pretrained = torch.load(pretrained_dir,
                            map_location=torch.device("cuda:" + str(gpu)))

    # load model
    pretrained_dict = pretrained['state_dict']
    model_dict = net.state_dict()
    pretrained_dict_update = {}
    pretrained_dict_remove = []
    for k, v in pretrained_dict.items():
        if k in model_dict:
            pretrained_dict_update[k] = v
        elif k.startswith('module.') and k[7:] in model_dict:
            # Accept keys saved with a leading 'module.' prefix.
            pretrained_dict_update[k[7:]] = v
        else:
            # Report every key that is not loaded, including prefixed keys
            # whose stripped name is unknown to the model.
            pretrained_dict_remove.append(k)
    model_dict.update(pretrained_dict_update)
    net.load_state_dict(model_dict)

    # load optimizer
    opt_dict = opt.state_dict()
    # Map group name -> id of the group's first parameter in the current
    # optimizer (only the first parameter of each group is considered).
    all_params = {
        param_group['name']: param_group['params'][0]
        for param_group in opt_dict['param_groups']
    }
    saved_opt = pretrained['optimizer']
    saved_state = saved_opt['state']
    pretrained_opt_dict = {'state': {}, 'param_groups': []}
    for param_group in saved_opt['param_groups']:
        name = param_group['name']
        if name not in all_params:
            continue
        current_id = all_params[name]
        saved_id = param_group['params'][0]
        # A matched group may have no saved state; skip the state copy in
        # that case instead of raising KeyError.
        if saved_id in saved_state:
            pretrained_opt_dict['state'][current_id] = saved_state[saved_id]
        # The saved group is updated in place; the checkpoint dict is
        # discarded below, so this is not observable by the caller.
        param_group['params'][0] = current_id
        pretrained_opt_dict['param_groups'].append(param_group)

    # Replaces both 'state' and 'param_groups' with the remapped versions.
    opt_dict.update(pretrained_opt_dict)
    opt.load_state_dict(opt_dict)

    # load scaler
    if scaler is not None and 'scaler' in pretrained:
        scaler.load_state_dict(pretrained['scaler'])

    del pretrained
    return net.cuda(gpu), opt, pretrained_dict_remove
def load_network(net, pretrained_dir, gpu):
    """Load model weights from a checkpoint file into ``net``.

    The weights are taken from the checkpoint's ``'state_dict'`` entry if
    present, else from its ``'model'`` entry, else the loaded object itself
    is treated as the state dict.

    Args:
        net: Model exposing ``state_dict()``, ``load_state_dict()`` and
            ``cuda()``.
        pretrained_dir: Path of the checkpoint file passed to ``torch.load``.
        gpu: CUDA device index; the checkpoint is mapped to ``cuda:<gpu>``
            and the model is moved there.

    Returns:
        Tuple ``(net, pretrained_dict_remove)`` where the last item lists
        every checkpoint key that could not be matched to the model and was
        therefore not loaded.
    """
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    pretrained = torch.load(pretrained_dir,
                            map_location=torch.device("cuda:" + str(gpu)))
    if 'state_dict' in pretrained:
        pretrained_dict = pretrained['state_dict']
    elif 'model' in pretrained:
        pretrained_dict = pretrained['model']
    else:
        pretrained_dict = pretrained

    model_dict = net.state_dict()
    pretrained_dict_update = {}
    pretrained_dict_remove = []
    for k, v in pretrained_dict.items():
        if k in model_dict:
            pretrained_dict_update[k] = v
        elif k.startswith('module.') and k[7:] in model_dict:
            # Accept keys saved with a leading 'module.' prefix.
            pretrained_dict_update[k[7:]] = v
        else:
            # Report every key that is not loaded, including prefixed keys
            # whose stripped name is unknown to the model.
            pretrained_dict_remove.append(k)

    # Start from the model's own state so keys missing from the checkpoint
    # keep their current values.
    model_dict.update(pretrained_dict_update)
    net.load_state_dict(model_dict)

    del pretrained
    return net.cuda(gpu), pretrained_dict_remove
def save_network(net,
                 opt,
                 step,
                 save_path,
                 max_keep=8,
                 backup_dir='./saved_models',
                 scaler=None):
    """Save a checkpoint and prune old ones in the same directory.

    The checkpoint is a dict with ``'state_dict'`` (model), ``'optimizer'``
    and, when ``scaler`` is given, ``'scaler'`` entries, written to
    ``<save_path>/save_step_<step>.pth``. If writing there fails, the
    checkpoint is written to ``backup_dir`` instead and pruning is applied
    to ``backup_dir``.

    Args:
        net: Model exposing ``state_dict()``.
        opt: Optimizer exposing ``state_dict()``.
        step: Step identifier embedded in the file name.
        save_path: Preferred output directory (created if missing).
        max_keep: Maximum number of ``save_step_<int>.pth`` files to keep;
            those with the smallest step numbers are deleted first. A
            non-positive value disables pruning.
        backup_dir: Fallback output directory.
        scaler: Optional object exposing ``state_dict()``.

    Raises:
        Exception: Whatever ``os.makedirs`` / ``torch.save`` raise if the
            fallback write to ``backup_dir`` fails as well.
    """
    prefix = 'save_step_'
    suffix = '.pth'

    ckpt = {'state_dict': net.state_dict(), 'optimizer': opt.state_dict()}
    if scaler is not None:
        ckpt['scaler'] = scaler.state_dict()

    save_file = 'save_step_%s.pth' % (step)
    try:
        os.makedirs(save_path, exist_ok=True)
        torch.save(ckpt, os.path.join(save_path, save_file))
    except Exception as err:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit are not swallowed.
        print("Failed to save checkpoint to {} ({}); using {}.".format(
            save_path, err, backup_dir))
        save_path = backup_dir
        os.makedirs(save_path, exist_ok=True)
        torch.save(ckpt, os.path.join(save_path, save_file))

    if max_keep <= 0:
        return

    # Collect (step, file name) for files that follow the checkpoint naming
    # scheme; anything else in the directory is left alone.
    saved = []
    for ckpt_name in os.listdir(save_path):
        if not (ckpt_name.startswith(prefix) and ckpt_name.endswith(suffix)):
            continue
        try:
            saved_step = int(ckpt_name[len(prefix):-len(suffix)])
        except ValueError:
            continue
        saved.append((saved_step, ckpt_name))

    if len(saved) > max_keep:
        saved.sort()
        for _, ckpt_name in saved[:-max_keep]:
            ckpt_path = os.path.join(save_path, ckpt_name)
            try:
                os.remove(ckpt_path)
            except OSError as err:
                # Pruning is best effort; a failed delete must not abort
                # the caller.
                print("Failed to remove {}: {}".format(ckpt_path, err))
def cp_ckpt(remote_dir="data_wd/youtube_vos_jobs/result", curr_dir="backup"):
    """Copy checkpoint files from a local backup tree to ``remote_dir``.

    Walks ``<curr_dir>/<exp>/<stage>/{ema_ckpt,ckpt}/`` and copies every
    file whose name contains ``'.pth'`` to the same relative location under
    ``remote_dir``, replacing an existing destination file. Destination
    directories are not created. Entries that are not directories, and
    stages lacking an ``ema_ckpt`` or ``ckpt`` folder, are skipped.

    Copying stops at the first file that cannot be copied; the failure is
    printed and the function returns.

    Args:
        remote_dir: Root directory the checkpoints are copied to.
        curr_dir: Root directory of the local backup tree.
    """
    finals = ["ema_ckpt", "ckpt"]
    for exp in os.listdir(curr_dir):
        exp_dir = os.path.join(curr_dir, exp)
        if not os.path.isdir(exp_dir):
            continue
        for stage in os.listdir(exp_dir):
            stage_dir = os.path.join(exp_dir, stage)
            if not os.path.isdir(stage_dir):
                continue
            for final in finals:
                final_dir = os.path.join(stage_dir, final)
                if not os.path.isdir(final_dir):
                    # A stage may have only one of the two folders.
                    continue
                for ckpt in os.listdir(final_dir):
                    if '.pth' not in ckpt:
                        continue
                    curr_ckpt_path = os.path.join(final_dir, ckpt)
                    remote_ckpt_path = os.path.join(remote_dir, exp, stage,
                                                    final, ckpt)
                    if os.path.exists(remote_ckpt_path):
                        try:
                            os.remove(remote_ckpt_path)
                        except OSError:
                            # Best effort: the copy below still runs and
                            # reports the problem if it cannot overwrite.
                            pass
                    try:
                        shutil.copy(curr_ckpt_path, remote_ckpt_path)
                    except OSError as err:
                        # Stop at the first failed copy, but say why.
                        print("Failed to copy {} to {}: {}".format(
                            curr_ckpt_path, remote_ckpt_path, err))
                        return
                    print("Copy {} to {}.".format(curr_ckpt_path,
                                                  remote_ckpt_path))