vllm_fairseq / fairseq-0.10.2 /scripts /average_checkpoints.py

Add files using upload-large-folder tool

74f3e76 verified 11 months ago

6.02 kB

	#!/usr/bin/env python3
	# Copyright (c) Facebook, Inc. and its affiliates.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import argparse
	import collections
	import os
	import re

	import torch
	from fairseq.file_io import PathManager


	def average_checkpoints(inputs):
	"""Loads checkpoints from inputs and returns a model with averaged weights.

	Args:
	inputs: An iterable of string paths of checkpoints to load from.

	Returns:
	A dict of string keys mapping to various values. The 'model' key
	from the returned dict should correspond to an OrderedDict mapping
	string parameter names to torch Tensors.
	"""
	params_dict = collections.OrderedDict()
	params_keys = None
	new_state = None
	num_models = len(inputs)

	for fpath in inputs:
	with PathManager.open(fpath, "rb") as f:
	state = torch.load(
	f,
	map_location=(
	lambda s, _: torch.serialization.default_restore_location(s, "cpu")
	),
	)
	# Copies over the settings from the first checkpoint
	if new_state is None:
	new_state = state

	model_params = state["model"]

	model_params_keys = list(model_params.keys())
	if params_keys is None:
	params_keys = model_params_keys
	elif params_keys != model_params_keys:
	raise KeyError(
	"For checkpoint {}, expected list of params: {}, "
	"but found: {}".format(f, params_keys, model_params_keys)
	)

	for k in params_keys:
	p = model_params[k]
	if isinstance(p, torch.HalfTensor):
	p = p.float()
	if k not in params_dict:
	params_dict[k] = p.clone()
	# NOTE: clone() is needed in case of p is a shared parameter
	else:
	params_dict[k] += p

	averaged_params = collections.OrderedDict()
	for k, v in params_dict.items():
	averaged_params[k] = v
	if averaged_params[k].is_floating_point():
	averaged_params[k].div_(num_models)
	else:
	averaged_params[k] //= num_models
	new_state["model"] = averaged_params
	return new_state


	def last_n_checkpoints(paths, n, update_based, upper_bound=None):
	assert len(paths) == 1
	path = paths[0]
	if update_based:
	pt_regexp = re.compile(r"checkpoint_\d+_(\d+)\.pt")
	else:
	pt_regexp = re.compile(r"checkpoint(\d+)\.pt")
	files = PathManager.ls(path)

	entries = []
	for f in files:
	m = pt_regexp.fullmatch(f)
	if m is not None:
	sort_key = int(m.group(1))
	if upper_bound is None or sort_key <= upper_bound:
	entries.append((sort_key, m.group(0)))
	if len(entries) < n:
	raise Exception(
	"Found {} checkpoint files but need at least {}", len(entries), n
	)
	return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)[:n]]


	def main():
	parser = argparse.ArgumentParser(
	description="Tool to average the params of input checkpoints to "
	"produce a new checkpoint",
	)
	# fmt: off
	parser.add_argument('--inputs', required=True, nargs='+',
	help='Input checkpoint file paths.')
	parser.add_argument('--output', required=True, metavar='FILE',
	help='Write the new checkpoint containing the averaged weights to this path.')
	num_group = parser.add_mutually_exclusive_group()
	num_group.add_argument('--num-epoch-checkpoints', type=int,
	help='if set, will try to find checkpoints with names checkpoint_xx.pt in the path specified by input, '
	'and average last this many of them.')
	num_group.add_argument('--num-update-checkpoints', type=int,
	help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by input, '
	'and average last this many of them.')
	parser.add_argument('--checkpoint-upper-bound', type=int,
	help='when using --num-epoch-checkpoints, this will set an upper bound on which epoch to use, '
	'when using --num-update-checkpoints, this will set an upper bound on which update to use'
	'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.'
	'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would be averaged assuming --save-interval-updates 500'
	)
	# fmt: on
	args = parser.parse_args()
	print(args)

	num = None
	is_update_based = False
	if args.num_update_checkpoints is not None:
	num = args.num_update_checkpoints
	is_update_based = True
	elif args.num_epoch_checkpoints is not None:
	num = args.num_epoch_checkpoints

	assert args.checkpoint_upper_bound is None or (
	args.num_epoch_checkpoints is not None
	or args.num_update_checkpoints is not None
	), "--checkpoint-upper-bound requires --num-epoch-checkpoints or --num-update-checkpoints"
	assert (
	args.num_epoch_checkpoints is None or args.num_update_checkpoints is None
	), "Cannot combine --num-epoch-checkpoints and --num-update-checkpoints"

	if num is not None:
	args.inputs = last_n_checkpoints(
	args.inputs,
	num,
	is_update_based,
	upper_bound=args.checkpoint_upper_bound,
	)
	print("averaging checkpoints: ", args.inputs)

	new_state = average_checkpoints(args.inputs)
	with PathManager.open(args.output, "wb") as f:
	torch.save(new_state, f)
	print("Finished writing averaged checkpoint to {}".format(args.output))


	if __name__ == "__main__":
	main()