Ahma-3B / EasyLM /jax_utils.py

aapot

Add easylm training code

5a63fc6 9 months ago

13 kB

	import os
	import math
	from typing import Any, Mapping, Text, Tuple, Union, NamedTuple
	from functools import partial
	import re
	import dataclasses
	import random
	from ml_collections import ConfigDict
	from ml_collections.config_dict.config_dict import placeholder

	import flax
	import jax
	import jax.numpy as jnp
	from jax.sharding import PartitionSpec as PS
	from jax.sharding import Mesh
	from jax.experimental import mesh_utils
	from jax.experimental.pjit import with_sharding_constraint as _with_sharding_constraint
	from jax.experimental.pjit import pjit
	from jax.interpreters import pxla
	import numpy as np
	from transformers import FlaxLogitsWarper


	class JaxRNG(object):
	    """Stateful convenience wrapper around a JAX PRNG key.

	    Every call splits the stored key, keeps the first piece as the new
	    internal state and hands out the remaining piece(s), so fresh keys can
	    be obtained without threading the key through the caller by hand.
	    """

	    @classmethod
	    def from_seed(cls, seed):
	        """Build a wrapper whose initial key is ``jax.random.PRNGKey(seed)``."""
	        return cls(jax.random.PRNGKey(seed))

	    def __init__(self, rng):
	        # Current key; overwritten on every __call__.
	        self.rng = rng

	    def __call__(self, keys=None):
	        """Advance the internal key and return fresh key(s).

	        * ``keys is None``: return a single new key.
	        * ``keys`` is an int: return a tuple with that many new keys.
	        * otherwise: ``keys`` is treated as a sized iterable of names and a
	          dict mapping each name to its own new key is returned.
	        """
	        if keys is None:
	            self.rng, fresh = jax.random.split(self.rng)
	            return fresh

	        wants_tuple = isinstance(keys, int)
	        count = keys if wants_tuple else len(keys)
	        # One extra split: element 0 becomes the next internal state.
	        pieces = jax.random.split(self.rng, num=count + 1)
	        self.rng = pieces[0]
	        handed_out = pieces[1:]
	        if wants_tuple:
	            return tuple(handed_out)
	        return dict(zip(keys, handed_out))


	class JaxDistributedConfig(object):
	    """Helper that holds the settings for, and starts, ``jax.distributed``."""

	    @staticmethod
	    def get_default_config(updates=None):
	        """Return the default ConfigDict, overlaid with ``updates`` if given.

	        All fields except ``initialize_jax_distributed`` are typed
	        placeholders (unset until the caller provides a value).
	        """
	        defaults = ConfigDict()
	        defaults.initialize_jax_distributed = False
	        defaults.coordinator_address = placeholder(str)
	        defaults.num_processes = placeholder(int)
	        defaults.process_id = placeholder(int)
	        # Comma-separated list of device ids, kept as a string here.
	        defaults.local_device_ids = placeholder(str)

	        if updates is None:
	            return defaults
	        defaults.update(ConfigDict(updates).copy_and_resolve_references())
	        return defaults

	    @classmethod
	    def initialize(cls, config):
	        """Call ``jax.distributed.initialize`` when the config asks for it.

	        Does nothing unless ``initialize_jax_distributed`` is truthy.
	        """
	        resolved = cls.get_default_config(config)
	        if not resolved.initialize_jax_distributed:
	            return

	        raw_ids = resolved.local_device_ids
	        if raw_ids is None:
	            device_ids = None
	        else:
	            # "0,1,2" -> [0, 1, 2]
	            device_ids = [int(x) for x in raw_ids.split(',')]

	        jax.distributed.initialize(
	            coordinator_address=resolved.coordinator_address,
	            num_processes=resolved.num_processes,
	            process_id=resolved.process_id,
	            local_device_ids=device_ids,
	        )


	class FlaxTemperatureLogitsWarper(FlaxLogitsWarper):
	    """JIT traceable version of FlaxLogitsWarper that performs temperature scaling."""

	    def __init__(self, temperature):
	        # Divisor applied to the scores in __call__.
	        self.temperature = temperature

	    def __call__(self, input_ids, scores, cur_len):
	        """Return ``scores`` divided by the temperature.

	        ``input_ids`` and ``cur_len`` are accepted for interface
	        compatibility and are not used.
	        """
	        # Lower-bound the temperature at 1e-8 to avoid dividing by zero.
	        # jnp.maximum is equivalent to a clip with only a lower bound and
	        # avoids the ``a_min`` keyword of jnp.clip, which newer JAX releases
	        # deprecate in favor of ``min``.
	        return scores / jnp.maximum(self.temperature, 1e-8)


	def make_shard_and_gather_fns(partition_specs, dtype_specs=None):
	    """Build matching pytrees of shard and gather functions.

	    Args:
	        partition_specs: pytree of partition specs; one shard function and
	            one gather function is created per leaf.
	        dtype_specs: ``None`` (no cast), a single float dtype (every float
	            tensor is cast to it), or a pytree parallel to
	            ``partition_specs`` whose leaves expose ``.dtype`` (each tensor
	            is cast to the dtype of its matching leaf).

	    Returns:
	        ``(shard_fns, gather_fns)``, two pytrees of one-argument callables.
	    """
	    float_dtypes = (jnp.bfloat16, jnp.float16, jnp.float32, jnp.float64)
	    # True when one float dtype should be applied to every float tensor.
	    single_float_dtype = dtype_specs in float_dtypes

	    def make_to_dtype_fn(dtype_spec):
	        def to_dtype(tensor):
	            if single_float_dtype and getattr(tensor, 'dtype', None) in float_dtypes:
	                # Convert all float tensors to the same dtype
	                return tensor.astype(dtype_specs)
	            if hasattr(dtype_spec, 'dtype') and hasattr(tensor, 'dtype'):
	                # Per-leaf spec: follow the dtype of the matching leaf.
	                return tensor.astype(dtype_spec.dtype)
	            return tensor
	        return to_dtype

	    def make_shard_fn(partition_spec, dtype_spec=None):
	        cast_and_shard = pjit(
	            make_to_dtype_fn(dtype_spec),
	            in_shardings=None,
	            out_shardings=partition_spec,
	        )

	        def shard_fn(tensor):
	            # Wait for the result so sharding has finished on return.
	            sharded = cast_and_shard(tensor)
	            return sharded.block_until_ready()
	        return shard_fn

	    def make_gather_fn(partition_spec, dtype_spec=None):
	        cast_and_gather = pjit(
	            make_to_dtype_fn(dtype_spec),
	            in_shardings=partition_spec,
	            out_shardings=None,
	        )

	        def gather_fn(tensor):
	            # Pull the gathered result back to host memory.
	            gathered = cast_and_gather(tensor)
	            return jax.device_get(gathered)
	        return gather_fn

	    # A per-leaf dtype tree is only mapped alongside the partition specs
	    # when dtype_specs is neither None nor a single float dtype.
	    if dtype_specs is None or single_float_dtype:
	        mapped_trees = (partition_specs,)
	    else:
	        mapped_trees = (partition_specs, dtype_specs)

	    shard_fns = jax.tree_util.tree_map(make_shard_fn, *mapped_trees)
	    gather_fns = jax.tree_util.tree_map(make_gather_fn, *mapped_trees)
	    return shard_fns, gather_fns


	def set_random_seed(seed):
	    """Seed Python's ``random``, NumPy's global RNG and the module JAX RNG."""
	    random.seed(seed)
	    np.random.seed(seed)
	    # Re-create the module-level JAX RNG from the same seed.
	    init_rng(seed)


	def get_jax_mesh(axis_dims, names):
	    """Build a ``Mesh`` from a textual axis specification.

	    Args:
	        axis_dims: either ``"d0,d1,..."`` (sizes matched positionally with
	            ``names``) or ``"name0:d0,name1:d1,..."`` (sizes given per axis
	            name, in any order). A leading ``!`` reshapes ``jax.devices()``
	            directly instead of using ``mesh_utils.create_device_mesh``.
	        names: the axis names the mesh must have.

	    Returns:
	        A ``Mesh`` over the devices with the requested axis names.
	    """
	    # A leading '!' allows splitting a physical mesh axis if needed.
	    mesh_axis_splitting = axis_dims.startswith('!')
	    if mesh_axis_splitting:
	        axis_dims = axis_dims[1:]

	    if ':' not in axis_dims:
	        # Positional form: sizes line up with `names` in order.
	        dims = [int(x) for x in axis_dims.split(',')]
	        dim_names = names
	    else:
	        # Named form: every entry is "name:size".
	        dims, dim_names = [], []
	        for axis in axis_dims.split(','):
	            name, dim = axis.split(':')
	            assert name in names
	            dims.append(int(dim))
	            dim_names.append(name)
	        assert(set(dim_names) == set(names))
	    assert len(dims) == len(names)

	    # Reshaping a dummy arange validates the sizes against the device
	    # count and yields the concrete shape (e.g. resolving a -1 entry).
	    mesh_shape = np.arange(jax.device_count()).reshape(dims).shape
	    if mesh_axis_splitting:
	        physical_mesh = np.array(jax.devices()).reshape(mesh_shape)
	    else:
	        physical_mesh = mesh_utils.create_device_mesh(mesh_shape)
	    return Mesh(physical_mesh, dim_names)


	def names_in_current_mesh(*names):
	    """Return True when every given name is an axis of the current mesh."""
	    current_mesh = pxla.thread_resources.env.physical_mesh
	    return set(names).issubset(set(current_mesh.axis_names))


	def get_names_from_parition_spec(partition_specs):
	    """Collect the distinct axis names used anywhere in ``partition_specs``.

	    A dict contributes its values. ``None`` entries are skipped, strings
	    are taken as axis names, and any other entry is searched recursively.
	    The returned list has no guaranteed order.
	    """
	    if isinstance(partition_specs, dict):
	        entries = partition_specs.values()
	    else:
	        entries = partition_specs

	    found = set()
	    for entry in entries:
	        if isinstance(entry, str):
	            found.add(entry)
	        elif entry is not None:
	            # Nested spec (tuple, list, dict, ...): descend into it.
	            found |= set(get_names_from_parition_spec(entry))
	    return list(found)


	def with_sharding_constraint(x, partition_specs):
	    """Apply a sharding constraint only if the current mesh supports it.

	    The constraint is applied when every axis name mentioned in
	    ``partition_specs`` is an axis of the current mesh; otherwise ``x`` is
	    returned unchanged.
	    """
	    required_axes = get_names_from_parition_spec(partition_specs)
	    if not names_in_current_mesh(*required_axes):
	        return x
	    return _with_sharding_constraint(x, partition_specs)


	def wrap_function_with_rng(rng):
	    """Decorator factory that bookkeeps an RNG key for the wrapped function.

	    Args:
	        rng: the initial JAX PRNG key.

	    Returns:
	        A decorator. The decorated function is called as
	        ``function(split_rng, *args, **kwargs)``: on every call the stored
	        key is split, one half is kept for the next call and the other half
	        is passed as the first positional argument.
	    """
	    def wrap_function(function):
	        # Accept arbitrary positional and keyword arguments so the wrapper
	        # is transparent for any signature, including zero-argument calls.
	        def wrapped(*args, **kwargs):
	            nonlocal rng
	            rng, split_rng = jax.random.split(rng)
	            return function(split_rng, *args, **kwargs)
	        return wrapped
	    return wrap_function


	def init_rng(seed):
	    """(Re)create the module-level ``jax_utils_rng`` from an integer seed."""
	    global jax_utils_rng
	    initial_key = jax.random.PRNGKey(seed)
	    jax_utils_rng = JaxRNG(initial_key)


	def next_rng(*args, **kwargs):
	    """Draw fresh key(s) from the module-level ``jax_utils_rng``.

	    All positional and keyword arguments are forwarded unchanged, so
	    ``next_rng()``, ``next_rng(3)`` and ``next_rng(keys=[...])`` all work.
	    The global ``jax_utils_rng`` must have been set beforehand (``init_rng``
	    does this).
	    """
	    global jax_utils_rng
	    return jax_utils_rng(*args, **kwargs)


	def get_metrics(metrics, unreplicate=False, stack=False):
	    """Fetch metrics to host memory and convert them to plain values.

	    Args:
	        metrics: with ``stack=False`` a mapping of name -> scalar; with
	            ``stack=True`` a sequence of pytrees that all share the same
	            structure (e.g. one metrics dict per step).
	        unreplicate: if True, first pass ``metrics`` through
	            ``flax.jax_utils.unreplicate``.
	        stack: if True, stack corresponding leaves of all trees with
	            ``np.stack`` and return a single pytree of arrays.

	    Returns:
	        With ``stack=False`` a dict of name -> Python float; with
	        ``stack=True`` a pytree of stacked NumPy arrays.
	    """
	    if unreplicate:
	        metrics = flax.jax_utils.unreplicate(metrics)
	    metrics = jax.device_get(metrics)
	    if stack:
	        # Unpack the sequence so tree_map receives one tree per entry and
	        # the mapped function sees all corresponding leaves at once.
	        return jax.tree_util.tree_map(
	            lambda *args: np.stack(args), *metrics
	        )
	    return {key: float(val) for key, val in metrics.items()}


	def mse_loss(val, target, valid=None):
	    """Masked mean squared error between ``val`` and ``target``.

	    Positions where ``valid`` is not greater than zero contribute 0 to the
	    sum; the mean is still taken over all elements. When ``valid`` is
	    omitted, a mask of ones with shape ``(*target.shape[:2], 1)`` is used.
	    """
	    mask = jnp.ones((*target.shape[:2], 1)) if valid is None else valid
	    mask = mask.astype(jnp.float32)
	    squared_error = jnp.square(val - target)
	    masked_error = jnp.where(mask > 0.0, squared_error, 0.0)
	    return jnp.mean(masked_error)


	def cross_entropy_loss_and_accuracy(logits, tokens, valid=None):
	    """Masked token-level cross entropy and accuracy.

	    For each row, the log-probabilities of the target ``tokens`` (and the
	    argmax hits) are summed over positions where ``valid > 0`` and divided
	    by the number of valid positions; the per-row results are then
	    averaged. When ``valid`` is omitted every position counts.

	    Returns:
	        ``(loss, accuracy)``.
	    """
	    if valid is None:
	        valid = jnp.ones(tokens.shape[:2])
	    valid = valid.astype(jnp.float32)
	    is_valid = valid > 0.0
	    # Lower bound keeps the division finite for rows with no valid token.
	    valid_text_length = jnp.maximum(jnp.sum(valid, axis=-1), 1e-10)

	    logits = logits.astype(jnp.float32)  # for numerical stability
	    log_probs = jax.nn.log_softmax(logits, axis=-1)
	    # Pick the log-probability assigned to each target token.
	    picked = jnp.take_along_axis(
	        log_probs, jnp.expand_dims(tokens, -1), axis=-1
	    )
	    token_log_prob = jnp.where(
	        is_valid, jnp.squeeze(picked, -1), jnp.array(0.0)
	    )
	    per_row_log_prob = jnp.sum(token_log_prob, axis=-1) / valid_text_length
	    loss = -jnp.mean(per_row_log_prob)

	    predictions = jnp.argmax(logits, axis=-1)
	    correct = jnp.where(is_valid, predictions == tokens, jnp.array(False))
	    per_row_accuracy = jnp.sum(correct, axis=-1) / valid_text_length
	    accuracy = jnp.mean(per_row_accuracy)
	    return loss, accuracy


	def global_norm(tree):
	    """Return the global L2 norm of a pytree.

	    This is the square root of the sum of squares of every element of
	    every leaf in ``tree``.
	    """
	    # `import jax` alone does not load the jax.flatten_util submodule, so
	    # import the helper explicitly instead of relying on some other module
	    # having imported it as a side effect.
	    from jax.flatten_util import ravel_pytree

	    squared = jax.tree_util.tree_map(lambda x: jnp.sum(jnp.square(x)), tree)
	    flattened, _ = ravel_pytree(squared)
	    return jnp.sqrt(jnp.sum(flattened))


	def average_metrics(metrics):
	    """Average a sequence of identically structured metric pytrees.

	    Corresponding leaves of all trees in ``metrics`` are stacked and
	    reduced with ``jnp.mean``, giving one pytree of scalar means.
	    """
	    with jax.spmd_mode("allow_all"):
	        # jax.tree_util.tree_map is the stable spelling used by the rest
	        # of this module; the top-level jax.tree_map alias is deprecated.
	        return jax.tree_util.tree_map(
	            lambda *args: jnp.mean(jnp.stack(args)),
	            *metrics
	        )


	def get_float_dtype_by_name(dtype):
	    """Map a dtype name (short or long spelling) to the ``jnp`` float dtype.

	    Raises:
	        KeyError: if ``dtype`` is not one of the known names.
	    """
	    known = (
	        ('bf16', 'bfloat16', jnp.bfloat16),
	        ('fp16', 'float16', jnp.float16),
	        ('fp32', 'float32', jnp.float32),
	        ('fp64', 'float64', jnp.float64),
	    )
	    by_name = {}
	    for short_name, long_name, jnp_dtype in known:
	        by_name[short_name] = jnp_dtype
	        by_name[long_name] = jnp_dtype
	    return by_name[dtype]


	def float_tensor_to_dtype(tensor, dtype):
	    """Cast ``tensor`` to ``dtype`` if, and only if, it is a float tensor.

	    ``dtype`` may be a dtype or a name understood by
	    ``get_float_dtype_by_name``. ``None`` or ``''`` disables the cast.
	    Objects without a float ``dtype`` attribute are returned unchanged.
	    """
	    if dtype is None or dtype == '':
	        return tensor
	    target = get_float_dtype_by_name(dtype) if isinstance(dtype, str) else dtype
	    float_dtypes = (jnp.bfloat16, jnp.float16, jnp.float32, jnp.float64)
	    is_float_tensor = getattr(tensor, 'dtype', None) in float_dtypes
	    return tensor.astype(target) if is_float_tensor else tensor


	def float_to_dtype(tree, dtype):
	    """Apply ``float_tensor_to_dtype`` with ``dtype`` to every leaf of ``tree``."""
	    def cast_leaf(leaf):
	        return float_tensor_to_dtype(leaf, dtype=dtype)

	    return jax.tree_util.tree_map(cast_leaf, tree)


	def get_gradient_checkpoint_policy(name):
	    """Look up a ``jax.checkpoint_policies`` policy by its name.

	    Raises:
	        KeyError: if ``name`` is not one of the supported policy names.
	    """
	    supported = (
	        'everything_saveable',
	        'nothing_saveable',
	        'checkpoint_dots',
	        'checkpoint_dots_with_no_batch_dims',
	    )
	    policies = {
	        policy_name: getattr(jax.checkpoint_policies, policy_name)
	        for policy_name in supported
	    }
	    return policies[name]


	def tree_path_to_string(path, sep=None):
	    """Render a JAX key path as strings.

	    Each path entry becomes the string form of its index, key or name
	    (entries of unknown type fall back to ``str(entry)``).

	    Returns:
	        A tuple of strings when ``sep`` is None, otherwise the strings
	        joined with ``sep``.
	    """
	    def render(entry):
	        if isinstance(entry, jax.tree_util.SequenceKey):
	            return str(entry.idx)
	        if isinstance(entry, jax.tree_util.DictKey):
	            return str(entry.key)
	        if isinstance(entry, jax.tree_util.GetAttrKey):
	            return str(entry.name)
	        if isinstance(entry, jax.tree_util.FlattenedIndexKey):
	            return str(entry.key)
	        return str(entry)

	    parts = [render(entry) for entry in path]
	    return tuple(parts) if sep is None else sep.join(parts)


	def flatten_tree(xs, is_leaf=None, sep=None):
	    """Flatten a pytree into a dict keyed by the rendered path of each leaf.

	    Keys are produced by ``tree_path_to_string`` (tuples of strings when
	    ``sep`` is None, joined strings otherwise).
	    """
	    path_leaf_pairs, _ = jax.tree_util.tree_flatten_with_path(
	        xs, is_leaf=is_leaf
	    )
	    return {
	        tree_path_to_string(path, sep=sep): leaf
	        for path, leaf in path_leaf_pairs
	    }


	def named_tree_map(f, tree, *rest, is_leaf=None, sep=None):
	    """An extended version of jax.tree_util.tree_map, where the mapped function
	    f takes both the name (path) and the tree leaf as input.

	    Args:
	        f: called as ``f(name, leaf, *rest_leaves)`` where ``name`` is the
	            leaf's path rendered by ``tree_path_to_string`` with ``sep``.
	        tree: the pytree to map over.
	        *rest: zero or more additional pytrees mapped in parallel.
	        is_leaf: forwarded to ``jax.tree_util.tree_map_with_path``.
	        sep: separator used when rendering the path.

	    Returns:
	        A pytree of the results of ``f``.
	    """
	    # The mapped function must be variadic in the extra leaves: with no
	    # `rest` trees it is called with only (path, leaf).
	    return jax.tree_util.tree_map_with_path(
	        lambda path, x, *r: f(tree_path_to_string(path, sep=sep), x, *r),
	        tree, *rest,
	        is_leaf=is_leaf
	    )


	def match_partition_rules(rules, params):
	    """Assign a PartitionSpec to every leaf of ``params`` using ``rules``.

	    ``rules`` is an iterable of ``(regex, partition_spec)`` pairs. Each
	    leaf's '/'-joined path is tested against the regexes in order and the
	    spec of the first match is used. Leaves that are scalars or hold a
	    single element always get an empty ``PS()``.

	    Raises:
	        ValueError: if no rule matches a non-scalar leaf.
	    """
	    def get_partition_spec(name, leaf):
	        # Don't partition scalar values.
	        holds_single_value = len(leaf.shape) == 0 or np.prod(leaf.shape) == 1
	        if holds_single_value:
	            return PS()
	        for pattern, spec in rules:
	            if re.search(pattern, name):
	                return spec
	        raise ValueError(f'Partition rule not found for param: {name}')

	    return named_tree_map(get_partition_spec, params, sep='/')


	def get_weight_decay_mask(exclusions):
	    """Build a function that computes a weight decay mask for a pytree.

	    The returned function maps each leaf to ``False`` when its '/'-joined
	    path matches any regex in ``exclusions`` and to ``True`` otherwise.
	    """
	    def decay(name, _):
	        is_excluded = any(
	            re.search(rule, name) is not None for rule in exclusions
	        )
	        return not is_excluded

	    def weight_decay_mask(params):
	        return named_tree_map(decay, params, sep='/')

	    return weight_decay_mask


	def tree_apply(fns, tree):
	    """Call each function in the pytree ``fns`` on the matching leaf of ``tree``."""
	    def call_on_leaf(fn, leaf):
	        return fn(leaf)

	    return jax.tree_util.tree_map(call_on_leaf, fns, tree)