aapot commited on
Commit
cd57f41
1 Parent(s): 0590843

Saving weights and logs of step 10000

Browse files
added_tokens.json ADDED
@@ -0,0 +1 @@
 
1
+ {"<|endoftext|>": 50257}
config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 1024,
16
+ "n_head": 16,
17
+ "n_inner": null,
18
+ "n_layer": 24,
19
+ "n_positions": 1024,
20
+ "n_special": 0,
21
+ "predict_special_tokens": true,
22
+ "reorder_and_upcast_attn": false,
23
+ "resid_pdrop": 0.1,
24
+ "scale_attn_by_inverse_layer_idx": false,
25
+ "scale_attn_weights": true,
26
+ "summary_activation": null,
27
+ "summary_first_dropout": 0.1,
28
+ "summary_proj_to_labels": true,
29
+ "summary_type": "cls_index",
30
+ "summary_use_proj": true,
31
+ "task_specific_params": {
32
+ "text-generation": {
33
+ "do_sample": true,
34
+ "max_length": 50
35
+ }
36
+ },
37
+ "transformers_version": "4.16.0.dev0",
38
+ "use_cache": true,
39
+ "vocab_size": 50257
40
+ }
distributed_shampoo.py ADDED
@@ -0,0 +1,1801 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #from https://github.com/google-research/google-research/blob/master/scalable_shampoo/optax/distributed_shampoo.py
2
+
3
+ # coding=utf-8
4
+ # Copyright 2022 The Google Research Authors.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ # An implementation of distributed Shampoo optimizer from:
19
+ #
20
+ # Scalable Second Order Optimization for Deep Learning
21
+ # Rohan Anil, Vineet Gupta, Tomer Koren, Kevin Regan, Yoram Singer
22
+ # Preprint Paper: https://arxiv.org/abs/2002.09018
23
+ #
24
+ # This implementation moves computation of inverse pth root back to the
25
+ # accelerator (if higher precision is available).
26
+ #
27
+ # Authors: Rohan Anil (rohananil at google dot com)
28
+ # & Vineet Gupta (vineet at google dot com)
29
+ #
30
+
31
+ """Distributed Shampoo Implementation."""
32
+
33
+ import enum
34
+ import functools
35
+ import itertools
36
+ from typing import Any, List, NamedTuple
37
+
38
+ import chex
39
+ from flax import struct
40
+ import jax
41
+ from jax import lax
42
+ import jax.experimental.pjit as pjit
43
+ import jax.numpy as jnp
44
+ import numpy as np
45
+ import optax
46
+
47
+ PartitionSpec = pjit.PartitionSpec
48
+
49
+
50
+ # pylint:disable=no-value-for-parameter
51
+ @struct.dataclass
52
+ class QuantizedValue:
53
+ """State associated with quantized value."""
54
+ quantized: chex.Array
55
+ diagonal: chex.Array # Diagonal (if extract_diagonal is set)
56
+ bucket_size: chex.Array
57
+ quantized_dtype: jnp.dtype = struct.field(
58
+ pytree_node=False) # Dtype for the quantized value.
59
+ extract_diagonal: bool = struct.field(
60
+ pytree_node=False) # In case its centered.
61
+ shape: Any = struct.field(pytree_node=False) # Shape of the tensor.
62
+
63
+ @classmethod
64
+ def from_float_value(cls, fvalue, quantized_dtype, extract_diagonal=False):
65
+ if isinstance(fvalue, list) and not fvalue:
66
+ return QuantizedValue([], [], [], quantized_dtype, extract_diagonal, [])
67
+ quantized, diagonal_fvalue, bucket_size = QuantizedValue.quantize(
68
+ fvalue, quantized_dtype, extract_diagonal)
69
+ return QuantizedValue(quantized, diagonal_fvalue, bucket_size,
70
+ quantized_dtype, extract_diagonal,
71
+ list(quantized.shape))
72
+
73
+ # Quantization is from Lingvo JAX optimizers.
74
+ # We extend it for int16 quantization of PSD matrices.
75
+ @classmethod
76
+ def quantize(cls, fvalue, quantized_dtype, extract_diagonal=False):
77
+ """Returns quantized value and the bucket."""
78
+ if quantized_dtype == jnp.float32:
79
+ return fvalue, [], []
80
+ elif quantized_dtype == jnp.bfloat16:
81
+ return fvalue.astype(jnp.bfloat16), [], []
82
+
83
+ float_dtype = fvalue.dtype
84
+ if quantized_dtype == jnp.int8:
85
+ # value -128 is not used.
86
+ num_buckets = jnp.array(127.0, dtype=float_dtype)
87
+ elif quantized_dtype == jnp.int16:
88
+ # value -32768 is not used.
89
+ num_buckets = jnp.array(32767.0, dtype=float_dtype)
90
+ else:
91
+ raise ValueError(f'Quantized dtype {quantized_dtype} not supported.')
92
+ # max value is mapped to num_buckets
93
+
94
+ if extract_diagonal and fvalue.ndim != 2:
95
+ raise ValueError(
96
+ f'Input array {fvalue} must be 2D to work with extract_diagonal.')
97
+
98
+ diagonal_fvalue = []
99
+ if extract_diagonal:
100
+ diagonal_fvalue = jnp.diag(fvalue)
101
+ # Remove the diagonal entries.
102
+ fvalue = fvalue - jnp.diag(diagonal_fvalue)
103
+
104
+ # TODO(rohananil): Extend this by making use of information about the blocks
105
+ # SM3 style which will be useful for diagonal statistics
106
+ # We first decide the scale.
107
+ if fvalue.ndim < 1:
108
+ raise ValueError(
109
+ f'Input array {fvalue} must have a strictly positive number of '
110
+ 'dimensions.')
111
+
112
+ max_abs = jnp.max(jnp.abs(fvalue), axis=0)
113
+ bucket_size = max_abs / num_buckets
114
+ bs_expanded = bucket_size[jnp.newaxis, Ellipsis]
115
+ # To avoid divide by 0.0
116
+ bs_nonzero = jnp.where(bs_expanded > 0.0, bs_expanded,
117
+ jnp.ones_like(bs_expanded))
118
+ ratio = fvalue / bs_nonzero
119
+ # We use rounding to remove bias.
120
+ quantized = jnp.round(ratio)
121
+ return quantized.astype(quantized_dtype), diagonal_fvalue, bucket_size
122
+
123
+ def to_float(self):
124
+ """Returns the float value."""
125
+ if isinstance(self.quantized, list) and not self.quantized:
126
+ return self.quantized
127
+
128
+ if self.quantized_dtype == jnp.float32:
129
+ return self.quantized
130
+
131
+ if self.quantized_dtype == jnp.bfloat16:
132
+ return self.quantized.astype(jnp.float32)
133
+
134
+ float_dtype = self.bucket_size.dtype
135
+ bucket_size = self.bucket_size[jnp.newaxis, Ellipsis]
136
+ val = self.quantized.astype(float_dtype) * bucket_size
137
+ if self.extract_diagonal:
138
+ val += jnp.diag(self.diagonal)
139
+ return val
140
+
141
+
142
+ # Per parameter optimizer state used in data-parallel training.
143
+ class ParameterStats(NamedTuple):
144
+ """State associated to each parameter of the model being trained."""
145
+ diagonal_statistics: QuantizedValue # Accumulator for diagonal preconditioner
146
+ statistics: List[Any] # Statistics (QuantizedValue, chex.Array)
147
+ preconditioners: List[Any] # Preconditioners (QuantizedValue, chex.Array)
148
+ diagonal_momentum: QuantizedValue # Momentum for the diagonal preconditioner
149
+ momentum: QuantizedValue # Momentum for the shampoo preconditioner
150
+
151
+
152
+ # For training extremely large model; We keep a global state with a concatenated
153
+ # statistics and preconditioner states for all vars. This is so that we can
154
+ # annotate the leading axis to be sharded to save memory at the cost of
155
+ # communication.
156
+ @struct.dataclass
157
+ class GlobalShardedParameterStats:
158
+ statistics: chex.Array # Statistics
159
+ preconditioners: chex.Array # Preconditioners
160
+ exponents: chex.Array # exponents
161
+
162
+
163
+ # These are per-parameter local states; All statistics here mirror the parameter
164
+ # Thus the sharding is copied over from the param specification.
165
+ @struct.dataclass
166
+ class LocalShardedParameterStats:
167
+ """State associated to each parameter of the model being trained."""
168
+ diagonal_statistics: QuantizedValue # Accumulator for diagonal preconditioner
169
+ diagonal_momentum: QuantizedValue # Momentum for the diagonal preconditioner
170
+ momentum: QuantizedValue # Momentum for the shampoo preconditioner
171
+ index_start: np.int32 = struct.field(
172
+ pytree_node=False) # Index into global statistics array
173
+ sizes: Any = struct.field(pytree_node=False) # Sizes of the statistics.
174
+
175
+
176
+ class ShardedShampooStats(NamedTuple):
177
+ """Shampoo state in sharded mode."""
178
+ global_stats: Any
179
+ local_stats: Any
180
+
181
+
182
+ class ShampooState(NamedTuple):
183
+ count: chex.Array
184
+ stats: Any
185
+
186
+
187
+ class InitFnState(NamedTuple):
188
+ init_fn: Any
189
+ pspec_fn: Any
190
+ shape_and_dtype_fn: Any
191
+
192
+
193
+ class GraftingType(enum.IntEnum):
194
+ SGD = 1
195
+ ADAGRAD = 2
196
+ RMSPROP = 3
197
+ RMSPROP_NORMALIZED = 4
198
+
199
+
200
+ def power_iteration(
201
+ matrix,
202
+ num_iters=100,
203
+ error_tolerance=1e-6,
204
+ precision=lax.Precision.HIGHEST):
205
+ r"""Power iteration algorithm.
206
+
207
+ The power iteration algorithm takes a symmetric PSD matrix `A`, and produces
208
+ a scalar `\lambda` , which is the greatest (in absolute value) eigenvalue
209
+ of `A`, and a vector v, which is the corresponding eigenvector of `A`.
210
+
211
+ References:
212
+ [Wikipedia, 2021](https://en.wikipedia.org/wiki/Power_iteration)
213
+
214
+ Args:
215
+ matrix: the symmetric PSD matrix.
216
+ num_iters: Number of iterations.
217
+ error_tolerance: Iterative exit condition.
218
+ precision: precision XLA related flag, the available options are:
219
+ a) lax.Precision.DEFAULT (better step time, but not precise)
220
+ b) lax.Precision.HIGH (increased precision, slower)
221
+ c) lax.Precision.HIGHEST (best possible precision, slowest)
222
+
223
+ Returns:
224
+ eigen vector, eigen value
225
+ """
226
+ matrix_size = matrix.shape[-1]
227
+ def _iter_condition(state):
228
+ i, unused_v, unused_s, unused_s_v, run_step = state
229
+ return jnp.logical_and(i < num_iters, run_step)
230
+
231
+ def _iter_body(state):
232
+ """One step of power iteration."""
233
+ i, new_v, s, s_v, unused_run_step = state
234
+ new_v = new_v / jnp.linalg.norm(new_v)
235
+
236
+ s_v = jnp.einsum('ij,j->i', matrix, new_v, precision=precision)
237
+ s_new = jnp.einsum('i,i->', new_v, s_v, precision=precision)
238
+ return (i + 1, s_v, s_new, s_v,
239
+ jnp.greater(jnp.abs(s_new - s), error_tolerance))
240
+
241
+ # Figure out how to use step as seed for random.
242
+ v_0 = np.random.RandomState(1729).uniform(-1.0, 1.0,
243
+ matrix_size).astype(matrix.dtype)
244
+
245
+ init_state = tuple([0, v_0, jnp.zeros([], dtype=matrix.dtype), v_0, True])
246
+ _, v_out, s_out, _, _ = lax.while_loop(
247
+ _iter_condition, _iter_body, init_state)
248
+ v_out = v_out / jnp.linalg.norm(v_out)
249
+ return v_out, s_out
250
+
251
+
252
+ def matrix_inverse_pth_root(
253
+ matrix,
254
+ p,
255
+ num_iters=100,
256
+ ridge_epsilon=1e-6,
257
+ error_tolerance=1e-6,
258
+ precision=lax.Precision.HIGHEST):
259
+ """Computes `matrix^(-1/p)`, where `p` is a positive integer.
260
+
261
+ This function uses the Coupled newton iterations algorithm for
262
+ the computation of a matrix's inverse pth root.
263
+
264
+
265
+ References:
266
+ [Functions of Matrices, Theory and Computation,
267
+ Nicholas J Higham, Pg 184, Eq 7.18](
268
+ https://epubs.siam.org/doi/book/10.1137/1.9780898717778)
269
+
270
+ Args:
271
+ matrix: the symmetric PSD matrix whose power it to be computed
272
+ p: exponent, for p a positive integer.
273
+ num_iters: Maximum number of iterations.
274
+ ridge_epsilon: Ridge epsilon added to make the matrix positive definite.
275
+ error_tolerance: Error indicator, useful for early termination.
276
+ precision: precision XLA related flag, the available options are:
277
+ a) lax.Precision.DEFAULT (better step time, but not precise)
278
+ b) lax.Precision.HIGH (increased precision, slower)
279
+ c) lax.Precision.HIGHEST (best possible precision, slowest)
280
+
281
+ Returns:
282
+ matrix^(-1/p)
283
+ """
284
+
285
+ assert matrix.shape[0] == matrix.shape[1]
286
+
287
+ # We use float32 for the matrix inverse pth root.
288
+ # Switch to f64 if you have hardware that supports it.
289
+ matrix_size = matrix.shape[0]
290
+ alpha = jnp.asarray(-1.0 / p, jnp.float32)
291
+ identity = jnp.eye(matrix_size, dtype=jnp.float32)
292
+ _, max_ev = power_iteration(
293
+ matrix=matrix, num_iters=100,
294
+ error_tolerance=1e-6, precision=precision)
295
+ ridge_epsilon = ridge_epsilon * jnp.maximum(max_ev, 1e-16)
296
+
297
+ def _unrolled_mat_pow_1(mat_m):
298
+ """Computes mat_m^1."""
299
+ return mat_m
300
+
301
+ def _unrolled_mat_pow_2(mat_m):
302
+ """Computes mat_m^2."""
303
+ return jnp.matmul(mat_m, mat_m, precision=precision)
304
+
305
+ def _unrolled_mat_pow_4(mat_m):
306
+ """Computes mat_m^4."""
307
+ mat_pow_2 = _unrolled_mat_pow_2(mat_m)
308
+ return jnp.matmul(
309
+ mat_pow_2, mat_pow_2, precision=precision)
310
+
311
+ def _unrolled_mat_pow_8(mat_m):
312
+ """Computes mat_m^4."""
313
+ mat_pow_4 = _unrolled_mat_pow_4(mat_m)
314
+ return jnp.matmul(
315
+ mat_pow_4, mat_pow_4, precision=precision)
316
+
317
+ def mat_power(mat_m, p):
318
+ """Computes mat_m^p, for p == 1, 2, 4 or 8.
319
+
320
+ Args:
321
+ mat_m: a square matrix
322
+ p: a positive integer
323
+
324
+ Returns:
325
+ mat_m^p
326
+ """
327
+ # We unrolled the loop for performance reasons.
328
+ exponent = jnp.round(jnp.log2(p))
329
+ return lax.switch(
330
+ jnp.asarray(exponent, jnp.int32), [
331
+ _unrolled_mat_pow_1,
332
+ _unrolled_mat_pow_2,
333
+ _unrolled_mat_pow_4,
334
+ _unrolled_mat_pow_8,
335
+ ], (mat_m))
336
+
337
+ def _iter_condition(state):
338
+ (i, unused_mat_m, unused_mat_h, unused_old_mat_h, error,
339
+ run_step) = state
340
+ error_above_threshold = jnp.logical_and(
341
+ error > error_tolerance, run_step)
342
+ return jnp.logical_and(i < num_iters, error_above_threshold)
343
+
344
+ def _iter_body(state):
345
+ (i, mat_m, mat_h, unused_old_mat_h, error, unused_run_step) = state
346
+ mat_m_i = (1 - alpha) * identity + alpha * mat_m
347
+ new_mat_m = jnp.matmul(mat_power(mat_m_i, p), mat_m, precision=precision)
348
+ new_mat_h = jnp.matmul(mat_h, mat_m_i, precision=precision)
349
+ new_error = jnp.max(jnp.abs(new_mat_m - identity))
350
+ # sometimes error increases after an iteration before decreasing and
351
+ # converging. 1.2 factor is used to bound the maximal allowed increase.
352
+ return (i + 1, new_mat_m, new_mat_h, mat_h, new_error,
353
+ new_error < error * 1.2)
354
+
355
+ if matrix_size == 1:
356
+ resultant_mat_h = (matrix + ridge_epsilon)**alpha
357
+ error = 0
358
+ else:
359
+ damped_matrix = matrix + ridge_epsilon * identity
360
+
361
+ z = (1 + p) / (2 * jnp.linalg.norm(damped_matrix))
362
+ new_mat_m_0 = damped_matrix * z
363
+ new_error = jnp.max(jnp.abs(new_mat_m_0 - identity))
364
+ new_mat_h_0 = identity * jnp.power(z, 1.0 / p)
365
+ init_state = tuple(
366
+ [0, new_mat_m_0, new_mat_h_0, new_mat_h_0, new_error, True])
367
+ _, mat_m, mat_h, old_mat_h, error, convergence = lax.while_loop(
368
+ _iter_condition, _iter_body, init_state)
369
+ error = jnp.max(jnp.abs(mat_m - identity))
370
+ is_converged = jnp.asarray(convergence, old_mat_h.dtype)
371
+ resultant_mat_h = is_converged * mat_h + (1 - is_converged) * old_mat_h
372
+ resultant_mat_h = jnp.asarray(resultant_mat_h, matrix.dtype)
373
+ return resultant_mat_h, error
374
+
375
+
376
+ def merge_small_dims(shape_to_merge, max_dim):
377
+ """Merge small dimensions.
378
+
379
+ If there are some small dimensions, we collapse them:
380
+ e.g. [1, 2, 512, 1, 2048, 1, 3, 4] --> [1024, 2048, 12] if max_dim = 1024
381
+ [1, 2, 768, 1, 2048] --> [2, 768, 2048]
382
+
383
+ Args:
384
+ shape_to_merge: Shape to merge small dimensions.
385
+ max_dim: Maximal dimension of output shape used in merging.
386
+
387
+ Returns:
388
+ Merged shape.
389
+ """
390
+ resulting_shape = []
391
+ product = 1
392
+ for d in shape_to_merge:
393
+ if product * d <= max_dim:
394
+ product *= d
395
+ else:
396
+ if product > 1:
397
+ resulting_shape.append(product)
398
+ product = d
399
+ if product > 1:
400
+ resulting_shape.append(product)
401
+ return resulting_shape
402
+
403
+
404
+ def pad_matrix(mat, max_size):
405
+ """Pad a matrix to a max_size.
406
+
407
+ Args:
408
+ mat: a matrix to pad.
409
+ max_size: matrix size requested.
410
+
411
+ Returns:
412
+ Given M returns [[M, 0], [0, I]]
413
+ """
414
+ size = mat.shape[0]
415
+ assert size <= max_size
416
+ if size == max_size:
417
+ return mat
418
+ pad_size = max_size - size
419
+ zs1 = jnp.zeros([size, pad_size], dtype=mat.dtype)
420
+ zs2 = jnp.zeros([pad_size, size], dtype=mat.dtype)
421
+ eye = jnp.eye(pad_size, dtype=mat.dtype)
422
+ mat = jnp.concatenate([mat, zs1], 1)
423
+ mat = jnp.concatenate([mat, jnp.concatenate([zs2, eye], 1)], 0)
424
+ return mat
425
+
426
+
427
+ def pad_vector(vec, max_size):
428
+ """Pad a vector to a max_size.
429
+
430
+ Args:
431
+ vec: a vector to pad.
432
+ max_size: matrix size requested.
433
+
434
+ Returns:
435
+ Given V returns [V, 0]
436
+ """
437
+ size = vec.shape[0]
438
+ assert size <= max_size
439
+ if size == max_size:
440
+ return vec
441
+ pad_size = max_size - size
442
+ zs1 = jnp.zeros([pad_size], dtype=vec.dtype)
443
+ return jnp.concatenate([vec, zs1], 0)
444
+
445
+
446
+ def efficient_cond(predicate, compute_fn, init_state, *args, **kwargs):
447
+ """Avoids wasteful buffer allocation with XLA."""
448
+
449
+ def _iter_body(unused_state):
450
+ results = compute_fn(*args, **kwargs)
451
+ return tuple([False] + list(results))
452
+
453
+ def _iter_condition(state):
454
+ return state[0]
455
+
456
+ results = jax.lax.while_loop(_iter_condition, _iter_body,
457
+ tuple([predicate] + init_state))
458
+ return tuple(results[1:])
459
+
460
+
461
+ class BlockPartitioner:
462
+ """Partitions a tensor into smaller tensors."""
463
+
464
+ def __init__(self, param, block_size):
465
+ self._shape = param.shape
466
+ self._splits = []
467
+ split_sizes = []
468
+ # We split params into smaller blocks. Here we store the metadata to make
469
+ # that split.
470
+ for i, d in enumerate(param.shape):
471
+ if 0 < block_size < d:
472
+ # d-1, otherwise split appends a 0-size array.
473
+ nsplit = (d - 1) // block_size
474
+ indices = (np.arange(nsplit, dtype=np.int32) + 1) * block_size
475
+ sizes = np.ones(nsplit + 1, dtype=np.int32) * block_size
476
+ sizes[-1] = d - indices[-1]
477
+ self._splits.append((i, indices))
478
+ split_sizes.append(sizes)
479
+ else:
480
+ split_sizes.append(np.array([d], dtype=np.int32))
481
+ self._num_splits = len(split_sizes)
482
+ self._preconditioner_shapes = []
483
+ for t in itertools.product(*split_sizes):
484
+ self._preconditioner_shapes.extend([[d, d] for d in t])
485
+
486
+ def shapes_for_preconditioners(self):
487
+ return self._preconditioner_shapes
488
+
489
+ def num_splits(self):
490
+ return self._num_splits
491
+
492
+ def partition(self, tensor):
493
+ """Partition tensor into blocks."""
494
+
495
+ assert tensor.shape == self._shape
496
+ tensors = [tensor]
497
+ for (i, indices) in self._splits:
498
+ tensors_local = []
499
+ for t in tensors:
500
+ tensors_local.extend(jnp.split(t, indices_or_sections=indices, axis=i))
501
+ tensors = tensors_local
502
+ return tensors
503
+
504
+ def merge_partitions(self, partitions):
505
+ """Merge partitions back to original shape."""
506
+
507
+ for (i, indices) in reversed(self._splits):
508
+ n = len(indices) + 1
509
+ partial_merged_tensors = []
510
+ ind = 0
511
+ while ind < len(partitions):
512
+ partial_merged_tensors.append(
513
+ jnp.concatenate(partitions[ind:ind + n], axis=i))
514
+ ind += n
515
+ partitions = partial_merged_tensors
516
+ assert len(partitions) == 1
517
+ return partitions[0]
518
+
519
+
520
+ class Preconditioner:
521
+ """Compute statistics/shape from gradients for preconditioning."""
522
+
523
+ def __init__(self, param, block_size, best_effort_shape_interpretation):
524
+ self._original_shape = param.shape
525
+ self._transformed_shape = param.shape
526
+ if best_effort_shape_interpretation:
527
+ self._transformed_shape = merge_small_dims(self._original_shape,
528
+ block_size)
529
+ reshaped_param = jnp.reshape(param, self._transformed_shape)
530
+ self._partitioner = BlockPartitioner(reshaped_param, block_size)
531
+
532
+ def statistics_from_grad(self, grad):
533
+ """Compute statistics from gradients.
534
+
535
+ Args:
536
+ grad: Gradient to compute statistics from.
537
+
538
+ Returns:
539
+ A list of gradient statistics for each partition.
540
+ """
541
+ reshaped_grad = jnp.reshape(grad, self._transformed_shape)
542
+ partitioned_grads = self._partitioner.partition(reshaped_grad)
543
+ stats = []
544
+ for g in partitioned_grads:
545
+ g_stats = []
546
+ rank = len(g.shape)
547
+ for i in range(rank):
548
+ axes = list(range(i)) + list(range(i + 1, rank))
549
+ stat = jnp.tensordot(g, g, axes=(axes, axes))
550
+ g_stats.append(stat)
551
+ stats.extend(g_stats)
552
+ return stats
553
+
554
+ def shapes_for_preconditioners(self):
555
+ """Returns shape from statistics."""
556
+ return self._partitioner.shapes_for_preconditioners()
557
+
558
+ def exponent_for_preconditioner(self):
559
+ """Returns exponent to use for inverse-pth root M^{-1/p}."""
560
+ return 2 * len(self._transformed_shape)
561
+
562
+ def preconditioned_grad(self, grad, preconditioners):
563
+ """Precondition the gradient.
564
+
565
+ Args:
566
+ grad: A gradient tensor to precondition.
567
+ preconditioners: A list of preconditioners to apply.
568
+
569
+ Returns:
570
+ A preconditioned gradient.
571
+ """
572
+
573
+ reshaped_grad = jnp.reshape(grad, self._transformed_shape)
574
+ partitioned_grads = self._partitioner.partition(reshaped_grad)
575
+ preconditioned_partitioned_grads = []
576
+ num_splits = self._partitioner.num_splits()
577
+ for i, g in enumerate(partitioned_grads):
578
+ preconditioners_for_grad = preconditioners[i * num_splits:(i + 1) *
579
+ num_splits]
580
+ rank = len(g.shape)
581
+ precond_g = g
582
+ for j in range(rank):
583
+ precond_g = jnp.tensordot(
584
+ precond_g, preconditioners_for_grad[j], axes=[[0], [0]])
585
+ preconditioned_partitioned_grads.append(precond_g)
586
+ merged_grad = self._partitioner.merge_partitions(
587
+ preconditioned_partitioned_grads)
588
+ return jnp.reshape(merged_grad, self._original_shape)
589
+
590
+
591
+ def _convert_to_parameter_stats(global_stats, local_stat):
592
+ """Creates parameter stats from sharded stats."""
593
+ index_start = int(local_stat.index_start)
594
+ index_end = int(len(local_stat.sizes)) + index_start
595
+ statistics = global_stats.statistics[index_start:index_end, :, :]
596
+ preconditioners = global_stats.preconditioners[index_start:index_end, :, :]
597
+ new_statistics = []
598
+ new_preconditioners = []
599
+ for i, size in enumerate(local_stat.sizes):
600
+ new_statistics.append(statistics[i][:size, :size])
601
+ new_preconditioners.append(preconditioners[i][:size, :size])
602
+ return ParameterStats(local_stat.diagonal_statistics, new_statistics,
603
+ new_preconditioners, local_stat.diagonal_momentum,
604
+ local_stat.momentum)
605
+
606
+
607
+ def _convert_from_parameter_stats(parameter_stats, local_stats):
608
+ """Creates sharded stats from paramter stats."""
609
+ return LocalShardedParameterStats(parameter_stats.diagonal_statistics,
610
+ parameter_stats.diagonal_momentum,
611
+ parameter_stats.momentum,
612
+ local_stats.index_start, local_stats.sizes)
613
+
614
+
615
+ def batch(x, num_devices):
616
+ """Batch `x` so that so that leading axis is num_devices."""
617
+ n = len(x)
618
+ b = int(n / num_devices)
619
+ return jnp.stack([jnp.stack(x[idx:idx + b]) for idx in range(0, n, b)])
620
+
621
+
622
+ def unbatch(batched_values):
623
+ """Unbatch values across leading axis and return a list of elements."""
624
+ b1, b2 = batched_values.shape[0], batched_values.shape[1]
625
+ results = []
626
+ for v_array in jnp.split(batched_values, indices_or_sections=b1, axis=0):
627
+ v_array = jnp.squeeze(v_array)
628
+ # b2 = batches (number of preconditioner computation) per core.
629
+ if b2 > 1:
630
+ for v in jnp.split(v_array, indices_or_sections=b2, axis=0):
631
+ results.append(jnp.squeeze(v))
632
+ else:
633
+ results.append(v_array)
634
+ return results
635
+
636
+
637
+ def distributed_shampoo(
638
+ learning_rate,
639
+ block_size,
640
+ beta1=0.9,
641
+ beta2=0.999,
642
+ diagonal_epsilon=1e-10,
643
+ matrix_epsilon=1e-6,
644
+ weight_decay=0.0,
645
+ start_preconditioning_step=5,
646
+ preconditioning_compute_steps=1,
647
+ statistics_compute_steps=1,
648
+ best_effort_shape_interpretation=True,
649
+ graft_type=GraftingType.SGD,
650
+ nesterov=True,
651
+ exponent_override=0,
652
+ # Pass pmap 'batch axis name' in pmap mode.
653
+ batch_axis_name=None,
654
+ ### Only set following 3 params in pjit/spmd mode.
655
+ ### WARNING: Experimental
656
+ statistics_partition_spec=None,
657
+ preconditioner_partition_spec=None,
658
+ num_devices_for_pjit=None,
659
+ shard_optimizer_states=False,
660
+ ###
661
+ ### Experimental memory reduction mode
662
+ best_effort_memory_usage_reduction=False,
663
+ ###
664
+ inverse_failure_threshold=0.1,
665
+ moving_average_for_momentum=False,
666
+ skip_preconditioning_dim_size_gt=4096,
667
+ clip_by_scaled_gradient_norm=None,
668
+ precision=lax.Precision.HIGHEST):
669
+ """Distributed Shampoo optimizer.
670
+
671
+ Distributed Shampoo is a second-order preconditioned method (concretely, a
672
+ variant of full-matrix Adagrad), that provides significant convergence and
673
+ wall-clock time improvements compared to conventional first-order methods,
674
+ and that has been shown to scale to large state-of-the-art deep learning
675
+ models.
676
+
677
+ References:
678
+ Scalable Second Order Optimization for Deep Learning,
679
+ Rohan Anil, Vineet Gupta, Tomer Koren, Kevin Regan, Yoram Singer
680
+
681
+ Preprint: https://arxiv.org/abs/2002.09018
682
+
683
+ Args:
684
+ learning_rate: the step size used to update the parameters.
685
+ block_size: Block size for large layers (if > 0). Preconditioning compute
686
+ operation is cubic in the dimension of the tensor. Block size allows us to
687
+ chunk the layers into sub-layers of maximal dimension dictated by this
688
+ value. Use 128 as default (increase if you have compute budget).
689
+ beta1: momentum parameter.
690
+ beta2: second moment averaging parameter.
691
+ diagonal_epsilon: epsilon for diagonal adagrad (only if layerwise grafting
692
+ to AdaGrad is enabled).
693
+ matrix_epsilon: epsilon to add to statistics before computing inverse pth
694
+ root. If you are running in f32 precision for inverse pth root
695
+ (recommended today) this can go upto 1e-6. If you have latest hardware
696
+ with native f64 precision, set this upto 1e-12.
697
+ weight_decay: Weight decay for regularization.
698
+ start_preconditioning_step: When to start Shampoo update before which
699
+ diagonal update is used. This is because we dont have enough information
700
+ to do stable inverse.
701
+ preconditioning_compute_steps: How often to compute preconditioner.
702
+ Performance tuning params for controlling memory and compute requirements.
703
+ Ideally set this and statistics_compute_steps params to 1.
704
+ statistics_compute_steps: How often to compute statistics.
705
+ best_effort_shape_interpretation: If there are some small dimensions,
706
+ collapse them e.g. [1, 2, 512, 1, 2048, 1, 3, 4] --> [1024, 2048, 12] if
707
+ block = 1024, [1, 2, 768, 1, 2048] --> [2, 768, 2048]
708
+ graft_type: Grafting is a technique to fix the layerwise scale of Shampoo
709
+ optimizer. This allows us to plugin the Shampoo optimizer into settings
710
+ where SGD/AdaGrad is already well tuned. Available options are:
711
+ GraftingType.SGD and GraftingType.ADAGRAD.
712
+ nesterov: Nesterov momentum.
713
+ exponent_override: Override the exponent used in matrix inverse.
714
+ batch_axis_name: labeled axis over pmap for data-parallel training the
715
+ optimizer used for.
716
+ statistics_partition_spec: PartitionSpec to be used in sharded mode.
717
+ preconditioner_partition_spec: PartitionSpec to be used in sharded mode.
718
+ num_devices_for_pjit: Number of devices to parallelize over when using pjit.
719
+ shard_optimizer_states: Shard optimizer states to save memory in model
720
+ parallel training.
721
+ best_effort_memory_usage_reduction: Best effort memory usage reduction.
722
+ diagonal_statistics -> jnp.bfloat16
723
+ momentum buffers (2x) -> jnp.int8
724
+ statistics, preconditioners -> jnp.int16 + diagonals
725
+ inverse_failure_threshold: numerics are hard and inverses fail sometimes; we
726
+ determine that using this threshold.
727
+ moving_average_for_momentum: Whether to use moving average for momentum
728
+ instead of exponential moving average.
729
+ skip_preconditioning_dim_size_gt: Skip if preconditioning dim size is
730
+ greater than this value.
731
+ clip_by_scaled_gradient_norm: Clip by scaled gradient norm (only useful
732
+ when using RMSProp Grafting).
733
+ precision: precision XLA related flag, the available options are: a)
734
+ lax.Precision.DEFAULT (better step time, but not precise) b)
735
+ lax.Precision.HIGH (increased precision, slower) c) lax.Precision.HIGHEST
736
+ (best possible precision, slowest)
737
+
738
+ Returns:
739
+ a GradientTransformation.
740
+ """
741
+
742
+ def quantized_dtype_for_momentum_buffers():
743
+ return jnp.int8 if best_effort_memory_usage_reduction else jnp.float32
744
+
745
+ # TODO(rohananil): Explore int8-16 quantization with non-linear bucket sizes.
746
+ def quantized_dtype_for_diagonal_statistics_buffers():
747
+ return jnp.bfloat16 if best_effort_memory_usage_reduction else jnp.float32
748
+
749
+ # Preconditioner and statistics are both stores as int16 in this mode.
750
+ # We take out the diagonal to make quantization easier.
751
+ def quantized_dtype_for_second_moment_statistics_buffers():
752
+ return jnp.int16 if best_effort_memory_usage_reduction and batch_axis_name else jnp.float32
753
+
754
+ # Preconditioner and statistics are both stores as int16 in this mode.
755
+ # We take out the diagonal to make quantization easier.
756
+ def quantized_dtype_for_second_moment_preconditioner_buffers():
757
+ return jnp.int16 if best_effort_memory_usage_reduction and batch_axis_name else jnp.float32
758
+
759
+ def _to_float(maybe_quantized):
760
+ if isinstance(maybe_quantized, QuantizedValue):
761
+ return maybe_quantized.to_float()
762
+ else:
763
+ return maybe_quantized
764
+
765
+ def _maybe_quantize_statistics(statistics_list):
766
+ return _maybe_quantize_matrices_with_dtype(
767
+ statistics_list, quantized_dtype_for_second_moment_statistics_buffers())
768
+
769
+ def _maybe_quantize_preconditioners(statistics_list):
770
+ return _maybe_quantize_matrices_with_dtype(
771
+ statistics_list,
772
+ quantized_dtype_for_second_moment_preconditioner_buffers())
773
+
774
+ def _maybe_quantize_matrices_with_dtype(statistics_list, quantized_dtype):
775
+ if quantized_dtype != jnp.float32:
776
+ return ([
777
+ QuantizedValue.from_float_value(
778
+ s, quantized_dtype, extract_diagonal=True)
779
+ for s in statistics_list
780
+ ])
781
+ else:
782
+ return statistics_list
783
+
784
+ def _maybe_dequantize_preconditioners(preconditioner_list):
785
+ return _maybe_dequantize_matrices_with_dtype(
786
+ preconditioner_list,
787
+ quantized_dtype_for_second_moment_preconditioner_buffers())
788
+
789
+ def _maybe_dequantize_matrices_with_dtype(statistics_list, quantized_dtype):
790
+ if quantized_dtype != jnp.float32:
791
+ return [s.to_float() for s in statistics_list]
792
+ else:
793
+ return statistics_list
794
+
795
+ def _quantize_diagonal_statistics(diagonal_statistics):
796
+ return QuantizedValue.from_float_value(
797
+ diagonal_statistics, quantized_dtype_for_diagonal_statistics_buffers())
798
+
799
+ def _quantize_momentum(momentum_statistics):
800
+ return QuantizedValue.from_float_value(
801
+ momentum_statistics, quantized_dtype_for_momentum_buffers())
802
+
803
+ def sharded_init_fn(params):
804
+ """Returns optimizer state (for PJIT mode).
805
+
806
+ Args:
807
+ params: the parameters that should be updated.
808
+ """
809
+ params_flat, treedef = jax.tree_flatten(params)
810
+ # Find max size to pad to.
811
+ max_size = 0
812
+ for param in params_flat:
813
+ preconditioner = Preconditioner(param, block_size,
814
+ best_effort_shape_interpretation)
815
+ if not _skip_preconditioning(param):
816
+ shapes = preconditioner.shapes_for_preconditioners()
817
+ sizes = [s[0] for s in shapes]
818
+ max_size = max(max(sizes), max_size)
819
+
820
+ padded_statistics = []
821
+ padded_preconditioners = []
822
+ local_stats_flat = []
823
+ exponents = []
824
+ for param in params_flat:
825
+ preconditioner = Preconditioner(param, block_size,
826
+ best_effort_shape_interpretation)
827
+ shapes = preconditioner.shapes_for_preconditioners()
828
+ sizes = []
829
+
830
+ statistics = []
831
+ preconditioners = []
832
+ index_start = len(padded_statistics)
833
+ if not _skip_preconditioning(param):
834
+ sizes = [s[0] for s in shapes]
835
+ shapes = preconditioner.shapes_for_preconditioners()
836
+ statistics = [matrix_epsilon * jnp.eye(max_size) for s in shapes]
837
+ preconditioners = [jnp.eye(max_size) for s in shapes]
838
+ padded_statistics.extend(statistics)
839
+ padded_preconditioners.extend(preconditioners)
840
+ exponent = (
841
+ preconditioner.exponent_for_preconditioner()
842
+ if exponent_override == 0 else exponent_override)
843
+ exponents.extend([exponent] * len(shapes))
844
+
845
+ diagonal_statistics = []
846
+ if graft_type != GraftingType.SGD:
847
+ diagonal_statistics = jnp.zeros_like(param)
848
+ local_stats_flat.append(
849
+ LocalShardedParameterStats(
850
+ _quantize_diagonal_statistics(diagonal_statistics),
851
+ _quantize_momentum(jnp.zeros_like(param)),
852
+ _quantize_momentum(jnp.zeros_like(param)), index_start, sizes))
853
+
854
+ local_stats = jax.tree_unflatten(treedef, local_stats_flat)
855
+ # Pad the statistics and preconditioner matrices to be a multiple of
856
+ # num devices.
857
+ # TODO(rohananil): Relax to only the size of the mesh axis where the dim
858
+ # is split on.
859
+ to_pad = -len(padded_statistics) % num_devices_for_pjit
860
+ padded_statistics.extend([
861
+ jnp.eye(max_size, dtype=padded_statistics[0].dtype)
862
+ for _ in range(to_pad)
863
+ ])
864
+ padded_preconditioners.extend([
865
+ jnp.eye(max_size, dtype=padded_statistics[0].dtype)
866
+ for _ in range(to_pad)
867
+ ])
868
+ exponents.extend([1 for _ in range(to_pad)])
869
+ global_stats = GlobalShardedParameterStats(
870
+ jnp.stack(padded_statistics), jnp.stack(padded_preconditioners),
871
+ jnp.stack(exponents))
872
+ return ShampooState(
873
+ count=jnp.zeros([], jnp.int32),
874
+ stats=ShardedShampooStats(global_stats, local_stats))
875
+
876
+ def _max_statistics_size_from_params(params):
877
+ max_size = 0
878
+ for param in params:
879
+ param_clone = jnp.zeros(param.shape, dtype=param.dtype)
880
+ preconditioner = Preconditioner(param_clone, block_size,
881
+ best_effort_shape_interpretation)
882
+ if not _skip_preconditioning(param):
883
+ shapes = preconditioner.shapes_for_preconditioners()
884
+ sizes = [s[0] for s in shapes]
885
+ max_size = max(max(sizes), max_size)
886
+ return max_size
887
+
888
+ def _remove_leading_sharding_annotation(pspec):
889
+ """Mapping from N-d to (N-1)-d, used for quantization, factoring etc."""
890
+ # None and PSpec(None) are valid PSpecs.
891
+ if pspec and len(pspec) > 1:
892
+ return PartitionSpec(*pspec[1:])
893
+ else:
894
+ return pspec
895
+
896
+ def sharded_init_partition_spec_fn(params, params_partition_spec,
897
+ partition_spec_for_statistics):
898
+ """Returns a parallel state tree with PartitionSpec associated with state.
899
+
900
+
901
+ Args:
902
+ params: A pytree with params.
903
+ params_partition_spec: A pytree with PartitionSpec for params.
904
+ partition_spec_for_statistics: PartitionSpec for the statistics.
905
+ """
906
+ # Parallel lists of spec, and params.
907
+ param_pspec_flat, _ = jax.tree_flatten(params_partition_spec)
908
+ params_flat, treedef = jax.tree_flatten(params)
909
+ assert param_pspec_flat
910
+ assert params_flat
911
+ # Step is replicated across cores.
912
+ # None means cores.
913
+ local_stats_flat = []
914
+ num_statistics = 0
915
+ for param, param_pspec in zip(params_flat, param_pspec_flat):
916
+ param_clone = jnp.zeros(param.shape, dtype=param.dtype)
917
+ preconditioner = Preconditioner(param_clone, block_size,
918
+ best_effort_shape_interpretation)
919
+ shapes = preconditioner.shapes_for_preconditioners()
920
+ sizes = []
921
+
922
+ index_start = num_statistics
923
+ if not _skip_preconditioning(param):
924
+ sizes = [s[0] for s in shapes]
925
+ shapes = preconditioner.shapes_for_preconditioners()
926
+ num_statistics += len(shapes)
927
+
928
+ diagonal_statistics_pspec = []
929
+ diagonal_statistics_scale_pspec = []
930
+ if graft_type != GraftingType.SGD:
931
+ # Identically shaped param.
932
+ diagonal_statistics_pspec = param_pspec
933
+ if quantized_dtype_for_diagonal_statistics_buffers() != jnp.float32:
934
+ diagonal_statistics_scale_pspec = _remove_leading_sharding_annotation(
935
+ param_pspec)
936
+
937
+ m1_pspec = param_pspec
938
+ m2_pspec = param_pspec
939
+
940
+ m1_scale_pspec = []
941
+ m2_scale_pspec = []
942
+
943
+ if quantized_dtype_for_momentum_buffers() != jnp.float32:
944
+ m1_scale_pspec = _remove_leading_sharding_annotation(m1_pspec)
945
+ m2_scale_pspec = _remove_leading_sharding_annotation(m2_pspec)
946
+
947
+ local_stats_flat.append(
948
+ LocalShardedParameterStats(
949
+ QuantizedValue(diagonal_statistics_pspec, [],
950
+ diagonal_statistics_scale_pspec,
951
+ quantized_dtype_for_diagonal_statistics_buffers(),
952
+ False, list(param.shape)),
953
+ QuantizedValue(m1_pspec, [], m1_scale_pspec,
954
+ quantized_dtype_for_momentum_buffers(), False,
955
+ list(param.shape)),
956
+ QuantizedValue(m2_pspec, [], m2_scale_pspec,
957
+ quantized_dtype_for_momentum_buffers(), False,
958
+ list(param.shape)), index_start, sizes))
959
+
960
+ local_stats = jax.tree_unflatten(treedef, local_stats_flat)
961
+ global_stats = GlobalShardedParameterStats(partition_spec_for_statistics,
962
+ partition_spec_for_statistics,
963
+ PartitionSpec())
964
+ count_pspec = PartitionSpec()
965
+ return ShampooState(
966
+ count=count_pspec, stats=ShardedShampooStats(global_stats, local_stats))
967
+
968
+ def sharded_init_shape_and_dtype_fn(params):
969
+ """Returns a parallel state tree with shape, dtype associated with state.
970
+
971
+
972
+ Args:
973
+ params: A pytree with params.
974
+ """
975
+ # Parallel lists of spec, and params.
976
+ params_flat, treedef = jax.tree_flatten(params)
977
+ assert params_flat
978
+ # Step is replicated across cores.
979
+ # None means cores.
980
+ local_stats_flat = []
981
+ num_statistics = 0
982
+ for param in params_flat:
983
+ param_clone = jnp.zeros(param.shape, dtype=param.dtype)
984
+ preconditioner = Preconditioner(param_clone, block_size,
985
+ best_effort_shape_interpretation)
986
+ shapes = preconditioner.shapes_for_preconditioners()
987
+ sizes = []
988
+
989
+ index_start = num_statistics
990
+ if not _skip_preconditioning(param):
991
+ sizes = [s[0] for s in shapes]
992
+ shapes = preconditioner.shapes_for_preconditioners()
993
+ num_statistics += len(shapes)
994
+
995
+ diagonal_statistics_shape_and_dtype = []
996
+ diagonal_statistics_scale_shape_and_dtype = []
997
+ if graft_type != GraftingType.SGD:
998
+ diagonal_statistics_shape_and_dtype = [list(param.shape), param.dtype]
999
+ qdtype = quantized_dtype_for_diagonal_statistics_buffers()
1000
+ if qdtype != jnp.float32:
1001
+ diagonal_statistics_shape_and_dtype = [list(param.shape), qdtype]
1002
+ diagonal_statistics_scale_shape_and_dtype = [
1003
+ list(param.shape)[1:], param.dtype
1004
+ ]
1005
+
1006
+ m1_shape_and_dtype = [list(param.shape), param.dtype]
1007
+ m2_shape_and_dtype = [list(param.shape), param.dtype]
1008
+
1009
+ m1_scale_shape_and_dtype = []
1010
+ m2_scale_shape_and_dtype = []
1011
+
1012
+ qdtype = quantized_dtype_for_momentum_buffers()
1013
+ if qdtype != jnp.float32:
1014
+ m1_shape_and_dtype = [list(param.shape), qdtype]
1015
+ m2_shape_and_dtype = [list(param.shape), qdtype]
1016
+
1017
+ m1_scale_shape_and_dtype = [list(param.shape)[1:], qdtype]
1018
+ m2_scale_shape_and_dtype = [list(param.shape)[1:], qdtype]
1019
+
1020
+ local_stats_flat.append(
1021
+ LocalShardedParameterStats(
1022
+ QuantizedValue(diagonal_statistics_shape_and_dtype, [],
1023
+ diagonal_statistics_scale_shape_and_dtype,
1024
+ quantized_dtype_for_diagonal_statistics_buffers(),
1025
+ False, list(param.shape)),
1026
+ QuantizedValue(m1_shape_and_dtype, [], m1_scale_shape_and_dtype,
1027
+ quantized_dtype_for_momentum_buffers(), False,
1028
+ list(param.shape)),
1029
+ QuantizedValue(m2_shape_and_dtype, [], m2_scale_shape_and_dtype,
1030
+ quantized_dtype_for_momentum_buffers(), False,
1031
+ list(param.shape)), index_start, sizes))
1032
+
1033
+ local_stats = jax.tree_unflatten(treedef, local_stats_flat)
1034
+ max_statistics_size = _max_statistics_size_from_params(params_flat)
1035
+ to_pad = -num_statistics % num_devices_for_pjit
1036
+ num_statistics += to_pad
1037
+ statistics_shape = [
1038
+ num_statistics, max_statistics_size, max_statistics_size
1039
+ ]
1040
+ global_stats = GlobalShardedParameterStats([statistics_shape, jnp.float32],
1041
+ [statistics_shape, jnp.float32],
1042
+ [[num_statistics], jnp.int32])
1043
+ return ShampooState(
1044
+ count=[[], jnp.float32],
1045
+ stats=ShardedShampooStats(global_stats, local_stats))
1046
+
1047
+ def sharded_update_fn(grads, state, params):
1048
+ """Transform the input gradient and update all statistics in sharded mode.
1049
+
1050
+ Args:
1051
+ grads: the gradient tensors for the parameters.
1052
+ state: a named tuple containing the state of the optimizer
1053
+ params: the parameters that should be updated.
1054
+
1055
+ Returns:
1056
+ A tuple containing the new parameters and the new optimizer state.
1057
+ """
1058
+ params_flat, treedef = jax.tree_flatten(params)
1059
+ grads_flat = treedef.flatten_up_to(grads)
1060
+
1061
+ global_stats = state.stats.global_stats
1062
+ local_stats_flat = treedef.flatten_up_to(state.stats.local_stats)
1063
+ stats_flat = [
1064
+ _convert_to_parameter_stats(global_stats, local_stat)
1065
+ for local_stat in local_stats_flat
1066
+ ]
1067
+ new_stats_flat = jax.tree_multimap(
1068
+ lambda g, s, p: _compute_stats(g, s, p, state.count), grads_flat,
1069
+ stats_flat, params_flat)
1070
+
1071
+ outputs = jax.tree_multimap(
1072
+ lambda g, s, p: _transform_grad(g, s, p, state.count), grads_flat,
1073
+ new_stats_flat, params_flat)
1074
+ updates_flat, new_stats_flat = list(zip(*outputs)) if outputs else ((), ())
1075
+
1076
+ updates = jax.tree_unflatten(treedef, updates_flat)
1077
+ # Create new local_stats
1078
+ new_local_stats_flat = [
1079
+ _convert_from_parameter_stats(new_stat, local_stat)
1080
+ for new_stat, local_stat in zip(new_stats_flat, local_stats_flat)
1081
+ ]
1082
+ new_local_stats = jax.tree_unflatten(treedef, new_local_stats_flat)
1083
+
1084
+ max_size = global_stats.statistics.shape[1]
1085
+ new_padded_statistics = []
1086
+ for stat in new_stats_flat:
1087
+ new_padded_statistics.extend(
1088
+ [pad_matrix(stat, max_size) for stat in stat.statistics])
1089
+
1090
+ # Create global stats
1091
+ # TODO(rohananil): Preconditioner is not updated every step, so cost of
1092
+ # stack/pad can be obviated away.
1093
+ # Pad the statistics and preconditioner matrices to be a multiple of
1094
+ # num devices.
1095
+ # TODO(rohananil): Relax to only the size of the mesh axis where the dim
1096
+ # is split on.
1097
+ to_pad = -len(new_padded_statistics) % num_devices_for_pjit
1098
+ new_padded_statistics.extend([
1099
+ jnp.eye(max_size, dtype=new_padded_statistics[0].dtype)
1100
+ for _ in range(to_pad)
1101
+ ])
1102
+ new_stacked_padded_statistics = jnp.stack(new_padded_statistics)
1103
+ new_stacked_padded_statistics = pjit.with_sharding_constraint(
1104
+ new_stacked_padded_statistics, statistics_partition_spec)
1105
+ def _internal_inverse_pth_root_all():
1106
+ preconditioners, errors = _matrix_inverse_pth_root_pjit(
1107
+ new_stacked_padded_statistics, global_stats.exponents,
1108
+ statistics_partition_spec)
1109
+ return preconditioners, errors
1110
+
1111
+ if preconditioning_compute_steps == 1:
1112
+ new_preconditioners, errors = _internal_inverse_pth_root_all()
1113
+ else:
1114
+ # Passing statistics instead of preconditioners as they are similarly
1115
+ # shaped tensors. Note statistics will be ignored as we are passing in
1116
+ # a large init value for error.
1117
+ preconditioners_init = new_stacked_padded_statistics
1118
+ n = new_stacked_padded_statistics.shape[0]
1119
+ errors_init = jnp.ones([n], jnp.float32) * inverse_failure_threshold
1120
+ init_state = [preconditioners_init, errors_init]
1121
+ perform_step = state.count % preconditioning_compute_steps == 0
1122
+ new_preconditioners, errors = efficient_cond(
1123
+ perform_step, _internal_inverse_pth_root_all, init_state)
1124
+
1125
+ errors = errors.reshape((-1, 1, 1))
1126
+ predicate = jnp.logical_or(
1127
+ jnp.isnan(errors),
1128
+ errors >= inverse_failure_threshold).astype(new_preconditioners.dtype)
1129
+ # TODO(rohananil): Check for numerical instabilities.
1130
+ new_conditional_preconditioners = (
1131
+ predicate * global_stats.preconditioners +
1132
+ (1.0 - predicate) * new_preconditioners)
1133
+ new_global_stats = GlobalShardedParameterStats(
1134
+ new_stacked_padded_statistics, new_conditional_preconditioners,
1135
+ global_stats.exponents)
1136
+ new_shampoo_state = ShampooState(
1137
+ count=state.count + 1,
1138
+ stats=ShardedShampooStats(new_global_stats, new_local_stats))
1139
+ return updates, new_shampoo_state
1140
+
1141
+ def init_fn(params):
1142
+ """Initialise the optimiser's state."""
1143
+
1144
+ def _init(param):
1145
+ preconditioner = Preconditioner(param, block_size,
1146
+ best_effort_shape_interpretation)
1147
+ statistics = []
1148
+ preconditioners = []
1149
+ if not _skip_preconditioning(param):
1150
+ shapes = preconditioner.shapes_for_preconditioners()
1151
+ statistics = [matrix_epsilon * jnp.eye(s[0]) for s in shapes]
1152
+ preconditioners = [jnp.eye(s[0]) for s in shapes]
1153
+
1154
+ diagonal_statistics = []
1155
+ if graft_type != GraftingType.SGD:
1156
+ diagonal_statistics = jnp.zeros_like(param)
1157
+ return ParameterStats(
1158
+ _quantize_diagonal_statistics(diagonal_statistics),
1159
+ _maybe_quantize_statistics(statistics),
1160
+ _maybe_quantize_preconditioners(preconditioners),
1161
+ _quantize_momentum(jnp.zeros_like(param)),
1162
+ _quantize_momentum(jnp.zeros_like(param)))
1163
+ return ShampooState(
1164
+ count=jnp.zeros([], jnp.int32), stats=jax.tree_map(_init, params))
1165
+
1166
+ def _skip_preconditioning(param):
1167
+ return len(param.shape) < 1 or any(
1168
+ [s > skip_preconditioning_dim_size_gt for s in param.shape])
1169
+
1170
+ def _compute_stats(grad, state, param, step):
1171
+ """Compute per-parameter statistics."""
1172
+ preconditioner = Preconditioner(param, block_size,
1173
+ best_effort_shape_interpretation)
1174
+ new_statistics = [[]] * len(state.statistics)
1175
+ w1 = beta2
1176
+ w2 = beta2 if beta2 == 1.0 else (1.0 - beta2)
1177
+ if not _skip_preconditioning(param):
1178
+
1179
+ def compute_updated_statistics():
1180
+ new_stats = preconditioner.statistics_from_grad(grad)
1181
+ new_stats_accumulators = []
1182
+ for stat, stat_accumulator in zip(new_stats, state.statistics):
1183
+ new_stats_accumulators.append(w1 * _to_float(stat_accumulator) +
1184
+ w2 * stat)
1185
+ return _maybe_quantize_statistics(new_stats_accumulators)
1186
+
1187
+ if statistics_compute_steps > 1:
1188
+ perform_step = step % statistics_compute_steps == 0
1189
+ init_state = state.statistics
1190
+ new_statistics = list(
1191
+ efficient_cond(perform_step, compute_updated_statistics,
1192
+ init_state))
1193
+ else:
1194
+ new_statistics = compute_updated_statistics()
1195
+ return ParameterStats(state.diagonal_statistics, new_statistics,
1196
+ state.preconditioners, state.diagonal_momentum,
1197
+ state.momentum)
1198
+
1199
+ def _matrix_inverse_pth_root_vmap(xs, ps):
1200
+ mi_pth_root = functools.partial(
1201
+ matrix_inverse_pth_root,
1202
+ ridge_epsilon=matrix_epsilon,
1203
+ precision=precision)
1204
+ return jax.vmap(mi_pth_root)(xs, ps)
1205
+
1206
+ def _quantized_matrix_inverse_pth_root_vmap(qxs, qds, qbs, ps):
1207
+
1208
+ def _quantized_to_float(qx, qd, qb):
1209
+ qv = QuantizedValue(qx, qd, qb, qx.dtype, True, list(qx.shape))
1210
+ return qv.to_float()
1211
+
1212
+ def matrix_inverse_pth_root_wrapper(qx, qd, qb, p):
1213
+ v = _quantized_to_float(qx, qd, qb)
1214
+ preconditioner, error = matrix_inverse_pth_root(
1215
+ v, p, ridge_epsilon=matrix_epsilon, precision=precision)
1216
+ qp = QuantizedValue.from_float_value(preconditioner, qx.dtype, True)
1217
+ return qp.quantized, qp.diagonal, qp.bucket_size, error
1218
+
1219
+ return jax.vmap(matrix_inverse_pth_root_wrapper)(qxs, qds, qbs, ps)
1220
+
1221
+ def _matrix_inverse_pth_root_pjit(xs, ps, statistics_partition_spec=None):
1222
+ # Partition the concatenated statistics matrix across all cores.
1223
+ pspec_for_partition = preconditioner_partition_spec
1224
+ partitioned_xs = pjit.with_sharding_constraint(xs, pspec_for_partition)
1225
+ partitioned_ps = pjit.with_sharding_constraint(
1226
+ ps, pjit.PartitionSpec(preconditioner_partition_spec[0]))
1227
+ # Run matrix inverse pth root on each shard.
1228
+ partitioned_preconditioners, partitioned_errors = (
1229
+ _matrix_inverse_pth_root_vmap(partitioned_xs, partitioned_ps))
1230
+ # Reshard output to have the same PSpec as input. This is required to avoid
1231
+ # vmap seeing the full set of statistics.
1232
+ partitioned_preconditioners = pjit.with_sharding_constraint(
1233
+ partitioned_preconditioners, pspec_for_partition)
1234
+ # Recombine the outputs at each core.
1235
+ preconditioners = pjit.with_sharding_constraint(partitioned_preconditioners,
1236
+ statistics_partition_spec)
1237
+ errors = pjit.with_sharding_constraint(partitioned_errors,
1238
+ pjit.PartitionSpec())
1239
+ return preconditioners, errors
1240
+
1241
+ def _pmap_compute_preconditioners(states, step, statistics,
1242
+ num_statistics_per_state, original_shapes,
1243
+ exponents, max_size, prev_preconditioners):
1244
+ """Computes preconditioners for given statistics in states in PMAP mode.
1245
+
1246
+ Args:
1247
+ states: A list of optimizer states.
1248
+ step: Current step number
1249
+ statistics: A list of statistics for all variables (for every dim)
1250
+ num_statistics_per_state: Number of statistis per state to reconstruct
1251
+ output states.
1252
+ original_shapes: A list of shapes of the statistics.
1253
+ exponents: Exponent power to use for inverse-pth roots.
1254
+ max_size: Maximum dim of the statistics to pad.
1255
+ prev_preconditioners: Previously available preconditioner.
1256
+
1257
+ Returns:
1258
+ New optimizer states after computing the preconditioner.
1259
+ """
1260
+ num_devices = lax.psum(1, batch_axis_name)
1261
+ num_statistics = len(statistics)
1262
+ # Pad statistics and exponents to next multiple of num_devices.
1263
+ packed_statistics = [pad_matrix(stat, max_size) for stat in statistics]
1264
+ to_pad = -num_statistics % num_devices
1265
+ packed_statistics.extend([
1266
+ jnp.eye(max_size, dtype=packed_statistics[0].dtype)
1267
+ for _ in range(to_pad)
1268
+ ])
1269
+ exponents.extend([1 for _ in range(to_pad)])
1270
+
1271
+ if not packed_statistics:
1272
+ return states
1273
+
1274
+ all_statistics = batch(packed_statistics, num_devices)
1275
+ all_exponents = batch(exponents, num_devices)
1276
+
1277
+ def _internal_inverse_pth_root_all():
1278
+ current_replica = lax.axis_index(batch_axis_name)
1279
+ preconditioners, errors = _matrix_inverse_pth_root_vmap(
1280
+ all_statistics[current_replica], all_exponents[current_replica])
1281
+ preconditioners = jax.lax.all_gather(preconditioners, batch_axis_name)
1282
+ errors = jax.lax.all_gather(errors, batch_axis_name)
1283
+ preconditioners_flat = unbatch(preconditioners)
1284
+ errors_flat = unbatch(errors)
1285
+ return preconditioners_flat, errors_flat
1286
+
1287
+ if preconditioning_compute_steps == 1:
1288
+ preconditioners_flat, errors_flat = _internal_inverse_pth_root_all()
1289
+ else:
1290
+ # Passing statistics instead of preconditioners as they are similarly
1291
+ # shaped tensors. Note statistics will be ignored as we are passing in
1292
+ # a large init value for error.
1293
+ preconditioners_init = packed_statistics
1294
+ errors_init = ([inverse_failure_threshold] * len(packed_statistics))
1295
+ init_state = [preconditioners_init, errors_init]
1296
+ perform_step = step % preconditioning_compute_steps == 0
1297
+ preconditioners_flat, errors_flat = efficient_cond(
1298
+ perform_step, _internal_inverse_pth_root_all, init_state)
1299
+
1300
+ def _skip(error):
1301
+ condition = jnp.logical_or(
1302
+ jnp.isnan(error), error >= inverse_failure_threshold)
1303
+ return condition.astype(error.dtype)
1304
+
1305
+ def _select_preconditioner(error, new_p, old_p):
1306
+ return lax.cond(
1307
+ _skip(error), lambda _: old_p, lambda _: new_p, operand=None)
1308
+
1309
+ new_preconditioners_flat = []
1310
+ for p, shape, prev_p, error in zip(preconditioners_flat, original_shapes,
1311
+ prev_preconditioners, errors_flat):
1312
+ new_preconditioners_flat.append(
1313
+ _select_preconditioner(error, p[:shape[0], :shape[1]], prev_p))
1314
+
1315
+ assert len(states) == len(num_statistics_per_state)
1316
+ assert len(new_preconditioners_flat) == num_statistics
1317
+
1318
+ # Add back empty preconditioners so that we can set the optimizer state.
1319
+ preconditioners_for_states = []
1320
+ idx = 0
1321
+ for num_statistics, state in zip(num_statistics_per_state, states):
1322
+ if num_statistics == 0:
1323
+ preconditioners_for_states.append([])
1324
+ else:
1325
+ preconditioners_for_state = new_preconditioners_flat[idx:idx +
1326
+ num_statistics]
1327
+ assert len(state.statistics) == len(preconditioners_for_state)
1328
+ preconditioners_for_states.append(preconditioners_for_state)
1329
+ idx += num_statistics
1330
+ new_states = []
1331
+ for state, new_preconditioners in zip(states, preconditioners_for_states):
1332
+ new_states.append(
1333
+ ParameterStats(state.diagonal_statistics, state.statistics,
1334
+ new_preconditioners, state.diagonal_momentum,
1335
+ state.momentum))
1336
+
1337
+ return new_states
1338
+
1339
+ def _pmap_quantized_compute_preconditioners(states, step, statistics,
1340
+ num_statistics_per_state,
1341
+ original_shapes, exponents,
1342
+ max_size, prev_preconditioners):
1343
+ """Computes preconditioners for given statistics in states in PMAP mode.
1344
+
1345
+ For quantization, each statistic is represented by three values:
1346
+ quantized matrix, diagonal, and bucket sizes; we run inverse pth-roots
1347
+ without ever recreating the original matrix in f32.
1348
+
1349
+ Args:
1350
+ states: A list of optimizer states.
1351
+ step: Current step number
1352
+ statistics: A list of statistics for all variables (for every dim)
1353
+ num_statistics_per_state: Number of statistics per state to reconstruct
1354
+ output states.
1355
+ original_shapes: A list of shapes of the statistics.
1356
+ exponents: Exponent power to use for inverse-pth roots.
1357
+ max_size: Maximum dim of the statistics to pad.
1358
+ prev_preconditioners: Previously available preconditioner.
1359
+
1360
+ Returns:
1361
+ New optimizer states after computing the preconditioner.
1362
+ """
1363
+ num_devices = lax.psum(1, batch_axis_name)
1364
+ num_statistics = len(statistics)
1365
+ quantized_dtype = quantized_dtype_for_second_moment_statistics_buffers()
1366
+ # Complexity here comes from shapes needing to be statically shaped, and from
1367
+ # our custom quantization type requiring a different type of packing.
1368
+
1369
+ # Parallel tensors:
1370
+ # quantized [dxd]
1371
+ # diagonals [d] f32
1372
+ # bucket_sizes [d] f32
1373
+ packed_quantized_statistics = [
1374
+ pad_matrix(stat.quantized, max_size) for stat in statistics
1375
+ ]
1376
+ packed_quantized_diagonals = [
1377
+ pad_vector(stat.diagonal, max_size) for stat in statistics
1378
+ ]
1379
+ packed_quantized_bucket_sizes = [
1380
+ pad_vector(stat.bucket_size, max_size) for stat in statistics
1381
+ ]
1382
+
1383
+ to_pad = -num_statistics % num_devices
1384
+ padded_eye = jnp.eye(max_size, dtype=jnp.float32)
1385
+ quantized_eye = QuantizedValue.from_float_value(padded_eye, quantized_dtype,
1386
+ True)
1387
+ packed_quantized_statistics.extend(
1388
+ [quantized_eye.quantized for _ in range(to_pad)])
1389
+ packed_quantized_diagonals.extend(
1390
+ [quantized_eye.diagonal for _ in range(to_pad)])
1391
+ packed_quantized_bucket_sizes.extend(
1392
+ [quantized_eye.bucket_size for _ in range(to_pad)])
1393
+ exponents.extend([1 for _ in range(to_pad)])
1394
+
1395
+ if not packed_quantized_statistics:
1396
+ return states
1397
+
1398
+ all_quantized_statistics = batch(packed_quantized_statistics, num_devices)
1399
+ all_quantized_diagonals = batch(packed_quantized_diagonals, num_devices)
1400
+ all_quantized_bucket_sizes = batch(packed_quantized_bucket_sizes,
1401
+ num_devices)
1402
+ all_exponents = batch(exponents, num_devices)
1403
+
1404
+ def _internal_inverse_pth_root_all():
1405
+ current_replica = lax.axis_index(batch_axis_name)
1406
+ (quantized_preconditioners, quantized_diagonals, quantized_bucket_sizes,
1407
+ errors) = (
1408
+ _quantized_matrix_inverse_pth_root_vmap(
1409
+ all_quantized_statistics[current_replica],
1410
+ all_quantized_diagonals[current_replica],
1411
+ all_quantized_bucket_sizes[current_replica],
1412
+ all_exponents[current_replica]))
1413
+ quantized_preconditioners = jax.lax.all_gather(quantized_preconditioners,
1414
+ batch_axis_name)
1415
+ quantized_diagonals = jax.lax.all_gather(quantized_diagonals,
1416
+ batch_axis_name)
1417
+ quantized_bucket_sizes = jax.lax.all_gather(quantized_bucket_sizes,
1418
+ batch_axis_name)
1419
+ errors = jax.lax.all_gather(errors, batch_axis_name)
1420
+ quantized_preconditioners_flat = unbatch(quantized_preconditioners)
1421
+ quantized_diagonals_flat = unbatch(quantized_diagonals)
1422
+ quantized_bucket_sizes_flat = unbatch(quantized_bucket_sizes)
1423
+ errors_flat = unbatch(errors)
1424
+ return (quantized_preconditioners_flat, quantized_diagonals_flat,
1425
+ quantized_bucket_sizes_flat, errors_flat)
1426
+
1427
+ if preconditioning_compute_steps == 1:
1428
+ (quantized_preconditioners_flat, quantized_diagonals_flat,
1429
+ quantized_bucket_sizes_flat, errors_flat) = (
1430
+ _internal_inverse_pth_root_all())
1431
+ else:
1432
+ # Passing statistics instead of preconditioners as they are similarly
1433
+ # shaped tensors. Note statistics will be ignored as we are passing in
1434
+ # a large init value for error.
1435
+ quantized_preconditioners_init = packed_quantized_statistics
1436
+ quantized_diagonals_init = packed_quantized_diagonals
1437
+ quantized_bucket_sizes_init = packed_quantized_bucket_sizes
1438
+ errors_init = ([inverse_failure_threshold] *
1439
+ len(quantized_preconditioners_init))
1440
+ init_state = [
1441
+ quantized_preconditioners_init, quantized_diagonals_init,
1442
+ quantized_bucket_sizes_init, errors_init
1443
+ ]
1444
+ perform_step = step % preconditioning_compute_steps == 0
1445
+ (quantized_preconditioners_flat, quantized_diagonals_flat,
1446
+ quantized_bucket_sizes_flat, errors_flat) = (
1447
+ efficient_cond(perform_step, _internal_inverse_pth_root_all,
1448
+ init_state))
1449
+
1450
+ def _skip(error):
1451
+ condition = jnp.logical_or(
1452
+ jnp.isnan(error), error >= inverse_failure_threshold)
1453
+ return condition.astype(error.dtype)
1454
+
1455
+ def _select_preconditioner(error, new_p, old_p):
1456
+ return lax.cond(
1457
+ _skip(error), lambda _: old_p, lambda _: new_p, operand=None)
1458
+
1459
+ new_quantized_preconditioners_flat = []
1460
+ new_quantized_diagonals_flat = []
1461
+ new_quantized_bucket_sizes_flat = []
1462
+ for p, d, b, shape, prev_p, error in zip(quantized_preconditioners_flat,
1463
+ quantized_diagonals_flat,
1464
+ quantized_bucket_sizes_flat,
1465
+ original_shapes,
1466
+ prev_preconditioners, errors_flat):
1467
+ new_quantized_preconditioners_flat.append(
1468
+ _select_preconditioner(error, p[:shape[0], :shape[1]],
1469
+ prev_p.quantized))
1470
+ new_quantized_diagonals_flat.append(
1471
+ _select_preconditioner(error, d[:shape[0]], prev_p.diagonal))
1472
+ new_quantized_bucket_sizes_flat.append(
1473
+ _select_preconditioner(error, b[:shape[0]], prev_p.bucket_size))
1474
+
1475
+ assert len(states) == len(num_statistics_per_state)
1476
+ assert len(new_quantized_preconditioners_flat) == num_statistics
1477
+ assert len(new_quantized_diagonals_flat) == num_statistics
1478
+ assert len(new_quantized_bucket_sizes_flat) == num_statistics
1479
+
1480
+ # Add back empty preconditioners so that we can set the optimizer state.
1481
+ preconditioners_for_states = []
1482
+ idx = 0
1483
+ for num_statistics, state in zip(num_statistics_per_state, states):
1484
+ if num_statistics == 0:
1485
+ preconditioners_for_states.append([])
1486
+ else:
1487
+ quantized_preconditioners_for_state = new_quantized_preconditioners_flat[
1488
+ idx:idx + num_statistics]
1489
+ quantized_diagonals_for_state = new_quantized_diagonals_flat[
1490
+ idx:idx + num_statistics]
1491
+ quantized_bucket_sizes_for_state = new_quantized_bucket_sizes_flat[
1492
+ idx:idx + num_statistics]
1493
+
1494
+ assert len(state.statistics) == len(quantized_preconditioners_for_state)
1495
+ assert len(state.statistics) == len(quantized_diagonals_for_state)
1496
+ assert len(state.statistics) == len(quantized_bucket_sizes_for_state)
1497
+
1498
+ quantized_preconditioners = []
1499
+ for qv, qd, qb in zip(quantized_preconditioners_for_state,
1500
+ quantized_diagonals_for_state,
1501
+ quantized_bucket_sizes_for_state):
1502
+ quantized_preconditioners.append(
1503
+ QuantizedValue(qv, qd, qb, qv.dtype, True, list(qv.shape)))
1504
+ preconditioners_for_states.append(quantized_preconditioners)
1505
+ idx += num_statistics
1506
+ new_states = []
1507
+ for state, new_preconditioners in zip(states, preconditioners_for_states):
1508
+ new_states.append(
1509
+ ParameterStats(state.diagonal_statistics, state.statistics,
1510
+ new_preconditioners, state.diagonal_momentum,
1511
+ state.momentum))
1512
+
1513
+ return new_states
1514
+
1515
+ def _pjit_compute_preconditioners(states, step, statistics,
1516
+ num_statistics_per_state, original_shapes,
1517
+ exponents, max_size, prev_preconditioners):
1518
+ """Computes preconditioners for given statistics in states in PJIT mode.
1519
+
1520
+ Args:
1521
+ states: A list of optimizer states.
1522
+ step: Current step number
1523
+ statistics: A list of statistics for all variables (for every dim)
1524
+ num_statistics_per_state: Number of statistics per state to reconstruct
1525
+ output states.
1526
+ original_shapes: A list of shapes of the statistics.
1527
+ exponents: Exponent power to use for inverse-pth roots.
1528
+ max_size: Maximum dim of the statistics to pad.
1529
+ prev_preconditioners: Previously available preconditioner.
1530
+
1531
+ Returns:
1532
+ New optimizer states after computing the preconditioner.
1533
+ """
1534
+ num_statistics = len(statistics)
1535
+ to_pad = -num_statistics % num_devices_for_pjit
1536
+ padded_statistics = [pad_matrix(stat, max_size) for stat in statistics]
1537
+ padded_statistics.extend([
1538
+ jnp.eye(max_size, dtype=padded_statistics[0].dtype)
1539
+ for _ in range(to_pad)
1540
+ ])
1541
+ exponents.extend([1 for _ in range(to_pad)])
1542
+ all_statistics = jnp.stack(padded_statistics)
1543
+ all_exponents = jnp.stack(exponents)
1544
+
1545
+ def _internal_inverse_pth_root_all():
1546
+ preconditioners, errors = _matrix_inverse_pth_root_pjit(
1547
+ all_statistics, all_exponents)
1548
+ b1 = preconditioners.shape[0]
1549
+
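+ # Split the stacked [b1, ...] outputs back into a list of b1 per-statistic arrays.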
1550
+ def split(batched_values):
1551
+ return [
1552
+ jnp.squeeze(v)
1553
+ for v in jnp.split(batched_values, indices_or_sections=b1, axis=0)
1554
+ ]
1555
+
1556
+ return split(preconditioners), split(errors)
1557
+
1558
+ if preconditioning_compute_steps == 1:
1559
+ preconditioners_flat, errors_flat = _internal_inverse_pth_root_all()
1560
+ else:
1561
+ # Passing statistics instead of preconditioners as they are similarly
1562
+ # shaped tensors. Note statistics will be ignored as we are passing in
1563
+ # a large init value for error.
1564
+ preconditioners_init = padded_statistics
1565
+ errors_init = [inverse_failure_threshold] * len(padded_statistics)
1566
+ init_state = [preconditioners_init, errors_init]
1567
+ perform_step = step % preconditioning_compute_steps == 0
1568
+ preconditioners_flat, errors_flat = efficient_cond(
1569
+ perform_step, _internal_inverse_pth_root_all, init_state)
1570
+
1571
+ def _skip(error):
1572
+ condition = jnp.logical_or(
1573
+ jnp.isnan(error), error >= inverse_failure_threshold)
1574
+ return condition.astype(error.dtype)
1575
+
1576
+ def _select_preconditioner(error, new_p, old_p):
1577
+ return lax.cond(
1578
+ _skip(error), lambda _: old_p, lambda _: new_p, operand=None)
1579
+
1580
+ new_preconditioners_flat = []
1581
+ for p, shape, prev_p, error in zip(preconditioners_flat, original_shapes,
1582
+ prev_preconditioners, errors_flat):
1583
+ new_preconditioners_flat.append(
1584
+ _select_preconditioner(error, p[:shape[0], :shape[1]], prev_p))
1585
+
1586
+ assert len(states) == len(num_statistics_per_state)
1587
+ assert len(new_preconditioners_flat) == num_statistics
1588
+
1589
+ # Add back empty preconditioners so that we can set the optimizer state.
1590
+ preconditioners_for_states = []
1591
+ idx = 0
1592
+ for num_statistics, state in zip(num_statistics_per_state, states):
1593
+ if num_statistics == 0:
1594
+ preconditioners_for_states.append([])
1595
+ else:
1596
+ preconditioners_for_state = new_preconditioners_flat[idx:idx +
1597
+ num_statistics]
1598
+ assert len(state.statistics) == len(preconditioners_for_state)
1599
+ preconditioners_for_states.append(preconditioners_for_state)
1600
+ idx += num_statistics
1601
+ new_states = []
1602
+ for state, new_preconditioners in zip(states, preconditioners_for_states):
1603
+ new_states.append(
1604
+ ParameterStats(state.diagonal_statistics, state.statistics,
1605
+ new_preconditioners, state.diagonal_momentum,
1606
+ state.momentum))
1607
+
1608
+ return new_states
1609
+
1610
+ def _compute_preconditioners(states, params, step):
1611
+ """Computes preconditioners for given statistics in states.
1612
+
1613
+ Args:
1614
+ states: A list of optimizer states.
1615
+ params: A list of params.
1616
+ step: Current step number
1617
+
1618
+ Returns:
1619
+ New optimizer states after computing the preconditioner.
1620
+ """
1621
+ statistics = []
1622
+ num_statistics_per_state = []
1623
+ original_shapes = []
1624
+ exponents = []
1625
+ max_size = 0
1626
+ prev_preconditioners = []
1627
+
1628
+ for state, param in zip(states, params):
1629
+ num_statistics = len(state.statistics)
1630
+ num_statistics_per_state.append(num_statistics)
1631
+ original_shapes_for_state = []
1632
+ if num_statistics > 0:
1633
+ preconditioner = Preconditioner(param, block_size,
1634
+ best_effort_shape_interpretation)
1635
+ for statistic in state.statistics:
1636
+ exponents.append(preconditioner.exponent_for_preconditioner(
1637
+ ) if exponent_override == 0 else exponent_override)
1638
+ original_shapes_for_state.append(statistic.shape)
1639
+ max_size = max(max_size, statistic.shape[0])
1640
+
1641
+ statistics.extend(state.statistics)
1642
+ prev_preconditioners.extend(state.preconditioners)
1643
+ original_shapes.extend(original_shapes_for_state)
1644
+
1645
+ if batch_axis_name:
1646
+ # Quantization is only supported in the PMAP path (when batch_axis_name is set).
1647
+ quantized_dtype = quantized_dtype_for_second_moment_statistics_buffers()
1648
+
1649
+ if quantized_dtype == jnp.float32:
1650
+ return _pmap_compute_preconditioners(states, step, statistics,
1651
+ num_statistics_per_state,
1652
+ original_shapes, exponents,
1653
+ max_size, prev_preconditioners)
1654
+ else:
1655
+ return _pmap_quantized_compute_preconditioners(
1656
+ states, step, statistics, num_statistics_per_state, original_shapes,
1657
+ exponents, max_size, prev_preconditioners)
1658
+
1659
+ else:
1660
+ return _pjit_compute_preconditioners(states, step, statistics,
1661
+ num_statistics_per_state,
1662
+ original_shapes, exponents, max_size,
1663
+ prev_preconditioners)
1664
+
1665
+ def _transform_grad(grad, state, param, step):
1666
+ """Transform per-parameter gradients."""
1667
+ preconditioner = Preconditioner(param, block_size,
1668
+ best_effort_shape_interpretation)
1669
+ sgd_update = grad
1670
+ new_diagonal_statistics = state.diagonal_statistics.to_float()
1671
+ if graft_type == GraftingType.ADAGRAD:
1672
+ new_diagonal_statistics = state.diagonal_statistics.to_float(
1673
+ ) + jnp.square(grad)
1674
+ adagrad_update = grad / (
1675
+ jnp.sqrt(new_diagonal_statistics) + diagonal_epsilon)
1676
+ grafting_update = adagrad_update
1677
+ elif (graft_type == GraftingType.RMSPROP or
1678
+ graft_type == GraftingType.RMSPROP_NORMALIZED):
1679
+
1680
+ scaled_grad = grad
1681
+ if graft_type == GraftingType.RMSPROP_NORMALIZED:
1682
+ scaled_grad = grad / jnp.linalg.norm(grad)
1683
+
1684
+ w1 = beta2
1685
+ w2 = beta2 if beta2 == 1.0 else (1.0 - beta2)
1686
+
1687
+ new_diagonal_statistics = (
1688
+ w1 * state.diagonal_statistics.to_float() +
1689
+ w2 * jnp.square(scaled_grad))
1690
+ rmsprop_update = scaled_grad / (
1691
+ jnp.sqrt(new_diagonal_statistics) + diagonal_epsilon)
1692
+
1693
+ if clip_by_scaled_gradient_norm:
1694
+ scaled_grad_norm = jnp.linalg.norm(rmsprop_update) / (
1695
+ jnp.sqrt(float(rmsprop_update.size)))
1696
+ clipping_denom = jnp.maximum(
1697
+ 1., scaled_grad_norm / clip_by_scaled_gradient_norm)
1698
+ rmsprop_update /= clipping_denom
1699
+
1700
+ grafting_update = rmsprop_update
1701
+ else:
1702
+ grafting_update = sgd_update
1703
+
1704
+ precond_grad = grad
1705
+ if not _skip_preconditioning(param):
1706
+ precond_grad = preconditioner.preconditioned_grad(
1707
+ precond_grad,
1708
+ _maybe_dequantize_preconditioners(state.preconditioners))
1709
+ else:
1710
+ precond_grad = grafting_update
1711
+
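+ # Grafting: keep the direction of the Shampoo-preconditioned gradient but
+ # rescale it to the norm of the grafted (SGD/Adagrad/RMSProp) update.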
1712
+ grafting_update_norm = jnp.linalg.norm(grafting_update)
1713
+ precond_grad_norm = jnp.linalg.norm(precond_grad)
1714
+
1715
+ multiplier = (grafting_update_norm / (precond_grad_norm + 1e-16))
1716
+ shampoo_update = precond_grad * multiplier
1717
+
1718
+ shampoo_update_with_wd = shampoo_update
1719
+ grafting_update_with_wd = grafting_update
1720
+ if weight_decay != 0:
1721
+ shampoo_update_with_wd = shampoo_update + weight_decay * param
1722
+ grafting_update_with_wd = grafting_update + weight_decay * param
1723
+
1724
+ w = (1.0 - beta1) if moving_average_for_momentum else 1.0
1725
+ shampoo_update_with_wd_momentum = (
1726
+ state.momentum.to_float() * beta1 + w * shampoo_update_with_wd)
1727
+ grafting_update_with_wd_momentum = (
1728
+ state.diagonal_momentum.to_float() * beta1 +
1729
+ w * grafting_update_with_wd)
1730
+
1731
+ run_shampoo = (step >= start_preconditioning_step).astype(
1732
+ grafting_update_with_wd_momentum.dtype)
1733
+
1734
+ momentum_update = (
1735
+ run_shampoo * shampoo_update_with_wd_momentum +
1736
+ (1.0 - run_shampoo) * grafting_update_with_wd_momentum)
1737
+
1738
+ wd_update = (
1739
+ run_shampoo * shampoo_update_with_wd +
1740
+ (1.0 - run_shampoo) * grafting_update_with_wd)
1741
+
1742
+ if nesterov:
1743
+ momentum_update = w * wd_update + beta1 * momentum_update
1744
+
1745
+ lr = learning_rate
1746
+ if callable(learning_rate):
1747
+ lr = learning_rate(step)
1748
+ transformed_update = -1.0 * lr * momentum_update
1749
+
1750
+ param_stats = ParameterStats(
1751
+ _quantize_diagonal_statistics(new_diagonal_statistics),
1752
+ state.statistics, state.preconditioners,
1753
+ _quantize_momentum(grafting_update_with_wd_momentum),
1754
+ _quantize_momentum(shampoo_update_with_wd_momentum))
1755
+ return transformed_update, param_stats
1756
+
1757
+ def update_fn(grads, state, params):
1758
+ """Transform the input gradient and update all statistics.
1759
+
1760
+ Args:
1761
+ grads: the gradient tensors for the parameters.
1762
+ state: a named tuple containing the state of the optimizer
1763
+ params: the parameters that should be updated.
1764
+
1765
+ Returns:
1766
+ A tuple containing the new parameters and the new optimizer state.
1767
+ """
1768
+ params_flat, treedef = jax.tree_flatten(params)
1769
+ stats_flat = treedef.flatten_up_to(state.stats)
1770
+ grads_flat = treedef.flatten_up_to(grads)
1771
+
1772
+ new_stats_flat = jax.tree_multimap(
1773
+ lambda g, s, p: _compute_stats(g, s, p, state.count), grads_flat,
1774
+ stats_flat, params_flat)
1775
+ new_stats_flat = _compute_preconditioners(new_stats_flat, params_flat,
1776
+ state.count)
1777
+
1778
+ outputs = jax.tree_multimap(
1779
+ lambda g, s, p: _transform_grad(g, s, p, state.count), grads_flat,
1780
+ new_stats_flat, params_flat)
1781
+ updates_flat, new_stats_flat = list(zip(*outputs)) if outputs else ((), ())
1782
+
1783
+ updates = jax.tree_unflatten(treedef, updates_flat)
1784
+ new_stats = jax.tree_unflatten(treedef, new_stats_flat)
1785
+
1786
+ new_state = ShampooState(
1787
+ count=state.count+1, stats=new_stats)
1788
+ return updates, new_state
1789
+
1790
+ if shard_optimizer_states:
1791
+ # Hijacks the init_fn signature so we can return an OptState with
1792
+ # appropriate init_fns.
1793
+ def _init_fns(unused_params):
1794
+ return InitFnState(
1795
+ init_fn=sharded_init_fn,
1796
+ pspec_fn=sharded_init_partition_spec_fn,
1797
+ shape_and_dtype_fn=sharded_init_shape_and_dtype_fn)
1798
+
1799
+ return optax.GradientTransformation(_init_fns, sharded_update_fn)
1800
+ else:
1801
+ return optax.GradientTransformation(init_fn, update_fn)
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8fc602f9bddd0cad0b464f75463b0329adea24e06851d5740ce11781d6cc4ba
3
+ size 1419302302
flax_model_to_pytorch.py ADDED
@@ -0,0 +1,27 @@
1
+ from transformers import AutoModelForCausalLM, FlaxAutoModelForCausalLM, AutoTokenizer
2
+ import torch
3
+ import numpy as np
4
+ import jax
5
+ import jax.numpy as jnp
6
+
7
+ def to_f32(t):
8
+ return jax.tree_map(lambda x: x.astype(jnp.float32) if x.dtype == jnp.bfloat16 else x, t)
9
+
10
+ jax.config.update('jax_platform_name', 'cpu')
11
+ MODEL_PATH = "./"
12
+ model = FlaxAutoModelForCausalLM.from_pretrained(MODEL_PATH)
13
+ model.params = to_f32(model.params)
14
+ model.save_pretrained(MODEL_PATH)
15
+
16
+ pt_model = AutoModelForCausalLM.from_pretrained(
17
+ MODEL_PATH, from_flax=True).to('cpu')
18
+
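+ # Quick sanity check: print logits from both the PyTorch and Flax models on a
+ # dummy batch; the two outputs should match closely if the conversion succeeded.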
19
+ input_ids = np.asarray(2 * [128 * [0]], dtype=np.int32)
20
+ input_ids_pt = torch.tensor(input_ids)
21
+
22
+ logits_pt = pt_model(input_ids_pt).logits
23
+ print(logits_pt)
24
+ logits_fx = model(input_ids).logits
25
+ print(logits_fx)
26
+
27
+ pt_model.save_pretrained(MODEL_PATH)
merges.txt ADDED
The diff for this file is too large to render. See raw diff
replace_token_script.py ADDED
@@ -0,0 +1,80 @@
1
+ '''This script was used to replace the final index of tokenizer.json and vocab.json
2
+ with the "<|endoftext|>" token. It also reassociates the corresponding merges.'''
3
+
4
+ import json
5
+
6
+ tokenizer_path = 'tokenizer.json'
7
+ model_config_path = 'config.json'
8
+ vocab_path = 'vocab.json'
9
+
10
+ with open(vocab_path, "r") as f:
11
+ vocab_data = json.load(f)
12
+
13
+ with open(tokenizer_path, "r") as f:
14
+ tokenizer_data = json.load(f)
15
+
16
+ with open(model_config_path, "r") as f:
17
+ model_config = json.load(f)
18
+
19
+ model_vocab_size = model_config['vocab_size']
20
+ tokenizer_vocab = tokenizer_data['model']['vocab']
21
+
22
+ mergeslength = len(tokenizer_data['model']['merges'])
23
+
24
+ #readjust added_tokens 'id' to model_vocab_size - 1
25
+ tokenizer_data['added_tokens'][-1]['id'] = model_vocab_size - 1
26
+
27
+ final_index = model_vocab_size - 1
28
+ eos = '<|endoftext|>'
29
+
30
+ #retrieve the key of final index
31
+ old_key_final_index_tokenizer = list(tokenizer_data['model']['vocab'].keys())[final_index]
32
+ old_key_final_index_vocab = list(vocab_data.keys())[final_index]
33
+ old_key_final_index_vocab_min2 = list(vocab_data.keys())[final_index - 1]
34
+ old_key_final_index_tokenizer_merges = tokenizer_data['model']['merges'][mergeslength - 1]
35
+
36
+ print(f"old_key_final_index_tokenizer = {old_key_final_index_tokenizer}")
37
+ print(f"old_key_final_index_vocab = {old_key_final_index_vocab}")
38
+ print(f"old_key_final_index_vocab_min2 = {old_key_final_index_vocab_min2}")
39
+ print(f"old_key_final_index_tokenizer_merges = {old_key_final_index_tokenizer_merges}")
40
+
41
+ #replace old key with new key
42
+ tokenizer_data['model']['vocab']['<|endoftext|>'] = tokenizer_data['model']['vocab'][old_key_final_index_tokenizer]
43
+ vocab_data[eos] = vocab_data[old_key_final_index_vocab]
44
+
45
+ #drop the final merge entry so the merges stay consistent with the replaced final token
46
+ tokenizer_data['model']['merges'] = tokenizer_data['model']['merges'][: mergeslength - 1]
47
+
48
+
49
+ #delete old key
50
+ del tokenizer_data['model']['vocab'][old_key_final_index_tokenizer]
51
+ del vocab_data[old_key_final_index_vocab]
52
+
53
+ #check updated key
54
+ old_key_final_index_tokenizer = list(tokenizer_data['model']['vocab'].keys())[final_index]
55
+ old_key_final_index_vocab = list(vocab_data.keys())[final_index]
56
+ old_key_final_index_tokenizer_merges = tokenizer_data['model']['merges'][mergeslength - 2]
57
+
58
+ print(len(tokenizer_data['model']['merges']))
59
+ print()
60
+ print(f"updated old_key_final_index_tokenizer = {old_key_final_index_tokenizer}")
61
+ print(f"updated old_key_final_index_vocab = {old_key_final_index_vocab}")
62
+ print(f"updated old_key_final_index_tokenizer_merges = {old_key_final_index_tokenizer_merges}")
63
+
64
+ with open(tokenizer_path, "w")as f:
65
+ json.dump(tokenizer_data, f)
66
+
67
+ with open(vocab_path, "w")as f:
68
+ json.dump(vocab_data, f)
69
+
70
+ with open('merges.txt') as f:
71
+ lines = f.readlines()
72
+
73
+ with open("merges.txt", "w") as f:
74
+ for i in range(len(lines) - 1):
75
+ f.write(lines[i])
76
+
77
+ with open('merges.txt') as f:
78
+ newlines = f.readlines()
79
+
80
+ print(f"newlines[len(newlines) - 1] = {newlines[len(newlines) - 1]}")
run_clm_flax.py ADDED
@@ -0,0 +1,892 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Team All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Pre-training/Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset.
18
+
19
+ Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
20
+ https://huggingface.co/models?filter=text-generation
21
+ """
22
+ # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
23
+
24
+ import json
25
+ import logging
26
+ import math
27
+ import os
28
+ import sys
29
+ import time
30
+ import gc
31
+ from dataclasses import asdict, dataclass, field
32
+ from enum import Enum
33
+ from itertools import chain
34
+ from pathlib import Path
35
+ from typing import Callable, Optional
36
+
37
+ import datasets
38
+ import numpy as np
39
+ from datasets import Dataset, load_dataset, load_from_disk
40
+ from tqdm import tqdm
41
+
42
+ import jax
43
+ import jax.numpy as jnp
44
+ import optax
45
+ import transformers
46
+ from flax import jax_utils, traverse_util
47
+ from flax.jax_utils import unreplicate
48
+ from flax.training import train_state
49
+ from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
50
+ from huggingface_hub import Repository
51
+ from transformers import (
52
+ CONFIG_MAPPING,
53
+ FLAX_MODEL_FOR_CAUSAL_LM_MAPPING,
54
+ AutoConfig,
55
+ AutoTokenizer,
56
+ FlaxAutoModelForCausalLM,
57
+ HfArgumentParser,
58
+ is_tensorboard_available,
59
+ set_seed,
60
+ )
61
+ from transformers.file_utils import get_full_repo_name
62
+ from transformers.testing_utils import CaptureLogger
63
+
64
+ from distributed_shampoo import distributed_shampoo, GraftingType
65
+
66
+
67
+ logger = logging.getLogger(__name__)
68
+
69
+ MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_CAUSAL_LM_MAPPING.keys())
70
+ MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
71
+
72
+
73
+ @dataclass
74
+ class TrainingArguments:
75
+ output_dir: str = field(
76
+ metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
77
+ )
78
+ overwrite_output_dir: bool = field(
79
+ default=False,
80
+ metadata={
81
+ "help": (
82
+ "Overwrite the content of the output directory. "
83
+ "Use this to continue training if output_dir points to a checkpoint directory."
84
+ )
85
+ },
86
+ )
87
+ do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
88
+ do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."})
89
+ per_device_train_batch_size: int = field(
90
+ default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
91
+ )
92
+ per_device_eval_batch_size: int = field(
93
+ default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
94
+ )
95
+ gradient_accumulation_steps: int = field(
96
+ default=1,
97
+ metadata={
98
+ "help": "Number of updates steps to accumulate before performing a backward/update pass."
99
+ },
100
+ )
101
+ learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
102
+ weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."})
103
+ adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"})
104
+ adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"})
105
+ adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."})
106
+ adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace AdamW by Adafactor."})
107
+ distributed_shampoo: bool = field(
108
+ default=False, metadata={"help": "Use Distributed Shampoo optimizer instead of AdamW."},
109
+ )
110
+ quantize_shampoo: bool = field(
111
+ default=False, metadata={"help": "Quantize Distributed Shampoo optimizer."},
112
+ )
113
+ num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
114
+ warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."})
115
+ warmup_ratio: float = field(default=0.0, metadata={"help": "Linear warmup ratio of total train steps."})
116
+ cosine_decay: bool = field(
117
+ default=False, metadata={"help": "Whether or not to use cosine decay instead of the basic linear decay schedule."}
118
+ )
119
+ gradient_clipping: bool = field(
120
+ default=False, metadata={"help": "Whether or not to use gradient clipping."}
121
+ )
122
+ logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
123
+ save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
124
+ eval_steps: int = field(default=None, metadata={"help": "Run an evaluation every X steps."})
125
+ seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."})
126
+ push_to_hub: bool = field(
127
+ default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."}
128
+ )
129
+ hub_model_id: str = field(
130
+ default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."}
131
+ )
132
+ hub_token: str = field(default=None, metadata={"help": "The token to use to push to the Model Hub."})
133
+
134
+ def __post_init__(self):
135
+ if self.output_dir is not None:
136
+ self.output_dir = os.path.expanduser(self.output_dir)
137
+
138
+ def to_dict(self):
139
+ """
140
+ Serializes this instance while replacing `Enum` members by their values (for JSON serialization support). It obfuscates
141
+ the token values by replacing them with placeholders.
142
+ """
143
+ d = asdict(self)
144
+ for k, v in d.items():
145
+ if isinstance(v, Enum):
146
+ d[k] = v.value
147
+ if isinstance(v, list) and len(v) > 0 and isinstance(v[0], Enum):
148
+ d[k] = [x.value for x in v]
149
+ if k.endswith("_token"):
150
+ d[k] = f"<{k.upper()}>"
151
+ return d
152
+
153
+
154
+ @dataclass
155
+ class ModelArguments:
156
+ """
157
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
158
+ """
159
+
160
+ model_name_or_path: Optional[str] = field(
161
+ default=None,
162
+ metadata={
163
+ "help": "The model checkpoint for weights initialization."
164
+ "Don't set if you want to train a model from scratch."
165
+ },
166
+ )
167
+ model_type: Optional[str] = field(
168
+ default=None,
169
+ metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
170
+ )
171
+ config_name: Optional[str] = field(
172
+ default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
173
+ )
174
+ tokenizer_name: Optional[str] = field(
175
+ default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
176
+ )
177
+ cache_dir: Optional[str] = field(
178
+ default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
179
+ )
180
+ use_fast_tokenizer: bool = field(
181
+ default=True,
182
+ metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
183
+ )
184
+ dtype: Optional[str] = field(
185
+ default="float32",
186
+ metadata={
187
+ "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
188
+ },
189
+ )
190
+
191
+
192
+ @dataclass
193
+ class DataTrainingArguments:
194
+ """
195
+ Arguments pertaining to what data we are going to input our model for training and eval.
196
+ """
197
+
198
+ dataset_name: Optional[str] = field(
199
+ default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
200
+ )
201
+ dataset_config_name: Optional[str] = field(
202
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
203
+ )
204
+ dataset_filepath: Optional[str] = field(
205
+ default=None, metadata={"help": "Filepath to locally saved HF Dataset (with 'dataset.save_to_disk' method) to use for training"}
206
+ )
207
+ train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
208
+ validation_file: Optional[str] = field(
209
+ default=None,
210
+ metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
211
+ )
212
+ max_train_samples: Optional[int] = field(
213
+ default=None,
214
+ metadata={
215
+ "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
216
+ "value if set."
217
+ },
218
+ )
219
+ max_eval_samples: Optional[int] = field(
220
+ default=None,
221
+ metadata={
222
+ "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
223
+ "value if set."
224
+ },
225
+ )
226
+ overwrite_cache: bool = field(
227
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
228
+ )
229
+ validation_split_percentage: Optional[int] = field(
230
+ default=5,
231
+ metadata={
232
+ "help": "The percentage of the train set used as validation set in case there's no validation split"
233
+ },
234
+ )
235
+ block_size: Optional[int] = field(
236
+ default=None,
237
+ metadata={
238
+ "help": "Optional input sequence length after tokenization. "
239
+ "The training dataset will be truncated in block of this size for training. "
240
+ "Default to the model max input length for single sentence inputs (take into account special tokens)."
241
+ },
242
+ )
243
+ overwrite_cache: bool = field(
244
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
245
+ )
246
+ preprocessing_num_workers: Optional[int] = field(
247
+ default=None,
248
+ metadata={"help": "The number of processes to use for the preprocessing."},
249
+ )
250
+ keep_linebreaks: bool = field(
251
+ default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
252
+ )
253
+
254
+ def __post_init__(self):
255
+ if self.dataset_name is None and self.train_file is None and self.dataset_filepath is None and self.validation_file is None:
256
+ raise ValueError("Need either a dataset name or a training/validation file.")
257
+ else:
258
+ if self.train_file is not None:
259
+ extension = self.train_file.split(".")[-1]
260
+ assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
261
+ if self.validation_file is not None:
262
+ extension = self.validation_file.split(".")[-1]
263
+ assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
264
+
265
+
266
+ class TrainState(train_state.TrainState):
267
+ dropout_rng: jnp.ndarray
268
+
269
+ def replicate(self):
270
+ return jax_utils.replicate(self).replace(dropout_rng=shard_prng_key(self.dropout_rng))
271
+
272
+
273
+ def data_loader(rng: jax.random.PRNGKey, dataset: Dataset, batch_size: int, shuffle: bool = False):
274
+ """
275
+ Returns batches of size `batch_size` from truncated `dataset`, sharded over all local devices.
276
+ Shuffle batches if `shuffle` is `True`.
277
+ """
278
+ steps_per_epoch = len(dataset) // batch_size
279
+
280
+ if shuffle:
281
+ batch_idx = jax.random.permutation(rng, len(dataset))
282
+ else:
283
+ batch_idx = jnp.arange(len(dataset))
284
+
285
+ batch_idx = batch_idx[: steps_per_epoch * batch_size] # Skip incomplete batch.
286
+ batch_idx = batch_idx.reshape((steps_per_epoch, batch_size))
287
+
288
+ for idx in batch_idx:
289
+ batch = dataset[idx]
290
+ batch = {k: np.array(v) for k, v in batch.items()}
291
+
292
+ yield batch
293
+
294
+
295
+ def write_train_metric(summary_writer, train_metrics, train_time, step):
296
+ summary_writer.scalar("train_time", train_time, step)
297
+
298
+ train_metrics = get_metrics(train_metrics)
299
+ for key, vals in train_metrics.items():
300
+ tag = f"train_{key}"
301
+ for i, val in enumerate(vals):
302
+ summary_writer.scalar(tag, val, step - len(vals) + i + 1)
303
+
304
+
305
+ def write_eval_metric(summary_writer, eval_metrics, step):
306
+ for metric_name, value in eval_metrics.items():
307
+ summary_writer.scalar(f"eval_{metric_name}", value, step)
308
+
309
+
310
+ def create_linear_learning_rate_fn(
311
+ train_ds_size: int, train_batch_size: int, num_train_epochs: int, num_warmup_steps: int, learning_rate: float
312
+ ) -> Callable[[int], jnp.array]:
313
+ """Returns a linear warmup, linear decay learning rate function."""
314
+ steps_per_epoch = train_ds_size // train_batch_size
315
+ num_train_steps = steps_per_epoch * num_train_epochs
316
+ warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
317
+ decay_fn = optax.linear_schedule(
318
+ init_value=learning_rate, end_value=0, transition_steps=num_train_steps - num_warmup_steps
319
+ )
320
+ schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
321
+ return schedule_fn
322
+
323
+ def create_cosine_learning_rate_fn(
324
+ train_ds_size: int, train_batch_size: int, num_train_epochs: int, num_warmup_steps: int, learning_rate: float
325
+ ) -> Callable[[int], jnp.array]:
326
+ """Returns a linear warmup, cosine decay learning rate function."""
327
+ steps_per_epoch = train_ds_size // train_batch_size
328
+ num_train_steps = steps_per_epoch * num_train_epochs
329
+ warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
330
+ decay_fn = optax.cosine_decay_schedule(
331
+ init_value=learning_rate, decay_steps=num_train_steps - num_warmup_steps, alpha=0.1
332
+ )
333
+ schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
334
+ return schedule_fn
335
+
336
+
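+ # A minimal sketch of the warmup + cosine schedule built above, with assumed
+ # example values (peak lr 3e-4, 1000 warmup steps, 10000 total train steps):
+ #   warmup = optax.linear_schedule(init_value=0.0, end_value=3e-4, transition_steps=1000)
+ #   decay = optax.cosine_decay_schedule(init_value=3e-4, decay_steps=9000, alpha=0.1)
+ #   schedule = optax.join_schedules(schedules=[warmup, decay], boundaries=[1000])
+ #   schedule(0) == 0.0, schedule(1000) == 3e-4, schedule(10000) == 3e-5 (alpha * peak)
+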
337
+ def main():
338
+ # See all possible arguments in src/transformers/training_args.py
339
+ # or by passing the --help flag to this script.
340
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
341
+
342
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
343
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
344
+ # If we pass only one argument to the script and it's the path to a json file,
345
+ # let's parse it to get our arguments.
346
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
347
+ else:
348
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
349
+
350
+ if (
351
+ os.path.exists(training_args.output_dir)
352
+ and os.listdir(training_args.output_dir)
353
+ and training_args.do_train
354
+ and not training_args.overwrite_output_dir
355
+ ):
356
+ raise ValueError(
357
+ f"Output directory ({training_args.output_dir}) already exists and is not empty."
358
+ "Use --overwrite_output_dir to overcome."
359
+ )
360
+
361
+ # Make one log on every process with the configuration for debugging.
362
+ logging.basicConfig(
363
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
364
+ datefmt="%m/%d/%Y %H:%M:%S",
365
+ level=logging.INFO,
366
+ )
367
+ # Setup logging, we only want one process per machine to log things on the screen.
368
+ logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
369
+ if jax.process_index() == 0:
370
+ datasets.utils.logging.set_verbosity_warning()
371
+ transformers.utils.logging.set_verbosity_info()
372
+ else:
373
+ datasets.utils.logging.set_verbosity_error()
374
+ transformers.utils.logging.set_verbosity_error()
375
+
376
+ # Set the verbosity to info of the Transformers logger (on main process only):
377
+ logger.info(f"Training/evaluation parameters {training_args}")
378
+
379
+ # Set seed before initializing model.
380
+ set_seed(training_args.seed)
381
+
382
+ # Handle the repository creation
383
+ if training_args.push_to_hub:
384
+ if training_args.hub_model_id is None:
385
+ repo_name = get_full_repo_name(
386
+ Path(training_args.output_dir).absolute().name, token=training_args.hub_token
387
+ )
388
+ else:
389
+ repo_name = training_args.hub_model_id
390
+ repo = Repository(training_args.output_dir, clone_from=repo_name)
391
+
392
+ # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
393
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
394
+ # (the dataset will be downloaded automatically from the datasets Hub).
395
+ #
396
+ # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
397
+ # 'text' is found. You can easily tweak this behavior (see below).
398
+ #
399
+ # In distributed training, the load_dataset function guarantees that only one local process can concurrently
400
+ # download the dataset.
401
+ if data_args.dataset_name is not None:
402
+ # Downloading and loading a dataset from the hub.
403
+ dataset = load_dataset(
404
+ data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False
405
+ )
406
+
407
+ if "validation" not in dataset.keys():
408
+ dataset["validation"] = load_dataset(
409
+ data_args.dataset_name,
410
+ data_args.dataset_config_name,
411
+ split=f"train[:{data_args.validation_split_percentage}%]",
412
+ cache_dir=model_args.cache_dir,
413
+ )
414
+ dataset["train"] = load_dataset(
415
+ data_args.dataset_name,
416
+ data_args.dataset_config_name,
417
+ split=f"train[{data_args.validation_split_percentage}%:]",
418
+ cache_dir=model_args.cache_dir,
419
+ )
420
+
421
+ elif data_args.dataset_filepath is not None:
422
+ # Loading a dataset from local file.
423
+ dataset = load_from_disk(data_args.dataset_filepath)
424
+ if "validation" not in dataset.keys():
425
+ dataset = dataset["train"].train_test_split(test_size=data_args.validation_split_percentage/100)
426
+ dataset["validation"] = dataset["test"]
427
+ del dataset["test"]
428
+
429
+ else:
430
+ data_files = {}
431
+ dataset_args = {}
432
+ if data_args.train_file is not None:
433
+ data_files["train"] = data_args.train_file
434
+ if data_args.validation_file is not None:
435
+ data_files["validation"] = data_args.validation_file
436
+ extension = data_args.train_file.split(".")[-1]
437
+ if extension == "txt":
438
+ extension = "text"
439
+ dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
440
+ dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args)
441
+
442
+ if "validation" not in dataset.keys():
443
+ dataset["validation"] = load_dataset(
444
+ extension,
445
+ data_files=data_files,
446
+ split=f"train[:{data_args.validation_split_percentage}%]",
447
+ cache_dir=model_args.cache_dir,
448
+ **dataset_args,
449
+ )
450
+ dataset["train"] = load_dataset(
451
+ extension,
452
+ data_files=data_files,
453
+ split=f"train[{data_args.validation_split_percentage}%:]",
454
+ cache_dir=model_args.cache_dir,
455
+ **dataset_args,
456
+ )
457
+ # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
458
+ # https://huggingface.co/docs/datasets/loading_datasets.html.
459
+
460
+ # Load pretrained model and tokenizer
461
+
462
+ # Distributed training:
463
+ # The .from_pretrained methods guarantee that only one local process can concurrently
464
+ # download model & vocab.
465
+ if model_args.config_name:
466
+ config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
467
+ elif model_args.model_name_or_path:
468
+ config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
469
+ else:
470
+ config = CONFIG_MAPPING[model_args.model_type]()
471
+ logger.warning("You are instantiating a new config instance from scratch.")
472
+
473
+ if model_args.tokenizer_name:
474
+ tokenizer = AutoTokenizer.from_pretrained(
475
+ model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
476
+ )
477
+ elif model_args.model_name_or_path:
478
+ tokenizer = AutoTokenizer.from_pretrained(
479
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
480
+ )
481
+ else:
482
+ raise ValueError(
483
+ "You are instantiating a new tokenizer from scratch. This is not supported by this script."
484
+ "You can do it from another script, save it, and load it from here, using --tokenizer_name."
485
+ )
486
+
487
+ if tokenizer.pad_token is None:
488
+ tokenizer.pad_token = tokenizer.eos_token
489
+
490
+ if model_args.model_name_or_path:
491
+ model = FlaxAutoModelForCausalLM.from_pretrained(
492
+ model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
493
+ )
494
+ else:
495
+ model = FlaxAutoModelForCausalLM.from_config(
496
+ config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
497
+ )
498
+
499
+ # Preprocessing the datasets.
500
+ # First we tokenize all the texts.
501
+ if training_args.do_train:
502
+ column_names = dataset["train"].column_names
503
+ else:
504
+ column_names = dataset["validation"].column_names
505
+ text_column_name = "text" if "text" in column_names else column_names[0]
506
+
507
+ # Since tokenize_function will be pickled, force logger loading beforehand to avoid a _LazyModule error in Hasher.
508
+ tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
509
+
510
+ def tokenize_function(examples):
511
+ with CaptureLogger(tok_logger) as cl:
512
+ output = tokenizer(examples[text_column_name])
513
+ # clm input could be much much longer than block_size
514
+ if "Token indices sequence length is longer than the" in cl.out:
515
+ tok_logger.warning(
516
+ "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model."
517
+ )
518
+ return output
519
+
520
+ tokenized_datasets = dataset.map(
521
+ tokenize_function,
522
+ batched=True,
523
+ num_proc=data_args.preprocessing_num_workers,
524
+ remove_columns=column_names,
525
+ load_from_cache_file=not data_args.overwrite_cache,
526
+ )
527
+
528
+ if data_args.block_size is None:
529
+ block_size = tokenizer.model_max_length
530
+ if block_size > config.max_position_embeddings:
531
+ logger.warning(
532
+ f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
533
+ "Picking 1024 instead. You can change that default value by passing --block_size xxx."
534
+ )
535
+ block_size = 1024
536
+ else:
537
+ if data_args.block_size > tokenizer.model_max_length:
538
+ logger.warning(
539
+ f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model"
540
+ f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
541
+ )
542
+ block_size = min(data_args.block_size, tokenizer.model_max_length)
543
+
544
+ # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
545
+ def group_texts(examples):
546
+ # Concatenate all texts.
547
+ concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
548
+ total_length = len(concatenated_examples[list(examples.keys())[0]])
549
+ # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
550
+ # customize this part to your needs.
551
+ if total_length >= block_size:
552
+ total_length = (total_length // block_size) * block_size
553
+ # Split by chunks of max_len.
554
+ result = {
555
+ k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
556
+ for k, t in concatenated_examples.items()
557
+ }
558
+ result["labels"] = result["input_ids"].copy()
559
+ return result
560
+
561
+ # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
562
+ # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
563
+ # to preprocess.
564
+ #
565
+ # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
566
+ # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
567
+
568
+ lm_datasets = tokenized_datasets.map(
569
+ group_texts,
570
+ batched=True,
571
+ num_proc=data_args.preprocessing_num_workers,
572
+ load_from_cache_file=not data_args.overwrite_cache,
573
+ )
574
+
575
+ if training_args.do_train:
576
+ if "train" not in tokenized_datasets:
577
+ raise ValueError("--do_train requires a train dataset")
578
+ train_dataset = lm_datasets["train"]
579
+ if data_args.max_train_samples is not None:
580
+ train_dataset = train_dataset.select(range(data_args.max_train_samples))
581
+
582
+ # test to see that tokenization worked
583
+ detokenized_example = tokenizer.decode(train_dataset[0]["input_ids"])
584
+ logger.info(f"Detokenized example: {detokenized_example}")
585
+ detokenized_example = tokenizer.decode(train_dataset[-1]["input_ids"])
586
+ logger.info(f"Detokenized example 2: {detokenized_example}")
587
+
588
+ if training_args.do_eval:
589
+ if "validation" not in tokenized_datasets:
590
+ raise ValueError("--do_eval requires a validation dataset")
591
+ eval_dataset = lm_datasets["validation"]
592
+ if data_args.max_eval_samples is not None:
593
+ eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
594
+
595
+ # Enable tensorboard only on the master node
596
+ has_tensorboard = is_tensorboard_available()
597
+ if has_tensorboard and jax.process_index() == 0:
598
+ try:
599
+ from flax.metrics.tensorboard import SummaryWriter
600
+
601
+ summary_writer = SummaryWriter(log_dir=Path(os.path.join(training_args.output_dir, "runs")))
602
+ except ImportError as ie:
603
+ has_tensorboard = False
604
+ logger.warning(
605
+ f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
606
+ )
607
+ else:
608
+ logger.warning(
609
+ "Unable to display metrics through TensorBoard because the package is not installed: "
610
+ "Please run pip install tensorboard to enable."
611
+ )
612
+
613
+ # Initialize our training
614
+ rng = jax.random.PRNGKey(training_args.seed)
615
+ rng, dropout_rng = jax.random.split(rng)
616
+
617
+ # Store some constant
618
+ num_epochs = int(training_args.num_train_epochs)
619
+ train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count() * training_args.gradient_accumulation_steps
620
+ eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
621
+ steps_per_epoch = len(train_dataset) // train_batch_size
622
+ total_train_steps = steps_per_epoch * num_epochs
623
+
624
+ if training_args.warmup_ratio > 0:
625
+ warmup_steps = int(total_train_steps * training_args.warmup_ratio)
626
+ else:
627
+ warmup_steps = training_args.warmup_steps
628
+
629
+ # Create learning rate schedule
630
+ if training_args.cosine_decay:
631
+ lr_schedule_fn = create_cosine_learning_rate_fn(
632
+ len(train_dataset),
633
+ train_batch_size,
634
+ training_args.num_train_epochs,
635
+ warmup_steps,
636
+ training_args.learning_rate,
637
+ )
638
+ else:
639
+ lr_schedule_fn = create_linear_learning_rate_fn(
640
+ len(train_dataset),
641
+ train_batch_size,
642
+ training_args.num_train_epochs,
643
+ warmup_steps,
644
+ training_args.learning_rate,
645
+ )
646
+
647
+ # We use Optax's "masking" functionality to not apply weight decay
648
+ # to bias and LayerNorm scale parameters. decay_mask_fn returns a
649
+ # mask boolean with the same structure as the parameters.
650
+ # The mask is True for parameters that should be decayed.
651
+ # Note that this mask is specifically adapted for FlaxGPT2.
652
+ # For other models, one should correct the layer norm parameter naming
653
+ # accordingly.
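+ # For example, params["transformer"]["h"]["0"]["ln_1"]["scale"] flattens to the
+ # path ("transformer", "h", "0", "ln_1", "scale"); its last two entries match
+ # ("ln_1", "scale"), so the mask is False and no weight decay is applied there.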
654
+ def decay_mask_fn(params):
655
+ flat_params = traverse_util.flatten_dict(params)
656
+ flat_mask = {
657
+ path: (path[-1] != "bias" and path[-2:] not in [("ln_1", "scale"), ("ln_2", "scale"), ("ln_f", "scale")])
658
+ for path in flat_params
659
+ }
660
+ return traverse_util.unflatten_dict(flat_mask)
661
+
662
+ # create adam optimizer
663
+ if training_args.adafactor:
664
+ # We use the default parameters here to initialize adafactor,
665
+ # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
666
+ optimizer = optax.adafactor(
667
+ learning_rate=lr_schedule_fn,
668
+ )
669
+
670
+ elif training_args.distributed_shampoo:
671
+ # parameters from https://github.com/tensorflow/lingvo/blob/03ee9d7cd50764b0424c7c863733c91fc0b053ec/lingvo/jax/optimizers.py#L729
672
+ # Notes:
673
+ # - mask for weight decay is not implemented but we don't use it anyway
674
+ optimizer = distributed_shampoo(
675
+ lr_schedule_fn,
676
+ block_size=1536, # recommended default for large LM is 1536
677
+ beta1=0.9,
678
+ beta2=0.999,
679
+ diagonal_epsilon=1e-10,
680
+ matrix_epsilon=1e-8,
681
+ weight_decay=0.0,
682
+ start_preconditioning_step=1001,
683
+ preconditioning_compute_steps=10,
684
+ statistics_compute_steps=1,
685
+ best_effort_shape_interpretation=True,
686
+ graft_type=GraftingType.RMSPROP_NORMALIZED,
687
+ nesterov=False,
688
+ exponent_override=0,
689
+ batch_axis_name="batch",
690
+ inverse_failure_threshold=0.1,
691
+ moving_average_for_momentum=True,
692
+ skip_preconditioning_dim_size_gt=4096,
693
+ clip_by_scaled_gradient_norm=None,
694
+ precision=jax.lax.Precision.HIGHEST,
695
+ best_effort_memory_usage_reduction=training_args.quantize_shampoo,
696
+ )
697
+ else:
698
+ optimizer = optax.adamw(
699
+ learning_rate=lr_schedule_fn,
700
+ b1=training_args.adam_beta1,
701
+ b2=training_args.adam_beta2,
702
+ eps=training_args.adam_epsilon,
703
+ weight_decay=training_args.weight_decay,
704
+ mask=decay_mask_fn,
705
+ )
706
+ if training_args.gradient_clipping:
707
+ optimizer = optax.chain(
708
+ optax.clip_by_global_norm(1.),
709
+ optimizer
710
+ )
711
+
712
+ # add gradient accumulation
713
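+ # optax.apply_every collects gradients over N micro-steps and releases the accumulated gradient to the optimizer only on every Nth step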
+ if training_args.gradient_accumulation_steps > 1:
+ optimizer = optax.chain(
+ optax.apply_every(training_args.gradient_accumulation_steps), optimizer
+ )
+
+ # Setup train state
+ state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer, dropout_rng=dropout_rng)
+
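+ # Causal LM loss: logits and labels are shifted by one position so each token predicts the next one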
+ def loss_fn(logits, labels):
+ shift_logits = logits[..., :-1, :]
+ shift_labels = labels[..., 1:]
+ loss = optax.softmax_cross_entropy(shift_logits, onehot(shift_labels, shift_logits.shape[-1]))
+ return loss.mean()
+
+ # Define gradient update step fn
+ def train_step(state, batch):
+ dropout_rng, new_dropout_rng = jax.random.split(state.dropout_rng)
+
+ def compute_loss(params):
+ labels = batch.pop("labels")
+ logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
+ loss = loss_fn(logits, labels)
+ return loss
+
+ grad_fn = jax.value_and_grad(compute_loss)
+ loss, grad = grad_fn(state.params)
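+ # average the gradients across devices before applying the optimizer update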
+ grad = jax.lax.pmean(grad, "batch")
+
+ new_state = state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng)
+
+ metrics = {"loss": loss, "learning_rate": lr_schedule_fn(state.step // training_args.gradient_accumulation_steps)}
+ metrics = jax.lax.pmean(metrics, axis_name="batch")
+
+ return new_state, metrics
+
+ # Define eval fn
+ def eval_step(params, batch):
+ labels = batch.pop("labels")
+ logits = model(**batch, params=params, train=False)[0]
+ loss = loss_fn(logits, labels)
+
+ # summarize metrics
+ metrics = {"loss": loss}
+ metrics = jax.lax.pmean(metrics, axis_name="batch")
+ return metrics
+
+ # Create parallel version of the train and eval step
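+ # donate_argnums=(0,) donates the old train state's buffers to XLA so the memory can be reused for the new state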
+ p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+ p_eval_step = jax.pmap(eval_step, "batch")
+
+ # Replicate the train state on each device
+ state = state.replicate()
+
+ logger.info("***** Running training *****")
+ logger.info(f" Num examples = {len(train_dataset)}")
+ logger.info(f" Num Epochs = {num_epochs}")
+ logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
+ logger.info(f" Total train batch size (w. parallel & distributed) = {train_batch_size}")
+ logger.info(f" Total optimization steps = {total_train_steps}")
+
+ train_time = 0
+ train_metrics = []
+ epochs = tqdm(range(num_epochs), desc="Epoch ... ", position=0)
+ for epoch in epochs:
+ # ======================== Training ================================
+ train_start = time.time()
+
+ # Create sampling rng
+ rng, input_rng = jax.random.split(rng)
+
+ # Generate an epoch by shuffling sampling indices from the train dataset
+ train_loader = data_loader(input_rng, train_dataset, train_batch_size // training_args.gradient_accumulation_steps, shuffle=True)
+ steps_per_epoch = len(train_dataset) // train_batch_size
+ # train
+ steps_trained_progress_bar = tqdm(range(steps_per_epoch), desc="Training...", position=1,
+ leave=False)
+ for step in range(steps_per_epoch * training_args.gradient_accumulation_steps):
+ batch = next(train_loader)
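+ # shard splits the batch along its leading axis so every local device gets an equal slice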
+ batch = shard(batch)
+ state, train_metric = p_train_step(state, batch)
+ train_metrics.append(train_metric)
+
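+ # cur_step counts micro-batches, i.e. optimizer steps times gradient_accumulation_steps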
+ cur_step = epoch * (steps_per_epoch*training_args.gradient_accumulation_steps) + step
+
+ if step % training_args.gradient_accumulation_steps == 0:
+ steps_trained_progress_bar.update(1)
+
+ if cur_step % (training_args.logging_steps * training_args.gradient_accumulation_steps) == 0 and cur_step > 0:
+ # Save metrics
+ train_metric = unreplicate(train_metric)
+ train_time += time.time() - train_start
+ if has_tensorboard and jax.process_index() == 0:
+ write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+
+ epochs.write(
+ f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate: {train_metric['learning_rate'].mean()})"
+ )
+
+ train_metrics = []
+
+ if cur_step % (training_args.eval_steps * training_args.gradient_accumulation_steps) == 0 and cur_step > 0:
+ # ======================== Evaluating ==============================
+ eval_metrics = []
+ eval_loader = data_loader(input_rng, eval_dataset, eval_batch_size)
+ eval_steps = len(eval_dataset) // eval_batch_size
+ for _ in tqdm(range(eval_steps), desc="Evaluating...", position=2, leave=False):
+ # Model forward
+ batch = next(eval_loader)
+ batch = shard(batch)
+ metrics = p_eval_step(state.params, batch)
+ eval_metrics.append(metrics)
+
+ # normalize eval metrics
+ eval_metrics = get_metrics(eval_metrics)
+ eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
+
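+ # perplexity is the exponential of the mean cross-entropy loss; guard against overflow for very large losses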
+ try:
+ eval_metrics["perplexity"] = math.exp(eval_metrics["loss"])
+ except OverflowError:
+ eval_metrics["perplexity"] = float("inf")
+
+ # Print metrics and update progress bar
+ desc = f"Step... ({cur_step} | Eval Loss: {eval_metrics['loss']} | Eval Perplexity: {eval_metrics['perplexity']})"
+ epochs.write(desc)
+ epochs.desc = desc
+
+ # Save metrics
+ if has_tensorboard and jax.process_index() == 0:
+ write_eval_metric(summary_writer, eval_metrics, cur_step)
+
+ if cur_step % (training_args.save_steps * training_args.gradient_accumulation_steps) == 0 and cur_step > 0:
+ # save checkpoint every save_steps and push it to the hub
+ if jax.process_index() == 0:
+ params = jax.device_get(unreplicate(state.params))
+ model.save_pretrained(training_args.output_dir, params=params)
+ tokenizer.save_pretrained(training_args.output_dir)
+ if training_args.push_to_hub:
+ repo.push_to_hub(commit_message=f"Saving weights and logs of step {cur_step}", blocking=False)
+
+ # also save at the end of each epoch
+ try:
+ if jax.process_index() == 0:
+ params = jax.device_get(unreplicate(state.params))
+ model.save_pretrained(training_args.output_dir, params=params)
+ tokenizer.save_pretrained(training_args.output_dir)
+ if training_args.push_to_hub:
+ repo.push_to_hub(commit_message=f"Saving weights and logs of epoch {epoch}", blocking=False)
+ except:
+ # pushing to the hub would fail the whole script if there is nothing new to commit
+ pass
+
+ # Eval after training
+ if training_args.do_eval:
+ eval_metrics = []
+ eval_loader = data_loader(input_rng, eval_dataset, eval_batch_size)
+ eval_steps = len(eval_dataset) // eval_batch_size
+ for _ in tqdm(range(eval_steps), desc="Evaluating...", position=2, leave=False):
+ # Model forward
+ batch = shard(next(eval_loader))
+ metrics = p_eval_step(state.params, batch)
+ eval_metrics.append(metrics)
+
+ # normalize eval metrics
+ eval_metrics = get_metrics(eval_metrics)
+ eval_metrics = jax.tree_map(lambda x: jnp.mean(x).item(), eval_metrics)
+
+ try:
+ eval_metrics["perplexity"] = math.exp(eval_metrics["loss"])
+ except OverflowError:
+ eval_metrics["perplexity"] = float("inf")
+
+ if jax.process_index() == 0:
+ eval_metrics = {f"eval_{metric_name}": value for metric_name, value in eval_metrics.items()}
+ path = os.path.join(training_args.output_dir, "eval_results.json")
+ with open(path, "w") as f:
+ json.dump(eval_metrics, f, indent=4, sort_keys=True)
+
+
+ if __name__ == "__main__":
+ main()
runs/events.out.tfevents.1642710569.t1v-n-42145f73-w-0.1403347.0.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b72568b9aa0ed78b38fc232c126bf59229b152bbcacaae8358612edc7507a6ed
+ size 1471449
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>", "pad_token": "<|endoftext|>"}
start_train.sh ADDED
@@ -0,0 +1,29 @@
+ # set train hyperparams
+ unset LD_PRELOAD
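+ # cache datasets on the research disk and keep transformers from picking up the PyTorch backend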
+ export HF_DATASETS_CACHE="/researchdisk/datasets_cache"
+ export USE_TORCH=0
+ python3 run_clm_flax.py \
+ --output_dir="./" \
+ --model_type="gpt2" \
+ --config_name="./" \
+ --tokenizer_name="./" \
+ --dataset_filepath="/researchdisk/training_dataset_full_deduplicated" \
+ --do_train --do_eval \
+ --block_size="512" \
+ --per_device_train_batch_size="16" \
+ --per_device_eval_batch_size="16" \
+ --preprocessing_num_workers="1" \
+ --adam_beta1="0.9" \
+ --adam_beta2="0.98" \
+ --learning_rate="1e-4" \
+ --weight_decay="0.01" \
+ --warmup_steps="4000" \
+ --cosine_decay \
+ --overwrite_output_dir \
+ --logging_steps="500" \
+ --eval_steps="10000" \
+ --save_steps="10000" \
+ --num_train_epochs="10" \
+ --dtype="bfloat16" \
+ --push_to_hub \
+ --hub_model_id="Finnish-NLP/gpt2-medium-finnish"
tokenizer.json ADDED
The diff for this file is too large to render.
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "special_tokens_map_file": null, "name_or_path": "./", "tokenizer_class": "GPT2Tokenizer"}
train_tokenizer.py ADDED
@@ -0,0 +1,30 @@
+ from datasets import load_from_disk
+ from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
+ from transformers import AutoTokenizer
+
+
+ model_dir = "./"
+
+ # load dataset
+ dataset = load_from_disk("/researchdisk/training_dataset_full_deduplicated")
+ dataset = dataset["train"]
+
+ # Instantiate tokenizer
+ tokenizer = ByteLevelBPETokenizer()
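+ # feed the trainer batches of raw text instead of materializing the whole corpus as one list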
+ def batch_iterator(batch_size=1000):
+ for i in range(0, len(dataset), batch_size):
+ yield dataset[i: i + batch_size]["text"]
+
+ # Customized training
+ tokenizer.train_from_iterator(batch_iterator(), vocab_size=50257, min_frequency=2, special_tokens=[
+ "<s>",
+ "<pad>",
+ "</s>",
+ "<unk>",
+ "<mask>",
+ ])
+
+ # Save files to disk
+ tokenizer.save(f"{model_dir}/tokenizer.json")
+ tokenizer = AutoTokenizer.from_pretrained(model_dir)
+ tokenizer.save_pretrained(model_dir)
vocab.json ADDED
The diff for this file is too large to render.