aapot committed on Feb 10, 2024

Commit

a85f909

1 Parent(s): 20e722f

Add training codes

Files changed (33) hide show

.gitignore +1 -0
EasyLM/__init__.py +0 -0
EasyLM/bpt.py +228 -0
EasyLM/checkpoint.py +212 -0
EasyLM/data.py +431 -0
EasyLM/jax_utils.py +403 -0
EasyLM/models/__init__.py +0 -0
EasyLM/models/gptj/__init__.py +0 -0
EasyLM/models/gptj/gptj_model.py +1054 -0
EasyLM/models/gptj/gptj_serve.py +396 -0
EasyLM/models/gptj/gptj_train.py +272 -0
EasyLM/models/llama/convert_easylm_to_hf.py +338 -0
EasyLM/models/llama/convert_hf_to_easylm.py +196 -0
EasyLM/models/llama/convert_torch_to_easylm.py +68 -0
EasyLM/models/llama/llama_model.py +1530 -0
EasyLM/models/llama/llama_serve.py +386 -0
EasyLM/models/llama/llama_train.py +268 -0
EasyLM/models/roberta/__init__.py +0 -0
EasyLM/models/roberta/roberta_model.py +1694 -0
EasyLM/models/roberta/roberta_train.py +307 -0
EasyLM/optimizers.py +302 -0
EasyLM/scripts/__init__.py +0 -0
EasyLM/scripts/benchmark_attention.py +150 -0
EasyLM/scripts/convert_checkpoint.py +42 -0
EasyLM/scripts/diff_checkpoint.py +59 -0
EasyLM/scripts/lm_eval_harness.py +65 -0
EasyLM/scripts/lm_eval_json.py +52 -0
EasyLM/serving.py +566 -0
convert_to_hf_model.sh +4 -0
pretrain_llama_7b.sh +52 -0
tokenizer.model +3 -0
tokenizer.vocab +0 -0
train_sentencepiece.py +10 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

EasyLM/__init__.py ADDED Viewed

File without changes

EasyLM/bpt.py ADDED Viewed

	@@ -0,0 +1,228 @@

+"""
+An implementation of the Blockwise Parallel Transformer https://arxiv.org/abs/2305.19370
+Also includes a reference implementation of the memory-efficient transformer https://arxiv.org/abs/2112.05682
+"""
+import functools
+from typing import NamedTuple
+import flax.linen as nn
+import jax
+import jax.lax as lax
+import jax.numpy as jnp
+from einops import rearrange
+"""
+Computing ffn blockwise without materializing the large hidden tensor, training
+4x longer sequences than the memory-efficient transformer.
+Blockwise parallel transformer https://arxiv.org/abs/2305.19370 Liu et al. 2023
+"""
+def blockwise_ffn(remat_ffn, inputs, chunk_size=2048, deterministic=True):
+    """Run a feed-forward module over `inputs` one slice of positions at a time.
+
+    Args:
+        remat_ffn: the feed-forward module to apply; per the original note it is
+            expected to be rematerialized with
+            jax.checkpoint_policies.nothing_saveable().
+        inputs: array laid out as (batch, seq_len, dim).
+        chunk_size: number of positions handed to `remat_ffn` per scan step;
+            the rearrange pattern requires seq_len to be divisible by it.
+        deterministic: forwarded unchanged to `remat_ffn`.
+
+    Returns:
+        The module outputs reassembled with the 'b (c n) d' sequence layout.
+    """
+    # Split the sequence axis into (c, n) with c == chunk_size.
+    chunked = rearrange(inputs, 'b (c n) d -> b c n d', c=chunk_size)
+    # Scan over the second-to-last axis (n): every step sees a (b, c, d) slice.
+    n_axis = chunked.ndim - 2
+    def _ffn_step(ffn_module, carry, hidden_slice):
+        # The carry is unused (None); it is passed through untouched.
+        return carry, ffn_module(hidden_slice, deterministic=deterministic)
+    scanned_ffn = nn.scan(
+        _ffn_step,
+        variable_broadcast="params",
+        split_rngs={"params": False, "dropout": True},
+        in_axes=n_axis,
+        out_axes=n_axis,
+    )
+    _, chunked_out = scanned_ffn(remat_ffn, None, chunked)
+    return rearrange(chunked_out, 'b c n d -> b (c n) d')
+"""
+Compute attention blockwise without materializing the full attention matrix,
+initially proposed in memory-efficient transformer https://arxiv.org/abs/2112.05682 Rabe et al. 2021;
+flash attention https://arxiv.org/abs/2205.14135 Dao et al. 2022 proposes a CUDA
+efficient implementation; blockwise parallel transformer https://arxiv.org/abs/2305.19370
+Liu et al. 2023 proposes blockwise computing both attention and FFN, enabling 4x
+longer sequences than memory-efficient/flash-attention and fusion of attention and FFN.
+"""
+def blockwise_attn(
+        query, key, value,
+        bias=None,
+        deterministic=True,
+        dropout_rng=None,
+        attn_pdrop=0.0,
+        causal=True,
+        query_chunk_size=2048,
+        key_chunk_size=2048,
+        dtype=jnp.float32,
+        policy=jax.checkpoint_policies.nothing_saveable(),
+        precision=None,
+        float32_logits=True,
+        prevent_cse=True,
+    ):
+    """Compute softmax attention chunk by chunk using a running-max rescaling.
+
+    The outer scan walks query chunks and the inner scan walks key/value
+    chunks, accumulating a numerator, a denominator and the running maximum
+    score (see `Carry`). The result is numerator / denominator cast to `dtype`,
+    returned with layout (batch, q_len, num_heads, dim_per_head).
+    """
+    # query, key, value: (batch, seq_len, num_heads, dim_per_head)
+    # bias: 4-D additive bias whose dims are each either 1 or equal to
+    #   (batch, num_heads, q_len, kv_len); can be used to mask out attention
+    #   (e.g. padding)
+    # causal: whether to use causal mask
+    # policy: one of jax.checkpoint_policies
+    # Scale the queries by 1/sqrt(dim_per_head) once, up front.
+    query = query / jnp.sqrt(query.shape[-1]).astype(dtype)
+    if float32_logits:
+        query = query.astype(jnp.float32)
+        key = key.astype(jnp.float32)
+    batch, q_len, num_heads, dim_per_head = query.shape
+    batch, kv_len, num_heads, dim_per_head = key.shape
+    batch, kv_len, num_heads, dim_per_head = value.shape
+    # Integer division: the reshapes below only succeed when q_len / kv_len are
+    # exact multiples of their chunk sizes.
+    num_q = q_len // query_chunk_size
+    num_kv = kv_len // key_chunk_size
+    query = query.reshape((batch, num_q, query_chunk_size, num_heads, dim_per_head))
+    key = key.reshape((batch, num_kv, key_chunk_size, num_heads, dim_per_head))
+    value = value.reshape((batch, num_kv, key_chunk_size, num_heads, dim_per_head))
+    # Put the chunk axis first so lax.scan iterates over chunks.
+    query = jnp.moveaxis(query, 1, 0)
+    key = jnp.moveaxis(key, 1, 0)
+    value = jnp.moveaxis(value, 1, 0)
+    if bias is not None:
+        for bias_dim, broadcast_dim in zip(bias.shape, (batch, num_heads, q_len, kv_len)):
+            assert bias_dim == 1 or bias_dim == broadcast_dim
+    if not deterministic and attn_pdrop > 0.0:
+        # One boolean mask for the full (batch, heads, q_len, kv_len) score
+        # matrix; True entries (probability attn_pdrop) are later pushed to
+        # finfo(dtype).min inside _chunk_attention_bias.
+        attn_dropout_rng, dropout_rng = jax.random.split(dropout_rng)
+        attn_dropout = jax.random.bernoulli(attn_dropout_rng, attn_pdrop, (batch, num_heads, q_len, kv_len))
+    else:
+        attn_dropout = None
+    # Bind everything except the two chunk indices.
+    _chunk_bias_fn = functools.partial(
+        _chunk_attention_bias,
+        query_chunk_size, key_chunk_size, bias, deterministic,
+        attn_dropout, attn_pdrop, causal, dtype)
+    def scan_attention(args):
+        # Attention output for a single query chunk.
+        query_chunk, query_chunk_idx = args
+        @functools.partial(jax.checkpoint, prevent_cse=prevent_cse, policy=policy)
+        def scan_kv_block(carry, args):
+            # Fold one key/value chunk into the running accumulators.
+            key_chunk, value_chunk, key_chunk_idx = args
+            (numerator, denominator, prev_max_score) = carry
+            attn_weights = jnp.einsum('bqhd,bkhd->bqhk', query_chunk, key_chunk, precision=precision)
+            bias_chunk = _chunk_bias_fn(query_chunk_idx, key_chunk_idx)
+            # Bias is (b, h, q, k); swap axes 1 and 2 to match the 'bqhk' scores.
+            bias_chunk = jnp.moveaxis(bias_chunk, 1, 2)
+            attn_weights = attn_weights + bias_chunk
+            max_score = jnp.max(attn_weights, axis=-1, keepdims=True)
+            max_score = jnp.maximum(prev_max_score, max_score)
+            # The running max only stabilizes the exponentials; no gradient flows through it.
+            max_score = jax.lax.stop_gradient(max_score)
+            exp_weights = jnp.exp(attn_weights - max_score)
+            exp_values = jnp.einsum(
+                'bqhv,bvhd->bqhd', exp_weights, value_chunk, precision=precision
+            )
+            # Rescale the previous sums from the old max to the new max.
+            correction = jnp.exp(prev_max_score - max_score)
+            numerator = numerator * correction + exp_values
+            denominator = denominator * correction + exp_weights.sum(axis=-1, keepdims=True)
+            return Carry(numerator, denominator, max_score), None
+        def skip_upper_half(carry, args):
+            # With causal masking, key chunks that lie entirely after the query
+            # chunk leave the carry unchanged.
+            key_chunk, value_chunk, key_chunk_idx = args
+            skip_block = jnp.array(False)
+            if causal:
+                skip_block = query_chunk_idx < key_chunk_idx
+            return jax.lax.cond(
+                skip_block,
+                lambda carry, args: (carry, None),
+                scan_kv_block,
+                carry,
+                args,
+            )
+        # The denominator is allocated with the numerator's full shape; the
+        # per-row sums (last dim 1) broadcast into it.
+        init_carry = Carry(
+            jnp.zeros((batch, query_chunk_size, num_heads, dim_per_head), dtype=query.dtype),
+            jnp.zeros((batch, query_chunk_size, num_heads, dim_per_head), dtype=query.dtype),
+            (-jnp.inf) * jnp.ones((batch, query_chunk_size, num_heads, 1), dtype=query.dtype),
+        )
+        (numerator, denominator, max_score), _ = lax.scan(
+            skip_upper_half, init_carry, xs=(key, value, jnp.arange(0, num_kv))
+        )
+        outputs = (numerator / denominator).astype(dtype)
+        return outputs
+    # Outer scan over query chunks; it carries no state.
+    _, res = lax.scan(
+        lambda _, x: ((), scan_attention(x)),
+        (), xs=(query, jnp.arange(0, num_q))
+    )
+    # Merge the (num_q, chunk) axes back into a single sequence axis.
+    res = rearrange(res, 'n b c h d -> b (n c) h d')
+    return res
+class Carry(NamedTuple):
+    """Running accumulators threaded through the key/value scan of blockwise_attn."""
+    # Rescaled sum of exp(score - running max) weighted value chunks.
+    numerator: jax.Array
+    # Rescaled sum of exp(score - running max); divides the numerator at the end.
+    denominator: jax.Array
+    # Running maximum attention score seen so far.
+    max_so_far: jax.Array
+def _chunk_attention_bias(query_chunk_size, key_chunk_size,
+            bias, deterministic, attn_dropout, attn_pdrop, causal,
+            dtype, query_chunk_idx, key_chunk_idx):
+    """Build the additive bias for one (query chunk, key chunk) pair.
+
+    Combines, in this order: the matching window of `bias` (zeros when `bias`
+    is None), the causal mask, and the dropout mask. Masked entries receive
+    jnp.finfo(dtype).min. The result is cast to `dtype`.
+    """
+    q_start = query_chunk_idx * query_chunk_size
+    k_start = key_chunk_idx * key_chunk_size
+    def _window(full):
+        # Slice the chunk out of the last two axes; an axis shorter than the
+        # chunk (e.g. a broadcast axis of size 1) is taken whole.
+        rows = min(full.shape[-2], query_chunk_size)
+        cols = min(full.shape[-1], key_chunk_size)
+        return lax.dynamic_slice(
+            full,
+            start_indices=(0, 0, q_start, k_start),
+            slice_sizes=(*full.shape[:2], rows, cols),
+        )
+    if bias is None:
+        result = jnp.zeros((1, 1, 1, 1), dtype=dtype)
+    else:
+        result = _window(bias)
+    if causal:
+        row_pos = lax.broadcasted_iota(dtype=jnp.int32, shape=(query_chunk_size, 1), dimension=0)
+        col_pos = lax.broadcasted_iota(dtype=jnp.int32, shape=(1, key_chunk_size), dimension=1)
+        # Shift the rows so both index sets are relative to the key chunk start.
+        row_pos = row_pos + (q_start - k_start)
+        causal_term = (row_pos < col_pos) * jnp.finfo(dtype).min
+        result = result + causal_term.reshape(1, 1, *causal_term.shape)
+    if not deterministic and attn_pdrop > 0.0:
+        result = result + _window(attn_dropout) * jnp.finfo(dtype).min
+    return result.astype(dtype)
+if __name__ == '__main__':
+    # Self-test: the blockwise result must match plain softmax attention.
+    def reference_attn(query, key, value, causal, dtype):
+        """Unchunked scaled dot-product attention used as ground truth."""
+        scaled_query = query / jnp.sqrt(query.shape[-1]).astype(dtype)
+        scores = jnp.einsum("bqhc,bkhc->bhqk", scaled_query, key)
+        if causal:
+            _, q_seq_len, _, _ = scaled_query.shape
+            _, kv_seq_len, _, _ = key.shape
+            grid = (q_seq_len, kv_seq_len)
+            rows = jax.lax.broadcasted_iota(jnp.int32, grid, 0)
+            cols = jax.lax.broadcasted_iota(jnp.int32, grid, 1)
+            # True where a query position would look at a later key position.
+            future = (rows < cols)[None, None, :, :]
+            scores = scores + jnp.where(future, jnp.finfo(scores.dtype).min, 0.0)
+        probs = jax.nn.softmax(scores, axis=-1)
+        return jnp.einsum("bhqk,bkhc->bqhc", probs, value)
+    # random inputs
+    shape = (1, 32, 8, 64)
+    query = jax.random.normal(jax.random.PRNGKey(0), shape)
+    key = jax.random.normal(jax.random.PRNGKey(1), shape)
+    value = jax.random.normal(jax.random.PRNGKey(2), shape)
+    causal = True
+    chunk_size = 4
+    policy = jax.checkpoint_policies.nothing_saveable()
+    blockwise = blockwise_attn(
+        query, key, value,
+        bias=None,
+        deterministic=False,
+        dropout_rng=None,
+        attn_pdrop=0.0,
+        causal=causal,
+        query_chunk_size=chunk_size,
+        key_chunk_size=chunk_size,
+        dtype=jnp.float32,
+        policy=policy,
+        precision='float32',
+        float32_logits=True,
+        prevent_cse=False,
+    )
+    reference = reference_attn(query, key, value, causal, 'float32')
+    assert jnp.allclose(reference, blockwise, atol=1e-6)

EasyLM/checkpoint.py ADDED Viewed

	@@ -0,0 +1,212 @@

+import os
+import numpy as np
+from ml_collections import ConfigDict
+import mlxu
+import jax
+import jax.numpy as jnp
+import flax
+from flax.serialization import (
+    from_bytes, to_bytes, to_state_dict, from_state_dict
+)
+from flax.traverse_util import flatten_dict, unflatten_dict, empty_node
+import msgpack
+from EasyLM.jax_utils import tree_apply, float_tensor_to_dtype
+class StreamingCheckpointer(object):
+    """ Custom msgpack checkpointer that saves large train states by serializing
+        and saving tensors one by one in a streaming fashion. Avoids running
+        out of memory or local TPU disk with default flax checkpointer.
+    """
+    @staticmethod
+    def get_default_config(updates=None):
+        """Return the default config, optionally overridden by `updates`."""
+        config = ConfigDict()
+        config.float_dtype = 'bf16'
+        config.save_optimizer_state = False
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    def __init__(self, config, checkpoint_dir, enable=True):
+        """Store the resolved config, the output directory and the enable flag."""
+        self.config = self.get_default_config(config)
+        self.checkpoint_dir = checkpoint_dir
+        self.enable = enable
+    def _resolve_path(self, filename):
+        """Return the output path for `filename`, or '/dev/null' when disabled."""
+        if self.enable:
+            return os.path.join(self.checkpoint_dir, filename)
+        return '/dev/null'
+    def save_checkpoint(self, train_state, filename, gather_fns=None):
+        """Stream `train_state` to `filename` inside the checkpoint directory.
+
+        When the checkpointer is disabled the state is still serialized (and
+        the gather functions still run) but the bytes go to '/dev/null'.
+        NOTE(review): presumably so collective gathers still execute on every
+        host - confirm against the training scripts.
+        """
+        self.save_train_state_to_file(
+            train_state, self._resolve_path(filename), gather_fns,
+            self.config.float_dtype
+        )
+    @staticmethod
+    def save_train_state_to_file(train_state, path, gather_fns=None, float_dtype=None):
+        """Write the flattened state to `path` as a stream of msgpack records.
+
+        Each record is a packed (flattened key, flax-serialized tensor) pair.
+        When given, `gather_fns` mirrors the state's structure and its leaf
+        function is applied to the matching tensor before the dtype cast.
+        """
+        train_state = to_state_dict(train_state)
+        packer = msgpack.Packer()
+        flattened_train_state = flatten_dict(train_state)
+        if gather_fns is not None:
+            gather_fns = flatten_dict(to_state_dict(gather_fns))
+        with mlxu.open_file(path, "wb") as fout:
+            for key, value in flattened_train_state.items():
+                if gather_fns is not None:
+                    value = gather_fns[key](value)
+                value = float_tensor_to_dtype(value, float_dtype)
+                fout.write(packer.pack((key, to_bytes(value))))
+    def save_pickle(self, obj, filename):
+        """Pickle `obj` to `filename` (or to '/dev/null' when disabled)."""
+        mlxu.save_pickle(obj, self._resolve_path(filename))
+    def save_all(self, train_state, gather_fns, metadata=None, dataset=None, milestone=False):
+        """Save metadata, dataset state and the model checkpoint together.
+
+        With `config.save_optimizer_state` the whole train state is written,
+        otherwise only `train_state.params['params']`. Milestone saves get the
+        step number appended to every file name so they are not overwritten.
+        """
+        step = int(jax.device_get(train_state.step))
+        if self.config.save_optimizer_state:
+            checkpoint_state = train_state
+            checkpoint_name = 'streaming_train_state'
+            checkpoint_gather_fns = gather_fns
+        else:
+            checkpoint_state = train_state.params['params']
+            checkpoint_name = 'streaming_params'
+            checkpoint_gather_fns = gather_fns.params['params']
+        if milestone:
+            # Save a milestone checkpoint that will not be overwritten
+            self.save_pickle(metadata, f'metadata_{step}.pkl')
+            self.save_pickle(dataset, f'dataset_{step}.pkl')
+            self.save_checkpoint(
+                checkpoint_state, f'{checkpoint_name}_{step}', checkpoint_gather_fns
+            )
+        else:
+            # Save a normal checkpoint that can be overwritten
+            self.save_pickle(metadata, 'metadata.pkl')
+            self.save_pickle(dataset, 'dataset.pkl')
+            self.save_checkpoint(
+                checkpoint_state, f'{checkpoint_name}', checkpoint_gather_fns
+            )
+    @staticmethod
+    def load_checkpoint(path, target=None, shard_fns=None, remove_dict_prefix=None):
+        """Load a streaming-format checkpoint from `path`.
+
+        Args:
+            path: file written by `save_train_state_to_file`.
+            target: optional structure to restore into via `from_state_dict`;
+                empty nodes of the target that are missing from the file are
+                filled in.
+            shard_fns: optional structure of functions applied to each tensor
+                as it is read, looked up by flattened key.
+            remove_dict_prefix: optional key prefix; only records under this
+                prefix are kept, and the prefix is stripped from their keys.
+
+        Returns:
+            The nested state dict when `target` is None, otherwise the
+            restored target.
+        """
+        if shard_fns is not None:
+            shard_fns = flatten_dict(
+                to_state_dict(shard_fns)
+            )
+        if remove_dict_prefix is not None:
+            remove_dict_prefix = tuple(remove_dict_prefix)
+        flattened_train_state = {}
+        with mlxu.open_file(path) as fin:
+            # 83886080 bytes = 80 MB, which is 16 blocks on GCS
+            unpacker = msgpack.Unpacker(fin, read_size=83886080, max_buffer_size=0)
+            for key, value in unpacker:
+                key = tuple(key)
+                if remove_dict_prefix is not None:
+                    if key[:len(remove_dict_prefix)] == remove_dict_prefix:
+                        key = key[len(remove_dict_prefix):]
+                    else:
+                        continue
+                tensor = from_bytes(None, value)
+                if shard_fns is not None:
+                    tensor = shard_fns[key](tensor)
+                flattened_train_state[key] = tensor
+        if target is not None:
+            flattened_target = flatten_dict(
+                to_state_dict(target), keep_empty_nodes=True
+            )
+            for key, value in flattened_target.items():
+                if key not in flattened_train_state and value == empty_node:
+                    flattened_train_state[key] = value
+        train_state = unflatten_dict(flattened_train_state)
+        if target is None:
+            return train_state
+        return from_state_dict(target, train_state)
+    @staticmethod
+    def load_flax_checkpoint(path, target=None, shard_fns=None):
+        """ Load a standard flax checkpoint that's not saved with the
+            msgpack streaming format.
+        """
+        with mlxu.open_file(path, "rb") as fin:
+            encoded_bytes = fin.read()
+        state_dict = flax.serialization.msgpack_restore(encoded_bytes)
+        if shard_fns is not None:
+            shard_fns = to_state_dict(shard_fns)
+            state_dict = tree_apply(shard_fns, state_dict)
+        if target is None:
+            return state_dict
+        return from_state_dict(target, state_dict)
+    @classmethod
+    def load_trainstate_checkpoint(cls, load_from, trainstate_target=None,
+                                   trainstate_shard_fns=None,
+                                   disallow_trainstate=False):
+        """Load a train state or parameters described by `load_from`.
+
+        `load_from` has the form '<load_type>::<path>' where load_type is one
+        of 'trainstate', 'trainstate_params', 'params' or 'flax_params'.
+
+        Returns:
+            A (train_state, restored_params) pair. Only the first is set for
+            'trainstate'; for the other types only the second is set, wrapped
+            as a frozen {'params': ...} dict.
+
+        Raises:
+            ValueError: if `load_from` lacks the '::' separator or names an
+                unknown load type.
+            AssertionError: if a full trainstate is requested while
+                `disallow_trainstate` is set.
+        """
+        if trainstate_target is not None:
+            params_target = trainstate_target.params['params']
+        else:
+            params_target = None
+        if trainstate_shard_fns is not None:
+            params_shard_fns = trainstate_shard_fns.params['params']
+        else:
+            params_shard_fns = None
+        # Fail with a readable message instead of a tuple-unpacking error.
+        if '::' not in load_from:
+            raise ValueError(
+                f'Invalid load_from value {load_from!r}: expected the form '
+                '<load_type>::<path>'
+            )
+        load_type, load_path = load_from.split('::', 1)
+        if disallow_trainstate:
+            assert load_type != 'trainstate', 'Loading full trainstate is not allowed!'
+        train_state = None
+        restored_params = None
+        if load_type == 'trainstate':
+            # Load the entire train state in the streaming format
+            train_state = cls.load_checkpoint(
+                path=load_path,
+                target=trainstate_target,
+                shard_fns=trainstate_shard_fns,
+            )
+        elif load_type == 'trainstate_params':
+            # Load the params part of the train state in the streaming format
+            restored_params = cls.load_checkpoint(
+                path=load_path,
+                target=params_target,
+                shard_fns=params_shard_fns,
+                remove_dict_prefix=('params', 'params'),
+            )
+        elif load_type == 'params':
+            # Load the params in the streaming format
+            restored_params = cls.load_checkpoint(
+                path=load_path,
+                target=params_target,
+                shard_fns=params_shard_fns,
+            )
+        elif load_type == 'flax_params':
+            # Load the params in the standard flax format (non-streaming)
+            # This requires the entire params to fit in memory
+            restored_params = cls.load_flax_checkpoint(
+                path=load_path,
+                target=params_target,
+                shard_fns=params_shard_fns
+            )
+        else:
+            raise ValueError(f'Invalid load_from type: {load_type}')
+        if load_type != 'trainstate':
+            # Every params-only load type returns the same frozen wrapper.
+            restored_params = flax.core.frozen_dict.freeze(
+                {'params': restored_params}
+            )
+        return train_state, restored_params

EasyLM/data.py ADDED Viewed

	@@ -0,0 +1,431 @@

+import dataclasses
+import pprint
+import time
+from functools import partial
+import json
+import base64
+from multiprocessing import Pool
+import h5py
+import mlxu
+from ml_collections.config_dict import config_dict
+from ml_collections import ConfigDict
+from tqdm import tqdm, trange
+import numpy as np
+from datasets import load_dataset, load_from_disk
+class DatasetFactory(object):
+    """ Dataset builder class. """
+    @staticmethod
+    def get_default_config(updates=None):
+        """Return the default factory config, optionally overridden by `updates`."""
+        config = ConfigDict()
+        config.type = 'huggingface'
+        config.text_processor = TextProcessor.get_default_config()
+        config.huggingface_dataset = HuggingfaceDataset.get_default_config()
+        config.json_dataset = JsonDataset.get_default_config()
+        if updates is None:
+            return config
+        config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    @classmethod
+    def load_dataset(cls, config, tokenizer, **kwargs):
+        """Build the dataset selected by `config.type`.
+
+        Extra keyword arguments are forwarded to the dataset constructor.
+        Raises ValueError for an unknown dataset type.
+        """
+        config = cls.get_default_config(config)
+        # The text processor is built first, whatever the dataset type.
+        processor = TextProcessor(config.text_processor, tokenizer)
+        if config.type == 'json':
+            return JsonDataset(config.json_dataset, tokenizer, processor, **kwargs)
+        if config.type == 'huggingface':
+            return HuggingfaceDataset(
+                config.huggingface_dataset, tokenizer, processor, **kwargs
+            )
+        raise ValueError(f'Unknown dataset type: {config.type}')
+    def __init__(self):
+        # Only the static/class methods are meant to be used.
+        raise ValueError('DatasetFactory is a static class and should not be instantiated.')
+class TextProcessor(object):
+    """ Example processor that converts a dictionary of texts into tokens. """
+    @staticmethod
+    def get_default_config(updates=None):
+        """Return the default processor config, optionally overridden by `updates`."""
+        config = ConfigDict()
+        defaults = (
+            ('fields_from_example', ''),
+            ('fields', ''),
+            ('subfield_separator', ' '),
+            ('add_bos_token', True),
+            ('add_eos_token', True),
+            ('prepend_text', ''),
+            ('base64_token_dtype', 'i4'),
+        )
+        for option, default in defaults:
+            setattr(config, option, default)
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    def __init__(self, config, tokenizer):
+        """Resolve the config and keep the tokenizer used for encoding text."""
+        self.config = self.get_default_config(config)
+        assert self.config.fields != '' or self.config.fields_from_example != '', (
+            'Either fields or fields_from_example must be specified.'
+        )
+        self.tokenizer = tokenizer
+    def __call__(self, example, has_aux=False):
+        """Turn one example into (tokens, loss_masks, *aux).
+
+        The comma-separated field spec comes from `config.fields`, or from the
+        example itself when `config.fields_from_example` names a key. Each
+        field may be:
+          * `[field]`  - same as `field` but with loss mask 0.0;
+          * `<|bos|>`, `<|eos|>` or `<|123|>` - a single special/explicit token;
+          * `{key}`    - base64-encoded raw token ids stored under `key`;
+          * `a+b`      - text subfields joined with `config.subfield_separator`.
+        With `has_aux`, `example` is a sequence whose first item is the example
+        and whose remaining items are passed through unchanged.
+        """
+        cfg = self.config
+        if has_aux:
+            example, *aux = example
+        else:
+            aux = tuple()
+        token_ids = []
+        loss_weights = []
+        if cfg.add_bos_token:
+            token_ids.append(self.tokenizer.bos_token_id)
+            loss_weights.append(0.0)
+        if cfg.fields_from_example != '':
+            field_spec = example[cfg.fields_from_example]
+        else:
+            field_spec = cfg.fields
+        for position, field in enumerate(field_spec.split(',')):
+            weight = 1.0
+            if field.startswith('[') and field.endswith(']'):
+                # Bracketed fields contribute tokens but no loss.
+                field = field[1:-1]
+                weight = 0.0
+            if field.startswith('<|') and field.endswith('|>'):
+                # Special tokens, or a token ID specified directly.
+                name = field[2:-2]
+                if name == 'bos':
+                    token_ids.append(self.tokenizer.bos_token_id)
+                elif name == 'eos':
+                    token_ids.append(self.tokenizer.eos_token_id)
+                else:
+                    token_ids.append(int(name))
+                loss_weights.append(weight)
+                continue
+            if field.startswith('{') and field.endswith('}'):
+                # Base64 encoded raw tokens.
+                raw = base64.b64decode(example[field[1:-1]])
+                piece = np.frombuffer(raw, dtype=cfg.base64_token_dtype).tolist()
+            else:
+                text = cfg.subfield_separator.join(
+                    [example[subfield] for subfield in field.split('+')]
+                )
+                if position == 0:
+                    # Only the very first field gets the configured prefix.
+                    text = cfg.prepend_text + text
+                piece = self.tokenizer.encode(text)
+            token_ids.extend(piece)
+            loss_weights.extend([weight] * len(piece))
+        if cfg.add_eos_token:
+            token_ids.append(self.tokenizer.eos_token_id)
+            loss_weights.append(1.0)
+        return token_ids, loss_weights, *aux
+class HuggingfaceDataset(object):
+    """ Huggingface dataset, where the dataset is loaded using the huggingface
+        datasets.load_from_disk() function and iterated as fixed-size token
+        batches.
+    """
+    @staticmethod
+    def get_default_config(updates=None):
+        """Return the default dataset config, optionally overridden by `updates`."""
+        config = ConfigDict()
+        config.path = 'c4'
+        config.name = 'en'
+        config.split = 'train'
+        config.streaming = False
+        config.seq_length = 1024
+        config.batch_size = 8
+        config.always_start_with_bos = False
+        config.start_seek_loc = 0
+        config.tokens_count_at_start = 0
+        config.batch_token_dtype = 'i4'
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    def __init__(self, config, tokenizer, text_processor, eval_dataset=False):
+        """Load `config.split` from the on-disk dataset at `config.path`.
+
+        With `eval_dataset` set, iteration makes a single unshuffled pass.
+        """
+        self.config = self.get_default_config(config)
+        # NOTE(review): `name` is computed but never used in this class, and
+        # `config.streaming` is not read here either - confirm whether both
+        # are leftovers from a load_dataset()-based version.
+        name = self.config.name if self.config.name != '' else None
+        split = self.config.split if self.config.split != '' else None
+        self._tokenizer = tokenizer
+        self._text_processor = text_processor
+        self._dataset = load_from_disk(
+            self.config.path
+        )[split]
+        # Use at most 128 shards, and never more shards than examples.
+        self._dataset = self._dataset.to_iterable_dataset(num_shards=128 if len(self._dataset) > 128 else len(self._dataset))
+        self._eval_dataset = eval_dataset
+        self._train_epochs = 0
+        self._dataset_loc = self.config.start_seek_loc
+        self._total_tokens = self.config.tokens_count_at_start
+        self._index = 0
+    def __iter__(self):
+        """Yield (batch, metrics) pairs of batch_size x seq_length tokens.
+
+        Tokens of consecutive examples are concatenated; targets are the
+        inputs shifted by one position. Training iterates forever, reshuffling
+        every epoch; evaluation stops after one pass.
+        """
+        chunk_size = self.config.batch_size * self.config.seq_length
+        # NOTE(review): this local is never read; the running count lives in
+        # self._total_tokens.
+        total_tokens = 0
+        while True:
+            # Buffers are reset at the start of every epoch, so tokens left
+            # over from the previous epoch are dropped.
+            token_buffer = []
+            loss_mask_buffer = []
+            if not self._eval_dataset:
+                self._shuffle()
+            for index, example in enumerate(self._dataset):
+                self._index = index
+                # Training only: skip examples before the resume position.
+                if not self._eval_dataset and self._dataset_loc > index:
+                    continue
+                tokens, loss_masks = self.text_processor(example)
+                token_buffer.extend(tokens)
+                loss_mask_buffer.extend(loss_masks)
+                # One extra token is required so targets can be shifted by one.
+                while len(token_buffer) > chunk_size + 1:
+                    self._total_tokens += chunk_size
+                    metrics = {
+                        'dataset_example_index': index,
+                        'dataset_total_tokens': self._total_tokens,
+                        'epoch': self._train_epochs,
+                    }
+                    batch = {
+                        'input_tokens': np.array(token_buffer[:chunk_size], dtype=self.config.batch_token_dtype).reshape(
+                            self.config.batch_size, -1
+                        ),
+                        'target_tokens': np.array(token_buffer[1:chunk_size + 1], dtype=self.config.batch_token_dtype).reshape(
+                            self.config.batch_size, -1
+                        ),
+                        'loss_masks': np.array(loss_mask_buffer[1:chunk_size + 1], dtype=np.float32).reshape(
+                            self.config.batch_size, -1
+                        ),
+                    }
+                    if self.config.always_start_with_bos:
+                        batch['input_tokens'][:, 0] = self.tokenizer.bos_token_id
+                    yield batch, metrics
+                    token_buffer = token_buffer[chunk_size:]
+                    loss_mask_buffer = loss_mask_buffer[chunk_size:]
+            if self._eval_dataset:
+                break
+            else:
+                # The resume offset only applies to the first epoch.
+                self._dataset_loc = 0
+                # NOTE(review): the loop top shuffles again right after this
+                # call, so each epoch boundary shuffles twice - confirm intent.
+                self._shuffle()
+                self._train_epochs += 1
+                print(f"TRAIN {self._train_epochs} EPOCH DONE")
+    def _shuffle(self):
+        """Replace the dataset with a buffer-shuffled view of itself (buffer of 100)."""
+        self._dataset = self._dataset.shuffle(buffer_size=100)
+    def get_state_dict(self):
+        """Return the config plus the progress counters needed to resume."""
+        return dict(
+            config=self.config,
+            # Index of the example most recently pulled from the dataset.
+            dataset_loc=self._index,
+            total_tokens=self._total_tokens,
+            epochs=self._train_epochs,
+        )
+    def load_state_dict(self, state_dict):
+        """Restore config and progress counters; missing keys use config defaults."""
+        if 'config' in state_dict:
+            self.config.update(ConfigDict(state_dict['config']))
+        self._dataset_loc = state_dict.get('dataset_loc', self.config.start_seek_loc)
+        self._total_tokens = state_dict.get('total_tokens', self.config.tokens_count_at_start)
+        self._train_epochs = state_dict.get('epochs', 0)
+    @property
+    def seq_length(self):
+        """Configured sequence length of each batch row."""
+        return self.config.seq_length
+    @property
+    def tokenizer(self):
+        """Tokenizer passed at construction."""
+        return self._tokenizer
+    @property
+    def text_processor(self):
+        """Text processor passed at construction."""
+        return self._text_processor
+    @property
+    def dataset(self):
+        """The underlying (possibly shuffled) iterable dataset."""
+        return self._dataset
+    @property
+    def vocab_size(self):
+        """Vocabulary size, taken as len(tokenizer)."""
+        return len(self._tokenizer)
+class JsonDataset(object):
+    """ JSON dataset, where each line of the data file contains a JSON
+        dictionary with text fields.
+    """
+    @staticmethod
+    def get_default_config(updates=None):
+        """Return the default JSON dataset config, optionally overridden by `updates`."""
+        config = ConfigDict()
+        defaults = (
+            ('path', ''),
+            ('seq_length', 1024),
+            ('batch_size', 8),
+            ('always_start_with_bos', False),
+            ('start_seek_loc', 0),
+            ('example_index_at_start', 0),
+            ('tokens_count_at_start', 0),
+            ('tokenizer_processes', 1),
+            ('tokenizer_parallel_chunk_size', 32),
+            ('tokenizer_parallel_batch_size', 1024),
+            ('throughput_average_window_size', 200),
+        )
+        for option, default in defaults:
+            setattr(config, option, default)
+        if updates is None:
+            return config
+        config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    def __init__(self, config, tokenizer, text_processor):
+        """Resolve the config and seed the read position from it.
+
+        A non-empty `config.path` is required (checked with an assert).
+        """
+        self.config = self.get_default_config(config)
+        assert self.config.path != ''
+        self._tokenizer, self._text_processor = tokenizer, text_processor
+        # Starting counters come from the config so a run can be resumed.
+        cfg = self.config
+        self._index = cfg.example_index_at_start
+        self._file_loc = cfg.start_seek_loc
+        self._total_tokens = cfg.tokens_count_at_start
+    def parse_json(self, line):
+        """Decode one line of JSON.
+
+        Returns None for an empty line, a bare newline, or a line that fails to
+        parse (the offending line is printed in that case).
+        """
+        if line and line != '\n':
+            try:
+                return json.loads(line)
+            except json.decoder.JSONDecodeError:
+                print(f'Error parsing json line:\n{line}')
+        return None
+    def json_iterator(self):
+        """Yield (data, file_loc, index) for every parseable line, forever.
+
+        Reading starts at the stored file offset. At end of file the example
+        index is reset to 0 and reading wraps around to the beginning. The
+        yielded offset is the position just after the line that was read.
+        """
+        with mlxu.open_file(self.config.path, 'r') as fin:
+            fin.seek(self._file_loc)
+            while True:
+                raw_line = fin.readline()
+                self._file_loc = fin.tell()
+                if raw_line:
+                    record = self.parse_json(raw_line)
+                    if record is not None:
+                        # JSON parsing succeeded
+                        yield record, self._file_loc, self._index
+                    # Unparseable lines still advance the example index.
+                    self._index += 1
+                else:
+                    # Reached EOF: start over from the top of the file.
+                    self._index = 0
+                    fin.seek(0)
+    def batched(self, iterator, batch_size):
+        """Group items of `iterator` into lists of `batch_size`.
+
+        The final list may be shorter; nothing is yielded for an empty input.
+        """
+        pending = []
+        for item in iterator:
+            pending.append(item)
+            if len(pending) != batch_size:
+                continue
+            yield pending
+            pending = []
+        if pending:
+            yield pending
+    def parallel_example_iterator(self):
+        """Yield processed examples, tokenizing in worker processes if configured.
+
+        With a single tokenizer process the text processor runs inline.
+        Otherwise examples are grouped into batches and mapped over a process
+        pool, with the next batch submitted before the current one is consumed.
+        """
+        if self.config.tokenizer_processes == 1:
+            for data, file_loc, example_index in self.json_iterator():
+                yield self.text_processor((data, file_loc, example_index), has_aux=True)
+            return
+        process_pool = Pool(self.config.tokenizer_processes)
+        batches = self.batched(
+            self.json_iterator(), self.config.tokenizer_parallel_batch_size
+        )
+        with process_pool as pool:
+            tokenize = partial(self.text_processor, has_aux=True)
+            def submit_next():
+                # Queue the next batch on the pool without waiting for it.
+                return pool.map_async(
+                    tokenize, next(batches),
+                    chunksize=self.config.tokenizer_parallel_chunk_size
+                )
+            in_flight = submit_next()
+            while True:
+                # Keep one batch in flight while the finished one is drained.
+                ready, in_flight = in_flight, submit_next()
+                yield from ready.get()
+    def __iter__(self):
+        chunk_size = self.config.batch_size * self.config.seq_length
+        token_buffer = []
+        loss_mask_buffer = []
+        last_time = 0.0
+        step_times = []
+        start_time = time.time()
+        start_tokens = self._total_tokens
+        for tokens, loss_masks, loc, index in self.parallel_example_iterator():
+            token_buffer.extend(tokens)
+            loss_mask_buffer.extend(loss_masks)
+            while len(token_buffer) > chunk_size + 1:
+                self._total_tokens += chunk_size
+                step_times.append(time.time() - last_time)
+                last_time = time.time()
+                if len(step_times) > self.config.throughput_average_window_size:
+                    step_times = step_times[-self.config.throughput_average_window_size:]
+                average_throughput = chunk_size / np.mean(step_times)
+                accumulated_throughput = (
+                    (self._total_tokens - start_tokens) / (time.time() - start_time)
+                )
+                metrics = {
+                    'dataset_file_loc': loc,
+                    'dataset_example_index': index,
+                    'dataset_total_tokens': self._total_tokens,
+                    'dataset_accumulated_tps': accumulated_throughput,
+                    'dataset_average_tps': average_throughput,
+                }
+                batch = {
+                    'input_tokens': np.array(token_buffer[:chunk_size], dtype=np.int32).reshape(
+                        self.config.batch_size, -1
+                    ),
+                    'target_tokens': np.array(token_buffer[1:chunk_size + 1], dtype=np.int32).reshape(
+                        self.config.batch_size, -1
+                    ),
+                    'loss_masks': np.array(loss_mask_buffer[1:chunk_size + 1], dtype=np.float32).reshape(
+                        self.config.batch_size, -1
+                    ),
+                }
+                if self.config.always_start_with_bos:
+                    batch['input_tokens'][:, 0] = self.tokenizer.bos_token_id
+                yield batch, metrics
+                token_buffer = token_buffer[chunk_size:]
+                loss_mask_buffer = loss_mask_buffer[chunk_size:]
+    def get_state_dict(self):
+        return dict(
+            config=self.config,
+            index=self._index,
+            file_loc=self._file_loc,
+            total_tokens=self._total_tokens,
+        )
+    def load_state_dict(self, state_dict):
+        if 'config' in state_dict:
+            self.config.update(ConfigDict(state_dict['config']))
+        self._index = state_dict.get('index', self.config.example_index_at_start)
+        self._file_loc = state_dict.get('file_loc', self.config.start_seek_loc)
+        self._total_tokens = state_dict.get('total_tokens', self.config.tokens_count_at_start)
+    @property
+    def seq_length(self):
+        """ Sequence length taken from the config. """
+        return self.config.seq_length
+    @property
+    def tokenizer(self):
+        """ The tokenizer object passed to the constructor. """
+        return self._tokenizer
+    @property
+    def text_processor(self):
+        """ The text processor object passed to the constructor. """
+        return self._text_processor
+    @property
+    def vocab_size(self):
+        """ Vocabulary size, computed as `len(tokenizer)`. """
+        return len(self.tokenizer)

EasyLM/jax_utils.py ADDED Viewed

	@@ -0,0 +1,403 @@

+import os
+import math
+from typing import Any, Mapping, Text, Tuple, Union, NamedTuple
+from functools import partial
+import re
+import dataclasses
+import random
+from ml_collections import ConfigDict
+from ml_collections.config_dict.config_dict import placeholder
+import flax
+import jax
+import jax.numpy as jnp
+from jax.sharding import PartitionSpec as PS
+from jax.sharding import Mesh
+from jax.experimental import mesh_utils
+from jax.experimental.pjit import with_sharding_constraint as _with_sharding_constraint
+from jax.experimental.pjit import pjit
+from jax.interpreters import pxla
+import numpy as np
+from transformers import FlaxLogitsWarper
+class JaxRNG(object):
+    """ A convenient stateful Jax RNG wrapper. Can be used to wrap RNG inside
+        pure function.
+    """
+    @classmethod
+    def from_seed(cls, seed):
+        return cls(jax.random.PRNGKey(seed))
+    def __init__(self, rng):
+        self.rng = rng
+    def __call__(self, keys=None):
+        if keys is None:
+            self.rng, split_rng = jax.random.split(self.rng)
+            return split_rng
+        elif isinstance(keys, int):
+            split_rngs = jax.random.split(self.rng, num=keys + 1)
+            self.rng = split_rngs[0]
+            return tuple(split_rngs[1:])
+        else:
+            split_rngs = jax.random.split(self.rng, num=len(keys) + 1)
+            self.rng = split_rngs[0]
+            return {key: val for key, val in zip(keys, split_rngs[1:])}
+class JaxDistributedConfig(object):
+    """ Utility class for initializing JAX distributed. """
+    @staticmethod
+    def get_default_config(updates=None):
+        config = ConfigDict()
+        config.initialize_jax_distributed = False
+        config.coordinator_address = placeholder(str)
+        config.num_processes = placeholder(int)
+        config.process_id = placeholder(int)
+        config.local_device_ids = placeholder(str)
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    @classmethod
+    def initialize(cls, config):
+        config = cls.get_default_config(config)
+        if config.initialize_jax_distributed:
+            if config.local_device_ids is not None:
+                local_device_ids = [int(x) for x in config.local_device_ids.split(',')]
+            else:
+                local_device_ids = None
+            jax.distributed.initialize(
+                coordinator_address=config.coordinator_address,
+                num_processes=config.num_processes,
+                process_id=config.process_id,
+                local_device_ids=local_device_ids,
+            )
+class FlaxTemperatureLogitsWarper(FlaxLogitsWarper):
+    """ JIT traceable version of FlaxLogitsWarper that performs temperature scaling."""
+    def __init__(self, temperature):
+        self.temperature = temperature
+    def __call__(self, input_ids, scores, cur_len):
+        return scores / jnp.clip(self.temperature, a_min=1e-8)
+def make_shard_and_gather_fns(partition_specs, dtype_specs=None):
+    """ Create pytree of sharding and gathering functions from pytree of
+        partition specs.
+        `dtype_specs` may be None (no blanket cast), a single float dtype
+        (every float tensor is cast to it), or a pytree that is mapped
+        together with `partition_specs` and whose leaves expose `.dtype`.
+        Returns `(shard_fns, gather_fns)`.
+    """
+    float_dtypes = (jnp.bfloat16, jnp.float16, jnp.float32, jnp.float64)
+    def make_to_dtype_fn(dtype_spec):
+        def to_dtype(tensor):
+            # Note: this branch reads the outer `dtype_specs` (a single dtype),
+            # not the per-leaf `dtype_spec`.
+            if dtype_specs in float_dtypes and getattr(tensor, 'dtype', None) in float_dtypes:
+                # Convert all float tensors to the same dtype
+                return tensor.astype(dtype_specs)
+            elif hasattr(dtype_spec, 'dtype') and hasattr(tensor, 'dtype'):
+                # Per-leaf spec: cast to whatever dtype the spec leaf carries.
+                return tensor.astype(dtype_spec.dtype)
+            return tensor
+        return to_dtype
+    def make_shard_fn(partition_spec, dtype_spec=None):
+        jax_shard_function = pjit(
+            make_to_dtype_fn(dtype_spec),
+            in_shardings=None,
+            out_shardings=partition_spec
+        )
+        def shard_fn(tensor):
+            # Wait for the result so the caller gets a materialized array.
+            return jax_shard_function(tensor).block_until_ready()
+        return shard_fn
+    def make_gather_fn(partition_spec, dtype_spec=None):
+        jax_gather_fn = pjit(
+            make_to_dtype_fn(dtype_spec),
+            in_shardings=partition_spec,
+            out_shardings=None
+        )
+        def gather_fn(tensor):
+            # device_get copies the gathered result back to host memory.
+            return jax.device_get(jax_gather_fn(tensor))
+        return gather_fn
+    if dtype_specs is None or dtype_specs in float_dtypes:
+        # No per-leaf dtype tree: map over the partition specs alone.
+        shard_fns = jax.tree_util.tree_map(make_shard_fn, partition_specs)
+        gather_fns = jax.tree_util.tree_map(make_gather_fn, partition_specs)
+    else:
+        # Per-leaf dtype tree: map both trees in lockstep.
+        shard_fns = jax.tree_util.tree_map(
+            make_shard_fn, partition_specs, dtype_specs
+        )
+        gather_fns = jax.tree_util.tree_map(
+            make_gather_fn, partition_specs, dtype_specs
+        )
+    return shard_fns, gather_fns
+def set_random_seed(seed):
+    np.random.seed(seed)
+    random.seed(seed)
+    init_rng(seed)
+def get_jax_mesh(axis_dims, names):
+    if axis_dims.startswith('!'):
+        # Allow splitting a physical mesh axis if needed
+        mesh_axis_splitting = True
+        axis_dims = axis_dims[1:]
+    else:
+        mesh_axis_splitting = False
+    if ':' in axis_dims:
+        dims = []
+        dim_names = []
+        for axis in axis_dims.split(','):
+            name, dim = axis.split(':')
+            assert name in names
+            dims.append(int(dim))
+            dim_names.append(name)
+        assert(set(dim_names) == set(names))
+    else:
+        dims = [int(x) for x in axis_dims.split(',')]
+        dim_names = names
+    assert len(dims) == len(names)
+    mesh_shape = np.arange(jax.device_count()).reshape(dims).shape
+    if mesh_axis_splitting:
+        physical_mesh = np.array(jax.devices()).reshape(mesh_shape)
+    else:
+        physical_mesh = mesh_utils.create_device_mesh(mesh_shape)
+    return Mesh(physical_mesh, dim_names)
+def names_in_current_mesh(*names):
+    """ Check if current mesh axes contain these names. """
+    mesh_axis_names = pxla.thread_resources.env.physical_mesh.axis_names
+    return set(names) <= set(mesh_axis_names)
+def get_names_from_parition_spec(partition_specs):
+    """ Return axis names from partition specs. """
+    names = set()
+    if isinstance(partition_specs, dict):
+        partition_specs = partition_specs.values()
+    for item in partition_specs:
+        if item is None:
+            continue
+        elif isinstance(item, str):
+            names.add(item)
+        else:
+            names.update(get_names_from_parition_spec(item))
+    return list(names)
+def with_sharding_constraint(x, partition_specs):
+    """ A smarter version of with_sharding_constraint that only applies the
+        constraint if the current mesh contains the axes in the partition specs.
+    """
+    axis_names = get_names_from_parition_spec(partition_specs)
+    if names_in_current_mesh(*axis_names):
+        x = _with_sharding_constraint(x, partition_specs)
+    return x
+def wrap_function_with_rng(rng):
+    """ To be used as decorator, automatically bookkeep a RNG for the wrapped function. """
+    def wrap_function(function):
+        def wrapped(*args, **kwargs):
+            nonlocal rng
+            rng, split_rng = jax.random.split(rng)
+            return function(split_rng, *args, **kwargs)
+        return wrapped
+    return wrap_function
+def init_rng(seed):
+    global jax_utils_rng
+    jax_utils_rng = JaxRNG.from_seed(seed)
+def next_rng(*args, **kwargs):
+    global jax_utils_rng
+    return jax_utils_rng(*args, **kwargs)
+def get_metrics(metrics, unreplicate=False, stack=False):
+    if unreplicate:
+        metrics = flax.jax_utils.unreplicate(metrics)
+    metrics = jax.device_get(metrics)
+    if stack:
+        return jax.tree_map(lambda *args: np.stack(args), *metrics)
+    else:
+        return {key: float(val) for key, val in metrics.items()}
+def mse_loss(val, target, valid=None):
+    if valid is None:
+        valid = jnp.ones((*target.shape[:2], 1))
+    valid = valid.astype(jnp.float32)
+    loss = jnp.mean(
+        jnp.where(
+            valid > 0.0,
+            jnp.square(val - target),
+            0.0
+        )
+    )
+    return loss
+def cross_entropy_loss_and_accuracy(logits, tokens, valid=None):
+    if valid is None:
+        valid = jnp.ones(tokens.shape[:2])
+    valid = valid.astype(jnp.float32)
+    valid_text_length = jnp.maximum(jnp.sum(valid, axis=-1), 1e-10)
+    logits = logits.astype(jnp.float32) # for numerical stability
+    token_log_prob = jnp.squeeze(
+        jnp.take_along_axis(
+            jax.nn.log_softmax(logits, axis=-1),
+            jnp.expand_dims(tokens, -1),
+            axis=-1,
+        ),
+        -1,
+    )
+    token_log_prob = jnp.where(valid > 0.0, token_log_prob, jnp.array(0.0))
+    loss = -jnp.mean(jnp.sum(token_log_prob, axis=-1) / valid_text_length)
+    correct = jnp.where(
+        valid > 0.0,
+        jnp.argmax(logits, axis=-1) == tokens,
+        jnp.array(False)
+    )
+    accuracy = jnp.mean(jnp.sum(correct, axis=-1) / valid_text_length)
+    return loss, accuracy
+def global_norm(tree):
+    """ Return the global L2 norm of a pytree. """
+    squared = jax.tree_util.tree_map(lambda x: jnp.sum(jnp.square(x)), tree)
+    flattened, _ = jax.flatten_util.ravel_pytree(squared)
+    return jnp.sqrt(jnp.sum(flattened))
+def average_metrics(metrics):
+    with jax.spmd_mode("allow_all"):
+        return jax.tree_map(
+            lambda *args: jnp.mean(jnp.stack(args)),
+            *metrics
+        )
+def get_float_dtype_by_name(dtype):
+    return {
+        'bf16': jnp.bfloat16,
+        'bfloat16': jnp.bfloat16,
+        'fp16': jnp.float16,
+        'float16': jnp.float16,
+        'fp32': jnp.float32,
+        'float32': jnp.float32,
+        'fp64': jnp.float64,
+        'float64': jnp.float64,
+    }[dtype]
+def float_tensor_to_dtype(tensor, dtype):
+    if dtype is None or dtype == '':
+        return tensor
+    if isinstance(dtype, str):
+        dtype = get_float_dtype_by_name(dtype)
+    float_dtypes = (jnp.bfloat16, jnp.float16, jnp.float32, jnp.float64)
+    if getattr(tensor, 'dtype', None) in float_dtypes:
+        tensor = tensor.astype(dtype)
+    return tensor
+def float_to_dtype(tree, dtype):
+    return jax.tree_util.tree_map(
+        partial(float_tensor_to_dtype, dtype=dtype), tree
+    )
+def get_gradient_checkpoint_policy(name):
+    return {
+        'everything_saveable': jax.checkpoint_policies.everything_saveable,
+        'nothing_saveable': jax.checkpoint_policies.nothing_saveable,
+        'checkpoint_dots': jax.checkpoint_policies.checkpoint_dots,
+        'checkpoint_dots_with_no_batch_dims': jax.checkpoint_policies.checkpoint_dots_with_no_batch_dims,
+    }[name]
+def tree_path_to_string(path, sep=None):
+    keys = []
+    for key in path:
+        if isinstance(key, jax.tree_util.SequenceKey):
+            keys.append(str(key.idx))
+        elif isinstance(key, jax.tree_util.DictKey):
+            keys.append(str(key.key))
+        elif isinstance(key, jax.tree_util.GetAttrKey):
+            keys.append(str(key.name))
+        elif isinstance(key, jax.tree_util.FlattenedIndexKey):
+            keys.append(str(key.key))
+        else:
+            keys.append(str(key))
+    if sep is None:
+        return tuple(keys)
+    return sep.join(keys)
+def flatten_tree(xs, is_leaf=None, sep=None):
+    flattened, _ = jax.tree_util.tree_flatten_with_path(xs, is_leaf=is_leaf)
+    output = {}
+    for key, val in flattened:
+        output[tree_path_to_string(key, sep=sep)] = val
+    return output
+def named_tree_map(f, tree, *rest, is_leaf=None, sep=None):
+    """ An extended version of jax.tree_util.tree_map, where the mapped function
+        f takes both the name (path) and the tree leaf as input.
+    """
+    return jax.tree_util.tree_map_with_path(
+        lambda path, x, *r: f(tree_path_to_string(path, sep=sep), x, *r),
+        tree, *rest,
+        is_leaf=is_leaf
+    )
+def match_partition_rules(rules, params):
+    """ Returns a pytree of PartitionSpec according to rules. Supports handling
+        Flax TrainState and Optax optimizer state.
+    """
+    def get_partition_spec(name, leaf):
+        if len(leaf.shape) == 0 or np.prod(leaf.shape) == 1:
+            """ Don't partition scalar values. """
+            return PS()
+        for rule, ps in rules:
+            if re.search(rule, name) is not None:
+                return ps
+        raise ValueError(f'Partition rule not found for param: {name}')
+    return named_tree_map(get_partition_spec, params, sep='/')
+def get_weight_decay_mask(exclusions):
+    """ Return a weight decay mask function that computes the pytree masks
+        according to the given exclusion rules.
+    """
+    def decay(name, _):
+        for rule in exclusions:
+            if re.search(rule, name) is not None:
+                return False
+        return True
+    def weight_decay_mask(params):
+        return named_tree_map(decay, params, sep='/')
+    return weight_decay_mask
+def tree_apply(fns, tree):
+    """ Apply a pytree of functions to the pytree. """
+    return jax.tree_util.tree_map(lambda fn, x: fn(x), fns, tree)

EasyLM/models/__init__.py ADDED Viewed

File without changes

EasyLM/models/gptj/__init__.py ADDED Viewed

File without changes

EasyLM/models/gptj/gptj_model.py ADDED Viewed

	@@ -0,0 +1,1054 @@

+# coding=utf-8
+# Copyright 2021 The EleutherAI and The HuggingFace Inc. team.
+# Modifications copyright 2022 Xinyang Geng
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import partial
+from typing import Optional, Tuple
+import json
+import numpy as np
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+from flax.linen import partitioning as nn_partitioning
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
+from transformers.modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
+from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from transformers.generation.flax_logits_process import FlaxLogitsProcessorList
+from transformers import AutoTokenizer
+from jax.sharding import PartitionSpec
+from ml_collections import ConfigDict
+from ml_collections.config_dict import config_dict
+from mlxu import function_args_to_config, load_pickle, open_file
+from EasyLM.jax_utils import (
+    with_sharding_constraint, get_jax_mesh, get_gradient_checkpoint_policy
+)
+"""
+The following code is taken from
+transformers/src/transformers/models/gptj/configuration_gptj.py
+and modified to work with EasyLM.
+"""
+# Named preset configurations; GPTJConfig.load_config returns one of these
+# when its `path` argument equals a key of this dict.
+GPTJ_STANDARD_CONFIGS = {
+    '6b': {
+        "vocab_size": 50400,
+        "n_positions": 2048,
+        "n_embd": 4096,
+        "n_layer": 28,
+        "n_head": 16,
+        "rotary_dim": 64,
+        "n_inner": None,
+        "activation_function": "gelu_new",
+        "layer_norm_epsilon": 1e-5,
+        "initializer_range": 0.02,
+        "scale_attn_weights": True,
+        "use_cache": True,
+        "bos_token_id": 50256,
+        "eos_token_id": 50256,
+        "tie_word_embeddings": False,
+        "n_real_tokens": 50257,
+    }
+}
+class GPTJConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GPTJModel`]. It is used to instantiate a GPT-J
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the GPT-J
+    [EleutherAI/gpt-j-6B](https://huggingface.co/EleutherAI/gpt-j-6B) architecture. Configuration objects inherit from
+    [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`]
+    for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50400):
+            Vocabulary size of the GPT-J model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`GPTJModel`].
+        n_positions (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 4096):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 28):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        rotary_dim (`int`, *optional*, defaults to 64):
+            Number of dimensions in the embedding that Rotary Position Embedding is applied to.
+        n_inner (`int`, *optional*, defaults to `None`):
+            Dimensionality of the inner feed-forward layers. Presumably `None` makes the model fall back to 4 times
+            n_embd; confirm against the model code.
+        activation_function (`str`, *optional*, defaults to `"gelu_new"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Scale attention weights by dividing by sqrt(hidden_size).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        bos_token_id (`int`, *optional*, defaults to 50256):
+            Beginning-of-sequence token id; stored on the config and forwarded to [`PretrainedConfig`].
+        eos_token_id (`int`, *optional*, defaults to 50256):
+            End-of-sequence token id; stored on the config and forwarded to [`PretrainedConfig`].
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Forwarded to [`PretrainedConfig`].
+        gradient_checkpointing (`bool`, *optional*, defaults to `True`):
+            Stored on the config; presumably enables rematerialization in the model (confirm against the model code).
+        gradient_checkpointing_policy (`str`, *optional*, defaults to `"nothing_saveable"`):
+            Stored on the config; presumably a name accepted by `get_gradient_checkpoint_policy` (confirm).
+        n_real_tokens (`int`, *optional*, defaults to 50257):
+            Stored on the config; `None` is replaced by `vocab_size`.
+        fcm_min_ratio (`float`, *optional*, defaults to 0.0):
+            Stored on the config; its use is not visible in this class.
+        fcm_max_ratio (`float`, *optional*, defaults to 0.0):
+            Stored on the config; its use is not visible in this class.
+    Example:
+    ```python
+    >>> from transformers import GPTJModel, GPTJConfig
+    >>> # Initializing a GPT-J 6B configuration
+    >>> configuration = GPTJConfig()
+    >>> # Initializing a model from the configuration
+    >>> model = GPTJModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "gptj"
+    # Maps the generic transformers attribute names onto the GPT-J field names.
+    attribute_map = {
+        "max_position_embeddings": "n_positions",
+        "hidden_size": "n_embd",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+    def __init__(
+        self,
+        vocab_size=50400,
+        n_positions=2048,
+        n_embd=4096,
+        n_layer=28,
+        n_head=16,
+        rotary_dim=64,
+        n_inner=None,
+        activation_function="gelu_new",
+        resid_pdrop=0.0,
+        embd_pdrop=0.0,
+        attn_pdrop=0.0,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        scale_attn_weights=True,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        tie_word_embeddings=False,
+        gradient_checkpointing=True,
+        gradient_checkpointing_policy='nothing_saveable',
+        n_real_tokens=50257,
+        fcm_min_ratio=0.0,
+        fcm_max_ratio=0.0,
+        **kwargs
+    ):
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.rotary_dim = rotary_dim
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.scale_attn_weights = scale_attn_weights
+        self.use_cache = use_cache
+        self.gradient_checkpointing = gradient_checkpointing
+        self.gradient_checkpointing_policy = gradient_checkpointing_policy
+        self.n_real_tokens = n_real_tokens
+        self.fcm_min_ratio = fcm_min_ratio
+        self.fcm_max_ratio = fcm_max_ratio
+        # Without an explicit count, treat the whole vocabulary as real tokens.
+        if self.n_real_tokens is None:
+            self.n_real_tokens = self.vocab_size
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        super().__init__(
+            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
+        )
+    @classmethod
+    def get_default_config(cls, updates=None):
+        """ Derive a config from the arguments of `__init__` via
+            `function_args_to_config`, then apply `updates` if given.
+        """
+        # Types for the arguments whose default is None.
+        none_arg_types = dict(
+            n_inner=int,
+            rotary_dim=int,
+        )
+        config = function_args_to_config(cls.__init__, none_arg_types=none_arg_types)
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    @staticmethod
+    def get_jax_mesh(axis_dims):
+        """ Build the device mesh with the axes ('dp', 'fsdp', 'mp'). """
+        return get_jax_mesh(axis_dims, ('dp', 'fsdp', 'mp'))
+    @staticmethod
+    def get_partition_rules():
+        """ Partition rules for GPTJ. Note that these rules are ordered, so that
+            the beginning rules match first. It is important to use
+            PartitionSpec() instead of None here because JAX does not treat
+            None as a pytree leaf.
+        """
+        return (
+            ('transformer/wte/embedding', PartitionSpec('mp', 'fsdp')),
+            ('attn/(k_proj|q_proj|v_proj)/kernel', PartitionSpec('fsdp', 'mp')),
+            ('attn/out_proj/kernel', PartitionSpec('mp', 'fsdp')),
+            ('mlp/fc_in/kernel', PartitionSpec('fsdp', 'mp')),
+            ('mlp/fc_in/bias', PartitionSpec('mp')),
+            ('mlp/fc_out/kernel', PartitionSpec('mp', 'fsdp')),
+            ('mlp/fc_out/bias', PartitionSpec()),
+            ('ln_[0-9]+/bias', PartitionSpec()),
+            ('[0-9]+/ln_[0-9]+/scale', PartitionSpec()),
+            ('ln_f/bias', PartitionSpec()),
+            ('ln_f/scale', PartitionSpec()),
+            ('lm_head/kernel', PartitionSpec('fsdp', 'mp')),
+            ('lm_head/bias', PartitionSpec('mp')),
+            # Catch-all: anything not matched above is replicated.
+            ('.*', PartitionSpec()),
+        )
+    @staticmethod
+    def get_weight_decay_exclusions():
+        """ Regex patterns naming parameters excluded from weight decay. """
+        return (
+            'ln_[0-9]+/bias', 'ln_[0-9]+/scale', 'ln_f/bias', 'ln_f/scale',
+            'bias'
+        )
+    @staticmethod
+    def rng_keys():
+        """ Names of the RNG streams: ('params', 'dropout', 'fcm'). """
+        return ('params', 'dropout', 'fcm')
+    @staticmethod
+    def get_tokenizer_config(updates=None):
+        """ Return the default tokenizer config, optionally overridden by `updates`. """
+        config = ConfigDict()
+        config.name = 'EleutherAI/gpt-j-6B'
+        config.bos_token = '<|endoftext|>'
+        config.eos_token = '<|endoftext|>'
+        config.pad_token = '<|extratoken_40|>'
+        config.cls_token = '<|extratoken_41|>'
+        config.mask_token = '<|extratoken_42|>'
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    @classmethod
+    def get_tokenizer(cls, config, padding_side='left', truncation_side='right'):
+        """ Load the tokenizer named in the tokenizer config through
+            `AutoTokenizer.from_pretrained`, passing the special tokens and
+            the padding/truncation sides.
+        """
+        config = cls.get_tokenizer_config(config)
+        return AutoTokenizer.from_pretrained(
+            config.name,
+            bos_token=config.bos_token,
+            eos_token=config.eos_token,
+            pad_token=config.pad_token,
+            cls_token=config.cls_token,
+            mask_token=config.mask_token,
+            padding_side=padding_side,
+            truncation_side=truncation_side,
+        )
+    @staticmethod
+    def load_pretrained(name, dtype=jnp.float32):
+        """ Load pretrained parameters with the CPU as default device and
+            return them as a frozen `{'params': ...}` dict fetched to the host.
+        """
+        with jax.default_device(jax.devices("cpu")[0]):
+            # With _do_init=False, from_pretrained returns a pair; index 1 holds the params.
+            params = FlaxGPTJForCausalLM.from_pretrained(
+                name, _do_init=False, dtype=dtype
+            )[1]
+            params = freeze({'params': params})
+        return jax.device_get(params)
+    @classmethod
+    def load_config(cls, path):
+        """ Load a config from a preset name or a `<type>::<location>` string.
+            Supported types are 'pickle', 'json' and 'huggingface'. Any other
+            type raises ValueError; a non-preset `path` without '::' also
+            fails with a ValueError when the split result is unpacked.
+        """
+        if path in GPTJ_STANDARD_CONFIGS:
+            return cls.from_dict(GPTJ_STANDARD_CONFIGS[path])
+        load_type, load_path = path.split('::', 1)
+        if load_type == 'pickle':
+            return cls.from_dict(load_pickle(load_path)['gptj_config'])
+        elif load_type == 'json':
+            with open_file(load_path, 'r') as fin:
+                raw_config = fin.read()
+            return cls.from_dict(json.loads(raw_config))
+        elif load_type == 'huggingface':
+            return cls.from_pretrained(load_path)
+        else:
+            raise ValueError(f'Unsupported load config type: {load_type}')
+"""
+The following code is taken from
+transformers/src/transformers/models/gptj/modeling_flax_gptj.py
+and modified to work with EasyLM.
+"""
+# Module-level logger obtained from the transformers logging utilities.
+logger = logging.get_logger(__name__)
+# Names for generated docstrings; presumably consumed by the docstring
+# helpers imported above (confirm where they are used).
+_CHECKPOINT_FOR_DOC = "gptj"
+_CONFIG_FOR_DOC = "GPTJConfig"
+# Short alias for `nn_partitioning.remat`.
+remat = nn_partitioning.remat
+# Shared class-level documentation, attached to the public model classes through `add_start_docstrings`.
+GPTJ_START_DOCSTRING = r"""
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+    Finally, this model supports inherent JAX features such as:
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+    Parameters:
+        config ([`GPTJConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs).
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+# Shared documentation of the forward-call arguments, attached to `FlaxGPTJPreTrainedModel.__call__` through `add_start_docstrings_to_model_forward`.
+GPTJ_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+        past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+def create_sinusoidal_positions(num_pos, dim):
+    inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim))
+    sinusoid_inp = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32")
+    sin, cos = np.sin(sinusoid_inp), np.cos(sinusoid_inp)
+    sentinel = dim // 2 + dim % 2
+    out = np.zeros((num_pos, dim))
+    out[:, 0:sentinel] = sin
+    out[:, sentinel:] = cos
+    return jnp.array(out)
+def rotate_every_two(tensor):
+    rotate_half_tensor = jnp.stack((-tensor[:, :, :, 1::2], tensor[:, :, :, ::2]), axis=-1)
+    rotate_half_tensor = rotate_half_tensor.reshape(rotate_half_tensor.shape[:-2] + (-1,))
+    return rotate_half_tensor
+def apply_rotary_pos_emb(tensor, sincos):
+    sin_pos, cos_pos = sincos
+    sin_pos = sin_pos[:, :, None, :].repeat(2, 3)
+    cos_pos = cos_pos[:, :, None, :].repeat(2, 3)
+    return (tensor * cos_pos) + (rotate_every_two(tensor) * sin_pos)
+class FlaxGPTJAttention(nn.Module):
+    """Multi-head self-attention with rotary position embeddings and an optional key/value cache.
+    The cache lives in the Flax "cache" variable collection and is used for step-by-step autoregressive decoding.
+    """
+    config: GPTJConfig
+    dtype: jnp.dtype = jnp.float32
+    # When True, the causal mask (and the optional fcm_mask) is combined into the attention mask.
+    causal: bool = True
+    # NOTE(review): this flag is not read anywhere inside this class.
+    is_cross_attention: bool = False
+    def setup(self):
+        """Create the projection layers, the causal mask and the sinusoidal position table."""
+        config = self.config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.rotary_dim = config.rotary_dim
+        # All four projections are bias-free, map to embed_dim and share the same initializer settings.
+        dense = partial(
+            nn.Dense,
+            self.embed_dim,
+            use_bias=False,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                scale=1.0, mode='fan_in',
+                distribution='normal',
+            )
+        )
+        self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
+        self.out_proj = dense()
+        self.resid_dropout = nn.Dropout(rate=config.resid_pdrop)
+        # Boolean causal mask covering the maximum number of positions; sliced per call in __call__.
+        self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
+        # The position table is rotary_dim wide when partial rotary is configured, otherwise one full head wide.
+        if self.rotary_dim is not None and self.rotary_dim > 0:
+            pos_embd_dim = self.rotary_dim
+        else:
+            pos_embd_dim = self.embed_dim // self.num_heads
+        self.embed_positions = create_sinusoidal_positions(config.max_position_embeddings, pos_embd_dim)
+    def _split_heads(self, hidden_states):
+        """Reshape the last axis into (num_heads, head_dim), keeping the first two axes."""
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
+    def _merge_heads(self, hidden_states):
+        """Inverse of `_split_heads`: collapse everything after the first two axes into embed_dim."""
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
+    @nn.compact
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slightly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+        Returns the (possibly cache-extended) key and value and the attention mask; on the call that first
+        creates the cache variables the inputs are returned unchanged.
+        """
+        # detect if we're initializing by absence of existing cache data.
+        # This check must run before the `self.variable` calls below, which create the variables if missing.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+        if is_initialized:
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            # Advance the write position by the number of query positions processed in this call.
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        fcm_mask=None,
+    ):
+        """Run self-attention over `hidden_states`.
+        Args:
+            hidden_states: Input activations; projected to query/key/value and split into heads.
+            attention_mask: Mask that is expanded on axes (-3, -2) and broadcast to the causal mask shape
+                (presumably a (batch, key_length) padding mask - confirm against callers).
+            position_ids: Indices into the sinusoidal position table.
+            deterministic: When True, dropout is disabled.
+            init_cache: When True, the key/value cache variables are created during this call.
+            output_attentions: When True, the attention weights are returned as well.
+            fcm_mask: Optional extra mask combined with the causal mask; only used when `self.causal` is True.
+        Returns:
+            `(attn_output,)`, or `(attn_output, attn_weights)` when `output_attentions` is True.
+        """
+        query = self.q_proj(hidden_states)
+        key = self.k_proj(hidden_states)
+        value = self.v_proj(hidden_states)
+        query = self._split_heads(query)
+        key = self._split_heads(key)
+        value = self._split_heads(value)
+        # Look up the table rows for the requested positions and split them into the (sin, cos) halves.
+        sincos = jnp.take(self.embed_positions, position_ids, axis=0)
+        sincos = jnp.split(sincos, 2, axis=-1)
+        # Rotary position embeddings induce some weird issues in multi-host environments, so we remove activation-sharding for keys/query vectors to fix this.
+        # key = with_sharding_constraint(key, PartitionSpec("dp", None, None, None))
+        # query = with_sharding_constraint(query, PartitionSpec("dp", None, None, None))
+        if self.rotary_dim is not None and self.rotary_dim > 0:
+            # Partial rotary: rotate only the first rotary_dim features of each head and pass the rest through.
+            k_rot = key[:, :, :, : self.rotary_dim]
+            k_pass = key[:, :, :, self.rotary_dim :]
+            q_rot = query[:, :, :, : self.rotary_dim]
+            q_pass = query[:, :, :, self.rotary_dim :]
+            k_rot = apply_rotary_pos_emb(k_rot, sincos)
+            q_rot = apply_rotary_pos_emb(q_rot, sincos)
+            key = jnp.concatenate([k_rot, k_pass], axis=-1)
+            query = jnp.concatenate([q_rot, q_pass], axis=-1)
+        else:
+            key = apply_rotary_pos_emb(key, sincos)
+            query = apply_rotary_pos_emb(query, sincos)
+        query_length, key_length = query.shape[1], key.shape[1]
+        if self.has_variable("cache", "cached_key"):
+            # With a cache, take the causal-mask rows starting at the current cache index, spanning the full cache length.
+            mask_shift = self.variables["cache"]["cache_index"]
+            max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+            causal_mask = lax.dynamic_slice(
+                self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+            )
+        else:
+            causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+        batch_size = hidden_states.shape[0]
+        causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+        attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+        if self.causal:
+            attention_mask = combine_masks(attention_mask, causal_mask, fcm_mask)
+        else:
+            # NOTE(review): this branch is a no-op; `fcm_mask` is ignored when `causal` is False.
+            attention_mask = attention_mask
+        dropout_rng = None
+        if not deterministic and self.config.attn_pdrop > 0.0:
+            dropout_rng = self.make_rng("dropout")
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.has_variable("cache", "cached_key") or init_cache:
+            key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask)
+        # transform boolean mask into float mask
+        # (0.0 where attention is allowed, -1e9 where it is masked, both cast to the module dtype)
+        attention_bias = lax.select(
+            attention_mask > 0,
+            jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+            jnp.full(attention_mask.shape, -1e9).astype(self.dtype),
+        )
+        # usual dot product attention
+        # The weights are computed in at least float32, whatever the module dtype is.
+        attn_weights = dot_product_attention_weights(
+            query,
+            key,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.config.attn_pdrop,
+            deterministic=deterministic,
+            dtype=jnp.promote_types(self.dtype, jnp.float32),
+            precision=None,
+        )
+        # Weighted sum of the values for every head, then merge the heads and project back to embed_dim.
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.out_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output, deterministic=deterministic)
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
+        return outputs
+class FlaxGPTJMLP(nn.Module):
+    config: GPTJConfig
+    intermediate_size: int
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        embed_dim = self.config.hidden_size
+        kernel_init=jax.nn.initializers.variance_scaling(
+            scale=1.0, mode='fan_in',
+            distribution='normal',
+        )
+        self.fc_in = nn.Dense(self.intermediate_size, dtype=self.dtype, kernel_init=kernel_init)
+        self.fc_out = nn.Dense(embed_dim, dtype=self.dtype, kernel_init=kernel_init)
+        self.act = ACT2FN[self.config.activation_function]
+        self.dropout = nn.Dropout(rate=self.config.resid_pdrop)
+    def __call__(self, hidden_states, deterministic: bool = True):
+        hidden_states = self.fc_in(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.fc_out(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        return hidden_states
+class FlaxGPTJBlock(nn.Module):
+    config: GPTJConfig
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        hidden_size = self.config.hidden_size
+        inner_dim = self.config.n_inner if self.config.n_inner is not None else 4 * hidden_size
+        self.ln_1 = nn.LayerNorm(
+            epsilon=self.config.layer_norm_epsilon,
+            dtype=jnp.promote_types(self.dtype, jnp.float32)
+        )
+        self.attn = FlaxGPTJAttention(self.config, dtype=self.dtype)
+        self.mlp = FlaxGPTJMLP(self.config, inner_dim, dtype=self.dtype)
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        fcm_mask=None,
+    ):
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_outputs = self.attn(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            fcm_mask=fcm_mask,
+        )
+        attn_output = attn_outputs[0]
+        feed_forward_hidden_states = self.mlp(hidden_states, deterministic=deterministic)
+        # residual connection
+        hidden_states = attn_output + feed_forward_hidden_states + residual
+        return (hidden_states,) + attn_outputs[1:]
+class FlaxGPTJPreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+    config_class = GPTJConfig
+    base_model_prefix = "transformer"
+    module_class: nn.Module = None
+    def __init__(
+        self,
+        config: GPTJConfig,
+        input_shape: Tuple = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensors
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+        if self.config.add_cross_attention:
+            encoder_hidden_states = jnp.zeros(input_shape + (self.config.n_embd,))
+            encoder_attention_mask = attention_mask
+            module_init_outputs = self.module.init(
+                rngs,
+                input_ids,
+                attention_mask,
+                position_ids,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                return_dict=False,
+            )
+        else:
+            module_init_outputs = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)
+        random_params = module_init_outputs["params"]
+        if params is not None:
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+        """
+        # init input variables to retrieve cache
+        input_ids = jnp.ones((batch_size, max_length))
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return init_variables["cache"]
+    def _get_logits_processor(self,*args, **kwargs) -> FlaxLogitsProcessorList:
+        processors = super()._get_logits_processor(*args, **kwargs)
+        def squash_extra_tokens(input_ids, scores, cur_len):
+            return scores.at[:, self.config.n_real_tokens:].set(-float('inf'))
+        processors.append(squash_extra_tokens)
+        return processors
+    @add_start_docstrings_to_model_forward(GPTJ_INPUTS_DOCSTRING)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        params: dict = None,
+        past_key_values: dict = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        batch_size, sequence_length = input_ids.shape
+        if position_ids is None:
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+        if attention_mask is None:
+            attention_mask = jnp.ones((batch_size, sequence_length))
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+        inputs = {"params": params or self.params}
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be changed by FlaxGPTJAttention module
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+        outputs = self.module.apply(
+            inputs,
+            jnp.array(input_ids, dtype="i4"),
+            jnp.array(attention_mask, dtype="i4"),
+            jnp.array(position_ids, dtype="i4"),
+            not train,
+            False,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            rngs=rngs,
+            mutable=mutable,
+        )
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs, past_key_values = outputs
+            outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past_key_values = outputs
+            outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
+        return outputs
+class FlaxGPTJBlockCollection(nn.Module):
+    config: GPTJConfig
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        block = FlaxGPTJBlock
+        if self.config.gradient_checkpointing:
+            FlaxGPT2CheckpointBlock = remat(
+                block, static_argnums=(3, 4, 5),
+                policy=get_gradient_checkpoint_policy(
+                    self.config.gradient_checkpointing_policy
+                )
+            )
+            block = FlaxGPT2CheckpointBlock
+        self.blocks = [
+            block(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers)
+        ]
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        if not deterministic and self.config.fcm_max_ratio > 0:
+            # Apply forgetful causal mask
+            batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1]
+            fcm_ratio = jax.random.uniform(
+                self.make_rng('fcm'), shape=(batch_size, 1, 1, 1),
+                minval=self.config.fcm_min_ratio,
+                maxval=self.config.fcm_max_ratio
+            )
+            fcm_mask = jax.random.uniform(
+                self.make_rng('fcm'),
+                shape=(batch_size, 1, seq_length, seq_length)
+            ) > fcm_ratio
+            fcm_mask = fcm_mask.at[:, :, :, 0].set(True)
+            fcm_mask = fcm_mask.astype('bool')
+        else:
+            fcm_mask = None
+        for block in self.blocks:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            layer_outputs = block(
+                hidden_states,
+                attention_mask,
+                position_ids,
+                deterministic,
+                init_cache,
+                output_attentions,
+                fcm_mask,
+            )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+        # this contains possible `None` values - `FlaxGPTJModule` will filter them out
+        outputs = (hidden_states, all_hidden_states, all_attentions)
+        return outputs
+class FlaxGPTJModule(nn.Module):
+    config: GPTJConfig
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.embed_dim = self.config.hidden_size
+        self.wte = nn.Embed(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+        self.dropout = nn.Dropout(rate=self.config.embd_pdrop)
+        self.h = FlaxGPTJBlockCollection(self.config, dtype=self.dtype)
+        self.ln_f = nn.LayerNorm(
+            epsilon=self.config.layer_norm_epsilon,
+            dtype=jnp.promote_types(self.dtype, jnp.float32)
+        )
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        deterministic=True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        input_embeds = self.wte(input_ids.astype("i4"))
+        hidden_states = self.dropout(input_embeds, deterministic=deterministic)
+        outputs = self.h(
+            hidden_states,
+            attention_mask,
+            position_ids=position_ids,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        hidden_states = self.ln_f(hidden_states)
+        if output_hidden_states:
+            all_hidden_states = outputs[1] + (hidden_states,)
+            outputs = (hidden_states, all_hidden_states) + outputs[2:]
+        else:
+            outputs = (hidden_states,) + outputs[1:]
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=outputs[1],
+            attentions=outputs[-1],
+        )
+# Public model class without a head; all behavior comes from FlaxGPTJPreTrainedModel wrapping FlaxGPTJModule.
+@add_start_docstrings(
+    "The bare GPTJ Model transformer outputting raw hidden-states without any specific head on top.",
+    GPTJ_START_DOCSTRING,
+)
+class FlaxGPTJModel(FlaxGPTJPreTrainedModel):
+    module_class = FlaxGPTJModule
+# Attach the generated usage example to the class's call docstring.
+# NOTE(review): FlaxCausalLMOutput is passed as the output type although FlaxGPTJModule returns FlaxBaseModelOutput - confirm this is intended.
+append_call_sample_docstring(
+    FlaxGPTJModel,
+    _CHECKPOINT_FOR_DOC,
+    FlaxCausalLMOutput,
+    _CONFIG_FOR_DOC,
+)
+class FlaxGPTJForCausalLMModule(nn.Module):
+    config: GPTJConfig
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.transformer = FlaxGPTJModule(self.config, dtype=self.dtype)
+        self.lm_head = nn.Dense(
+            self.config.vocab_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.variance_scaling(
+                scale=1.0, mode='fan_in',
+                distribution='normal',
+            )
+        )
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        batch_size, seq_length = input_ids.shape
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(input_ids)
+        if position_ids is None:
+            position_ids = jnp.broadcast_to(
+                jnp.clip(jnp.cumsum(attention_mask, axis=-1) - 1, a_min=0),
+                (batch_size, seq_length)
+            )
+        outputs = self.transformer(
+            input_ids,
+            attention_mask,
+            position_ids,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        if self.config.tie_word_embeddings:
+            shared_kernel = self.transformer.variables["params"]["wte"]["embedding"].T
+            lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
+        else:
+            lm_logits = self.lm_head(hidden_states)
+        if not return_dict:
+            return (lm_logits,) + outputs[1:]
+        return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+@add_start_docstrings(
+    """
+    The GPTJ Model transformer with a language modeling head on top.
+    """,
+    GPTJ_START_DOCSTRING,
+)
+class FlaxGPTJForCausalLM(FlaxGPTJPreTrainedModel):
+    module_class = FlaxGPTJForCausalLMModule
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jnp.DeviceArray] = None):
+        # initializing the cache
+        batch_size, seq_length = input_ids.shape
+        past_key_values = self.init_cache(batch_size, max_length)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since GPTJ uses a causal mask, those positions are masked anyways.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is not None:
+            position_ids = attention_mask.cumsum(axis=-1) - 1
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+        return {
+            "past_key_values": past_key_values,
+            "attention_mask": extended_attention_mask,
+            "position_ids": position_ids,
+        }
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
+        return model_kwargs
+# Attach the generated usage example to FlaxGPTJForCausalLM's call docstring.
+append_call_sample_docstring(
+    FlaxGPTJForCausalLM,
+    _CHECKPOINT_FOR_DOC,
+    FlaxCausalLMOutput,
+    _CONFIG_FOR_DOC,
+)

EasyLM/models/gptj/gptj_serve.py ADDED Viewed

	@@ -0,0 +1,396 @@

+import pprint
+from functools import partial
+import numpy as np
+import mlxu
+import jax
+import jax.numpy as jnp
+from jax.experimental.pjit import pjit
+from jax.sharding import PartitionSpec as PS
+import flax
+from flax import linen as nn
+from flax.jax_utils import prefetch_to_device
+from flax.training.train_state import TrainState
+import optax
+from transformers import GenerationConfig, FlaxLogitsProcessorList
+from EasyLM.checkpoint import StreamingCheckpointer
+from EasyLM.serving import LMServer
+from EasyLM.jax_utils import (
+    JaxRNG, JaxDistributedConfig, next_rng, match_partition_rules, tree_apply,
+    set_random_seed, get_float_dtype_by_name, make_shard_and_gather_fns,
+    with_sharding_constraint, FlaxTemperatureLogitsWarper
+)
+from EasyLM.models.gptj.gptj_model import (
+    GPTJConfig, FlaxGPTJForCausalLMModule, FlaxGPTJForCausalLM
+)
+# Command-line flags of the GPT-J serving script, with their default values.
+# `load_checkpoint` is split once on '::' into a load type and a path in `main`.
+# The generation budget is `seq_length - input_length` new tokens.
+FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
+    seed=42,
+    initialize_jax_distributed=False,
+    mesh_dim='1,-1,1',
+    dtype='bf16',
+    input_length=1024,
+    seq_length=2048,
+    top_k=50,
+    top_p=1.0,
+    do_sample=True,
+    num_beams=1,
+    add_bos_token=False,
+    load_gptj_config='',
+    load_checkpoint='',
+    tokenizer=GPTJConfig.get_tokenizer_config(),
+    lm_server=LMServer.get_default_config(),
+    jax_distributed=JaxDistributedConfig.get_default_config(),
+)
+def main(argv):
+    JaxDistributedConfig.initialize(FLAGS.jax_distributed)
+    set_random_seed(FLAGS.seed)
+    prefix_tokenizer = GPTJConfig.get_tokenizer(
+        FLAGS.tokenizer, truncation_side='left', padding_side='left'
+    )
+    tokenizer = GPTJConfig.get_tokenizer(
+        FLAGS.tokenizer, truncation_side='right', padding_side='right'
+    )
+    with jax.default_device(jax.devices("cpu")[0]):
+        gptj_config = GPTJConfig.load_config(FLAGS.load_gptj_config)
+        load_type, load_path = FLAGS.load_checkpoint.split('::', 1)
+        if load_type == 'huggingface':
+            params = gptj_config.load_pretrained(load_path)
+        else:
+            _, params = StreamingCheckpointer.load_trainstate_checkpoint(
+                FLAGS.load_checkpoint, disallow_trainstate=True
+            )
+        hf_model = FlaxGPTJForCausalLM(
+            gptj_config,
+            input_shape=(1, FLAGS.seq_length),
+            seed=FLAGS.seed,
+            _do_init=False
+        )
+    model_ps = match_partition_rules(
+        GPTJConfig.get_partition_rules(), params
+    )
+    shard_fns, _ = make_shard_and_gather_fns(
+        model_ps, get_float_dtype_by_name(FLAGS.dtype)
+    )
+    @partial(
+        pjit,
+        in_shardings=(model_ps, PS(), PS()),
+        out_shardings=(PS(), PS(), PS())
+    )
+    def forward_loglikelihood(params, rng, batch):
+        """Score `batch['output_tokens']` under the model.
+
+        Args:
+            params: model parameters, sharded according to `model_ps`.
+            rng: JAX PRNG key used to seed `JaxRNG`.
+            batch: dict with `input_tokens`, `output_tokens`, `input_mask`
+                and `output_mask` entries.
+
+        Returns:
+            A tuple `(loglikelihood, is_greedy, new_rng)`: the log-likelihood
+            summed over the last axis at positions where `output_mask` is set,
+            whether the argmax prediction equals the target at every such
+            position, and the next key produced by `rng_generator()`.
+        """
+        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
+        rng_generator = JaxRNG(rng)
+        input_tokens = batch['input_tokens']
+        output_tokens = batch['output_tokens']
+        input_mask = batch['input_mask']
+        output_mask = batch['output_mask']
+        logits = hf_model.module.apply(
+            params, input_tokens, attention_mask=input_mask,
+            deterministic=True, rngs=rng_generator(gptj_config.rng_keys()),
+        ).logits
+        # Push the logits of vocabulary slots at or beyond n_real_tokens to a
+        # very negative value so they receive ~zero probability in the softmax
+        # (presumably padding entries of the embedding table -- confirm).
+        if gptj_config.n_real_tokens is not None:
+          logits = logits.at[:, :, gptj_config.n_real_tokens:].set(-1e8)
+        # Negated cross entropy is the per-position log-probability of the target.
+        loglikelihood = -optax.softmax_cross_entropy_with_integer_labels(
+            logits, output_tokens
+        )
+        loglikelihood = jnp.sum(loglikelihood * output_mask, axis=-1)
+        # Count masked positions where the most likely token equals the target.
+        match_count = jnp.sum(
+            (jnp.argmax(logits, axis=-1) == output_tokens) * output_mask,
+            axis=-1
+        )
+        total = jnp.sum(output_mask, axis=-1)
+        # Greedy decoding would reproduce the target only if every scored
+        # position matched.
+        is_greedy = match_count == total
+        return loglikelihood, is_greedy, rng_generator()
+    @partial(
+        pjit,
+        in_shardings=(model_ps, PS(), PS(), PS()),
+        out_shardings=(PS(), PS())
+    )
+    def forward_generate(params, rng, batch, temperature):
+        """Generate continuation tokens using the sampling settings from FLAGS.
+
+        Args:
+            params: model parameters, sharded according to `model_ps`; only
+                `params['params']` is handed to `hf_model.generate`.
+            rng: JAX PRNG key used to seed `JaxRNG`.
+            batch: dict with `input_tokens` and `attention_mask` entries.
+            temperature: value passed to `FlaxTemperatureLogitsWarper`.
+
+        Returns:
+            A tuple `(output, new_rng)` where `output` holds only the tokens
+            that follow the prompt (the first `input_tokens.shape[1]` columns
+            of the generated sequences are sliced off).
+        """
+        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
+        rng_generator = JaxRNG(rng)
+        # At most seq_length - input_length new tokens are generated; the EOS
+        # token id doubles as the padding token id.
+        output = hf_model.generate(
+            batch['input_tokens'],
+            attention_mask=batch['attention_mask'],
+            params=params['params'],
+            prng_key=rng_generator(),
+            logits_processor=FlaxLogitsProcessorList(
+                [FlaxTemperatureLogitsWarper(temperature)]
+            ),
+            generation_config=GenerationConfig(
+                max_new_tokens=FLAGS.seq_length - FLAGS.input_length,
+                pad_token_id=tokenizer.eos_token_id,
+                bos_token_id=tokenizer.bos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                do_sample=FLAGS.do_sample,
+                num_beams=FLAGS.num_beams,
+                top_k=FLAGS.top_k,
+                top_p=FLAGS.top_p,
+            )
+        ).sequences[:, batch['input_tokens'].shape[1]:]
+        return output, rng_generator()
+    @partial(
+        pjit,
+        in_shardings=(model_ps, PS(), PS()),
+        out_shardings=(PS(), PS())
+    )
+    def forward_greedy_generate(params, rng, batch):
+        """Generate continuation tokens with sampling disabled and one beam.
+
+        Args:
+            params: model parameters, sharded according to `model_ps`; only
+                `params['params']` is handed to `hf_model.generate`.
+            rng: JAX PRNG key used to seed `JaxRNG`.
+            batch: dict with `input_tokens` and `attention_mask` entries.
+
+        Returns:
+            A tuple `(output, new_rng)` where `output` holds only the tokens
+            that follow the prompt (the first `input_tokens.shape[1]` columns
+            of the generated sequences are sliced off).
+        """
+        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
+        rng_generator = JaxRNG(rng)
+        # Same length budget and special tokens as forward_generate, but with
+        # do_sample=False and num_beams=1 hard-coded.
+        output = hf_model.generate(
+            batch['input_tokens'],
+            attention_mask=batch['attention_mask'],
+            params=params['params'],
+            prng_key=rng_generator(),
+            generation_config=GenerationConfig(
+                max_new_tokens=FLAGS.seq_length - FLAGS.input_length,
+                pad_token_id=tokenizer.eos_token_id,
+                bos_token_id=tokenizer.bos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                do_sample=False,
+                num_beams=1,
+            )
+        ).sequences[:, batch['input_tokens'].shape[1]:]
+        return output, rng_generator()
+    mesh = GPTJConfig.get_jax_mesh(FLAGS.mesh_dim)
+    with mesh:
+        params = tree_apply(shard_fns, params)
+        sharded_rng = next_rng()
+    class ModelServer(LMServer):
+        """LMServer subclass backed by the sharded GPT-J forward functions.
+
+        Every method is static and works on the enclosing function's state:
+        the tokenizers, `params`, `mesh` and the running `sharded_rng` key,
+        which is replaced after each forward call.
+        """
+        @staticmethod
+        def loglikelihood(prefix_text, text):
+            """Return `(loglikelihood, is_greedy)` of `text` given `prefix_text`.
+
+            The prefix is padded/truncated to `FLAGS.input_length` with the
+            left-sided tokenizer and `text` to
+            `FLAGS.seq_length - FLAGS.input_length` with the right-sided one.
+            Only the `text` positions are scored.
+            """
+            nonlocal sharded_rng
+            prefix = prefix_tokenizer(
+                prefix_text,
+                padding='max_length',
+                truncation=True,
+                max_length=FLAGS.input_length,
+                return_tensors='np',
+            )
+            inputs = tokenizer(
+                text,
+                padding='max_length',
+                truncation=True,
+                max_length=FLAGS.seq_length - FLAGS.input_length,
+                return_tensors='np',
+            )
+            output_tokens = np.concatenate([prefix.input_ids, inputs.input_ids], axis=1)
+            bos_tokens = np.full(
+                (output_tokens.shape[0], 1), tokenizer.bos_token_id, dtype=np.int32
+            )
+            # Model inputs are the targets shifted right by one, led by a BOS token.
+            input_tokens = np.concatenate([bos_tokens, output_tokens[:, :-1]], axis=-1)
+            input_mask = np.concatenate(
+                [prefix.attention_mask, inputs.attention_mask], axis=1
+            )
+            # The prepended BOS position is attended to only when add_bos_token is set.
+            if FLAGS.add_bos_token:
+                bos_mask = np.ones_like(input_mask[:, :1])
+            else:
+                bos_mask = np.zeros_like(input_mask[:, :1])
+            # Shift the attention mask the same way as the tokens.
+            input_mask = np.concatenate([bos_mask, input_mask[:, :-1]], axis=1)
+            # Zero mask over the prefix so that only `text` tokens are scored.
+            output_mask = np.concatenate(
+                [np.zeros_like(prefix.attention_mask), inputs.attention_mask], axis=1
+            )
+            batch = dict(
+                input_tokens=input_tokens,
+                output_tokens=output_tokens,
+                input_mask=input_mask,
+                output_mask=output_mask,
+            )
+            with mesh:
+                loglikelihood, is_greedy, sharded_rng = forward_loglikelihood(
+                    params, sharded_rng, batch
+                )
+                loglikelihood, is_greedy = jax.device_get((loglikelihood, is_greedy))
+            return loglikelihood, is_greedy
+        @staticmethod
+        def loglikelihood_rolling(text):
+            """Return `(loglikelihood, is_greedy)` of `text`, scored in windows.
+
+            The untruncated tokens are processed in consecutive windows of
+            `FLAGS.seq_length`; log-likelihoods are summed and the greedy flags
+            are AND-ed across windows.
+            """
+            nonlocal sharded_rng
+            inputs = tokenizer(
+                text,
+                padding='longest',
+                truncation=False,
+                max_length=np.iinfo(np.int32).max,
+                return_tensors='np',
+            )
+            batch_size = inputs.input_ids.shape[0]
+            output_tokens = inputs.input_ids
+            attention_mask = inputs.attention_mask
+            # Right-pad short batches up to one full window; the padded
+            # positions get a zero mask.
+            if output_tokens.shape[1] < FLAGS.seq_length:
+                padding_length = FLAGS.seq_length - output_tokens.shape[1]
+                pad_tokens = np.full(
+                    (batch_size, padding_length), tokenizer.pad_token_id, dtype=np.int32
+                )
+                output_tokens = np.concatenate([output_tokens, pad_tokens], axis=-1)
+                pad_mask = np.zeros(
+                    (batch_size, padding_length), dtype=inputs.attention_mask.dtype
+                )
+                attention_mask = np.concatenate([attention_mask, pad_mask], axis=-1)
+            bos_tokens = np.full(
+                (batch_size, 1), tokenizer.bos_token_id, dtype=np.int32
+            )
+            # Model inputs are the targets shifted right by one, led by a BOS token.
+            input_tokens = np.concatenate([bos_tokens, output_tokens[:, :-1]], axis=-1)
+            # NOTE(review): bos_mask is computed but never used below; the
+            # windows pass the unshifted attention_mask as input_mask, unlike
+            # loglikelihood(). Confirm whether a shifted mask was intended.
+            bos_mask = np.ones((batch_size, 1), dtype=inputs.attention_mask.dtype)
+            total_seq_length = output_tokens.shape[1]
+            total_loglikelihood = 0.0
+            total_is_greedy = True
+            # Sliding window
+            for i in range(0, total_seq_length, FLAGS.seq_length):
+                # Last window
+                if i + FLAGS.seq_length > total_seq_length:
+                    # The final partial window re-reads the last seq_length
+                    # tokens. i - total_seq_length is negative here, so the
+                    # slice zeroes every column except the trailing
+                    # (total_seq_length - i) ones, i.e. the positions that
+                    # earlier windows did not score.
+                    last_output_mask = np.copy(attention_mask[:, -FLAGS.seq_length:])
+                    last_output_mask[:, :i - total_seq_length] = 0.0
+                    batch = dict(
+                        input_tokens=input_tokens[:, -FLAGS.seq_length:],
+                        output_tokens=output_tokens[:, -FLAGS.seq_length:],
+                        input_mask=attention_mask[:, -FLAGS.seq_length:],
+                        output_mask=last_output_mask,
+                    )
+                # Normal window
+                else:
+                    batch = dict(
+                        input_tokens=input_tokens[:, i:i + FLAGS.seq_length],
+                        output_tokens=output_tokens[:, i:i + FLAGS.seq_length],
+                        input_mask=attention_mask[:, i:i + FLAGS.seq_length],
+                        output_mask=attention_mask[:, i:i + FLAGS.seq_length],
+                    )
+                with mesh:
+                    loglikelihood, is_greedy, sharded_rng = forward_loglikelihood(
+                        params, sharded_rng, batch
+                    )
+                    loglikelihood, is_greedy = jax.device_get((loglikelihood, is_greedy))
+                total_loglikelihood += loglikelihood
+                total_is_greedy = np.logical_and(is_greedy, total_is_greedy)
+            return total_loglikelihood, total_is_greedy
+        @staticmethod
+        def generate(text, temperature):
+            """Generate continuations for `text` and return them as a list of strings.
+
+            Prompts are padded/truncated to `FLAGS.input_length` with the
+            left-sided tokenizer; each decoded output is cut at the first
+            occurrence of the EOS token string.
+            """
+            nonlocal sharded_rng
+            inputs = prefix_tokenizer(
+                text,
+                padding='max_length',
+                truncation=True,
+                max_length=FLAGS.input_length,
+                return_tensors='np',
+            )
+            input_tokens = inputs.input_ids
+            input_mask = inputs.attention_mask
+            # With add_bos_token the first column is overwritten by BOS (replacing
+            # whatever token or padding was there) and marked as attended.
+            if FLAGS.add_bos_token:
+                input_tokens[:, 0] = tokenizer.bos_token_id
+                input_mask[:, 0] = 1
+            batch = dict(
+                input_tokens=input_tokens,
+                attention_mask=input_mask,
+            )
+            with mesh:
+                output, sharded_rng = forward_generate(
+                    params, sharded_rng, batch, temperature
+                )
+                output = jax.device_get(output)
+            output_text = []
+            # NOTE: the loop variable reuses (shadows) the `text` parameter name.
+            for text in list(tokenizer.batch_decode(output)):
+                if tokenizer.eos_token in text:
+                    text = text.split(tokenizer.eos_token, maxsplit=1)[0]
+                output_text.append(text)
+            return output_text
+        @staticmethod
+        def greedy_until(prefix_text, until, max_length):
+            """Greedily extend each prefix until a stop string or the length cap.
+
+            `prefix_text` and `until` are iterated in lockstep; an `until`
+            entry may be a single string or a collection of stop strings.
+            `max_length` is compared against the number of generated token
+            columns accumulated over the generation rounds. Returns one
+            generated string per prefix, cut before the stop string.
+            """
+            nonlocal sharded_rng
+            all_outputs = []
+            for pf, ut in zip(prefix_text, until):
+                if isinstance(ut, str):
+                    ut = [ut]
+                total_length = 0
+                total_generated = ''
+                while total_length < max_length:
+                    # The growing prefix (original prompt + text generated so
+                    # far) is re-tokenized from scratch every round.
+                    pf_tokens = tokenizer(
+                        pf,
+                        padding=False,
+                        truncation=False,
+                        max_length=np.iinfo(np.int32).max,
+                        return_tensors='np',
+                    )
+                    input_tokens = pf_tokens.input_ids
+                    attention_mask = pf_tokens.attention_mask
+                    # Left-pad short prompts to input_length ...
+                    if input_tokens.shape[1] < FLAGS.input_length:
+                        extra = FLAGS.input_length - input_tokens.shape[1]
+                        pad_tokens = np.full(
+                            (1, extra), tokenizer.pad_token_id, dtype=np.int32
+                        )
+                        input_tokens = np.concatenate(
+                            [pad_tokens, input_tokens], axis=1
+                        )
+                        pad_attention = np.zeros((1, extra), dtype=attention_mask.dtype)
+                        attention_mask = np.concatenate(
+                            [pad_attention, attention_mask], axis=1
+                        )
+                    # ... and keep only the most recent input_length tokens of
+                    # long ones.
+                    elif input_tokens.shape[1] > FLAGS.input_length:
+                        input_tokens = input_tokens[:, -FLAGS.input_length:]
+                        attention_mask = attention_mask[:, -FLAGS.input_length:]
+                    if FLAGS.add_bos_token:
+                        input_tokens[:, 0] = tokenizer.bos_token_id
+                        attention_mask[:, 0] = 1
+                    batch = dict(input_tokens=input_tokens, attention_mask=attention_mask)
+                    with mesh:
+                        output, sharded_rng = forward_greedy_generate(
+                            params, sharded_rng, batch
+                        )
+                        output = jax.device_get(output)
+                    total_length += output.shape[1]
+                    output_text = tokenizer.batch_decode(output)[0]
+                    total_generated = total_generated + output_text
+                    pf = pf + output_text
+                    # Stop as soon as any stop string shows up, discarding it
+                    # and everything after it.
+                    done = False
+                    for s in ut:
+                        if s in total_generated:
+                            total_generated = total_generated.split(s, maxsplit=1)[0]
+                            done = True
+                    if done:
+                        break
+                all_outputs.append(total_generated)
+            return all_outputs
+    server = ModelServer(FLAGS.lm_server)
+    server.run()
+if __name__ == "__main__":
+    mlxu.run(main)

EasyLM/models/gptj/gptj_train.py ADDED Viewed

	@@ -0,0 +1,272 @@

+import pprint
+from functools import partial
+from tqdm import tqdm, trange
+import numpy as np
+import mlxu
+import jax
+import jax.numpy as jnp
+from jax.experimental.pjit import pjit, with_sharding_constraint
+from jax.sharding import PartitionSpec as PS
+from flax.training.train_state import TrainState
+from EasyLM.data import DatasetFactory
+from EasyLM.checkpoint import StreamingCheckpointer
+from EasyLM.optimizers import OptimizerFactory
+from EasyLM.jax_utils import (
+    JaxRNG, JaxDistributedConfig, next_rng, match_partition_rules,
+    cross_entropy_loss_and_accuracy, global_norm, get_float_dtype_by_name,
+    set_random_seed, average_metrics, get_weight_decay_mask,
+    make_shard_and_gather_fns, tree_apply
+)
+from EasyLM.models.gptj.gptj_model import GPTJConfig, FlaxGPTJForCausalLMModule
+# Flag defaults for the GPT-J training script. Nested entries (tokenizer,
+# datasets, optimizer, checkpointer, model, logger, distributed setup) take
+# their defaults from the corresponding component's default-config helper.
+FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
+    seed=42,
+    mesh_dim='1,-1,1',
+    dtype='fp32',
+    total_steps=10000,
+    load_gptj_config='',
+    update_gptj_config='',
+    load_checkpoint='',
+    load_dataset_state='',
+    log_freq=50,
+    save_model_freq=0,
+    save_milestone_freq=0,
+    eval_steps=0,
+    tokenizer=GPTJConfig.get_tokenizer_config(),
+    train_dataset=DatasetFactory.get_default_config(),
+    eval_dataset=DatasetFactory.get_default_config(),
+    optimizer=OptimizerFactory.get_default_config(),
+    checkpointer=StreamingCheckpointer.get_default_config(),
+    gptj=GPTJConfig.get_default_config(),
+    logger=mlxu.WandBLogger.get_default_config(),
+    log_all_worker=False,
+    jax_distributed=JaxDistributedConfig.get_default_config(),
+)
+def main(argv):
+    """Train a GPT-J causal language model with pjit-sharded steps.
+
+    Builds the dataset, model config and optimizer from FLAGS, restores or
+    initializes the train state, then runs the training loop with periodic
+    logging, optional evaluation and checkpointing.
+
+    Args:
+        argv: command-line arguments handed over by `mlxu.run`; not used here.
+    """
+    JaxDistributedConfig.initialize(FLAGS.jax_distributed)
+    variant = mlxu.get_user_flags(FLAGS, FLAGS_DEF)
+    flags_config_dict = mlxu.user_flags_to_config_dict(FLAGS, FLAGS_DEF)
+    # Logging is enabled on process 0 only, unless log_all_worker is set.
+    logger = mlxu.WandBLogger(
+        config=FLAGS.logger,
+        variant=variant,
+        enable=FLAGS.log_all_worker or (jax.process_index() == 0),
+    )
+    set_random_seed(FLAGS.seed)
+    tokenizer = GPTJConfig.get_tokenizer(FLAGS.tokenizer)
+    dataset = DatasetFactory.load_dataset(FLAGS.train_dataset, tokenizer)
+    if FLAGS.load_dataset_state != '':
+        dataset.load_state_dict(mlxu.load_pickle(FLAGS.load_dataset_state))
+    # The evaluation dataset (and its iterator) exists only when eval_steps > 0.
+    if FLAGS.eval_steps > 0:
+        eval_dataset = DatasetFactory.load_dataset(
+            FLAGS.eval_dataset, dataset.tokenizer
+        )
+        eval_iterator = iter(eval_dataset)
+    seq_length = dataset.seq_length
+    if FLAGS.load_gptj_config != '':
+        gptj_config = GPTJConfig.load_config(FLAGS.load_gptj_config)
+    else:
+        gptj_config = GPTJConfig(**FLAGS.gptj)
+    # NOTE(review): eval() executes arbitrary Python taken from the
+    # --update_gptj_config flag; only pass trusted values.
+    if FLAGS.update_gptj_config != '':
+        gptj_config.update(dict(eval(FLAGS.update_gptj_config)))
+    gptj_config.update(dict(
+        bos_token_id=dataset.tokenizer.bos_token_id,
+        eos_token_id=dataset.tokenizer.eos_token_id,
+    ))
+    # Grow the model vocabulary when the dataset's vocabulary is larger.
+    if gptj_config.vocab_size < dataset.vocab_size:
+        gptj_config.update(dict(vocab_size=dataset.vocab_size))
+    model = FlaxGPTJForCausalLMModule(
+        gptj_config, dtype=get_float_dtype_by_name(FLAGS.dtype)
+    )
+    optimizer, optimizer_info = OptimizerFactory.get_optimizer(
+        FLAGS.optimizer,
+        get_weight_decay_mask(GPTJConfig.get_weight_decay_exclusions()),
+    )
+    def create_trainstate_from_params(params):
+        # Wrap already-restored parameters in a fresh TrainState.
+        return TrainState.create(params=params, tx=optimizer, apply_fn=None)
+    def init_fn(rng):
+        # Initialize parameters from scratch using dummy inputs with a batch
+        # of 4 rows of seq_length tokens.
+        rng_generator = JaxRNG(rng)
+        params = model.init(
+            input_ids=jnp.zeros((4, seq_length), dtype=jnp.int32),
+            position_ids=jnp.zeros((4, seq_length), dtype=jnp.int32),
+            attention_mask=jnp.ones((4, seq_length), dtype=jnp.int32),
+            rngs=rng_generator(gptj_config.rng_keys()),
+        )
+        return TrainState.create(params=params, tx=optimizer, apply_fn=None)
+    def train_step(train_state, rng, batch):
+        # One optimization step; returns (updated train_state, next rng, metrics).
+        rng_generator = JaxRNG(rng)
+        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
+        def loss_and_accuracy(params):
+            logits = model.apply(
+                params, batch['input_tokens'], deterministic=False,
+                rngs=rng_generator(gptj_config.rng_keys()),
+            ).logits
+            return cross_entropy_loss_and_accuracy(
+                logits, batch['target_tokens'], batch['loss_masks']
+            )
+        # has_aux=True: the loss is differentiated, accuracy is carried along.
+        grad_fn = jax.value_and_grad(loss_and_accuracy, has_aux=True)
+        (loss, accuracy), grads = grad_fn(train_state.params)
+        train_state = train_state.apply_gradients(grads=grads)
+        metrics = dict(
+            loss=loss,
+            accuracy=accuracy,
+            learning_rate=optimizer_info['learning_rate_schedule'](train_state.step),
+            gradient_norm=global_norm(grads),
+            param_norm=global_norm(train_state.params),
+        )
+        return train_state, rng_generator(), metrics
+    def eval_step(train_state, rng, batch):
+        # Forward pass only (deterministic=True); returns (next rng, metrics).
+        rng_generator = JaxRNG(rng)
+        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
+        logits = model.apply(
+            train_state.params, batch['input_tokens'], deterministic=True,
+            rngs=rng_generator(gptj_config.rng_keys()),
+        ).logits
+        loss, accuracy = cross_entropy_loss_and_accuracy(
+            logits, batch['target_tokens'], batch['loss_masks']
+        )
+        metrics = dict(
+            eval_loss=loss,
+            eval_accuracy=accuracy,
+        )
+        return rng_generator(), metrics
+    # Evaluate init_fn abstractly to obtain the train-state structure, then
+    # derive partition specs and shard/gather functions from it.
+    train_state_shapes = jax.eval_shape(init_fn, next_rng())
+    train_state_partition = match_partition_rules(
+        GPTJConfig.get_partition_rules(), train_state_shapes
+    )
+    shard_fns, gather_fns = make_shard_and_gather_fns(
+        train_state_partition, train_state_shapes
+    )
+    # Only process 0 has checkpointing enabled.
+    checkpointer = StreamingCheckpointer(
+        FLAGS.checkpointer, logger.output_dir,
+        enable=jax.process_index() == 0,
+    )
+    sharded_init_fn = pjit(
+        init_fn,
+        in_shardings=PS(),
+        out_shardings=train_state_partition
+    )
+    sharded_create_trainstate_from_params = pjit(
+        create_trainstate_from_params,
+        in_shardings=(train_state_partition.params, ),
+        out_shardings=train_state_partition,
+        donate_argnums=(0, ),
+    )
+    # The train state and rng arguments are donated, so the values passed in
+    # must not be reused after the call.
+    sharded_train_step = pjit(
+        train_step,
+        in_shardings=(train_state_partition, PS(), PS()),
+        out_shardings=(train_state_partition, PS(), PS()),
+        donate_argnums=(0, 1),
+    )
+    sharded_eval_step = pjit(
+        eval_step,
+        in_shardings=(train_state_partition, PS(), PS()),
+        out_shardings=(PS(), PS()),
+        donate_argnums=(1,),
+    )
+    def save_checkpoint(train_state, milestone=False):
+        # Save the train state together with run metadata and the dataset state.
+        step = int(jax.device_get(train_state.step))
+        metadata = dict(
+            step=step,
+            variant=variant,
+            flags=flags_config_dict,
+            gptj_config=gptj_config.to_dict(),
+        )
+        checkpointer.save_all(
+            train_state=train_state,
+            gather_fns=gather_fns,
+            metadata=metadata,
+            dataset=dataset.get_state_dict(),
+            milestone=milestone,
+        )
+    mesh = GPTJConfig.get_jax_mesh(FLAGS.mesh_dim)
+    with mesh:
+        train_state, restored_params = None, None
+        if FLAGS.load_checkpoint != '':
+            # load_checkpoint has the form '<type>::<path>'.
+            load_type, load_path = FLAGS.load_checkpoint.split('::', 1)
+            if load_type == 'huggingface':
+                restored_params = tree_apply(
+                    shard_fns.params, gptj_config.load_pretrained(load_path)
+                )
+                train_state = None
+            else:
+                train_state, restored_params = checkpointer.load_trainstate_checkpoint(
+                    FLAGS.load_checkpoint, train_state_shapes, shard_fns
+                )
+        if train_state is None and restored_params is None:
+            # Initialize from scratch
+            train_state = sharded_init_fn(next_rng())
+        elif train_state is None and restored_params is not None:
+            # Restore from params but initialize train_state
+            train_state = sharded_create_trainstate_from_params(restored_params)
+            del restored_params
+        # Resume the step counter from the (possibly restored) train state.
+        start_step = int(jax.device_get(train_state.step))
+        # Save once before training when periodic saving is enabled.
+        if FLAGS.save_model_freq > 0:
+            save_checkpoint(train_state)
+        sharded_rng = next_rng()
+        step_counter = trange(start_step, FLAGS.total_steps, ncols=0)
+        # zip() ends the loop when either the step range or the dataset runs out.
+        for step, (batch, dataset_metrics) in zip(step_counter, dataset):
+            train_state, sharded_rng, metrics = sharded_train_step(
+                train_state, sharded_rng, batch
+            )
+            # Evaluation runs only on logging steps.
+            if step % FLAGS.log_freq == 0:
+                if FLAGS.eval_steps > 0:
+                    eval_metric_list = []
+                    for _ in range(FLAGS.eval_steps):
+                        eval_batch, _ = next(eval_iterator)
+                        sharded_rng, eval_metrics = sharded_eval_step(
+                            train_state, sharded_rng, eval_batch
+                        )
+                        eval_metric_list.append(eval_metrics)
+                    metrics.update(average_metrics(eval_metric_list))
+                log_metrics = {"step": step}
+                log_metrics.update(metrics)
+                log_metrics.update(dataset_metrics)
+                log_metrics = jax.device_get(log_metrics)
+                logger.log(log_metrics)
+                tqdm.write("\n" + pprint.pformat(log_metrics) + "\n")
+            # A milestone save takes precedence over a regular save when both
+            # frequencies hit on the same step.
+            if FLAGS.save_milestone_freq > 0 and (step + 1) % FLAGS.save_milestone_freq == 0:
+                save_checkpoint(train_state, milestone=True)
+            elif FLAGS.save_model_freq > 0 and (step + 1) % FLAGS.save_model_freq == 0:
+                save_checkpoint(train_state)
+        # Final save after the loop finishes.
+        if FLAGS.save_model_freq > 0:
+            save_checkpoint(train_state)
+if __name__ == "__main__":
+    mlxu.run(main)

EasyLM/models/llama/convert_easylm_to_hf.py ADDED Viewed

	@@ -0,0 +1,338 @@

+# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
+# Copyright 2023 Xinyang Geng
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This script converts a LLaMA model checkpoint trained by EasyLM to the
+# HuggingFace transformers LLaMA PyTorch format, which can then be loaded
+# by HuggingFace transformers.
+import gc
+import json
+import math
+import os
+import shutil
+import numpy as np
+import mlxu
+import jax
+import jax.numpy as jnp
+import flax
+from flax.traverse_util import flatten_dict
+import torch
+from transformers import LlamaConfig, LlamaForCausalLM
+from EasyLM.checkpoint import StreamingCheckpointer
+from EasyLM.jax_utils import float_tensor_to_dtype
+# Command-line flags of the converter. `load_checkpoint` and `output_dir` must
+# be non-empty and `model_size` must be a key of LLAMA_STANDARD_CONFIGS (both
+# are asserted in main).
+FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
+    load_checkpoint='',
+    tokenizer_path='',
+    model_size='13b',
+    output_dir='',
+)
+# Architecture hyperparameters keyed by the --model_size flag. write_model
+# reads every field from the selected entry to lay out the weights and to
+# build the exported LlamaConfig. All entries share vocab_size 64256; note
+# that '65b' uses norm_eps 1e-5 while every other entry uses 1e-6.
+LLAMA_STANDARD_CONFIGS = {
+    'small': {
+        'vocab_size': 64256,
+        'dim': 768,
+        'intermediate_size': 3072,
+        'n_layers': 12,
+        'n_heads': 12,
+        'norm_eps': 1e-6,
+    },
+    'medium': {
+        'vocab_size': 64256,
+        'dim': 1024,
+        'intermediate_size': 4096,
+        'n_layers': 24,
+        'n_heads': 16,
+        'norm_eps': 1e-6,
+    },
+    'large': {
+        'vocab_size': 64256,
+        'dim': 1536,
+        'intermediate_size': 6144,
+        'n_layers': 24,
+        'n_heads': 16,
+        'norm_eps': 1e-6,
+    },
+    'xlarge': {
+        'vocab_size': 64256,
+        'dim': 2048,
+        'intermediate_size': 8192,
+        'n_layers': 24,
+        'n_heads': 32,
+        'norm_eps': 1e-6,
+    },
+    '1b': {
+        'vocab_size': 64256,
+        'dim': 2048,
+        'intermediate_size': 5504,
+        'n_layers': 22,
+        'n_heads': 16,
+        'norm_eps': 1e-6,
+    },
+    '3b': {
+        'vocab_size': 64256,
+        'dim': 3200,
+        'intermediate_size': 8640,
+        'n_layers': 26,
+        'n_heads': 32,
+        'norm_eps': 1e-6,
+    },
+    '7b': {
+        'vocab_size': 64256,
+        'dim': 4096,
+        'intermediate_size': 11008,
+        'n_layers': 32,
+        'n_heads': 32,
+        'norm_eps': 1e-6,
+    },
+    '13b': {
+        'vocab_size': 64256,
+        'dim': 5120,
+        'intermediate_size': 13824,
+        'n_layers': 40,
+        'n_heads': 40,
+        'norm_eps': 1e-6,
+    },
+    '30b': {
+        'vocab_size': 64256,
+        'dim': 6656,
+        'intermediate_size': 17920,
+        'n_layers': 60,
+        'n_heads': 52,
+        'norm_eps': 1e-6,
+    },
+    '65b': {
+        'vocab_size': 64256,
+        'dim': 8192,
+        'intermediate_size': 22016,
+        'n_layers': 80,
+        'n_heads': 64,
+        'norm_eps': 1e-5,
+    },
+}
+def match_keywords(string, positives, negatives):
+    for positive in positives:
+        if positive not in string:
+            return False
+    for negative in negatives:
+        if negative in string:
+            return False
+    return True
+def load_and_convert_checkpoint(path):
+    _, flax_params = StreamingCheckpointer.load_trainstate_checkpoint(path)
+    flax_params = flatten_dict(flax_params['params'], sep='.')
+    torch_params = {}
+    for key, tensor in flax_params.items():
+        if match_keywords(key, ["kernel"], ["norm", 'ln_f']):
+            tensor = tensor.T
+        torch_params[key] = torch.tensor(
+            float_tensor_to_dtype(tensor, 'fp32'), dtype=torch.float16
+        )
+    return torch_params
+def read_json(path):
+    with open(path, "r") as f:
+        return json.load(f)
+def write_json(text, path):
+    with open(path, "w") as f:
+        json.dump(text, f)
+def write_model(loaded, model_path, model_size):
+    """Write converted weights to `model_path` in HuggingFace LLaMA format.
+
+    The weights are first saved as per-layer shard files plus an index and a
+    config in a temporary `tmp` subdirectory. They are then reloaded through
+    `LlamaForCausalLM.from_pretrained` and saved with safetensors
+    serialization into `model_path`. The temporary directory is removed at
+    the end.
+
+    Args:
+        loaded: mapping from dot-separated EasyLM parameter names
+            (e.g. "transformer.h.0.attention.wq.kernel") to torch tensors.
+        model_path: output directory; created if missing.
+        model_size: key into LLAMA_STANDARD_CONFIGS.
+    """
+    os.makedirs(model_path, exist_ok=True)
+    tmp_model_path = os.path.join(model_path, "tmp")
+    os.makedirs(tmp_model_path, exist_ok=True)
+    params = LLAMA_STANDARD_CONFIGS[model_size]
+    n_layers = params["n_layers"]
+    n_heads = params["n_heads"]
+    dim = params["dim"]
+    dims_per_head = dim // n_heads
+    # Rotary embedding inverse frequencies, stored with every layer below.
+    base = 10000.0
+    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
+    # permute for sliced rotary
+    def permute(w):
+        return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
+    param_count = 0
+    index_dict = {"weight_map": {}}
+    # One shard file per transformer layer; only the q and k projections are
+    # permuted.
+    for layer_i in range(n_layers):
+        filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin"
+        state_dict = {
+            f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
+                loaded[f"transformer.h.{layer_i}.attention.wq.kernel"]
+            ),
+            f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
+                loaded[f"transformer.h.{layer_i}.attention.wk.kernel"]
+            ),
+            f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"transformer.h.{layer_i}.attention.wv.kernel"],
+            f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"transformer.h.{layer_i}.attention.wo.kernel"],
+            f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"transformer.h.{layer_i}.feed_forward.w1.kernel"],
+            f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"transformer.h.{layer_i}.feed_forward.w2.kernel"],
+            f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"transformer.h.{layer_i}.feed_forward.w3.kernel"],
+            f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"transformer.h.{layer_i}.attention_norm.kernel"],
+            f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"transformer.h.{layer_i}.ffn_norm.kernel"],
+        }
+        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
+        for k, v in state_dict.items():
+            index_dict["weight_map"][k] = filename
+            param_count += v.numel()
+        torch.save(state_dict, os.path.join(tmp_model_path, filename))
+    filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin"
+    # Final shard: token embeddings, final norm and LM head.
+    state_dict = {
+        "model.embed_tokens.weight": loaded["transformer.wte.embedding"],
+        "model.norm.weight": loaded["transformer.ln_f.kernel"],
+        "lm_head.weight": loaded["lm_head.kernel"],
+    }
+    for k, v in state_dict.items():
+        index_dict["weight_map"][k] = filename
+        param_count += v.numel()
+    torch.save(state_dict, os.path.join(tmp_model_path, filename))
+    # Write configs
+    # total_size assumes 2 bytes per counted element (presumably fp16 -- confirm).
+    index_dict["metadata"] = {"total_size": param_count * 2}
+    write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
+    config = LlamaConfig(
+        vocab_size=params["vocab_size"],
+        hidden_size=dim,
+        intermediate_size=params["intermediate_size"],
+        num_attention_heads=params["n_heads"],
+        num_hidden_layers=params["n_layers"],
+        rms_norm_eps=params["norm_eps"],
+    )
+    config.save_pretrained(tmp_model_path)
+    # Make space so we can load the model properly now.
+    del state_dict
+    del loaded
+    gc.collect()
+    print("Loading the checkpoint in a Llama model.")
+    model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float16)
+    print("Model parameter count", model.num_parameters())
+    # Avoid saving this as part of the config.
+    del model.config._name_or_path
+    print("Saving in the Transformers format.")
+    model.save_pretrained(model_path, safe_serialization=True)
+    shutil.rmtree(tmp_model_path)
+def write_tokenizer(tokenizer_path, input_tokenizer_path):
+    print(f"Fetching the tokenizer from {input_tokenizer_path}.")
+    os.makedirs(tokenizer_path, exist_ok=True)
+    write_json(
+        {
+            "bos_token": {
+                "content": "<s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "content": "</s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "unk_token": {
+                "content": "<unk>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+        },
+        os.path.join(tokenizer_path, "special_tokens_map.json")
+    )
+    write_json(
+        {
+            "add_bos_token": True,
+            "add_eos_token": False,
+            "model_max_length": 2048,
+            "pad_token": None,
+            "sp_model_kwargs": {},
+            "tokenizer_class": "LlamaTokenizer",
+            "clean_up_tokenization_spaces": False,
+            "bos_token": {
+                "__type": "AddedToken",
+                "content": "<s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "eos_token": {
+                "__type": "AddedToken",
+                "content": "</s>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+            "unk_token": {
+                "__type": "AddedToken",
+                "content": "<unk>",
+                "lstrip": False,
+                "normalized": True,
+                "rstrip": False,
+                "single_word": False
+            },
+        },
+        os.path.join(tokenizer_path, "tokenizer_config.json"),
+    )
+    shutil.copyfile(input_tokenizer_path, os.path.join(tokenizer_path, "tokenizer.model"))
+def main(argv):
+    assert FLAGS.load_checkpoint != "" and FLAGS.output_dir != ""# and FLAGS.tokenizer_path != ""
+    assert FLAGS.model_size in LLAMA_STANDARD_CONFIGS
+    # write_tokenizer(
+    #     tokenizer_path=FLAGS.output_dir,
+    #     input_tokenizer_path=FLAGS.tokenizer_path,
+    # )
+    write_model(
+        load_and_convert_checkpoint(FLAGS.load_checkpoint),
+        model_path=FLAGS.output_dir,
+        model_size=FLAGS.model_size,
+    )
+if __name__ == "__main__":
+    mlxu.run(main)

EasyLM/models/llama/convert_hf_to_easylm.py ADDED Viewed

	@@ -0,0 +1,196 @@

+"""
+Usage:
+python convert_hf_to_easylm.py  \
+       --checkpoint_dir     /path/hf_format_dir/    \
+       --output_file /path/easylm_format.stream   \
+       --model_size 7b \
+       --streaming
+"""
+import time
+from pathlib import Path
+import argparse
+import mlxu
+import torch
+import flax
+from EasyLM.checkpoint import StreamingCheckpointer
+# Architecture hyperparameters keyed by model-size name. main() looks up
+# args.model_size here; within this script only "n_layers" (number of blocks
+# to convert) and "n_heads"/"dim" (read by inverse_permute) are consumed.
+LLAMA_STANDARD_CONFIGS = {
+    '1b': {
+        'dim': 2048,
+        'intermediate_size': 5504,
+        'n_layers': 22,
+        'n_heads': 16,
+        'norm_eps': 1e-6,
+    },
+    '3b': {
+        'dim': 3200,
+        'intermediate_size': 8640,
+        'n_layers': 26,
+        'n_heads': 32,
+        'norm_eps': 1e-6,
+    },
+    "7b": {
+        "dim": 4096,
+        "intermediate_size": 11008,
+        "n_layers": 32,
+        "n_heads": 32,
+        "norm_eps": 1e-6,
+    },
+    "13b": {
+        "dim": 5120,
+        "intermediate_size": 13824,
+        "n_layers": 40,
+        "n_heads": 40,
+        "norm_eps": 1e-6,
+    },
+    "30b": {
+        "dim": 6656,
+        "intermediate_size": 17920,
+        "n_layers": 60,
+        "n_heads": 52,
+        "norm_eps": 1e-6,
+    },
+    "65b": {
+        "dim": 8192,
+        "intermediate_size": 22016,
+        "n_layers": 80,
+        "n_heads": 64,
+        "norm_eps": 1e-5,
+    },
+}
+def inverse_permute(params, w):
+    n_layers = params["n_layers"]
+    n_heads = params["n_heads"]
+    dim = params["dim"]
+    reshaped_w = w.reshape(n_heads, 2, dim // n_heads // 2, dim)
+    transposed_w = reshaped_w.transpose(0, 2, 1, 3)
+    inverted_w = transposed_w.reshape(dim, dim)
+    return inverted_w
+def main(args):
+    start = time.time()
+    params = LLAMA_STANDARD_CONFIGS[args.model_size]
+    ckpt_paths = sorted(Path(args.checkpoint_dir).glob("*.bin"))
+    ckpt = {}
+    for i, ckpt_path in enumerate(ckpt_paths):
+        checkpoint = torch.load(ckpt_path, map_location="cpu")
+        for k, v in checkpoint.items():
+            if k.startswith("model."):
+                k = k[6:]
+            ckpt[k] = v
+    print(f"Start convert weight to easylm format...")
+    jax_weights = {
+        "transformer": {
+            "wte": {"embedding": ckpt["embed_tokens.weight"].numpy()},
+            "ln_f": {"kernel": ckpt["norm.weight"].numpy()},
+            "h": {
+                "%d"
+                % (layer): {
+                    "attention": {
+                        "wq": {
+                            "kernel": inverse_permute(
+                                params,
+                                ckpt[f"layers.{layer}.self_attn.q_proj.weight"].numpy(),
+                            ).transpose()
+                        },
+                        "wk": {
+                            "kernel": inverse_permute(
+                                params,
+                                ckpt[f"layers.{layer}.self_attn.k_proj.weight"].numpy(),
+                            ).transpose()
+                        },
+                        "wv": {
+                            "kernel": ckpt[f"layers.{layer}.self_attn.v_proj.weight"]
+                            .numpy()
+                            .transpose()
+                        },
+                        "wo": {
+                            "kernel": ckpt[f"layers.{layer}.self_attn.o_proj.weight"]
+                            .numpy()
+                            .transpose()
+                        },
+                    },
+                    "feed_forward": {
+                        "w1": {
+                            "kernel": ckpt[f"layers.{layer}.mlp.gate_proj.weight"]
+                            .numpy()
+                            .transpose()
+                        },
+                        "w2": {
+                            "kernel": ckpt[f"layers.{layer}.mlp.down_proj.weight"]
+                            .numpy()
+                            .transpose()
+                        },
+                        "w3": {
+                            "kernel": ckpt[f"layers.{layer}.mlp.up_proj.weight"]
+                            .numpy()
+                            .transpose()
+                        },
+                    },
+                    "attention_norm": {
+                        "kernel": ckpt[f"layers.{layer}.input_layernorm.weight"].numpy()
+                    },
+                    "ffn_norm": {
+                        "kernel": ckpt[
+                            f"layers.{layer}.post_attention_layernorm.weight"
+                        ].numpy()
+                    },
+                }
+                for layer in range(params["n_layers"])
+            },
+        },
+        "lm_head": {"kernel": ckpt["lm_head.weight"].numpy().transpose()},
+    }
+    print(f"Convert weight to easylm format finished...")
+    print(f"Start to save...")
+    if args.streaming:
+        StreamingCheckpointer.save_train_state_to_file(jax_weights, args.output_file)
+    else:
+        with mlxu.open_file(args.output_file, "wb") as fout:
+            fout.write(flax.serialization.msgpack_serialize(jax_weights, in_place=True))
+    print(
+        f"Save finished!!! take time: {time.time() - start} save path: {args.output_file}"
+    )
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="hf to easylm format script")
+    parser.add_argument(
+        "--checkpoint_dir",
+        type=str,
+        help="Need to be converted model weight dir. it is a dir",
+    )
+    parser.add_argument(
+        "--output_file", type=str, help="Save model weight file path, it is a file."
+    )
+    parser.add_argument(
+        "--model_size",
+        type=str,
+        default="7b",
+        choices=["7b", "13b", "30b", "65b"],
+        help="model size",
+    )
+    parser.add_argument(
+        "--streaming",
+        action="store_true",
+        default=True,
+        help="whether is model weight saved stream format",
+    )
+    args = parser.parse_args()
+    print(f"checkpoint_dir: {args.checkpoint_dir}")
+    print(f"output_file: {args.output_file}")
+    print(f"model_size: {args.model_size}")
+    print(f"streaming: {args.streaming}")
+    main(args)

EasyLM/models/llama/convert_torch_to_easylm.py ADDED Viewed

	@@ -0,0 +1,68 @@

+# This script converts the standard LLaMA PyTorch checkpoint released by Meta
+# to the EasyLM checkpoint format. The converted checkpoint can then be loaded
+# by EasyLM for fine-tuning or inference.
+# This script is largely borrowed from https://github.com/Sea-Snell/JAX_llama
+from pathlib import Path
+import json
+import numpy as np
+import torch
+import flax
+import mlxu
+from EasyLM.checkpoint import StreamingCheckpointer
+FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
+    checkpoint_dir='',
+    output_file='',
+    streaming=True,
+)
+def main(argv):
+    ckpt_paths = sorted(Path(FLAGS.checkpoint_dir).glob("*.pth"))
+    ckpts = {}
+    for i, ckpt_path in enumerate(ckpt_paths):
+        checkpoint = torch.load(ckpt_path, map_location="cpu")
+        ckpts[int(ckpt_path.name.split('.', maxsplit=2)[1])] = checkpoint
+    ckpts = [ckpts[i] for i in sorted(list(ckpts.keys()))]
+    with open(Path(FLAGS.checkpoint_dir) / "params.json", "r") as f:
+        params = json.loads(f.read())
+    jax_weights = {
+        'transformer': {
+            'wte': {'embedding': np.concatenate([ckpt['tok_embeddings.weight'].numpy() for ckpt in ckpts], axis=1)},
+            'ln_f': {'kernel': ckpts[0]['norm.weight'].numpy()},
+            'h': {
+                '%d' % (layer): {
+                    'attention': {
+                        'wq': {'kernel': np.concatenate([ckpt['layers.%d.attention.wq.weight' % (layer)].numpy() for ckpt in ckpts], axis=0).transpose()},
+                        'wk': {'kernel': np.concatenate([ckpt['layers.%d.attention.wk.weight' % (layer)].numpy() for ckpt in ckpts], axis=0).transpose()},
+                        'wv': {'kernel': np.concatenate([ckpt['layers.%d.attention.wv.weight' % (layer)].numpy() for ckpt in ckpts], axis=0).transpose()},
+                        'wo': {'kernel': np.concatenate([ckpt['layers.%d.attention.wo.weight' % (layer)].numpy() for ckpt in ckpts], axis=1).transpose()},
+                    },
+                    'feed_forward': {
+                        'w1': {'kernel': np.concatenate([ckpt['layers.%d.feed_forward.w1.weight' % (layer)].numpy() for ckpt in ckpts], axis=0).transpose()},
+                        'w2': {'kernel': np.concatenate([ckpt['layers.%d.feed_forward.w2.weight' % (layer)].numpy() for ckpt in ckpts], axis=1).transpose()},
+                        'w3': {'kernel': np.concatenate([ckpt['layers.%d.feed_forward.w3.weight' % (layer)].numpy() for ckpt in ckpts], axis=0).transpose()},
+                    },
+                    'attention_norm': {'kernel': ckpts[0]['layers.%d.attention_norm.weight' % (layer)].numpy()},
+                    'ffn_norm': {'kernel': ckpts[0]['layers.%d.ffn_norm.weight' % (layer)].numpy()},
+                }
+            for layer in range(params['n_layers'])},
+        },
+        'lm_head': {'kernel': np.concatenate([ckpt['output.weight'].numpy() for ckpt in ckpts], axis=0).transpose()},
+    }
+    if FLAGS.streaming:
+        StreamingCheckpointer.save_train_state_to_file(
+            jax_weights, FLAGS.output_file
+        )
+    else:
+        with mlxu.open_file(FLAGS.output_file, 'wb') as fout:
+            fout.write(flax.serialization.msgpack_serialize(jax_weights, in_place=True))
+if __name__ == '__main__':
+    mlxu.run(main)

EasyLM/models/llama/llama_model.py ADDED Viewed

	@@ -0,0 +1,1530 @@

+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple, Union
+import json
+import tempfile
+from functools import partial
+import numpy as np
+import jax
+import jax.numpy as jnp
+from jax import lax
+from jax.sharding import PartitionSpec as PS
+import flax.linen as nn
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from flax.linen import partitioning as nn_partitioning
+import einops
+import sentencepiece as spm
+from transformers import AutoTokenizer
+from transformers.convert_slow_tokenizer import import_protobuf
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
+from transformers.modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
+from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from ml_collections import ConfigDict
+from ml_collections.config_dict import config_dict
+from mlxu import function_args_to_config, load_pickle, open_file
+from EasyLM.bpt import blockwise_ffn, blockwise_attn
+from EasyLM.jax_utils import (
+    with_sharding_constraint, get_jax_mesh, get_gradient_checkpoint_policy
+)
+# Preset hyperparameters keyed by size name. LLaMAConfig.load_config() returns
+# cls.from_dict(...) of the matching entry when its argument is one of these
+# keys. Every preset uses vocab_size 64256 and max_sequence_length 2048; only
+# the '65b' preset uses rms_norm_eps 1e-5 instead of 1e-6.
+LLAMA_STANDARD_CONFIGS = {
+    'small': {
+        'vocab_size': 64256,
+        'hidden_size': 768,
+        'intermediate_size': 3072,
+        'num_hidden_layers': 12,
+        'num_attention_heads': 12,
+        'max_sequence_length': 2048,
+        'initializer_range': 0.02,
+        'rms_norm_eps': 1e-6,
+        'use_cache': True,
+        'tie_word_embeddings': False,
+    },
+    'medium': {
+        'vocab_size': 64256,
+        'hidden_size': 1024,
+        'intermediate_size': 4096,
+        'num_hidden_layers': 24,
+        'num_attention_heads': 16,
+        'max_sequence_length': 2048,
+        'initializer_range': 0.02,
+        'rms_norm_eps': 1e-6,
+        'use_cache': True,
+        'tie_word_embeddings': False,
+    },
+    'large': {
+        'vocab_size': 64256,
+        'hidden_size': 1536,
+        'intermediate_size': 6144,
+        'num_hidden_layers': 24,
+        'num_attention_heads': 16,
+        'max_sequence_length': 2048,
+        'initializer_range': 0.02,
+        'rms_norm_eps': 1e-6,
+        'use_cache': True,
+        'tie_word_embeddings': False,
+    },
+    'xlarge': {
+        'vocab_size': 64256,
+        'hidden_size': 2048,
+        'intermediate_size': 8192,
+        'num_hidden_layers': 24,
+        'num_attention_heads': 32,
+        'max_sequence_length': 2048,
+        'initializer_range': 0.02,
+        'rms_norm_eps': 1e-6,
+        'use_cache': True,
+        'tie_word_embeddings': False,
+    },
+    '1b': {
+        'vocab_size': 64256,
+        'hidden_size': 2048,
+        'intermediate_size': 5504,
+        'num_hidden_layers': 22,
+        'num_attention_heads': 16,
+        'max_sequence_length': 2048,
+        'initializer_range': 0.02,
+        'rms_norm_eps': 1e-6,
+        'use_cache': True,
+        'tie_word_embeddings': False,
+    },
+    '3b': {
+        'vocab_size': 64256,
+        'hidden_size': 3200,
+        'intermediate_size': 8640,
+        'num_hidden_layers': 26,
+        'num_attention_heads': 32,
+        'max_sequence_length': 2048,
+        'initializer_range': 0.02,
+        'rms_norm_eps': 1e-6,
+        'use_cache': True,
+        'tie_word_embeddings': False,
+    },
+    '7b': {
+        'vocab_size': 64256,
+        'hidden_size': 4096,
+        'intermediate_size': 11008,
+        'num_hidden_layers': 32,
+        'num_attention_heads': 32,
+        'max_sequence_length': 2048,
+        'initializer_range': 0.02,
+        'rms_norm_eps': 1e-6,
+        'use_cache': True,
+        'tie_word_embeddings': False,
+    },
+    '13b': {
+        'vocab_size': 64256,
+        'hidden_size': 5120,
+        'intermediate_size': 13824,
+        'num_hidden_layers': 40,
+        'num_attention_heads': 40,
+        'max_sequence_length': 2048,
+        'initializer_range': 0.02,
+        'rms_norm_eps': 1e-6,
+        'use_cache': True,
+        'tie_word_embeddings': False,
+    },
+    '30b': {
+        'vocab_size': 64256,
+        'hidden_size': 6656,
+        'intermediate_size': 17920,
+        'num_hidden_layers': 60,
+        'num_attention_heads': 52,
+        'max_sequence_length': 2048,
+        'initializer_range': 0.02,
+        'rms_norm_eps': 1e-6,
+        'use_cache': True,
+        'tie_word_embeddings': False,
+    },
+    '65b': {
+        'vocab_size': 64256,
+        'hidden_size': 8192,
+        'intermediate_size': 22016,
+        'num_hidden_layers': 80,
+        'num_attention_heads': 64,
+        'max_sequence_length': 2048,
+        'initializer_range': 0.02,
+        'rms_norm_eps': 1e-5,
+        'use_cache': True,
+        'tie_word_embeddings': False,
+    },
+    'debug': { # A small model for debugging
+        'vocab_size': 64256,
+        'hidden_size': 128,
+        'intermediate_size': 256,
+        'num_hidden_layers': 2,
+        'num_attention_heads': 4,
+        'max_sequence_length': 2048,
+        'initializer_range': 0.02,
+        'rms_norm_eps': 1e-6,
+        'use_cache': True,
+        'tie_word_embeddings': False,
+    },
+}
+class LLaMAConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`~LLaMAModel`]. It is used to instantiate an LLaMA
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the LLaMA-7B.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`~LLaMAModel`] or [`~TFLLaMAModel`].
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_sequence_length (`int`, *optional*, defaults to 2048):
+            Max sequence length for model (for RoPE computation)
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        Example:
+    ```python
+    >>> from transformers import LLaMAModel, LLaMAConfig
+    >>> # Initializing a LLaMA llama-7b style configuration
+    >>> configuration = LLaMAConfig()
+    >>> # Initializing a model from the llama-7b style configuration
+    >>> model = LLaMAModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "llama"
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        max_sequence_length=2048,
+        rms_norm_eps=1e-6,
+        initializer_range=0.02,
+        use_cache=True,
+        # pad_token_id=-1,
+        bos_token_id=0,
+        eos_token_id=1,
+        resid_pdrop=0.0,
+        embd_pdrop=0.0,
+        attn_pdrop=0.0,
+        tie_word_embeddings=False,
+        remat_block='nothing_saveable',
+        remat_attention='',
+        remat_mlp='',
+        scan_attention=False,
+        scan_mlp=False,
+        scan_query_chunk_size=1024,
+        scan_key_chunk_size=1024,
+        scan_mlp_chunk_size=1024,
+        fcm_min_ratio=0.0,
+        fcm_max_ratio=0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.max_sequence_length = max_sequence_length
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.remat_block = remat_block
+        self.remat_attention = remat_attention
+        self.remat_mlp = remat_mlp
+        self.scan_attention = scan_attention
+        self.scan_mlp = scan_mlp
+        self.scan_query_chunk_size = scan_query_chunk_size
+        self.scan_key_chunk_size = scan_key_chunk_size
+        self.scan_mlp_chunk_size = scan_mlp_chunk_size
+        self.fcm_min_ratio = fcm_min_ratio
+        self.fcm_max_ratio = fcm_max_ratio
+        super().__init__(
+            # pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+    @classmethod
+    def get_default_config(cls, updates=None):
+        config = function_args_to_config(cls.__init__)
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    @staticmethod
+    def get_jax_mesh(axis_dims):
+        return get_jax_mesh(axis_dims, ('dp', 'fsdp', 'mp'))
+    @staticmethod
+    def get_partition_rules():
+        """ Parition rules for GPTJ. Note that these rules are orderd, so that
+            the beginning rules match first. It is important to use
+            PartitionSpec() instead of None here because JAX does not treat
+            None as a pytree leaf.
+        """
+        return (
+            # embeddings
+            ("transformer/wte/embedding", PS("mp", "fsdp")),
+            # atention
+            ("attention/(wq|wk|wv)/kernel", PS("fsdp", "mp")),
+            ("attention/wo/kernel", PS("mp", "fsdp")),
+            # mlp
+            ("feed_forward/w1/kernel", PS("fsdp", "mp")),
+            ("feed_forward/w2/kernel", PS("mp", "fsdp")),
+            ("feed_forward/w3/kernel", PS("fsdp", "mp")),
+            # layer norms
+            ("attention_norm/kernel", PS(None)),
+            ("ffn_norm/kernel", PS(None)),
+            # output head
+            ("transformer/ln_f/kernel", PS(None)),
+            ("lm_head/kernel", PS("fsdp", "mp")),
+            ('.*', PS(None)),
+        )
+    @staticmethod
+    def get_weight_decay_exclusions():
+        return (
+            "attention_norm/kernel",
+            "ffn_norm/kernel",
+            "transformer/ln_f/kernel",
+        )
+    @staticmethod
+    def rng_keys():
+        return ('params', 'dropout', 'fcm')
+    @staticmethod
+    def get_tokenizer_config(updates=None):
+        config = ConfigDict()
+        config.vocab_file = ''
+        config.pretrained_model_name_or_path = ''
+        config.add_bos_token = False
+        config.add_eos_token = False
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    @classmethod
+    def get_tokenizer(cls, config, padding_side='left', truncation_side='right'):
+        config = cls.get_tokenizer_config(config)
+        if config.vocab_file == '':
+            assert config.pretrained_model_name_or_path != '', 'vocab_file or pretrained_model_name_or_path must be specified'
+        if config.pretrained_model_name_or_path != '':
+            tokenizer = AutoTokenizer.from_pretrained(
+                config.pretrained_model_name_or_path,
+                add_bos_token=config.add_bos_token,
+                add_eos_token=config.add_eos_token,
+                padding_side=padding_side,
+                truncation_side=truncation_side,
+            )
+        else:
+            tokenizer = LlamaTokenizer(
+                vocab_file=config.vocab_file,
+                add_bos_token=config.add_bos_token,
+                add_eos_token=config.add_eos_token,
+                padding_side=padding_side,
+                truncation_side=truncation_side,
+            )
+        return tokenizer
+    @classmethod
+    def load_config(cls, path):
+        if path in LLAMA_STANDARD_CONFIGS:
+            return cls.from_dict(LLAMA_STANDARD_CONFIGS[path])
+        load_type, load_path = path.split('::', 1)
+        if load_type == 'pickle':
+            return cls.from_dict(load_pickle(load_path)['llama_config'])
+        elif load_type == 'json':
+            with open_file(load_path, 'r') as fin:
+                raw_config = fin.read()
+            return cls.from_dict(json.loads(raw_config))
+        else:
+            raise ValueError(f'Unsupported load config type: {load_type}')
+remat = nn_partitioning.remat
+logger = logging.get_logger(__name__)
+class RMSNorm(nn.Module):
+    dim: int
+    eps: float=1e-6
+    dtype: jnp.dtype=jnp.float32
+    param_dtype: jnp.dtype=jnp.float32
+    def setup(self) -> None:
+        self.weight = self.param(
+            'kernel',
+            nn.initializers.ones,
+            (self.dim,),
+            self.param_dtype,
+        )
+    def _norm(self, x: jnp.ndarray) -> jnp.ndarray:
+        return x * jax.lax.rsqrt(jnp.square(x).mean(-1, keepdims=True) + self.eps)
+    def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
+        x = x.astype(jnp.promote_types(self.dtype, jnp.float32))
+        output = self._norm(x).astype(self.dtype)
+        weight = jnp.asarray(self.weight, self.dtype)
+        return output * weight
+def precompute_freqs_cis(dim: int, end: int, theta: float=10000.0, dtype: jnp.dtype=jnp.float32) -> jnp.ndarray:
+    freqs = 1.0 / (theta ** (np.arange(0, dim, 2)[: (dim // 2)].astype(dtype) / dim))
+    t = np.arange(end)  # type: ignore
+    freqs = np.outer(t, freqs).astype(dtype)  # type: ignore
+    sin, cos = np.sin(freqs), np.cos(freqs)
+    freqs_cis = np.complex64(cos + 1j * sin)
+    return jnp.asarray(freqs_cis)
+def apply_rotary_emb(
+    xq: jnp.ndarray,
+    xk: jnp.ndarray,
+    freqs_cis: jnp.ndarray,
+    dtype: jnp.dtype=jnp.float32,
+) -> Tuple[jnp.ndarray, jnp.ndarray]:
+    reshape_xq = xq.astype(jnp.float32).reshape(*xq.shape[:-1], -1, 2)
+    reshape_xk = xk.astype(jnp.float32).reshape(*xk.shape[:-1], -1, 2)
+    xq_ = jax.lax.complex(reshape_xq[..., 0], reshape_xq[..., 1])
+    xk_ = jax.lax.complex(reshape_xk[..., 0], reshape_xk[..., 1])
+    # add head dim
+    freqs_cis = jnp.reshape(freqs_cis, (*freqs_cis.shape[:2], 1, *freqs_cis.shape[2:]))
+    xq_out = xq_ * freqs_cis
+    xq_out = jnp.stack((jnp.real(xq_out), jnp.imag(xq_out)), axis=-1).reshape(*xq_out.shape[:-1], -1)
+    xk_out = xk_ * freqs_cis
+    xk_out = jnp.stack((jnp.real(xk_out), jnp.imag(xk_out)), axis=-1).reshape(*xk_out.shape[:-1], -1)
+    return xq_out.astype(dtype), xk_out.astype(dtype)
+class FlaxLLaMAAttention(nn.Module):
+    config: LLaMAConfig
+    dtype: jnp.dtype=jnp.float32
+    param_dtype: jnp.dtype=jnp.float32
+    precision: Optional[Union[jax.lax.Precision, str]]=None
+    def setup(self):
+        config = self.config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.wq = nn.Dense(
+            config.num_attention_heads*self.head_dim,
+            dtype=self.dtype,
+            param_dtype=self.param_dtype,
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            precision=self.precision,
+        )
+        self.wk = nn.Dense(
+            config.num_attention_heads*self.head_dim,
+            dtype=self.dtype,
+            param_dtype=self.param_dtype,
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            precision=self.precision,
+        )
+        self.wv = nn.Dense(
+            config.num_attention_heads*self.head_dim,
+            dtype=self.dtype,
+            param_dtype=self.param_dtype,
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            precision=self.precision,
+        )
+        self.wo = nn.Dense(
+            config.hidden_size,
+            dtype=self.dtype,
+            param_dtype=self.param_dtype,
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            precision=self.precision,
+        )
+        self.resid_dropout = nn.Dropout(rate=config.resid_pdrop)
+        self.causal_mask = make_causal_mask(jnp.ones((1, config.max_sequence_length), dtype="bool"), dtype="bool")
+        self.freqs_cis = precompute_freqs_cis(
+            self.head_dim,
+            config.max_sequence_length * 2,
+            dtype=self.dtype,
+        )
+    def _split_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
+    def _merge_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
+    @nn.compact
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slightly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+        If the "cache" collection has no ``cached_key`` yet, the cache variables are created (zeros shaped like
+        ``key``/``value`` and an index of 0) and the inputs are returned unchanged. Otherwise the new states are
+        written into the cache at ``cache_index``, the index advances by ``query.shape[1]``, and the returned
+        ``attention_mask`` is combined with a mask over the cache positions filled so far.
+        Returns:
+            Tuple ``(key, value, attention_mask)``.
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+        if is_initialized:
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            # write offset: 0 on every batch axis, cur_index on the length axis, 0 on heads/depth
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        fcm_mask=None,
+    ):
+        # Self-attention forward pass.
+        # Returns `(attn_output,)`, or `(attn_output, attn_weights)` when
+        # `output_attentions` is set. `attn_weights` is None on the blockwise path.
+        # Project to queries/keys/values and pin their sharding before splitting heads.
+        xq, xk, xv = self.wq(hidden_states), self.wk(hidden_states), self.wv(hidden_states)
+        xq = with_sharding_constraint(xq, PS(("dp", "fsdp"), None, "mp"))
+        xk = with_sharding_constraint(xk, PS(("dp", "fsdp"), None, "mp"))
+        xv = with_sharding_constraint(xv, PS(("dp", "fsdp"), None, "mp"))
+        xq = self._split_heads(xq)
+        xk = self._split_heads(xk)
+        xv = self._split_heads(xv)
+        # Gather the precomputed rotary frequencies for each position id and
+        # apply the rotary embedding helper to queries and keys (values are untouched).
+        freqs_cis = jnp.take(self.freqs_cis, position_ids, axis=0)
+        xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis, dtype=self.dtype)
+        # A dropout RNG is only requested when attention dropout is actually active.
+        dropout_rng = None
+        if not deterministic and self.config.attn_pdrop > 0.0:
+            dropout_rng = self.make_rng("dropout")
+        # Blockwise path: used only when scan_attention is enabled and no KV cache
+        # exists or is being initialised.
+        if self.config.scan_attention and not (self.has_variable("cache", "cached_key") or init_cache):
+            # Autoregressive decoding does not need blockwise attention because it has no quadratic memory cost.
+            # Keep the attention mask un-materialized (no n x n tensor); blockwise_attn handles the rest.
+            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
+            # transform boolean mask into float mask
+            # (0 where attending is allowed, the most negative finite value of self.dtype elsewhere)
+            attention_bias = lax.select(
+                attention_mask > 0,
+                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+            )
+            # No attention weights are produced on this path.
+            attn_weights = None
+            attn_output = blockwise_attn(
+                xq,
+                xk,
+                xv,
+                bias=attention_bias,
+                deterministic=deterministic,
+                dropout_rng=dropout_rng,
+                attn_pdrop=self.config.attn_pdrop,
+                causal=True,
+                query_chunk_size=self.config.scan_query_chunk_size,
+                key_chunk_size=self.config.scan_key_chunk_size,
+                dtype=self.dtype,
+                policy=get_gradient_checkpoint_policy('nothing_saveable'),
+                precision=self.precision,
+                float32_logits=True,
+                prevent_cse=True,
+            )
+            attn_output = with_sharding_constraint(attn_output, PS(("dp", "fsdp"), None, "mp", None))
+        else:
+            # Standard path: build the full mask, optionally extend the KV cache,
+            # then compute explicit attention weights.
+            query_length, key_length = xq.shape[1], xk.shape[1]
+            if self.has_variable("cache", "cached_key"):
+                # With a cache, the causal-mask window starts at the current cache
+                # index and spans the full cached key length.
+                mask_shift = self.variables["cache"]["cache_index"]
+                max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+                causal_mask = lax.dynamic_slice(
+                    self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+                )
+            else:
+                causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+            batch_size = hidden_states.shape[0]
+            causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+            # Broadcast the caller's mask to the causal mask's shape, then merge it
+            # with the causal mask and the optional fcm_mask via combine_masks.
+            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+            attention_mask = combine_masks(attention_mask, causal_mask, fcm_mask)
+            # During fast autoregressive decoding, we feed one position at a time,
+            # and cache the keys and values step by step.
+            if self.has_variable("cache", "cached_key") or init_cache:
+                xk, xv, attention_mask = self._concatenate_to_cache(xk, xv, xq, attention_mask)
+            # transform boolean mask into float mask
+            attention_bias = lax.select(
+                attention_mask > 0,
+                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+            )
+            # Attention weights are computed in the promotion of self.dtype with float32.
+            attn_weights = dot_product_attention_weights(
+                xq,
+                xk,
+                bias=attention_bias,
+                dropout_rng=dropout_rng,
+                dropout_rate=self.config.attn_pdrop,
+                deterministic=deterministic,
+                dtype=jnp.promote_types(self.dtype, jnp.float32),
+                precision=self.precision,
+            )
+            attn_weights = with_sharding_constraint(attn_weights, PS(("dp", "fsdp"), "mp", None, None))
+            # Weighted sum of values: (heads, query, key) x (key, heads, dim) -> (query, heads, dim).
+            attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, xv, precision=self.precision)
+        # Merge heads, apply the output projection, then residual dropout.
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.wo(attn_output)
+        attn_output = self.resid_dropout(attn_output, deterministic=deterministic)
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
+        return outputs
+class FlaxLLaMAMLP(nn.Module):
+    config: LLaMAConfig
+    dtype: jnp.dtype=jnp.float32
+    param_dtype: jnp.dtype=jnp.float32
+    precision: Optional[Union[jax.lax.Precision, str]]=None
+    def setup(self) -> None:
+        config = self.config
+        self.w1 = nn.Dense(
+            config.intermediate_size,
+            dtype=self.dtype,
+            param_dtype=self.param_dtype,
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            precision=self.precision,
+        )
+        self.w2 = nn.Dense(
+            config.hidden_size,
+            dtype=self.dtype,
+            param_dtype=self.param_dtype,
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            precision=self.precision,
+        )
+        self.w3 = nn.Dense(
+            config.intermediate_size,
+            dtype=self.dtype,
+            param_dtype=self.param_dtype,
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            precision=self.precision,
+        )
+        self.dropout = nn.Dropout(rate=self.config.resid_pdrop)
+    def __call__(self, x: jnp.ndarray, deterministic: bool = True) -> jnp.ndarray:
+        x = self.w2(nn.silu(self.w1(x)) * self.w3(x))
+        x = self.dropout(x, deterministic=deterministic)
+        return x
+# One transformer block: normalized self-attention added back to the input,
+# followed by a normalized feed-forward added back to the result.
+class FlaxLLaMABlock(nn.Module):
+    config: LLaMAConfig
+    dtype: jnp.dtype=jnp.float32
+    param_dtype: jnp.dtype=jnp.float32
+    precision: Optional[Union[jax.lax.Precision, str]]=None
+    def setup(self) -> None:
+        # Build the attention and feed-forward submodules and their two RMSNorm layers.
+        attention_module = FlaxLLaMAAttention
+        mlp_module = FlaxLLaMAMLP
+        # A non-empty remat_attention / remat_mlp string wraps the submodule class
+        # in `remat` with the gradient-checkpoint policy of that name.
+        # NOTE(review): static_argnums presumably index the positional call
+        # arguments with `self` counted as 0 (deterministic, init_cache,
+        # output_attentions for attention; deterministic for the MLP) -- confirm.
+        if self.config.remat_attention != '':
+            attention_module = remat(
+                FlaxLLaMAAttention, static_argnums=(3, 4, 5),
+                policy=get_gradient_checkpoint_policy(self.config.remat_attention),
+                prevent_cse=True,
+            )
+        if self.config.remat_mlp != '':
+            mlp_module = remat(
+                FlaxLLaMAMLP, static_argnums=(1,),
+                policy=get_gradient_checkpoint_policy(self.config.remat_mlp),
+                prevent_cse=True,
+            )
+        self.attention = attention_module(
+            self.config,
+            dtype=self.dtype,
+            param_dtype=self.param_dtype,
+            precision=self.precision,
+        )
+        self.feed_forward = mlp_module(
+            self.config,
+            dtype=self.dtype,
+            param_dtype=self.param_dtype,
+            precision=self.precision,
+        )
+        self.attention_norm = RMSNorm(
+            self.config.hidden_size,
+            eps=self.config.rms_norm_eps,
+            dtype=self.dtype,
+            param_dtype=self.param_dtype,
+        )
+        self.ffn_norm = RMSNorm(
+            self.config.hidden_size,
+            eps=self.config.rms_norm_eps,
+            dtype=self.dtype,
+            param_dtype=self.param_dtype,
+        )
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        fcm_mask: Optional[jnp.ndarray] = None,
+    ):
+        # Returns `(hidden_states,)` followed by whatever extra items the
+        # attention module returned after its output.
+        # Arguments are passed to the attention module positionally.
+        # NOTE(review): presumably required so the remat static_argnums in
+        # setup() line up with these positions -- confirm before reordering.
+        attn_outputs = self.attention(
+            self.attention_norm(hidden_states),
+            attention_mask,
+            position_ids,
+            deterministic,
+            init_cache,
+            output_attentions,
+            fcm_mask,
+        )
+        attn_output = attn_outputs[0]
+        # First residual connection (around attention).
+        hidden_states = hidden_states + attn_output
+        feed_forward_input = self.ffn_norm(hidden_states)
+        # With scan_mlp enabled the feed-forward is run through blockwise_ffn
+        # using chunks of scan_mlp_chunk_size; otherwise it is called directly.
+        if self.config.scan_mlp:
+            feed_forward_hidden_states = blockwise_ffn(
+                self.feed_forward,
+                feed_forward_input,
+                self.config.scan_mlp_chunk_size,
+                deterministic,
+            )
+        else:
+            feed_forward_hidden_states = self.feed_forward(
+                feed_forward_input,
+                deterministic,
+            )
+        feed_forward_hidden_states = with_sharding_constraint(feed_forward_hidden_states, PS(("dp", "fsdp"), None, "mp"))
+        # Second residual connection (around the feed-forward).
+        hidden_states = hidden_states + feed_forward_hidden_states
+        return (hidden_states,) + attn_outputs[1:]
+class FlaxLLaMAPreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+    config_class = LLaMAConfig
+    base_model_prefix = "transformer"
+    module_class: nn.Module = None
+    def __init__(
+        self,
+        config: LLaMAConfig,
+        input_shape: Tuple = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensors
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+        if self.config.add_cross_attention:
+            encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,))
+            encoder_attention_mask = attention_mask
+            module_init_outputs = self.module.init(
+                rngs,
+                input_ids,
+                attention_mask,
+                position_ids,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                return_dict=False,
+            )
+        else:
+            module_init_outputs = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)
+        random_params = module_init_outputs["params"]
+        if params is not None:
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+        """
+        # init input variables to retrieve cache
+        input_ids = jnp.ones((batch_size, max_length))
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return init_variables["cache"]
+    @add_start_docstrings_to_model_forward("")
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        params: dict = None,
+        past_key_values: dict = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        batch_size, sequence_length = input_ids.shape
+        if position_ids is None:
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+        if attention_mask is None:
+            attention_mask = jnp.ones((batch_size, sequence_length))
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+        inputs = {"params": params or self.params}
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be changed by FlaxGPTJAttention module
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+        outputs = self.module.apply(
+            inputs,
+            jnp.array(input_ids, dtype="i4"),
+            jnp.array(attention_mask, dtype="i4"),
+            jnp.array(position_ids, dtype="i4"),
+            not train,
+            False,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            rngs=rngs,
+            mutable=mutable,
+        )
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs, past_key_values = outputs
+            outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past_key_values = outputs
+            outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
+        return outputs
+class FlaxLLaMABlockCollection(nn.Module):
+    config: LLaMAConfig
+    dtype: jnp.dtype = jnp.float32
+    param_dtype: jnp.dtype=jnp.float32
+    precision: Optional[Union[jax.lax.Precision, str]]=None
+    def setup(self):
+        block = FlaxLLaMABlock
+        if self.config.remat_block != '':
+            block = remat(
+                FlaxLLaMABlock, static_argnums=(3, 4, 5),
+                policy=get_gradient_checkpoint_policy(self.config.remat_block)
+            )
+        self.blocks = [
+            block(
+                self.config,
+                name=str(i),
+                dtype=self.dtype,
+                param_dtype=self.param_dtype,
+                precision=self.precision
+            ) for i in range(self.config.num_hidden_layers)
+        ]
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        if not deterministic and self.config.fcm_max_ratio > 0:
+            # Apply forgetful causal mask
+            batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1]
+            fcm_ratio = jax.random.uniform(
+                self.make_rng('fcm'), shape=(batch_size, 1, 1, 1),
+                minval=self.config.fcm_min_ratio,
+                maxval=self.config.fcm_max_ratio
+            )
+            fcm_mask = jax.random.uniform(
+                self.make_rng('fcm'),
+                shape=(batch_size, 1, 1, seq_length)
+            ) > fcm_ratio
+            fcm_mask = fcm_mask.at[:, :, :, 0].set(True)
+            fcm_mask = fcm_mask.astype('bool')
+        else:
+            fcm_mask = None
+        for block in self.blocks:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            layer_outputs = block(
+                hidden_states,
+                attention_mask,
+                position_ids,
+                deterministic,
+                init_cache,
+                output_attentions,
+                fcm_mask,
+            )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+        # this contains possible `None` values - `FlaxGPTJModule` will filter them out
+        outputs = (hidden_states, all_hidden_states, all_attentions)
+        return outputs
+class FlaxLLaMAModule(nn.Module):
+    config: LLaMAConfig
+    dtype: jnp.dtype = jnp.float32
+    param_dtype: jnp.dtype=jnp.float32
+    precision: Optional[Union[jax.lax.Precision, str]]=None
+    def setup(self):
+        self.embed_dim = self.config.hidden_size
+        self.wte = nn.Embed(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            dtype=self.dtype,
+            param_dtype=self.param_dtype,
+        )
+        self.dropout = nn.Dropout(rate=self.config.embd_pdrop)
+        self.h = FlaxLLaMABlockCollection(self.config, dtype=self.dtype, param_dtype=self.param_dtype, precision=self.precision)
+        self.ln_f = RMSNorm(self.config.hidden_size, eps=self.config.rms_norm_eps, dtype=self.dtype, param_dtype=self.param_dtype)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        deterministic=True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        input_embeds = self.wte(input_ids.astype("i4"))
+        hidden_states = self.dropout(input_embeds, deterministic=deterministic)
+        outputs = self.h(
+            hidden_states,
+            attention_mask,
+            position_ids=position_ids,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        hidden_states = self.ln_f(hidden_states)
+        if output_hidden_states:
+            all_hidden_states = outputs[1] + (hidden_states,)
+            outputs = (hidden_states, all_hidden_states) + outputs[2:]
+        else:
+            outputs = (hidden_states,) + outputs[1:]
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=outputs[1],
+            attentions=outputs[-1],
+        )
+@add_start_docstrings("", "")
+# Pretrained-model wrapper whose underlying Flax module is FlaxLLaMAModule.
+class FlaxLLaMAModel(FlaxLLaMAPreTrainedModel):
+    # Module class instantiated by FlaxLLaMAPreTrainedModel.__init__.
+    module_class = FlaxLLaMAModule
+# append_call_sample_docstring(
+#     FlaxLLaMAModel,
+#     _TOKENIZER_FOR_DOC,
+#     _CHECKPOINT_FOR_DOC,
+#     FlaxCausalLMOutput,
+#     _CONFIG_FOR_DOC,
+# )
+class FlaxLLaMAForCausalLMModule(nn.Module):
+    config: LLaMAConfig
+    dtype: jnp.dtype = jnp.float32
+    param_dtype: jnp.dtype=jnp.float32
+    precision: Optional[Union[jax.lax.Precision, str]]=None
+    def setup(self):
+        self.transformer = FlaxLLaMAModule(self.config, dtype=self.dtype)
+        self.lm_head = nn.Dense(
+            self.config.vocab_size,
+            dtype=self.dtype,
+            param_dtype=self.param_dtype,
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            precision=self.precision,
+        )
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        batch_size, seq_length = input_ids.shape
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(input_ids)
+        if position_ids is None:
+            position_ids = jnp.broadcast_to(
+                jnp.clip(jnp.cumsum(attention_mask, axis=-1) - 1, a_min=0),
+                (batch_size, seq_length)
+            )
+        outputs = self.transformer(
+            input_ids,
+            attention_mask,
+            position_ids,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        if self.config.tie_word_embeddings:
+            shared_kernel = self.transformer.variables["params"]["wte"]["embedding"].T
+            lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
+        else:
+            lm_logits = self.lm_head(hidden_states)
+        if not return_dict:
+            return (lm_logits,) + outputs[1:]
+        return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+@add_start_docstrings("", "")
+class FlaxLLaMAForCausalLM(FlaxLLaMAPreTrainedModel):
+    module_class = FlaxLLaMAForCausalLMModule
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
+        # initializing the cache
+        batch_size, seq_length = input_ids.shape
+        past_key_values = self.init_cache(batch_size, max_length)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since GPTJ uses a causal mask, those positions are masked anyways.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is not None:
+            position_ids = attention_mask.cumsum(axis=-1) - 1
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+        return {
+            "past_key_values": past_key_values,
+            "attention_mask": extended_attention_mask,
+            "position_ids": position_ids,
+        }
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
+        return model_kwargs
+# append_call_sample_docstring(
+#     FlaxGPTJForCausalLM,
+#     _TOKENIZER_FOR_DOC,
+#     _CHECKPOINT_FOR_DOC,
+#     FlaxCausalLMOutput,
+#     _CONFIG_FOR_DOC,
+# )
+# File name of the SentencePiece model, keyed by the `vocab_file` argument name.
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+# Both maps are empty here; LlamaTokenizer exposes them as class attributes.
+PRETRAINED_VOCAB_FILES_MAP = {}
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
+# Marker character that LlamaTokenizer.tokenize prepends to the text in non-legacy mode.
+SPIECE_UNDERLINE = "▁"
+class LlamaTokenizer(PreTrainedTokenizer):
+    """
+    Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
+    no padding token in the original model.
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
+            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
+            attention mechanisms or loss computation.
+        sp_model_kwargs (`Dict[str, Any]`, `Optional`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+        add_bos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `bos_token` at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `eos_token` at the end of sequences.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
+        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
+            Whether or not the default system prompt for Llama should be used.
+        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
+            Whether or not to add spaces between special tokens.
+        legacy (`bool`, *optional*):
+            Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
+            and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
+            example:
+            - `legacy=True`:
+            ```python
+            >>> from transformers import T5Tokenizer
+            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
+            >>> tokenizer.encode("Hello <extra_id_0>.")
+            [8774, 32099, 3, 5, 1]
+            ```
+            - `legacy=False`:
+            ```python
+            >>> from transformers import T5Tokenizer
+            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
+            >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
+            [8774, 32099, 5, 1]
+            ```
+            Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token=None,
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=False,
+        add_eos_token=False,
+        clean_up_tokenization_spaces=False,
+        use_default_system_prompt=False,
+        spaces_between_special_tokens=False,
+        legacy=None,
+        **kwargs,
+    ):
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
+        if legacy is None:
+            logger.warning_once(
+                f"You are using the default legacy behaviour of the {self.__class__}. This is"
+                " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
+                " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
+                " means, and thoroughly read the reason why this was added as explained in"
+                " https://github.com/huggingface/transformers/pull/24565"
+            )
+            legacy = True
+        self.legacy = legacy
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.use_default_system_prompt = use_default_system_prompt
+        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            sp_model_kwargs=self.sp_model_kwargs,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            use_default_system_prompt=use_default_system_prompt,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            legacy=legacy,
+            **kwargs,
+        )
+    @property
+    def unk_token_length(self):
+        return len(self.sp_model.encode(str(self.unk_token)))
+    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
+    def get_spm_processor(self, from_slow=False):
+        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        if self.legacy or from_slow:  # no dependency on protobuf
+            tokenizer.Load(self.vocab_file)
+            return tokenizer
+        with open(self.vocab_file, "rb") as f:
+            sp_model = f.read()
+            model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
+            model = model_pb2.ModelProto.FromString(sp_model)
+            normalizer_spec = model_pb2.NormalizerSpec()
+            normalizer_spec.add_dummy_prefix = False
+            model.normalizer_spec.MergeFrom(normalizer_spec)
+            sp_model = model.SerializeToString()
+            tokenizer.LoadFromSerializedProto(sp_model)
+        return tokenizer
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+        return state
+    def __setstate__(self, d):
+        self.__dict__ = d
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self.sp_model.get_piece_size()
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
+    def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> List[str]:
+        """
+        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
+        first token is special.
+        """
+        if self.legacy or len(text) == 0:
+            return super().tokenize(text, **kwargs)
+        tokens = super().tokenize(SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " "), **kwargs)
+        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
+            tokens = tokens[1:]
+        return tokens
+    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
+    def _tokenize(self, text, **kwargs):
+        """
+        Returns a tokenized string.
+        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
+        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
+        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
+        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
+        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
+        """
+        tokens = self.sp_model.encode(text, out_type=str)
+        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
+            return tokens
+        # 1. Encode string + prefix ex: "<unk> Hey"
+        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
+        # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
+        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        return token
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        # since we manually add the prefix space, we have to remove it when decoding
+        if tokens[0].startswith(SPIECE_UNDERLINE):
+            tokens[0] = tokens[0][1:]
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for i, token in enumerate(tokens):
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                if not prev_is_special and i != 0 and self.legacy:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string
+    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+        return (out_vocab_file,)
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+        output = bos_token_id + token_ids_0 + eos_token_id
+        if token_ids_1 is not None:
+            output = output + bos_token_id + token_ids_1 + eos_token_id
+        return output
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+        bos_token_id = [1] if self.add_bos_token else []
+        eos_token_id = [1] if self.add_eos_token else []
+        if token_ids_1 is None:
+            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+        return (
+            bos_token_id
+            + ([0] * len(token_ids_0))
+            + eos_token_id
+            + bos_token_id
+            + ([0] * len(token_ids_1))
+            + eos_token_id
+        )
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
+        sequence pair mask has the following format:
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+        if token_ids_1 is None, only returns the first portion of the mask (0s).
+        Args:
+            token_ids_0 (`List[int]`):
+                List of ids.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+        if token_ids_1 is not None:
+            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+        return output

EasyLM/models/llama/llama_serve.py ADDED Viewed

	@@ -0,0 +1,386 @@

+import pprint
+from functools import partial
+import numpy as np
+import mlxu
+import jax
+import jax.numpy as jnp
+from jax.experimental.pjit import pjit
+from jax.sharding import PartitionSpec as PS
+import optax
+from transformers import GenerationConfig, FlaxLogitsProcessorList
+from EasyLM.checkpoint import StreamingCheckpointer
+from EasyLM.serving import LMServer
+from EasyLM.jax_utils import (
+    JaxRNG, JaxDistributedConfig, next_rng, match_partition_rules, tree_apply,
+    set_random_seed, get_float_dtype_by_name, make_shard_and_gather_fns,
+    with_sharding_constraint, FlaxTemperatureLogitsWarper
+)
+from EasyLM.models.llama.llama_model import LLaMAConfig, FlaxLLaMAForCausalLM
+# Flags (with defaults) for the serving script, registered through mlxu.
+# input_length bounds the tokenized prompt and seq_length bounds prompt plus
+# continuation: main() generates up to seq_length - input_length new tokens.
+# NOTE(review): initialize_jax_distributed is not read by main() below.
+FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
+    seed=42,
+    initialize_jax_distributed=False,
+    mesh_dim='1,-1,1',
+    dtype='bf16',
+    input_length=1024,
+    seq_length=2048,
+    top_k=50,
+    top_p=1.0,
+    do_sample=True,
+    num_beams=1,
+    add_bos_token=True,
+    load_llama_config='',
+    load_checkpoint='',
+    tokenizer=LLaMAConfig.get_tokenizer_config(),
+    lm_server=LMServer.get_default_config(),
+    jax_distributed=JaxDistributedConfig.get_default_config(),
+)
+def main(argv):
+    """
+    Load a LLaMA checkpoint and serve it through an `LMServer` subclass.
+    Builds two tokenizers (left- and right-padded), loads config and params, compiles three pjit-ed
+    forward functions (log-likelihood, sampling generation, greedy generation), shards the params onto
+    the mesh, and runs a server whose handlers close over all of the above. `argv` is not used.
+    """
+    JaxDistributedConfig.initialize(FLAGS.jax_distributed)
+    set_random_seed(FLAGS.seed)
+    # Prompts are padded/truncated on the left, continuations on the right.
+    prefix_tokenizer = LLaMAConfig.get_tokenizer(
+        FLAGS.tokenizer, truncation_side='left', padding_side='left'
+    )
+    tokenizer = LLaMAConfig.get_tokenizer(
+        FLAGS.tokenizer, truncation_side='right', padding_side='right'
+    )
+    # Config, checkpoint and model object are created with the CPU as default device.
+    with jax.default_device(jax.devices("cpu")[0]):
+        llama_config = LLaMAConfig.load_config(FLAGS.load_llama_config)
+        _, params = StreamingCheckpointer.load_trainstate_checkpoint(
+            FLAGS.load_checkpoint, disallow_trainstate=True
+        )
+        hf_model = FlaxLLaMAForCausalLM(
+            llama_config,
+            input_shape=(1, FLAGS.seq_length),
+            seed=FLAGS.seed,
+            _do_init=False
+        )
+    model_ps = match_partition_rules(
+        LLaMAConfig.get_partition_rules(), params
+    )
+    shard_fns, _ = make_shard_and_gather_fns(
+        model_ps, get_float_dtype_by_name(FLAGS.dtype)
+    )
+    @partial(
+        pjit,
+        in_shardings=(model_ps, PS(), PS()),
+        out_shardings=(PS(), PS(), PS())
+    )
+    def forward_loglikelihood(params, rng, batch):
+        """
+        Score `batch['output_tokens']` under the model.
+        Returns the masked log-likelihood summed over the last axis, a flag telling whether the argmax
+        prediction matched every position selected by `output_mask`, and a fresh rng.
+        """
+        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
+        rng_generator = JaxRNG(rng)
+        input_tokens = batch['input_tokens']
+        output_tokens = batch['output_tokens']
+        input_mask = batch['input_mask']
+        output_mask = batch['output_mask']
+        logits = hf_model.module.apply(
+            params, input_tokens, attention_mask=input_mask,
+            deterministic=True, rngs=rng_generator(llama_config.rng_keys()),
+        ).logits
+        # if llama_config.n_real_tokens is not None:
+        #   logits = logits.at[:, :, llama_config.n_real_tokens:].set(-1e8)
+        # Negated cross entropy is the per-position log-likelihood of the target token.
+        loglikelihood = -optax.softmax_cross_entropy_with_integer_labels(
+            logits, output_tokens
+        )
+        loglikelihood = jnp.sum(loglikelihood * output_mask, axis=-1)
+        match_count = jnp.sum(
+            (jnp.argmax(logits, axis=-1) == output_tokens) * output_mask,
+            axis=-1
+        )
+        total = jnp.sum(output_mask, axis=-1)
+        # Greedy iff every scored position is also the argmax prediction.
+        is_greedy = match_count == total
+        return loglikelihood, is_greedy, rng_generator()
+    @partial(
+        pjit,
+        in_shardings=(model_ps, PS(), PS(), PS()),
+        out_shardings=(PS(), PS())
+    )
+    def forward_generate(params, rng, batch, temperature):
+        """
+        Generate continuations with the sampling settings taken from FLAGS and a temperature warper.
+        Returns only the newly generated part of the sequences (the prompt columns are sliced off) and a
+        fresh rng.
+        """
+        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
+        rng_generator = JaxRNG(rng)
+        output = hf_model.generate(
+            batch['input_tokens'],
+            attention_mask=batch['attention_mask'],
+            params=params['params'],
+            prng_key=rng_generator(),
+            logits_processor=FlaxLogitsProcessorList(
+                [FlaxTemperatureLogitsWarper(temperature)]
+            ),
+            generation_config=GenerationConfig(
+                max_new_tokens=FLAGS.seq_length - FLAGS.input_length,
+                pad_token_id=tokenizer.eos_token_id,
+                bos_token_id=tokenizer.bos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                do_sample=FLAGS.do_sample,
+                num_beams=FLAGS.num_beams,
+                top_k=FLAGS.top_k,
+                top_p=FLAGS.top_p,
+            )
+        ).sequences[:, batch['input_tokens'].shape[1]:]
+        return output, rng_generator()
+    @partial(
+        pjit,
+        in_shardings=(model_ps, PS(), PS()),
+        out_shardings=(PS(), PS())
+    )
+    def forward_greedy_generate(params, rng, batch):
+        """
+        Same as `forward_generate` but with `do_sample=False`, `num_beams=1` and no logits processor.
+        Returns the newly generated tokens and a fresh rng.
+        """
+        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
+        rng_generator = JaxRNG(rng)
+        output = hf_model.generate(
+            batch['input_tokens'],
+            attention_mask=batch['attention_mask'],
+            params=params['params'],
+            prng_key=rng_generator(),
+            generation_config=GenerationConfig(
+                max_new_tokens=FLAGS.seq_length - FLAGS.input_length,
+                pad_token_id=tokenizer.eos_token_id,
+                bos_token_id=tokenizer.bos_token_id,
+                eos_token_id=tokenizer.eos_token_id,
+                do_sample=False,
+                num_beams=1,
+            )
+        ).sequences[:, batch['input_tokens'].shape[1]:]
+        return output, rng_generator()
+    mesh = LLaMAConfig.get_jax_mesh(FLAGS.mesh_dim)
+    with mesh:
+        params = tree_apply(shard_fns, params)
+        # This rng is threaded through every handler below via `nonlocal`.
+        sharded_rng = next_rng()
+    class ModelServer(LMServer):
+        # Handlers are static methods that close over the params, tokenizers,
+        # mesh and compiled forward functions defined in main().
+        @staticmethod
+        def loglikelihood(prefix_text, text):
+            """Return (loglikelihood, is_greedy) of `text` given `prefix_text`; only `text` positions are scored."""
+            nonlocal sharded_rng
+            prefix = prefix_tokenizer(
+                prefix_text,
+                padding='max_length',
+                truncation=True,
+                max_length=FLAGS.input_length,
+                return_tensors='np',
+            )
+            inputs = tokenizer(
+                text,
+                padding='max_length',
+                truncation=True,
+                max_length=FLAGS.seq_length - FLAGS.input_length,
+                return_tensors='np',
+            )
+            output_tokens = np.concatenate([prefix.input_ids, inputs.input_ids], axis=1)
+            bos_tokens = np.full(
+                (output_tokens.shape[0], 1), tokenizer.bos_token_id, dtype=np.int32
+            )
+            # Model inputs are the targets shifted right by one, with BOS in front.
+            input_tokens = np.concatenate([bos_tokens, output_tokens[:, :-1]], axis=-1)
+            input_mask = np.concatenate(
+                [prefix.attention_mask, inputs.attention_mask], axis=1
+            )
+            # The inserted BOS column is attended to only when add_bos_token is set.
+            if FLAGS.add_bos_token:
+                bos_mask = np.ones_like(input_mask[:, :1])
+            else:
+                bos_mask = np.zeros_like(input_mask[:, :1])
+            input_mask = np.concatenate([bos_mask, input_mask[:, :-1]], axis=1)
+            # Prefix positions are excluded from the score.
+            output_mask = np.concatenate(
+                [np.zeros_like(prefix.attention_mask), inputs.attention_mask], axis=1
+            )
+            batch = dict(
+                input_tokens=input_tokens,
+                output_tokens=output_tokens,
+                input_mask=input_mask,
+                output_mask=output_mask,
+            )
+            with mesh:
+                loglikelihood, is_greedy, sharded_rng = forward_loglikelihood(
+                    params, sharded_rng, batch
+                )
+                loglikelihood, is_greedy = jax.device_get((loglikelihood, is_greedy))
+            return loglikelihood, is_greedy
+        @staticmethod
+        def loglikelihood_rolling(text):
+            """Score `text` without a prefix, in windows of `FLAGS.seq_length` tokens; returns summed loglikelihood and combined is_greedy."""
+            nonlocal sharded_rng
+            inputs = tokenizer(
+                text,
+                padding='longest',
+                truncation=False,
+                max_length=np.iinfo(np.int32).max,
+                return_tensors='np',
+            )
+            batch_size = inputs.input_ids.shape[0]
+            output_tokens = inputs.input_ids
+            attention_mask = inputs.attention_mask
+            # Pad short inputs up to one full window so every batch has seq_length columns.
+            if output_tokens.shape[1] < FLAGS.seq_length:
+                padding_length = FLAGS.seq_length - output_tokens.shape[1]
+                pad_tokens = np.full(
+                    (batch_size, padding_length), tokenizer.pad_token_id, dtype=np.int32
+                )
+                output_tokens = np.concatenate([output_tokens, pad_tokens], axis=-1)
+                pad_mask = np.zeros(
+                    (batch_size, padding_length), dtype=inputs.attention_mask.dtype
+                )
+                attention_mask = np.concatenate([attention_mask, pad_mask], axis=-1)
+            bos_tokens = np.full(
+                (batch_size, 1), tokenizer.bos_token_id, dtype=np.int32
+            )
+            input_tokens = np.concatenate([bos_tokens, output_tokens[:, :-1]], axis=-1)
+            # NOTE(review): bos_mask is never used below, and unlike loglikelihood()
+            # the input mask here is not shifted to line up with input_tokens.
+            bos_mask = np.ones((batch_size, 1), dtype=inputs.attention_mask.dtype)
+            total_seq_length = output_tokens.shape[1]
+            total_loglikelihood = 0.0
+            total_is_greedy = True
+            # Sliding window
+            for i in range(0, total_seq_length, FLAGS.seq_length):
+                # Last window
+                if i + FLAGS.seq_length > total_seq_length:
+                    # Take the final seq_length tokens and score only the ones at
+                    # index >= i, i.e. those no earlier window has scored.
+                    last_output_mask = np.copy(attention_mask[:, -FLAGS.seq_length:])
+                    last_output_mask[:, :i - total_seq_length] = 0.0
+                    batch = dict(
+                        input_tokens=input_tokens[:, -FLAGS.seq_length:],
+                        output_tokens=output_tokens[:, -FLAGS.seq_length:],
+                        input_mask=attention_mask[:, -FLAGS.seq_length:],
+                        output_mask=last_output_mask,
+                    )
+                # Normal window
+                else:
+                    batch = dict(
+                        input_tokens=input_tokens[:, i:i + FLAGS.seq_length],
+                        output_tokens=output_tokens[:, i:i + FLAGS.seq_length],
+                        input_mask=attention_mask[:, i:i + FLAGS.seq_length],
+                        output_mask=attention_mask[:, i:i + FLAGS.seq_length],
+                    )
+                with mesh:
+                    loglikelihood, is_greedy, sharded_rng = forward_loglikelihood(
+                        params, sharded_rng, batch
+                    )
+                    loglikelihood, is_greedy = jax.device_get((loglikelihood, is_greedy))
+                total_loglikelihood += loglikelihood
+                total_is_greedy = np.logical_and(is_greedy, total_is_greedy)
+            return total_loglikelihood, total_is_greedy
+        @staticmethod
+        def generate(text, temperature):
+            """Sample continuations for `text` at `temperature`; each decoded output is cut at the first EOS token."""
+            nonlocal sharded_rng
+            inputs = prefix_tokenizer(
+                text,
+                padding='max_length',
+                truncation=True,
+                max_length=FLAGS.input_length,
+                return_tensors='np',
+            )
+            input_tokens = inputs.input_ids
+            input_mask = inputs.attention_mask
+            # Column 0 is overwritten with BOS (and unmasked) when add_bos_token is set.
+            if FLAGS.add_bos_token:
+                input_tokens[:, 0] = tokenizer.bos_token_id
+                input_mask[:, 0] = 1
+            batch = dict(
+                input_tokens=input_tokens,
+                attention_mask=input_mask,
+            )
+            with mesh:
+                output, sharded_rng = forward_generate(
+                    params, sharded_rng, batch, temperature
+                )
+                output = jax.device_get(output)
+            output_text = []
+            # NOTE(review): the loop variable shadows the `text` parameter.
+            for text in list(tokenizer.batch_decode(output)):
+                if tokenizer.eos_token in text:
+                    text = text.split(tokenizer.eos_token, maxsplit=1)[0]
+                output_text.append(text)
+            return output_text
+        @staticmethod
+        def greedy_until(prefix_text, until, max_length):
+            """
+            Greedily extend each prefix until one of its stop strings appears or at least `max_length`
+            tokens were generated. Returns one generated string per prefix, cut before the stop string.
+            """
+            nonlocal sharded_rng
+            all_outputs = []
+            for pf, ut in zip(prefix_text, until):
+                # A single stop string is treated as a one-element list.
+                if isinstance(ut, str):
+                    ut = [ut]
+                total_length = 0
+                total_generated = ''
+                while total_length < max_length:
+                    pf_tokens = tokenizer(
+                        pf,
+                        padding=False,
+                        truncation=False,
+                        max_length=np.iinfo(np.int32).max,
+                        return_tensors='np',
+                    )
+                    input_tokens = pf_tokens.input_ids
+                    attention_mask = pf_tokens.attention_mask
+                    # Left-pad or left-truncate by hand to exactly input_length columns.
+                    if input_tokens.shape[1] < FLAGS.input_length:
+                        extra = FLAGS.input_length - input_tokens.shape[1]
+                        pad_tokens = np.full(
+                            (1, extra), tokenizer.pad_token_id, dtype=np.int32
+                        )
+                        input_tokens = np.concatenate(
+                            [pad_tokens, input_tokens], axis=1
+                        )
+                        pad_attention = np.zeros((1, extra), dtype=attention_mask.dtype)
+                        attention_mask = np.concatenate(
+                            [pad_attention, attention_mask], axis=1
+                        )
+                    elif input_tokens.shape[1] > FLAGS.input_length:
+                        input_tokens = input_tokens[:, -FLAGS.input_length:]
+                        attention_mask = attention_mask[:, -FLAGS.input_length:]
+                    if FLAGS.add_bos_token:
+                        input_tokens[:, 0] = tokenizer.bos_token_id
+                        attention_mask[:, 0] = 1
+                    batch = dict(input_tokens=input_tokens, attention_mask=attention_mask)
+                    with mesh:
+                        output, sharded_rng = forward_greedy_generate(
+                            params, sharded_rng, batch
+                        )
+                        output = jax.device_get(output)
+                    total_length += output.shape[1]
+                    output_text = tokenizer.batch_decode(output)[0]
+                    total_generated = total_generated + output_text
+                    # The next round continues from the prefix plus everything generated so far.
+                    pf = pf + output_text
+                    done = False
+                    for s in ut:
+                        if s in total_generated:
+                            total_generated = total_generated.split(s, maxsplit=1)[0]
+                            done = True
+                    if done:
+                        break
+                all_outputs.append(total_generated)
+            return all_outputs
+    server = ModelServer(FLAGS.lm_server)
+    server.run()
+# When executed as a script, hand main() to mlxu.run.
+if __name__ == "__main__":
+    mlxu.run(main)

EasyLM/models/llama/llama_train.py ADDED Viewed

	@@ -0,0 +1,268 @@

+import pprint
+from functools import partial
+from tqdm import tqdm, trange
+import numpy as np
+import mlxu
+import jax
+import jax.numpy as jnp
+from jax.experimental.pjit import pjit
+from jax.sharding import PartitionSpec as PS
+from flax.training.train_state import TrainState
+from EasyLM.data import DatasetFactory
+from EasyLM.checkpoint import StreamingCheckpointer
+from EasyLM.optimizers import OptimizerFactory
+from EasyLM.jax_utils import (
+    JaxRNG, JaxDistributedConfig, next_rng, match_partition_rules,
+    cross_entropy_loss_and_accuracy, global_norm, get_float_dtype_by_name,
+    set_random_seed, average_metrics, get_weight_decay_mask,
+    make_shard_and_gather_fns, with_sharding_constraint,
+)
+from EasyLM.models.llama.llama_model import (
+    LLaMAConfig, FlaxLLaMAForCausalLMModule
+)
+# Flags (with defaults) for the training script, registered through mlxu.
+# In main(), a *_freq value of 0 turns off the matching periodic action
+# (evaluation, logging, checkpoint saving), since each one is guarded by `> 0`.
+FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
+    seed=42,
+    mesh_dim='1,-1,1',
+    dtype='fp32',
+    param_dtype='fp32',
+    total_steps=10000,
+    load_llama_config='',
+    update_llama_config='',
+    load_checkpoint='',
+    load_dataset_state='',
+    log_freq=50,
+    save_model_freq=0,
+    save_milestone_freq=0,
+    eval_freq=0,
+    tokenizer=LLaMAConfig.get_tokenizer_config(),
+    train_dataset=DatasetFactory.get_default_config(),
+    eval_dataset=DatasetFactory.get_default_config(),
+    optimizer=OptimizerFactory.get_default_config(),
+    checkpointer=StreamingCheckpointer.get_default_config(),
+    llama=LLaMAConfig.get_default_config(),
+    logger=mlxu.WandBLogger.get_default_config(),
+    log_all_worker=False,
+    jax_distributed=JaxDistributedConfig.get_default_config(),
+)
+def main(argv):
+    """
+    Train a LLaMA model.
+    Sets up logging, tokenizer, datasets, model config and optimizer; defines the init/train/eval step
+    functions and their pjit-ed versions; restores or initializes the train state; then runs the training
+    loop with periodic evaluation, logging and checkpointing as configured by FLAGS. `argv` is not used.
+    """
+    JaxDistributedConfig.initialize(FLAGS.jax_distributed)
+    variant = mlxu.get_user_flags(FLAGS, FLAGS_DEF)
+    flags_config_dict = mlxu.user_flags_to_config_dict(FLAGS, FLAGS_DEF)
+    # Logging is enabled on process 0 only, unless log_all_worker is set.
+    logger = mlxu.WandBLogger(
+        config=FLAGS.logger,
+        variant=variant,
+        enable=FLAGS.log_all_worker or (jax.process_index() == 0),
+    )
+    set_random_seed(FLAGS.seed)
+    tokenizer = LLaMAConfig.get_tokenizer(FLAGS.tokenizer)
+    dataset = DatasetFactory.load_dataset(FLAGS.train_dataset, tokenizer)
+    if FLAGS.load_dataset_state != '':
+        dataset.load_state_dict(mlxu.load_pickle(FLAGS.load_dataset_state))
+    # eval_dataset only exists when eval_freq > 0; its use below has the same guard.
+    if FLAGS.eval_freq > 0:
+        eval_dataset = DatasetFactory.load_dataset(
+            FLAGS.eval_dataset, dataset.tokenizer, eval_dataset=True
+        )
+    seq_length = dataset.seq_length
+    if FLAGS.load_llama_config != '':
+        llama_config = LLaMAConfig.load_config(FLAGS.load_llama_config)
+    else:
+        llama_config = LLaMAConfig(**FLAGS.llama)
+    if FLAGS.update_llama_config != '':
+        # NOTE(review): eval() executes the flag value as Python code; only pass
+        # trusted input through update_llama_config.
+        llama_config.update(dict(eval(FLAGS.update_llama_config)))
+    llama_config.update(dict(
+        bos_token_id=dataset.tokenizer.bos_token_id,
+        eos_token_id=dataset.tokenizer.eos_token_id,
+    ))
+    # The config vocab size is only ever grown to the dataset's, never shrunk.
+    if llama_config.vocab_size < dataset.vocab_size:
+        print("Updating model config vocab size from", llama_config.vocab_size, "to", dataset.vocab_size)
+        llama_config.update(dict(vocab_size=dataset.vocab_size))
+    model = FlaxLLaMAForCausalLMModule(
+        llama_config, dtype=get_float_dtype_by_name(FLAGS.dtype), param_dtype=get_float_dtype_by_name(FLAGS.param_dtype)
+    )
+    optimizer, optimizer_info = OptimizerFactory.get_optimizer(
+        FLAGS.optimizer,
+        get_weight_decay_mask(LLaMAConfig.get_weight_decay_exclusions())
+    )
+    def create_trainstate_from_params(params):
+        """Wrap already-loaded params in a new TrainState using the configured optimizer."""
+        return TrainState.create(params=params, tx=optimizer, apply_fn=None)
+    def init_fn(rng):
+        """Initialize model params from `rng` on dummy (4, seq_length) inputs and wrap them in a TrainState."""
+        rng_generator = JaxRNG(rng)
+        params = model.init(
+            input_ids=jnp.zeros((4, seq_length), dtype=jnp.int32),
+            position_ids=jnp.zeros((4, seq_length), dtype=jnp.int32),
+            attention_mask=jnp.ones((4, seq_length), dtype=jnp.int32),
+            rngs=rng_generator(llama_config.rng_keys()),
+        )
+        return TrainState.create(params=params, tx=optimizer, apply_fn=None)
+    def train_step(train_state, rng, batch):
+        """Apply one gradient update; returns the new train state, a fresh rng and a metrics dict."""
+        rng_generator = JaxRNG(rng)
+        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
+        def loss_and_accuracy(params):
+            logits = model.apply(
+                params, batch['input_tokens'], deterministic=False,
+                rngs=rng_generator(llama_config.rng_keys()),
+            ).logits
+            return cross_entropy_loss_and_accuracy(
+                logits, batch['target_tokens'], batch['loss_masks']
+            )
+        # has_aux=True: loss is differentiated, accuracy is carried along.
+        grad_fn = jax.value_and_grad(loss_and_accuracy, has_aux=True)
+        (loss, accuracy), grads = grad_fn(train_state.params)
+        train_state = train_state.apply_gradients(grads=grads)
+        # learning_rate and param_norm are read from the already-updated train state.
+        metrics = dict(
+            loss=loss,
+            accuracy=accuracy,
+            learning_rate=optimizer_info['learning_rate_schedule'](train_state.step),
+            gradient_norm=global_norm(grads),
+            param_norm=global_norm(train_state.params),
+        )
+        return train_state, rng_generator(), metrics
+    def eval_step(train_state, rng, batch):
+        """Compute loss and accuracy on one batch with deterministic=True; returns a fresh rng and the metrics."""
+        rng_generator = JaxRNG(rng)
+        batch = with_sharding_constraint(batch, PS(('dp', 'fsdp')))
+        logits = model.apply(
+            train_state.params, batch['input_tokens'], deterministic=True,
+            rngs=rng_generator(llama_config.rng_keys()),
+        ).logits
+        loss, accuracy = cross_entropy_loss_and_accuracy(
+            logits, batch['target_tokens'], batch['loss_masks']
+        )
+        metrics = dict(
+            eval_loss=loss,
+            eval_accuracy=accuracy,
+        )
+        return rng_generator(), metrics
+    # eval_shape traces init_fn to get the train state's structure without running it.
+    train_state_shapes = jax.eval_shape(init_fn, next_rng())
+    train_state_partition = match_partition_rules(
+        LLaMAConfig.get_partition_rules(), train_state_shapes
+    )
+    shard_fns, gather_fns = make_shard_and_gather_fns(
+        train_state_partition, train_state_shapes
+    )
+    checkpointer = StreamingCheckpointer(
+        FLAGS.checkpointer, logger.output_dir,
+        enable=jax.process_index() == 0,
+    )
+    sharded_init_fn = pjit(
+        init_fn,
+        in_shardings=PS(),
+        out_shardings=train_state_partition
+    )
+    sharded_create_trainstate_from_params = pjit(
+        create_trainstate_from_params,
+        in_shardings=(train_state_partition.params, ),
+        out_shardings=train_state_partition,
+        donate_argnums=(0, ),
+    )
+    # The train state and rng arguments are donated, so callers must use the returned values.
+    sharded_train_step = pjit(
+        train_step,
+        in_shardings=(train_state_partition, PS(), PS()),
+        out_shardings=(train_state_partition, PS(), PS()),
+        donate_argnums=(0, 1),
+    )
+    sharded_eval_step = pjit(
+        eval_step,
+        in_shardings=(train_state_partition, PS(), PS()),
+        out_shardings=(PS(), PS()),
+        donate_argnums=(1,),
+    )
+    def save_checkpoint(train_state, milestone=False):
+        """Save train state, run metadata and dataset state through the checkpointer."""
+        step = int(jax.device_get(train_state.step))
+        metadata = dict(
+            step=step,
+            variant=variant,
+            flags=flags_config_dict,
+            llama_config=llama_config.to_dict(),
+        )
+        checkpointer.save_all(
+            train_state=train_state,
+            gather_fns=gather_fns,
+            metadata=metadata,
+            dataset=dataset.get_state_dict(),
+            milestone=milestone,
+        )
+    mesh = LLaMAConfig.get_jax_mesh(FLAGS.mesh_dim)
+    with mesh:
+        train_state, restored_params = None, None
+        if FLAGS.load_checkpoint != '':
+            train_state, restored_params = checkpointer.load_trainstate_checkpoint(
+                FLAGS.load_checkpoint, train_state_shapes, shard_fns
+            )
+        if train_state is None and restored_params is None:
+            # Initialize from scratch
+            train_state = sharded_init_fn(next_rng())
+        elif train_state is None and restored_params is not None:
+            # Restore from params but initialize train_state
+            train_state = sharded_create_trainstate_from_params(restored_params)
+            del restored_params
+        start_step = int(jax.device_get(train_state.step))
+        # An initial checkpoint is written before any training step runs.
+        if FLAGS.save_model_freq > 0:
+            save_checkpoint(train_state)
+        sharded_rng = next_rng()
+        step_counter = trange(start_step, FLAGS.total_steps, ncols=0)
+        # zip() ends the loop at total_steps or when the dataset is exhausted, whichever comes first.
+        for step, (batch, dataset_metrics) in zip(step_counter, dataset):
+            train_state, sharded_rng, metrics = sharded_train_step(
+                train_state, sharded_rng, batch
+            )
+            if FLAGS.eval_freq > 0 and (step + 1) % FLAGS.eval_freq == 0:
+                # Run over the whole eval dataset and merge the averaged metrics in.
+                eval_metric_list = []
+                eval_iterator = iter(eval_dataset)
+                for eval_batch, _ in eval_iterator:
+                    sharded_rng, eval_metrics = sharded_eval_step(
+                        train_state, sharded_rng, eval_batch
+                    )
+                    eval_metric_list.append(eval_metrics)
+                metrics.update(average_metrics(eval_metric_list))
+            if FLAGS.log_freq > 0 and (step + 1) % FLAGS.log_freq == 0:
+                log_metrics = {"step": step + 1}
+                log_metrics.update(metrics)
+                log_metrics.update(dataset_metrics)
+                log_metrics = jax.device_get(log_metrics)
+                logger.log(log_metrics)
+                tqdm.write("\n" + pprint.pformat(log_metrics) + "\n")
+            # A milestone save takes precedence over a regular save on the same step.
+            if FLAGS.save_milestone_freq > 0 and (step + 1) % FLAGS.save_milestone_freq == 0:
+                save_checkpoint(train_state, milestone=True)
+            elif FLAGS.save_model_freq > 0 and (step + 1) % FLAGS.save_model_freq == 0:
+                save_checkpoint(train_state)
+        # Final checkpoint after the loop ends.
+        if FLAGS.save_model_freq > 0:
+            save_checkpoint(train_state)
+# Script entry point: when this file is executed directly, hand `main` to `mlxu.run`
+# (presumably it parses the command-line flags before calling `main` — confirm against mlxu).
+if __name__ == "__main__":
+    mlxu.run(main)

EasyLM/models/roberta/__init__.py ADDED Viewed

File without changes

EasyLM/models/roberta/roberta_model.py ADDED Viewed

	@@ -0,0 +1,1694 @@

+# coding=utf-8
+# Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team.
+# Modifications copyright 2022 Xinyang Geng
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Optional, Tuple
+from collections import OrderedDict
+from typing import Mapping
+import numpy as np
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen import partitioning as nn_partitioning
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+from jax.sharding import PartitionSpec
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_flax_outputs import (
+    FlaxBaseModelOutputWithPastAndCrossAttentions,
+    FlaxBaseModelOutputWithPooling,
+    FlaxBaseModelOutputWithPoolingAndCrossAttentions,
+    FlaxCausalLMOutputWithCrossAttentions,
+    FlaxMaskedLMOutput,
+    FlaxMultipleChoiceModelOutput,
+    FlaxQuestionAnsweringModelOutput,
+    FlaxSequenceClassifierOutput,
+    FlaxTokenClassifierOutput,
+)
+from transformers.modeling_flax_utils import (
+    ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring,
+    overwrite_call_docstring
+)
+from transformers.utils import (
+    add_start_docstrings, add_start_docstrings_to_model_forward, logging
+)
+from transformers import AutoTokenizer
+from ml_collections import ConfigDict
+from ml_collections.config_dict import config_dict
+from mlxu import function_args_to_config, load_pickle
+from EasyLM.jax_utils import with_sharding_constraint, get_jax_mesh
+"""
+The following code is taken from
+transformers/src/transformers/models/roberta/configuration_roberta.py
+and modified to work with EasyLM.
+"""
+# Maps pretrained RoBERTa model identifiers to the URL of their `config.json` on the Hugging Face Hub.
+ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "roberta-base": "https://huggingface.co/roberta-base/resolve/main/config.json",
+    "roberta-large": "https://huggingface.co/roberta-large/resolve/main/config.json",
+    "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/config.json",
+    "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/config.json",
+    "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/config.json",
+    "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/config.json",
+}
+class RobertaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`RobertaModel`] or a [`TFRobertaModel`]. It is
+    used to instantiate a RoBERTa model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa
+    [roberta-base](https://huggingface.co/roberta-base) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Besides the Hugging Face configuration fields, this class carries the EasyLM helpers defined below
+    (default `ConfigDict`, device mesh, partition rules, tokenizer and checkpoint/config loading).
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50265):
+            Vocabulary size of the RoBERTa model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`RobertaModel`] or [`TFRobertaModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 514):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 1):
+            The vocabulary size of the `token_type_ids` passed when calling [`RobertaModel`] or [`TFRobertaModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        pad_token_id (`int`, *optional*, defaults to 1):
+            Id of the padding token; forwarded to [`PretrainedConfig`].
+        bos_token_id (`int`, *optional*, defaults to 0):
+            Id of the beginning-of-sequence token; forwarded to [`PretrainedConfig`].
+        eos_token_id (`int`, *optional*, defaults to 2):
+            Id of the end-of-sequence token; forwarded to [`PretrainedConfig`].
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+    Examples:
+    ```python
+    >>> from transformers import RobertaConfig, RobertaModel
+    >>> # Initializing a RoBERTa configuration
+    >>> configuration = RobertaConfig()
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = RobertaModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "roberta"
+    # NOTE: `get_default_config` builds its defaults by introspecting this signature,
+    # so changing a parameter name or default here also changes the default config.
+    def __init__(
+        self,
+        vocab_size=50265,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=514,
+        type_vocab_size=1,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        position_embedding_type="absolute",
+        use_cache=True,
+        classifier_dropout=None,
+        **kwargs
+    ):
+        # The special token ids and any extra keyword arguments are handled by PretrainedConfig.
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+    @classmethod
+    def get_default_config(cls, updates=None):
+        """
+        Build the default configuration from the arguments of `__init__`.
+        Args:
+            updates: optional overrides applied on top of the defaults.
+        Returns: the config object produced by `function_args_to_config`, with `tie_word_embeddings`
+            set to `True` and `updates` applied.
+        """
+        # `classifier_dropout` defaults to None, so its type has to be declared explicitly.
+        none_arg_types = dict(
+            classifier_dropout=float,
+        )
+        config = function_args_to_config(cls.__init__, none_arg_types=none_arg_types)
+        config.tie_word_embeddings = True
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    @staticmethod
+    def get_jax_mesh(axis_dims):
+        """Create the device mesh via `EasyLM.jax_utils.get_jax_mesh` with the axis names ('dp', 'fsdp', 'mp')."""
+        return get_jax_mesh(axis_dims, ('dp', 'fsdp', 'mp'))
+    @staticmethod
+    def get_partition_rules():
+        """
+        Partition rules for the Roberta model.
+        Returns: a tuple of `(parameter name pattern, PartitionSpec)` pairs. The final `'.*'` entry is a
+            catch-all with an empty `PartitionSpec`. How the patterns are matched is up to the caller
+            (presumably first match wins — confirm against the code that consumes these rules).
+        """
+        return (
+            ('embeddings/(position_embeddings|token_type_embeddings)/embedding', PartitionSpec()),
+            ('embeddings/word_embeddings/embedding', PartitionSpec()),
+            ('attention/self/(key|query|value)/kernel', PartitionSpec('fsdp', 'mp')),
+            ('attention/self/(key|query|value)/bias', PartitionSpec()),
+            ('attention/output/dense/kernel', PartitionSpec('mp', 'fsdp')),
+            ('attention/output/dense/bias', PartitionSpec()),
+            ('(LayerNorm|layer_norm)/(bias|scale)', PartitionSpec()),
+            ('intermediate/dense/kernel', PartitionSpec('fsdp', 'mp')),
+            ('intermediate/dense/bias', PartitionSpec('mp')),
+            ('output/dense/kernel', PartitionSpec('mp', 'fsdp')),
+            ('output/dense/bias', PartitionSpec()),
+            ('lm_head/dense/kernel', PartitionSpec()),
+            ('lm_head/dense/bias', PartitionSpec()),
+            ('lm_head/decoder/kernel', PartitionSpec('fsdp', 'mp')),
+            ('lm_head/decoder/bias', PartitionSpec('mp')),
+            ('.*', PartitionSpec()),
+        )
+    @staticmethod
+    def get_weight_decay_exclusions():
+        """Return the parameter name patterns (biases and layer-norm scales) to exclude from weight decay."""
+        return ('bias', 'LayerNorm/scale', 'layer_norm/scale')
+    @staticmethod
+    def rng_keys():
+        """Return the names of the RNG streams used by the model: 'params' and 'dropout'."""
+        return ('params', 'dropout')
+    @staticmethod
+    def get_tokenizer_config(updates=None):
+        """
+        Build the tokenizer configuration.
+        Args:
+            updates: optional overrides applied on top of the defaults.
+        Returns: a `ConfigDict` whose `name` defaults to 'roberta-base'.
+        """
+        config = ConfigDict()
+        config.name = 'roberta-base'
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    @classmethod
+    def get_tokenizer(cls, config):
+        """Load the tokenizer named by `config.name` (after merging `config` into the defaults) with `AutoTokenizer`."""
+        config = cls.get_tokenizer_config(config)
+        return AutoTokenizer.from_pretrained(
+            config.name,
+        )
+    @staticmethod
+    def load_pretrained(name):
+        """
+        Load pretrained masked-LM parameters for the checkpoint `name`.
+        Returns: a frozen dict of the form `{'params': params}`.
+        """
+        # Load on the first CPU device; `FlaxRobertaForMaskedLM` is expected to be defined later in this module.
+        with jax.default_device(jax.devices("cpu")[0]):
+            # With `_do_init=False`, `from_pretrained` returns a pair; element [1] is the parameters.
+            params = FlaxRobertaForMaskedLM.from_pretrained(name, _do_init=False)[1]
+            params = freeze({'params': params})
+        return params
+    @classmethod
+    def load_config(cls, path):
+        """
+        Load a configuration from a `'<type>::<location>'` specifier.
+        Supported types are `pickle` (reads the `'roberta_config'` entry of the pickled object) and
+        `huggingface` (delegates to `from_pretrained`).
+        Raises:
+            ValueError: if the type is not supported, or if `path` contains no `'::'` separator
+                (the two-way unpacking below fails).
+        """
+        load_type, load_path = path.split('::', 1)
+        if load_type == 'pickle':
+            return cls.from_dict(load_pickle(load_path)['roberta_config'])
+        elif load_type == 'huggingface':
+            return cls.from_pretrained(load_path)
+        else:
+            raise ValueError(f'Unsupported load config type: {load_type}')
+"""
+The following code is taken from
+transformers/src/transformers/models/roberta/modeling_flax_roberta.py
+and modified to work with EasyLM.
+"""
+# Module-level logger obtained from the transformers logging utilities.
+logger = logging.get_logger(__name__)
+# Checkpoint and config names for the generated documentation
+# (presumably consumed by the docstring helpers further down the file — confirm).
+_CHECKPOINT_FOR_DOC = "roberta-base"
+_CONFIG_FOR_DOC = "RobertaConfig"
+# Short alias for Flax's rematerialization wrapper, used when gradient checkpointing is enabled.
+remat = nn_partitioning.remat
+def create_position_ids_from_input_ids(input_ids, padding_idx):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    are ignored. This is modified from fairseq's `utils.make_positions`.
+    Args:
+        input_ids: jnp.ndarray
+        padding_idx: int
+    Returns: jnp.ndarray
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = (input_ids != padding_idx).astype("i4")
+    if mask.ndim > 2:
+        mask = mask.reshape((-1, mask.shape[-1]))
+        incremental_indices = jnp.cumsum(mask, axis=1).astype("i4") * mask
+        incremental_indices = incremental_indices.reshape(input_ids.shape)
+    else:
+        incremental_indices = jnp.cumsum(mask, axis=1).astype("i4") * mask
+    return incremental_indices.astype("i4") + padding_idx
+# Shared introductory docstring text for the RoBERTa Flax model classes
+# (presumably attached through `add_start_docstrings` further down the file — confirm).
+ROBERTA_START_DOCSTRING = r"""
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)
+    This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
+    subclass. Use it as a regular Flax linen Module and refer to the Flax documentation for all matter related to
+    general usage and behavior.
+    Finally, this model supports inherent JAX features such as:
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+    Parameters:
+        config ([`RobertaConfig`]): Model configuration class with all the parameters of the
+            model. Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+ROBERTA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`numpy.ndarray` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+        head_mask (`numpy.ndarray` of shape `({0})`, `optional):
+            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbeddings with Bert->Roberta
+class FlaxRobertaEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    def setup(self):
+        self.word_embeddings = nn.Embed(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.position_embeddings = nn.Embed(
+            self.config.max_position_embeddings,
+            self.config.hidden_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.token_type_embeddings = nn.Embed(
+            self.config.type_vocab_size,
+            self.config.hidden_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+    def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True):
+        # Embed
+        inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
+        position_embeds = self.position_embeddings(position_ids.astype("i4"))
+        token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))
+        # Sum all embeddings
+        hidden_states = inputs_embeds + token_type_embeddings + position_embeds
+        # Layer Norm
+        hidden_states = self.LayerNorm(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        return hidden_states
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfAttention with Bert->Roberta
+class FlaxRobertaSelfAttention(nn.Module):
+    config: RobertaConfig
+    causal: bool = False
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    def setup(self):
+        self.head_dim = self.config.hidden_size // self.config.num_attention_heads
+        if self.config.hidden_size % self.config.num_attention_heads != 0:
+            raise ValueError(
+                "`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` "
+                "                   : {self.config.num_attention_heads}"
+            )
+        self.query = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.key = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.value = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        if self.causal:
+            self.causal_mask = make_causal_mask(
+                jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
+            )
+    def _split_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim))
+    def _merge_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,))
+    @nn.compact
+    # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slighly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+        if is_initialized:
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        layer_head_mask,
+        key_value_states: Optional[jnp.array] = None,
+        init_cache: bool = False,
+        deterministic=True,
+        output_attentions: bool = False,
+    ):
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        batch_size = hidden_states.shape[0]
+        # get query proj
+        query_states = self.query(hidden_states)
+        # get key, value proj
+        if is_cross_attention:
+            # cross_attentions
+            key_states = self.key(key_value_states)
+            value_states = self.value(key_value_states)
+        else:
+            # self_attention
+            key_states = self.key(hidden_states)
+            value_states = self.value(hidden_states)
+        query_states = self._split_heads(query_states)
+        key_states = self._split_heads(key_states)
+        value_states = self._split_heads(value_states)
+        # handle cache prepare causal attention mask
+        if self.causal:
+            query_length, key_length = query_states.shape[1], key_states.shape[1]
+            if self.has_variable("cache", "cached_key"):
+                mask_shift = self.variables["cache"]["cache_index"]
+                max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+                causal_mask = lax.dynamic_slice(
+                    self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+                )
+            else:
+                causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+            causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+        # combine masks if needed
+        if attention_mask is not None and self.causal:
+            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+            attention_mask = combine_masks(attention_mask, causal_mask)
+        elif self.causal:
+            attention_mask = causal_mask
+        elif attention_mask is not None:
+            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
+            key_states, value_states, attention_mask = self._concatenate_to_cache(
+                key_states, value_states, query_states, attention_mask
+            )
+        # Convert the boolean attention mask to an attention bias.
+        if attention_mask is not None:
+            # attention mask in the form of attention bias
+            attention_bias = lax.select(
+                attention_mask > 0,
+                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+            )
+        else:
+            attention_bias = None
+        dropout_rng = None
+        if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
+            dropout_rng = self.make_rng("dropout")
+        attn_weights = dot_product_attention_weights(
+            query_states,
+            key_states,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.config.attention_probs_dropout_prob,
+            broadcast_dropout=True,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+        # Mask heads if we want to
+        if layer_head_mask is not None:
+            attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask)
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
+        attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,))
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
+        return outputs
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfOutput with Bert->Roberta
+class FlaxRobertaSelfOutput(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+    def __call__(self, hidden_states, input_tensor, deterministic: bool = True):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertAttention with Bert->Roberta
+class FlaxRobertaAttention(nn.Module):
+    config: RobertaConfig
+    causal: bool = False
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.self = FlaxRobertaSelfAttention(self.config, causal=self.causal, dtype=self.dtype)
+        self.output = FlaxRobertaSelfOutput(self.config, dtype=self.dtype)
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        layer_head_mask,
+        key_value_states=None,
+        init_cache=False,
+        deterministic=True,
+        output_attentions: bool = False,
+    ):
+        # Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length)
+        # FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable
+        # with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length)
+        attn_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            layer_head_mask=layer_head_mask,
+            key_value_states=key_value_states,
+            init_cache=init_cache,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+        )
+        attn_output = attn_outputs[0]
+        hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic)
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (attn_outputs[1],)
+        return outputs
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertIntermediate with Bert->Roberta
+class FlaxRobertaIntermediate(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.intermediate_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.activation = ACT2FN[self.config.hidden_act]
+    def __call__(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOutput with Bert->Roberta
+class FlaxRobertaOutput(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+    def __call__(self, hidden_states, attention_output, deterministic: bool = True):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        hidden_states = self.LayerNorm(hidden_states + attention_output)
+        return hidden_states
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayer with Bert->Roberta
+class FlaxRobertaLayer(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    def setup(self):
+        self.attention = FlaxRobertaAttention(self.config, causal=self.config.is_decoder, dtype=self.dtype)
+        self.intermediate = FlaxRobertaIntermediate(self.config, dtype=self.dtype)
+        self.output = FlaxRobertaOutput(self.config, dtype=self.dtype)
+        if self.config.add_cross_attention:
+            self.crossattention = FlaxRobertaAttention(self.config, causal=False, dtype=self.dtype)
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        layer_head_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+    ):
+        # Self Attention
+        attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            layer_head_mask=layer_head_mask,
+            init_cache=init_cache,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+        )
+        attention_output = attention_outputs[0]
+        # Cross-Attention Block
+        if encoder_hidden_states is not None:
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=layer_head_mask,
+                key_value_states=encoder_hidden_states,
+                deterministic=deterministic,
+                output_attentions=output_attentions,
+            )
+            attention_output = cross_attention_outputs[0]
+        hidden_states = self.intermediate(attention_output)
+        hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic)
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (attention_outputs[1],)
+            if encoder_hidden_states is not None:
+                outputs += (cross_attention_outputs[1],)
+        return outputs
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection with Bert->Roberta
+class FlaxRobertaLayerCollection(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    gradient_checkpointing: bool = False
+    def setup(self):
+        if self.gradient_checkpointing:
+            FlaxRobertaCheckpointLayer = remat(FlaxRobertaLayer, static_argnums=(5, 6, 7))
+            self.layers = [
+                FlaxRobertaCheckpointLayer(self.config, name=str(i), dtype=self.dtype)
+                for i in range(self.config.num_hidden_layers)
+            ]
+        else:
+            self.layers = [
+                FlaxRobertaLayer(self.config, name=str(i), dtype=self.dtype)
+                for i in range(self.config.num_hidden_layers)
+            ]
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        head_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        # Check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            if head_mask.shape[0] != (len(self.layers)):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for                  "
+                    f"       {head_mask.shape[0]}."
+                )
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            layer_outputs = layer(
+                hidden_states,
+                attention_mask,
+                head_mask[i] if head_mask is not None else None,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                init_cache,
+                deterministic,
+                output_attentions,
+            )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert->Roberta
+class FlaxRobertaEncoder(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    gradient_checkpointing: bool = False
+    def setup(self):
+        self.layer = FlaxRobertaLayerCollection(
+            self.config,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        head_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        return self.layer(
+            hidden_states,
+            attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            init_cache=init_cache,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPooler with Bert->Roberta
+class FlaxRobertaPooler(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+    def __call__(self, hidden_states):
+        cls_hidden_state = hidden_states[:, 0]
+        cls_hidden_state = self.dense(cls_hidden_state)
+        return nn.tanh(cls_hidden_state)
+class FlaxRobertaLMHead(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32
+    bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.decoder = nn.Dense(
+            self.config.vocab_size,
+            dtype=self.dtype,
+            use_bias=False,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))
+    def __call__(self, hidden_states, shared_embedding=None):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = ACT2FN["gelu"](hidden_states)
+        hidden_states = self.layer_norm(hidden_states)
+        if shared_embedding is not None:
+            hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
+        else:
+            hidden_states = self.decoder(hidden_states)
+        bias = jnp.asarray(self.bias, self.dtype)
+        hidden_states += bias
+        return hidden_states
+class FlaxRobertaClassificationHead(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(rate=classifier_dropout)
+        self.out_proj = nn.Dense(
+            self.config.num_labels,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+    def __call__(self, hidden_states, deterministic=True):
+        hidden_states = hidden_states[:, 0, :]  # take <s> token (equiv. to [CLS])
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        hidden_states = self.dense(hidden_states)
+        hidden_states = nn.tanh(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        hidden_states = self.out_proj(hidden_states)
+        return hidden_states
+class FlaxRobertaPreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+    config_class = RobertaConfig
+    base_model_prefix = "roberta"
+    module_class: nn.Module = None
+    def __init__(
+        self,
+        config: RobertaConfig,
+        input_shape: Tuple = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        gradient_checkpointing: bool = False,
+        **kwargs,
+    ):
+        module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+    # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.enable_gradient_checkpointing
+    def enable_gradient_checkpointing(self):
+        self._module = self.module_class(
+            config=self.config,
+            dtype=self.dtype,
+            gradient_checkpointing=True,
+        )
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensors
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        token_type_ids = jnp.ones_like(input_ids)
+        position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id)
+        attention_mask = jnp.ones_like(input_ids)
+        head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+        if self.config.add_cross_attention:
+            encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,))
+            encoder_attention_mask = attention_mask
+            module_init_outputs = self.module.init(
+                rngs,
+                input_ids,
+                attention_mask,
+                token_type_ids,
+                position_ids,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                return_dict=False,
+            )
+        else:
+            module_init_outputs = self.module.init(
+                rngs, input_ids, attention_mask, token_type_ids, position_ids, head_mask, return_dict=False
+            )
+        random_params = module_init_outputs["params"]
+        if params is not None:
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+    # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderPreTrainedModel.init_cache
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+        """
+        # init input variables to retrieve cache
+        input_ids = jnp.ones((batch_size, max_length), dtype="i4")
+        attention_mask = jnp.ones_like(input_ids, dtype="i4")
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return unfreeze(init_variables["cache"])
+    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        params: dict = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        past_key_values: dict = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        # init input tensors if not passed
+        if token_type_ids is None:
+            token_type_ids = jnp.zeros_like(input_ids)
+        if position_ids is None:
+            position_ids = create_position_ids_from_input_ids(input_ids, self.config.pad_token_id)
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(input_ids)
+        if head_mask is None:
+            head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+        inputs = {"params": params or self.params}
+        if self.config.add_cross_attention:
+            # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed
+            # down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be
+            # changed by FlaxRobertaAttention module
+            if past_key_values:
+                inputs["cache"] = past_key_values
+                mutable = ["cache"]
+            else:
+                mutable = False
+            outputs = self.module.apply(
+                inputs,
+                jnp.array(input_ids, dtype="i4"),
+                jnp.array(attention_mask, dtype="i4"),
+                token_type_ids=jnp.array(token_type_ids, dtype="i4"),
+                position_ids=jnp.array(position_ids, dtype="i4"),
+                head_mask=jnp.array(head_mask, dtype="i4"),
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                deterministic=not train,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                rngs=rngs,
+                mutable=mutable,
+            )
+            # add updated cache to model output
+            if past_key_values is not None and return_dict:
+                outputs, past_key_values = outputs
+                outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+                return outputs
+            elif past_key_values is not None and not return_dict:
+                outputs, past_key_values = outputs
+                outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
+        else:
+            outputs = self.module.apply(
+                inputs,
+                jnp.array(input_ids, dtype="i4"),
+                jnp.array(attention_mask, dtype="i4"),
+                token_type_ids=jnp.array(token_type_ids, dtype="i4"),
+                position_ids=jnp.array(position_ids, dtype="i4"),
+                head_mask=jnp.array(head_mask, dtype="i4"),
+                deterministic=not train,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                rngs=rngs,
+            )
+        return outputs
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertModule with Bert->Roberta
+class FlaxRobertaModule(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    add_pooling_layer: bool = True
+    gradient_checkpointing: bool = False
+    def setup(self):
+        self.embeddings = FlaxRobertaEmbeddings(self.config, dtype=self.dtype)
+        self.encoder = FlaxRobertaEncoder(
+            self.config,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.pooler = FlaxRobertaPooler(self.config, dtype=self.dtype)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids: Optional[jnp.ndarray] = None,
+        position_ids: Optional[jnp.ndarray] = None,
+        head_mask: Optional[jnp.ndarray] = None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # make sure `token_type_ids` is correctly initialized when not passed
+        if token_type_ids is None:
+            token_type_ids = jnp.zeros_like(input_ids)
+        # make sure `position_ids` is correctly initialized when not passed
+        if position_ids is None:
+            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+        hidden_states = self.embeddings(
+            input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic
+        )
+        outputs = self.encoder(
+            hidden_states,
+            attention_mask,
+            head_mask=head_mask,
+            deterministic=deterministic,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        pooled = self.pooler(hidden_states) if self.add_pooling_layer else None
+        if not return_dict:
+            # if pooled is None, don't return it
+            if pooled is None:
+                return (hidden_states,) + outputs[1:]
+            return (hidden_states, pooled) + outputs[1:]
+        return FlaxBaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            pooler_output=pooled,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+@add_start_docstrings(
+    "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
+    ROBERTA_START_DOCSTRING,
+)
+class FlaxRobertaModel(FlaxRobertaPreTrainedModel):
+    # Flax module class that the inherited constructor instantiates.
+    module_class = FlaxRobertaModule
+# NOTE(review): presumably extends the `__call__` docstring with a usage sample for the given output type.
+append_call_sample_docstring(FlaxRobertaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC)
+class FlaxRobertaForMaskedLMModule(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+    def setup(self):
+        self.roberta = FlaxRobertaModule(
+            config=self.config,
+            add_pooling_layer=False,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.lm_head = FlaxRobertaLMHead(config=self.config, dtype=self.dtype)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.roberta(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        if self.config.tie_word_embeddings:
+            shared_embedding = self.roberta.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
+        else:
+            shared_embedding = None
+        # Compute the prediction scores
+        logits = self.lm_head(hidden_states, shared_embedding=shared_embedding)
+        if not return_dict:
+            return (logits,) + outputs[1:]
+        return FlaxMaskedLMOutput(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING)
+class FlaxRobertaForMaskedLM(FlaxRobertaPreTrainedModel):
+    module_class = FlaxRobertaForMaskedLMModule
+append_call_sample_docstring(
+    FlaxRobertaForMaskedLM,
+    _CHECKPOINT_FOR_DOC,
+    FlaxBaseModelOutputWithPooling,
+    _CONFIG_FOR_DOC,
+    mask="<mask>",
+)
+class FlaxRobertaForSequenceClassificationModule(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+    def setup(self):
+        self.roberta = FlaxRobertaModule(
+            config=self.config,
+            dtype=self.dtype,
+            add_pooling_layer=False,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.classifier = FlaxRobertaClassificationHead(config=self.config, dtype=self.dtype)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.roberta(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output, deterministic=deterministic)
+        if not return_dict:
+            return (logits,) + outputs[1:]
+        return FlaxSequenceClassifierOutput(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+@add_start_docstrings(
+    """
+    Roberta Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """,
+    ROBERTA_START_DOCSTRING,
+)
+class FlaxRobertaForSequenceClassification(FlaxRobertaPreTrainedModel):
+    # Flax module class that the inherited constructor instantiates.
+    module_class = FlaxRobertaForSequenceClassificationModule
+# NOTE(review): presumably extends the `__call__` docstring with a usage sample for the given output type.
+append_call_sample_docstring(
+    FlaxRobertaForSequenceClassification,
+    _CHECKPOINT_FOR_DOC,
+    FlaxSequenceClassifierOutput,
+    _CONFIG_FOR_DOC,
+)
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMultipleChoiceModule with Bert->Roberta, with self.bert->self.roberta
+class FlaxRobertaForMultipleChoiceModule(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+    def setup(self):
+        self.roberta = FlaxRobertaModule(
+            config=self.config,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+        self.classifier = nn.Dense(1, dtype=self.dtype)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        num_choices = input_ids.shape[1]
+        input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
+        attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
+        token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
+        position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None
+        # Model
+        outputs = self.roberta(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        pooled_output = outputs[1]
+        pooled_output = self.dropout(pooled_output, deterministic=deterministic)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.reshape(-1, num_choices)
+        if not return_dict:
+            return (reshaped_logits,) + outputs[2:]
+        return FlaxMultipleChoiceModelOutput(
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+@add_start_docstrings(
+    """
+    Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    ROBERTA_START_DOCSTRING,
+)
+class FlaxRobertaForMultipleChoice(FlaxRobertaPreTrainedModel):
+    # Flax module class that the inherited constructor instantiates.
+    module_class = FlaxRobertaForMultipleChoiceModule
+# Inputs of this model carry an extra `num_choices` dimension, so the inputs docstring is re-formatted
+# with the matching shape string.
+overwrite_call_docstring(
+    FlaxRobertaForMultipleChoice, ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
+)
+# NOTE(review): presumably extends the `__call__` docstring with a usage sample for the given output type.
+append_call_sample_docstring(
+    FlaxRobertaForMultipleChoice,
+    _CHECKPOINT_FOR_DOC,
+    FlaxMultipleChoiceModelOutput,
+    _CONFIG_FOR_DOC,
+)
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForTokenClassificationModule with Bert->Roberta, with self.bert->self.roberta
+class FlaxRobertaForTokenClassificationModule(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+    def setup(self):
+        self.roberta = FlaxRobertaModule(
+            config=self.config,
+            dtype=self.dtype,
+            add_pooling_layer=False,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(rate=classifier_dropout)
+        self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.roberta(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        logits = self.classifier(hidden_states)
+        if not return_dict:
+            return (logits,) + outputs[1:]
+        return FlaxTokenClassifierOutput(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+@add_start_docstrings(
+    """
+    Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    ROBERTA_START_DOCSTRING,
+)
+class FlaxRobertaForTokenClassification(FlaxRobertaPreTrainedModel):
+    # Flax module class that the inherited constructor instantiates.
+    module_class = FlaxRobertaForTokenClassificationModule
+# NOTE(review): presumably extends the `__call__` docstring with a usage sample for the given output type.
+append_call_sample_docstring(
+    FlaxRobertaForTokenClassification,
+    _CHECKPOINT_FOR_DOC,
+    FlaxTokenClassifierOutput,
+    _CONFIG_FOR_DOC,
+)
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForQuestionAnsweringModule with Bert->Roberta, with self.bert->self.roberta
+class FlaxRobertaForQuestionAnsweringModule(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+    def setup(self):
+        self.roberta = FlaxRobertaModule(
+            config=self.config,
+            dtype=self.dtype,
+            add_pooling_layer=False,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.roberta(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        logits = self.qa_outputs(hidden_states)
+        start_logits, end_logits = logits.split(self.config.num_labels, axis=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+        if not return_dict:
+            return (start_logits, end_logits) + outputs[1:]
+        return FlaxQuestionAnsweringModelOutput(
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+@add_start_docstrings(
+    """
+    Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    ROBERTA_START_DOCSTRING,
+)
+class FlaxRobertaForQuestionAnswering(FlaxRobertaPreTrainedModel):
+    # The only override in this class: it selects the question-answering module.
+    # How `module_class` is consumed is defined by FlaxRobertaPreTrainedModel,
+    # which lies outside this chunk.
+    module_class = FlaxRobertaForQuestionAnsweringModule
+# Register documentation metadata for FlaxRobertaForQuestionAnswering using the
+# module-level checkpoint/config doc constants and the model's output class.
+# NOTE(review): presumably this appends a usage example to the model's `__call__`
+# docstring; the helper is defined outside this chunk, so confirm there.
+append_call_sample_docstring(
+    FlaxRobertaForQuestionAnswering,
+    _CHECKPOINT_FOR_DOC,
+    FlaxQuestionAnsweringModelOutput,
+    _CONFIG_FOR_DOC,
+)
+class FlaxRobertaForCausalLMModule(nn.Module):
+    config: RobertaConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+    def setup(self):
+        self.roberta = FlaxRobertaModule(
+            config=self.config,
+            add_pooling_layer=False,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.lm_head = FlaxRobertaLMHead(config=self.config, dtype=self.dtype)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        token_type_ids: Optional[jnp.ndarray] = None,
+        head_mask: Optional[jnp.ndarray] = None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.roberta(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            init_cache=init_cache,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        if self.config.tie_word_embeddings:
+            shared_embedding = self.roberta.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
+        else:
+            shared_embedding = None
+        # Compute the prediction scores
+        logits = self.lm_head(hidden_states, shared_embedding=shared_embedding)
+        if not return_dict:
+            return (logits,) + outputs[1:]
+        return FlaxCausalLMOutputWithCrossAttentions(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+@add_start_docstrings(
+    """
+    Roberta Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
+    autoregressive tasks.
+    """,
+    ROBERTA_START_DOCSTRING,
+)
+class FlaxRobertaForCausalLM(FlaxRobertaPreTrainedModel):
+    module_class = FlaxRobertaForCausalLMModule
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jnp.DeviceArray] = None):
+        # initializing the cache
+        batch_size, seq_length = input_ids.shape
+        past_key_values = self.init_cache(batch_size, max_length)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since the decoder uses a causal mask, those positions are masked anyway.
+        # Thus, we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is not None:
+            position_ids = attention_mask.cumsum(axis=-1) - 1
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+        return {
+            "past_key_values": past_key_values,
+            "attention_mask": extended_attention_mask,
+            "position_ids": position_ids,
+        }
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
+        return model_kwargs
+# Register documentation metadata for FlaxRobertaForCausalLM using the
+# module-level checkpoint/config doc constants and the model's output class.
+# NOTE(review): presumably this appends a usage example to the model's `__call__`
+# docstring; the helper is defined outside this chunk, so confirm there.
+append_call_sample_docstring(
+    FlaxRobertaForCausalLM,
+    _CHECKPOINT_FOR_DOC,
+    FlaxCausalLMOutputWithCrossAttentions,
+    _CONFIG_FOR_DOC,
+)

EasyLM/models/roberta/roberta_train.py ADDED Viewed

	@@ -0,0 +1,307 @@

+import dataclasses
+import pprint
+from functools import partial
+import re
+from tqdm import tqdm, trange
+import numpy as np
+import mlxu
+import jax
+import jax.numpy as jnp
+from jax.experimental.pjit import pjit, with_sharding_constraint
+from jax.sharding import PartitionSpec as PS
+from flax.training.train_state import TrainState
+from EasyLM.data import DatasetFactory
+from EasyLM.checkpoint import StreamingCheckpointer
+from EasyLM.optimizers import OptimizerFactory
+from EasyLM.jax_utils import (
+    JaxRNG, JaxDistributedConfig, next_rng, match_partition_rules, get_float_dtype_by_name,
+    cross_entropy_loss_and_accuracy, named_tree_map, global_norm,
+    set_random_seed, average_metrics, get_weight_decay_mask,
+    make_shard_and_gather_fns, tree_apply
+)
+from EasyLM.models.roberta.roberta_model import (
+    RobertaConfig, FlaxRobertaForMaskedLMModule
+)
+# Command-line flags for RoBERTa masked-LM training, with their default values.
+# The comments below describe how `main` in this file uses each flag.
+FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
+    # Passed to set_random_seed.
+    seed=42,
+    # Passed to RobertaConfig.get_jax_mesh to build the device mesh.
+    mesh_dim='-1,1,1',
+    # Resolved with get_float_dtype_by_name and used as the model dtype.
+    dtype='fp32',
+    # Per-token probability of being selected for corruption and scored in the loss.
+    mask_token_probability=0.15,
+    # Upper bound of the training step range.
+    total_steps=10000,
+    # If non-empty, the model config is loaded from here instead of built from `roberta`.
+    load_roberta_config='',
+    # If non-empty, a Python expression that is eval()'d and merged into the model config.
+    update_roberta_config='',
+    # If non-empty, a "<type>::<path>" string; type "huggingface" loads pretrained params.
+    load_checkpoint='',
+    # If non-empty, path of a pickle holding a dataset state dict to restore.
+    load_dataset_state='',
+    # Metrics are logged (and evaluation runs) every `log_freq` steps.
+    log_freq=50,
+    # Regular checkpoint interval in steps; 0 disables regular, initial and final saves.
+    save_model_freq=0,
+    # Milestone checkpoint interval in steps; 0 disables milestone saves.
+    save_milestone_freq=0,
+    # Number of eval batches per logging step; 0 disables evaluation.
+    eval_steps=0,
+    tokenizer=RobertaConfig.get_tokenizer_config(),
+    train_dataset=DatasetFactory.get_default_config(),
+    eval_dataset=DatasetFactory.get_default_config(),
+    optimizer=OptimizerFactory.get_default_config(),
+    checkpointer=StreamingCheckpointer.get_default_config(),
+    roberta=RobertaConfig.get_default_config(),
+    logger=mlxu.WandBLogger.get_default_config(),
+    # When false, only the process with jax.process_index() == 0 enables the logger.
+    log_all_worker=False,
+    jax_distributed=JaxDistributedConfig.get_default_config(),
+)
+def main(argv):
+    JaxDistributedConfig.initialize(FLAGS.jax_distributed)
+    variant = mlxu.get_user_flags(FLAGS, FLAGS_DEF)
+    flags_config_dict = mlxu.user_flags_to_config_dict(FLAGS, FLAGS_DEF)
+    logger = mlxu.WandBLogger(
+        config=FLAGS.logger,
+        variant=variant,
+        enable=FLAGS.log_all_worker or (jax.process_index() == 0),
+    )
+    set_random_seed(FLAGS.seed)
+    tokenizer = RobertaConfig.get_tokenizer(FLAGS.tokenizer)
+    dataset = DatasetFactory.load_dataset(FLAGS.train_dataset, tokenizer)
+    if FLAGS.load_dataset_state != '':
+        dataset.load_state_dict(mlxu.load_pickle(FLAGS.load_dataset_state))
+    if FLAGS.eval_steps > 0:
+        eval_dataset = DatasetFactory.load_dataset(
+            FLAGS.eval_dataset, dataset.tokenizer
+        )
+        eval_iterator = iter(eval_dataset)
+    seq_length = dataset.seq_length
+    if FLAGS.load_roberta_config != '':
+        roberta_config = RobertaConfig.load_config(FLAGS.load_roberta_config)
+    else:
+        roberta_config = RobertaConfig(**FLAGS.roberta)
+    if FLAGS.update_roberta_config != '':
+        roberta_config.update(dict(eval(FLAGS.update_roberta_config)))
+    roberta_config.update(dict(
+        bos_token_id=dataset.tokenizer.bos_token_id,
+        eos_token_id=dataset.tokenizer.eos_token_id,
+        pad_token_id=dataset.tokenizer.pad_token_id,
+        vocab_size=dataset.vocab_size,
+    ))
+    model = FlaxRobertaForMaskedLMModule(
+        roberta_config, dtype=get_float_dtype_by_name(FLAGS.dtype)
+    )
+    optimizer, optimizer_info = OptimizerFactory.get_optimizer(
+        FLAGS.optimizer,
+        get_weight_decay_mask(RobertaConfig.get_weight_decay_exclusions()),
+    )
+    def create_trainstate_from_params(params):
+        return TrainState.create(params=params, tx=optimizer, apply_fn=None)
+    def init_fn(rng):
+        rng_generator = JaxRNG(rng)
+        params = model.init(
+            input_ids=jnp.zeros((4, seq_length), dtype=jnp.int32),
+            position_ids=jnp.zeros((4, seq_length), dtype=jnp.int32),
+            attention_mask=jnp.ones((4, seq_length), dtype=jnp.int32),
+            token_type_ids=None,
+            head_mask=None,
+            rngs=rng_generator(roberta_config.rng_keys()),
+        )
+        return TrainState.create(params=params, tx=optimizer, apply_fn=None)
+    def train_step(train_state, rng, batch):
+        rng_generator = JaxRNG(rng)
+        tokens = with_sharding_constraint(batch['target_tokens'], PS(('dp', 'fsdp')))
+        def loss_and_accuracy(params):
+            altered_tokens = jax.random.uniform(
+                rng_generator(), shape=tokens.shape
+            ) < FLAGS.mask_token_probability
+            random_uniform = jax.random.uniform(rng_generator(), shape=tokens.shape)
+            altered_by_mask = altered_tokens & (random_uniform < 0.8)
+            altered_by_random = altered_tokens & (random_uniform >= 0.8) & (random_uniform < 0.9)
+            inputs = jnp.where(altered_by_mask, dataset.tokenizer.mask_token_id, tokens)
+            random_tokens = jax.random.randint(
+                rng_generator(), shape=tokens.shape, minval=0, maxval=dataset.vocab_size
+            )
+            inputs = jnp.where(altered_by_random, random_tokens, inputs)
+            logits = model.apply(
+                params, inputs,
+                attention_mask=jnp.ones_like(inputs),
+                token_type_ids=None,
+                position_ids=None,
+                head_mask=None,
+                deterministic=False,
+                rngs=rng_generator(roberta_config.rng_keys()),
+            ).logits
+            return cross_entropy_loss_and_accuracy(logits, tokens, valid=altered_tokens)
+        grad_fn = jax.value_and_grad(loss_and_accuracy, has_aux=True)
+        (loss, accuracy), grads = grad_fn(train_state.params)
+        train_state = train_state.apply_gradients(grads=grads)
+        metrics = dict(
+            loss=loss,
+            accuracy=accuracy,
+            learning_rate=optimizer_info['learning_rate_schedule'](train_state.step),
+            gradient_norm=global_norm(grads),
+            param_norm=global_norm(train_state.params),
+        )
+        return train_state, rng_generator(), metrics
+    def eval_step(train_state, rng, batch):
+        rng_generator = JaxRNG(rng)
+        tokens = with_sharding_constraint(batch['target_tokens'], PS(('dp', 'fsdp')))
+        altered_tokens = jax.random.uniform(
+            rng_generator(), shape=tokens.shape
+        ) < FLAGS.mask_token_probability
+        random_uniform = jax.random.uniform(rng_generator(), shape=tokens.shape)
+        altered_by_mask = altered_tokens & (random_uniform < 0.8)
+        altered_by_random = altered_tokens & (random_uniform >= 0.8) & (random_uniform < 0.9)
+        inputs = jnp.where(altered_by_mask, dataset.tokenizer.mask_token_id, tokens)
+        random_tokens = jax.random.randint(
+            rng_generator(), shape=tokens.shape, minval=0, maxval=dataset.vocab_size
+        )
+        inputs = jnp.where(altered_by_random, random_tokens, inputs)
+        logits = model.apply(
+            train_state.params, inputs,
+            attention_mask=jnp.ones_like(inputs),
+            token_type_ids=None,
+            position_ids=None,
+            head_mask=None,
+            deterministic=False,
+            rngs=rng_generator(roberta_config.rng_keys()),
+        ).logits
+        loss, accuracy = cross_entropy_loss_and_accuracy(logits, tokens, valid=altered_tokens)
+        metrics = dict(
+            eval_loss=loss,
+            eval_accuracy=accuracy,
+        )
+        return rng_generator(), metrics
+    train_state_shapes = jax.eval_shape(init_fn, next_rng())
+    train_state_partition = match_partition_rules(
+        RobertaConfig.get_partition_rules(), train_state_shapes
+    )
+    shard_fns, gather_fns = make_shard_and_gather_fns(
+        train_state_partition, train_state_shapes
+    )
+    checkpointer = StreamingCheckpointer(
+        FLAGS.checkpointer, logger.output_dir,
+        enable=jax.process_index() == 0
+    )
+    sharded_init_fn = pjit(
+        init_fn,
+        in_shardings=PS(),
+        out_shardings=train_state_partition
+    )
+    sharded_create_trainstate_from_params = pjit(
+        create_trainstate_from_params,
+        in_shardings=(train_state_partition.params, ),
+        out_shardings=train_state_partition,
+        donate_argnums=(0, ),
+    )
+    sharded_train_step = pjit(
+        train_step,
+        in_shardings=(train_state_partition, PS(), PS()),
+        out_shardings=(train_state_partition, PS(), PS()),
+        donate_argnums=(0, 1),
+    )
+    sharded_eval_step = pjit(
+        eval_step,
+        in_shardings=(train_state_partition, PS(), PS()),
+        out_shardings=(PS(), PS()),
+        donate_argnums=(1,),
+    )
+    def save_checkpoint(train_state, milestone=False):
+        step = int(jax.device_get(train_state.step))
+        metadata = dict(
+            step=step,
+            variant=variant,
+            flags=flags_config_dict,
+            roberta_config=roberta_config.to_dict(),
+        )
+        checkpointer.save_all(
+            train_state=train_state,
+            gather_fns=gather_fns,
+            metadata=metadata,
+            dataset=dataset.get_state_dict(),
+            milestone=milestone,
+        )
+    mesh = RobertaConfig.get_jax_mesh(FLAGS.mesh_dim)
+    with mesh:
+        train_state, restored_params = None, None
+        if FLAGS.load_checkpoint != '':
+            load_type, load_path = FLAGS.load_checkpoint.split('::', 1)
+            if load_type == 'huggingface':
+                restored_params = tree_apply(
+                    shard_fns.params, roberta_config.load_pretrained(load_path)
+                )
+                train_state = None
+            else:
+                train_state, restored_params = checkpointer.load_trainstate_checkpoint(
+                    FLAGS.load_checkpoint, train_state_shapes, shard_fns
+                )
+        if train_state is None and restored_params is None:
+            # Initialize from scratch
+            train_state = sharded_init_fn(next_rng())
+        elif train_state is None and restored_params is not None:
+            # Restore from params but initialize train_state
+            train_state = sharded_create_trainstate_from_params(restored_params)
+            del restored_params
+        start_step = int(jax.device_get(train_state.step))
+        if FLAGS.save_model_freq > 0:
+            save_checkpoint(train_state)
+        sharded_rng = next_rng()
+        step_counter = trange(start_step, FLAGS.total_steps, ncols=0)
+        for step, (batch, dataset_metrics) in zip(step_counter, dataset):
+            train_state, sharded_rng, metrics = sharded_train_step(
+                train_state, sharded_rng, batch
+            )
+            if step % FLAGS.log_freq == 0:
+                if FLAGS.eval_steps > 0:
+                    eval_metric_list = []
+                    for _ in range(FLAGS.eval_steps):
+                        eval_batch, _ = next(eval_iterator)
+                        sharded_rng, eval_metrics = sharded_eval_step(
+                            train_state, sharded_rng, eval_batch
+                        )
+                        eval_metric_list.append(eval_metrics)
+                    metrics.update(average_metrics(eval_metric_list))
+                log_metrics = {"step": step}
+                log_metrics.update(metrics)
+                log_metrics.update(dataset_metrics)
+                log_metrics = jax.device_get(log_metrics)
+                logger.log(log_metrics)
+                tqdm.write("\n" + pprint.pformat(log_metrics) + "\n")
+            if FLAGS.save_milestone_freq > 0 and (step + 1) % FLAGS.save_milestone_freq == 0:
+                save_checkpoint(train_state, milestone=True)
+            elif FLAGS.save_model_freq > 0 and (step + 1) % FLAGS.save_model_freq == 0:
+                save_checkpoint(train_state)
+        if FLAGS.save_model_freq > 0:
+            save_checkpoint(train_state)
+# Script entry point: hand `main` to mlxu.run only when executed directly.
+# NOTE(review): presumably mlxu.run parses the flags before calling main — confirm.
+if __name__ == "__main__":
+    mlxu.run(main)

EasyLM/optimizers.py ADDED Viewed

	@@ -0,0 +1,302 @@

+import os
+import time
+from typing import Any, Mapping, Text, Tuple, Union, NamedTuple
+from functools import partial
+import re
+import dataclasses
+import random
+from ml_collections.config_dict import config_dict
+from ml_collections import ConfigDict
+import jax
+import jax.numpy as jnp
+import numpy as np
+from absl import logging
+import optax
+from EasyLM.jax_utils import float_to_dtype
+class OptimizerFactory(object):
+    """ Configurable optax optimizer factory. """
+    def __init__(self):
+        raise NotImplementedError
+    @staticmethod
+    def get_default_config(updates=None):
+        config = ConfigDict()
+        config.accumulate_gradient_steps = 1
+        config.type = 'adamw'
+        config.palm_optimizer = PalmOptimizerFactory.get_default_config()
+        config.adamw_optimizer = AdamWOptimizerFactory.get_default_config()
+        config.lion_optimizer = LionOptimizerFactory.get_default_config()
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    @classmethod
+    def get_optimizer(cls, config, weight_decay_mask=None):
+        config = cls.get_default_config(config)
+        if config.type == 'palm':
+            optimizer, optimizer_info = PalmOptimizerFactory.get_optimizer(
+                config.palm_optimizer, weight_decay_mask
+            )
+        elif config.type == 'adamw':
+            optimizer, optimizer_info = AdamWOptimizerFactory.get_optimizer(
+                config.adamw_optimizer, weight_decay_mask
+            )
+        elif config.type == 'lion':
+            optimizer, optimizer_info = LionOptimizerFactory.get_optimizer(
+                config.lion_optimizer, weight_decay_mask
+            )
+        else:
+            raise ValueError(f'Unknown optimizer type: {config.type}')
+        if config.accumulate_gradient_steps > 1:
+            optimizer = optax.MultiSteps(
+                optimizer, config.accumulate_gradient_steps
+            )
+        return optimizer, optimizer_info
+class PalmOptimizerFactory(object):
+    """ PaLM optimizer factory. This optimizer implements the optimizer
+        described in the PaLM paper: https://arxiv.org/abs/2204.02311
+    """
+    def __init__(self):
+        raise NotImplementedError
+    @staticmethod
+    def get_default_config(updates=None):
+        config = ConfigDict()
+        config.lr = 0.01
+        config.lr_warmup_steps = 10000
+        config.b1 = 0.9
+        config.b2 = 0.99
+        config.clip_gradient = 1.0
+        config.weight_decay = 1e-4
+        config.bf16_momentum = False
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    @classmethod
+    def get_optimizer(cls, config, weight_decay_mask=None):
+        config = cls.get_default_config(config)
+        def learning_rate_schedule(step):
+            multiplier = config.lr / 0.01
+            return multiplier / jnp.sqrt(jnp.maximum(step, config.lr_warmup_steps))
+        def weight_decay_schedule(step):
+            multiplier = config.weight_decay / 1e-4
+            return -multiplier * jnp.square(learning_rate_schedule(step))
+        optimizer_info = dict(
+            learning_rate_schedule=learning_rate_schedule,
+            weight_decay_schedule=weight_decay_schedule,
+        )
+        optimizer = optax.chain(
+            optax.clip_by_global_norm(config.clip_gradient),
+            optax.adafactor(
+                learning_rate=learning_rate_schedule,
+                multiply_by_parameter_scale=True,
+                momentum=config.b1,
+                decay_rate=config.b2,
+                factored=False,
+                clipping_threshold=None,
+                dtype_momentum=jnp.bfloat16 if config.bf16_momentum else jnp.float32,
+            ),
+            optax_add_scheduled_weight_decay(
+                weight_decay_schedule, weight_decay_mask
+            )
+        )
+        return optimizer, optimizer_info
+class AdamWOptimizerFactory(object):
+    """ AdamW optimizer with cosine schedule. """
+    def __init__(self):
+        raise NotImplementedError
+    @staticmethod
+    def get_default_config(updates=None):
+        config = ConfigDict()
+        config.init_lr = 0.0
+        config.end_lr = 0.001
+        config.lr = 0.01
+        config.lr_warmup_steps = 2000
+        config.lr_decay_steps = 500000
+        config.b1 = 0.9
+        config.b2 = 0.95
+        config.clip_gradient = 1.0
+        config.weight_decay = 1e-4
+        config.bf16_momentum = False
+        config.multiply_by_parameter_scale = False
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    @classmethod
+    def get_optimizer(cls, config, weight_decay_mask=None):
+        config = cls.get_default_config(config)
+        learning_rate_schedule = optax.warmup_cosine_decay_schedule(
+            init_value=config.init_lr,
+            peak_value=config.lr,
+            warmup_steps=config.lr_warmup_steps,
+            decay_steps=config.lr_decay_steps,
+            end_value=config.end_lr,
+        )
+        optimizer_info = dict(
+            learning_rate_schedule=learning_rate_schedule,
+        )
+        if config.multiply_by_parameter_scale:
+            optimizer = optax.chain(
+                optax.clip_by_global_norm(config.clip_gradient),
+                optax.adafactor(
+                    learning_rate=learning_rate_schedule,
+                    multiply_by_parameter_scale=True,
+                    momentum=config.b1,
+                    decay_rate=config.b2,
+                    factored=False,
+                    clipping_threshold=None,
+                    dtype_momentum=jnp.bfloat16 if config.bf16_momentum else jnp.float32,
+                ),
+                optax_add_scheduled_weight_decay(
+                    lambda step: -learning_rate_schedule(step) * config.weight_decay,
+                    weight_decay_mask
+                )
+            )
+        else:
+            optimizer = optax.chain(
+                optax.clip_by_global_norm(config.clip_gradient),
+                optax.adamw(
+                    learning_rate=learning_rate_schedule,
+                    weight_decay=config.weight_decay,
+                    b1=config.b1,
+                    b2=config.b2,
+                    mask=weight_decay_mask,
+                    mu_dtype=jnp.bfloat16 if config.bf16_momentum else jnp.float32,
+                ),
+            )
+        return optimizer, optimizer_info
+class LionOptimizerFactory(object):
+    """ Lion optimizer with cosine schedule. """
+    def __init__(self):
+        raise NotImplementedError
+    @staticmethod
+    def get_default_config(updates=None):
+        config = ConfigDict()
+        config.init_lr = 0.0
+        config.end_lr = 0.0001
+        config.lr = 0.001
+        config.lr_warmup_steps = 2000
+        config.lr_decay_steps = 500000
+        config.b1 = 0.9
+        config.b2 = 0.98
+        config.clip_gradient = 1.0
+        config.weight_decay = 1e-3
+        config.bf16_momentum = False
+        config.lr_schedule_type = "warmup_cosine_decay_schedule"
+        config.lr_decay_rate = 0.98
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    @classmethod
+    def get_optimizer(cls, config, weight_decay_mask=None):
+        config = cls.get_default_config(config)
+        if config.lr_schedule_type == "warmup_cosine_decay_schedule":
+            learning_rate_schedule = optax.warmup_cosine_decay_schedule(
+                init_value=config.init_lr,
+                peak_value=config.lr,
+                warmup_steps=config.lr_warmup_steps,
+                decay_steps=config.lr_decay_steps,
+                end_value=config.end_lr,
+            )
+        elif config.lr_schedule_type == "warmup_constant":
+            learning_rate_schedule = optax.join_schedules(
+                [
+                    optax.linear_schedule(
+                        init_value=config.init_lr,
+                        end_value=config.lr,
+                        transition_steps=config.lr_warmup_steps,
+                    ),
+                    optax.constant_schedule(config.lr),
+                ],
+                [config.lr_warmup_steps],
+            )
+        elif config.lr_schedule_type == "exponential_decay":
+            learning_rate_schedule = optax.exponential_decay(
+                        init_value=config.lr,
+                        transition_steps=config.lr_decay_steps,
+                        decay_rate=config.lr_decay_rate,
+                        transition_begin=0,
+                        staircase=False,
+                        end_value=config.end_lr,
+            )
+        else:
+            raise ValueError('config.lr_schedule_type must be "warmup_cosine_decay_schedule", "warmup_constant", or "exponential_decay"')
+        optimizer_info = dict(
+            learning_rate_schedule=learning_rate_schedule,
+        )
+        optimizer = optax.chain(
+            optax.clip_by_global_norm(config.clip_gradient),
+            optax.lion(
+                learning_rate=learning_rate_schedule,
+                weight_decay=config.weight_decay,
+                b1=config.b1,
+                b2=config.b2,
+                mask=weight_decay_mask,
+                mu_dtype=jnp.bfloat16 if config.bf16_momentum else jnp.float32,
+            ),
+        )
+        return optimizer, optimizer_info
+class OptaxScheduledWeightDecayState(NamedTuple):
+    """ State carried by the scheduled weight-decay transformation in this file. """
+    # Step counter: created as an int32 scalar zero and incremented once per
+    # update by optax_add_scheduled_weight_decay; it is the schedule's input.
+    count: jax.Array
+def optax_add_scheduled_weight_decay(schedule_fn, mask=None):
+    """ Apply weight decay with schedule. """
+    def init_fn(params):
+        del params
+        return OptaxScheduledWeightDecayState(count=jnp.zeros([], jnp.int32))
+    def update_fn(updates, state, params):
+        if params is None:
+            raise ValueError('Params cannot be None for weight decay!')
+        weight_decay = schedule_fn(state.count)
+        updates = jax.tree_util.tree_map(
+            lambda g, p: g + weight_decay * p, updates, params
+        )
+        return updates, OptaxScheduledWeightDecayState(
+            count=optax.safe_int32_increment(state.count)
+        )
+    if mask is not None:
+        return optax.masked(optax.GradientTransformation(init_fn, update_fn), mask)
+    return optax.GradientTransformation(init_fn, update_fn)

EasyLM/scripts/__init__.py ADDED Viewed

File without changes

EasyLM/scripts/benchmark_attention.py ADDED Viewed

	@@ -0,0 +1,150 @@

+from functools import partial
+from time import time
+import os
+import numpy as np
+import jax
+import jax.flatten_util
+import jax.numpy as jnp
+import mlxu
+from EasyLM.bpt import blockwise_attn
+from EasyLM.jax_utils import (
+    get_float_dtype_by_name, set_random_seed, next_rng, JaxRNG
+)
+# Command-line flags for the attention benchmark, with their default values.
+# The comments below describe how `main` in this file uses each flag.
+FLAGS, _ = mlxu.define_flags_with_default(
+    # Passed to set_random_seed.
+    seed=42,
+    # Resolved with get_float_dtype_by_name for the random inputs and blockwise attention.
+    dtype='fp32',
+    # Inputs are shaped (batch_size, seq_len, n_heads, embed_dim // n_heads).
+    embed_dim=2048,
+    n_heads=16,
+    # Sequence length used when timing the reference attention.
+    ref_attn_seq_len=2048,
+    # Sequence length used when timing the blockwise attention.
+    eff_attn_seq_len=16384,
+    batch_size=1,
+    # Chunk sizes forwarded to blockwise_attn.
+    query_chunk_size=2048,
+    key_chunk_size=2048,
+    # Untimed iterations run before each timed measurement.
+    warmup_steps=40,
+    # Timed iterations per measurement.
+    steps=200,
+)
+def main(argv):
+    def random_kqv(rng_key, seq_len):
+        rng_generator = JaxRNG(rng_key)
+        kqv = []
+        for i in range(3):
+            kqv.append(
+                jax.random.normal(
+                    rng_generator(),
+                    (FLAGS.batch_size, seq_len, FLAGS.n_heads, FLAGS.embed_dim // FLAGS.n_heads),
+                    dtype=get_float_dtype_by_name(FLAGS.dtype)
+                )
+            )
+        return tuple(kqv)
+    def reference_attn(query, key, value):
+        dtype = get_float_dtype_by_name(FLAGS.dtype)
+        query = query / jnp.sqrt(query.shape[-1]).astype(dtype)
+        logits = jnp.einsum("bqhc,bkhc->bhqk", query, key)
+        mask_value = jnp.finfo(logits.dtype).min
+        _, q_seq_len, _, _ = query.shape
+        _, kv_seq_len, _, _ = key.shape
+        mask_shape = (q_seq_len, kv_seq_len)
+        row_ids = jax.lax.broadcasted_iota(jnp.int32, mask_shape, 0)
+        col_ids = jax.lax.broadcasted_iota(jnp.int32, mask_shape, 1)
+        causal_mask = (row_ids < col_ids)[None, None, :, :]
+        logits = logits + jnp.where(causal_mask, mask_value, 0.0)
+        weights = jax.nn.softmax(logits, axis=-1)
+        out = jnp.einsum("bhqk,bkhc->bqhc", weights, value)
+        return out
+    def efficient_attention(query, key, value):
+        dtype = get_float_dtype_by_name(FLAGS.dtype)
+        return blockwise_attn(
+            query, key, value,
+            bias=None,
+            deterministic=True,
+            dropout_rng=None,
+            attn_pdrop=0.0,
+            causal=True,
+            query_chunk_size=FLAGS.query_chunk_size,
+            key_chunk_size=FLAGS.key_chunk_size,
+            dtype=get_float_dtype_by_name(FLAGS.dtype),
+            policy=jax.checkpoint_policies.nothing_saveable(),
+            precision=None,
+            float32_logits=True,
+            prevent_cse=True,
+        )
+    @partial(jax.jit, static_argnums=(1,))
+    def reference_attn_forward_backward(rng_key, seq_len):
+        @partial(jax.grad, argnums=(0, 1, 2))
+        @partial(jax.checkpoint, policy=jax.checkpoint_policies.nothing_saveable())
+        def grad_fn(query, key, value):
+            out = reference_attn(query, key, value)
+            return jnp.mean(out)
+        query, key, value = random_kqv(rng_key, seq_len)
+        return jax.flatten_util.ravel_pytree(
+            grad_fn(query, key, value)[1]
+        )[0].mean()
+    @partial(jax.jit, static_argnums=(1,))
+    def efficient_attn_forward_backward(rng_key, seq_len):
+        @partial(jax.grad, argnums=(0, 1, 2))
+        def grad_fn(query, key, value):
+            out = efficient_attention(query, key, value)
+            return jnp.mean(out)
+        query, key, value = random_kqv(rng_key, seq_len)
+        return jax.flatten_util.ravel_pytree(
+            grad_fn(query, key, value)[1]
+        )[0].mean()
+    set_random_seed(FLAGS.seed)
+    jax.block_until_ready(reference_attn_forward_backward(next_rng(), FLAGS.ref_attn_seq_len))
+    jax.block_until_ready(efficient_attn_forward_backward(next_rng(), FLAGS.eff_attn_seq_len))
+    all_results = []
+    for i in range(FLAGS.warmup_steps):
+        all_results.append(reference_attn_forward_backward(next_rng(), FLAGS.ref_attn_seq_len))
+    jax.block_until_ready(all_results)
+    start_time = time()
+    all_results = []
+    for i in range(FLAGS.steps):
+        all_results.append(reference_attn_forward_backward(next_rng(), FLAGS.ref_attn_seq_len))
+    jax.block_until_ready(all_results)
+    elapsed_time_ref_attn = time() - start_time
+    print(f'Reference attention: {elapsed_time_ref_attn:.3f} seconds')
+    all_results = []
+    for i in range(FLAGS.warmup_steps):
+        all_results.append(efficient_attn_forward_backward(next_rng(), FLAGS.eff_attn_seq_len))
+    jax.block_until_ready(all_results)
+    start_time = time()
+    all_results = []
+    for i in range(FLAGS.steps):
+        all_results.append(efficient_attn_forward_backward(next_rng(), FLAGS.eff_attn_seq_len))
+    jax.block_until_ready(all_results)
+    elapsed_time_efficient_attn = time() - start_time
+    print(f'Efficient attention: {elapsed_time_efficient_attn:.3f} seconds')
+    flops_ratio = (FLAGS.eff_attn_seq_len / FLAGS.ref_attn_seq_len) ** 2
+    efficiency = elapsed_time_ref_attn / elapsed_time_efficient_attn * flops_ratio
+    print(f'Efficiency: {efficiency:.3f}')
+if __name__ == '__main__':
+    mlxu.run(main)

EasyLM/scripts/convert_checkpoint.py ADDED Viewed

	@@ -0,0 +1,42 @@

+# This script converts a model checkpoint trained by EasyLM to a standard
+# msgpack checkpoint that can be loaded by huggingface transformers or
+# flax.serialization.msgpack_restore. Such a conversion allows models to be
+# used by other frameworks that integrate with huggingface transformers.
+import pprint
+from functools import partial
+import os
+import numpy as np
+import mlxu
+import jax.numpy as jnp
+import flax.serialization
+from EasyLM.checkpoint import StreamingCheckpointer
+from EasyLM.jax_utils import float_to_dtype
+# Command-line flags (name=default) for the checkpoint conversion script.
+FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
+    load_checkpoint='',  # checkpoint to read; main() asserts it is non-empty
+    output_file='',  # destination path; main() asserts it is non-empty
+    streaming=False,  # True: save via StreamingCheckpointer; False: one msgpack blob
+    float_dtype='bf16',  # dtype name forwarded to the save / float_to_dtype call
+)
+def main(argv):
+    assert FLAGS.load_checkpoint != '' and FLAGS.output_file != '', 'input and output must be specified'
+    params = StreamingCheckpointer.load_trainstate_checkpoint(
+        FLAGS.load_checkpoint, disallow_trainstate=True
+    )[1]['params']
+    if FLAGS.streaming:
+        StreamingCheckpointer.save_train_state_to_file(
+            params, FLAGS.output_file, float_dtype=FLAGS.float_dtype
+        )
+    else:
+        params = float_to_dtype(params, FLAGS.float_dtype)
+        with mlxu.open_file(FLAGS.output, 'wb') as fout:
+            fout.write(flax.serialization.msgpack_serialize(params, in_place=True))
+if __name__ == "__main__":
+    mlxu.run(main)

EasyLM/scripts/diff_checkpoint.py ADDED Viewed

	@@ -0,0 +1,59 @@

+# This script computes the parameter difference (target - base) between two
+# model checkpoints trained by EasyLM or, with --recover_diff, adds the two
+# loaded parameter trees together to recover the target from a base and a diff.
+# The result is saved with StreamingCheckpointer or as a standard msgpack
+# checkpoint that can be loaded by flax.serialization.msgpack_restore.
+import pprint
+from functools import partial
+import os
+import numpy as np
+import jax
+import jax.numpy as jnp
+import flax.serialization
+import mlxu
+from EasyLM.checkpoint import StreamingCheckpointer
+from EasyLM.jax_utils import float_to_dtype
+# Command-line flags (name=default) for the checkpoint diff script.
+FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
+    recover_diff=False,  # False: output target - base; True: output base + target
+    load_base_checkpoint='',  # base checkpoint; main() asserts it is non-empty
+    load_target_checkpoint='',  # target (or diff) checkpoint; must be non-empty
+    output_file='',  # destination path; main() asserts it is non-empty
+    streaming=True,  # True: save via StreamingCheckpointer; False: one msgpack blob
+    float_dtype='bf16',  # dtype name forwarded to the save / float_to_dtype call
+)
+def main(argv):
+    assert FLAGS.load_base_checkpoint != '' and FLAGS.load_target_checkpoint != ''
+    assert FLAGS.output_file != ''
+    base_params = StreamingCheckpointer.load_trainstate_checkpoint(
+        FLAGS.load_base_checkpoint, disallow_trainstate=True
+    )[1]['params']
+    target_params = StreamingCheckpointer.load_trainstate_checkpoint(
+        FLAGS.load_target_checkpoint, disallow_trainstate=True
+    )[1]['params']
+    if FLAGS.recover_diff:
+        params = jax.tree_util.tree_map(
+            lambda b, t: b + t, base_params, target_params
+        )
+    else:
+        params = jax.tree_util.tree_map(
+            lambda b, t: t - b, base_params, target_params
+        )
+    if FLAGS.streaming:
+        StreamingCheckpointer.save_train_state_to_file(
+            params, FLAGS.output_file, float_dtype=FLAGS.float_dtype
+        )
+    else:
+        params = float_to_dtype(params, FLAGS.float_dtype)
+        with mlxu.open_file(FLAGS.output, 'wb') as fout:
+            fout.write(flax.serialization.msgpack_serialize(params, in_place=True))
+if __name__ == "__main__":
+    mlxu.run(main)

EasyLM/scripts/lm_eval_harness.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# This script runs lm_eval_harness evaluations against a served language model.
+# Typically, you need to run a language model server first, e.g.:
+#    python -m EasyLM.models.gptj.gptj_serve ...
+import dataclasses
+import pprint
+from functools import partial
+import os
+from tqdm import tqdm, trange
+import numpy as np
+import mlxu
+from flax.traverse_util import flatten_dict
+from lm_eval import evaluator, tasks
+from lm_eval.base import LM
+from EasyLM.serving import LMClient
+# Command-line flags (name=default) for the lm_eval harness runner.
+FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
+    tasks='wsc,piqa,winogrande,openbookqa,logiqa',  # comma-separated task names; split on ',' in main()
+    shots=0,  # forwarded positionally to evaluator.evaluate
+    limit=0,  # values <= 0 are turned into limit=None in main()
+    write_out=False,  # forwarded to evaluator.evaluate
+    lm_client=LMClient.get_default_config(),  # nested config for the LMClient
+    logger=mlxu.WandBLogger.get_default_config(),  # nested config for the WandB logger
+)
+class LMEvalHarnessInterface(LM):
+    def __init__(self, lm_client):
+        self.lm_client = lm_client
+    def greedy_until(self, inputs):
+        prefix, until = zip(*inputs)
+        return self.lm_client.greedy_until(prefix, until)
+    def loglikelihood_rolling(self, inputs):
+        loglikelihood, is_greedy = self.lm_client.loglikelihood_rolling(inputs)
+        return list(zip(loglikelihood, is_greedy))
+    def loglikelihood(self, inputs):
+        prefix, text = zip(*inputs)
+        loglikelihood, is_greedy = self.lm_client.loglikelihood(prefix, text)
+        return list(zip(loglikelihood, is_greedy))
+def main(argv):
+    logger = mlxu.WandBLogger(
+        config=FLAGS.logger, variant=mlxu.get_user_flags(FLAGS, FLAGS_DEF)
+    )
+    model = LMEvalHarnessInterface(LMClient(FLAGS.lm_client))
+    task_list = FLAGS.tasks.split(',')
+    results = evaluator.evaluate(
+        model, tasks.get_task_dict(task_list), False, FLAGS.shots,
+        limit=None if FLAGS.limit <= 0 else FLAGS.limit,
+        write_out=FLAGS.write_out,
+    )
+    logger.log(flatten_dict(results['results'], sep='/'))
+    pprint.pprint(results)
+if __name__ == "__main__":
+    mlxu.run(main)

EasyLM/scripts/lm_eval_json.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import json
+import mlxu
+from EasyLM.serving import LMClient
+# Command-line flags (name=default) for evaluating a JSON file against an LM server.
+FLAGS, FLAGS_DEF = mlxu.define_flags_with_default(
+    input_file='',  # JSON file read by main()
+    output_file='',  # JSON file the results are written to
+    prefix_field='prefix',  # key of the prefixes inside the input JSON
+    text_field='text',  # key of the texts inside the input JSON
+    until_field='until',  # key of the `until` values inside the input JSON
+    eval_type='loglikelihood',  # loglikelihood | loglikelihood_rolling | greedy_until | generate
+    lm_client=LMClient.get_default_config(),  # nested config for the LMClient
+)
+def main(argv):
+    lm_client = LMClient(FLAGS.lm_client)
+    with mlxu.open_file(FLAGS.input_file, 'r') as fin:
+        input_data = json.load(fin)
+    if FLAGS.eval_type == 'loglikelihood':
+        prefix = input_data[FLAGS.prefix_field]
+        text = input_data[FLAGS.text_field]
+        loglikelihoods, is_greedys = lm_client.loglikelihood(prefix, text)
+        output_data = {
+            'loglikelihood': loglikelihoods,
+            'is_greedy': is_greedys,
+        }
+    elif FLAGS.eval_type == 'loglikelihood_rolling':
+        text = input_data[FLAGS.text_field]
+        loglikelihoods, is_greedys = lm_client.loglikelihood_rolling(text)
+        output_data = {
+            'loglikelihood': loglikelihoods,
+            'is_greedy': is_greedys,
+        }
+    elif FLAGS.eval_type == 'greedy_until':
+        prefix = input_data[FLAGS.prefix_field]
+        until = input_data[FLAGS.until_field]
+        output_data = {'output_text': lm_client.greedy_until(prefix, until)}
+    elif FLAGS.eval_type == 'generate':
+        prefix = input_data[FLAGS.prefix_field]
+        output_data = {'output_text': lm_client.generate(prefix)}
+    else:
+        raise ValueError(f'Unknown eval_type: {FLAGS.eval_type}')
+    with mlxu.open_file(FLAGS.output_file, 'w') as fout:
+        json.dump(output_data, fout)
+if __name__ == "__main__":
+    mlxu.run(main)

EasyLM/serving.py ADDED Viewed

	@@ -0,0 +1,566 @@

+import dataclasses
+import pprint
+from functools import partial
+import re
+import os
+from threading import Lock
+import urllib
+import time
+from typing import List, Optional, Union
+from pydantic import BaseModel
+import absl.logging
+from tqdm import tqdm, trange
+import numpy as np
+import mlxu
+from ml_collections import ConfigDict
+import uvicorn
+from fastapi import FastAPI
+import gradio as gr
+import requests
+from requests.exceptions import Timeout, ConnectionError
+# Request body accepted by the loglikelihood, loglikelihood-rolling, generate
+# and greedy-until endpoints of LMServer. Every field defaults to None.
+class InferenceRequest(BaseModel):
+    prefix_text: Optional[List[str]] = None  # one prefix per example
+    text: Optional[List[str]] = None  # one text per example (scoring endpoints)
+    until: Optional[Union[List[str], List[List[str]]]] = None  # per-example value(s) handed to greedy_until
+    temperature: Optional[float] = None  # None: serve_generate falls back to config.default_temperature
+# Request body accepted by the /chat endpoint of LMServer.
+class ChatRequest(BaseModel):
+    prompt: str  # the new user message
+    context: str = ''  # conversation text so far; the new turn is appended to it
+    temperature: Optional[float] = None  # None: serve_chat falls back to config.default_temperature
+class LMServer(object):
+    """ HTTP server for serving language models. """
+    @staticmethod
+    def get_default_config(updates=None):
+        """Return the default server config, optionally overridden by `updates`."""
+        config = ConfigDict()
+        # Address uvicorn binds to in run().
+        config.host = '0.0.0.0'
+        config.port = 5007
+        # Number of examples per model call; short final batches are padded.
+        config.batch_size = 1
+        # When True, requests and outputs are logged through absl.logging.
+        config.logging = False
+        # Comma-separated tasks warmed up in run(), or 'all', or '' for none.
+        config.pre_compile = 'loglikelihood'
+        # Temperature used when a request does not provide one.
+        config.default_temperature = 1.0
+        # max_length handed to greedy_until.
+        config.greedy_until_max_length = 5000
+        # Strings wrapped around every prefix / text before it reaches the model.
+        config.prepend_to_prefix = ''
+        config.append_to_prefix = ''
+        config.prepend_to_text = ''
+        config.append_to_text = ''
+        # Pieces used by process_chat to assemble the chat prompt.
+        config.chat_prepend_text = ''
+        config.chat_user_prefix = ''
+        config.chat_user_suffix = ''
+        config.chat_lm_prefix = ''
+        config.chat_lm_suffix = ''
+        # Markdown shown at the top of the gradio chat page.
+        config.notes = ''
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    def __init__(self, config):
+        self.config = self.get_default_config(config)
+        self.lock = Lock()
+        self.app = FastAPI()
+        self.app.post('/loglikelihood')(self.serve_loglikelihood)
+        self.app.post('/loglikelihood-rolling')(self.serve_loglikelihood_rolling)
+        self.app.post('/generate')(self.serve_generate)
+        self.app.post('/greedy-until')(self.serve_greedy_until)
+        self.app.post('/chat')(self.serve_chat)
+        self.app.get('/ready')(self.serve_ready)
+        self.app = gr.mount_gradio_app(self.app, self.create_chat_app(), '/')
+    # The four hooks below are placeholders that raise NotImplementedError;
+    # the serve_* methods and run() call them through `self`.
+    @staticmethod
+    def loglikelihood(prefix_text, text):
+        """Score `text` given `prefix_text`; callers unpack (log_likelihood, is_greedy)."""
+        raise NotImplementedError()
+    @staticmethod
+    def loglikelihood_rolling(text):
+        """Score `text` on its own; callers unpack (log_likelihood, is_greedy)."""
+        raise NotImplementedError()
+    @staticmethod
+    def generate(text, temperature):
+        """Generate one output text per entry of `text` at the given temperature."""
+        raise NotImplementedError()
+    @staticmethod
+    def greedy_until(prefix_text, until, max_length):
+        """Generate one output text per prefix given `until` and `max_length`."""
+        raise NotImplementedError()
+    @staticmethod
+    def to_list(x):
+        if isinstance(x, np.ndarray):
+            return x.tolist()
+        return x
+    def serve_ready(self):
+        return 'Ready!\n'
+    def serve_loglikelihood(self, data: InferenceRequest):
+        with self.lock:
+            if self.config.logging:
+                absl.logging.info(
+                    '\n========= Serving Log Likelihood Request ========= \n'
+                    + pprint.pformat(data) + '\n'
+                )
+            if data.prefix_text is None:
+                data.prefix_text = ['' for _ in data.text]
+            prefix_text = [
+                self.config.prepend_to_prefix + p + self.config.append_to_prefix
+                for p in data.prefix_text
+            ]
+            text = [
+                self.config.prepend_to_text + t + self.config.append_to_text
+                for t in data.text
+            ]
+            log_likelihood = []
+            is_greedy = []
+            for i in trange(0, len(text), self.config.batch_size, ncols=0):
+                batch_prefix_text = prefix_text[i:i + self.config.batch_size]
+                batch_text = text[i:i + self.config.batch_size]
+                batch_size = len(batch_text)
+                if batch_size < self.config.batch_size:
+                    extra = self.config.batch_size - batch_size
+                    batch_prefix_text.extend(['a' for _ in range(extra)])
+                    batch_text.extend(['a' for _ in range(extra)])
+                batch_log_likelihood, batch_is_greedy = self.loglikelihood(
+                    batch_prefix_text, batch_text
+                )
+                batch_log_likelihood = self.to_list(batch_log_likelihood)
+                batch_is_greedy = self.to_list(batch_is_greedy)
+                log_likelihood.extend(batch_log_likelihood[:batch_size])
+                is_greedy.extend(batch_is_greedy[:batch_size])
+            output = {
+                'prefix_text': data.prefix_text,
+                'text': data.text,
+                'log_likelihood': log_likelihood,
+                'is_greedy': is_greedy,
+            }
+            if self.config.logging:
+                absl.logging.info(
+                '\n========= Output ========= \n'
+                + pprint.pformat(output) + '\n'
+            )
+        return output
+    def serve_loglikelihood_rolling(self, data: InferenceRequest):
+        with self.lock:
+            if self.config.logging:
+                absl.logging.info(
+                    '\n========= Serving Log Likelihood Request ========= \n'
+                    + pprint.pformat(data) + '\n'
+                )
+            text = [
+                self.config.prepend_to_text + t + self.config.append_to_text
+                for t in data.text
+            ]
+            log_likelihood = []
+            is_greedy = []
+            for i in trange(0, len(text), self.config.batch_size, ncols=0):
+                batch_text = text[i:i + self.config.batch_size]
+                batch_size = len(batch_text)
+                if batch_size < self.config.batch_size:
+                    extra = self.config.batch_size - batch_size
+                    batch_text.extend(['a' for _ in range(extra)])
+                batch_log_likelihood, batch_is_greedy = self.loglikelihood_rolling(
+                    batch_text
+                )
+                batch_log_likelihood = self.to_list(batch_log_likelihood)
+                batch_is_greedy = self.to_list(batch_is_greedy)
+                log_likelihood.extend(batch_log_likelihood[:batch_size])
+                is_greedy.extend(batch_is_greedy[:batch_size])
+            output = {
+                'text': data.text,
+                'log_likelihood': log_likelihood,
+                'is_greedy': is_greedy,
+            }
+            if self.config.logging:
+                absl.logging.info(
+                '\n========= Output ========= \n'
+                + pprint.pformat(output) + '\n'
+            )
+        return output
+    def serve_generate(self, data: InferenceRequest):
+        with self.lock:
+            if self.config.logging:
+                absl.logging.info(
+                    '\n========= Serving Generate Request ========= \n'
+                    + pprint.pformat(data) + '\n'
+                )
+            prefix_text = [
+                self.config.prepend_to_prefix + p + self.config.append_to_prefix
+                for p in data.prefix_text
+            ]
+            if data.temperature is None:
+                data.temperature = self.config.default_temperature
+            output_text = []
+            for i in trange(0, len(prefix_text), self.config.batch_size, ncols=0):
+                batch_prefix_text = prefix_text[i:i + self.config.batch_size]
+                batch_size = len(batch_prefix_text)
+                if batch_size < self.config.batch_size:
+                    extra = self.config.batch_size - batch_size
+                    batch_prefix_text.extend(['a' for _ in range(extra)])
+                batch_output_text = self.generate(
+                    batch_prefix_text,
+                    temperature=data.temperature,
+                )
+                output_text.extend(self.to_list(batch_output_text)[:batch_size])
+            output = {
+                'prefix_text': data.prefix_text,
+                'output_text': output_text,
+                'temperature': data.temperature,
+            }
+            if self.config.logging:
+                absl.logging.info(
+                    '\n========= Output ========= \n'
+                    + pprint.pformat(output) + '\n'
+                )
+        return output
+    def serve_greedy_until(self, data: InferenceRequest):
+        with self.lock:
+            if self.config.logging:
+                absl.logging.info(
+                    '\n========= Serving Greedy Until Request ========= \n'
+                    + pprint.pformat(data) + '\n'
+                )
+            prefix_text = [
+                self.config.prepend_to_prefix + p + self.config.append_to_prefix
+                for p in data.prefix_text
+            ]
+            until = data.until
+            max_length = self.config.greedy_until_max_length
+            output_text = []
+            for i in range(0, len(prefix_text), self.config.batch_size):
+                batch_prefix_text = prefix_text[i:i + self.config.batch_size]
+                batch_until = until[i:i + self.config.batch_size]
+                batch_size = len(batch_prefix_text)
+                batch_output_text = self.greedy_until(batch_prefix_text, batch_until, max_length)
+                output_text.extend(self.to_list(batch_output_text)[:batch_size])
+            output = {
+                'prefix_text': data.prefix_text,
+                'until': data.until,
+                'max_length': max_length,
+                'output_text': output_text,
+            }
+            if self.config.logging:
+                absl.logging.info(
+                    '\n========= Output ========= \n'
+                    + pprint.pformat(output) + '\n'
+                )
+        return output
+    def process_chat(self, prompt, context, temperature):
+        context = (
+            context + self.config.chat_user_prefix
+            + prompt + self.config.chat_user_suffix
+            + self.config.chat_lm_prefix
+        )
+        response = self.generate(
+            [self.config.chat_prepend_text + context],
+            temperature=float(temperature),
+        )[0]
+        context = context + response + self.config.chat_lm_suffix
+        return response, context
+    def serve_chat(self, data: ChatRequest):
+        if data.temperature is None:
+            data.temperature = self.config.default_temperature
+        response, context = self.process_chat(
+            data.prompt, data.context,
+            temperature=data.temperature,
+        )
+        return {
+            'response': response,
+            'context': context,
+            'temperature': data.temperature,
+        }
+    def create_chat_app(self):
+        """Build and return the gradio Blocks chat UI backed by process_chat."""
+        with gr.Blocks(analytics_enabled=False, title='EasyLM Chat') as gradio_chatbot:
+            gr.Markdown('# Chatbot Powered by [EasyLM](https://github.com/young-geng/EasyLM)')
+            gr.Markdown(self.config.notes)
+            chatbot = gr.Chatbot(label='Chat history')
+            msg = gr.Textbox(
+                placeholder='Type your message here...',
+                show_label=False
+            )
+            with gr.Row():
+                send = gr.Button('Send')
+                regenerate = gr.Button('Regenerate', interactive=False)
+                clear = gr.Button('Reset')
+            temp_slider = gr.Slider(
+                label='Temperature', minimum=0, maximum=2.0,
+                value=self.config.default_temperature
+            )
+            # Two-slot state: [0] is the context fed to the model for the
+            # current turn, [1] is the context after the latest reply.
+            context_state = gr.State(['', ''])
+            # Step 1 of a turn: lock the inputs, show the user message with an
+            # empty reply, and promote the latest context into slot 0.
+            def user_fn(user_message, history, context):
+                return {
+                    msg: gr.update(value='', interactive=False),
+                    clear: gr.update(interactive=False),
+                    send: gr.update(interactive=False),
+                    regenerate: gr.update(interactive=False),
+                    chatbot: history + [[user_message, None]],
+                    context_state: [context[1], context[1]],
+                }
+            # Step 2 of a turn: generate the reply for the last user message
+            # from slot 0, store the new context in slot 1, unlock the inputs.
+            def model_fn(history, context, temperature):
+                history[-1][1], new_context = self.process_chat(
+                    history[-1][0], context[0], temperature
+                )
+                return {
+                    msg: gr.update(value='', interactive=True),
+                    clear: gr.update(interactive=True),
+                    send: gr.update(interactive=True),
+                    chatbot: history,
+                    context_state: [context[0], new_context],
+                    regenerate: gr.update(interactive=True),
+                }
+            # Locks the inputs before model_fn is re-run; slot 0 is left
+            # untouched, so the last reply is generated again from the same context.
+            def regenerate_fn():
+                return {
+                    msg: gr.update(value='', interactive=False),
+                    clear: gr.update(interactive=False),
+                    send: gr.update(interactive=False),
+                    regenerate: gr.update(interactive=False),
+                }
+            # Resets the history, the text box and both context slots.
+            def clear_fn():
+                return {
+                    chatbot: None,
+                    msg: '',
+                    context_state: ['', ''],
+                    regenerate: gr.update(interactive=False),
+                }
+            # Pressing Enter and clicking Send run the same two-step chain.
+            msg.submit(
+                user_fn,
+                inputs=[msg, chatbot, context_state],
+                outputs=[msg, clear, send, chatbot, context_state, regenerate],
+                queue=False
+            ).then(
+                model_fn,
+                inputs=[chatbot, context_state, temp_slider],
+                outputs=[msg, clear, send, chatbot, context_state, regenerate],
+                queue=True
+            )
+            send.click(
+                user_fn,
+                inputs=[msg, chatbot, context_state],
+                outputs=[msg, clear, send, chatbot, context_state, regenerate],
+                queue=False
+            ).then(
+                model_fn,
+                inputs=[chatbot, context_state, temp_slider],
+                outputs=[msg, clear, send, chatbot, context_state, regenerate],
+                queue=True
+            )
+            regenerate.click(
+                regenerate_fn,
+                inputs=None,
+                outputs=[msg, clear, send, regenerate],
+                queue=False
+            ).then(
+                model_fn,
+                inputs=[chatbot, context_state, temp_slider],
+                outputs=[msg, clear, send, chatbot, context_state, regenerate],
+                queue=True
+            )
+            clear.click(
+                clear_fn,
+                inputs=None,
+                outputs=[chatbot, msg, context_state, regenerate],
+                queue=False
+            )
+        gradio_chatbot.queue(concurrency_count=1)
+        return gradio_chatbot
+    def run(self):
+        if self.config.pre_compile != '':
+            if self.config.pre_compile == 'all':
+                pre_compile = ['loglikelihood', 'generate', 'greedy_until', 'chat']
+            else:
+                pre_compile = self.config.pre_compile.split(',')
+            pre_compile_data = ['a' for _ in range(self.config.batch_size)]
+            for task in pre_compile:
+                if task == 'loglikelihood':
+                    self.loglikelihood(pre_compile_data, pre_compile_data)
+                    self.loglikelihood_rolling(pre_compile_data)
+                elif task == 'generate':
+                    self.generate(pre_compile_data, 1.0)
+                elif task == 'greedy_until':
+                    self.greedy_until(
+                        pre_compile_data, pre_compile_data,
+                        self.config.greedy_until_max_length
+                    )
+                elif task == 'chat':
+                    self.process_chat('a', 'a', 1.0)
+                else:
+                    raise ValueError(f'Invalid precompile task: {task}!')
+        uvicorn.run(self.app, host=self.config.host, port=self.config.port)
+class LMClient(object):
+    """ A simple client for the LM server. """
+    @staticmethod
+    def get_default_config(updates=None):
+        """Return the default client config, optionally overridden by `updates`."""
+        config = ConfigDict()
+        # Base URL the endpoint names are joined onto.
+        config.url = 'http://localhost:5007'
+        # Number of examples sent per HTTP request.
+        config.batch_size = 1
+        # When True, the constructor blocks until the server answers /ready.
+        config.wait_for_ready = True
+        # When True, no requests are sent and placeholder results are returned.
+        config.dummy = False
+        if updates is not None:
+            config.update(ConfigDict(updates).copy_and_resolve_references())
+        return config
+    def __init__(self, config=None):
+        self.config = self.get_default_config(config)
+        if self.config.wait_for_ready:
+            self.wait_for_ready()
+    def wait_for_ready(self):
+        if self.config.dummy:
+            return
+        while True:
+            try:
+                requests.get(urllib.parse.urljoin(self.config.url, 'ready'))
+                return
+            except (Timeout, ConnectionError) as e:
+                time.sleep(10)
+    @staticmethod
+    def batched(iterator, batch_size):
+        batch = []
+        for example in iterator:
+            batch.append(example)
+            if len(batch) == batch_size:
+                yield batch
+                batch = []
+        if len(batch) > 0:
+            yield batch
+    def loglikelihood(self, prefix, text):
+        prefix, text = list(prefix), list(text)
+        if self.config.dummy:
+            return [-1.0 for _ in text], [False for _ in text]
+        log_likelihood = []
+        is_greedy = []
+        batched_iterator = list(zip(
+            self.batched(prefix, self.config.batch_size),
+            self.batched(text, self.config.batch_size)
+        ))
+        for batch_prefix, batch_text in tqdm(batched_iterator, ncols=0):
+            response = requests.post(
+                urllib.parse.urljoin(self.config.url, 'loglikelihood'),
+                json={'prefix_text': batch_prefix, 'text': batch_text}
+            ).json()
+            log_likelihood.extend(response['log_likelihood'])
+            is_greedy.extend(response['is_greedy'])
+        return log_likelihood, is_greedy
+    def loglikelihood_rolling(self, text):
+        text = list(text)
+        if self.config.dummy:
+            return [-1.0 for _ in text], [False for _ in text]
+        log_likelihood = []
+        is_greedy = []
+        batched_iterator = list(self.batched(text, self.config.batch_size))
+        for batch_text in tqdm(batched_iterator, ncols=0):
+            response = requests.post(
+                urllib.parse.urljoin(self.config.url, 'loglikelihood-rolling'),
+                json={'text': batch_text}
+            ).json()
+            log_likelihood.extend(response['log_likelihood'])
+            is_greedy.extend(response['is_greedy'])
+        return log_likelihood, is_greedy
+    def greedy_until(self, prefix, until):
+        prefix, until = list(prefix), list(until)
+        if self.config.dummy:
+            results = []
+            for u in until:
+                if isinstance(u, str):
+                    results.append('dummy text ' + u)
+                else:
+                    results.append('dummy text ' + u[0])
+            return results
+        batched_iterator = list(zip(
+            self.batched(prefix, self.config.batch_size),
+            self.batched(until, self.config.batch_size),
+        ))
+        output_text = []
+        for batch_prefix, batch_until in tqdm(batched_iterator, ncols=0):
+            response = requests.post(
+                urllib.parse.urljoin(self.config.url, 'greedy-until'),
+                json={'prefix_text': batch_prefix, 'until': batch_until}
+            ).json()
+            output_text.extend(response['output_text'])
+        return output_text
+    def generate(self, prefix, temperature=None):
+        prefix = list(prefix)
+        if self.config.dummy:
+            return ['' for _ in prefix]
+        output_text = []
+        batched_iterator = list(self.batched(prefix, self.config.batch_size))
+        for batch_prefix in tqdm(batched_iterator, ncols=0):
+            response = requests.post(
+                urllib.parse.urljoin(self.config.url, 'generate'),
+                json={
+                    'prefix_text': batch_prefix,
+                    'temperature': temperature,
+                }
+            ).json()
+            output_text.extend(response['output_text'])
+        return output_text
+    def chat(self, prompt, context, temperature=None):
+        if self.config.dummy:
+            return ''
+        response = requests.post(
+            urllib.parse.urljoin(self.config.url, 'chat'),
+            json={
+                'prompt': prompt,
+                'context': context,
+                'temperature': temperature,
+            }
+        ).json()
+        return response['response'], response['context']

convert_to_hf_model.sh ADDED Viewed

	@@ -0,0 +1,4 @@

+JAX_PLATFORM_NAME=cpu python3 -m EasyLM.models.llama.convert_easylm_to_hf \
+    --load_checkpoint='' \
+    --model_size='7b' \
+    --output_dir='./'

pretrain_llama_7b.sh ADDED Viewed

	@@ -0,0 +1,52 @@

+#! /bin/bash
+# Put your WANDB API key here to enable logging to wandb.
+export WANDB_API_KEY=''
+# TPU specific flags to improve training throughput
+export LIBTPU_INIT_ARGS='--xla_jf_spmd_threshold_for_windowed_einsum_mib=0 --xla_tpu_spmd_threshold_for_allgather_cse=10000 --xla_tpu_spmd_rewrite_einsum_with_reshape=true --xla_enable_async_all_gather=true --jax_enable_async_collective_offload=true --xla_tpu_enable_latency_hiding_scheduler=true TPU_MEGACORE=MEGACORE_DENSE'
+python3 -m EasyLM.models.llama.llama_train \
+    --jax_distributed.initialize_jax_distributed=True \
+    --mesh_dim='1,-1,4' \
+    --dtype='bf16' \
+    --total_steps=900000 \
+    --eval_freq=50000 \
+    --log_freq=1000 \
+    --save_model_freq=2000 \
+    --save_milestone_freq=50000 \
+    --load_llama_config='7b' \
+    --update_llama_config='' \
+    --load_dataset_state='' \
+    --load_checkpoint='' \
+    --tokenizer.vocab_file='tokenizer.model' \
+    --optimizer.type='lion' \
+    --optimizer.lion_optimizer.weight_decay=1.0 \
+    --optimizer.lion_optimizer.lr_schedule_type='warmup_constant' \
+    --optimizer.lion_optimizer.lr=3e-4 \
+    --optimizer.lion_optimizer.end_lr=3e-5 \
+    --optimizer.lion_optimizer.lr_warmup_steps=60000 \
+    --optimizer.lion_optimizer.lr_decay_steps=100000 \
+    --optimizer.lion_optimizer.bf16_momentum=True \
+    --train_dataset.type='huggingface' \
+    --train_dataset.text_processor.fields='text' \
+    --train_dataset.text_processor.add_eos_token=True \
+    --train_dataset.text_processor.add_bos_token=True \
+    --train_dataset.huggingface_dataset.path='/researchdisk/lm_training_dataset_first_stage' \
+    --train_dataset.huggingface_dataset.split='train' \
+    --train_dataset.huggingface_dataset.seq_length=2048 \
+    --train_dataset.huggingface_dataset.batch_size=64 \
+    --eval_dataset.type='huggingface' \
+    --eval_dataset.text_processor.fields='text' \
+    --eval_dataset.huggingface_dataset.path='/researchdisk/lm_training_dataset_first_stage' \
+    --eval_dataset.huggingface_dataset.split='validation' \
+    --eval_dataset.huggingface_dataset.seq_length=2048 \
+    --eval_dataset.huggingface_dataset.batch_size=64 \
+    --checkpointer.save_optimizer_state=True \
+    --logger.online=True \
+    --logger.prefix='EasyLM' \
+    --logger.project="llama-7b-finnish-v2" \
+    --logger.output_dir="gs://finnish-nlp-research-us/llama-7b-v2-checkpoint" \
+    --logger.wandb_dir="./"

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1980c00aa3cb5455177a39efa3e60e7b8887ee89c3f7b8950719592a08ad9456
+size 1400411

tokenizer.vocab ADDED Viewed

The diff for this file is too large to render. See raw diff

train_sentencepiece.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import sentencepiece as spm
+spm.SentencePieceTrainer.train(input="/researchdisk/training_dataset_sentences/train.txt", model_prefix="tokenizer",
+                                model_type="bpe", split_digits=True, vocab_size=64256, byte_fallback=True,
+                                normalization_rule_name="nfkc",
+                                user_defined_symbols=["[INST]", "[/INST]", "<<SYS>>", "<</SYS>>"],
+                                required_chars="abcdefghijklmnopqrstuvwxyzåäöABCDEFGHIJKLMNOPQRSTUVWXYZÅÄÖ",
+                                train_extremely_large_corpus=True,
+                                input_sentence_size=500000000, shuffle_input_sentence=True,
+                                num_threads=96)