TearGosling committed on Apr 6

Commit

15a0bd6

•

1 Parent(s): 5c61222

Upload folder using huggingface_hub

Browse files

Files changed (18) hide show

added_tokens.json +145 -0
config.json +50 -0
configuration_gptj_moe.py +120 -0
generation_config.json +6 -0
merges.txt +0 -0
model-00001-of-00007.safetensors +3 -0
model-00002-of-00007.safetensors +3 -0
model-00003-of-00007.safetensors +3 -0
model-00004-of-00007.safetensors +3 -0
model-00005-of-00007.safetensors +3 -0
model-00006-of-00007.safetensors +3 -0
model-00007-of-00007.safetensors +3 -0
model.safetensors.index.json +656 -0
modeling_gptj_moe.py +671 -0
special_tokens_map.json +23 -0
tokenizer.json +0 -0
tokenizer_config.json +1166 -0
vocab.json +0 -0

added_tokens.json ADDED Viewed

	@@ -0,0 +1,145 @@

+{
+  "<|extratoken_100|>": 50356,
+  "<|extratoken_101|>": 50357,
+  "<|extratoken_102|>": 50358,
+  "<|extratoken_103|>": 50359,
+  "<|extratoken_104|>": 50360,
+  "<|extratoken_105|>": 50361,
+  "<|extratoken_106|>": 50362,
+  "<|extratoken_107|>": 50363,
+  "<|extratoken_108|>": 50364,
+  "<|extratoken_109|>": 50365,
+  "<|extratoken_10|>": 50266,
+  "<|extratoken_110|>": 50366,
+  "<|extratoken_111|>": 50367,
+  "<|extratoken_112|>": 50368,
+  "<|extratoken_113|>": 50369,
+  "<|extratoken_114|>": 50370,
+  "<|extratoken_115|>": 50371,
+  "<|extratoken_116|>": 50372,
+  "<|extratoken_117|>": 50373,
+  "<|extratoken_118|>": 50374,
+  "<|extratoken_119|>": 50375,
+  "<|extratoken_11|>": 50267,
+  "<|extratoken_120|>": 50376,
+  "<|extratoken_121|>": 50377,
+  "<|extratoken_122|>": 50378,
+  "<|extratoken_123|>": 50379,
+  "<|extratoken_124|>": 50380,
+  "<|extratoken_125|>": 50381,
+  "<|extratoken_126|>": 50382,
+  "<|extratoken_127|>": 50383,
+  "<|extratoken_128|>": 50384,
+  "<|extratoken_129|>": 50385,
+  "<|extratoken_12|>": 50268,
+  "<|extratoken_130|>": 50386,
+  "<|extratoken_131|>": 50387,
+  "<|extratoken_132|>": 50388,
+  "<|extratoken_133|>": 50389,
+  "<|extratoken_134|>": 50390,
+  "<|extratoken_135|>": 50391,
+  "<|extratoken_136|>": 50392,
+  "<|extratoken_137|>": 50393,
+  "<|extratoken_138|>": 50394,
+  "<|extratoken_139|>": 50395,
+  "<|extratoken_13|>": 50269,
+  "<|extratoken_140|>": 50396,
+  "<|extratoken_141|>": 50397,
+  "<|extratoken_142|>": 50398,
+  "<|extratoken_143|>": 50399,
+  "<|extratoken_14|>": 50270,
+  "<|extratoken_15|>": 50271,
+  "<|extratoken_16|>": 50272,
+  "<|extratoken_17|>": 50273,
+  "<|extratoken_18|>": 50274,
+  "<|extratoken_19|>": 50275,
+  "<|extratoken_1|>": 50257,
+  "<|extratoken_20|>": 50276,
+  "<|extratoken_21|>": 50277,
+  "<|extratoken_22|>": 50278,
+  "<|extratoken_23|>": 50279,
+  "<|extratoken_24|>": 50280,
+  "<|extratoken_25|>": 50281,
+  "<|extratoken_26|>": 50282,
+  "<|extratoken_27|>": 50283,
+  "<|extratoken_28|>": 50284,
+  "<|extratoken_29|>": 50285,
+  "<|extratoken_2|>": 50258,
+  "<|extratoken_30|>": 50286,
+  "<|extratoken_31|>": 50287,
+  "<|extratoken_32|>": 50288,
+  "<|extratoken_33|>": 50289,
+  "<|extratoken_34|>": 50290,
+  "<|extratoken_35|>": 50291,
+  "<|extratoken_36|>": 50292,
+  "<|extratoken_37|>": 50293,
+  "<|extratoken_38|>": 50294,
+  "<|extratoken_39|>": 50295,
+  "<|extratoken_3|>": 50259,
+  "<|extratoken_40|>": 50296,
+  "<|extratoken_41|>": 50297,
+  "<|extratoken_42|>": 50298,
+  "<|extratoken_43|>": 50299,
+  "<|extratoken_44|>": 50300,
+  "<|extratoken_45|>": 50301,
+  "<|extratoken_46|>": 50302,
+  "<|extratoken_47|>": 50303,
+  "<|extratoken_48|>": 50304,
+  "<|extratoken_49|>": 50305,
+  "<|extratoken_4|>": 50260,
+  "<|extratoken_50|>": 50306,
+  "<|extratoken_51|>": 50307,
+  "<|extratoken_52|>": 50308,
+  "<|extratoken_53|>": 50309,
+  "<|extratoken_54|>": 50310,
+  "<|extratoken_55|>": 50311,
+  "<|extratoken_56|>": 50312,
+  "<|extratoken_57|>": 50313,
+  "<|extratoken_58|>": 50314,
+  "<|extratoken_59|>": 50315,
+  "<|extratoken_5|>": 50261,
+  "<|extratoken_60|>": 50316,
+  "<|extratoken_61|>": 50317,
+  "<|extratoken_62|>": 50318,
+  "<|extratoken_63|>": 50319,
+  "<|extratoken_64|>": 50320,
+  "<|extratoken_65|>": 50321,
+  "<|extratoken_66|>": 50322,
+  "<|extratoken_67|>": 50323,
+  "<|extratoken_68|>": 50324,
+  "<|extratoken_69|>": 50325,
+  "<|extratoken_6|>": 50262,
+  "<|extratoken_70|>": 50326,
+  "<|extratoken_71|>": 50327,
+  "<|extratoken_72|>": 50328,
+  "<|extratoken_73|>": 50329,
+  "<|extratoken_74|>": 50330,
+  "<|extratoken_75|>": 50331,
+  "<|extratoken_76|>": 50332,
+  "<|extratoken_77|>": 50333,
+  "<|extratoken_78|>": 50334,
+  "<|extratoken_79|>": 50335,
+  "<|extratoken_7|>": 50263,
+  "<|extratoken_80|>": 50336,
+  "<|extratoken_81|>": 50337,
+  "<|extratoken_82|>": 50338,
+  "<|extratoken_83|>": 50339,
+  "<|extratoken_84|>": 50340,
+  "<|extratoken_85|>": 50341,
+  "<|extratoken_86|>": 50342,
+  "<|extratoken_87|>": 50343,
+  "<|extratoken_88|>": 50344,
+  "<|extratoken_89|>": 50345,
+  "<|extratoken_8|>": 50264,
+  "<|extratoken_90|>": 50346,
+  "<|extratoken_91|>": 50347,
+  "<|extratoken_92|>": 50348,
+  "<|extratoken_93|>": 50349,
+  "<|extratoken_94|>": 50350,
+  "<|extratoken_95|>": 50351,
+  "<|extratoken_96|>": 50352,
+  "<|extratoken_97|>": 50353,
+  "<|extratoken_98|>": 50354,
+  "<|extratoken_99|>": 50355,
+  "<|extratoken_9|>": 50265
+}

config.json ADDED Viewed

	@@ -0,0 +1,50 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPTJMoEForCausalLM"
+  ],
+  "attn_pdrop": 0.0,
+  "auto_map": {
+    "AutoConfig": "configuration_gptj_moe.GPTJMoEConfig",
+    "AutoModel": "modeling_gptj_moe.GPTJMoEModel",
+    "AutoModelForCausalLM": "modeling_gptj_moe.GPTJMoEForCausalLM"
+  },
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.0,
+  "eos_token_id": 50256,
+  "gradient_checkpointing": false,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gptj_moe",
+  "n_embd": 4096,
+  "n_head": 16,
+  "n_inner": null,
+  "n_layer": 28,
+  "n_positions": 2048,
+  "num_experts_per_tok": 2,
+  "num_local_experts": 4,
+  "output_router_logits": false,
+  "resid_pdrop": 0.0,
+  "rotary_dim": 64,
+  "router_aux_loss_coef": 0.001,
+  "router_jitter_noise": 0.0,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50,
+      "temperature": 1.0
+    }
+  },
+  "tie_word_embeddings": false,
+  "tokenizer_class": "GPT2Tokenizer",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.40.0.dev0",
+  "use_cache": true,
+  "vocab_size": 50400
+}

configuration_gptj_moe.py ADDED Viewed

	@@ -0,0 +1,120 @@

+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+class GPTJMoEConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GPTJModel`]. It is used to instantiate a GPT-J
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the GPT-J
+    [EleutherAI/gpt-j-6B](https://huggingface.co/EleutherAI/gpt-j-6B) architecture. Configuration objects inherit from
+    [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`]
+    for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50400):
+            Vocabulary size of the GPT-J model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`GPTJModel`].
+        n_positions (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 4096):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 28):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        rotary_dim (`int`, *optional*, defaults to 64):
+            Number of dimensions in the embedding that Rotary Position Embedding is applied to.
+        n_inner (`int`, *optional*, defaults to None):
+            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"gelu_new"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`int`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        num_experts_per_tok (`int`, *optional*, defaults to 2):
+            The number of experts to root per-token, can be also interpreted as the `top-p` routing
+            parameter
+        num_local_experts (`int`, *optional*, defaults to 4):
+            Number of experts per Sparse MLP layer.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabeling this will also
+            allow the model to output the auxiliary loss. See [here]() for more details
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+        router_jitter_noise (`float`, *optional*, defaults to 0.0):
+            Amount of noise to add to the router.
+        """
+    model_type = "gptj_moe"
+    attribute_map = {
+        "max_position_embeddings": "n_positions",
+        "hidden_size": "n_embd",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+    def __init__(
+        self,
+        vocab_size=50400,
+        n_positions=2048,
+        n_embd=4096,
+        n_layer=28,
+        n_head=16,
+        rotary_dim=64,
+        n_inner=None,
+        activation_function="gelu_new",
+        resid_pdrop=0.0,
+        embd_pdrop=0.0,
+        attn_pdrop=0.0,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        tie_word_embeddings=False,
+        n_experts_per_tok=2,
+        n_local_experts=4,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        router_jitter_noise=0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.rotary_dim = rotary_dim
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.num_experts_per_tok = n_experts_per_tok
+        self.num_local_experts = n_local_experts
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.router_jitter_noise = router_jitter_noise
+        super().__init__(
+            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
+        )

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.40.0.dev0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:56b567fa4fb5ba701d0a89e3e21f61bf450adaa93baeee7877cf918723e31086
+size 4977101656

model-00002-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f9af7ef4e8ce64473087fbf84f72524a16907ecde61330362cafe2c9aa1232e0
+size 4966952456

model-00003-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:505a7867107466c875521fa16f6cafec577f36ebf9c99313ca293531b26fafcd
+size 4966944520

model-00004-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:827cb5b4972c4b3b1b26b6da5b8e5edaf1a9b55939bff202002b017bf0bc5beb
+size 4966952864

model-00005-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dea322e2cf83970e37a13679157d50a8effe44b2a5ab742dbd5ab0b27c87d281
+size 4966952552

model-00006-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:db980fdb921b826014a3704fa9516b5a77857b64640216a97492b895b3ff975d
+size 4966927976

model-00007-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45af2b4065d5a8d0058e6a59729774dea7fa54b3c25e585a0889e90205a990ac
+size 4842951552

model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,656 @@

+{
+  "metadata": {
+    "total_size": 34654702016
+  },
+  "weight_map": {
+    "lm_head.bias": "model-00007-of-00007.safetensors",
+    "lm_head.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.0.attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.0.attn.out_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.0.attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.0.attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.0.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.0.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.0.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.0.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.1.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.1.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.1.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.1.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.2.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.2.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.2.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.2.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.3.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.3.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.3.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.experts.3.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.0.block_sparse_moe.gate.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.0.ln_1.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.0.ln_1.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.1.attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.1.attn.out_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.1.attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.1.attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.0.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.0.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.0.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.0.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.1.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.1.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.1.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.1.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.2.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.2.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.2.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.2.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.3.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.3.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.3.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.experts.3.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.1.block_sparse_moe.gate.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.1.ln_1.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.1.ln_1.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.10.attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.10.attn.out_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.10.attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.10.attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.0.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.0.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.0.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.0.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.1.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.1.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.1.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.1.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.2.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.2.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.2.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.2.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.3.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.3.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.3.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.experts.3.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.10.block_sparse_moe.gate.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.10.ln_1.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.10.ln_1.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.11.attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.11.attn.out_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.11.attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.11.attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.0.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.0.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.0.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.0.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.1.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.1.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.1.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.1.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.2.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.2.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.2.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.2.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.3.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.3.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.3.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.experts.3.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.11.block_sparse_moe.gate.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.11.ln_1.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.11.ln_1.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.12.attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.12.attn.out_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.12.attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.12.attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.0.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.0.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.0.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.0.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.1.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.1.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.1.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.1.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.2.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.2.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.2.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.2.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.3.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.3.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.3.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.experts.3.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.12.block_sparse_moe.gate.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.12.ln_1.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.12.ln_1.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.13.attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.13.attn.out_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.13.attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.13.attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.0.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.0.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.0.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.0.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.1.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.1.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.1.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.1.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.2.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.2.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.2.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.2.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.3.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.3.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.3.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.experts.3.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.13.block_sparse_moe.gate.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.13.ln_1.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.13.ln_1.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.14.attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.14.attn.out_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.14.attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.14.attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.0.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.0.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.0.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.0.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.1.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.1.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.1.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.1.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.2.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.2.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.2.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.2.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.3.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.3.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.3.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.experts.3.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.14.block_sparse_moe.gate.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.14.ln_1.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.14.ln_1.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.15.attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.15.attn.out_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.15.attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.15.attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.0.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.0.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.0.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.0.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.1.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.1.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.1.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.1.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.2.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.2.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.2.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.2.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.3.fc_in.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.3.fc_in.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.3.fc_out.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.experts.3.fc_out.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.15.block_sparse_moe.gate.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.15.ln_1.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.15.ln_1.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.16.attn.k_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.16.attn.out_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.16.attn.q_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.16.attn.v_proj.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.0.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.0.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.0.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.0.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.1.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.1.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.1.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.1.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.2.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.2.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.2.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.2.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.3.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.3.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.3.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.experts.3.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.16.block_sparse_moe.gate.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.16.ln_1.bias": "model-00004-of-00007.safetensors",
+    "transformer.h.16.ln_1.weight": "model-00004-of-00007.safetensors",
+    "transformer.h.17.attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.17.attn.out_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.17.attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.17.attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.0.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.0.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.0.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.0.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.1.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.1.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.1.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.1.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.2.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.2.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.2.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.2.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.3.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.3.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.3.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.experts.3.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.17.block_sparse_moe.gate.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.17.ln_1.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.17.ln_1.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.18.attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.18.attn.out_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.18.attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.18.attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.0.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.0.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.0.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.0.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.1.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.1.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.1.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.1.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.2.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.2.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.2.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.2.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.3.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.3.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.3.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.experts.3.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.18.block_sparse_moe.gate.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.18.ln_1.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.18.ln_1.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.19.attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.19.attn.out_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.19.attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.19.attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.0.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.0.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.0.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.0.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.1.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.1.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.1.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.1.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.2.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.2.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.2.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.2.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.3.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.3.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.3.fc_out.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.experts.3.fc_out.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.19.block_sparse_moe.gate.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.19.ln_1.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.19.ln_1.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.2.attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.2.attn.out_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.2.attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.2.attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.0.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.0.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.0.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.0.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.1.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.1.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.1.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.1.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.2.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.2.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.2.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.2.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.3.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.3.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.3.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.experts.3.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.2.block_sparse_moe.gate.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.2.ln_1.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.2.ln_1.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.20.attn.k_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.20.attn.out_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.20.attn.q_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.20.attn.v_proj.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.0.fc_in.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.0.fc_in.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.0.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.0.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.1.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.1.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.1.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.1.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.2.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.2.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.2.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.2.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.3.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.3.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.3.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.experts.3.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.20.block_sparse_moe.gate.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.20.ln_1.bias": "model-00005-of-00007.safetensors",
+    "transformer.h.20.ln_1.weight": "model-00005-of-00007.safetensors",
+    "transformer.h.21.attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.21.attn.out_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.21.attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.21.attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.0.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.0.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.0.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.0.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.1.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.1.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.1.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.1.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.2.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.2.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.2.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.2.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.3.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.3.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.3.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.experts.3.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.21.block_sparse_moe.gate.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.21.ln_1.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.21.ln_1.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.22.attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.22.attn.out_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.22.attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.22.attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.0.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.0.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.0.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.0.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.1.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.1.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.1.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.1.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.2.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.2.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.2.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.2.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.3.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.3.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.3.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.experts.3.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.22.block_sparse_moe.gate.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.22.ln_1.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.22.ln_1.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.23.attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.23.attn.out_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.23.attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.23.attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.0.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.0.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.0.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.0.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.1.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.1.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.1.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.1.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.2.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.2.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.2.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.2.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.3.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.3.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.3.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.experts.3.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.23.block_sparse_moe.gate.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.23.ln_1.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.23.ln_1.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.24.attn.k_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.24.attn.out_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.24.attn.q_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.24.attn.v_proj.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.0.fc_in.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.0.fc_in.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.0.fc_out.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.0.fc_out.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.1.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.1.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.1.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.1.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.2.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.2.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.2.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.2.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.3.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.3.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.3.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.experts.3.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.24.block_sparse_moe.gate.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.24.ln_1.bias": "model-00006-of-00007.safetensors",
+    "transformer.h.24.ln_1.weight": "model-00006-of-00007.safetensors",
+    "transformer.h.25.attn.k_proj.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.25.attn.out_proj.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.25.attn.q_proj.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.25.attn.v_proj.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.0.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.0.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.0.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.0.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.1.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.1.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.1.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.1.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.2.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.2.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.2.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.2.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.3.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.3.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.3.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.experts.3.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.25.block_sparse_moe.gate.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.25.ln_1.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.25.ln_1.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.26.attn.k_proj.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.26.attn.out_proj.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.26.attn.q_proj.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.26.attn.v_proj.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.0.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.0.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.0.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.0.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.1.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.1.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.1.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.1.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.2.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.2.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.2.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.2.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.3.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.3.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.3.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.experts.3.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.26.block_sparse_moe.gate.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.26.ln_1.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.26.ln_1.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.27.attn.k_proj.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.27.attn.out_proj.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.27.attn.q_proj.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.27.attn.v_proj.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.0.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.0.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.0.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.0.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.1.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.1.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.1.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.1.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.2.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.2.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.2.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.2.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.3.fc_in.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.3.fc_in.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.3.fc_out.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.experts.3.fc_out.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.27.block_sparse_moe.gate.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.27.ln_1.bias": "model-00007-of-00007.safetensors",
+    "transformer.h.27.ln_1.weight": "model-00007-of-00007.safetensors",
+    "transformer.h.3.attn.k_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.3.attn.out_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.3.attn.q_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.3.attn.v_proj.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.0.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.0.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.0.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.0.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.1.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.1.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.1.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.1.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.2.fc_in.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.2.fc_in.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.2.fc_out.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.2.fc_out.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.3.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.3.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.3.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.experts.3.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.3.block_sparse_moe.gate.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.3.ln_1.bias": "model-00001-of-00007.safetensors",
+    "transformer.h.3.ln_1.weight": "model-00001-of-00007.safetensors",
+    "transformer.h.4.attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.4.attn.out_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.4.attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.4.attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.0.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.0.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.0.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.0.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.1.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.1.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.1.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.1.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.2.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.2.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.2.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.2.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.3.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.3.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.3.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.experts.3.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.4.block_sparse_moe.gate.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.4.ln_1.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.4.ln_1.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.5.attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.5.attn.out_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.5.attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.5.attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.0.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.0.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.0.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.0.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.1.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.1.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.1.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.1.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.2.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.2.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.2.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.2.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.3.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.3.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.3.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.experts.3.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.5.block_sparse_moe.gate.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.5.ln_1.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.5.ln_1.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.6.attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.6.attn.out_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.6.attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.6.attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.0.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.0.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.0.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.0.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.1.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.1.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.1.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.1.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.2.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.2.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.2.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.2.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.3.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.3.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.3.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.experts.3.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.6.block_sparse_moe.gate.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.6.ln_1.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.6.ln_1.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.7.attn.k_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.7.attn.out_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.7.attn.q_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.7.attn.v_proj.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.0.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.0.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.0.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.0.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.1.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.1.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.1.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.1.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.2.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.2.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.2.fc_out.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.2.fc_out.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.3.fc_in.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.3.fc_in.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.3.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.experts.3.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.7.block_sparse_moe.gate.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.7.ln_1.bias": "model-00002-of-00007.safetensors",
+    "transformer.h.7.ln_1.weight": "model-00002-of-00007.safetensors",
+    "transformer.h.8.attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.8.attn.out_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.8.attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.8.attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.0.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.0.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.0.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.0.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.1.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.1.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.1.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.1.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.2.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.2.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.2.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.2.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.3.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.3.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.3.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.experts.3.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.8.block_sparse_moe.gate.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.8.ln_1.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.8.ln_1.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.9.attn.k_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.9.attn.out_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.9.attn.q_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.9.attn.v_proj.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.0.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.0.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.0.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.0.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.1.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.1.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.1.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.1.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.2.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.2.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.2.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.2.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.3.fc_in.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.3.fc_in.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.3.fc_out.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.experts.3.fc_out.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.9.block_sparse_moe.gate.weight": "model-00003-of-00007.safetensors",
+    "transformer.h.9.ln_1.bias": "model-00003-of-00007.safetensors",
+    "transformer.h.9.ln_1.weight": "model-00003-of-00007.safetensors",
+    "transformer.ln_f.bias": "model-00007-of-00007.safetensors",
+    "transformer.ln_f.weight": "model-00007-of-00007.safetensors",
+    "transformer.wte.weight": "model-00001-of-00007.safetensors"
+  }
+}

modeling_gptj_moe.py ADDED Viewed

	@@ -0,0 +1,671 @@

+""" GPT-J model with MoE. """
+import warnings
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn.functional as F
+from torch import nn
+from transformers.modeling_outputs import (
+    MoeCausalLMOutputWithPast,
+    MoeModelOutputWithPast
+)
+from transformers.models.gptj.modeling_gptj import (
+    GPTJ_ATTENTION_CLASSES,
+    GPTJMLP,
+    GPTJPreTrainedModel
+)
+from transformers.utils import logging
+from transformers.utils.model_parallel_utils import assert_device_map, get_device_map
+from .configuration_gptj_moe import GPTJMoEConfig
+logger = logging.get_logger(__name__)
+# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func
+def load_balancing_loss_func(
+    gate_logits: torch.Tensor, num_experts: torch.Tensor = None, top_k=2, attention_mask: Optional[torch.Tensor] = None
+) -> float:
+    r"""
+    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+    See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+    Args:
+        gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
+            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
+            shape [batch_size X sequence_length, num_experts].
+        attention_mask (`torch.Tensor`, None):
+            The attention_mask used in forward function
+            shape [batch_size X sequence_length] if not None.
+        num_experts (`int`, *optional*):
+            Number of experts
+    Returns:
+        The auxiliary loss.
+    """
+    if gate_logits is None or not isinstance(gate_logits, tuple):
+        return 0
+    if isinstance(gate_logits, tuple):
+        compute_device = gate_logits[0].device
+        concatenated_gate_logits = torch.cat([layer_gate.to(compute_device) for layer_gate in gate_logits], dim=0)
+    routing_weights = F.softmax(concatenated_gate_logits, dim=-1)
+    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
+    expert_mask = F.one_hot(selected_experts, num_experts)
+    if attention_mask is None:
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.mean(routing_weights, dim=0)
+    else:
+        batch_size, sequence_length = attention_mask.shape
+        num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length)
+        # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask
+        expert_attention_mask = (
+            attention_mask[None, :, :, None, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts))
+            .reshape(-1, top_k, num_experts)
+            .to(compute_device)
+        )
+        # Compute the percentage of tokens routed to each experts
+        tokens_per_expert = torch.sum(expert_mask.float() * expert_attention_mask, dim=0) / torch.sum(
+            expert_attention_mask, dim=0
+        )
+        # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert
+        router_per_expert_attention_mask = (
+            attention_mask[None, :, :, None]
+            .expand((num_hidden_layers, batch_size, sequence_length, num_experts))
+            .reshape(-1, num_experts)
+            .to(compute_device)
+        )
+        # Compute the average probability of routing to these experts
+        router_prob_per_expert = torch.sum(routing_weights * router_per_expert_attention_mask, dim=0) / torch.sum(
+            router_per_expert_attention_mask, dim=0
+        )
+    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
+    return overall_loss * num_experts
+# Copied from transformers.models.mixtral.modeling_mixtral.MixtralSparseMoeBlock
+class GPTJSparseMoE(nn.Module):
+    """
+    This implementation is
+    strictly equivalent to standard MoE with full capacity (no
+    dropped tokens). It's faster since it formulates MoE operations
+    in terms of block-sparse operations to accomodate imbalanced
+    assignments of tokens to experts, whereas standard MoE either
+    (1) drop tokens at the cost of reduced performance or (2) set
+    capacity factor to number of experts and thus waste computation
+    and memory on padding.
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_dim = config.n_embd
+        self.ffn_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
+        self.num_experts = config.num_local_experts
+        self.top_k = config.num_experts_per_tok
+        # gating
+        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
+        self.experts = nn.ModuleList([GPTJMLP(self.ffn_dim, config) for _ in range(self.num_experts)])
+        # Jitter parameters
+        self.jitter_noise = config.router_jitter_noise
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """ """
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        if self.training and self.jitter_noise > 0:
+            hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise)
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        # router_logits: (batch * sequence_length, n_experts)
+        router_logits = self.gate(hidden_states)
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
+        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+        # we cast back to the input dtype
+        routing_weights = routing_weights.to(hidden_states.dtype)
+        final_hidden_states = torch.zeros(
+            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
+        )
+        # One hot encode the selected experts to create an expert mask
+        # this will be used to easily index which expert is going to be sollicitated
+        expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
+        # Loop over all available experts in the model and perform the computation on each expert
+        for expert_idx in range(self.num_experts):
+            expert_layer = self.experts[expert_idx]
+            idx, top_x = torch.where(expert_mask[expert_idx])
+            if top_x.shape[0] == 0:
+                continue
+            # Index the correct hidden states and compute the expert hidden state for
+            # the current expert. We need to make sure to multiply the output hidden
+            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
+            # However `index_add_` only support torch tensors for indexing so we'll use
+            # the `top_x` tensor here.
+            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
+        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states, router_logits
+class GPTJMoEBlock(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        self.attn = GPTJ_ATTENTION_CLASSES[config._attn_implementation](config)
+        self.block_sparse_moe = GPTJSparseMoE(config)
+    def forward(
+        self,
+        hidden_states: Optional[torch.FloatTensor],
+        layer_past: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        output_router_logits: Optional[bool] = False,
+    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_outputs = self.attn(
+            hidden_states=hidden_states,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+        )
+        attn_output = attn_outputs[0]  # output_attn: a, present, (attentions)
+        outputs = attn_outputs[1:]
+        feed_forward_hidden_states, router_logits = self.block_sparse_moe(hidden_states)
+        hidden_states = attn_output + feed_forward_hidden_states + residual
+        if use_cache:
+            outputs = (hidden_states,) + outputs
+        else:
+            outputs = (hidden_states,) + outputs[1:]
+        if output_router_logits:
+            outputs = outputs + (router_logits,)
+        return outputs  # hidden_states, present, (attentions), (router_logits)
+class GPTJMoEModel(GPTJPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.embed_dim = config.n_embd
+        self.vocab_size = config.vocab_size
+        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+        self.drop = nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList([GPTJMoEBlock(config) for _ in range(config.n_layer)])
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+    def parallelize(self, device_map=None):
+        warnings.warn(
+            "`GPTJModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your"
+            " model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
+            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'h.0': 0, 'h.1': 1,"
+            " ...}",
+            FutureWarning,
+        )
+        # Check validity of device_map
+        self.device_map = (
+            get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map
+        )
+        assert_device_map(self.device_map, len(self.h))
+        self.model_parallel = True
+        self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
+        self.last_device = "cuda:" + str(max(self.device_map.keys()))
+        self.wte = self.wte.to(self.first_device)
+        # Load onto devices
+        for k, v in self.device_map.items():
+            for block in v:
+                cuda_device = "cuda:" + str(k)
+                self.h[block] = self.h[block].to(cuda_device)
+        # ln_f to last
+        self.ln_f = self.ln_f.to(self.last_device)
+    def deparallelize(self):
+        warnings.warn(
+            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
+            FutureWarning,
+        )
+        self.model_parallel = False
+        self.device_map = None
+        self.first_device = "cpu"
+        self.last_device = "cpu"
+        self.wte = self.wte.to("cpu")
+        for index in range(len(self.h)):
+            self.h[index] = self.h[index].to("cpu")
+        self.ln_f = self.ln_f.to("cpu")
+        torch.cuda.empty_cache()
+    def get_input_embeddings(self):
+        return self.wte
+    def set_input_embeddings(self, new_embeddings):
+        self.wte = new_embeddings
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, MoeModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+            batch_size = input_ids.shape[0]
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size = inputs_embeds.shape[0]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, input_shape[-1])
+        if past_key_values is None:
+            past_length = 0
+            past_key_values = tuple([None] * len(self.h))
+        else:
+            past_length = past_key_values[0][0].size(-2)
+        if position_ids is None:
+            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
+            position_ids = position_ids.unsqueeze(0)
+        if not self._use_flash_attention_2:
+            # Attention mask.
+            if attention_mask is not None:
+                if batch_size <= 0:
+                    raise ValueError("batch_size has to be defined and > 0")
+                attention_mask = attention_mask.view(batch_size, -1)
+                # We create a 3D attention mask from a 2D tensor mask.
+                # Sizes are [batch_size, 1, 1, to_seq_length]
+                # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+                # this attention mask is more simple than the triangular masking of causal attention
+                # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+                attention_mask = attention_mask[:, None, None, :]
+                # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+                # masked positions, this operation will create a tensor which is 0.0 for
+                # positions we want to attend and the dtype's smallest value for masked positions.
+                # Since we are adding it to the raw scores before the softmax, this is
+                # effectively the same as removing these entirely.
+                attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+                attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x num_attention_heads x N x N
+        # head_mask has shape n_layer x batch x num_attention_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+        if inputs_embeds is None:
+            inputs_embeds = self.wte(input_ids)
+        hidden_states = inputs_embeds
+        if token_type_ids is not None:
+            token_type_embeds = self.wte(token_type_ids)
+            hidden_states = hidden_states + token_type_embeds
+        hidden_states = self.drop(hidden_states)
+        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        presents = () if use_cache else None
+        all_self_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        all_router_logits = () if output_router_logits else None
+        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
+            # Model parallel
+            if self.model_parallel:
+                torch.cuda.set_device(hidden_states.device)
+                # Ensure layer_past is on same device as hidden_states (might not be correct)
+                if layer_past is not None:
+                    layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past)
+                # Ensure that attention_mask is always on the same device as hidden_states
+                if attention_mask is not None:
+                    attention_mask = attention_mask.to(hidden_states.device)
+                if isinstance(head_mask, torch.Tensor):
+                    head_mask = head_mask.to(hidden_states.device)
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                outputs = self._gradient_checkpointing_func(
+                    block.__call__,
+                    hidden_states,
+                    None,
+                    attention_mask,
+                    position_ids,
+                    head_mask[i],
+                    use_cache,
+                    output_attentions,
+                    output_router_logits,
+                )
+            else:
+                outputs = block(
+                    hidden_states=hidden_states,
+                    layer_past=layer_past,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    head_mask=head_mask[i],
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                    output_router_logits=output_router_logits,
+                )
+            hidden_states = outputs[0]
+            if use_cache is True:
+                presents = presents + (outputs[1],)
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
+            if output_router_logits:
+                all_router_logits = all_router_logits + (outputs[-1],)
+            # Model Parallel: If it's the last layer for that device, put things on the next device
+            if self.model_parallel:
+                for k, v in self.device_map.items():
+                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
+                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
+        hidden_states = self.ln_f(hidden_states)
+        hidden_states = hidden_states.view(output_shape)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+        # Add router logits
+        if output_router_logits:
+            all_router_logits += (outputs[-1],)
+        if not return_dict:
+            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)
+        return MoeModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=presents,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            router_logits=all_router_logits,
+        )
+class GPTJMoEForCausalLM(GPTJPreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = GPTJMoEModel(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        # MoE
+        self.router_aux_loss_coef = config.router_aux_loss_coef
+        self.num_experts = config.num_local_experts
+        self.num_experts_per_tok = config.num_experts_per_tok
+        # Initialize weights and apply final processing
+        self.post_init()
+    def parallelize(self, device_map=None):
+        warnings.warn(
+            "`GPTJForCausalLM.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
+            " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
+            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'transformer.h.0':"
+            " 0, 'transformer.h.1': 1, ...}",
+            FutureWarning,
+        )
+        self.device_map = (
+            get_device_map(len(self.transformer.h), range(torch.cuda.device_count()))
+            if device_map is None
+            else device_map
+        )
+        assert_device_map(self.device_map, len(self.transformer.h))
+        self.transformer.parallelize(self.device_map)
+        self.lm_head = self.lm_head.to(self.transformer.first_device)
+        self.model_parallel = True
+    def deparallelize(self):
+        warnings.warn(
+            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
+            FutureWarning,
+        )
+        self.transformer.deparallelize()
+        self.transformer = self.transformer.to("cpu")
+        self.lm_head = self.lm_head.to("cpu")
+        self.model_parallel = False
+        torch.cuda.empty_cache()
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, output_router_logits=False, **kwargs):
+        token_type_ids = kwargs.get("token_type_ids", None)
+        # Omit tokens covered by past_key_values
+        if past_key_values:
+            past_length = past_key_values[0][0].shape[2]
+            # Some generation methods already pass only the last input ID
+            if input_ids.shape[1] > past_length:
+                remove_prefix_length = past_length
+            else:
+                # Default to old behavior: keep only final ID
+                remove_prefix_length = input_ids.shape[1] - 1
+            input_ids = input_ids[:, remove_prefix_length:]
+            if token_type_ids is not None:
+                token_type_ids = token_type_ids[:, -input_ids.shape[1] :]
+        attention_mask = kwargs.get("attention_mask", None)
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "position_ids": position_ids,
+                "attention_mask": attention_mask,
+                "token_type_ids": token_type_ids,
+                "output_router_logits": output_router_logits,
+            }
+        )
+        return model_inputs
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            output_router_logits=output_router_logits,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        # Set device for model parallelism
+        if self.model_parallel:
+            torch.cuda.set_device(self.transformer.first_device)
+            hidden_states = hidden_states.to(self.lm_head.weight.device)
+        # make sure sampling in fp16 works correctly and
+        # compute loss in fp32 to match with mesh-tf version
+        # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
+        lm_logits = self.lm_head(hidden_states).to(torch.float32)
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
+            # Shift so that tokens < n predict n
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+            loss = loss.to(hidden_states.dtype)
+        # MoE loss
+        aux_loss = None
+        if output_router_logits:
+            aux_loss = load_balancing_loss_func(
+                transformer_outputs.router_logits if return_dict else transformer_outputs[-1],
+                self.num_experts,
+                self.num_experts_per_tok,
+                attention_mask,
+            )
+            if labels is not None:
+                loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device
+        if not return_dict:
+            output = (lm_logits,) + transformer_outputs[1:]
+            if output_router_logits:
+                output = (aux_loss,) + output
+            return ((loss,) + output) if loss is not None else output
+        return MoeCausalLMOutputWithPast(
+            loss=loss,
+            aux_loss=aux_loss,
+            logits=lm_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+            router_logits=transformer_outputs.router_logits
+        )
+    @staticmethod
+    def _reorder_cache(
+        past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor
+    ) -> Tuple[Tuple[torch.Tensor]]:
+        """
+        This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or
+        [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
+        beam_idx at every generation step.
+        """
+        return tuple(
+            tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past)
+            for layer_past in past_key_values
+        )

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,1166 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50257": {
+      "content": "<|extratoken_1|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50258": {
+      "content": "<|extratoken_2|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50259": {
+      "content": "<|extratoken_3|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50260": {
+      "content": "<|extratoken_4|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50261": {
+      "content": "<|extratoken_5|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50262": {
+      "content": "<|extratoken_6|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50263": {
+      "content": "<|extratoken_7|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50264": {
+      "content": "<|extratoken_8|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50265": {
+      "content": "<|extratoken_9|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50266": {
+      "content": "<|extratoken_10|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50267": {
+      "content": "<|extratoken_11|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50268": {
+      "content": "<|extratoken_12|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50269": {
+      "content": "<|extratoken_13|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50270": {
+      "content": "<|extratoken_14|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50271": {
+      "content": "<|extratoken_15|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50272": {
+      "content": "<|extratoken_16|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50273": {
+      "content": "<|extratoken_17|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50274": {
+      "content": "<|extratoken_18|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50275": {
+      "content": "<|extratoken_19|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50276": {
+      "content": "<|extratoken_20|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50277": {
+      "content": "<|extratoken_21|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50278": {
+      "content": "<|extratoken_22|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50279": {
+      "content": "<|extratoken_23|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50280": {
+      "content": "<|extratoken_24|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50281": {
+      "content": "<|extratoken_25|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50282": {
+      "content": "<|extratoken_26|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50283": {
+      "content": "<|extratoken_27|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50284": {
+      "content": "<|extratoken_28|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50285": {
+      "content": "<|extratoken_29|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50286": {
+      "content": "<|extratoken_30|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50287": {
+      "content": "<|extratoken_31|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50288": {
+      "content": "<|extratoken_32|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50289": {
+      "content": "<|extratoken_33|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50290": {
+      "content": "<|extratoken_34|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50291": {
+      "content": "<|extratoken_35|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50292": {
+      "content": "<|extratoken_36|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50293": {
+      "content": "<|extratoken_37|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50294": {
+      "content": "<|extratoken_38|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50295": {
+      "content": "<|extratoken_39|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50296": {
+      "content": "<|extratoken_40|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50297": {
+      "content": "<|extratoken_41|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50298": {
+      "content": "<|extratoken_42|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50299": {
+      "content": "<|extratoken_43|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50300": {
+      "content": "<|extratoken_44|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50301": {
+      "content": "<|extratoken_45|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50302": {
+      "content": "<|extratoken_46|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50303": {
+      "content": "<|extratoken_47|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50304": {
+      "content": "<|extratoken_48|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50305": {
+      "content": "<|extratoken_49|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50306": {
+      "content": "<|extratoken_50|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50307": {
+      "content": "<|extratoken_51|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50308": {
+      "content": "<|extratoken_52|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50309": {
+      "content": "<|extratoken_53|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50310": {
+      "content": "<|extratoken_54|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50311": {
+      "content": "<|extratoken_55|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50312": {
+      "content": "<|extratoken_56|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50313": {
+      "content": "<|extratoken_57|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50314": {
+      "content": "<|extratoken_58|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50315": {
+      "content": "<|extratoken_59|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50316": {
+      "content": "<|extratoken_60|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50317": {
+      "content": "<|extratoken_61|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50318": {
+      "content": "<|extratoken_62|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50319": {
+      "content": "<|extratoken_63|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50320": {
+      "content": "<|extratoken_64|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50321": {
+      "content": "<|extratoken_65|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50322": {
+      "content": "<|extratoken_66|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50323": {
+      "content": "<|extratoken_67|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50324": {
+      "content": "<|extratoken_68|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50325": {
+      "content": "<|extratoken_69|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50326": {
+      "content": "<|extratoken_70|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50327": {
+      "content": "<|extratoken_71|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50328": {
+      "content": "<|extratoken_72|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50329": {
+      "content": "<|extratoken_73|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50330": {
+      "content": "<|extratoken_74|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50331": {
+      "content": "<|extratoken_75|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50332": {
+      "content": "<|extratoken_76|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50333": {
+      "content": "<|extratoken_77|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50334": {
+      "content": "<|extratoken_78|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50335": {
+      "content": "<|extratoken_79|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50336": {
+      "content": "<|extratoken_80|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50337": {
+      "content": "<|extratoken_81|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50338": {
+      "content": "<|extratoken_82|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50339": {
+      "content": "<|extratoken_83|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50340": {
+      "content": "<|extratoken_84|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50341": {
+      "content": "<|extratoken_85|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50342": {
+      "content": "<|extratoken_86|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50343": {
+      "content": "<|extratoken_87|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50344": {
+      "content": "<|extratoken_88|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50345": {
+      "content": "<|extratoken_89|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50346": {
+      "content": "<|extratoken_90|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50347": {
+      "content": "<|extratoken_91|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50348": {
+      "content": "<|extratoken_92|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50349": {
+      "content": "<|extratoken_93|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50350": {
+      "content": "<|extratoken_94|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50351": {
+      "content": "<|extratoken_95|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50352": {
+      "content": "<|extratoken_96|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50353": {
+      "content": "<|extratoken_97|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50354": {
+      "content": "<|extratoken_98|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50355": {
+      "content": "<|extratoken_99|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50356": {
+      "content": "<|extratoken_100|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50357": {
+      "content": "<|extratoken_101|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50358": {
+      "content": "<|extratoken_102|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50359": {
+      "content": "<|extratoken_103|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50360": {
+      "content": "<|extratoken_104|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50361": {
+      "content": "<|extratoken_105|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50362": {
+      "content": "<|extratoken_106|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50363": {
+      "content": "<|extratoken_107|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50364": {
+      "content": "<|extratoken_108|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50365": {
+      "content": "<|extratoken_109|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50366": {
+      "content": "<|extratoken_110|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50367": {
+      "content": "<|extratoken_111|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50368": {
+      "content": "<|extratoken_112|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50369": {
+      "content": "<|extratoken_113|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50370": {
+      "content": "<|extratoken_114|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50371": {
+      "content": "<|extratoken_115|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50372": {
+      "content": "<|extratoken_116|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50373": {
+      "content": "<|extratoken_117|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50374": {
+      "content": "<|extratoken_118|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50375": {
+      "content": "<|extratoken_119|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50376": {
+      "content": "<|extratoken_120|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50377": {
+      "content": "<|extratoken_121|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50378": {
+      "content": "<|extratoken_122|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50379": {
+      "content": "<|extratoken_123|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50380": {
+      "content": "<|extratoken_124|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50381": {
+      "content": "<|extratoken_125|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50382": {
+      "content": "<|extratoken_126|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50383": {
+      "content": "<|extratoken_127|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50384": {
+      "content": "<|extratoken_128|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50385": {
+      "content": "<|extratoken_129|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50386": {
+      "content": "<|extratoken_130|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50387": {
+      "content": "<|extratoken_131|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50388": {
+      "content": "<|extratoken_132|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50389": {
+      "content": "<|extratoken_133|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50390": {
+      "content": "<|extratoken_134|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50391": {
+      "content": "<|extratoken_135|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50392": {
+      "content": "<|extratoken_136|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50393": {
+      "content": "<|extratoken_137|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50394": {
+      "content": "<|extratoken_138|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50395": {
+      "content": "<|extratoken_139|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50396": {
+      "content": "<|extratoken_140|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50397": {
+      "content": "<|extratoken_141|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50398": {
+      "content": "<|extratoken_142|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "50399": {
+      "content": "<|extratoken_143|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "model_max_length": 2048,
+  "pad_token": null,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff