diff --git "a/notebook/tinyllama_to_mixtral_clonebase.ipynb" "b/notebook/tinyllama_to_mixtral_clonebase.ipynb" new file mode 100644--- /dev/null +++ "b/notebook/tinyllama_to_mixtral_clonebase.ipynb" @@ -0,0 +1,1839 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "ae1fb4b51ee2457998f8066635edcc14": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_347c9d512b454f4781f0dbbfda6c3724", + "IPY_MODEL_a458afb6c9b94046bc552a50ada0d867", + "IPY_MODEL_f9062f94c8ee41febe6717fcbcb5053f" + ], + "layout": "IPY_MODEL_a79cec942e034086927cfda8e9d8afca" + } + }, + "347c9d512b454f4781f0dbbfda6c3724": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f30a519a167249b285fa7cf44e5565bc", + "placeholder": "​", + "style": "IPY_MODEL_dba40b83726b4fdaa5d5c8cb1c4d3c3b", + "value": "Loading checkpoint shards: 100%" + } + }, + "a458afb6c9b94046bc552a50ada0d867": { + "model_module": "@jupyter-widgets/controls", + "model_name": 
"FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_65348e69ef7440038f107f5e8efa3b7e", + "max": 22, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ddde5ed5da554643aaea09e0bb0b797a", + "value": 22 + } + }, + "f9062f94c8ee41febe6717fcbcb5053f": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c138ccb796ff4c1a81ccce900bdec068", + "placeholder": "​", + "style": "IPY_MODEL_558ca035522c405fba9c7a3f9e8eb135", + "value": " 22/22 [00:12<00:00, 1.64it/s]" + } + }, + "a79cec942e034086927cfda8e9d8afca": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + 
"grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f30a519a167249b285fa7cf44e5565bc": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "dba40b83726b4fdaa5d5c8cb1c4d3c3b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "65348e69ef7440038f107f5e8efa3b7e": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ddde5ed5da554643aaea09e0bb0b797a": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + 
"c138ccb796ff4c1a81ccce900bdec068": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "558ca035522c405fba9c7a3f9e8eb135": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "code", + "source": [ + "model_name_or_path = \"TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T\" #@param {type:\"string\"}\n", + "model_name = model_name_or_path.split(\"/\")[-1]\n", + "\n", + "save_mistral_dir = 
\"/content/tiny_mistral\" #@param {type:\"string\"}\n", + "\n", + "mixtral_num_experts = 8 #@param {type:\"integer\"}\n", + "save_mixtral_dir = \"/content/tiny_mixtral_x\" #@param {type:\"string\"}\n" + ], + "metadata": { + "cellView": "form", + "id": "IS9mKmQQEHbC" + }, + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nO9OwwtND6bp" + }, + "outputs": [], + "source": [ + "!pip install transformers --upgrade\n", + "!pip install torch safetensors" + ] + }, + { + "cell_type": "code", + "source": [ + "!git clone https://huggingface.co/{model_name_or_path}" + ], + "metadata": { + "id": "-mUQ35RTEE_G" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import json\n", + "import torch\n", + "\n", + "# load config.json\n", + "with open(f\"{model_name}/config.json\") as f:\n", + " config = json.load(f)\n", + "\n", + "print(config)\n", + "\n", + "mistral_config = {\n", + " \"architectures\": [\n", + " \"MistralForCausalLM\"\n", + " ],\n", + " \"attention_dropout\": 0.0,\n", + " \"bos_token_id\": 1,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"silu\",\n", + " \"hidden_size\": 4096,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 14336,\n", + " \"max_position_embeddings\": 32768,\n", + " \"model_type\": \"mistral\",\n", + " \"num_attention_heads\": 32,\n", + " \"num_hidden_layers\": 32,\n", + " \"num_key_value_heads\": 8,\n", + " \"rms_norm_eps\": 1e-05,\n", + " \"rope_theta\": 1000000.0,\n", + " \"sliding_window\": None,\n", + " \"tie_word_embeddings\": False,\n", + " # \"torch_dtype\": \"bfloat16\",\n", + " \"transformers_version\": \"4.36.0\",\n", + " \"use_cache\": True,\n", + " \"vocab_size\": 32000\n", + "}\n", + "mistral_config[\"architectures\"] = [\"MistralForCausalLM\"]\n", + "mistral_config[\"model_type\"] = \"mistral\"\n", + "mistral_config[\"bos_token_id\"] = config[\"bos_token_id\"]\n", + 
"mistral_config[\"eos_token_id\"] = config[\"eos_token_id\"]\n", + "mistral_config[\"hidden_act\"] = config[\"hidden_act\"]\n", + "mistral_config[\"hidden_size\"] = config[\"hidden_size\"]\n", + "mistral_config[\"initializer_range\"] = config[\"initializer_range\"]\n", + "mistral_config[\"intermediate_size\"] = config[\"intermediate_size\"]\n", + "mistral_config[\"max_position_embeddings\"] = config[\"max_position_embeddings\"]\n", + "mistral_config[\"num_attention_heads\"] = config[\"num_attention_heads\"]\n", + "mistral_config[\"num_hidden_layers\"] = config[\"num_hidden_layers\"]\n", + "mistral_config[\"num_key_value_heads\"] = config[\"num_key_value_heads\"]\n", + "mistral_config[\"rms_norm_eps\"] = config[\"rms_norm_eps\"]\n", + "mistral_config[\"rope_theta\"] = 1000000.0\n", + "mistral_config[\"sliding_window\"] = None\n", + "mistral_config[\"tie_word_embeddings\"] = config[\"tie_word_embeddings\"]\n", + "mistral_config[\"torch_dtype\"] = config[\"torch_dtype\"]\n", + "mistral_config[\"transformers_version\"] = \"4.36.0\"\n", + "mistral_config[\"use_cache\"] = config[\"use_cache\"]\n", + "mistral_config[\"vocab_size\"] = config[\"vocab_size\"]\n", + "\n", + "# save tokenizer and model\n", + "from transformers import AutoTokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n", + "tokenizer.save_pretrained(save_mistral_dir)\n", + "\n", + "from transformers import AutoModelForCausalLM\n", + "model = AutoModelForCausalLM.from_pretrained(model_name_or_path)\n", + "if model.dtype == torch.float32:\n", + " model.half()\n", + " model.to(torch.bfloat16)\n", + " mistral_config[\"torch_dtype\"] = \"bfloat16\"\n", + "\n", + "model.save_pretrained(save_mistral_dir)\n", + "\n", + "# save convert mistral config\n", + "with open(f\"{save_mistral_dir}/config.json\", \"w\") as f:\n", + " json.dump(mistral_config, f, indent=2)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ITP1ylgIEaUm", + "outputId": 
"7d5175f6-a686-47b0-ce5a-1c89464c05e5" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{'_name_or_path': 'meta-llama/Llama-2-7b-hf', 'architectures': ['LlamaForCausalLM'], 'bos_token_id': 1, 'eos_token_id': 2, 'hidden_act': 'silu', 'hidden_size': 2048, 'initializer_range': 0.02, 'intermediate_size': 5632, 'max_position_embeddings': 2048, 'model_type': 'llama', 'num_attention_heads': 32, 'num_hidden_layers': 22, 'num_key_value_heads': 4, 'pretraining_tp': 1, 'rms_norm_eps': 1e-05, 'rope_scaling': None, 'tie_word_embeddings': False, 'torch_dtype': 'float32', 'transformers_version': '4.31.0.dev0', 'use_cache': True, 'vocab_size': 32000}\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#\n", + "# モデルの出力テスト\n", + "#\n", + "from transformers import AutoModelForCausalLM, MistralForCausalLM, MixtralForCausalLM\n", + "def test_gen(model_name_or_path):\n", + "\n", + " device = \"cpu\" # ここを変えてね\n", + "\n", + " model = AutoModelForCausalLM.from_pretrained(model_name_or_path)\n", + "\n", + " print(\"check model load \")\n", + " print(model.config)\n", + " print(model)\n", + "\n", + " print(\"check model generate text\")\n", + " messages = [\n", + " {\"role\": \"user\", \"content\": \"What is your favourite condiment?\"},\n", + " {\"role\": \"assistant\", \"content\": \"Well, I'm quite partial to a good squeeze of fresh lemon juice. 
It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!\"},\n", + " {\"role\": \"user\", \"content\": \"Do you have mayonnaise recipes?\"}\n", + " ]\n", + "\n", + " encodeds = tokenizer.apply_chat_template(messages, return_tensors=\"pt\")\n", + "\n", + " model_inputs = encodeds.to(device)\n", + " model.to(device)\n", + "\n", + " generated_ids = model.generate(model_inputs, max_new_tokens=128, do_sample=True)\n", + " decoded = tokenizer.batch_decode(generated_ids)\n", + " print(decoded[0])\n", + " print(\"------------------------\")\n", + " return model, tokenizer\n", + "\n", + "_ , _ = test_gen(save_mistral_dir)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Zx_NM0wEHjmU", + "outputId": "13b64987-2079-44cd-c707-13a4fe77d474" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "check model load \n", + "MistralConfig {\n", + " \"_name_or_path\": \"/content/tiny_mistral\",\n", + " \"architectures\": [\n", + " \"MistralForCausalLM\"\n", + " ],\n", + " \"attention_dropout\": 0.0,\n", + " \"bos_token_id\": 1,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"silu\",\n", + " \"hidden_size\": 2048,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 5632,\n", + " \"max_position_embeddings\": 2048,\n", + " \"model_type\": \"mistral\",\n", + " \"num_attention_heads\": 32,\n", + " \"num_hidden_layers\": 22,\n", + " \"num_key_value_heads\": 4,\n", + " \"rms_norm_eps\": 1e-05,\n", + " \"rope_theta\": 1000000.0,\n", + " \"sliding_window\": null,\n", + " \"tie_word_embeddings\": false,\n", + " \"torch_dtype\": \"bfloat16\",\n", + " \"transformers_version\": \"4.36.1\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 32000\n", + "}\n", + "\n", + "MistralForCausalLM(\n", + " (model): MistralModel(\n", + " (embed_tokens): Embedding(32000, 2048)\n", + " (layers): ModuleList(\n", + " (0-21): 22 x MistralDecoderLayer(\n", + " 
(self_attn): MistralAttention(\n", + " (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", + " (k_proj): Linear(in_features=2048, out_features=256, bias=False)\n", + " (v_proj): Linear(in_features=2048, out_features=256, bias=False)\n", + " (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n", + " (rotary_emb): MistralRotaryEmbedding()\n", + " )\n", + " (mlp): MistralMLP(\n", + " (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", + " (up_proj): Linear(in_features=2048, out_features=5632, bias=False)\n", + " (down_proj): Linear(in_features=5632, out_features=2048, bias=False)\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): MistralRMSNorm()\n", + " (post_attention_layernorm): MistralRMSNorm()\n", + " )\n", + " )\n", + " (norm): MistralRMSNorm()\n", + " )\n", + " (lm_head): Linear(in_features=2048, out_features=32000, bias=False)\n", + ")\n", + "check model generate text\n", + " [INST] What is your favourite condiment? [/INST] Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen! [INST] Do you have mayonnaise recipes? [/INST] ᴍ [/INST] We are about to test a Mayonnaise recipe. ꧁[INST] It's really good. ᴍ꧁ [INST] Do you know how to make one? [/INST] I've eaten many on my recent days. But, I didn't know any recipe.\n", + "[INST] Not here. But, I have tested and I am going to try this recipes sometime. I am so excited!\n", + "ᴍ That is very useful for me. 
I'd have love to try.���\n", + "------------------------\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#\n", + "# mixtral config setting\n", + "#\n", + "\n", + "mixtral_config = {\n", + " \"architectures\": [\n", + " \"MixtralForCausalLM\"\n", + " ],\n", + " \"attention_dropout\": 0.0,\n", + " \"bos_token_id\": 1,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"silu\",\n", + " \"hidden_size\": 4096,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 14336,\n", + " \"max_position_embeddings\": 32768,\n", + " \"model_type\": \"mixtral\",\n", + " \"num_attention_heads\": 32,\n", + " \"num_experts_per_tok\": 2,\n", + " \"num_hidden_layers\": 32,\n", + " \"num_key_value_heads\": 8,\n", + " \"num_local_experts\": 8,\n", + " \"output_router_logits\": False,\n", + " \"rms_norm_eps\": 1e-05,\n", + " \"rope_theta\": 1000000.0,\n", + " \"router_aux_loss_coef\": 0.02,\n", + " \"sliding_window\": None,\n", + " \"tie_word_embeddings\": False,\n", + " \"torch_dtype\": \"bfloat16\",\n", + " \"transformers_version\": \"4.36.0.dev0\",\n", + " \"use_cache\": True,\n", + " \"vocab_size\": 32000\n", + "}\n", + "\n", + "mixtral_config[\"architectures\"] = [\"MixtralForCausalLM\"]\n", + "mixtral_config[\"model_type\"] = \"mixtral\"\n", + "mixtral_config[\"num_experts_per_tok\"] = 2\n", + "mixtral_config[\"num_local_experts\"] = mixtral_num_experts\n", + "\n", + "mixtral_config[\"bos_token_id\"] = mistral_config[\"bos_token_id\"]\n", + "mixtral_config[\"eos_token_id\"] = mistral_config[\"eos_token_id\"]\n", + "mixtral_config[\"hidden_act\"] = mistral_config[\"hidden_act\"]\n", + "mixtral_config[\"hidden_size\"] = mistral_config[\"hidden_size\"]\n", + "mixtral_config[\"initializer_range\"] = mistral_config[\"initializer_range\"]\n", + "mixtral_config[\"intermediate_size\"] = mistral_config[\"intermediate_size\"]\n", + "mixtral_config[\"max_position_embeddings\"] = mistral_config[\"max_position_embeddings\"]\n", + 
"mixtral_config[\"num_attention_heads\"] = mistral_config[\"num_attention_heads\"]\n", + "mixtral_config[\"num_hidden_layers\"] = mistral_config[\"num_hidden_layers\"]\n", + "mixtral_config[\"num_key_value_heads\"] = mistral_config[\"num_key_value_heads\"]\n", + "mixtral_config[\"rms_norm_eps\"] = mistral_config[\"rms_norm_eps\"]\n", + "mixtral_config[\"rope_theta\"] = mistral_config[\"rope_theta\"]\n", + "mixtral_config[\"sliding_window\"] = mistral_config[\"sliding_window\"]\n", + "mixtral_config[\"tie_word_embeddings\"] = mistral_config[\"tie_word_embeddings\"]\n", + "mixtral_config[\"torch_dtype\"] = mistral_config[\"torch_dtype\"]\n", + "mixtral_config[\"transformers_version\"] = \"4.36.0.dev0\"\n", + "mixtral_config[\"use_cache\"] = mistral_config[\"use_cache\"]\n", + "mixtral_config[\"vocab_size\"] = mistral_config[\"vocab_size\"]\n", + "\n", + "print(json.dumps(mixtral_config,indent=2))\n", + "\n", + "# configをsave\n", + "!mkdir -p {save_mixtral_dir}\n", + "with open(f\"{save_mixtral_dir}/config.json\", \"w\") as f:\n", + " json.dump(mixtral_config, f, indent=2)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zipdtc3AIWYD", + "outputId": "222d5380-7228-4412-8684-cf6d1c851e74" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "{\n", + " \"architectures\": [\n", + " \"MixtralForCausalLM\"\n", + " ],\n", + " \"attention_dropout\": 0.0,\n", + " \"bos_token_id\": 1,\n", + " \"eos_token_id\": 2,\n", + " \"hidden_act\": \"silu\",\n", + " \"hidden_size\": 2048,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 5632,\n", + " \"max_position_embeddings\": 2048,\n", + " \"model_type\": \"mixtral\",\n", + " \"num_attention_heads\": 32,\n", + " \"num_experts_per_tok\": 2,\n", + " \"num_hidden_layers\": 22,\n", + " \"num_key_value_heads\": 4,\n", + " \"num_local_experts\": 8,\n", + " \"output_router_logits\": false,\n", + " \"rms_norm_eps\": 1e-05,\n", + 
" \"rope_theta\": 1000000.0,\n", + " \"router_aux_loss_coef\": 0.02,\n", + " \"sliding_window\": null,\n", + " \"tie_word_embeddings\": false,\n", + " \"torch_dtype\": \"bfloat16\",\n", + " \"transformers_version\": \"4.36.0.dev0\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 32000\n", + "}\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# copy other model files\n", + "\n", + "# save tokenizer\n", + "if tokenizer is None:\n", + " from transformers import AutoTokenizer\n", + " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n", + "\n", + "tokenizer.save_pretrained(save_mixtral_dir)\n", + "\n", + "!cp {save_mistral_dir}/generation_config.json {save_mixtral_dir}/generation_config.json\n" + ], + "metadata": { + "id": "T2uTzZHyk6vS" + }, + "execution_count": 7, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# convert mixtral clone\n", + "import torch\n", + "from safetensors import safe_open\n", + "from safetensors.torch import save_file\n", + "import re\n", + "\n", + "def convert_weight_name(mistral_key, mixtral_expert_num):\n", + " if \"mlp.gate_proj\" in mistral_key:\n", + " return mistral_key.replace(\".mlp.gate_proj.\", f\".block_sparse_moe.experts.{mixtral_expert_num}.w1.\")\n", + " elif \"mlp.down_proj\" in mistral_key:\n", + " return mistral_key.replace(\".mlp.down_proj.\", f\".block_sparse_moe.experts.{mixtral_expert_num}.w2.\")\n", + " elif \"mlp.up_proj\" in mistral_key:\n", + " return mistral_key.replace(\".mlp.up_proj.\" , f\".block_sparse_moe.experts.{mixtral_expert_num}.w3.\")\n", + " else:\n", + " return mistral_key\n", + "\n", + "def is_experts_key(mistral_key):\n", + " return \".mlp.\" in mistral_key\n", + "\n", + "def get_layer(mistral_key):\n", + " layer = re.match(r'model[.]layers[.]\\d+[.]', mistral_key)\n", + " if layer is not None:\n", + " return int(re.findall(r'\\d+', layer[0])[0])\n", + " return None\n", + "\n", + "def get_weight_byte_size(weight):\n", + "\n", + " if isinstance(weight, 
torch.Tensor):\n", + " weight_byte_size = weight.nelement() * weight.element_size()\n", + " else:\n", + " weight_byte_size = sum(p.nelement() * p.element_size() for p in weight.parameters())\n", + "\n", + " return weight_byte_size\n", + "\n", + "# mistralのweight取得\n", + "mistral_weights = safe_open(save_mistral_dir + \"/model.safetensors\", framework=\"pt\")\n", + "# print(mistral_weights.keys())\n", + "\n", + "first_weights = {}\n", + "\n", + "gate_shape = mistral_weights.get_tensor(\"model.layers.0.mlp.up_proj.weight\").shape\n", + "gate_tensor = torch.full((mixtral_num_experts, gate_shape[1]), 0.5)\n", + "\n", + "common_layer_weights = {}\n", + "\n", + "print(\"mixtral_num_experts\", mixtral_num_experts, \"gate_shape[1]\", gate_shape[1], \"gate_tensor\", gate_tensor)\n", + "\n", + "# max layer\n", + "max_layer_no = 0\n", + "for key in mistral_weights.keys():\n", + " layer_no = get_layer(key)\n", + " if layer_no is None:\n", + " first_weights[key] = mistral_weights.get_tensor(key)\n", + " else:\n", + " max_layer_no = max(max_layer_no, layer_no)\n", + "\n", + "mixtral_weight_map = {\n", + " \"metadata\": {\n", + " \"total_size\": 0\n", + " },\n", + " \"weight_map\": {\n", + " }\n", + "}\n", + "\n", + "total_size = 0\n", + "\n", + "!rm {save_mixtral_dir + \"/*.safetensors\"}\n", + "\n", + "for i in range(max_layer_no + 1):\n", + " weight_file_no = i + 1\n", + " layer_weights = {}\n", + "\n", + " # first weight\n", + " if weight_file_no == 1:\n", + " for key in first_weights.keys():\n", + " mixtral_key = convert_weight_name(key, 0)\n", + " layer_weights[mixtral_key] = first_weights[mixtral_key]\n", + " total_size += get_weight_byte_size(layer_weights[mixtral_key])\n", + " print(\"first\", mixtral_key, layer_weights[mixtral_key].shape)\n", + "\n", + "\n", + " for key in mistral_weights.keys():\n", + "\n", + " lk = re.match(re.compile(f\"model[.]layers[.]{i}[.]\"), key)\n", + " if lk is not None:\n", + " mistral_layer_key = key\n", + " if not 
is_experts_key(mistral_layer_key):\n", + " mixtral_key = convert_weight_name(mistral_layer_key, 0)\n", + " layer_weights[mixtral_key] = mistral_weights.get_tensor(mistral_layer_key)\n", + " total_size += get_weight_byte_size(layer_weights[mixtral_key])\n", + " print(\"layer\", i , mixtral_key, layer_weights[mixtral_key].shape)\n", + " else:\n", + " print(\"gen experts\")\n", + " for expert_no in range(mixtral_num_experts):\n", + " mixtral_key = convert_weight_name(mistral_layer_key, expert_no)\n", + " layer_weights[mixtral_key] = mistral_weights.get_tensor(mistral_layer_key).clone()\n", + " total_size += get_weight_byte_size(layer_weights[mixtral_key])\n", + " print(\"layer\", i , \"expert\", expert_no, mixtral_key, layer_weights[mixtral_key].shape)\n", + "\n", + " # gate\n", + " mixtral_key = f\"model.layers.{i}.block_sparse_moe.gate.weight\"\n", + " layer_weights[mixtral_key] = gate_tensor.clone()\n", + " total_size += get_weight_byte_size(layer_weights[mixtral_key])\n", + " print(\"layer\", i , \"gate\", mixtral_key, layer_weights[mixtral_key].shape)\n", + "\n", + " #フォーマットで0埋め\n", + " tensor_weight_file_name = f\"model.layers.{weight_file_no:05d}-of-{max_layer_no + 1:05d}.safetensors\"\n", + "\n", + " # save safetensor\n", + " save_file(layer_weights, save_mixtral_dir + \"/\" + tensor_weight_file_name, metadata={\"format\":\"pt\"})\n", + " print(\"Save layer weighs\", i, tensor_weight_file_name)\n", + "\n", + " for key in layer_weights.keys():\n", + " mixtral_weight_map[\"weight_map\"][key] = tensor_weight_file_name\n", + "\n", + " print(i, tensor_weight_file_name)\n", + "\n", + "# set total size\n", + "mixtral_weight_map[\"metadata\"][\"total_size\"] = total_size\n", + "\n", + "# save model.safetensors.index.json\n", + "with open(save_mixtral_dir + \"/model.safetensors.index.json\", \"w\") as f:\n", + " json.dump(mixtral_weight_map, f, indent=2)\n", + "\n", + "print(mixtral_weight_map)\n" + ], + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "id": "Lswg1ESdI7q9", + "outputId": "170e9c80-7856-4a64-b9d6-2701c650ed68" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "mixtral_num_experts 8 gate_shape[1] 2048 gate_tensor tensor([[0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n", + " [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n", + " [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n", + " ...,\n", + " [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n", + " [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n", + " [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000]])\n", + "first lm_head.weight torch.Size([32000, 2048])\n", + "first model.embed_tokens.weight torch.Size([32000, 2048])\n", + "first model.norm.weight torch.Size([2048])\n", + "layer 0 model.layers.0.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 0 expert 0 model.layers.0.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 0 expert 1 model.layers.0.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 0 expert 2 model.layers.0.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 0 expert 3 model.layers.0.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 0 expert 4 model.layers.0.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 0 expert 5 model.layers.0.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 0 expert 6 model.layers.0.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 0 expert 7 model.layers.0.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 0 expert 0 model.layers.0.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 0 expert 1 model.layers.0.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 0 expert 2 
model.layers.0.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 0 expert 3 model.layers.0.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 0 expert 4 model.layers.0.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 0 expert 5 model.layers.0.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 0 expert 6 model.layers.0.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 0 expert 7 model.layers.0.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 0 expert 0 model.layers.0.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 0 expert 1 model.layers.0.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 0 expert 2 model.layers.0.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 0 expert 3 model.layers.0.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 0 expert 4 model.layers.0.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 0 expert 5 model.layers.0.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 0 expert 6 model.layers.0.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 0 expert 7 model.layers.0.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 0 model.layers.0.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 0 model.layers.0.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 0 model.layers.0.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 0 model.layers.0.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 0 model.layers.0.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 0 gate model.layers.0.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 0 model.layers.00001-of-00022.safetensors\n", + "0 
model.layers.00001-of-00022.safetensors\n", + "layer 1 model.layers.1.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 1 expert 0 model.layers.1.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 1 expert 1 model.layers.1.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 1 expert 2 model.layers.1.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 1 expert 3 model.layers.1.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 1 expert 4 model.layers.1.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 1 expert 5 model.layers.1.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 1 expert 6 model.layers.1.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 1 expert 7 model.layers.1.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 1 expert 0 model.layers.1.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 1 expert 1 model.layers.1.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 1 expert 2 model.layers.1.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 1 expert 3 model.layers.1.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 1 expert 4 model.layers.1.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 1 expert 5 model.layers.1.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 1 expert 6 model.layers.1.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 1 expert 7 model.layers.1.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 1 expert 0 model.layers.1.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 1 expert 1 model.layers.1.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 
1 expert 2 model.layers.1.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 1 expert 3 model.layers.1.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 1 expert 4 model.layers.1.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 1 expert 5 model.layers.1.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 1 expert 6 model.layers.1.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 1 expert 7 model.layers.1.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 1 model.layers.1.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 1 model.layers.1.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 1 model.layers.1.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 1 model.layers.1.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 1 model.layers.1.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 1 gate model.layers.1.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 1 model.layers.00002-of-00022.safetensors\n", + "1 model.layers.00002-of-00022.safetensors\n", + "layer 2 model.layers.2.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 2 expert 0 model.layers.2.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 2 expert 1 model.layers.2.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 2 expert 2 model.layers.2.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 2 expert 3 model.layers.2.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 2 expert 4 model.layers.2.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 2 expert 5 model.layers.2.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 2 expert 6 model.layers.2.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + 
"layer 2 expert 7 model.layers.2.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 2 expert 0 model.layers.2.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 2 expert 1 model.layers.2.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 2 expert 2 model.layers.2.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 2 expert 3 model.layers.2.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 2 expert 4 model.layers.2.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 2 expert 5 model.layers.2.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 2 expert 6 model.layers.2.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 2 expert 7 model.layers.2.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 2 expert 0 model.layers.2.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 2 expert 1 model.layers.2.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 2 expert 2 model.layers.2.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 2 expert 3 model.layers.2.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 2 expert 4 model.layers.2.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 2 expert 5 model.layers.2.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 2 expert 6 model.layers.2.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 2 expert 7 model.layers.2.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 2 model.layers.2.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 2 model.layers.2.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 2 model.layers.2.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 
2 model.layers.2.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 2 model.layers.2.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 2 gate model.layers.2.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 2 model.layers.00003-of-00022.safetensors\n", + "2 model.layers.00003-of-00022.safetensors\n", + "layer 3 model.layers.3.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 3 expert 0 model.layers.3.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 3 expert 1 model.layers.3.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 3 expert 2 model.layers.3.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 3 expert 3 model.layers.3.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 3 expert 4 model.layers.3.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 3 expert 5 model.layers.3.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 3 expert 6 model.layers.3.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 3 expert 7 model.layers.3.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 3 expert 0 model.layers.3.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 3 expert 1 model.layers.3.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 3 expert 2 model.layers.3.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 3 expert 3 model.layers.3.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 3 expert 4 model.layers.3.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 3 expert 5 model.layers.3.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 3 expert 6 model.layers.3.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 3 expert 7 
model.layers.3.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 3 expert 0 model.layers.3.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 3 expert 1 model.layers.3.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 3 expert 2 model.layers.3.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 3 expert 3 model.layers.3.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 3 expert 4 model.layers.3.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 3 expert 5 model.layers.3.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 3 expert 6 model.layers.3.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 3 expert 7 model.layers.3.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 3 model.layers.3.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 3 model.layers.3.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 3 model.layers.3.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 3 model.layers.3.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 3 model.layers.3.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 3 gate model.layers.3.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 3 model.layers.00004-of-00022.safetensors\n", + "3 model.layers.00004-of-00022.safetensors\n", + "layer 4 model.layers.4.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 4 expert 0 model.layers.4.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 4 expert 1 model.layers.4.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 4 expert 2 model.layers.4.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 4 expert 3 model.layers.4.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", 
+ "layer 4 expert 4 model.layers.4.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 4 expert 5 model.layers.4.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 4 expert 6 model.layers.4.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 4 expert 7 model.layers.4.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 4 expert 0 model.layers.4.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 4 expert 1 model.layers.4.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 4 expert 2 model.layers.4.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 4 expert 3 model.layers.4.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 4 expert 4 model.layers.4.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 4 expert 5 model.layers.4.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 4 expert 6 model.layers.4.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 4 expert 7 model.layers.4.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 4 expert 0 model.layers.4.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 4 expert 1 model.layers.4.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 4 expert 2 model.layers.4.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 4 expert 3 model.layers.4.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 4 expert 4 model.layers.4.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 4 expert 5 model.layers.4.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 4 expert 6 model.layers.4.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 4 expert 7 
model.layers.4.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 4 model.layers.4.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 4 model.layers.4.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 4 model.layers.4.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 4 model.layers.4.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 4 model.layers.4.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 4 gate model.layers.4.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 4 model.layers.00005-of-00022.safetensors\n", + "4 model.layers.00005-of-00022.safetensors\n", + "layer 5 model.layers.5.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 5 expert 0 model.layers.5.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 5 expert 1 model.layers.5.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 5 expert 2 model.layers.5.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 5 expert 3 model.layers.5.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 5 expert 4 model.layers.5.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 5 expert 5 model.layers.5.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 5 expert 6 model.layers.5.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 5 expert 7 model.layers.5.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 5 expert 0 model.layers.5.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 5 expert 1 model.layers.5.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 5 expert 2 model.layers.5.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 5 expert 3 model.layers.5.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", 
+ "layer 5 expert 4 model.layers.5.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 5 expert 5 model.layers.5.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 5 expert 6 model.layers.5.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 5 expert 7 model.layers.5.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 5 expert 0 model.layers.5.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 5 expert 1 model.layers.5.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 5 expert 2 model.layers.5.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 5 expert 3 model.layers.5.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 5 expert 4 model.layers.5.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 5 expert 5 model.layers.5.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 5 expert 6 model.layers.5.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 5 expert 7 model.layers.5.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 5 model.layers.5.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 5 model.layers.5.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 5 model.layers.5.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 5 model.layers.5.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 5 model.layers.5.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 5 gate model.layers.5.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 5 model.layers.00006-of-00022.safetensors\n", + "5 model.layers.00006-of-00022.safetensors\n", + "layer 6 model.layers.6.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 6 expert 0 model.layers.6.block_sparse_moe.experts.0.w2.weight 
torch.Size([2048, 5632])\n", + "layer 6 expert 1 model.layers.6.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 6 expert 2 model.layers.6.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 6 expert 3 model.layers.6.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 6 expert 4 model.layers.6.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 6 expert 5 model.layers.6.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 6 expert 6 model.layers.6.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 6 expert 7 model.layers.6.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 6 expert 0 model.layers.6.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 6 expert 1 model.layers.6.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 6 expert 2 model.layers.6.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 6 expert 3 model.layers.6.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 6 expert 4 model.layers.6.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 6 expert 5 model.layers.6.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 6 expert 6 model.layers.6.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 6 expert 7 model.layers.6.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 6 expert 0 model.layers.6.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 6 expert 1 model.layers.6.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 6 expert 2 model.layers.6.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 6 expert 3 model.layers.6.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 6 
expert 4 model.layers.6.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 6 expert 5 model.layers.6.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 6 expert 6 model.layers.6.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 6 expert 7 model.layers.6.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 6 model.layers.6.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 6 model.layers.6.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 6 model.layers.6.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 6 model.layers.6.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 6 model.layers.6.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 6 gate model.layers.6.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 6 model.layers.00007-of-00022.safetensors\n", + "6 model.layers.00007-of-00022.safetensors\n", + "layer 7 model.layers.7.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 7 expert 0 model.layers.7.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 7 expert 1 model.layers.7.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 7 expert 2 model.layers.7.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 7 expert 3 model.layers.7.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 7 expert 4 model.layers.7.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 7 expert 5 model.layers.7.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 7 expert 6 model.layers.7.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 7 expert 7 model.layers.7.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 7 expert 0 model.layers.7.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 
2048])\n", + "layer 7 expert 1 model.layers.7.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 7 expert 2 model.layers.7.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 7 expert 3 model.layers.7.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 7 expert 4 model.layers.7.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 7 expert 5 model.layers.7.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 7 expert 6 model.layers.7.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 7 expert 7 model.layers.7.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 7 expert 0 model.layers.7.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 7 expert 1 model.layers.7.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 7 expert 2 model.layers.7.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 7 expert 3 model.layers.7.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 7 expert 4 model.layers.7.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 7 expert 5 model.layers.7.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 7 expert 6 model.layers.7.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 7 expert 7 model.layers.7.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 7 model.layers.7.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 7 model.layers.7.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 7 model.layers.7.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 7 model.layers.7.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 7 model.layers.7.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 7 gate model.layers.7.block_sparse_moe.gate.weight 
torch.Size([8, 2048])\n", + "Save layer weighs 7 model.layers.00008-of-00022.safetensors\n", + "7 model.layers.00008-of-00022.safetensors\n", + "layer 8 model.layers.8.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 8 expert 0 model.layers.8.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 8 expert 1 model.layers.8.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 8 expert 2 model.layers.8.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 8 expert 3 model.layers.8.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 8 expert 4 model.layers.8.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 8 expert 5 model.layers.8.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 8 expert 6 model.layers.8.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 8 expert 7 model.layers.8.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 8 expert 0 model.layers.8.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 8 expert 1 model.layers.8.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 8 expert 2 model.layers.8.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 8 expert 3 model.layers.8.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 8 expert 4 model.layers.8.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 8 expert 5 model.layers.8.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 8 expert 6 model.layers.8.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 8 expert 7 model.layers.8.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 8 expert 0 model.layers.8.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 8 
expert 1 model.layers.8.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 8 expert 2 model.layers.8.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 8 expert 3 model.layers.8.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 8 expert 4 model.layers.8.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 8 expert 5 model.layers.8.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 8 expert 6 model.layers.8.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 8 expert 7 model.layers.8.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 8 model.layers.8.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 8 model.layers.8.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 8 model.layers.8.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 8 model.layers.8.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 8 model.layers.8.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 8 gate model.layers.8.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 8 model.layers.00009-of-00022.safetensors\n", + "8 model.layers.00009-of-00022.safetensors\n", + "layer 9 model.layers.9.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 9 expert 0 model.layers.9.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 9 expert 1 model.layers.9.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 9 expert 2 model.layers.9.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 9 expert 3 model.layers.9.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 9 expert 4 model.layers.9.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 9 expert 5 model.layers.9.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 
9 expert 6 model.layers.9.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 9 expert 7 model.layers.9.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 9 expert 0 model.layers.9.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 9 expert 1 model.layers.9.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 9 expert 2 model.layers.9.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 9 expert 3 model.layers.9.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 9 expert 4 model.layers.9.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 9 expert 5 model.layers.9.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 9 expert 6 model.layers.9.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 9 expert 7 model.layers.9.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 9 expert 0 model.layers.9.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 9 expert 1 model.layers.9.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 9 expert 2 model.layers.9.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 9 expert 3 model.layers.9.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 9 expert 4 model.layers.9.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 9 expert 5 model.layers.9.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 9 expert 6 model.layers.9.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 9 expert 7 model.layers.9.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 9 model.layers.9.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 9 model.layers.9.self_attn.k_proj.weight torch.Size([256, 
2048])\n", + "layer 9 model.layers.9.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 9 model.layers.9.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 9 model.layers.9.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 9 gate model.layers.9.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 9 model.layers.00010-of-00022.safetensors\n", + "9 model.layers.00010-of-00022.safetensors\n", + "layer 10 model.layers.10.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 10 expert 0 model.layers.10.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 10 expert 1 model.layers.10.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 10 expert 2 model.layers.10.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 10 expert 3 model.layers.10.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 10 expert 4 model.layers.10.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 10 expert 5 model.layers.10.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 10 expert 6 model.layers.10.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 10 expert 7 model.layers.10.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 10 expert 0 model.layers.10.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 10 expert 1 model.layers.10.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 10 expert 2 model.layers.10.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 10 expert 3 model.layers.10.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 10 expert 4 model.layers.10.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 10 expert 5 model.layers.10.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 
2048])\n", + "layer 10 expert 6 model.layers.10.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 10 expert 7 model.layers.10.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 10 expert 0 model.layers.10.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 10 expert 1 model.layers.10.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 10 expert 2 model.layers.10.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 10 expert 3 model.layers.10.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 10 expert 4 model.layers.10.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 10 expert 5 model.layers.10.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 10 expert 6 model.layers.10.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 10 expert 7 model.layers.10.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 10 model.layers.10.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 10 model.layers.10.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 10 model.layers.10.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 10 model.layers.10.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 10 model.layers.10.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 10 gate model.layers.10.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 10 model.layers.00011-of-00022.safetensors\n", + "10 model.layers.00011-of-00022.safetensors\n", + "layer 11 model.layers.11.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 11 expert 0 model.layers.11.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 11 expert 1 model.layers.11.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 11 expert 2 
model.layers.11.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 11 expert 3 model.layers.11.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 11 expert 4 model.layers.11.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 11 expert 5 model.layers.11.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 11 expert 6 model.layers.11.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 11 expert 7 model.layers.11.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 11 expert 0 model.layers.11.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 11 expert 1 model.layers.11.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 11 expert 2 model.layers.11.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 11 expert 3 model.layers.11.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 11 expert 4 model.layers.11.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 11 expert 5 model.layers.11.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 11 expert 6 model.layers.11.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 11 expert 7 model.layers.11.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 11 expert 0 model.layers.11.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 11 expert 1 model.layers.11.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 11 expert 2 model.layers.11.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 11 expert 3 model.layers.11.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 11 expert 4 model.layers.11.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 11 expert 5 
model.layers.11.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 11 expert 6 model.layers.11.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 11 expert 7 model.layers.11.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 11 model.layers.11.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 11 model.layers.11.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 11 model.layers.11.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 11 model.layers.11.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 11 model.layers.11.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 11 gate model.layers.11.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 11 model.layers.00012-of-00022.safetensors\n", + "11 model.layers.00012-of-00022.safetensors\n", + "layer 12 model.layers.12.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 12 expert 0 model.layers.12.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 12 expert 1 model.layers.12.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 12 expert 2 model.layers.12.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 12 expert 3 model.layers.12.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 12 expert 4 model.layers.12.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 12 expert 5 model.layers.12.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 12 expert 6 model.layers.12.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 12 expert 7 model.layers.12.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 12 expert 0 model.layers.12.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 12 expert 1 
model.layers.12.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 12 expert 2 model.layers.12.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 12 expert 3 model.layers.12.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 12 expert 4 model.layers.12.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 12 expert 5 model.layers.12.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 12 expert 6 model.layers.12.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 12 expert 7 model.layers.12.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 12 expert 0 model.layers.12.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 12 expert 1 model.layers.12.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 12 expert 2 model.layers.12.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 12 expert 3 model.layers.12.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 12 expert 4 model.layers.12.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 12 expert 5 model.layers.12.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 12 expert 6 model.layers.12.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 12 expert 7 model.layers.12.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 12 model.layers.12.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 12 model.layers.12.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 12 model.layers.12.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 12 model.layers.12.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 12 model.layers.12.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 12 gate 
model.layers.12.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 12 model.layers.00013-of-00022.safetensors\n", + "12 model.layers.00013-of-00022.safetensors\n", + "layer 13 model.layers.13.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 13 expert 0 model.layers.13.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 13 expert 1 model.layers.13.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 13 expert 2 model.layers.13.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 13 expert 3 model.layers.13.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 13 expert 4 model.layers.13.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 13 expert 5 model.layers.13.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 13 expert 6 model.layers.13.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 13 expert 7 model.layers.13.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 13 expert 0 model.layers.13.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 13 expert 1 model.layers.13.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 13 expert 2 model.layers.13.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 13 expert 3 model.layers.13.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 13 expert 4 model.layers.13.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 13 expert 5 model.layers.13.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 13 expert 6 model.layers.13.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 13 expert 7 model.layers.13.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 13 expert 0 
model.layers.13.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 13 expert 1 model.layers.13.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 13 expert 2 model.layers.13.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 13 expert 3 model.layers.13.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 13 expert 4 model.layers.13.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 13 expert 5 model.layers.13.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 13 expert 6 model.layers.13.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 13 expert 7 model.layers.13.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 13 model.layers.13.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 13 model.layers.13.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 13 model.layers.13.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 13 model.layers.13.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 13 model.layers.13.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 13 gate model.layers.13.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 13 model.layers.00014-of-00022.safetensors\n", + "13 model.layers.00014-of-00022.safetensors\n", + "layer 14 model.layers.14.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 14 expert 0 model.layers.14.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 14 expert 1 model.layers.14.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 14 expert 2 model.layers.14.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 14 expert 3 model.layers.14.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 14 expert 4 model.layers.14.block_sparse_moe.experts.4.w2.weight 
torch.Size([2048, 5632])\n", + "layer 14 expert 5 model.layers.14.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 14 expert 6 model.layers.14.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 14 expert 7 model.layers.14.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 14 expert 0 model.layers.14.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 14 expert 1 model.layers.14.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 14 expert 2 model.layers.14.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 14 expert 3 model.layers.14.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 14 expert 4 model.layers.14.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 14 expert 5 model.layers.14.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 14 expert 6 model.layers.14.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 14 expert 7 model.layers.14.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 14 expert 0 model.layers.14.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 14 expert 1 model.layers.14.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 14 expert 2 model.layers.14.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 14 expert 3 model.layers.14.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 14 expert 4 model.layers.14.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 14 expert 5 model.layers.14.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 14 expert 6 model.layers.14.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 14 expert 7 model.layers.14.block_sparse_moe.experts.7.w3.weight 
torch.Size([5632, 2048])\n", + "layer 14 model.layers.14.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 14 model.layers.14.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 14 model.layers.14.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 14 model.layers.14.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 14 model.layers.14.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 14 gate model.layers.14.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 14 model.layers.00015-of-00022.safetensors\n", + "14 model.layers.00015-of-00022.safetensors\n", + "layer 15 model.layers.15.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 15 expert 0 model.layers.15.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 15 expert 1 model.layers.15.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 15 expert 2 model.layers.15.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 15 expert 3 model.layers.15.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 15 expert 4 model.layers.15.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 15 expert 5 model.layers.15.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 15 expert 6 model.layers.15.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 15 expert 7 model.layers.15.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 15 expert 0 model.layers.15.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 15 expert 1 model.layers.15.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 15 expert 2 model.layers.15.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 15 expert 3 model.layers.15.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 15 
expert 4 model.layers.15.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 15 expert 5 model.layers.15.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 15 expert 6 model.layers.15.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 15 expert 7 model.layers.15.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 15 expert 0 model.layers.15.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 15 expert 1 model.layers.15.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 15 expert 2 model.layers.15.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 15 expert 3 model.layers.15.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 15 expert 4 model.layers.15.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 15 expert 5 model.layers.15.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 15 expert 6 model.layers.15.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 15 expert 7 model.layers.15.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 15 model.layers.15.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 15 model.layers.15.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 15 model.layers.15.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 15 model.layers.15.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 15 model.layers.15.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 15 gate model.layers.15.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 15 model.layers.00016-of-00022.safetensors\n", + "15 model.layers.00016-of-00022.safetensors\n", + "layer 16 model.layers.16.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 16 expert 0 
model.layers.16.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 16 expert 1 model.layers.16.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 16 expert 2 model.layers.16.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 16 expert 3 model.layers.16.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 16 expert 4 model.layers.16.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 16 expert 5 model.layers.16.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 16 expert 6 model.layers.16.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 16 expert 7 model.layers.16.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 16 expert 0 model.layers.16.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 16 expert 1 model.layers.16.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 16 expert 2 model.layers.16.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 16 expert 3 model.layers.16.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 16 expert 4 model.layers.16.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 16 expert 5 model.layers.16.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 16 expert 6 model.layers.16.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 16 expert 7 model.layers.16.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 16 expert 0 model.layers.16.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 16 expert 1 model.layers.16.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 16 expert 2 model.layers.16.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 16 expert 3 
model.layers.16.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 16 expert 4 model.layers.16.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 16 expert 5 model.layers.16.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 16 expert 6 model.layers.16.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 16 expert 7 model.layers.16.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 16 model.layers.16.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 16 model.layers.16.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 16 model.layers.16.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 16 model.layers.16.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 16 model.layers.16.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 16 gate model.layers.16.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 16 model.layers.00017-of-00022.safetensors\n", + "16 model.layers.00017-of-00022.safetensors\n", + "layer 17 model.layers.17.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 17 expert 0 model.layers.17.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 17 expert 1 model.layers.17.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 17 expert 2 model.layers.17.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 17 expert 3 model.layers.17.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 17 expert 4 model.layers.17.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 17 expert 5 model.layers.17.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 17 expert 6 model.layers.17.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 17 expert 7 model.layers.17.block_sparse_moe.experts.7.w2.weight 
torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 17 expert 0 model.layers.17.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 17 expert 1 model.layers.17.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 17 expert 2 model.layers.17.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 17 expert 3 model.layers.17.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 17 expert 4 model.layers.17.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 17 expert 5 model.layers.17.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 17 expert 6 model.layers.17.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 17 expert 7 model.layers.17.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 17 expert 0 model.layers.17.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 17 expert 1 model.layers.17.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 17 expert 2 model.layers.17.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 17 expert 3 model.layers.17.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 17 expert 4 model.layers.17.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 17 expert 5 model.layers.17.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 17 expert 6 model.layers.17.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 17 expert 7 model.layers.17.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 17 model.layers.17.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 17 model.layers.17.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 17 model.layers.17.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 17 
model.layers.17.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 17 model.layers.17.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 17 gate model.layers.17.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 17 model.layers.00018-of-00022.safetensors\n", + "17 model.layers.00018-of-00022.safetensors\n", + "layer 18 model.layers.18.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 18 expert 0 model.layers.18.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 18 expert 1 model.layers.18.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 18 expert 2 model.layers.18.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 18 expert 3 model.layers.18.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 18 expert 4 model.layers.18.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 18 expert 5 model.layers.18.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 18 expert 6 model.layers.18.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 18 expert 7 model.layers.18.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 18 expert 0 model.layers.18.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 18 expert 1 model.layers.18.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 18 expert 2 model.layers.18.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 18 expert 3 model.layers.18.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 18 expert 4 model.layers.18.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 18 expert 5 model.layers.18.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 18 expert 6 model.layers.18.block_sparse_moe.experts.6.w1.weight 
torch.Size([5632, 2048])\n", + "layer 18 expert 7 model.layers.18.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 18 expert 0 model.layers.18.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 18 expert 1 model.layers.18.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 18 expert 2 model.layers.18.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 18 expert 3 model.layers.18.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 18 expert 4 model.layers.18.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 18 expert 5 model.layers.18.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 18 expert 6 model.layers.18.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 18 expert 7 model.layers.18.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 18 model.layers.18.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 18 model.layers.18.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 18 model.layers.18.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 18 model.layers.18.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 18 model.layers.18.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 18 gate model.layers.18.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 18 model.layers.00019-of-00022.safetensors\n", + "18 model.layers.00019-of-00022.safetensors\n", + "layer 19 model.layers.19.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 19 expert 0 model.layers.19.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 19 expert 1 model.layers.19.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 19 expert 2 model.layers.19.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 19 
expert 3 model.layers.19.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 19 expert 4 model.layers.19.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 19 expert 5 model.layers.19.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 19 expert 6 model.layers.19.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 19 expert 7 model.layers.19.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 19 expert 0 model.layers.19.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 19 expert 1 model.layers.19.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 19 expert 2 model.layers.19.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 19 expert 3 model.layers.19.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 19 expert 4 model.layers.19.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 19 expert 5 model.layers.19.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 19 expert 6 model.layers.19.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 19 expert 7 model.layers.19.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 19 expert 0 model.layers.19.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 19 expert 1 model.layers.19.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 19 expert 2 model.layers.19.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 19 expert 3 model.layers.19.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 19 expert 4 model.layers.19.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 19 expert 5 model.layers.19.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 19 
expert 6 model.layers.19.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 19 expert 7 model.layers.19.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 19 model.layers.19.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 19 model.layers.19.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 19 model.layers.19.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 19 model.layers.19.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 19 model.layers.19.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 19 gate model.layers.19.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 19 model.layers.00020-of-00022.safetensors\n", + "19 model.layers.00020-of-00022.safetensors\n", + "layer 20 model.layers.20.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 20 expert 0 model.layers.20.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 20 expert 1 model.layers.20.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 20 expert 2 model.layers.20.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 20 expert 3 model.layers.20.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 20 expert 4 model.layers.20.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 20 expert 5 model.layers.20.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 20 expert 6 model.layers.20.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 20 expert 7 model.layers.20.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 20 expert 0 model.layers.20.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 20 expert 1 model.layers.20.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 20 expert 2 
model.layers.20.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 20 expert 3 model.layers.20.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 20 expert 4 model.layers.20.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 20 expert 5 model.layers.20.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 20 expert 6 model.layers.20.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 20 expert 7 model.layers.20.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 20 expert 0 model.layers.20.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 20 expert 1 model.layers.20.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 20 expert 2 model.layers.20.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 20 expert 3 model.layers.20.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 20 expert 4 model.layers.20.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 20 expert 5 model.layers.20.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 20 expert 6 model.layers.20.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 20 expert 7 model.layers.20.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 20 model.layers.20.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 20 model.layers.20.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 20 model.layers.20.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 20 model.layers.20.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 20 model.layers.20.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 20 gate model.layers.20.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 20 
model.layers.00021-of-00022.safetensors\n", + "20 model.layers.00021-of-00022.safetensors\n", + "layer 21 model.layers.21.input_layernorm.weight torch.Size([2048])\n", + "gen experts\n", + "layer 21 expert 0 model.layers.21.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n", + "layer 21 expert 1 model.layers.21.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n", + "layer 21 expert 2 model.layers.21.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n", + "layer 21 expert 3 model.layers.21.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n", + "layer 21 expert 4 model.layers.21.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n", + "layer 21 expert 5 model.layers.21.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n", + "layer 21 expert 6 model.layers.21.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n", + "layer 21 expert 7 model.layers.21.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n", + "gen experts\n", + "layer 21 expert 0 model.layers.21.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n", + "layer 21 expert 1 model.layers.21.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n", + "layer 21 expert 2 model.layers.21.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n", + "layer 21 expert 3 model.layers.21.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n", + "layer 21 expert 4 model.layers.21.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n", + "layer 21 expert 5 model.layers.21.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n", + "layer 21 expert 6 model.layers.21.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n", + "layer 21 expert 7 model.layers.21.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n", + "gen experts\n", + "layer 21 expert 0 model.layers.21.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n", + "layer 21 expert 1 
model.layers.21.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n", + "layer 21 expert 2 model.layers.21.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n", + "layer 21 expert 3 model.layers.21.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n", + "layer 21 expert 4 model.layers.21.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n", + "layer 21 expert 5 model.layers.21.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n", + "layer 21 expert 6 model.layers.21.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n", + "layer 21 expert 7 model.layers.21.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n", + "layer 21 model.layers.21.post_attention_layernorm.weight torch.Size([2048])\n", + "layer 21 model.layers.21.self_attn.k_proj.weight torch.Size([256, 2048])\n", + "layer 21 model.layers.21.self_attn.o_proj.weight torch.Size([2048, 2048])\n", + "layer 21 model.layers.21.self_attn.q_proj.weight torch.Size([2048, 2048])\n", + "layer 21 model.layers.21.self_attn.v_proj.weight torch.Size([256, 2048])\n", + "layer 21 gate model.layers.21.block_sparse_moe.gate.weight torch.Size([8, 2048])\n", + "Save layer weighs 21 model.layers.00022-of-00022.safetensors\n", + "21 model.layers.00022-of-00022.safetensors\n", + "{'metadata': {'total_size': 12859265024}, 'weight_map': {'lm_head.weight': 'model.layers.00001-of-00022.safetensors', 'model.embed_tokens.weight': 'model.layers.00001-of-00022.safetensors', 'model.norm.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.input_layernorm.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.0.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.1.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.2.w2.weight': 'model.layers.00001-of-00022.safetensors', 
'model.layers.0.block_sparse_moe.experts.3.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.4.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.5.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.6.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.7.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.0.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.1.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.2.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.3.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.4.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.5.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.6.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.7.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.0.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.1.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.2.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.3.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.4.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.5.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.6.w3.weight': 'model.layers.00001-of-00022.safetensors', 
'model.layers.0.block_sparse_moe.experts.7.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.post_attention_layernorm.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.k_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.o_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.q_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.v_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.gate.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.1.input_layernorm.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.0.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.1.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.2.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.3.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.4.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.5.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.6.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.7.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.0.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.1.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.2.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.3.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.4.w1.weight': 'model.layers.00002-of-00022.safetensors', 
'model.layers.1.block_sparse_moe.experts.5.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.6.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.7.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.0.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.1.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.2.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.3.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.4.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.5.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.6.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.7.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.post_attention_layernorm.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.self_attn.k_proj.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.self_attn.o_proj.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.self_attn.q_proj.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.self_attn.v_proj.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.gate.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.2.input_layernorm.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.0.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.1.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.2.w2.weight': 'model.layers.00003-of-00022.safetensors', 
'model.layers.2.block_sparse_moe.experts.3.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.4.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.5.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.6.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.7.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.0.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.1.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.2.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.3.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.4.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.5.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.6.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.7.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.0.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.1.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.2.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.3.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.4.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.5.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.6.w3.weight': 'model.layers.00003-of-00022.safetensors', 
'model.layers.2.block_sparse_moe.experts.7.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.post_attention_layernorm.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.k_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.o_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.q_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.v_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.gate.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.3.input_layernorm.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.0.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.1.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.2.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.3.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.4.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.5.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.6.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.7.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.0.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.1.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.2.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.3.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.4.w1.weight': 'model.layers.00004-of-00022.safetensors', 
'model.layers.3.block_sparse_moe.experts.5.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.6.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.7.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.0.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.1.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.2.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.3.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.4.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.5.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.6.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.7.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.post_attention_layernorm.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.self_attn.k_proj.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.self_attn.o_proj.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.self_attn.q_proj.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.self_attn.v_proj.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.gate.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.4.input_layernorm.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.0.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.1.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.2.w2.weight': 'model.layers.00005-of-00022.safetensors', 
'model.layers.4.block_sparse_moe.experts.3.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.4.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.5.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.6.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.7.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.0.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.1.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.2.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.3.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.4.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.5.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.6.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.7.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.0.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.1.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.2.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.3.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.4.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.5.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.6.w3.weight': 'model.layers.00005-of-00022.safetensors', 
'model.layers.4.block_sparse_moe.experts.7.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.post_attention_layernorm.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.k_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.o_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.q_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.v_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.gate.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.5.input_layernorm.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.0.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.1.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.2.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.3.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.4.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.5.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.6.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.7.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.0.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.1.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.2.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.3.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.4.w1.weight': 'model.layers.00006-of-00022.safetensors', 
'model.layers.5.block_sparse_moe.experts.5.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.6.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.7.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.0.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.1.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.2.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.3.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.4.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.5.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.6.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.7.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.post_attention_layernorm.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.self_attn.k_proj.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.self_attn.o_proj.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.self_attn.q_proj.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.self_attn.v_proj.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.gate.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.6.input_layernorm.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.0.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.1.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.2.w2.weight': 'model.layers.00007-of-00022.safetensors', 
'model.layers.6.block_sparse_moe.experts.3.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.4.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.5.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.6.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.7.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.0.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.1.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.2.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.3.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.4.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.5.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.6.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.7.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.0.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.1.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.2.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.3.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.4.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.5.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.6.w3.weight': 'model.layers.00007-of-00022.safetensors', 
'model.layers.6.block_sparse_moe.experts.7.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.post_attention_layernorm.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.k_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.o_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.q_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.v_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.gate.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.7.input_layernorm.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.0.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.1.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.2.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.3.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.4.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.5.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.6.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.7.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.0.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.1.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.2.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.3.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.4.w1.weight': 'model.layers.00008-of-00022.safetensors', 
'model.layers.7.block_sparse_moe.experts.5.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.6.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.7.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.0.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.1.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.2.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.3.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.4.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.5.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.6.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.7.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.post_attention_layernorm.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.self_attn.k_proj.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.self_attn.o_proj.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.self_attn.q_proj.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.self_attn.v_proj.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.gate.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.8.input_layernorm.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.0.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.1.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.2.w2.weight': 'model.layers.00009-of-00022.safetensors', 
'model.layers.8.block_sparse_moe.experts.3.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.4.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.5.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.6.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.7.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.0.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.1.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.2.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.3.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.4.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.5.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.6.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.7.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.0.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.1.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.2.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.3.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.4.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.5.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.6.w3.weight': 'model.layers.00009-of-00022.safetensors', 
'model.layers.8.block_sparse_moe.experts.7.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.post_attention_layernorm.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.k_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.o_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.q_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.v_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.gate.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.9.input_layernorm.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.0.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.1.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.2.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.3.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.4.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.5.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.6.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.7.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.0.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.1.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.2.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.3.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.4.w1.weight': 'model.layers.00010-of-00022.safetensors', 
'model.layers.9.block_sparse_moe.experts.5.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.6.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.7.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.0.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.1.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.2.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.3.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.4.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.5.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.6.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.7.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.post_attention_layernorm.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.self_attn.k_proj.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.self_attn.o_proj.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.self_attn.q_proj.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.self_attn.v_proj.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.gate.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.10.input_layernorm.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.0.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.1.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.2.w2.weight': 'model.layers.00011-of-00022.safetensors', 
'model.layers.10.block_sparse_moe.experts.3.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.4.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.5.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.6.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.7.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.0.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.1.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.2.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.3.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.4.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.5.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.6.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.7.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.0.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.1.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.2.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.3.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.4.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.5.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.6.w3.weight': 'model.layers.00011-of-00022.safetensors', 
'model.layers.10.block_sparse_moe.experts.7.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.post_attention_layernorm.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.k_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.o_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.q_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.v_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.gate.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.11.input_layernorm.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.0.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.1.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.2.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.3.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.4.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.5.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.6.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.7.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.0.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.1.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.2.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.3.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.4.w1.weight': 'model.layers.00012-of-00022.safetensors', 
'model.layers.11.block_sparse_moe.experts.5.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.6.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.7.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.0.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.1.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.2.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.3.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.4.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.5.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.6.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.7.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.post_attention_layernorm.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.self_attn.k_proj.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.self_attn.o_proj.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.self_attn.q_proj.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.self_attn.v_proj.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.gate.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.12.input_layernorm.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.0.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.1.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.2.w2.weight': 'model.layers.00013-of-00022.safetensors', 
'model.layers.12.block_sparse_moe.experts.3.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.4.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.5.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.6.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.7.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.0.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.1.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.2.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.3.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.4.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.5.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.6.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.7.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.0.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.1.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.2.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.3.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.4.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.5.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.6.w3.weight': 'model.layers.00013-of-00022.safetensors', 
'model.layers.12.block_sparse_moe.experts.7.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.post_attention_layernorm.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.k_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.o_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.q_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.v_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.gate.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.13.input_layernorm.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.0.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.1.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.2.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.3.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.4.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.5.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.6.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.7.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.0.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.1.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.2.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.3.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.4.w1.weight': 'model.layers.00014-of-00022.safetensors', 
'model.layers.13.block_sparse_moe.experts.5.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.6.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.7.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.0.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.1.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.2.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.3.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.4.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.5.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.6.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.7.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.post_attention_layernorm.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.self_attn.k_proj.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.self_attn.o_proj.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.self_attn.q_proj.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.self_attn.v_proj.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.gate.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.14.input_layernorm.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.0.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.1.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.2.w2.weight': 'model.layers.00015-of-00022.safetensors', 
'model.layers.14.block_sparse_moe.experts.3.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.4.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.5.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.6.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.7.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.0.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.1.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.2.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.3.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.4.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.5.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.6.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.7.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.0.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.1.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.2.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.3.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.4.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.5.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.6.w3.weight': 'model.layers.00015-of-00022.safetensors', 
'model.layers.14.block_sparse_moe.experts.7.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.post_attention_layernorm.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.k_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.o_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.q_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.v_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.gate.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.15.input_layernorm.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.0.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.1.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.2.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.3.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.4.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.5.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.6.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.7.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.0.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.1.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.2.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.3.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.4.w1.weight': 'model.layers.00016-of-00022.safetensors', 
'model.layers.15.block_sparse_moe.experts.5.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.6.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.7.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.0.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.1.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.2.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.3.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.4.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.5.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.6.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.7.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.post_attention_layernorm.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.self_attn.k_proj.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.self_attn.o_proj.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.self_attn.q_proj.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.self_attn.v_proj.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.gate.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.16.input_layernorm.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.0.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.1.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.2.w2.weight': 'model.layers.00017-of-00022.safetensors', 
'model.layers.16.block_sparse_moe.experts.3.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.4.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.5.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.6.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.7.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.0.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.1.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.2.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.3.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.4.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.5.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.6.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.7.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.0.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.1.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.2.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.3.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.4.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.5.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.6.w3.weight': 'model.layers.00017-of-00022.safetensors', 
'model.layers.16.block_sparse_moe.experts.7.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.post_attention_layernorm.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.k_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.o_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.q_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.v_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.gate.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.17.input_layernorm.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.0.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.1.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.2.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.3.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.4.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.5.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.6.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.7.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.0.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.1.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.2.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.3.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.4.w1.weight': 'model.layers.00018-of-00022.safetensors', 
'model.layers.17.block_sparse_moe.experts.5.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.6.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.7.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.0.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.1.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.2.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.3.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.4.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.5.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.6.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.7.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.post_attention_layernorm.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.self_attn.k_proj.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.self_attn.o_proj.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.self_attn.q_proj.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.self_attn.v_proj.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.gate.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.18.input_layernorm.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.0.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.1.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.2.w2.weight': 'model.layers.00019-of-00022.safetensors', 
'model.layers.18.block_sparse_moe.experts.3.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.4.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.5.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.6.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.7.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.0.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.1.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.2.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.3.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.4.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.5.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.6.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.7.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.0.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.1.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.2.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.3.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.4.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.5.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.6.w3.weight': 'model.layers.00019-of-00022.safetensors', 
'model.layers.18.block_sparse_moe.experts.7.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.post_attention_layernorm.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.k_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.o_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.q_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.v_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.gate.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.19.input_layernorm.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.0.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.1.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.2.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.3.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.4.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.5.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.6.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.7.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.0.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.1.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.2.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.3.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.4.w1.weight': 'model.layers.00020-of-00022.safetensors', 
'model.layers.19.block_sparse_moe.experts.5.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.6.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.7.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.0.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.1.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.2.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.3.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.4.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.5.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.6.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.7.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.post_attention_layernorm.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.self_attn.k_proj.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.self_attn.o_proj.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.self_attn.q_proj.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.self_attn.v_proj.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.gate.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.20.input_layernorm.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.0.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.1.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.2.w2.weight': 'model.layers.00021-of-00022.safetensors', 
'model.layers.20.block_sparse_moe.experts.3.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.4.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.5.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.6.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.7.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.0.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.1.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.2.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.3.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.4.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.5.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.6.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.7.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.0.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.1.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.2.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.3.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.4.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.5.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.6.w3.weight': 'model.layers.00021-of-00022.safetensors', 
'model.layers.20.block_sparse_moe.experts.7.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.post_attention_layernorm.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.k_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.o_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.q_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.v_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.gate.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.21.input_layernorm.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.0.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.1.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.2.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.3.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.4.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.5.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.6.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.7.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.0.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.1.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.2.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.3.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.4.w1.weight': 'model.layers.00022-of-00022.safetensors', 
'model.layers.21.block_sparse_moe.experts.5.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.6.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.7.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.0.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.1.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.2.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.3.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.4.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.5.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.6.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.7.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.post_attention_layernorm.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.self_attn.k_proj.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.self_attn.o_proj.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.self_attn.q_proj.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.self_attn.v_proj.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.gate.weight': 'model.layers.00022-of-00022.safetensors'}}\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# check model\n", + "mx_model, mx_tok = test_gen(save_mixtral_dir)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "ae1fb4b51ee2457998f8066635edcc14", + "347c9d512b454f4781f0dbbfda6c3724", + "a458afb6c9b94046bc552a50ada0d867", + 
"f9062f94c8ee41febe6717fcbcb5053f", + "a79cec942e034086927cfda8e9d8afca", + "f30a519a167249b285fa7cf44e5565bc", + "dba40b83726b4fdaa5d5c8cb1c4d3c3b", + "65348e69ef7440038f107f5e8efa3b7e", + "ddde5ed5da554643aaea09e0bb0b797a", + "c138ccb796ff4c1a81ccce900bdec068", + "558ca035522c405fba9c7a3f9e8eb135" + ] + }, + "id": "DCs_uVxCvCwR", + "outputId": "73663da6-0141-48fa-936b-82598865d27a" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/22 [00:00 [INST] What is your favourite condiment? [/INST] Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen! [INST] Do you have mayonnaise recipes? [/INST] (https://diet-d.com/recipes/mayonnaise-recipe-recipes-for-chicken-or-fish-on-easy-mayonaise-veggie-eggs-dont-use-sweetness.html) are you thinking of substituting these with a recipe that calls for mayonnaise, though?\n", + "cheesecake recipes with a mayonnaise instead of the oil? For how many pounds? May I suggest you substitute the mayonnaise with a cream? (and olive oil instead of soybean oil?\n", + "------------------------\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# from google.colab import userdata\n", + "# !huggingface-cli login --token {userdata.get('HUGGINGFACE_ACCESS_TOKEN')}" + ], + "metadata": { + "id": "X1jZZ3ggwX9x" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# huggingface_repo = \"TinyMixtral-x8-Clonebase-7b\"\n", + "# mx_model.push_to_hub(huggingface_repo, private=True)\n", + "# mx_tok.push_to_hub(huggingface_repo, private=True)" + ], + "metadata": { + "id": "asht1d6Fws_P" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file