diff --git "a/notebook/tinyllama_to_mixtral_clonebase.ipynb" "b/notebook/tinyllama_to_mixtral_clonebase.ipynb"
new file mode 100644
--- /dev/null
+++ "b/notebook/tinyllama_to_mixtral_clonebase.ipynb"
@@ -0,0 +1,1839 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "machine_shape": "hm"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "ae1fb4b51ee2457998f8066635edcc14": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HBoxModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HBoxModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HBoxView",
+ "box_style": "",
+ "children": [
+ "IPY_MODEL_347c9d512b454f4781f0dbbfda6c3724",
+ "IPY_MODEL_a458afb6c9b94046bc552a50ada0d867",
+ "IPY_MODEL_f9062f94c8ee41febe6717fcbcb5053f"
+ ],
+ "layout": "IPY_MODEL_a79cec942e034086927cfda8e9d8afca"
+ }
+ },
+ "347c9d512b454f4781f0dbbfda6c3724": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HTMLModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_f30a519a167249b285fa7cf44e5565bc",
+ "placeholder": "",
+ "style": "IPY_MODEL_dba40b83726b4fdaa5d5c8cb1c4d3c3b",
+ "value": "Loading checkpoint shards: 100%"
+ }
+ },
+ "a458afb6c9b94046bc552a50ada0d867": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "FloatProgressModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "FloatProgressModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "ProgressView",
+ "bar_style": "success",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_65348e69ef7440038f107f5e8efa3b7e",
+ "max": 22,
+ "min": 0,
+ "orientation": "horizontal",
+ "style": "IPY_MODEL_ddde5ed5da554643aaea09e0bb0b797a",
+ "value": 22
+ }
+ },
+ "f9062f94c8ee41febe6717fcbcb5053f": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "HTMLModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_dom_classes": [],
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "HTMLModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/controls",
+ "_view_module_version": "1.5.0",
+ "_view_name": "HTMLView",
+ "description": "",
+ "description_tooltip": null,
+ "layout": "IPY_MODEL_c138ccb796ff4c1a81ccce900bdec068",
+ "placeholder": "",
+ "style": "IPY_MODEL_558ca035522c405fba9c7a3f9e8eb135",
+ "value": " 22/22 [00:12<00:00, 1.64it/s]"
+ }
+ },
+ "a79cec942e034086927cfda8e9d8afca": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "f30a519a167249b285fa7cf44e5565bc": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "dba40b83726b4fdaa5d5c8cb1c4d3c3b": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "DescriptionStyleModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ },
+ "65348e69ef7440038f107f5e8efa3b7e": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "ddde5ed5da554643aaea09e0bb0b797a": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "ProgressStyleModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "ProgressStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "bar_color": null,
+ "description_width": ""
+ }
+ },
+ "c138ccb796ff4c1a81ccce900bdec068": {
+ "model_module": "@jupyter-widgets/base",
+ "model_name": "LayoutModel",
+ "model_module_version": "1.2.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/base",
+ "_model_module_version": "1.2.0",
+ "_model_name": "LayoutModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "LayoutView",
+ "align_content": null,
+ "align_items": null,
+ "align_self": null,
+ "border": null,
+ "bottom": null,
+ "display": null,
+ "flex": null,
+ "flex_flow": null,
+ "grid_area": null,
+ "grid_auto_columns": null,
+ "grid_auto_flow": null,
+ "grid_auto_rows": null,
+ "grid_column": null,
+ "grid_gap": null,
+ "grid_row": null,
+ "grid_template_areas": null,
+ "grid_template_columns": null,
+ "grid_template_rows": null,
+ "height": null,
+ "justify_content": null,
+ "justify_items": null,
+ "left": null,
+ "margin": null,
+ "max_height": null,
+ "max_width": null,
+ "min_height": null,
+ "min_width": null,
+ "object_fit": null,
+ "object_position": null,
+ "order": null,
+ "overflow": null,
+ "overflow_x": null,
+ "overflow_y": null,
+ "padding": null,
+ "right": null,
+ "top": null,
+ "visibility": null,
+ "width": null
+ }
+ },
+ "558ca035522c405fba9c7a3f9e8eb135": {
+ "model_module": "@jupyter-widgets/controls",
+ "model_name": "DescriptionStyleModel",
+ "model_module_version": "1.5.0",
+ "state": {
+ "_model_module": "@jupyter-widgets/controls",
+ "_model_module_version": "1.5.0",
+ "_model_name": "DescriptionStyleModel",
+ "_view_count": null,
+ "_view_module": "@jupyter-widgets/base",
+ "_view_module_version": "1.2.0",
+ "_view_name": "StyleView",
+ "description_width": ""
+ }
+ }
+ }
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "source": [
+ "model_name_or_path = \"TinyLlama/TinyLlama-1.1B-intermediate-step-1195k-token-2.5T\" #@param {type:\"string\"}\n",
+ "model_name = model_name_or_path.split(\"/\")[-1]\n",
+ "\n",
+ "save_mistral_dir = \"/content/tiny_mistral\" #@param {type:\"string\"}\n",
+ "\n",
+ "mixtral_num_experts = 8 #@param {type:\"integer\"}\n",
+ "save_mixtral_dir = \"/content/tiny_mixtral_x\" #@param {type:\"string\"}\n"
+ ],
+ "metadata": {
+ "cellView": "form",
+ "id": "IS9mKmQQEHbC"
+ },
+ "execution_count": 1,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "nO9OwwtND6bp"
+ },
+ "outputs": [],
+ "source": [
+ "!pip install transformers --upgrade\n",
+ "!pip install torch safetensors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!git clone https://huggingface.co/{model_name_or_path}"
+ ],
+ "metadata": {
+ "id": "-mUQ35RTEE_G"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import json\n",
+ "import torch\n",
+ "\n",
+ "# load config.json\n",
+ "with open(f\"{model_name}/config.json\") as f:\n",
+ " config = json.load(f)\n",
+ "\n",
+ "print(config)\n",
+ "\n",
+ "mistral_config = {\n",
+ " \"architectures\": [\n",
+ " \"MistralForCausalLM\"\n",
+ " ],\n",
+ " \"attention_dropout\": 0.0,\n",
+ " \"bos_token_id\": 1,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"hidden_act\": \"silu\",\n",
+ " \"hidden_size\": 4096,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 14336,\n",
+ " \"max_position_embeddings\": 32768,\n",
+ " \"model_type\": \"mistral\",\n",
+ " \"num_attention_heads\": 32,\n",
+ " \"num_hidden_layers\": 32,\n",
+ " \"num_key_value_heads\": 8,\n",
+ " \"rms_norm_eps\": 1e-05,\n",
+ " \"rope_theta\": 1000000.0,\n",
+ " \"sliding_window\": None,\n",
+ " \"tie_word_embeddings\": False,\n",
+ " # \"torch_dtype\": \"bfloat16\",\n",
+ " \"transformers_version\": \"4.36.0\",\n",
+ " \"use_cache\": True,\n",
+ " \"vocab_size\": 32000\n",
+ "}\n",
+ "mistral_config[\"architectures\"] = [\"MistralForCausalLM\"]\n",
+ "mistral_config[\"model_type\"] = \"mistral\"\n",
+ "mistral_config[\"bos_token_id\"] = config[\"bos_token_id\"]\n",
+ "mistral_config[\"eos_token_id\"] = config[\"eos_token_id\"]\n",
+ "mistral_config[\"hidden_act\"] = config[\"hidden_act\"]\n",
+ "mistral_config[\"hidden_size\"] = config[\"hidden_size\"]\n",
+ "mistral_config[\"initializer_range\"] = config[\"initializer_range\"]\n",
+ "mistral_config[\"intermediate_size\"] = config[\"intermediate_size\"]\n",
+ "mistral_config[\"max_position_embeddings\"] = config[\"max_position_embeddings\"]\n",
+ "mistral_config[\"num_attention_heads\"] = config[\"num_attention_heads\"]\n",
+ "mistral_config[\"num_hidden_layers\"] = config[\"num_hidden_layers\"]\n",
+ "mistral_config[\"num_key_value_heads\"] = config[\"num_key_value_heads\"]\n",
+ "mistral_config[\"rms_norm_eps\"] = config[\"rms_norm_eps\"]\n",
+ "mistral_config[\"rope_theta\"] = 1000000.0\n",
+ "mistral_config[\"sliding_window\"] = None\n",
+ "mistral_config[\"tie_word_embeddings\"] = config[\"tie_word_embeddings\"]\n",
+ "mistral_config[\"torch_dtype\"] = config[\"torch_dtype\"]\n",
+ "mistral_config[\"transformers_version\"] = \"4.36.0\"\n",
+ "mistral_config[\"use_cache\"] = config[\"use_cache\"]\n",
+ "mistral_config[\"vocab_size\"] = config[\"vocab_size\"]\n",
+ "\n",
+ "# save tokenizer and model\n",
+ "from transformers import AutoTokenizer\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n",
+ "tokenizer.save_pretrained(save_mistral_dir)\n",
+ "\n",
+ "from transformers import AutoModelForCausalLM\n",
+ "model = AutoModelForCausalLM.from_pretrained(model_name_or_path)\n",
+ "if model.dtype == torch.float32:\n",
+ " model.half()\n",
+ " model.to(torch.bfloat16)\n",
+ " mistral_config[\"torch_dtype\"] = \"bfloat16\"\n",
+ "\n",
+ "model.save_pretrained(save_mistral_dir)\n",
+ "\n",
+ "# save convert mistral config\n",
+ "with open(f\"{save_mistral_dir}/config.json\", \"w\") as f:\n",
+ " json.dump(mistral_config, f, indent=2)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ITP1ylgIEaUm",
+ "outputId": "7d5175f6-a686-47b0-ce5a-1c89464c05e5"
+ },
+ "execution_count": 3,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "{'_name_or_path': 'meta-llama/Llama-2-7b-hf', 'architectures': ['LlamaForCausalLM'], 'bos_token_id': 1, 'eos_token_id': 2, 'hidden_act': 'silu', 'hidden_size': 2048, 'initializer_range': 0.02, 'intermediate_size': 5632, 'max_position_embeddings': 2048, 'model_type': 'llama', 'num_attention_heads': 32, 'num_hidden_layers': 22, 'num_key_value_heads': 4, 'pretraining_tp': 1, 'rms_norm_eps': 1e-05, 'rope_scaling': None, 'tie_word_embeddings': False, 'torch_dtype': 'float32', 'transformers_version': '4.31.0.dev0', 'use_cache': True, 'vocab_size': 32000}\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#\n",
+ "# モデルの出力テスト\n",
+ "#\n",
+ "from transformers import AutoModelForCausalLM, MistralForCausalLM, MixtralForCausalLM\n",
+ "def test_gen(model_name_or_path):\n",
+ "\n",
+ " device = \"cpu\" # ここを変えてね\n",
+ "\n",
+ " model = AutoModelForCausalLM.from_pretrained(model_name_or_path)\n",
+ "\n",
+ " print(\"check model load \")\n",
+ " print(model.config)\n",
+ " print(model)\n",
+ "\n",
+ " print(\"check model generate text\")\n",
+ " messages = [\n",
+ " {\"role\": \"user\", \"content\": \"What is your favourite condiment?\"},\n",
+ " {\"role\": \"assistant\", \"content\": \"Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!\"},\n",
+ " {\"role\": \"user\", \"content\": \"Do you have mayonnaise recipes?\"}\n",
+ " ]\n",
+ "\n",
+ " encodeds = tokenizer.apply_chat_template(messages, return_tensors=\"pt\")\n",
+ "\n",
+ " model_inputs = encodeds.to(device)\n",
+ " model.to(device)\n",
+ "\n",
+ " generated_ids = model.generate(model_inputs, max_new_tokens=128, do_sample=True)\n",
+ " decoded = tokenizer.batch_decode(generated_ids)\n",
+ " print(decoded[0])\n",
+ " print(\"------------------------\")\n",
+ " return model, tokenizer\n",
+ "\n",
+ "_ , _ = test_gen(save_mistral_dir)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Zx_NM0wEHjmU",
+ "outputId": "13b64987-2079-44cd-c707-13a4fe77d474"
+ },
+ "execution_count": 5,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "check model load \n",
+ "MistralConfig {\n",
+ " \"_name_or_path\": \"/content/tiny_mistral\",\n",
+ " \"architectures\": [\n",
+ " \"MistralForCausalLM\"\n",
+ " ],\n",
+ " \"attention_dropout\": 0.0,\n",
+ " \"bos_token_id\": 1,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"hidden_act\": \"silu\",\n",
+ " \"hidden_size\": 2048,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 5632,\n",
+ " \"max_position_embeddings\": 2048,\n",
+ " \"model_type\": \"mistral\",\n",
+ " \"num_attention_heads\": 32,\n",
+ " \"num_hidden_layers\": 22,\n",
+ " \"num_key_value_heads\": 4,\n",
+ " \"rms_norm_eps\": 1e-05,\n",
+ " \"rope_theta\": 1000000.0,\n",
+ " \"sliding_window\": null,\n",
+ " \"tie_word_embeddings\": false,\n",
+ " \"torch_dtype\": \"bfloat16\",\n",
+ " \"transformers_version\": \"4.36.1\",\n",
+ " \"use_cache\": true,\n",
+ " \"vocab_size\": 32000\n",
+ "}\n",
+ "\n",
+ "MistralForCausalLM(\n",
+ " (model): MistralModel(\n",
+ " (embed_tokens): Embedding(32000, 2048)\n",
+ " (layers): ModuleList(\n",
+ " (0-21): 22 x MistralDecoderLayer(\n",
+ " (self_attn): MistralAttention(\n",
+ " (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
+ " (k_proj): Linear(in_features=2048, out_features=256, bias=False)\n",
+ " (v_proj): Linear(in_features=2048, out_features=256, bias=False)\n",
+ " (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
+ " (rotary_emb): MistralRotaryEmbedding()\n",
+ " )\n",
+ " (mlp): MistralMLP(\n",
+ " (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)\n",
+ " (up_proj): Linear(in_features=2048, out_features=5632, bias=False)\n",
+ " (down_proj): Linear(in_features=5632, out_features=2048, bias=False)\n",
+ " (act_fn): SiLU()\n",
+ " )\n",
+ " (input_layernorm): MistralRMSNorm()\n",
+ " (post_attention_layernorm): MistralRMSNorm()\n",
+ " )\n",
+ " )\n",
+ " (norm): MistralRMSNorm()\n",
+ " )\n",
+ " (lm_head): Linear(in_features=2048, out_features=32000, bias=False)\n",
+ ")\n",
+ "check model generate text\n",
+ " [INST] What is your favourite condiment? [/INST] Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen! [INST] Do you have mayonnaise recipes? [/INST] ᴍ [/INST] We are about to test a Mayonnaise recipe. ꧁[INST] It's really good. ᴍ꧁ [INST] Do you know how to make one? [/INST] I've eaten many on my recent days. But, I didn't know any recipe.\n",
+ "[INST] Not here. But, I have tested and I am going to try this recipes sometime. I am so excited!\n",
+ "ᴍ That is very useful for me. I'd have love to try.���\n",
+ "------------------------\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#\n",
+ "# mixtral config setting\n",
+ "#\n",
+ "\n",
+ "mixtral_config = {\n",
+ " \"architectures\": [\n",
+ " \"MixtralForCausalLM\"\n",
+ " ],\n",
+ " \"attention_dropout\": 0.0,\n",
+ " \"bos_token_id\": 1,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"hidden_act\": \"silu\",\n",
+ " \"hidden_size\": 4096,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 14336,\n",
+ " \"max_position_embeddings\": 32768,\n",
+ " \"model_type\": \"mixtral\",\n",
+ " \"num_attention_heads\": 32,\n",
+ " \"num_experts_per_tok\": 2,\n",
+ " \"num_hidden_layers\": 32,\n",
+ " \"num_key_value_heads\": 8,\n",
+ " \"num_local_experts\": 8,\n",
+ " \"output_router_logits\": False,\n",
+ " \"rms_norm_eps\": 1e-05,\n",
+ " \"rope_theta\": 1000000.0,\n",
+ " \"router_aux_loss_coef\": 0.02,\n",
+ " \"sliding_window\": None,\n",
+ " \"tie_word_embeddings\": False,\n",
+ " \"torch_dtype\": \"bfloat16\",\n",
+ " \"transformers_version\": \"4.36.0.dev0\",\n",
+ " \"use_cache\": True,\n",
+ " \"vocab_size\": 32000\n",
+ "}\n",
+ "\n",
+ "mixtral_config[\"architectures\"] = [\"MixtralForCausalLM\"]\n",
+ "mixtral_config[\"model_type\"] = \"mixtral\"\n",
+ "mixtral_config[\"num_experts_per_tok\"] = 2\n",
+ "mixtral_config[\"num_local_experts\"] = mixtral_num_experts\n",
+ "\n",
+ "mixtral_config[\"bos_token_id\"] = mistral_config[\"bos_token_id\"]\n",
+ "mixtral_config[\"eos_token_id\"] = mistral_config[\"eos_token_id\"]\n",
+ "mixtral_config[\"hidden_act\"] = mistral_config[\"hidden_act\"]\n",
+ "mixtral_config[\"hidden_size\"] = mistral_config[\"hidden_size\"]\n",
+ "mixtral_config[\"initializer_range\"] = mistral_config[\"initializer_range\"]\n",
+ "mixtral_config[\"intermediate_size\"] = mistral_config[\"intermediate_size\"]\n",
+ "mixtral_config[\"max_position_embeddings\"] = mistral_config[\"max_position_embeddings\"]\n",
+ "mixtral_config[\"num_attention_heads\"] = mistral_config[\"num_attention_heads\"]\n",
+ "mixtral_config[\"num_hidden_layers\"] = mistral_config[\"num_hidden_layers\"]\n",
+ "mixtral_config[\"num_key_value_heads\"] = mistral_config[\"num_key_value_heads\"]\n",
+ "mixtral_config[\"rms_norm_eps\"] = mistral_config[\"rms_norm_eps\"]\n",
+ "mixtral_config[\"rope_theta\"] = mistral_config[\"rope_theta\"]\n",
+ "mixtral_config[\"sliding_window\"] = mistral_config[\"sliding_window\"]\n",
+ "mixtral_config[\"tie_word_embeddings\"] = mistral_config[\"tie_word_embeddings\"]\n",
+ "mixtral_config[\"torch_dtype\"] = mistral_config[\"torch_dtype\"]\n",
+ "mixtral_config[\"transformers_version\"] = \"4.36.0.dev0\"\n",
+ "mixtral_config[\"use_cache\"] = mistral_config[\"use_cache\"]\n",
+ "mixtral_config[\"vocab_size\"] = mistral_config[\"vocab_size\"]\n",
+ "\n",
+ "print(json.dumps(mixtral_config,indent=2))\n",
+ "\n",
+ "# configをsave\n",
+ "!mkdir -p {save_mixtral_dir}\n",
+ "with open(f\"{save_mixtral_dir}/config.json\", \"w\") as f:\n",
+ " json.dump(mixtral_config, f, indent=2)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "zipdtc3AIWYD",
+ "outputId": "222d5380-7228-4412-8684-cf6d1c851e74"
+ },
+ "execution_count": 6,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "{\n",
+ " \"architectures\": [\n",
+ " \"MixtralForCausalLM\"\n",
+ " ],\n",
+ " \"attention_dropout\": 0.0,\n",
+ " \"bos_token_id\": 1,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"hidden_act\": \"silu\",\n",
+ " \"hidden_size\": 2048,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 5632,\n",
+ " \"max_position_embeddings\": 2048,\n",
+ " \"model_type\": \"mixtral\",\n",
+ " \"num_attention_heads\": 32,\n",
+ " \"num_experts_per_tok\": 2,\n",
+ " \"num_hidden_layers\": 22,\n",
+ " \"num_key_value_heads\": 4,\n",
+ " \"num_local_experts\": 8,\n",
+ " \"output_router_logits\": false,\n",
+ " \"rms_norm_eps\": 1e-05,\n",
+ " \"rope_theta\": 1000000.0,\n",
+ " \"router_aux_loss_coef\": 0.02,\n",
+ " \"sliding_window\": null,\n",
+ " \"tie_word_embeddings\": false,\n",
+ " \"torch_dtype\": \"bfloat16\",\n",
+ " \"transformers_version\": \"4.36.0.dev0\",\n",
+ " \"use_cache\": true,\n",
+ " \"vocab_size\": 32000\n",
+ "}\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# copy other model files\n",
+ "\n",
+ "# save tokenizer\n",
+ "if tokenizer is None:\n",
+ " from transformers import AutoTokenizer\n",
+ " tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n",
+ "\n",
+ "tokenizer.save_pretrained(save_mixtral_dir)\n",
+ "\n",
+ "!cp {save_mistral_dir}/generation_config.json {save_mixtral_dir}/generation_config.json\n"
+ ],
+ "metadata": {
+ "id": "T2uTzZHyk6vS"
+ },
+ "execution_count": 7,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# convert mixtral clone\n",
+ "import torch\n",
+ "from safetensors import safe_open\n",
+ "from safetensors.torch import save_file\n",
+ "import re\n",
+ "\n",
+ "def convert_weight_name(mistral_key, mixtral_expert_num):\n",
+ " if \"mlp.gate_proj\" in mistral_key:\n",
+ " return mistral_key.replace(\".mlp.gate_proj.\", f\".block_sparse_moe.experts.{mixtral_expert_num}.w1.\")\n",
+ " elif \"mlp.down_proj\" in mistral_key:\n",
+ " return mistral_key.replace(\".mlp.down_proj.\", f\".block_sparse_moe.experts.{mixtral_expert_num}.w2.\")\n",
+ " elif \"mlp.up_proj\" in mistral_key:\n",
+ " return mistral_key.replace(\".mlp.up_proj.\" , f\".block_sparse_moe.experts.{mixtral_expert_num}.w3.\")\n",
+ " else:\n",
+ " return mistral_key\n",
+ "\n",
+ "def is_experts_key(mistral_key):\n",
+ " return \".mlp.\" in mistral_key\n",
+ "\n",
+ "def get_layer(mistral_key):\n",
+ " layer = re.match(r'model[.]layers[.]\\d+[.]', mistral_key)\n",
+ " if layer is not None:\n",
+ " return int(re.findall(r'\\d+', layer[0])[0])\n",
+ " return None\n",
+ "\n",
+ "def get_weight_byte_size(weight):\n",
+ "\n",
+ " if isinstance(weight, torch.Tensor):\n",
+ " weight_byte_size = weight.nelement() * weight.element_size()\n",
+ " else:\n",
+ " weight_byte_size = sum(p.nelement() * p.element_size() for p in weight.parameters())\n",
+ "\n",
+ " return weight_byte_size\n",
+ "\n",
+ "# mistralのweight取得\n",
+ "mistral_weights = safe_open(save_mistral_dir + \"/model.safetensors\", framework=\"pt\")\n",
+ "# print(mistral_weights.keys())\n",
+ "\n",
+ "first_weights = {}\n",
+ "\n",
+ "gate_shape = mistral_weights.get_tensor(\"model.layers.0.mlp.up_proj.weight\").shape\n",
+ "gate_tensor = torch.full((mixtral_num_experts, gate_shape[1]), 0.5)\n",
+ "\n",
+ "common_layer_weights = {}\n",
+ "\n",
+ "print(\"mixtral_num_experts\", mixtral_num_experts, \"gate_shape[1]\", gate_shape[1], \"gate_tensor\", gate_tensor)\n",
+ "\n",
+ "# max layer\n",
+ "max_layer_no = 0\n",
+ "for key in mistral_weights.keys():\n",
+ " layer_no = get_layer(key)\n",
+ " if layer_no is None:\n",
+ " first_weights[key] = mistral_weights.get_tensor(key)\n",
+ " else:\n",
+ " max_layer_no = max(max_layer_no, layer_no)\n",
+ "\n",
+ "mixtral_weight_map = {\n",
+ " \"metadata\": {\n",
+ " \"total_size\": 0\n",
+ " },\n",
+ " \"weight_map\": {\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "total_size = 0\n",
+ "\n",
+ "!rm {save_mixtral_dir + \"/*.safetensors\"}\n",
+ "\n",
+ "for i in range(max_layer_no + 1):\n",
+ " weight_file_no = i + 1\n",
+ " layer_weights = {}\n",
+ "\n",
+ " # first weight\n",
+ " if weight_file_no == 1:\n",
+ " for key in first_weights.keys():\n",
+ " mixtral_key = convert_weight_name(key, 0)\n",
+ " layer_weights[mixtral_key] = first_weights[mixtral_key]\n",
+ " total_size += get_weight_byte_size(layer_weights[mixtral_key])\n",
+ " print(\"first\", mixtral_key, layer_weights[mixtral_key].shape)\n",
+ "\n",
+ "\n",
+ " for key in mistral_weights.keys():\n",
+ "\n",
+ " lk = re.match(re.compile(f\"model[.]layers[.]{i}[.]\"), key)\n",
+ " if lk is not None:\n",
+ " mistral_layer_key = key\n",
+ " if not is_experts_key(mistral_layer_key):\n",
+ " mixtral_key = convert_weight_name(mistral_layer_key, 0)\n",
+ " layer_weights[mixtral_key] = mistral_weights.get_tensor(mistral_layer_key)\n",
+ " total_size += get_weight_byte_size(layer_weights[mixtral_key])\n",
+ " print(\"layer\", i , mixtral_key, layer_weights[mixtral_key].shape)\n",
+ " else:\n",
+ " print(\"gen experts\")\n",
+ " for expert_no in range(mixtral_num_experts):\n",
+ " mixtral_key = convert_weight_name(mistral_layer_key, expert_no)\n",
+ " layer_weights[mixtral_key] = mistral_weights.get_tensor(mistral_layer_key).clone()\n",
+ " total_size += get_weight_byte_size(layer_weights[mixtral_key])\n",
+ " print(\"layer\", i , \"expert\", expert_no, mixtral_key, layer_weights[mixtral_key].shape)\n",
+ "\n",
+ " # gate\n",
+ " mixtral_key = f\"model.layers.{i}.block_sparse_moe.gate.weight\"\n",
+ " layer_weights[mixtral_key] = gate_tensor.clone()\n",
+ " total_size += get_weight_byte_size(layer_weights[mixtral_key])\n",
+ " print(\"layer\", i , \"gate\", mixtral_key, layer_weights[mixtral_key].shape)\n",
+ "\n",
+ "#フォーマットで0埋め\n",
+ " tensor_weight_file_name = f\"model.layers.{weight_file_no:05d}-of-{max_layer_no + 1:05d}.safetensors\"\n",
+ "\n",
+ " # save safetensor\n",
+ " save_file(layer_weights, save_mixtral_dir + \"/\" + tensor_weight_file_name, metadata={\"format\":\"pt\"})\n",
+ " print(\"Save layer weighs\", i, tensor_weight_file_name)\n",
+ "\n",
+ " for key in layer_weights.keys():\n",
+ " mixtral_weight_map[\"weight_map\"][key] = tensor_weight_file_name\n",
+ "\n",
+ " print(i, tensor_weight_file_name)\n",
+ "\n",
+ "# set total size\n",
+ "mixtral_weight_map[\"metadata\"][\"total_size\"] = total_size\n",
+ "\n",
+ "# save model.safetensors.index.json\n",
+ "with open(save_mixtral_dir + \"/model.safetensors.index.json\", \"w\") as f:\n",
+ " json.dump(mixtral_weight_map, f, indent=2)\n",
+ "\n",
+ "print(mixtral_weight_map)\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Lswg1ESdI7q9",
+ "outputId": "170e9c80-7856-4a64-b9d6-2701c650ed68"
+ },
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "mixtral_num_experts 8 gate_shape[1] 2048 gate_tensor tensor([[0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n",
+ " [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n",
+ " [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n",
+ " ...,\n",
+ " [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n",
+ " [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000],\n",
+ " [0.5000, 0.5000, 0.5000, ..., 0.5000, 0.5000, 0.5000]])\n",
+ "first lm_head.weight torch.Size([32000, 2048])\n",
+ "first model.embed_tokens.weight torch.Size([32000, 2048])\n",
+ "first model.norm.weight torch.Size([2048])\n",
+ "layer 0 model.layers.0.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 0 expert 0 model.layers.0.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 0 expert 1 model.layers.0.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 0 expert 2 model.layers.0.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 0 expert 3 model.layers.0.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 0 expert 4 model.layers.0.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 0 expert 5 model.layers.0.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 0 expert 6 model.layers.0.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 0 expert 7 model.layers.0.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 0 expert 0 model.layers.0.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 0 expert 1 model.layers.0.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 0 expert 2 model.layers.0.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 0 expert 3 model.layers.0.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 0 expert 4 model.layers.0.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 0 expert 5 model.layers.0.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 0 expert 6 model.layers.0.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 0 expert 7 model.layers.0.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 0 expert 0 model.layers.0.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 0 expert 1 model.layers.0.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 0 expert 2 model.layers.0.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 0 expert 3 model.layers.0.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 0 expert 4 model.layers.0.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 0 expert 5 model.layers.0.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 0 expert 6 model.layers.0.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 0 expert 7 model.layers.0.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 0 model.layers.0.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 0 model.layers.0.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 0 model.layers.0.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 0 model.layers.0.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 0 model.layers.0.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 0 gate model.layers.0.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 0 model.layers.00001-of-00022.safetensors\n",
+ "0 model.layers.00001-of-00022.safetensors\n",
+ "layer 1 model.layers.1.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 1 expert 0 model.layers.1.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 1 expert 1 model.layers.1.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 1 expert 2 model.layers.1.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 1 expert 3 model.layers.1.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 1 expert 4 model.layers.1.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 1 expert 5 model.layers.1.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 1 expert 6 model.layers.1.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 1 expert 7 model.layers.1.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 1 expert 0 model.layers.1.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 1 expert 1 model.layers.1.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 1 expert 2 model.layers.1.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 1 expert 3 model.layers.1.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 1 expert 4 model.layers.1.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 1 expert 5 model.layers.1.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 1 expert 6 model.layers.1.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 1 expert 7 model.layers.1.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 1 expert 0 model.layers.1.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 1 expert 1 model.layers.1.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 1 expert 2 model.layers.1.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 1 expert 3 model.layers.1.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 1 expert 4 model.layers.1.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 1 expert 5 model.layers.1.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 1 expert 6 model.layers.1.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 1 expert 7 model.layers.1.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 1 model.layers.1.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 1 model.layers.1.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 1 model.layers.1.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 1 model.layers.1.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 1 model.layers.1.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 1 gate model.layers.1.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 1 model.layers.00002-of-00022.safetensors\n",
+ "1 model.layers.00002-of-00022.safetensors\n",
+ "layer 2 model.layers.2.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 2 expert 0 model.layers.2.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 2 expert 1 model.layers.2.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 2 expert 2 model.layers.2.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 2 expert 3 model.layers.2.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 2 expert 4 model.layers.2.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 2 expert 5 model.layers.2.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 2 expert 6 model.layers.2.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 2 expert 7 model.layers.2.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 2 expert 0 model.layers.2.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 2 expert 1 model.layers.2.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 2 expert 2 model.layers.2.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 2 expert 3 model.layers.2.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 2 expert 4 model.layers.2.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 2 expert 5 model.layers.2.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 2 expert 6 model.layers.2.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 2 expert 7 model.layers.2.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 2 expert 0 model.layers.2.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 2 expert 1 model.layers.2.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 2 expert 2 model.layers.2.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 2 expert 3 model.layers.2.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 2 expert 4 model.layers.2.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 2 expert 5 model.layers.2.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 2 expert 6 model.layers.2.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 2 expert 7 model.layers.2.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 2 model.layers.2.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 2 model.layers.2.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 2 model.layers.2.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 2 model.layers.2.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 2 model.layers.2.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 2 gate model.layers.2.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 2 model.layers.00003-of-00022.safetensors\n",
+ "2 model.layers.00003-of-00022.safetensors\n",
+ "layer 3 model.layers.3.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 3 expert 0 model.layers.3.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 3 expert 1 model.layers.3.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 3 expert 2 model.layers.3.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 3 expert 3 model.layers.3.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 3 expert 4 model.layers.3.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 3 expert 5 model.layers.3.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 3 expert 6 model.layers.3.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 3 expert 7 model.layers.3.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 3 expert 0 model.layers.3.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 3 expert 1 model.layers.3.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 3 expert 2 model.layers.3.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 3 expert 3 model.layers.3.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 3 expert 4 model.layers.3.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 3 expert 5 model.layers.3.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 3 expert 6 model.layers.3.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 3 expert 7 model.layers.3.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 3 expert 0 model.layers.3.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 3 expert 1 model.layers.3.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 3 expert 2 model.layers.3.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 3 expert 3 model.layers.3.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 3 expert 4 model.layers.3.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 3 expert 5 model.layers.3.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 3 expert 6 model.layers.3.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 3 expert 7 model.layers.3.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 3 model.layers.3.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 3 model.layers.3.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 3 model.layers.3.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 3 model.layers.3.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 3 model.layers.3.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 3 gate model.layers.3.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 3 model.layers.00004-of-00022.safetensors\n",
+ "3 model.layers.00004-of-00022.safetensors\n",
+ "layer 4 model.layers.4.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 4 expert 0 model.layers.4.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 4 expert 1 model.layers.4.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 4 expert 2 model.layers.4.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 4 expert 3 model.layers.4.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 4 expert 4 model.layers.4.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 4 expert 5 model.layers.4.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 4 expert 6 model.layers.4.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 4 expert 7 model.layers.4.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 4 expert 0 model.layers.4.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 4 expert 1 model.layers.4.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 4 expert 2 model.layers.4.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 4 expert 3 model.layers.4.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 4 expert 4 model.layers.4.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 4 expert 5 model.layers.4.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 4 expert 6 model.layers.4.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 4 expert 7 model.layers.4.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 4 expert 0 model.layers.4.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 4 expert 1 model.layers.4.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 4 expert 2 model.layers.4.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 4 expert 3 model.layers.4.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 4 expert 4 model.layers.4.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 4 expert 5 model.layers.4.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 4 expert 6 model.layers.4.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 4 expert 7 model.layers.4.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 4 model.layers.4.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 4 model.layers.4.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 4 model.layers.4.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 4 model.layers.4.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 4 model.layers.4.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 4 gate model.layers.4.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 4 model.layers.00005-of-00022.safetensors\n",
+ "4 model.layers.00005-of-00022.safetensors\n",
+ "layer 5 model.layers.5.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 5 expert 0 model.layers.5.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 5 expert 1 model.layers.5.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 5 expert 2 model.layers.5.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 5 expert 3 model.layers.5.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 5 expert 4 model.layers.5.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 5 expert 5 model.layers.5.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 5 expert 6 model.layers.5.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 5 expert 7 model.layers.5.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 5 expert 0 model.layers.5.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 5 expert 1 model.layers.5.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 5 expert 2 model.layers.5.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 5 expert 3 model.layers.5.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 5 expert 4 model.layers.5.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 5 expert 5 model.layers.5.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 5 expert 6 model.layers.5.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 5 expert 7 model.layers.5.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 5 expert 0 model.layers.5.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 5 expert 1 model.layers.5.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 5 expert 2 model.layers.5.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 5 expert 3 model.layers.5.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 5 expert 4 model.layers.5.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 5 expert 5 model.layers.5.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 5 expert 6 model.layers.5.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 5 expert 7 model.layers.5.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 5 model.layers.5.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 5 model.layers.5.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 5 model.layers.5.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 5 model.layers.5.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 5 model.layers.5.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 5 gate model.layers.5.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 5 model.layers.00006-of-00022.safetensors\n",
+ "5 model.layers.00006-of-00022.safetensors\n",
+ "layer 6 model.layers.6.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 6 expert 0 model.layers.6.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 6 expert 1 model.layers.6.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 6 expert 2 model.layers.6.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 6 expert 3 model.layers.6.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 6 expert 4 model.layers.6.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 6 expert 5 model.layers.6.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 6 expert 6 model.layers.6.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 6 expert 7 model.layers.6.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 6 expert 0 model.layers.6.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 6 expert 1 model.layers.6.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 6 expert 2 model.layers.6.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 6 expert 3 model.layers.6.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 6 expert 4 model.layers.6.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 6 expert 5 model.layers.6.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 6 expert 6 model.layers.6.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 6 expert 7 model.layers.6.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 6 expert 0 model.layers.6.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 6 expert 1 model.layers.6.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 6 expert 2 model.layers.6.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 6 expert 3 model.layers.6.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 6 expert 4 model.layers.6.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 6 expert 5 model.layers.6.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 6 expert 6 model.layers.6.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 6 expert 7 model.layers.6.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 6 model.layers.6.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 6 model.layers.6.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 6 model.layers.6.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 6 model.layers.6.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 6 model.layers.6.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 6 gate model.layers.6.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 6 model.layers.00007-of-00022.safetensors\n",
+ "6 model.layers.00007-of-00022.safetensors\n",
+ "layer 7 model.layers.7.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 7 expert 0 model.layers.7.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 7 expert 1 model.layers.7.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 7 expert 2 model.layers.7.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 7 expert 3 model.layers.7.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 7 expert 4 model.layers.7.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 7 expert 5 model.layers.7.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 7 expert 6 model.layers.7.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 7 expert 7 model.layers.7.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 7 expert 0 model.layers.7.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 7 expert 1 model.layers.7.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 7 expert 2 model.layers.7.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 7 expert 3 model.layers.7.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 7 expert 4 model.layers.7.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 7 expert 5 model.layers.7.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 7 expert 6 model.layers.7.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 7 expert 7 model.layers.7.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 7 expert 0 model.layers.7.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 7 expert 1 model.layers.7.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 7 expert 2 model.layers.7.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 7 expert 3 model.layers.7.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 7 expert 4 model.layers.7.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 7 expert 5 model.layers.7.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 7 expert 6 model.layers.7.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 7 expert 7 model.layers.7.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 7 model.layers.7.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 7 model.layers.7.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 7 model.layers.7.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 7 model.layers.7.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 7 model.layers.7.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 7 gate model.layers.7.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 7 model.layers.00008-of-00022.safetensors\n",
+ "7 model.layers.00008-of-00022.safetensors\n",
+ "layer 8 model.layers.8.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 8 expert 0 model.layers.8.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 8 expert 1 model.layers.8.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 8 expert 2 model.layers.8.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 8 expert 3 model.layers.8.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 8 expert 4 model.layers.8.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 8 expert 5 model.layers.8.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 8 expert 6 model.layers.8.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 8 expert 7 model.layers.8.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 8 expert 0 model.layers.8.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 8 expert 1 model.layers.8.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 8 expert 2 model.layers.8.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 8 expert 3 model.layers.8.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 8 expert 4 model.layers.8.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 8 expert 5 model.layers.8.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 8 expert 6 model.layers.8.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 8 expert 7 model.layers.8.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 8 expert 0 model.layers.8.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 8 expert 1 model.layers.8.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 8 expert 2 model.layers.8.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 8 expert 3 model.layers.8.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 8 expert 4 model.layers.8.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 8 expert 5 model.layers.8.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 8 expert 6 model.layers.8.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 8 expert 7 model.layers.8.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 8 model.layers.8.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 8 model.layers.8.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 8 model.layers.8.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 8 model.layers.8.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 8 model.layers.8.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 8 gate model.layers.8.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 8 model.layers.00009-of-00022.safetensors\n",
+ "8 model.layers.00009-of-00022.safetensors\n",
+ "layer 9 model.layers.9.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 9 expert 0 model.layers.9.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 9 expert 1 model.layers.9.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 9 expert 2 model.layers.9.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 9 expert 3 model.layers.9.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 9 expert 4 model.layers.9.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 9 expert 5 model.layers.9.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 9 expert 6 model.layers.9.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 9 expert 7 model.layers.9.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 9 expert 0 model.layers.9.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 9 expert 1 model.layers.9.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 9 expert 2 model.layers.9.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 9 expert 3 model.layers.9.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 9 expert 4 model.layers.9.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 9 expert 5 model.layers.9.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 9 expert 6 model.layers.9.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 9 expert 7 model.layers.9.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 9 expert 0 model.layers.9.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 9 expert 1 model.layers.9.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 9 expert 2 model.layers.9.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 9 expert 3 model.layers.9.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 9 expert 4 model.layers.9.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 9 expert 5 model.layers.9.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 9 expert 6 model.layers.9.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 9 expert 7 model.layers.9.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 9 model.layers.9.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 9 model.layers.9.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 9 model.layers.9.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 9 model.layers.9.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 9 model.layers.9.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 9 gate model.layers.9.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 9 model.layers.00010-of-00022.safetensors\n",
+ "9 model.layers.00010-of-00022.safetensors\n",
+ "layer 10 model.layers.10.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 10 expert 0 model.layers.10.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 10 expert 1 model.layers.10.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 10 expert 2 model.layers.10.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 10 expert 3 model.layers.10.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 10 expert 4 model.layers.10.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 10 expert 5 model.layers.10.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 10 expert 6 model.layers.10.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 10 expert 7 model.layers.10.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 10 expert 0 model.layers.10.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 10 expert 1 model.layers.10.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 10 expert 2 model.layers.10.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 10 expert 3 model.layers.10.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 10 expert 4 model.layers.10.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 10 expert 5 model.layers.10.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 10 expert 6 model.layers.10.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 10 expert 7 model.layers.10.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 10 expert 0 model.layers.10.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 10 expert 1 model.layers.10.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 10 expert 2 model.layers.10.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 10 expert 3 model.layers.10.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 10 expert 4 model.layers.10.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 10 expert 5 model.layers.10.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 10 expert 6 model.layers.10.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 10 expert 7 model.layers.10.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 10 model.layers.10.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 10 model.layers.10.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 10 model.layers.10.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 10 model.layers.10.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 10 model.layers.10.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 10 gate model.layers.10.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 10 model.layers.00011-of-00022.safetensors\n",
+ "10 model.layers.00011-of-00022.safetensors\n",
+ "layer 11 model.layers.11.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 11 expert 0 model.layers.11.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 11 expert 1 model.layers.11.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 11 expert 2 model.layers.11.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 11 expert 3 model.layers.11.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 11 expert 4 model.layers.11.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 11 expert 5 model.layers.11.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 11 expert 6 model.layers.11.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 11 expert 7 model.layers.11.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 11 expert 0 model.layers.11.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 11 expert 1 model.layers.11.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 11 expert 2 model.layers.11.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 11 expert 3 model.layers.11.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 11 expert 4 model.layers.11.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 11 expert 5 model.layers.11.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 11 expert 6 model.layers.11.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 11 expert 7 model.layers.11.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 11 expert 0 model.layers.11.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 11 expert 1 model.layers.11.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 11 expert 2 model.layers.11.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 11 expert 3 model.layers.11.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 11 expert 4 model.layers.11.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 11 expert 5 model.layers.11.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 11 expert 6 model.layers.11.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 11 expert 7 model.layers.11.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 11 model.layers.11.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 11 model.layers.11.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 11 model.layers.11.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 11 model.layers.11.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 11 model.layers.11.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 11 gate model.layers.11.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 11 model.layers.00012-of-00022.safetensors\n",
+ "11 model.layers.00012-of-00022.safetensors\n",
+ "layer 12 model.layers.12.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 12 expert 0 model.layers.12.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 12 expert 1 model.layers.12.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 12 expert 2 model.layers.12.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 12 expert 3 model.layers.12.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 12 expert 4 model.layers.12.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 12 expert 5 model.layers.12.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 12 expert 6 model.layers.12.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 12 expert 7 model.layers.12.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 12 expert 0 model.layers.12.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 12 expert 1 model.layers.12.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 12 expert 2 model.layers.12.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 12 expert 3 model.layers.12.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 12 expert 4 model.layers.12.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 12 expert 5 model.layers.12.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 12 expert 6 model.layers.12.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 12 expert 7 model.layers.12.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 12 expert 0 model.layers.12.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 12 expert 1 model.layers.12.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 12 expert 2 model.layers.12.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 12 expert 3 model.layers.12.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 12 expert 4 model.layers.12.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 12 expert 5 model.layers.12.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 12 expert 6 model.layers.12.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 12 expert 7 model.layers.12.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 12 model.layers.12.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 12 model.layers.12.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 12 model.layers.12.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 12 model.layers.12.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 12 model.layers.12.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 12 gate model.layers.12.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 12 model.layers.00013-of-00022.safetensors\n",
+ "12 model.layers.00013-of-00022.safetensors\n",
+ "layer 13 model.layers.13.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 13 expert 0 model.layers.13.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 13 expert 1 model.layers.13.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 13 expert 2 model.layers.13.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 13 expert 3 model.layers.13.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 13 expert 4 model.layers.13.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 13 expert 5 model.layers.13.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 13 expert 6 model.layers.13.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 13 expert 7 model.layers.13.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 13 expert 0 model.layers.13.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 13 expert 1 model.layers.13.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 13 expert 2 model.layers.13.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 13 expert 3 model.layers.13.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 13 expert 4 model.layers.13.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 13 expert 5 model.layers.13.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 13 expert 6 model.layers.13.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 13 expert 7 model.layers.13.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 13 expert 0 model.layers.13.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 13 expert 1 model.layers.13.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 13 expert 2 model.layers.13.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 13 expert 3 model.layers.13.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 13 expert 4 model.layers.13.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 13 expert 5 model.layers.13.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 13 expert 6 model.layers.13.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 13 expert 7 model.layers.13.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 13 model.layers.13.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 13 model.layers.13.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 13 model.layers.13.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 13 model.layers.13.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 13 model.layers.13.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 13 gate model.layers.13.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 13 model.layers.00014-of-00022.safetensors\n",
+ "13 model.layers.00014-of-00022.safetensors\n",
+ "layer 14 model.layers.14.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 14 expert 0 model.layers.14.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 14 expert 1 model.layers.14.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 14 expert 2 model.layers.14.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 14 expert 3 model.layers.14.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 14 expert 4 model.layers.14.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 14 expert 5 model.layers.14.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 14 expert 6 model.layers.14.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 14 expert 7 model.layers.14.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 14 expert 0 model.layers.14.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 14 expert 1 model.layers.14.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 14 expert 2 model.layers.14.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 14 expert 3 model.layers.14.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 14 expert 4 model.layers.14.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 14 expert 5 model.layers.14.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 14 expert 6 model.layers.14.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 14 expert 7 model.layers.14.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 14 expert 0 model.layers.14.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 14 expert 1 model.layers.14.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 14 expert 2 model.layers.14.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 14 expert 3 model.layers.14.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 14 expert 4 model.layers.14.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 14 expert 5 model.layers.14.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 14 expert 6 model.layers.14.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 14 expert 7 model.layers.14.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 14 model.layers.14.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 14 model.layers.14.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 14 model.layers.14.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 14 model.layers.14.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 14 model.layers.14.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 14 gate model.layers.14.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 14 model.layers.00015-of-00022.safetensors\n",
+ "14 model.layers.00015-of-00022.safetensors\n",
+ "layer 15 model.layers.15.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 15 expert 0 model.layers.15.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 15 expert 1 model.layers.15.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 15 expert 2 model.layers.15.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 15 expert 3 model.layers.15.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 15 expert 4 model.layers.15.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 15 expert 5 model.layers.15.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 15 expert 6 model.layers.15.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 15 expert 7 model.layers.15.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 15 expert 0 model.layers.15.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 15 expert 1 model.layers.15.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 15 expert 2 model.layers.15.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 15 expert 3 model.layers.15.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 15 expert 4 model.layers.15.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 15 expert 5 model.layers.15.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 15 expert 6 model.layers.15.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 15 expert 7 model.layers.15.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 15 expert 0 model.layers.15.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 15 expert 1 model.layers.15.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 15 expert 2 model.layers.15.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 15 expert 3 model.layers.15.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 15 expert 4 model.layers.15.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 15 expert 5 model.layers.15.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 15 expert 6 model.layers.15.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 15 expert 7 model.layers.15.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 15 model.layers.15.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 15 model.layers.15.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 15 model.layers.15.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 15 model.layers.15.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 15 model.layers.15.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 15 gate model.layers.15.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 15 model.layers.00016-of-00022.safetensors\n",
+ "15 model.layers.00016-of-00022.safetensors\n",
+ "layer 16 model.layers.16.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 16 expert 0 model.layers.16.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 16 expert 1 model.layers.16.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 16 expert 2 model.layers.16.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 16 expert 3 model.layers.16.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 16 expert 4 model.layers.16.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 16 expert 5 model.layers.16.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 16 expert 6 model.layers.16.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 16 expert 7 model.layers.16.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 16 expert 0 model.layers.16.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 16 expert 1 model.layers.16.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 16 expert 2 model.layers.16.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 16 expert 3 model.layers.16.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 16 expert 4 model.layers.16.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 16 expert 5 model.layers.16.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 16 expert 6 model.layers.16.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 16 expert 7 model.layers.16.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 16 expert 0 model.layers.16.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 16 expert 1 model.layers.16.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 16 expert 2 model.layers.16.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 16 expert 3 model.layers.16.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 16 expert 4 model.layers.16.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 16 expert 5 model.layers.16.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 16 expert 6 model.layers.16.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 16 expert 7 model.layers.16.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 16 model.layers.16.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 16 model.layers.16.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 16 model.layers.16.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 16 model.layers.16.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 16 model.layers.16.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 16 gate model.layers.16.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 16 model.layers.00017-of-00022.safetensors\n",
+ "16 model.layers.00017-of-00022.safetensors\n",
+ "layer 17 model.layers.17.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 17 expert 0 model.layers.17.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 17 expert 1 model.layers.17.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 17 expert 2 model.layers.17.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 17 expert 3 model.layers.17.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 17 expert 4 model.layers.17.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 17 expert 5 model.layers.17.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 17 expert 6 model.layers.17.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 17 expert 7 model.layers.17.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 17 expert 0 model.layers.17.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 17 expert 1 model.layers.17.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 17 expert 2 model.layers.17.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 17 expert 3 model.layers.17.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 17 expert 4 model.layers.17.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 17 expert 5 model.layers.17.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 17 expert 6 model.layers.17.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 17 expert 7 model.layers.17.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 17 expert 0 model.layers.17.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 17 expert 1 model.layers.17.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 17 expert 2 model.layers.17.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 17 expert 3 model.layers.17.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 17 expert 4 model.layers.17.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 17 expert 5 model.layers.17.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 17 expert 6 model.layers.17.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 17 expert 7 model.layers.17.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 17 model.layers.17.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 17 model.layers.17.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 17 model.layers.17.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 17 model.layers.17.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 17 model.layers.17.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 17 gate model.layers.17.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 17 model.layers.00018-of-00022.safetensors\n",
+ "17 model.layers.00018-of-00022.safetensors\n",
+ "layer 18 model.layers.18.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 18 expert 0 model.layers.18.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 18 expert 1 model.layers.18.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 18 expert 2 model.layers.18.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 18 expert 3 model.layers.18.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 18 expert 4 model.layers.18.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 18 expert 5 model.layers.18.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 18 expert 6 model.layers.18.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 18 expert 7 model.layers.18.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 18 expert 0 model.layers.18.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 18 expert 1 model.layers.18.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 18 expert 2 model.layers.18.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 18 expert 3 model.layers.18.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 18 expert 4 model.layers.18.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 18 expert 5 model.layers.18.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 18 expert 6 model.layers.18.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 18 expert 7 model.layers.18.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 18 expert 0 model.layers.18.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 18 expert 1 model.layers.18.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 18 expert 2 model.layers.18.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 18 expert 3 model.layers.18.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 18 expert 4 model.layers.18.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 18 expert 5 model.layers.18.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 18 expert 6 model.layers.18.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 18 expert 7 model.layers.18.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 18 model.layers.18.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 18 model.layers.18.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 18 model.layers.18.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 18 model.layers.18.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 18 model.layers.18.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 18 gate model.layers.18.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 18 model.layers.00019-of-00022.safetensors\n",
+ "18 model.layers.00019-of-00022.safetensors\n",
+ "layer 19 model.layers.19.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 19 expert 0 model.layers.19.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 19 expert 1 model.layers.19.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 19 expert 2 model.layers.19.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 19 expert 3 model.layers.19.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 19 expert 4 model.layers.19.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 19 expert 5 model.layers.19.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 19 expert 6 model.layers.19.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 19 expert 7 model.layers.19.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 19 expert 0 model.layers.19.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 19 expert 1 model.layers.19.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 19 expert 2 model.layers.19.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 19 expert 3 model.layers.19.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 19 expert 4 model.layers.19.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 19 expert 5 model.layers.19.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 19 expert 6 model.layers.19.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 19 expert 7 model.layers.19.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 19 expert 0 model.layers.19.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 19 expert 1 model.layers.19.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 19 expert 2 model.layers.19.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 19 expert 3 model.layers.19.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 19 expert 4 model.layers.19.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 19 expert 5 model.layers.19.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 19 expert 6 model.layers.19.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 19 expert 7 model.layers.19.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 19 model.layers.19.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 19 model.layers.19.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 19 model.layers.19.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 19 model.layers.19.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 19 model.layers.19.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 19 gate model.layers.19.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 19 model.layers.00020-of-00022.safetensors\n",
+ "19 model.layers.00020-of-00022.safetensors\n",
+ "layer 20 model.layers.20.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 20 expert 0 model.layers.20.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 20 expert 1 model.layers.20.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 20 expert 2 model.layers.20.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 20 expert 3 model.layers.20.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 20 expert 4 model.layers.20.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 20 expert 5 model.layers.20.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 20 expert 6 model.layers.20.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 20 expert 7 model.layers.20.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 20 expert 0 model.layers.20.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 20 expert 1 model.layers.20.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 20 expert 2 model.layers.20.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 20 expert 3 model.layers.20.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 20 expert 4 model.layers.20.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 20 expert 5 model.layers.20.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 20 expert 6 model.layers.20.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 20 expert 7 model.layers.20.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 20 expert 0 model.layers.20.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 20 expert 1 model.layers.20.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 20 expert 2 model.layers.20.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 20 expert 3 model.layers.20.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 20 expert 4 model.layers.20.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 20 expert 5 model.layers.20.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 20 expert 6 model.layers.20.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 20 expert 7 model.layers.20.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 20 model.layers.20.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 20 model.layers.20.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 20 model.layers.20.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 20 model.layers.20.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 20 model.layers.20.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 20 gate model.layers.20.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 20 model.layers.00021-of-00022.safetensors\n",
+ "20 model.layers.00021-of-00022.safetensors\n",
+ "layer 21 model.layers.21.input_layernorm.weight torch.Size([2048])\n",
+ "gen experts\n",
+ "layer 21 expert 0 model.layers.21.block_sparse_moe.experts.0.w2.weight torch.Size([2048, 5632])\n",
+ "layer 21 expert 1 model.layers.21.block_sparse_moe.experts.1.w2.weight torch.Size([2048, 5632])\n",
+ "layer 21 expert 2 model.layers.21.block_sparse_moe.experts.2.w2.weight torch.Size([2048, 5632])\n",
+ "layer 21 expert 3 model.layers.21.block_sparse_moe.experts.3.w2.weight torch.Size([2048, 5632])\n",
+ "layer 21 expert 4 model.layers.21.block_sparse_moe.experts.4.w2.weight torch.Size([2048, 5632])\n",
+ "layer 21 expert 5 model.layers.21.block_sparse_moe.experts.5.w2.weight torch.Size([2048, 5632])\n",
+ "layer 21 expert 6 model.layers.21.block_sparse_moe.experts.6.w2.weight torch.Size([2048, 5632])\n",
+ "layer 21 expert 7 model.layers.21.block_sparse_moe.experts.7.w2.weight torch.Size([2048, 5632])\n",
+ "gen experts\n",
+ "layer 21 expert 0 model.layers.21.block_sparse_moe.experts.0.w1.weight torch.Size([5632, 2048])\n",
+ "layer 21 expert 1 model.layers.21.block_sparse_moe.experts.1.w1.weight torch.Size([5632, 2048])\n",
+ "layer 21 expert 2 model.layers.21.block_sparse_moe.experts.2.w1.weight torch.Size([5632, 2048])\n",
+ "layer 21 expert 3 model.layers.21.block_sparse_moe.experts.3.w1.weight torch.Size([5632, 2048])\n",
+ "layer 21 expert 4 model.layers.21.block_sparse_moe.experts.4.w1.weight torch.Size([5632, 2048])\n",
+ "layer 21 expert 5 model.layers.21.block_sparse_moe.experts.5.w1.weight torch.Size([5632, 2048])\n",
+ "layer 21 expert 6 model.layers.21.block_sparse_moe.experts.6.w1.weight torch.Size([5632, 2048])\n",
+ "layer 21 expert 7 model.layers.21.block_sparse_moe.experts.7.w1.weight torch.Size([5632, 2048])\n",
+ "gen experts\n",
+ "layer 21 expert 0 model.layers.21.block_sparse_moe.experts.0.w3.weight torch.Size([5632, 2048])\n",
+ "layer 21 expert 1 model.layers.21.block_sparse_moe.experts.1.w3.weight torch.Size([5632, 2048])\n",
+ "layer 21 expert 2 model.layers.21.block_sparse_moe.experts.2.w3.weight torch.Size([5632, 2048])\n",
+ "layer 21 expert 3 model.layers.21.block_sparse_moe.experts.3.w3.weight torch.Size([5632, 2048])\n",
+ "layer 21 expert 4 model.layers.21.block_sparse_moe.experts.4.w3.weight torch.Size([5632, 2048])\n",
+ "layer 21 expert 5 model.layers.21.block_sparse_moe.experts.5.w3.weight torch.Size([5632, 2048])\n",
+ "layer 21 expert 6 model.layers.21.block_sparse_moe.experts.6.w3.weight torch.Size([5632, 2048])\n",
+ "layer 21 expert 7 model.layers.21.block_sparse_moe.experts.7.w3.weight torch.Size([5632, 2048])\n",
+ "layer 21 model.layers.21.post_attention_layernorm.weight torch.Size([2048])\n",
+ "layer 21 model.layers.21.self_attn.k_proj.weight torch.Size([256, 2048])\n",
+ "layer 21 model.layers.21.self_attn.o_proj.weight torch.Size([2048, 2048])\n",
+ "layer 21 model.layers.21.self_attn.q_proj.weight torch.Size([2048, 2048])\n",
+ "layer 21 model.layers.21.self_attn.v_proj.weight torch.Size([256, 2048])\n",
+ "layer 21 gate model.layers.21.block_sparse_moe.gate.weight torch.Size([8, 2048])\n",
+ "Save layer weighs 21 model.layers.00022-of-00022.safetensors\n",
+ "21 model.layers.00022-of-00022.safetensors\n",
+ "{'metadata': {'total_size': 12859265024}, 'weight_map': {'lm_head.weight': 'model.layers.00001-of-00022.safetensors', 'model.embed_tokens.weight': 'model.layers.00001-of-00022.safetensors', 'model.norm.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.input_layernorm.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.0.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.1.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.2.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.3.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.4.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.5.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.6.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.7.w2.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.0.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.1.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.2.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.3.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.4.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.5.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.6.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.7.w1.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.1.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.2.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.3.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.4.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.5.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.6.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.experts.7.w3.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.post_attention_layernorm.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.k_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.o_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.q_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.self_attn.v_proj.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.0.block_sparse_moe.gate.weight': 'model.layers.00001-of-00022.safetensors', 'model.layers.1.input_layernorm.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.0.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.1.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.2.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.3.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.4.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.5.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.7.w2.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.0.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.1.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.2.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.3.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.4.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.5.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.6.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.7.w1.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.0.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.1.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.2.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.3.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.4.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.5.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.6.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.experts.7.w3.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.post_attention_layernorm.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.self_attn.k_proj.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.self_attn.o_proj.weight': 'model.layers.00002-of-00022.safetensors', 
'model.layers.1.self_attn.q_proj.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.self_attn.v_proj.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.1.block_sparse_moe.gate.weight': 'model.layers.00002-of-00022.safetensors', 'model.layers.2.input_layernorm.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.0.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.1.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.2.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.3.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.4.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.5.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.6.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.7.w2.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.0.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.1.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.2.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.3.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.4.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.5.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.6.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.7.w1.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.1.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.2.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.3.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.4.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.5.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.6.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.experts.7.w3.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.post_attention_layernorm.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.k_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.o_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.q_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.self_attn.v_proj.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.2.block_sparse_moe.gate.weight': 'model.layers.00003-of-00022.safetensors', 'model.layers.3.input_layernorm.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.0.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.1.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.2.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.3.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.4.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.5.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.7.w2.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.0.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.1.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.2.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.3.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.4.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.5.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.6.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.7.w1.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.0.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.1.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.2.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.3.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.4.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.5.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.6.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.experts.7.w3.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.post_attention_layernorm.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.self_attn.k_proj.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.self_attn.o_proj.weight': 'model.layers.00004-of-00022.safetensors', 
'model.layers.3.self_attn.q_proj.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.self_attn.v_proj.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.3.block_sparse_moe.gate.weight': 'model.layers.00004-of-00022.safetensors', 'model.layers.4.input_layernorm.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.0.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.1.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.2.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.3.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.4.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.5.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.6.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.7.w2.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.0.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.1.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.2.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.3.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.4.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.5.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.6.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.7.w1.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.1.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.2.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.3.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.4.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.5.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.6.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.experts.7.w3.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.post_attention_layernorm.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.k_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.o_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.q_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.self_attn.v_proj.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.4.block_sparse_moe.gate.weight': 'model.layers.00005-of-00022.safetensors', 'model.layers.5.input_layernorm.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.0.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.1.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.2.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.3.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.4.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.5.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.7.w2.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.0.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.1.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.2.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.3.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.4.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.5.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.6.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.7.w1.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.0.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.1.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.2.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.3.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.4.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.5.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.6.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.experts.7.w3.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.post_attention_layernorm.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.self_attn.k_proj.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.self_attn.o_proj.weight': 'model.layers.00006-of-00022.safetensors', 
'model.layers.5.self_attn.q_proj.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.self_attn.v_proj.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.5.block_sparse_moe.gate.weight': 'model.layers.00006-of-00022.safetensors', 'model.layers.6.input_layernorm.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.0.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.1.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.2.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.3.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.4.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.5.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.6.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.7.w2.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.0.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.1.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.2.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.3.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.4.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.5.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.6.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.7.w1.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.1.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.2.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.3.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.4.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.5.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.6.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.experts.7.w3.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.post_attention_layernorm.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.k_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.o_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.q_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.self_attn.v_proj.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.6.block_sparse_moe.gate.weight': 'model.layers.00007-of-00022.safetensors', 'model.layers.7.input_layernorm.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.0.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.1.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.2.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.3.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.4.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.5.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.7.w2.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.0.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.1.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.2.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.3.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.4.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.5.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.6.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.7.w1.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.0.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.1.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.2.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.3.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.4.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.5.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.6.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.experts.7.w3.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.post_attention_layernorm.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.self_attn.k_proj.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.self_attn.o_proj.weight': 'model.layers.00008-of-00022.safetensors', 
'model.layers.7.self_attn.q_proj.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.self_attn.v_proj.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.7.block_sparse_moe.gate.weight': 'model.layers.00008-of-00022.safetensors', 'model.layers.8.input_layernorm.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.0.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.1.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.2.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.3.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.4.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.5.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.6.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.7.w2.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.0.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.1.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.2.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.3.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.4.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.5.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.6.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.7.w1.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.1.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.2.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.3.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.4.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.5.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.6.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.experts.7.w3.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.post_attention_layernorm.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.k_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.o_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.q_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.self_attn.v_proj.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.8.block_sparse_moe.gate.weight': 'model.layers.00009-of-00022.safetensors', 'model.layers.9.input_layernorm.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.0.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.1.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.2.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.3.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.4.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.5.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.7.w2.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.0.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.1.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.2.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.3.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.4.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.5.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.6.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.7.w1.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.0.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.1.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.2.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.3.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.4.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.5.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.6.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.experts.7.w3.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.post_attention_layernorm.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.self_attn.k_proj.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.self_attn.o_proj.weight': 'model.layers.00010-of-00022.safetensors', 
'model.layers.9.self_attn.q_proj.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.self_attn.v_proj.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.9.block_sparse_moe.gate.weight': 'model.layers.00010-of-00022.safetensors', 'model.layers.10.input_layernorm.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.0.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.1.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.2.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.3.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.4.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.5.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.6.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.7.w2.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.0.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.1.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.2.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.3.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.4.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.5.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.6.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.7.w1.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.1.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.2.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.3.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.4.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.5.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.6.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.experts.7.w3.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.post_attention_layernorm.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.k_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.o_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.q_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.self_attn.v_proj.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.10.block_sparse_moe.gate.weight': 'model.layers.00011-of-00022.safetensors', 'model.layers.11.input_layernorm.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.0.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.1.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.2.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.3.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.4.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.5.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.7.w2.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.0.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.1.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.2.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.3.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.4.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.5.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.6.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.7.w1.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.0.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.1.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.2.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.3.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.4.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.5.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.6.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.experts.7.w3.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.post_attention_layernorm.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.self_attn.k_proj.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.self_attn.o_proj.weight': 'model.layers.00012-of-00022.safetensors', 
'model.layers.11.self_attn.q_proj.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.self_attn.v_proj.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.11.block_sparse_moe.gate.weight': 'model.layers.00012-of-00022.safetensors', 'model.layers.12.input_layernorm.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.0.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.1.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.2.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.3.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.4.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.5.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.6.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.7.w2.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.0.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.1.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.2.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.3.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.4.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.5.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.6.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.7.w1.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.1.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.2.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.3.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.4.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.5.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.6.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.experts.7.w3.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.post_attention_layernorm.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.k_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.o_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.q_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.self_attn.v_proj.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.12.block_sparse_moe.gate.weight': 'model.layers.00013-of-00022.safetensors', 'model.layers.13.input_layernorm.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.0.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.1.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.2.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.3.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.4.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.5.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.7.w2.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.0.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.1.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.2.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.3.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.4.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.5.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.6.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.7.w1.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.0.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.1.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.2.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.3.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.4.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.5.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.6.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.experts.7.w3.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.post_attention_layernorm.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.self_attn.k_proj.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.self_attn.o_proj.weight': 'model.layers.00014-of-00022.safetensors', 
'model.layers.13.self_attn.q_proj.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.self_attn.v_proj.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.13.block_sparse_moe.gate.weight': 'model.layers.00014-of-00022.safetensors', 'model.layers.14.input_layernorm.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.0.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.1.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.2.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.3.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.4.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.5.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.6.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.7.w2.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.0.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.1.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.2.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.3.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.4.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.5.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.6.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.7.w1.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.1.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.2.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.3.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.4.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.5.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.6.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.experts.7.w3.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.post_attention_layernorm.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.k_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.o_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.q_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.self_attn.v_proj.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.14.block_sparse_moe.gate.weight': 'model.layers.00015-of-00022.safetensors', 'model.layers.15.input_layernorm.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.0.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.1.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.2.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.3.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.4.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.5.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.7.w2.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.0.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.1.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.2.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.3.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.4.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.5.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.6.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.7.w1.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.0.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.1.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.2.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.3.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.4.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.5.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.6.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.experts.7.w3.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.post_attention_layernorm.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.self_attn.k_proj.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.self_attn.o_proj.weight': 'model.layers.00016-of-00022.safetensors', 
'model.layers.15.self_attn.q_proj.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.self_attn.v_proj.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.15.block_sparse_moe.gate.weight': 'model.layers.00016-of-00022.safetensors', 'model.layers.16.input_layernorm.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.0.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.1.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.2.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.3.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.4.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.5.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.6.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.7.w2.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.0.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.1.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.2.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.3.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.4.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.5.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.6.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.7.w1.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.1.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.2.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.3.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.4.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.5.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.6.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.experts.7.w3.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.post_attention_layernorm.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.k_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.o_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.q_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.self_attn.v_proj.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.16.block_sparse_moe.gate.weight': 'model.layers.00017-of-00022.safetensors', 'model.layers.17.input_layernorm.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.0.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.1.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.2.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.3.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.4.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.5.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.7.w2.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.0.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.1.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.2.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.3.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.4.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.5.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.6.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.7.w1.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.0.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.1.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.2.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.3.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.4.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.5.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.6.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.experts.7.w3.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.post_attention_layernorm.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.self_attn.k_proj.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.self_attn.o_proj.weight': 'model.layers.00018-of-00022.safetensors', 
'model.layers.17.self_attn.q_proj.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.self_attn.v_proj.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.17.block_sparse_moe.gate.weight': 'model.layers.00018-of-00022.safetensors', 'model.layers.18.input_layernorm.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.0.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.1.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.2.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.3.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.4.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.5.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.6.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.7.w2.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.0.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.1.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.2.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.3.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.4.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.5.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.6.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.7.w1.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.1.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.2.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.3.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.4.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.5.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.6.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.experts.7.w3.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.post_attention_layernorm.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.k_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.o_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.q_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.self_attn.v_proj.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.18.block_sparse_moe.gate.weight': 'model.layers.00019-of-00022.safetensors', 'model.layers.19.input_layernorm.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.0.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.1.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.2.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.3.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.4.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.5.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.7.w2.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.0.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.1.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.2.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.3.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.4.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.5.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.6.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.7.w1.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.0.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.1.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.2.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.3.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.4.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.5.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.6.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.experts.7.w3.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.post_attention_layernorm.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.self_attn.k_proj.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.self_attn.o_proj.weight': 'model.layers.00020-of-00022.safetensors', 
'model.layers.19.self_attn.q_proj.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.self_attn.v_proj.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.19.block_sparse_moe.gate.weight': 'model.layers.00020-of-00022.safetensors', 'model.layers.20.input_layernorm.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.0.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.1.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.2.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.3.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.4.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.5.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.6.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.7.w2.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.0.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.1.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.2.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.3.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.4.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.5.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.6.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.7.w1.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.0.w3.weight': 
'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.1.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.2.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.3.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.4.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.5.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.6.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.experts.7.w3.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.post_attention_layernorm.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.k_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.o_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.q_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.self_attn.v_proj.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.20.block_sparse_moe.gate.weight': 'model.layers.00021-of-00022.safetensors', 'model.layers.21.input_layernorm.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.0.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.1.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.2.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.3.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.4.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.5.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.6.w2.weight': 
'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.7.w2.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.0.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.1.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.2.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.3.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.4.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.5.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.6.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.7.w1.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.0.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.1.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.2.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.3.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.4.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.5.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.6.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.experts.7.w3.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.post_attention_layernorm.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.self_attn.k_proj.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.self_attn.o_proj.weight': 'model.layers.00022-of-00022.safetensors', 
'model.layers.21.self_attn.q_proj.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.self_attn.v_proj.weight': 'model.layers.00022-of-00022.safetensors', 'model.layers.21.block_sparse_moe.gate.weight': 'model.layers.00022-of-00022.safetensors'}}\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# check model\n",
+ "mx_model, mx_tok = test_gen(save_mixtral_dir)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000,
+ "referenced_widgets": [
+ "ae1fb4b51ee2457998f8066635edcc14",
+ "347c9d512b454f4781f0dbbfda6c3724",
+ "a458afb6c9b94046bc552a50ada0d867",
+ "f9062f94c8ee41febe6717fcbcb5053f",
+ "a79cec942e034086927cfda8e9d8afca",
+ "f30a519a167249b285fa7cf44e5565bc",
+ "dba40b83726b4fdaa5d5c8cb1c4d3c3b",
+ "65348e69ef7440038f107f5e8efa3b7e",
+ "ddde5ed5da554643aaea09e0bb0b797a",
+ "c138ccb796ff4c1a81ccce900bdec068",
+ "558ca035522c405fba9c7a3f9e8eb135"
+ ]
+ },
+ "id": "DCs_uVxCvCwR",
+ "outputId": "73663da6-0141-48fa-936b-82598865d27a"
+ },
+ "execution_count": 9,
+ "outputs": [
+ {
+ "output_type": "display_data",
+ "data": {
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/22 [00:00, ?it/s]"
+ ],
+ "application/vnd.jupyter.widget-view+json": {
+ "version_major": 2,
+ "version_minor": 0,
+ "model_id": "ae1fb4b51ee2457998f8066635edcc14"
+ }
+ },
+ "metadata": {}
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "check model load \n",
+ "MixtralConfig {\n",
+ " \"_name_or_path\": \"/content/tiny_mixtral_x\",\n",
+ " \"architectures\": [\n",
+ " \"MixtralForCausalLM\"\n",
+ " ],\n",
+ " \"attention_dropout\": 0.0,\n",
+ " \"bos_token_id\": 1,\n",
+ " \"eos_token_id\": 2,\n",
+ " \"hidden_act\": \"silu\",\n",
+ " \"hidden_size\": 2048,\n",
+ " \"initializer_range\": 0.02,\n",
+ " \"intermediate_size\": 5632,\n",
+ " \"max_position_embeddings\": 2048,\n",
+ " \"model_type\": \"mixtral\",\n",
+ " \"num_attention_heads\": 32,\n",
+ " \"num_experts_per_tok\": 2,\n",
+ " \"num_hidden_layers\": 22,\n",
+ " \"num_key_value_heads\": 4,\n",
+ " \"num_local_experts\": 8,\n",
+ " \"output_router_logits\": false,\n",
+ " \"rms_norm_eps\": 1e-05,\n",
+ " \"rope_theta\": 1000000.0,\n",
+ " \"router_aux_loss_coef\": 0.02,\n",
+ " \"sliding_window\": null,\n",
+ " \"tie_word_embeddings\": false,\n",
+ " \"torch_dtype\": \"bfloat16\",\n",
+ " \"transformers_version\": \"4.36.1\",\n",
+ " \"use_cache\": true,\n",
+ " \"vocab_size\": 32000\n",
+ "}\n",
+ "\n",
+ "MixtralForCausalLM(\n",
+ " (model): MixtralModel(\n",
+ " (embed_tokens): Embedding(32000, 2048)\n",
+ " (layers): ModuleList(\n",
+ " (0-21): 22 x MixtralDecoderLayer(\n",
+ " (self_attn): MixtralAttention(\n",
+ " (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
+ " (k_proj): Linear(in_features=2048, out_features=256, bias=False)\n",
+ " (v_proj): Linear(in_features=2048, out_features=256, bias=False)\n",
+ " (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
+ " (rotary_emb): MixtralRotaryEmbedding()\n",
+ " )\n",
+ " (block_sparse_moe): MixtralSparseMoeBlock(\n",
+ " (gate): Linear(in_features=2048, out_features=8, bias=False)\n",
+ " (experts): ModuleList(\n",
+ " (0-7): 8 x MixtralBLockSparseTop2MLP(\n",
+ " (w1): Linear(in_features=2048, out_features=5632, bias=False)\n",
+ " (w2): Linear(in_features=5632, out_features=2048, bias=False)\n",
+ " (w3): Linear(in_features=2048, out_features=5632, bias=False)\n",
+ " (act_fn): SiLU()\n",
+ " )\n",
+ " )\n",
+ " )\n",
+ " (input_layernorm): MixtralRMSNorm()\n",
+ " (post_attention_layernorm): MixtralRMSNorm()\n",
+ " )\n",
+ " )\n",
+ " (norm): MixtralRMSNorm()\n",
+ " )\n",
+ " (lm_head): Linear(in_features=2048, out_features=32000, bias=False)\n",
+ ")\n",
+ "check model generate text\n",
+ " [INST] What is your favourite condiment? [/INST] Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen! [INST] Do you have mayonnaise recipes? [/INST] (https://diet-d.com/recipes/mayonnaise-recipe-recipes-for-chicken-or-fish-on-easy-mayonaise-veggie-eggs-dont-use-sweetness.html) are you thinking of substituting these with a recipe that calls for mayonnaise, though?\n",
+ "cheesecake recipes with a mayonnaise instead of the oil? For how many pounds? May I suggest you substitute the mayonnaise with a cream? (and olive oil instead of soybean oil?\n",
+ "------------------------\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# from google.colab import userdata\n",
+ "# !huggingface-cli login --token {userdata.get('HUGGINGFACE_ACCESS_TOKEN')}"
+ ],
+ "metadata": {
+ "id": "X1jZZ3ggwX9x"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# huggingface_repo = \"TinyMixtral-x8-Clonebase-7b\"\n",
+ "# mx_model.push_to_hub(huggingface_repo, private=True)\n",
+ "# mx_tok.push_to_hub(huggingface_repo, private=True)"
+ ],
+ "metadata": {
+ "id": "asht1d6Fws_P"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file