{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "id": "hS2zWviCGv-j" }, "outputs": [], "source": [ "# Source model on the Hugging Face Hub and local working / output directories\n", "model_name_or_path = \"mistralai/Mixtral-8x7B-Instruct-v0.1\" #@param {type:\"string\"}\n", "\n", "temp_dir = \"/content\" #@param {type:\"string\"}\n", "model_name = model_name_or_path.split(\"/\")[-1]\n", "target_dir = f\"{temp_dir}/{model_name}\"\n", "save_dir = \"/content/mixtral-4x7b\" #@param {type:\"string\"}\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fS0Z2JFPG3w1" }, "outputs": [], "source": [ "!pip install transformers torch safetensors" ] }, { "cell_type": "code", "source": [ "%cd {temp_dir}\n", "save_model_dir = model_name  # model_name is already the bare repo name\n", "!mkdir -p {save_model_dir}\n", "\n", "# Download the config files and all 19 safetensors shards of the original checkpoint\n", "!wget https://huggingface.co/{model_name_or_path}/resolve/main/config.json -O {save_model_dir}/config.json\n", "!wget https://huggingface.co/{model_name_or_path}/resolve/main/model.safetensors.index.json -O {save_model_dir}/model.safetensors.index.json\n", "!wget https://huggingface.co/{model_name_or_path}/resolve/main/generation_config.json -O {save_model_dir}/generation_config.json\n", "\n", "for i in range(1, 20):\n", "    file_count_str = str(i).zfill(5)\n", "    !wget https://huggingface.co/{model_name_or_path}/resolve/main/model-{file_count_str}-of-00019.safetensors?download=true -O {save_model_dir}/model-{file_count_str}-of-00019.safetensors" ], "metadata": { "id": "WwnZPGHATsqv" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GpHX5HoDPCEM", "outputId": "eb9095ad-ffea-4d3d-9b2d-cea1a278b3f9" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "/content\n", "starting layer: 0\n", "Loading Tensors model-00001-of-00019.safetensors\n", "stock model.layers.0.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.0.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.0.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.0.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.0.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.0.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.0.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.0.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.0.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.0.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.0.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.0.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.0.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.0.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.0.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.0.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.0.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.0.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.0.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.0.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.0.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new 
experts model.layers.0.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.0.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.0.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.0.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.0.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.0.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.0.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.0.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.0.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.0.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.0.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.0.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.0.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.0.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.0.block_sparse_moe.experts.7.w3.weight\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.0.block_sparse_moe.gate.weight\n", "model.layers.0.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "model.layers.0.input_layernorm.weight torch.Size([4096])\n", "model.layers.0.post_attention_layernorm.weight torch.Size([4096])\n", "model.layers.0.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.0.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.0.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.0.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00001.safetensors\n", "starting layer: 1\n", "Loading Tensors model-00001-of-00019.safetensors\n", "stock model.layers.1.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.1.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.1.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.1.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.1.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.1.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.1.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.1.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.1.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.1.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.1.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.1.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.1.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.1.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.1.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.1.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.1.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.1.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.1.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.1.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "Loading Tensors 
model-00002-of-00019.safetensors\n", "stock model.layers.1.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.1.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.1.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.1.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.1.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.1.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.1.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.1.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.1.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.1.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.1.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.1.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.1.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.1.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.1.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.1.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00001-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.1.block_sparse_moe.gate.weight\n", "model.layers.1.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00002-of-00019.safetensors\n", "model.layers.1.input_layernorm.weight torch.Size([4096])\n", "model.layers.1.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00001-of-00019.safetensors\n", "model.layers.1.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.1.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.1.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.1.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00002.safetensors\n", "starting layer: 2\n", "Loading Tensors model-00002-of-00019.safetensors\n", "stock model.layers.2.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.2.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.2.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.2.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.2.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.2.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.2.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.2.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.2.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.2.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.2.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.2.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.2.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.2.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.2.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.2.block_sparse_moe.experts.3.w2.weight\n", "new experts 
model.layers.2.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.2.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.2.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.2.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.2.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.2.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.2.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.2.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.2.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.2.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.2.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.2.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.2.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.2.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.2.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.2.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.2.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.2.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.2.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.2.block_sparse_moe.experts.7.w3.weight\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.2.block_sparse_moe.gate.weight\n", "model.layers.2.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "model.layers.2.input_layernorm.weight torch.Size([4096])\n", "model.layers.2.post_attention_layernorm.weight torch.Size([4096])\n", "model.layers.2.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.2.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.2.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.2.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00003.safetensors\n", "starting layer: 3\n", "Loading Tensors model-00002-of-00019.safetensors\n", "stock model.layers.3.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.3.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.3.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.3.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.3.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.3.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.3.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.3.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.3.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.3.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "Loading Tensors model-00003-of-00019.safetensors\n", "stock model.layers.3.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.3.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.3.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.3.block_sparse_moe.experts.3.w1.weight\n", "new experts 
model.layers.3.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.3.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.3.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.3.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.3.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.3.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.3.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.3.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.3.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.3.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.3.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.3.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.3.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.3.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.3.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.3.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.3.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.3.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.3.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.3.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.3.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.3.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00002-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.3.block_sparse_moe.gate.weight\n", "model.layers.3.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00003-of-00019.safetensors\n", "model.layers.3.input_layernorm.weight torch.Size([4096])\n", "model.layers.3.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00002-of-00019.safetensors\n", "model.layers.3.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.3.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.3.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.3.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00004.safetensors\n", "starting layer: 4\n", "Loading Tensors model-00003-of-00019.safetensors\n", "stock model.layers.4.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.4.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.4.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.4.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.4.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.4.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.4.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.4.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.4.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.4.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.4.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock 
model.layers.4.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.4.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.4.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.4.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.4.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.4.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.4.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.4.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.4.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.4.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.4.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.4.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.4.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.4.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.4.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.4.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.4.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.4.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.4.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.4.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.4.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.4.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.4.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.4.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.4.block_sparse_moe.experts.7.w3.weight\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.4.block_sparse_moe.gate.weight\n", "model.layers.4.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "model.layers.4.input_layernorm.weight torch.Size([4096])\n", "model.layers.4.post_attention_layernorm.weight torch.Size([4096])\n", "model.layers.4.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.4.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.4.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.4.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00005.safetensors\n", "starting layer: 5\n", "Loading Tensors model-00004-of-00019.safetensors\n", "stock model.layers.5.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.5.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.5.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.5.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.5.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.5.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.5.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.5.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.5.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.5.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock 
model.layers.5.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.5.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.5.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.5.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.5.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.5.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.5.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.5.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.5.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.5.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.5.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.5.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.5.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.5.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.5.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.5.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.5.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.5.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.5.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.5.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.5.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.5.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.5.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.5.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.5.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.5.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00003-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.5.block_sparse_moe.gate.weight\n", "model.layers.5.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00004-of-00019.safetensors\n", "model.layers.5.input_layernorm.weight torch.Size([4096])\n", "model.layers.5.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00003-of-00019.safetensors\n", "model.layers.5.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.5.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.5.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.5.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00006.safetensors\n", "starting layer: 6\n", "Loading Tensors model-00004-of-00019.safetensors\n", "stock model.layers.6.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.6.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.6.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.6.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.6.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.6.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.6.block_sparse_moe.experts.1.w2.weight\n", "new experts 
model.layers.6.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.6.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.6.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.6.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.6.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.6.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.6.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.6.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.6.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.6.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.6.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.6.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.6.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.6.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.6.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.6.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.6.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.6.block_sparse_moe.experts.5.w2.weight\n", "Loading Tensors model-00005-of-00019.safetensors\n", "new experts model.layers.6.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.6.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.6.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.6.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.6.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.6.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.6.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.6.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.6.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.6.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.6.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00004-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.6.block_sparse_moe.gate.weight\n", "model.layers.6.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00005-of-00019.safetensors\n", "model.layers.6.input_layernorm.weight torch.Size([4096])\n", "model.layers.6.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00004-of-00019.safetensors\n", "model.layers.6.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.6.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.6.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.6.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00007.safetensors\n", "starting layer: 7\n", "Loading Tensors model-00005-of-00019.safetensors\n", "stock model.layers.7.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.7.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.7.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts 
model.layers.7.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.7.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.7.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.7.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.7.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.7.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.7.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.7.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.7.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.7.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.7.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.7.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.7.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.7.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.7.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.7.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.7.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.7.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.7.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.7.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.7.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.7.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.7.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.7.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.7.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.7.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.7.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.7.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.7.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.7.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.7.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.7.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.7.block_sparse_moe.experts.7.w3.weight\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.7.block_sparse_moe.gate.weight\n", "model.layers.7.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "model.layers.7.input_layernorm.weight torch.Size([4096])\n", "model.layers.7.post_attention_layernorm.weight torch.Size([4096])\n", "model.layers.7.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.7.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.7.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.7.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00008.safetensors\n", "starting layer: 8\n", "Loading Tensors model-00005-of-00019.safetensors\n", "stock model.layers.8.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.8.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock 
model.layers.8.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.8.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.8.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.8.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.8.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.8.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.8.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.8.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.8.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.8.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.8.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.8.block_sparse_moe.experts.3.w1.weight\n", "Loading Tensors model-00006-of-00019.safetensors\n", "new experts model.layers.8.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.8.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.8.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.8.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.8.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.8.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.8.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.8.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.8.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.8.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.8.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.8.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.8.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.8.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.8.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.8.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.8.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.8.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.8.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.8.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.8.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.8.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00005-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.8.block_sparse_moe.gate.weight\n", "model.layers.8.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00006-of-00019.safetensors\n", "model.layers.8.input_layernorm.weight torch.Size([4096])\n", "model.layers.8.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00005-of-00019.safetensors\n", "model.layers.8.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.8.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.8.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.8.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors 
/content/mixtral-4x7b/model-00009.safetensors\n", "starting layer: 9\n", "Loading Tensors model-00006-of-00019.safetensors\n", "stock model.layers.9.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.9.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.9.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.9.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.9.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.9.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.9.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.9.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.9.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.9.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.9.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.9.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.9.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.9.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.9.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.9.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.9.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.9.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.9.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.9.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.9.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.9.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.9.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.9.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.9.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.9.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.9.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.9.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.9.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.9.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.9.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.9.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.9.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.9.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.9.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.9.block_sparse_moe.experts.7.w3.weight\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.9.block_sparse_moe.gate.weight\n", "model.layers.9.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "model.layers.9.input_layernorm.weight torch.Size([4096])\n", "model.layers.9.post_attention_layernorm.weight torch.Size([4096])\n", "model.layers.9.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.9.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.9.self_attn.q_proj.weight torch.Size([4096, 4096])\n", 
"model.layers.9.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00010.safetensors\n", "starting layer: 10\n", "Loading Tensors model-00006-of-00019.safetensors\n", "stock model.layers.10.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.10.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.10.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "Loading Tensors model-00007-of-00019.safetensors\n", "new experts model.layers.10.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.10.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.10.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.10.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.10.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.10.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.10.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.10.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.10.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.10.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.10.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.10.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.10.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.10.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.10.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.10.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.10.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.10.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.10.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.10.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.10.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.10.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.10.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.10.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.10.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.10.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.10.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.10.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.10.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.10.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.10.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.10.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.10.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00006-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.10.block_sparse_moe.gate.weight\n", "model.layers.10.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00007-of-00019.safetensors\n", "model.layers.10.input_layernorm.weight torch.Size([4096])\n", 
"model.layers.10.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00006-of-00019.safetensors\n", "model.layers.10.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.10.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.10.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.10.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00011.safetensors\n", "starting layer: 11\n", "Loading Tensors model-00007-of-00019.safetensors\n", "stock model.layers.11.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.11.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.11.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.11.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.11.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.11.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.11.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.11.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.11.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.11.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.11.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.11.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.11.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.11.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.11.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.11.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.11.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.11.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.11.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.11.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.11.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.11.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.11.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.11.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.11.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.11.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.11.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.11.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.11.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "Loading Tensors model-00008-of-00019.safetensors\n", "stock model.layers.11.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.11.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.11.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.11.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.11.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.11.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.11.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors 
model-00007-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.11.block_sparse_moe.gate.weight\n", "model.layers.11.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00008-of-00019.safetensors\n", "model.layers.11.input_layernorm.weight torch.Size([4096])\n", "model.layers.11.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00007-of-00019.safetensors\n", "model.layers.11.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.11.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.11.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.11.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00012.safetensors\n", "starting layer: 12\n", "Loading Tensors model-00008-of-00019.safetensors\n", "stock model.layers.12.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.12.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.12.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.12.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.12.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.12.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.12.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.12.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.12.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.12.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.12.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.12.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.12.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.12.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.12.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.12.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.12.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.12.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.12.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.12.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.12.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.12.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.12.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.12.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.12.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.12.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.12.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.12.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.12.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.12.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.12.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.12.block_sparse_moe.experts.7.w1.weight\n", "new experts 
model.layers.12.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.12.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.12.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.12.block_sparse_moe.experts.7.w3.weight\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.12.block_sparse_moe.gate.weight\n", "model.layers.12.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "model.layers.12.input_layernorm.weight torch.Size([4096])\n", "model.layers.12.post_attention_layernorm.weight torch.Size([4096])\n", "model.layers.12.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.12.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.12.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.12.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00013.safetensors\n", "starting layer: 13\n", "Loading Tensors model-00008-of-00019.safetensors\n", "stock model.layers.13.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.13.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.13.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.13.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.13.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.13.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.13.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.13.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.13.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.13.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.13.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.13.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.13.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.13.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.13.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.13.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.13.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.13.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.13.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "Loading Tensors model-00009-of-00019.safetensors\n", "stock model.layers.13.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.13.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.13.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.13.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.13.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.13.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.13.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.13.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.13.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.13.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.13.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 
4096])\n", "new experts model.layers.13.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.13.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.13.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.13.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.13.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.13.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00008-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.13.block_sparse_moe.gate.weight\n", "model.layers.13.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00009-of-00019.safetensors\n", "model.layers.13.input_layernorm.weight torch.Size([4096])\n", "model.layers.13.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00008-of-00019.safetensors\n", "model.layers.13.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.13.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.13.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.13.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00014.safetensors\n", "starting layer: 14\n", "Loading Tensors model-00009-of-00019.safetensors\n", "stock model.layers.14.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.14.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.14.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.14.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.14.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.14.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.14.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.14.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.14.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.14.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.14.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.14.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.14.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.14.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.14.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.14.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.14.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.14.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.14.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.14.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.14.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.14.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.14.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.14.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.14.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.14.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from 
model.layers.14.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.14.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.14.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.14.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.14.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.14.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.14.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.14.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.14.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.14.block_sparse_moe.experts.7.w3.weight\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.14.block_sparse_moe.gate.weight\n", "model.layers.14.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "model.layers.14.input_layernorm.weight torch.Size([4096])\n", "model.layers.14.post_attention_layernorm.weight torch.Size([4096])\n", "model.layers.14.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.14.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.14.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.14.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00015.safetensors\n", "starting layer: 15\n", "Loading Tensors model-00009-of-00019.safetensors\n", "stock model.layers.15.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.15.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.15.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.15.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.15.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.15.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.15.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.15.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.15.block_sparse_moe.experts.1.w3.weight\n", "Loading Tensors model-00010-of-00019.safetensors\n", "stock model.layers.15.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.15.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.15.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.15.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.15.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.15.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.15.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.15.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.15.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.15.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.15.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.15.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.15.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.15.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.15.block_sparse_moe.experts.2.w2.weight 
torch.Size([4096, 14336]) from model.layers.15.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.15.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.15.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.15.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.15.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.15.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.15.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.15.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.15.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.15.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.15.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.15.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00009-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.15.block_sparse_moe.gate.weight\n", "model.layers.15.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00010-of-00019.safetensors\n", "model.layers.15.input_layernorm.weight torch.Size([4096])\n", "model.layers.15.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00009-of-00019.safetensors\n", "model.layers.15.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.15.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.15.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.15.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00016.safetensors\n", "starting layer: 16\n", "Loading Tensors model-00010-of-00019.safetensors\n", "stock model.layers.16.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.16.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.16.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.16.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.16.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.16.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.16.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.16.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.16.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.16.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.16.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.16.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.16.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.16.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.16.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.16.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.16.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.16.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.16.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.16.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock 
model.layers.16.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.16.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.16.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.16.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.16.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.16.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.16.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.16.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.16.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.16.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.16.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.16.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.16.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.16.block_sparse_moe.experts.7.w2.weight\n", "Loading Tensors model-00011-of-00019.safetensors\n", "new experts model.layers.16.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.16.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00010-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.16.block_sparse_moe.gate.weight\n", "model.layers.16.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00011-of-00019.safetensors\n", "model.layers.16.input_layernorm.weight torch.Size([4096])\n", "model.layers.16.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00010-of-00019.safetensors\n", "model.layers.16.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.16.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.16.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.16.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00017.safetensors\n", "starting layer: 17\n", "Loading Tensors model-00011-of-00019.safetensors\n", "stock model.layers.17.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.17.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.17.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.17.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.17.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.17.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.17.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.17.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.17.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.17.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.17.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.17.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.17.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.17.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.17.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.17.block_sparse_moe.experts.3.w2.weight\n", "new 
experts model.layers.17.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.17.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.17.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.17.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.17.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.17.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.17.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.17.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.17.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.17.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.17.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.17.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.17.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.17.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.17.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.17.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.17.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.17.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.17.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.17.block_sparse_moe.experts.7.w3.weight\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.17.block_sparse_moe.gate.weight\n", "model.layers.17.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "model.layers.17.input_layernorm.weight torch.Size([4096])\n", "model.layers.17.post_attention_layernorm.weight torch.Size([4096])\n", "model.layers.17.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.17.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.17.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.17.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00018.safetensors\n", "starting layer: 18\n", "Loading Tensors model-00011-of-00019.safetensors\n", "stock model.layers.18.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.18.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.18.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.18.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.18.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.18.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.18.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.18.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.18.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.18.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.18.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.18.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.18.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.18.block_sparse_moe.experts.3.w1.weight\n", "new experts 
model.layers.18.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.18.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.18.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.18.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.18.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.18.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.18.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.18.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.18.block_sparse_moe.experts.5.w1.weight\n", "Loading Tensors model-00012-of-00019.safetensors\n", "new experts model.layers.18.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.18.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.18.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.18.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.18.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.18.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.18.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.18.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.18.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.18.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.18.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.18.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.18.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00011-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.18.block_sparse_moe.gate.weight\n", "model.layers.18.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00012-of-00019.safetensors\n", "model.layers.18.input_layernorm.weight torch.Size([4096])\n", "model.layers.18.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00011-of-00019.safetensors\n", "model.layers.18.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.18.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.18.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.18.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00019.safetensors\n", "starting layer: 19\n", "Loading Tensors model-00012-of-00019.safetensors\n", "stock model.layers.19.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.19.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.19.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.19.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.19.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.19.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.19.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.19.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.19.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.19.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock 
model.layers.19.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.19.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.19.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.19.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.19.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.19.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.19.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.19.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.19.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.19.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.19.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.19.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.19.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.19.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.19.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.19.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.19.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.19.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.19.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.19.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.19.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.19.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.19.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.19.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.19.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.19.block_sparse_moe.experts.7.w3.weight\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.19.block_sparse_moe.gate.weight\n", "model.layers.19.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "model.layers.19.input_layernorm.weight torch.Size([4096])\n", "model.layers.19.post_attention_layernorm.weight torch.Size([4096])\n", "model.layers.19.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.19.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.19.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.19.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00020.safetensors\n", "starting layer: 20\n", "Loading Tensors model-00012-of-00019.safetensors\n", "stock model.layers.20.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.20.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.20.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.20.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.20.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.20.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.20.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.20.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from 
model.layers.20.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.20.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.20.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.20.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "Loading Tensors model-00013-of-00019.safetensors\n", "new experts model.layers.20.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.20.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.20.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.20.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.20.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.20.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.20.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.20.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.20.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.20.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.20.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.20.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.20.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.20.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.20.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.20.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.20.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.20.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.20.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.20.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.20.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.20.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.20.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.20.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00012-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.20.block_sparse_moe.gate.weight\n", "model.layers.20.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00013-of-00019.safetensors\n", "model.layers.20.input_layernorm.weight torch.Size([4096])\n", "model.layers.20.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00012-of-00019.safetensors\n", "model.layers.20.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.20.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.20.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.20.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00021.safetensors\n", "starting layer: 21\n", "Loading Tensors model-00013-of-00019.safetensors\n", "stock model.layers.21.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.21.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.21.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.21.block_sparse_moe.experts.0.w1.weight 
torch.Size([14336, 4096]) from model.layers.21.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.21.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.21.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.21.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.21.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.21.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.21.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.21.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.21.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.21.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.21.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.21.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.21.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.21.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.21.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.21.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.21.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.21.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.21.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.21.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.21.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.21.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.21.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.21.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.21.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.21.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.21.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.21.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.21.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.21.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.21.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.21.block_sparse_moe.experts.7.w3.weight\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.21.block_sparse_moe.gate.weight\n", "model.layers.21.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "model.layers.21.input_layernorm.weight torch.Size([4096])\n", "model.layers.21.post_attention_layernorm.weight torch.Size([4096])\n", "model.layers.21.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.21.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.21.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.21.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00022.safetensors\n", "starting layer: 22\n", "Loading Tensors model-00013-of-00019.safetensors\n", "stock model.layers.22.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.22.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "Loading Tensors 
model-00014-of-00019.safetensors\n", "stock model.layers.22.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.22.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.22.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.22.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.22.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.22.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.22.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.22.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.22.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.22.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.22.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.22.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.22.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.22.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.22.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.22.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.22.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.22.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.22.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.22.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.22.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.22.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.22.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.22.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.22.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.22.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.22.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.22.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.22.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.22.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.22.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.22.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.22.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.22.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00013-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.22.block_sparse_moe.gate.weight\n", "model.layers.22.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00014-of-00019.safetensors\n", "model.layers.22.input_layernorm.weight torch.Size([4096])\n", "model.layers.22.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00013-of-00019.safetensors\n", "model.layers.22.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.22.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.22.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.22.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save 
Tensors /content/mixtral-4x7b/model-00023.safetensors\n", "starting layer: 23\n", "Loading Tensors model-00014-of-00019.safetensors\n", "stock model.layers.23.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.23.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.23.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.23.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.23.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.23.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.23.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.23.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.23.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.23.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.23.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.23.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.23.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.23.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.23.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.23.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.23.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.23.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.23.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.23.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.23.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.23.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.23.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.23.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.23.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.23.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.23.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.23.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "Loading Tensors model-00015-of-00019.safetensors\n", "stock model.layers.23.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.23.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.23.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.23.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.23.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.23.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.23.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.23.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00014-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.23.block_sparse_moe.gate.weight\n", "model.layers.23.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00015-of-00019.safetensors\n", "model.layers.23.input_layernorm.weight torch.Size([4096])\n", "model.layers.23.post_attention_layernorm.weight torch.Size([4096])\n", "Loading 
Tensors model-00014-of-00019.safetensors\n", "model.layers.23.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.23.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.23.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.23.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00024.safetensors\n", "starting layer: 24\n", "Loading Tensors model-00015-of-00019.safetensors\n", "stock model.layers.24.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.24.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.24.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.24.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.24.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.24.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.24.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.24.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.24.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.24.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.24.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.24.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.24.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.24.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.24.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.24.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.24.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.24.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.24.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.24.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.24.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.24.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.24.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.24.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.24.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.24.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.24.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.24.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.24.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.24.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.24.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.24.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.24.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.24.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.24.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.24.block_sparse_moe.experts.7.w3.weight\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.24.block_sparse_moe.gate.weight\n", "model.layers.24.block_sparse_moe.gate.weight 
torch.Size([4, 4096])\n", "model.layers.24.input_layernorm.weight torch.Size([4096])\n", "model.layers.24.post_attention_layernorm.weight torch.Size([4096])\n", "model.layers.24.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.24.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.24.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.24.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00025.safetensors\n", "starting layer: 25\n", "Loading Tensors model-00015-of-00019.safetensors\n", "stock model.layers.25.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.25.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.25.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.25.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.25.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.25.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.25.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.25.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.25.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.25.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.25.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.25.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.25.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.25.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.25.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.25.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.25.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.25.block_sparse_moe.experts.3.w3.weight\n", "Loading Tensors model-00016-of-00019.safetensors\n", "stock model.layers.25.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.25.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.25.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.25.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.25.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.25.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.25.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.25.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.25.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.25.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.25.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.25.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.25.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.25.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.25.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.25.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.25.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from 
model.layers.25.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00015-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.25.block_sparse_moe.gate.weight\n", "model.layers.25.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00016-of-00019.safetensors\n", "model.layers.25.input_layernorm.weight torch.Size([4096])\n", "model.layers.25.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00015-of-00019.safetensors\n", "model.layers.25.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.25.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.25.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.25.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00026.safetensors\n", "starting layer: 26\n", "Loading Tensors model-00016-of-00019.safetensors\n", "stock model.layers.26.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.26.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.26.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.26.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.26.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.26.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.26.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.26.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.26.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.26.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.26.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.26.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.26.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.26.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.26.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.26.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.26.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.26.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.26.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.26.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.26.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.26.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.26.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.26.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.26.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.26.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.26.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.26.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.26.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.26.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.26.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from 
model.layers.26.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.26.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.26.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.26.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.26.block_sparse_moe.experts.7.w3.weight\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.26.block_sparse_moe.gate.weight\n", "model.layers.26.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "model.layers.26.input_layernorm.weight torch.Size([4096])\n", "model.layers.26.post_attention_layernorm.weight torch.Size([4096])\n", "model.layers.26.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.26.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.26.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.26.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00027.safetensors\n", "starting layer: 27\n", "Loading Tensors model-00016-of-00019.safetensors\n", "stock model.layers.27.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.27.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.27.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.27.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.27.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.27.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.27.block_sparse_moe.experts.1.w2.weight\n", "Loading Tensors model-00017-of-00019.safetensors\n", "new experts model.layers.27.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.27.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.27.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.27.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.27.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.27.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.27.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.27.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.27.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.27.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.27.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.27.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.27.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.27.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.27.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.27.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.27.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.27.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.27.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.27.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.27.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.27.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock 
model.layers.27.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.27.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.27.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.27.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.27.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.27.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.27.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00016-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.27.block_sparse_moe.gate.weight\n", "model.layers.27.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00017-of-00019.safetensors\n", "model.layers.27.input_layernorm.weight torch.Size([4096])\n", "model.layers.27.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00016-of-00019.safetensors\n", "model.layers.27.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.27.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.27.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.27.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00028.safetensors\n", "starting layer: 28\n", "Loading Tensors model-00017-of-00019.safetensors\n", "stock model.layers.28.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.28.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.28.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.28.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.28.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.28.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.28.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.28.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.28.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.28.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.28.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.28.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.28.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.28.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.28.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.28.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.28.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.28.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.28.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.28.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.28.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.28.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.28.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.28.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.28.block_sparse_moe.experts.5.w2.weight\n", "new experts 
model.layers.28.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.28.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.28.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.28.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.28.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.28.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.28.block_sparse_moe.experts.7.w1.weight\n", "Loading Tensors model-00018-of-00019.safetensors\n", "new experts model.layers.28.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.28.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.28.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.28.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00017-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.28.block_sparse_moe.gate.weight\n", "model.layers.28.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00018-of-00019.safetensors\n", "model.layers.28.input_layernorm.weight torch.Size([4096])\n", "model.layers.28.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00017-of-00019.safetensors\n", "model.layers.28.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.28.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.28.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.28.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00029.safetensors\n", "starting layer: 29\n", "Loading Tensors model-00018-of-00019.safetensors\n", "stock model.layers.29.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.29.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.29.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.29.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.29.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.29.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.29.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.29.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.29.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.29.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.29.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.29.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.29.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.29.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.29.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.29.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.29.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.29.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.29.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.29.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.29.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 
4096])\n", "new experts model.layers.29.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.29.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.29.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.29.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.29.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.29.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.29.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.29.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.29.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.29.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.29.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.29.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.29.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.29.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.29.block_sparse_moe.experts.7.w3.weight\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.29.block_sparse_moe.gate.weight\n", "model.layers.29.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "model.layers.29.input_layernorm.weight torch.Size([4096])\n", "model.layers.29.post_attention_layernorm.weight torch.Size([4096])\n", "model.layers.29.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.29.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.29.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.29.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00030.safetensors\n", "starting layer: 30\n", "Loading Tensors model-00018-of-00019.safetensors\n", "stock model.layers.30.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.30.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.30.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.30.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.30.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.30.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.30.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.30.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.30.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.30.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.30.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.30.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.30.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.30.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.30.block_sparse_moe.experts.1.w2.weight torch.Size([4096, 14336]) from model.layers.30.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.30.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.30.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.30.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock 
model.layers.30.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.30.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "Loading Tensors model-00019-of-00019.safetensors\n", "new experts model.layers.30.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.30.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.30.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.30.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.30.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.30.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.30.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.30.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.30.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.30.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.30.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.30.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.30.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.30.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.30.block_sparse_moe.experts.7.w3.weight\n", "Loading Tensors model-00018-of-00019.safetensors\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.30.block_sparse_moe.gate.weight\n", "model.layers.30.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "Loading Tensors model-00019-of-00019.safetensors\n", "model.layers.30.input_layernorm.weight torch.Size([4096])\n", "model.layers.30.post_attention_layernorm.weight torch.Size([4096])\n", "Loading Tensors model-00018-of-00019.safetensors\n", "model.layers.30.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.30.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.30.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.30.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00031.safetensors\n", "starting layer: 31\n", "Loading Tensors model-00019-of-00019.safetensors\n", "stock model.layers.31.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.31.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.31.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.31.block_sparse_moe.experts.0.w1.weight torch.Size([14336, 4096]) from model.layers.31.block_sparse_moe.experts.1.w1.weight\n", "new experts model.layers.31.block_sparse_moe.experts.0.w2.weight torch.Size([4096, 14336]) from model.layers.31.block_sparse_moe.experts.1.w2.weight\n", "new experts model.layers.31.block_sparse_moe.experts.0.w3.weight torch.Size([14336, 4096]) from model.layers.31.block_sparse_moe.experts.1.w3.weight\n", "stock model.layers.31.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.31.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.31.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.31.block_sparse_moe.experts.1.w1.weight torch.Size([14336, 4096]) from model.layers.31.block_sparse_moe.experts.3.w1.weight\n", "new experts model.layers.31.block_sparse_moe.experts.1.w2.weight 
torch.Size([4096, 14336]) from model.layers.31.block_sparse_moe.experts.3.w2.weight\n", "new experts model.layers.31.block_sparse_moe.experts.1.w3.weight torch.Size([14336, 4096]) from model.layers.31.block_sparse_moe.experts.3.w3.weight\n", "stock model.layers.31.block_sparse_moe.experts.4.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.31.block_sparse_moe.experts.4.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.31.block_sparse_moe.experts.4.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.31.block_sparse_moe.experts.2.w1.weight torch.Size([14336, 4096]) from model.layers.31.block_sparse_moe.experts.5.w1.weight\n", "new experts model.layers.31.block_sparse_moe.experts.2.w2.weight torch.Size([4096, 14336]) from model.layers.31.block_sparse_moe.experts.5.w2.weight\n", "new experts model.layers.31.block_sparse_moe.experts.2.w3.weight torch.Size([14336, 4096]) from model.layers.31.block_sparse_moe.experts.5.w3.weight\n", "stock model.layers.31.block_sparse_moe.experts.6.w1.weight torch.Size([14336, 4096])\n", "stock model.layers.31.block_sparse_moe.experts.6.w2.weight torch.Size([4096, 14336])\n", "stock model.layers.31.block_sparse_moe.experts.6.w3.weight torch.Size([14336, 4096])\n", "new experts model.layers.31.block_sparse_moe.experts.3.w1.weight torch.Size([14336, 4096]) from model.layers.31.block_sparse_moe.experts.7.w1.weight\n", "new experts model.layers.31.block_sparse_moe.experts.3.w2.weight torch.Size([4096, 14336]) from model.layers.31.block_sparse_moe.experts.7.w2.weight\n", "new experts model.layers.31.block_sparse_moe.experts.3.w3.weight torch.Size([14336, 4096]) from model.layers.31.block_sparse_moe.experts.7.w3.weight\n", "reshape torch.Size([8, 4096]) -> view(4, 2, 4096) -> (4, 4098) model.layers.31.block_sparse_moe.gate.weight\n", "model.layers.31.block_sparse_moe.gate.weight torch.Size([4, 4096])\n", "model.layers.31.input_layernorm.weight torch.Size([4096])\n", "model.layers.31.post_attention_layernorm.weight torch.Size([4096])\n", "model.layers.31.self_attn.k_proj.weight torch.Size([1024, 4096])\n", "model.layers.31.self_attn.o_proj.weight torch.Size([4096, 4096])\n", "model.layers.31.self_attn.q_proj.weight torch.Size([4096, 4096])\n", "model.layers.31.self_attn.v_proj.weight torch.Size([1024, 4096])\n", "Save Tensors /content/mixtral-4x7b/model-00032.safetensors\n", "Done.\n" ] } ], "source": [ "%cd {temp_dir}\n", "\n", "import json\n", "import re\n", "import torch\n", "from safetensors import safe_open\n", "from safetensors.torch import save_file\n", "\n", "# model-00001-of-00019.safetensors\n", "# model.safetensors.index.json\n", "\n", "# save tokenizer\n", "from transformers import AutoTokenizer\n", "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n", "tokenizer.save_pretrained(save_dir)\n", "\n", "# save config\n", "config_path = f\"{target_dir}/config.json\"\n", "config = None\n", "with open(config_path, \"r\") as f:\n", " config = json.load(f)\n", " config[\"num_experts_per_tok\"] = 2\n", " config[\"num_local_experts\"] = 4\n", "\n", "# save config\n", "with open(f\"{save_dir}/config.json\", \"w\") as f:\n", " json.dump(config, f, indent=2)\n", "\n", "\n", "# weight\n", "weight_map = {}\n", "first_weights = [\"lm_head.weight\", \"model.embed_tokens.weight\", \"model.norm.weight\"]\n", "\n", "# load weight map\n", "bin_index_path = f\"{target_dir}/model.safetensors.index.json\"\n", "with open(bin_index_path, \"r\") as f:\n", " weight_map = json.load(f)[\"weight_map\"]\n", "\n", "def tensor_load(file_name, 
map_location=None):\n",
"    # read every tensor from a .safetensors shard into a dict\n",
"    tensors = {}\n",
"    device = map_location if map_location is not None else \"cpu\"\n",
"    with safe_open(file_name, framework=\"pt\", device=device) as f:\n",
"        for k in f.keys():\n",
"            tensors[k] = f.get_tensor(k)\n",
"    return tensors\n",
"\n",
"def get_weight_byte_size(weight):\n",
"    # size in bytes of a tensor (or of all parameters of a module)\n",
"    if isinstance(weight, torch.Tensor):\n",
"        weight_byte_size = weight.nelement() * weight.element_size()\n",
"    else:\n",
"        weight_byte_size = sum(p.nelement() * p.element_size() for p in weight.parameters())\n",
"\n",
"    return weight_byte_size\n",
"\n",
"# group the per-layer weight keys by layer number\n",
"layers = {}\n",
"for key in weight_map.keys():\n",
"    if key in first_weights:\n",
"        continue\n",
"\n",
"    # if the key matches \"model.layers.<n>.\", file it under layer <n>\n",
"    match = re.match(r\"model\\.layers\\.[0-9]+\\.\", key)\n",
"    if match:\n",
"        layer_no = re.findall(r\"\\d+\", match[0])[0]\n",
"        if layer_no not in layers.keys():\n",
"            layers[layer_no] = []\n",
"\n",
"        layers[layer_no].append({ \"key\": key, \"file_name\": weight_map[key] })\n",
"\n",
"# new weight_map index\n",
"new_weight_map = {\n",
"    \"metadata\": {\n",
"        \"total_size\": 0\n",
"    },\n",
"    \"weight_map\": {\n",
"    }\n",
"}\n",
"\n",
"# load tensors\n",
"total_size = 0\n",
"tensor_weights = {}\n",
"tensors = {}\n",
"current_file_name = \"\"\n",
"\n",
"file_count = 0\n",
"file_count_str = str(file_count).zfill(5)\n",
"\n",
"# copy the embedding, final norm and lm_head weights unchanged into shard 00000\n",
"for key in first_weights:\n",
"    file_name = weight_map[key]\n",
"    if current_file_name != file_name:\n",
"        # load safetensor\n",
"        tensors = tensor_load(f\"{target_dir}/{file_name}\", map_location=\"cpu\")\n",
"        current_file_name = file_name\n",
"\n",
"    tensor_weights[key] = tensors[key]\n",
"    new_weight_map[\"weight_map\"][key] = f\"model-{file_count_str}.safetensors\"\n",
"\n",
"    # add weight size\n",
"    total_size += get_weight_byte_size(tensor_weights[key])\n",
"\n",
"# save tensor\n",
"save_file(tensor_weights, f\"{save_dir}/model-{file_count_str}.safetensors\", metadata={\"format\":\"pt\"})\n",
"file_count += 1\n",
"\n",
"layer_keys = sorted([int(k) for k in layers.keys()])\n",
"\n",
"for layer_no in layer_keys:\n",
"    print(\"starting layer:\", layer_no)\n",
"    file_count_str = str(file_count).zfill(5)\n",
"    tensor_weights = {}\n",
"\n",
"    stock_expert_weights = {}\n",
"\n",
"    current_file_name = \"\"\n",
"    for info in layers[str(layer_no)]:\n",
"        file_name = info[\"file_name\"]\n",
"        if current_file_name != file_name:\n",
"            print(\"Loading Tensors \", file_name)\n",
"            tensors = tensor_load(f\"{target_dir}/{file_name}\", map_location=\"cpu\")\n",
"            current_file_name = file_name\n",
"\n",
"        layer_key = info[\"key\"]\n",
"        layer_weights = tensors[layer_key]\n",
"\n",
"        if 'experts' in layer_key:\n",
"            # experts are merged in pairs: (0,1) -> 0, (2,3) -> 1, (4,5) -> 2, (6,7) -> 3\n",
"            lk = re.findall(r\"block_sparse_moe[.]experts[.][0-9]+[.]w\", layer_key)[0]\n",
"            exp_index = int(re.findall(r\"\\d+\", lk)[0])\n",
"\n",
"            if exp_index % 2 == 0:\n",
"                # even expert: stash it until its odd partner is loaded\n",
"                paired_index = exp_index + 1\n",
"                paired_key = layer_key.replace(f'block_sparse_moe.experts.{exp_index}.', f'block_sparse_moe.experts.{paired_index}.')\n",
"                stock_expert_weights[paired_key] = layer_weights\n",
"                print(\"stock\", layer_key, layer_weights.shape)\n",
"                continue\n",
"\n",
"            elif exp_index % 2 == 1:\n",
"                new_layer_key = re.sub(r\"block_sparse_moe\\.experts\\.\\d+\\.w\", f\"block_sparse_moe.experts.{exp_index//2}.w\", layer_key)\n",
"\n",
"                # merge experts: element-wise average of the stored even expert and this odd expert\n",
"                tensor_weights[new_layer_key] = (stock_expert_weights[layer_key] + layer_weights) / 2\n",
"\n",
"                # add weight size\n",
"                total_size += get_weight_byte_size(tensor_weights[new_layer_key])\n",
"\n",
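"                # Added sanity check (not part of the original script): averaging the\n",
"                # paired experts must preserve the shape of the source expert weights.\n",
"                assert tensor_weights[new_layer_key].shape == layer_weights.shape\n",
"\n",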
new_weight_map[\"weight_map\"][new_layer_key] = f\"model-{file_count_str}.safetensors\"\n", " print(\"new experts\", new_layer_key, tensor_weights[new_layer_key].shape, \"from\", layer_key)\n", "\n", " elif 'gate' in layer_key:\n", " print(\"reshape\", layer_weights.shape, \"-> view(4, 2, 4096) -> (4, 4098)\", layer_key)\n", "\n", " # calc gate avarage\n", " weights_reshaped = layer_weights.view(4, 2, 4096)\n", " tensor_weights[layer_key] = torch.mean(weights_reshaped, dim=1)\n", "\n", " # add weight size\n", " total_size += get_weight_byte_size(tensor_weights[layer_key])\n", "\n", " new_weight_map[\"weight_map\"][layer_key] = f\"model-{file_count_str}.safetensors\"\n", " print(layer_key, tensor_weights[layer_key].shape)\n", "\n", " else:\n", " tensor_weights[layer_key] = layer_weights\n", "\n", " # add weight size\n", " total_size += get_weight_byte_size(tensor_weights[layer_key])\n", "\n", " new_weight_map[\"weight_map\"][layer_key] = f\"model-{file_count_str}.safetensors\"\n", " print(layer_key, tensor_weights[layer_key].shape)\n", "\n", " # save tensor\n", " save_file(tensor_weights, f\"{save_dir}/model-{file_count_str}.safetensors\", metadata={\"format\":\"pt\"})\n", " print(\"Save Tensors \", f\"{save_dir}/model-{file_count_str}.safetensors\")\n", " file_count += 1\n", "\n", "# save new_weight_map\n", "new_weight_map[\"metadata\"][\"total_size\"] = total_size\n", "with open(f\"{save_dir}/model.safetensors.index.json\", \"w\") as f:\n", " json.dump(new_weight_map, f, indent=2)\n", "\n", "print(\"Done.\")\n" ] } ], "metadata": { "colab": { "machine_shape": "hm", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }