diff --git "a/notebook/chat_vector.ipynb" "b/notebook/chat_vector.ipynb" new file mode 100644--- /dev/null +++ "b/notebook/chat_vector.ipynb" @@ -0,0 +1,1807 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/mmnga\n", + "/home/mmnga/hdd/llm-data\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/mmnga/.local/lib/python3.10/site-packages/IPython/core/magics/osm.py:417: UserWarning: This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.\n", + " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n" + ] + } + ], + "source": [ + "%cd /home/mmnga\n", + "!source .venv/bin/activate\n", + "%cd /home/mmnga/hdd/llm-data/" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -Uqq torch safetensors transformers " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# chat-vectorの作成\n", + "model_name_or_path_base = \"/home/mmnga/hdd/llm-data/Meta-Llama-3-8B-Instruct\"\n", + "model_name_or_path_target = \"/home/mmnga/hdd/llm-data/suzume-llama-3-8B-japanese\"\n", + "save_vector_name_or_path = \"/home/mmnga/hdd/llm-data/llama-3-8B-chat-vector_2\"\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "from safetensors import safe_open\n", + "from safetensors.torch import save_file\n", + "\n", + "import pathlib, os, json, transformers\n", + "from string import Template\n", + "import torch\n", + "\n", + "class ChatVectorManager:\n", + " def __init__(self, model_name_or_path_base, model_name_or_path_target, save_vector_name_or_path):\n", + " self.model_name_or_path_base = model_name_or_path_base\n", + " self.model_name_or_path_target = model_name_or_path_target\n", + " self.save_vector_name_or_path = save_vector_name_or_path\n", + " self.config_base = {}\n", + " self.config_target = {}\n", + " self.n_layers = 0\n", + " self.layer_weight_templates = []\n", + " self.base_weight_map = {}\n", + " self.target_weight_map = {}\n", + " self.current_weight_file_base = \"\"\n", + " self.current_weight_file_target = \"\"\n", + " self.base_weights = {}\n", + " self.target_weights = {}\n", + " self.save_weights = {}\n", + " self.save_index_weights = {}\n", + " self.save_size = 0\n", + " self.save_total_size = 0\n", + " self.save_byte_size = int(4.9 * 1024 * 1024 * 1024)\n", + " self.save_counter = 0\n", + "\n", + " def load_setitngs(self, layer_num_config_name):\n", + "\n", + " # load config\n", + " with open(self.model_name_or_path_base + \"/config.json\", \"r\") as f:\n", + " self.config_base = json.load(f)\n", + "\n", + " with open(self.model_name_or_path_target + \"/config.json\", \"r\") as f:\n", + " self.config_target = json.load(f)\n", + "\n", + " # load weight map\n", + " with open(self.model_name_or_path_base + \"/model.safetensors.index.json\", \"r\") as f:\n", + " self.base_weight_map = json.load(f)[\"weight_map\"]\n", + "\n", + " with open(self.model_name_or_path_target + \"/model.safetensors.index.json\", \"r\") as f:\n", + " self.target_weight_map = json.load(f)[\"weight_map\"]\n", + "\n", + " self.n_layers = int(self.config_base[layer_num_config_name])\n", + " \n", + "\n", + " def add_layer_weight_template_name(self, weight_template_name):\n", + " 
self.layer_weight_templates.append(Template(weight_template_name))\n", + "\n", + " def load_base_weight(self, weight_name):\n", + " if self.current_weight_file_base == self.base_weight_map[weight_name]:\n", + " return\n", + " else:\n", + " file_name = self.base_weight_map[weight_name]\n", + " self.base_weights = safe_open(f\"{self.model_name_or_path_base}/{file_name}\", framework=\"pt\")\n", + " self.current_weight_file_base = file_name\n", + "\n", + " def load_target_weight(self, weight_name):\n", + " if self.current_weight_file_target == self.target_weight_map[weight_name]:\n", + " return\n", + " else:\n", + " file_name = self.target_weight_map[weight_name]\n", + " self.target_weights = safe_open(f\"{self.model_name_or_path_target}/{file_name}\", framework=\"pt\")\n", + " self.current_weight_file_target = file_name\n", + "\n", + " def layer_weight_iter(self):\n", + " for i in range(self.n_layers):\n", + " base_layer_weights = {}\n", + " target_layer_weights = {}\n", + " for t in self.layer_weight_templates:\n", + " weight_name = t.substitute(i=i)\n", + " self.load_base_weight(weight_name)\n", + " self.load_target_weight(weight_name)\n", + " base_layer_weights[weight_name] = self.base_weights.get_tensor(weight_name)\n", + " target_layer_weights[weight_name] = self.target_weights.get_tensor(weight_name)\n", + "\n", + " yield i, weight_name, base_layer_weights[weight_name], target_layer_weights[weight_name]\n", + "\n", + " def get_weight_byte_size(self, weight):\n", + "\n", + " if isinstance(weight, torch.Tensor):\n", + " weight_byte_size = weight.nelement() * weight.element_size()\n", + " else:\n", + " weight_byte_size = sum(p.nelement() * p.element_size() for p in weight.parameters())\n", + "\n", + " return weight_byte_size\n", + "\n", + "\n", + " def save_weights_split(self):\n", + " if len(self.save_weights.keys()) == 0:\n", + " return \n", + "\n", + " file_name = f\"{self.save_vector_name_or_path}/model-{self.save_counter:05}.safetensors\"\n", + "\n", + " for weight_name in self.save_weights.keys():\n", + " self.save_index_weights[weight_name] = file_name.split(\"/\")[-1]\n", + "\n", + " save_file(self.save_weights, file_name, metadata={\"format\":\"pt\"})\n", + " self.save_size = 0\n", + " self.save_counter += 1\n", + " self.save_weights = {}\n", + " print(f\"save: {file_name}\")\n", + "\n", + " def push_weight(self, weight_name, weight):\n", + " weight_size = self.get_weight_byte_size(weight)\n", + " self.save_weights[weight_name] = weight\n", + " self.save_size += weight_size\n", + " self.save_total_size += weight_size\n", + "\n", + " print(f\"vector: {weight_name} {weight_size}\")\n", + " if self.save_size > self.save_byte_size:\n", + " self.save_weights_split()\n", + " \n", + " def save_weight_map(self):\n", + " new_weight_map = {\n", + " \"metadata\": {\n", + " \"total_size\": self.save_total_size\n", + " },\n", + " \"weight_map\": self.save_index_weights\n", + " }\n", + " with open(f\"{self.save_vector_name_or_path}/model.safetensors.index.json\", \"w\") as f:\n", + " json.dump(new_weight_map, f, indent=4)\n", + "\n", + " print(\"make model.safetensors.index.json\")\n", + "\n", + " def save_config(self):\n", + " with open(f\"{self.save_vector_name_or_path}/config.json\", \"w\") as f:\n", + " json.dump(self.config_target, f, indent=4)\n", + "\n", + " def make_vector(self):\n", + "\n", + " os.makedirs(self.save_vector_name_or_path, exist_ok=True)\n", + "\n", + " # 数値が含まれないweight\n", + " for weight_name in [k for k in self.target_weight_map if not any(c.isdigit() for c in k)]:\n", 
+ " self.load_base_weight(weight_name)\n", + " self.load_target_weight(weight_name)\n", + " base_weight = self.base_weights.get_tensor(weight_name)\n", + " target_weight = self.target_weights.get_tensor(weight_name)\n", + " diff = target_weight - base_weight\n", + " self.push_weight(weight_name, diff)\n", + "\n", + " for i, weight_name, base_weight, target_weight in self.layer_weight_iter():\n", + " diff = target_weight - base_weight\n", + " self.push_weight(weight_name, diff)\n", + "\n", + " self.save_weights_split()\n", + " self.save_weight_map()\n", + " self.save_config()\n", + " print(\"Done!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "vector: lm_head.weight 1050673152\n", + "vector: model.embed_tokens.weight 1050673152\n", + "vector: model.norm.weight 8192\n", + "vector: model.layers.0.input_layernorm.weight 8192\n", + "vector: model.layers.0.mlp.down_proj.weight 117440512\n", + "vector: model.layers.0.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.0.mlp.up_proj.weight 117440512\n", + "vector: model.layers.0.post_attention_layernorm.weight 8192\n", + "vector: model.layers.0.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.0.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.0.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.0.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.1.input_layernorm.weight 8192\n", + "vector: model.layers.1.mlp.down_proj.weight 117440512\n", + "vector: model.layers.1.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.1.mlp.up_proj.weight 117440512\n", + "vector: model.layers.1.post_attention_layernorm.weight 8192\n", + "vector: model.layers.1.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.1.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.1.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.1.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.2.input_layernorm.weight 8192\n", + "vector: model.layers.2.mlp.down_proj.weight 117440512\n", + "vector: model.layers.2.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.2.mlp.up_proj.weight 117440512\n", + "vector: model.layers.2.post_attention_layernorm.weight 8192\n", + "vector: model.layers.2.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.2.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.2.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.2.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.3.input_layernorm.weight 8192\n", + "vector: model.layers.3.mlp.down_proj.weight 117440512\n", + "vector: model.layers.3.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.3.mlp.up_proj.weight 117440512\n", + "vector: model.layers.3.post_attention_layernorm.weight 8192\n", + "vector: model.layers.3.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.3.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.3.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.3.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.4.input_layernorm.weight 8192\n", + "vector: model.layers.4.mlp.down_proj.weight 117440512\n", + "vector: model.layers.4.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.4.mlp.up_proj.weight 117440512\n", + "vector: model.layers.4.post_attention_layernorm.weight 8192\n", + "vector: model.layers.4.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.4.self_attn.o_proj.weight 33554432\n", + 
"vector: model.layers.4.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.4.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.5.input_layernorm.weight 8192\n", + "vector: model.layers.5.mlp.down_proj.weight 117440512\n", + "vector: model.layers.5.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.5.mlp.up_proj.weight 117440512\n", + "vector: model.layers.5.post_attention_layernorm.weight 8192\n", + "vector: model.layers.5.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.5.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.5.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.5.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.6.input_layernorm.weight 8192\n", + "vector: model.layers.6.mlp.down_proj.weight 117440512\n", + "vector: model.layers.6.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.6.mlp.up_proj.weight 117440512\n", + "vector: model.layers.6.post_attention_layernorm.weight 8192\n", + "vector: model.layers.6.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.6.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.6.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.6.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.7.input_layernorm.weight 8192\n", + "vector: model.layers.7.mlp.down_proj.weight 117440512\n", + "save: /home/mmnga/hdd/llm-data/llama-3-8B-chat-vector_2/model-00000.safetensors\n", + "vector: model.layers.7.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.7.mlp.up_proj.weight 117440512\n", + "vector: model.layers.7.post_attention_layernorm.weight 8192\n", + "vector: model.layers.7.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.7.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.7.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.7.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.8.input_layernorm.weight 8192\n", + "vector: model.layers.8.mlp.down_proj.weight 117440512\n", + "vector: model.layers.8.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.8.mlp.up_proj.weight 117440512\n", + "vector: model.layers.8.post_attention_layernorm.weight 8192\n", + "vector: model.layers.8.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.8.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.8.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.8.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.9.input_layernorm.weight 8192\n", + "vector: model.layers.9.mlp.down_proj.weight 117440512\n", + "vector: model.layers.9.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.9.mlp.up_proj.weight 117440512\n", + "vector: model.layers.9.post_attention_layernorm.weight 8192\n", + "vector: model.layers.9.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.9.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.9.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.9.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.10.input_layernorm.weight 8192\n", + "vector: model.layers.10.mlp.down_proj.weight 117440512\n", + "vector: model.layers.10.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.10.mlp.up_proj.weight 117440512\n", + "vector: model.layers.10.post_attention_layernorm.weight 8192\n", + "vector: model.layers.10.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.10.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.10.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.10.self_attn.v_proj.weight 8388608\n", 
+ "vector: model.layers.11.input_layernorm.weight 8192\n", + "vector: model.layers.11.mlp.down_proj.weight 117440512\n", + "vector: model.layers.11.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.11.mlp.up_proj.weight 117440512\n", + "vector: model.layers.11.post_attention_layernorm.weight 8192\n", + "vector: model.layers.11.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.11.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.11.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.11.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.12.input_layernorm.weight 8192\n", + "vector: model.layers.12.mlp.down_proj.weight 117440512\n", + "vector: model.layers.12.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.12.mlp.up_proj.weight 117440512\n", + "vector: model.layers.12.post_attention_layernorm.weight 8192\n", + "vector: model.layers.12.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.12.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.12.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.12.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.13.input_layernorm.weight 8192\n", + "vector: model.layers.13.mlp.down_proj.weight 117440512\n", + "vector: model.layers.13.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.13.mlp.up_proj.weight 117440512\n", + "vector: model.layers.13.post_attention_layernorm.weight 8192\n", + "vector: model.layers.13.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.13.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.13.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.13.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.14.input_layernorm.weight 8192\n", + "vector: model.layers.14.mlp.down_proj.weight 117440512\n", + "vector: model.layers.14.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.14.mlp.up_proj.weight 117440512\n", + "vector: model.layers.14.post_attention_layernorm.weight 8192\n", + "vector: model.layers.14.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.14.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.14.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.14.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.15.input_layernorm.weight 8192\n", + "vector: model.layers.15.mlp.down_proj.weight 117440512\n", + "vector: model.layers.15.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.15.mlp.up_proj.weight 117440512\n", + "vector: model.layers.15.post_attention_layernorm.weight 8192\n", + "vector: model.layers.15.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.15.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.15.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.15.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.16.input_layernorm.weight 8192\n", + "vector: model.layers.16.mlp.down_proj.weight 117440512\n", + "vector: model.layers.16.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.16.mlp.up_proj.weight 117440512\n", + "vector: model.layers.16.post_attention_layernorm.weight 8192\n", + "vector: model.layers.16.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.16.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.16.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.16.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.17.input_layernorm.weight 8192\n", + "vector: model.layers.17.mlp.down_proj.weight 117440512\n", + "vector: 
model.layers.17.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.17.mlp.up_proj.weight 117440512\n", + "vector: model.layers.17.post_attention_layernorm.weight 8192\n", + "vector: model.layers.17.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.17.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.17.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.17.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.18.input_layernorm.weight 8192\n", + "vector: model.layers.18.mlp.down_proj.weight 117440512\n", + "vector: model.layers.18.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.18.mlp.up_proj.weight 117440512\n", + "vector: model.layers.18.post_attention_layernorm.weight 8192\n", + "vector: model.layers.18.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.18.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.18.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.18.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.19.input_layernorm.weight 8192\n", + "vector: model.layers.19.mlp.down_proj.weight 117440512\n", + "vector: model.layers.19.mlp.gate_proj.weight 117440512\n", + "save: /home/mmnga/hdd/llm-data/llama-3-8B-chat-vector_2/model-00001.safetensors\n", + "vector: model.layers.19.mlp.up_proj.weight 117440512\n", + "vector: model.layers.19.post_attention_layernorm.weight 8192\n", + "vector: model.layers.19.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.19.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.19.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.19.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.20.input_layernorm.weight 8192\n", + "vector: model.layers.20.mlp.down_proj.weight 117440512\n", + "vector: model.layers.20.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.20.mlp.up_proj.weight 117440512\n", + "vector: model.layers.20.post_attention_layernorm.weight 8192\n", + "vector: model.layers.20.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.20.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.20.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.20.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.21.input_layernorm.weight 8192\n", + "vector: model.layers.21.mlp.down_proj.weight 117440512\n", + "vector: model.layers.21.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.21.mlp.up_proj.weight 117440512\n", + "vector: model.layers.21.post_attention_layernorm.weight 8192\n", + "vector: model.layers.21.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.21.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.21.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.21.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.22.input_layernorm.weight 8192\n", + "vector: model.layers.22.mlp.down_proj.weight 117440512\n", + "vector: model.layers.22.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.22.mlp.up_proj.weight 117440512\n", + "vector: model.layers.22.post_attention_layernorm.weight 8192\n", + "vector: model.layers.22.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.22.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.22.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.22.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.23.input_layernorm.weight 8192\n", + "vector: model.layers.23.mlp.down_proj.weight 117440512\n", + "vector: model.layers.23.mlp.gate_proj.weight 117440512\n", + "vector: 
model.layers.23.mlp.up_proj.weight 117440512\n", + "vector: model.layers.23.post_attention_layernorm.weight 8192\n", + "vector: model.layers.23.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.23.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.23.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.23.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.24.input_layernorm.weight 8192\n", + "vector: model.layers.24.mlp.down_proj.weight 117440512\n", + "vector: model.layers.24.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.24.mlp.up_proj.weight 117440512\n", + "vector: model.layers.24.post_attention_layernorm.weight 8192\n", + "vector: model.layers.24.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.24.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.24.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.24.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.25.input_layernorm.weight 8192\n", + "vector: model.layers.25.mlp.down_proj.weight 117440512\n", + "vector: model.layers.25.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.25.mlp.up_proj.weight 117440512\n", + "vector: model.layers.25.post_attention_layernorm.weight 8192\n", + "vector: model.layers.25.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.25.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.25.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.25.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.26.input_layernorm.weight 8192\n", + "vector: model.layers.26.mlp.down_proj.weight 117440512\n", + "vector: model.layers.26.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.26.mlp.up_proj.weight 117440512\n", + "vector: model.layers.26.post_attention_layernorm.weight 8192\n", + "vector: model.layers.26.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.26.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.26.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.26.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.27.input_layernorm.weight 8192\n", + "vector: model.layers.27.mlp.down_proj.weight 117440512\n", + "vector: model.layers.27.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.27.mlp.up_proj.weight 117440512\n", + "vector: model.layers.27.post_attention_layernorm.weight 8192\n", + "vector: model.layers.27.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.27.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.27.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.27.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.28.input_layernorm.weight 8192\n", + "vector: model.layers.28.mlp.down_proj.weight 117440512\n", + "vector: model.layers.28.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.28.mlp.up_proj.weight 117440512\n", + "vector: model.layers.28.post_attention_layernorm.weight 8192\n", + "vector: model.layers.28.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.28.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.28.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.28.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.29.input_layernorm.weight 8192\n", + "vector: model.layers.29.mlp.down_proj.weight 117440512\n", + "vector: model.layers.29.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.29.mlp.up_proj.weight 117440512\n", + "vector: model.layers.29.post_attention_layernorm.weight 8192\n", + "vector: 
model.layers.29.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.29.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.29.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.29.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.30.input_layernorm.weight 8192\n", + "vector: model.layers.30.mlp.down_proj.weight 117440512\n", + "vector: model.layers.30.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.30.mlp.up_proj.weight 117440512\n", + "vector: model.layers.30.post_attention_layernorm.weight 8192\n", + "vector: model.layers.30.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.30.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.30.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.30.self_attn.v_proj.weight 8388608\n", + "vector: model.layers.31.input_layernorm.weight 8192\n", + "vector: model.layers.31.mlp.down_proj.weight 117440512\n", + "vector: model.layers.31.mlp.gate_proj.weight 117440512\n", + "vector: model.layers.31.mlp.up_proj.weight 117440512\n", + "save: /home/mmnga/hdd/llm-data/llama-3-8B-chat-vector_2/model-00002.safetensors\n", + "vector: model.layers.31.post_attention_layernorm.weight 8192\n", + "vector: model.layers.31.self_attn.k_proj.weight 8388608\n", + "vector: model.layers.31.self_attn.o_proj.weight 33554432\n", + "vector: model.layers.31.self_attn.q_proj.weight 33554432\n", + "vector: model.layers.31.self_attn.v_proj.weight 8388608\n", + "save: /home/mmnga/hdd/llm-data/llama-3-8B-chat-vector_2/model-00003.safetensors\n", + "make model.safetensors.index.json\n", + "Done!\n" + ] + } + ], + "source": [ + "cvm = ChatVectorManager(model_name_or_path_base, model_name_or_path_target, save_vector_name_or_path)\n", + "cvm.load_setitngs(layer_num_config_name=\"num_hidden_layers\")\n", + "cvm.add_layer_weight_template_name(\"model.layers.${i}.input_layernorm.weight\")\n", + "cvm.add_layer_weight_template_name(\"model.layers.${i}.mlp.down_proj.weight\")\n", + "cvm.add_layer_weight_template_name(\"model.layers.${i}.mlp.gate_proj.weight\")\n", + "cvm.add_layer_weight_template_name(\"model.layers.${i}.mlp.up_proj.weight\")\n", + "cvm.add_layer_weight_template_name(\"model.layers.${i}.post_attention_layernorm.weight\")\n", + "cvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.k_proj.weight\")\n", + "cvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.o_proj.weight\")\n", + "cvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.q_proj.weight\")\n", + "cvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.v_proj.weight\")\n", + "cvm.make_vector()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "merge_target_model_path = \"/home/mmnga/hdd/llm-data/Meta-Llama-3-70B-Instruct\"\n", + "vector_path = \"/home/mmnga/hdd/llm-data/llama-3-8B-chat-vector_2\"\n", + "save_merged_model_path = \"/home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector\"" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn.functional as F\n", + "from transformers import AutoTokenizer\n", + "from safetensors import safe_open\n", + "from safetensors.torch import save_file\n", + "import os\n", + "import json\n", + "from string import Template\n", + "\n", + "class ExpandChatVectorMerger:\n", + " def __init__(self, merge_target_model_path, vector_path, save_merged_model_path):\n", + " self.merge_target_model_path = merge_target_model_path\n", 
+ " self.vector_path = vector_path\n", + " self.save_merged_model_path = save_merged_model_path\n", + "\n", + " self.hold_layers_front = 8 # この0からこのlayerまではvectorをそのまま適用する\n", + " self.hold_layers_later = -8 # 最後からこのlayerまではvectorをそのまま適用する\n", + " self.apply_layer_map = {} # targetのlayerにvectorのlayerを適用するmap\n", + " self.config_target = {}\n", + " self.config_vector = {}\n", + " self.n_layers_vector = 0\n", + " self.n_layers = 0\n", + " self.layer_weight_templates = []\n", + "\n", + " self.vector_weight_map = {}\n", + " self.target_weight_map = {}\n", + " self.current_weight_file_vector = \"\"\n", + " self.current_weight_file_target = \"\"\n", + "\n", + " self.vector_weights = {}\n", + " self.target_weights = {}\n", + " self.save_weights = {}\n", + " self.save_index_weights = {}\n", + " self.save_size = 0\n", + " self.save_total_size = 0\n", + " self.save_byte_size = int(4.9 * 1024 * 1024 * 1024)\n", + " self.save_counter = 0\n", + "\n", + " def load_setitngs(self, layer_num_config_name):\n", + "\n", + " # load config\n", + " with open(self.vector_path + \"/config.json\", \"r\") as f:\n", + " self.config_vector = json.load(f)\n", + "\n", + " with open(self.merge_target_model_path + \"/config.json\", \"r\") as f:\n", + " self.config_target = json.load(f)\n", + "\n", + " # load weight map\n", + " with open(self.vector_path + \"/model.safetensors.index.json\", \"r\") as f:\n", + " self.vector_weight_map = json.load(f)[\"weight_map\"]\n", + "\n", + " with open(self.merge_target_model_path + \"/model.safetensors.index.json\", \"r\") as f:\n", + " self.target_weight_map = json.load(f)[\"weight_map\"]\n", + "\n", + " self.n_layers = int(self.config_target[layer_num_config_name])\n", + " self.n_layers_vector = int(self.config_vector[layer_num_config_name])\n", + "\n", + " def add_layer_weight_template_name(self, weight_template_name):\n", + " self.layer_weight_templates.append(Template(weight_template_name))\n", + "\n", + " def make_apply_layer_map(self):\n", + " target_from = self.hold_layers_front\n", + " target_to = self.n_layers + self.hold_layers_later\n", + " vector_to = self.n_layers_vector + self.hold_layers_later\n", + "\n", + " expand_count_target = target_to - target_from\n", + " expand_count_vector = (vector_to - target_from) +1\n", + "\n", + " print(\"vector_to\", vector_to)\n", + " \n", + "\n", + " for i in range(self.n_layers):\n", + " if i < self.hold_layers_front:\n", + " self.apply_layer_map[str(i)] = i\n", + " print(\"front\", i, self.apply_layer_map[str(i)])\n", + " elif i > self.n_layers + self.hold_layers_later:\n", + " self.apply_layer_map[str(i)] = (i - self.n_layers) + self.n_layers_vector\n", + " print(\"later\", i, self.apply_layer_map[str(i)])\n", + " else:\n", + " index_in_vector = int(((i - self.hold_layers_front) / expand_count_target) * expand_count_vector)\n", + " self.apply_layer_map[str(i)] = min(self.hold_layers_front + index_in_vector, vector_to)\n", + " \n", + " print(\"expand\", i, self.apply_layer_map[str(i)])\n", + "\n", + "\n", + " def get_merge_weight(self, vector_weight, target_weight):\n", + " reshaped_tensor = vector_weight.unsqueeze(0).unsqueeze(0)\n", + " \n", + " if len(target_weight.shape) == 2:\n", + " upsampled_tensor = F.interpolate(reshaped_tensor, size=target_weight.shape, mode='bilinear', align_corners=False)\n", + " elif len(target_weight.shape) == 1:\n", + " upsampled_tensor = F.interpolate(reshaped_tensor, size=target_weight.shape, mode='linear', align_corners=False)\n", + "\n", + " vw = upsampled_tensor.squeeze(0).squeeze(0)\n", + "\n", + 
" return target_weight + vw\n", + "\n", + " def add_layer_weight_template_name(self, weight_template_name):\n", + " self.layer_weight_templates.append(Template(weight_template_name))\n", + "\n", + " def load_vector_weight(self, weight_name):\n", + " if self.current_weight_file_vector == self.vector_weight_map[weight_name]:\n", + " return\n", + " else:\n", + " file_name = self.vector_weight_map[weight_name]\n", + " self.vector_weights = safe_open(f\"{self.vector_path}/{file_name}\", framework=\"pt\")\n", + " self.current_weight_file_vector = file_name\n", + "\n", + " def load_target_weight(self, weight_name):\n", + " if self.current_weight_file_target == self.target_weight_map[weight_name]:\n", + " return\n", + " else:\n", + " file_name = self.target_weight_map[weight_name]\n", + " self.target_weights = safe_open(f\"{self.merge_target_model_path}/{file_name}\", framework=\"pt\")\n", + " self.current_weight_file_target = file_name\n", + "\n", + " def layer_weight_iter(self):\n", + "\n", + " for i in range(self.n_layers):\n", + " target_layer_weights = {}\n", + " vector_layer_weights = {}\n", + "\n", + " vector_layer_index = self.apply_layer_map[str(i)]\n", + "\n", + " for t in self.layer_weight_templates:\n", + " vector_weight_name = t.substitute(i=vector_layer_index)\n", + " target_weight_name = t.substitute(i=i)\n", + " self.load_vector_weight(vector_weight_name)\n", + " self.load_target_weight(target_weight_name)\n", + " \n", + " vector_layer_weights[vector_weight_name] = self.vector_weights.get_tensor(vector_weight_name)\n", + " target_layer_weights[target_weight_name] = self.target_weights.get_tensor(target_weight_name)\n", + "\n", + " yield i, target_weight_name, vector_layer_weights[vector_weight_name], target_layer_weights[target_weight_name]\n", + "\n", + " def get_weight_byte_size(self, weight):\n", + "\n", + " if isinstance(weight, torch.Tensor):\n", + " weight_byte_size = weight.nelement() * weight.element_size()\n", + " else:\n", + " weight_byte_size = sum(p.nelement() * p.element_size() for p in weight.parameters())\n", + "\n", + " return weight_byte_size\n", + "\n", + "\n", + " def save_weights_split(self):\n", + " if len(self.save_weights.keys()) == 0:\n", + " return \n", + "\n", + " file_name = f\"{self.save_merged_model_path}/model-{self.save_counter:05}.safetensors\"\n", + "\n", + " for weight_name in self.save_weights.keys():\n", + " self.save_index_weights[weight_name] = file_name.split(\"/\")[-1]\n", + "\n", + " save_file(self.save_weights, file_name, metadata={\"format\":\"pt\"})\n", + " self.save_size = 0\n", + " self.save_counter += 1\n", + " self.save_weights = {}\n", + " print(f\"save: {file_name}\")\n", + "\n", + " def push_weight(self, weight_name, weight):\n", + " weight_size = self.get_weight_byte_size(weight)\n", + " self.save_weights[weight_name] = weight\n", + " self.save_size += weight_size\n", + " self.save_total_size += weight_size\n", + "\n", + " print(f\"vector: {weight_name} {weight_size}\")\n", + " if self.save_size > self.save_byte_size:\n", + " self.save_weights_split()\n", + " \n", + " def save_weight_map(self):\n", + " new_weight_map = {\n", + " \"metadata\": {\n", + " \"total_size\": self.save_total_size\n", + " },\n", + " \"weight_map\": self.save_index_weights\n", + " }\n", + " with open(f\"{self.save_merged_model_path}/model.safetensors.index.json\", \"w\") as f:\n", + " json.dump(new_weight_map, f, indent=4)\n", + "\n", + " print(\"make model.safetensors.index.json\")\n", + "\n", + " def save_config(self):\n", + " with 
open(f\"{self.save_merged_model_path}/config.json\", \"w\") as f:\n", + " json.dump(self.config_target, f, indent=4)\n", + " \n", + " def save_tokenizer(self):\n", + " tokenizer = AutoTokenizer.from_pretrained(self.merge_target_model_path)\n", + " tokenizer.save_pretrained(self.save_merged_model_path)\n", + "\n", + " def merge(self):\n", + "\n", + " os.makedirs(self.save_merged_model_path, exist_ok=True)\n", + " self.save_tokenizer()\n", + " self.make_apply_layer_map()\n", + "\n", + " # 数値が含まれないweight\n", + " for weight_name in [k for k in self.target_weight_map if not any(c.isdigit() for c in k)]:\n", + " self.load_target_weight(weight_name)\n", + " self.load_vector_weight(weight_name)\n", + " target_weight = self.target_weights.get_tensor(weight_name)\n", + " vector_weight = self.vector_weights.get_tensor(weight_name)\n", + " merge_weight = self.get_merge_weight(vector_weight, target_weight)\n", + " self.push_weight(weight_name, merge_weight)\n", + "\n", + " # layers\n", + " for i, target_weight_name, vector_weight, target_weight in self.layer_weight_iter():\n", + " merge_weight = self.get_merge_weight(vector_weight, target_weight)\n", + " self.push_weight(target_weight_name, merge_weight)\n", + "\n", + " self.save_weights_split()\n", + " self.save_weight_map()\n", + " self.save_config()\n", + " print(\"Done!\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "config_target 80\n", + "config_vector 32\n", + "vector_to 24\n", + "front 0 0\n", + "front 1 1\n", + "front 2 2\n", + "front 3 3\n", + "front 4 4\n", + "front 5 5\n", + "front 6 6\n", + "front 7 7\n", + "expand 8 8\n", + "expand 9 8\n", + "expand 10 8\n", + "expand 11 8\n", + "expand 12 9\n", + "expand 13 9\n", + "expand 14 9\n", + "expand 15 9\n", + "expand 16 10\n", + "expand 17 10\n", + "expand 18 10\n", + "expand 19 10\n", + "expand 20 11\n", + "expand 21 11\n", + "expand 22 11\n", + "expand 23 11\n", + "expand 24 12\n", + "expand 25 12\n", + "expand 26 12\n", + "expand 27 13\n", + "expand 28 13\n", + "expand 29 13\n", + "expand 30 13\n", + "expand 31 14\n", + "expand 32 14\n", + "expand 33 14\n", + "expand 34 14\n", + "expand 35 15\n", + "expand 36 15\n", + "expand 37 15\n", + "expand 38 15\n", + "expand 39 16\n", + "expand 40 16\n", + "expand 41 16\n", + "expand 42 17\n", + "expand 43 17\n", + "expand 44 17\n", + "expand 45 17\n", + "expand 46 18\n", + "expand 47 18\n", + "expand 48 18\n", + "expand 49 18\n", + "expand 50 19\n", + "expand 51 19\n", + "expand 52 19\n", + "expand 53 19\n", + "expand 54 20\n", + "expand 55 20\n", + "expand 56 20\n", + "expand 57 21\n", + "expand 58 21\n", + "expand 59 21\n", + "expand 60 21\n", + "expand 61 22\n", + "expand 62 22\n", + "expand 63 22\n", + "expand 64 22\n", + "expand 65 23\n", + "expand 66 23\n", + "expand 67 23\n", + "expand 68 23\n", + "expand 69 24\n", + "expand 70 24\n", + "expand 71 24\n", + "expand 72 24\n", + "later 73 25\n", + "later 74 26\n", + "later 75 27\n", + "later 76 28\n", + "later 77 29\n", + "later 78 30\n", + "later 79 31\n", + "vector: lm_head.weight 2101346304\n", + "vector: model.embed_tokens.weight 2101346304\n", + "vector: model.norm.weight 16384\n", + "vector: model.layers.0.input_layernorm.weight 16384\n", + "vector: 
model.layers.0.mlp.down_proj.weight 469762048\n", + "vector: model.layers.0.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.0.mlp.up_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00000.safetensors\n", + "vector: model.layers.0.post_attention_layernorm.weight 16384\n", + "vector: model.layers.0.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.0.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.0.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.0.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.1.input_layernorm.weight 16384\n", + "vector: model.layers.1.mlp.down_proj.weight 469762048\n", + "vector: model.layers.1.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.1.mlp.up_proj.weight 469762048\n", + "vector: model.layers.1.post_attention_layernorm.weight 16384\n", + "vector: model.layers.1.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.1.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.1.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.1.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.2.input_layernorm.weight 16384\n", + "vector: model.layers.2.mlp.down_proj.weight 469762048\n", + "vector: model.layers.2.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.2.mlp.up_proj.weight 469762048\n", + "vector: model.layers.2.post_attention_layernorm.weight 16384\n", + "vector: model.layers.2.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.2.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.2.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.2.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.3.input_layernorm.weight 16384\n", + "vector: model.layers.3.mlp.down_proj.weight 469762048\n", + "vector: model.layers.3.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.3.mlp.up_proj.weight 469762048\n", + "vector: model.layers.3.post_attention_layernorm.weight 16384\n", + "vector: model.layers.3.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.3.self_attn.o_proj.weight 134217728\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00001.safetensors\n", + "vector: model.layers.3.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.3.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.4.input_layernorm.weight 16384\n", + "vector: model.layers.4.mlp.down_proj.weight 469762048\n", + "vector: model.layers.4.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.4.mlp.up_proj.weight 469762048\n", + "vector: model.layers.4.post_attention_layernorm.weight 16384\n", + "vector: model.layers.4.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.4.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.4.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.4.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.5.input_layernorm.weight 16384\n", + "vector: model.layers.5.mlp.down_proj.weight 469762048\n", + "vector: model.layers.5.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.5.mlp.up_proj.weight 469762048\n", + "vector: model.layers.5.post_attention_layernorm.weight 16384\n", + "vector: model.layers.5.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.5.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.5.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.5.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.6.input_layernorm.weight 16384\n", + "vector: 
model.layers.6.mlp.down_proj.weight 469762048\n", + "vector: model.layers.6.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.6.mlp.up_proj.weight 469762048\n", + "vector: model.layers.6.post_attention_layernorm.weight 16384\n", + "vector: model.layers.6.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.6.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.6.self_attn.q_proj.weight 134217728\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00002.safetensors\n", + "vector: model.layers.6.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.7.input_layernorm.weight 16384\n", + "vector: model.layers.7.mlp.down_proj.weight 469762048\n", + "vector: model.layers.7.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.7.mlp.up_proj.weight 469762048\n", + "vector: model.layers.7.post_attention_layernorm.weight 16384\n", + "vector: model.layers.7.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.7.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.7.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.7.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.8.input_layernorm.weight 16384\n", + "vector: model.layers.8.mlp.down_proj.weight 469762048\n", + "vector: model.layers.8.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.8.mlp.up_proj.weight 469762048\n", + "vector: model.layers.8.post_attention_layernorm.weight 16384\n", + "vector: model.layers.8.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.8.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.8.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.8.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.9.input_layernorm.weight 16384\n", + "vector: model.layers.9.mlp.down_proj.weight 469762048\n", + "vector: model.layers.9.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.9.mlp.up_proj.weight 469762048\n", + "vector: model.layers.9.post_attention_layernorm.weight 16384\n", + "vector: model.layers.9.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.9.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.9.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.9.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.10.input_layernorm.weight 16384\n", + "vector: model.layers.10.mlp.down_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00003.safetensors\n", + "vector: model.layers.10.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.10.mlp.up_proj.weight 469762048\n", + "vector: model.layers.10.post_attention_layernorm.weight 16384\n", + "vector: model.layers.10.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.10.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.10.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.10.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.11.input_layernorm.weight 16384\n", + "vector: model.layers.11.mlp.down_proj.weight 469762048\n", + "vector: model.layers.11.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.11.mlp.up_proj.weight 469762048\n", + "vector: model.layers.11.post_attention_layernorm.weight 16384\n", + "vector: model.layers.11.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.11.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.11.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.11.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.12.input_layernorm.weight 
16384\n", + "vector: model.layers.12.mlp.down_proj.weight 469762048\n", + "vector: model.layers.12.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.12.mlp.up_proj.weight 469762048\n", + "vector: model.layers.12.post_attention_layernorm.weight 16384\n", + "vector: model.layers.12.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.12.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.12.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.12.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.13.input_layernorm.weight 16384\n", + "vector: model.layers.13.mlp.down_proj.weight 469762048\n", + "vector: model.layers.13.mlp.gate_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00004.safetensors\n", + "vector: model.layers.13.mlp.up_proj.weight 469762048\n", + "vector: model.layers.13.post_attention_layernorm.weight 16384\n", + "vector: model.layers.13.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.13.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.13.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.13.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.14.input_layernorm.weight 16384\n", + "vector: model.layers.14.mlp.down_proj.weight 469762048\n", + "vector: model.layers.14.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.14.mlp.up_proj.weight 469762048\n", + "vector: model.layers.14.post_attention_layernorm.weight 16384\n", + "vector: model.layers.14.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.14.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.14.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.14.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.15.input_layernorm.weight 16384\n", + "vector: model.layers.15.mlp.down_proj.weight 469762048\n", + "vector: model.layers.15.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.15.mlp.up_proj.weight 469762048\n", + "vector: model.layers.15.post_attention_layernorm.weight 16384\n", + "vector: model.layers.15.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.15.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.15.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.15.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.16.input_layernorm.weight 16384\n", + "vector: model.layers.16.mlp.down_proj.weight 469762048\n", + "vector: model.layers.16.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.16.mlp.up_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00005.safetensors\n", + "vector: model.layers.16.post_attention_layernorm.weight 16384\n", + "vector: model.layers.16.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.16.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.16.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.16.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.17.input_layernorm.weight 16384\n", + "vector: model.layers.17.mlp.down_proj.weight 469762048\n", + "vector: model.layers.17.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.17.mlp.up_proj.weight 469762048\n", + "vector: model.layers.17.post_attention_layernorm.weight 16384\n", + "vector: model.layers.17.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.17.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.17.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.17.self_attn.v_proj.weight 16777216\n", + 
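ExpandChatVectorMerger.get_merge_weight handles the shape mismatch between the 8B-derived vector and the 70B target: the delta tensor is resized to the target tensor's shape with F.interpolate (bilinear for 2-D projection matrices, linear for 1-D norm vectors) and then added on top. A self-contained sketch of that step; the toy shapes below merely stand in for the real 4096- and 8192-wide weights:

import torch
import torch.nn.functional as F

def merge_with_upsampled_vector(vector_weight, target_weight):
    # interpolate expects leading batch and channel dimensions
    x = vector_weight.unsqueeze(0).unsqueeze(0)
    if target_weight.dim() == 2:
        x = F.interpolate(x, size=target_weight.shape, mode="bilinear", align_corners=False)
    elif target_weight.dim() == 1:
        x = F.interpolate(x, size=target_weight.shape, mode="linear", align_corners=False)
    return target_weight + x.squeeze(0).squeeze(0)

small_delta = torch.randn(8, 8)      # stands in for an 8B-model weight delta
large_weight = torch.randn(16, 16)   # stands in for the matching 70B-model weight
merged = merge_with_upsampled_vector(small_delta, large_weight)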
"vector: model.layers.18.input_layernorm.weight 16384\n", + "vector: model.layers.18.mlp.down_proj.weight 469762048\n", + "vector: model.layers.18.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.18.mlp.up_proj.weight 469762048\n", + "vector: model.layers.18.post_attention_layernorm.weight 16384\n", + "vector: model.layers.18.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.18.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.18.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.18.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.19.input_layernorm.weight 16384\n", + "vector: model.layers.19.mlp.down_proj.weight 469762048\n", + "vector: model.layers.19.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.19.mlp.up_proj.weight 469762048\n", + "vector: model.layers.19.post_attention_layernorm.weight 16384\n", + "vector: model.layers.19.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.19.self_attn.o_proj.weight 134217728\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00006.safetensors\n", + "vector: model.layers.19.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.19.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.20.input_layernorm.weight 16384\n", + "vector: model.layers.20.mlp.down_proj.weight 469762048\n", + "vector: model.layers.20.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.20.mlp.up_proj.weight 469762048\n", + "vector: model.layers.20.post_attention_layernorm.weight 16384\n", + "vector: model.layers.20.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.20.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.20.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.20.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.21.input_layernorm.weight 16384\n", + "vector: model.layers.21.mlp.down_proj.weight 469762048\n", + "vector: model.layers.21.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.21.mlp.up_proj.weight 469762048\n", + "vector: model.layers.21.post_attention_layernorm.weight 16384\n", + "vector: model.layers.21.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.21.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.21.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.21.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.22.input_layernorm.weight 16384\n", + "vector: model.layers.22.mlp.down_proj.weight 469762048\n", + "vector: model.layers.22.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.22.mlp.up_proj.weight 469762048\n", + "vector: model.layers.22.post_attention_layernorm.weight 16384\n", + "vector: model.layers.22.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.22.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.22.self_attn.q_proj.weight 134217728\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00007.safetensors\n", + "vector: model.layers.22.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.23.input_layernorm.weight 16384\n", + "vector: model.layers.23.mlp.down_proj.weight 469762048\n", + "vector: model.layers.23.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.23.mlp.up_proj.weight 469762048\n", + "vector: model.layers.23.post_attention_layernorm.weight 16384\n", + "vector: model.layers.23.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.23.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.23.self_attn.q_proj.weight 134217728\n", + "vector: 
model.layers.23.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.24.input_layernorm.weight 16384\n", + "vector: model.layers.24.mlp.down_proj.weight 469762048\n", + "vector: model.layers.24.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.24.mlp.up_proj.weight 469762048\n", + "vector: model.layers.24.post_attention_layernorm.weight 16384\n", + "vector: model.layers.24.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.24.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.24.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.24.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.25.input_layernorm.weight 16384\n", + "vector: model.layers.25.mlp.down_proj.weight 469762048\n", + "vector: model.layers.25.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.25.mlp.up_proj.weight 469762048\n", + "vector: model.layers.25.post_attention_layernorm.weight 16384\n", + "vector: model.layers.25.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.25.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.25.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.25.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.26.input_layernorm.weight 16384\n", + "vector: model.layers.26.mlp.down_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00008.safetensors\n", + "vector: model.layers.26.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.26.mlp.up_proj.weight 469762048\n", + "vector: model.layers.26.post_attention_layernorm.weight 16384\n", + "vector: model.layers.26.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.26.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.26.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.26.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.27.input_layernorm.weight 16384\n", + "vector: model.layers.27.mlp.down_proj.weight 469762048\n", + "vector: model.layers.27.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.27.mlp.up_proj.weight 469762048\n", + "vector: model.layers.27.post_attention_layernorm.weight 16384\n", + "vector: model.layers.27.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.27.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.27.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.27.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.28.input_layernorm.weight 16384\n", + "vector: model.layers.28.mlp.down_proj.weight 469762048\n", + "vector: model.layers.28.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.28.mlp.up_proj.weight 469762048\n", + "vector: model.layers.28.post_attention_layernorm.weight 16384\n", + "vector: model.layers.28.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.28.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.28.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.28.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.29.input_layernorm.weight 16384\n", + "vector: model.layers.29.mlp.down_proj.weight 469762048\n", + "vector: model.layers.29.mlp.gate_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00009.safetensors\n", + "vector: model.layers.29.mlp.up_proj.weight 469762048\n", + "vector: model.layers.29.post_attention_layernorm.weight 16384\n", + "vector: model.layers.29.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.29.self_attn.o_proj.weight 134217728\n", + "vector: 
model.layers.29.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.29.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.30.input_layernorm.weight 16384\n", + "vector: model.layers.30.mlp.down_proj.weight 469762048\n", + "vector: model.layers.30.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.30.mlp.up_proj.weight 469762048\n", + "vector: model.layers.30.post_attention_layernorm.weight 16384\n", + "vector: model.layers.30.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.30.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.30.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.30.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.31.input_layernorm.weight 16384\n", + "vector: model.layers.31.mlp.down_proj.weight 469762048\n", + "vector: model.layers.31.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.31.mlp.up_proj.weight 469762048\n", + "vector: model.layers.31.post_attention_layernorm.weight 16384\n", + "vector: model.layers.31.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.31.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.31.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.31.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.32.input_layernorm.weight 16384\n", + "vector: model.layers.32.mlp.down_proj.weight 469762048\n", + "vector: model.layers.32.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.32.mlp.up_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00010.safetensors\n", + "vector: model.layers.32.post_attention_layernorm.weight 16384\n", + "vector: model.layers.32.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.32.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.32.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.32.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.33.input_layernorm.weight 16384\n", + "vector: model.layers.33.mlp.down_proj.weight 469762048\n", + "vector: model.layers.33.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.33.mlp.up_proj.weight 469762048\n", + "vector: model.layers.33.post_attention_layernorm.weight 16384\n", + "vector: model.layers.33.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.33.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.33.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.33.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.34.input_layernorm.weight 16384\n", + "vector: model.layers.34.mlp.down_proj.weight 469762048\n", + "vector: model.layers.34.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.34.mlp.up_proj.weight 469762048\n", + "vector: model.layers.34.post_attention_layernorm.weight 16384\n", + "vector: model.layers.34.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.34.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.34.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.34.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.35.input_layernorm.weight 16384\n", + "vector: model.layers.35.mlp.down_proj.weight 469762048\n", + "vector: model.layers.35.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.35.mlp.up_proj.weight 469762048\n", + "vector: model.layers.35.post_attention_layernorm.weight 16384\n", + "vector: model.layers.35.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.35.self_attn.o_proj.weight 134217728\n", + "save: 
/home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00011.safetensors\n", + "vector: model.layers.35.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.35.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.36.input_layernorm.weight 16384\n", + "vector: model.layers.36.mlp.down_proj.weight 469762048\n", + "vector: model.layers.36.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.36.mlp.up_proj.weight 469762048\n", + "vector: model.layers.36.post_attention_layernorm.weight 16384\n", + "vector: model.layers.36.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.36.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.36.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.36.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.37.input_layernorm.weight 16384\n", + "vector: model.layers.37.mlp.down_proj.weight 469762048\n", + "vector: model.layers.37.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.37.mlp.up_proj.weight 469762048\n", + "vector: model.layers.37.post_attention_layernorm.weight 16384\n", + "vector: model.layers.37.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.37.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.37.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.37.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.38.input_layernorm.weight 16384\n", + "vector: model.layers.38.mlp.down_proj.weight 469762048\n", + "vector: model.layers.38.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.38.mlp.up_proj.weight 469762048\n", + "vector: model.layers.38.post_attention_layernorm.weight 16384\n", + "vector: model.layers.38.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.38.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.38.self_attn.q_proj.weight 134217728\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00012.safetensors\n", + "vector: model.layers.38.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.39.input_layernorm.weight 16384\n", + "vector: model.layers.39.mlp.down_proj.weight 469762048\n", + "vector: model.layers.39.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.39.mlp.up_proj.weight 469762048\n", + "vector: model.layers.39.post_attention_layernorm.weight 16384\n", + "vector: model.layers.39.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.39.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.39.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.39.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.40.input_layernorm.weight 16384\n", + "vector: model.layers.40.mlp.down_proj.weight 469762048\n", + "vector: model.layers.40.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.40.mlp.up_proj.weight 469762048\n", + "vector: model.layers.40.post_attention_layernorm.weight 16384\n", + "vector: model.layers.40.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.40.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.40.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.40.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.41.input_layernorm.weight 16384\n", + "vector: model.layers.41.mlp.down_proj.weight 469762048\n", + "vector: model.layers.41.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.41.mlp.up_proj.weight 469762048\n", + "vector: model.layers.41.post_attention_layernorm.weight 16384\n", + "vector: model.layers.41.self_attn.k_proj.weight 16777216\n", + "vector: 
model.layers.41.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.41.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.41.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.42.input_layernorm.weight 16384\n", + "vector: model.layers.42.mlp.down_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00013.safetensors\n", + "vector: model.layers.42.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.42.mlp.up_proj.weight 469762048\n", + "vector: model.layers.42.post_attention_layernorm.weight 16384\n", + "vector: model.layers.42.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.42.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.42.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.42.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.43.input_layernorm.weight 16384\n", + "vector: model.layers.43.mlp.down_proj.weight 469762048\n", + "vector: model.layers.43.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.43.mlp.up_proj.weight 469762048\n", + "vector: model.layers.43.post_attention_layernorm.weight 16384\n", + "vector: model.layers.43.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.43.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.43.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.43.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.44.input_layernorm.weight 16384\n", + "vector: model.layers.44.mlp.down_proj.weight 469762048\n", + "vector: model.layers.44.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.44.mlp.up_proj.weight 469762048\n", + "vector: model.layers.44.post_attention_layernorm.weight 16384\n", + "vector: model.layers.44.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.44.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.44.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.44.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.45.input_layernorm.weight 16384\n", + "vector: model.layers.45.mlp.down_proj.weight 469762048\n", + "vector: model.layers.45.mlp.gate_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00014.safetensors\n", + "vector: model.layers.45.mlp.up_proj.weight 469762048\n", + "vector: model.layers.45.post_attention_layernorm.weight 16384\n", + "vector: model.layers.45.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.45.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.45.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.45.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.46.input_layernorm.weight 16384\n", + "vector: model.layers.46.mlp.down_proj.weight 469762048\n", + "vector: model.layers.46.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.46.mlp.up_proj.weight 469762048\n", + "vector: model.layers.46.post_attention_layernorm.weight 16384\n", + "vector: model.layers.46.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.46.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.46.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.46.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.47.input_layernorm.weight 16384\n", + "vector: model.layers.47.mlp.down_proj.weight 469762048\n", + "vector: model.layers.47.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.47.mlp.up_proj.weight 469762048\n", + "vector: model.layers.47.post_attention_layernorm.weight 16384\n", + "vector: 
model.layers.47.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.47.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.47.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.47.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.48.input_layernorm.weight 16384\n", + "vector: model.layers.48.mlp.down_proj.weight 469762048\n", + "vector: model.layers.48.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.48.mlp.up_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00015.safetensors\n", + "vector: model.layers.48.post_attention_layernorm.weight 16384\n", + "vector: model.layers.48.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.48.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.48.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.48.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.49.input_layernorm.weight 16384\n", + "vector: model.layers.49.mlp.down_proj.weight 469762048\n", + "vector: model.layers.49.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.49.mlp.up_proj.weight 469762048\n", + "vector: model.layers.49.post_attention_layernorm.weight 16384\n", + "vector: model.layers.49.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.49.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.49.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.49.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.50.input_layernorm.weight 16384\n", + "vector: model.layers.50.mlp.down_proj.weight 469762048\n", + "vector: model.layers.50.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.50.mlp.up_proj.weight 469762048\n", + "vector: model.layers.50.post_attention_layernorm.weight 16384\n", + "vector: model.layers.50.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.50.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.50.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.50.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.51.input_layernorm.weight 16384\n", + "vector: model.layers.51.mlp.down_proj.weight 469762048\n", + "vector: model.layers.51.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.51.mlp.up_proj.weight 469762048\n", + "vector: model.layers.51.post_attention_layernorm.weight 16384\n", + "vector: model.layers.51.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.51.self_attn.o_proj.weight 134217728\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00016.safetensors\n", + "vector: model.layers.51.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.51.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.52.input_layernorm.weight 16384\n", + "vector: model.layers.52.mlp.down_proj.weight 469762048\n", + "vector: model.layers.52.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.52.mlp.up_proj.weight 469762048\n", + "vector: model.layers.52.post_attention_layernorm.weight 16384\n", + "vector: model.layers.52.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.52.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.52.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.52.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.53.input_layernorm.weight 16384\n", + "vector: model.layers.53.mlp.down_proj.weight 469762048\n", + "vector: model.layers.53.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.53.mlp.up_proj.weight 469762048\n", + "vector: 
model.layers.53.post_attention_layernorm.weight 16384\n", + "vector: model.layers.53.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.53.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.53.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.53.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.54.input_layernorm.weight 16384\n", + "vector: model.layers.54.mlp.down_proj.weight 469762048\n", + "vector: model.layers.54.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.54.mlp.up_proj.weight 469762048\n", + "vector: model.layers.54.post_attention_layernorm.weight 16384\n", + "vector: model.layers.54.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.54.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.54.self_attn.q_proj.weight 134217728\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00017.safetensors\n", + "vector: model.layers.54.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.55.input_layernorm.weight 16384\n", + "vector: model.layers.55.mlp.down_proj.weight 469762048\n", + "vector: model.layers.55.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.55.mlp.up_proj.weight 469762048\n", + "vector: model.layers.55.post_attention_layernorm.weight 16384\n", + "vector: model.layers.55.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.55.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.55.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.55.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.56.input_layernorm.weight 16384\n", + "vector: model.layers.56.mlp.down_proj.weight 469762048\n", + "vector: model.layers.56.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.56.mlp.up_proj.weight 469762048\n", + "vector: model.layers.56.post_attention_layernorm.weight 16384\n", + "vector: model.layers.56.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.56.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.56.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.56.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.57.input_layernorm.weight 16384\n", + "vector: model.layers.57.mlp.down_proj.weight 469762048\n", + "vector: model.layers.57.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.57.mlp.up_proj.weight 469762048\n", + "vector: model.layers.57.post_attention_layernorm.weight 16384\n", + "vector: model.layers.57.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.57.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.57.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.57.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.58.input_layernorm.weight 16384\n", + "vector: model.layers.58.mlp.down_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00018.safetensors\n", + "vector: model.layers.58.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.58.mlp.up_proj.weight 469762048\n", + "vector: model.layers.58.post_attention_layernorm.weight 16384\n", + "vector: model.layers.58.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.58.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.58.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.58.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.59.input_layernorm.weight 16384\n", + "vector: model.layers.59.mlp.down_proj.weight 469762048\n", + "vector: model.layers.59.mlp.gate_proj.weight 469762048\n", + "vector: 
model.layers.59.mlp.up_proj.weight 469762048\n", + "vector: model.layers.59.post_attention_layernorm.weight 16384\n", + "vector: model.layers.59.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.59.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.59.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.59.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.60.input_layernorm.weight 16384\n", + "vector: model.layers.60.mlp.down_proj.weight 469762048\n", + "vector: model.layers.60.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.60.mlp.up_proj.weight 469762048\n", + "vector: model.layers.60.post_attention_layernorm.weight 16384\n", + "vector: model.layers.60.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.60.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.60.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.60.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.61.input_layernorm.weight 16384\n", + "vector: model.layers.61.mlp.down_proj.weight 469762048\n", + "vector: model.layers.61.mlp.gate_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00019.safetensors\n", + "vector: model.layers.61.mlp.up_proj.weight 469762048\n", + "vector: model.layers.61.post_attention_layernorm.weight 16384\n", + "vector: model.layers.61.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.61.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.61.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.61.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.62.input_layernorm.weight 16384\n", + "vector: model.layers.62.mlp.down_proj.weight 469762048\n", + "vector: model.layers.62.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.62.mlp.up_proj.weight 469762048\n", + "vector: model.layers.62.post_attention_layernorm.weight 16384\n", + "vector: model.layers.62.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.62.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.62.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.62.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.63.input_layernorm.weight 16384\n", + "vector: model.layers.63.mlp.down_proj.weight 469762048\n", + "vector: model.layers.63.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.63.mlp.up_proj.weight 469762048\n", + "vector: model.layers.63.post_attention_layernorm.weight 16384\n", + "vector: model.layers.63.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.63.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.63.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.63.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.64.input_layernorm.weight 16384\n", + "vector: model.layers.64.mlp.down_proj.weight 469762048\n", + "vector: model.layers.64.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.64.mlp.up_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00020.safetensors\n", + "vector: model.layers.64.post_attention_layernorm.weight 16384\n", + "vector: model.layers.64.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.64.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.64.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.64.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.65.input_layernorm.weight 16384\n", + "vector: model.layers.65.mlp.down_proj.weight 469762048\n", + "vector: 
model.layers.65.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.65.mlp.up_proj.weight 469762048\n", + "vector: model.layers.65.post_attention_layernorm.weight 16384\n", + "vector: model.layers.65.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.65.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.65.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.65.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.66.input_layernorm.weight 16384\n", + "vector: model.layers.66.mlp.down_proj.weight 469762048\n", + "vector: model.layers.66.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.66.mlp.up_proj.weight 469762048\n", + "vector: model.layers.66.post_attention_layernorm.weight 16384\n", + "vector: model.layers.66.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.66.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.66.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.66.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.67.input_layernorm.weight 16384\n", + "vector: model.layers.67.mlp.down_proj.weight 469762048\n", + "vector: model.layers.67.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.67.mlp.up_proj.weight 469762048\n", + "vector: model.layers.67.post_attention_layernorm.weight 16384\n", + "vector: model.layers.67.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.67.self_attn.o_proj.weight 134217728\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00021.safetensors\n", + "vector: model.layers.67.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.67.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.68.input_layernorm.weight 16384\n", + "vector: model.layers.68.mlp.down_proj.weight 469762048\n", + "vector: model.layers.68.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.68.mlp.up_proj.weight 469762048\n", + "vector: model.layers.68.post_attention_layernorm.weight 16384\n", + "vector: model.layers.68.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.68.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.68.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.68.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.69.input_layernorm.weight 16384\n", + "vector: model.layers.69.mlp.down_proj.weight 469762048\n", + "vector: model.layers.69.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.69.mlp.up_proj.weight 469762048\n", + "vector: model.layers.69.post_attention_layernorm.weight 16384\n", + "vector: model.layers.69.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.69.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.69.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.69.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.70.input_layernorm.weight 16384\n", + "vector: model.layers.70.mlp.down_proj.weight 469762048\n", + "vector: model.layers.70.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.70.mlp.up_proj.weight 469762048\n", + "vector: model.layers.70.post_attention_layernorm.weight 16384\n", + "vector: model.layers.70.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.70.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.70.self_attn.q_proj.weight 134217728\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00022.safetensors\n", + "vector: model.layers.70.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.71.input_layernorm.weight 16384\n", + "vector: 
model.layers.71.mlp.down_proj.weight 469762048\n", + "vector: model.layers.71.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.71.mlp.up_proj.weight 469762048\n", + "vector: model.layers.71.post_attention_layernorm.weight 16384\n", + "vector: model.layers.71.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.71.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.71.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.71.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.72.input_layernorm.weight 16384\n", + "vector: model.layers.72.mlp.down_proj.weight 469762048\n", + "vector: model.layers.72.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.72.mlp.up_proj.weight 469762048\n", + "vector: model.layers.72.post_attention_layernorm.weight 16384\n", + "vector: model.layers.72.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.72.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.72.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.72.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.73.input_layernorm.weight 16384\n", + "vector: model.layers.73.mlp.down_proj.weight 469762048\n", + "vector: model.layers.73.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.73.mlp.up_proj.weight 469762048\n", + "vector: model.layers.73.post_attention_layernorm.weight 16384\n", + "vector: model.layers.73.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.73.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.73.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.73.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.74.input_layernorm.weight 16384\n", + "vector: model.layers.74.mlp.down_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00023.safetensors\n", + "vector: model.layers.74.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.74.mlp.up_proj.weight 469762048\n", + "vector: model.layers.74.post_attention_layernorm.weight 16384\n", + "vector: model.layers.74.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.74.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.74.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.74.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.75.input_layernorm.weight 16384\n", + "vector: model.layers.75.mlp.down_proj.weight 469762048\n", + "vector: model.layers.75.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.75.mlp.up_proj.weight 469762048\n", + "vector: model.layers.75.post_attention_layernorm.weight 16384\n", + "vector: model.layers.75.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.75.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.75.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.75.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.76.input_layernorm.weight 16384\n", + "vector: model.layers.76.mlp.down_proj.weight 469762048\n", + "vector: model.layers.76.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.76.mlp.up_proj.weight 469762048\n", + "vector: model.layers.76.post_attention_layernorm.weight 16384\n", + "vector: model.layers.76.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.76.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.76.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.76.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.77.input_layernorm.weight 16384\n", + "vector: model.layers.77.mlp.down_proj.weight 
469762048\n", + "vector: model.layers.77.mlp.gate_proj.weight 469762048\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00024.safetensors\n", + "vector: model.layers.77.mlp.up_proj.weight 469762048\n", + "vector: model.layers.77.post_attention_layernorm.weight 16384\n", + "vector: model.layers.77.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.77.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.77.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.77.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.78.input_layernorm.weight 16384\n", + "vector: model.layers.78.mlp.down_proj.weight 469762048\n", + "vector: model.layers.78.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.78.mlp.up_proj.weight 469762048\n", + "vector: model.layers.78.post_attention_layernorm.weight 16384\n", + "vector: model.layers.78.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.78.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.78.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.78.self_attn.v_proj.weight 16777216\n", + "vector: model.layers.79.input_layernorm.weight 16384\n", + "vector: model.layers.79.mlp.down_proj.weight 469762048\n", + "vector: model.layers.79.mlp.gate_proj.weight 469762048\n", + "vector: model.layers.79.mlp.up_proj.weight 469762048\n", + "vector: model.layers.79.post_attention_layernorm.weight 16384\n", + "vector: model.layers.79.self_attn.k_proj.weight 16777216\n", + "vector: model.layers.79.self_attn.o_proj.weight 134217728\n", + "vector: model.layers.79.self_attn.q_proj.weight 134217728\n", + "vector: model.layers.79.self_attn.v_proj.weight 16777216\n", + "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00025.safetensors\n", + "make model.safetensors.index.json\n", + "Done!\n" + ] + } + ], + "source": [ + "ecvm = ExpandChatVectorMerger(merge_target_model_path, vector_path, save_merged_model_path)\n", + "ecvm.load_setitngs(\"num_hidden_layers\")\n", + "print(\"config_target\", ecvm.n_layers)\n", + "print(\"config_vector\", ecvm.n_layers_vector)\n", + "ecvm.load_setitngs(layer_num_config_name=\"num_hidden_layers\")\n", + "ecvm.add_layer_weight_template_name(\"model.layers.${i}.input_layernorm.weight\")\n", + "ecvm.add_layer_weight_template_name(\"model.layers.${i}.mlp.down_proj.weight\")\n", + "ecvm.add_layer_weight_template_name(\"model.layers.${i}.mlp.gate_proj.weight\")\n", + "ecvm.add_layer_weight_template_name(\"model.layers.${i}.mlp.up_proj.weight\")\n", + "ecvm.add_layer_weight_template_name(\"model.layers.${i}.post_attention_layernorm.weight\")\n", + "ecvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.k_proj.weight\")\n", + "ecvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.o_proj.weight\")\n", + "ecvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.q_proj.weight\")\n", + "ecvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.v_proj.weight\")\n", + "ecvm.merge()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "model_id = ecvm.save_merged_model_path\n", + "re_save_path = \"/home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector_re\"\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", + "model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=\"cpu\")\n", + "\n", + "messages = [\n", + " {\"role\": 
\"system\", \"content\": \"日本語で返答してください。\"},\n", + " {\"role\": \"user\", \"content\": \"東京のおすすめの観光スポットを教えて下さい\"},\n", + "]\n", + "prompt = tokenizer.apply_chat_template(\n", + " messages, \n", + " tokenize=False, \n", + " add_generation_prompt=True\n", + ")\n", + "\n", + "inputs = tokenizer([prompt], return_tensors=\"pt\")\n", + "\n", + "terminators = [\n", + " tokenizer.eos_token_id,\n", + " tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")\n", + "]\n", + "\n", + "# model.save_pretrained(re_save_path)\n", + "# tokenizer.save_pretrained(re_save_path)\n", + "\n", + "outputs = model.generate(**inputs, \n", + " max_new_tokens=256,\n", + " eos_token_id=terminators,\n", + " do_sample=True,\n", + " temperature=0.6,\n", + " top_p=0.9,\n", + " )\n", + "\n", + "print(tokenizer.decode(outputs[0]))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/mmnga/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", + "/home/mmnga/.local/lib/python3.10/site-packages/torch/cuda/__init__.py:619: UserWarning: Can't initialize NVML\n", + " warnings.warn(\"Can't initialize NVML\")\n", + "Loading checkpoint shards: 100%|██████████| 30/30 [01:53<00:00, 3.77s/it]\n", + "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n", + "\n", + "日本語で返答してください。<|eot_id|><|start_header_id|>user<|end_header_id|>\n", + "\n", + "東京のおすすめの観光スポットを教えて下さい<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n", + "\n", + "東京は観光スポットが非常に多く、どれを選ぶか迷ってしまうほどです!以下は、東京のおすすめの観光スポット10選です。\n", + "\n", + "1. **東京スカイツリー**:東京都心部にある高さ634mの超高層タワーの展望台から、東京のパノラマを眺めることができます。\n", + "2. **浅草寺**:浅草区にある古い寺院で、雷門(浅草門)や仲見世通りが有名です。\n", + "3. **渋谷スクランブルクロス**:渋谷区にある世界的に有名な交差点で、流行の最先端を感じることができます。\n", + "4. **東京タワー**:港区にある高さ333mのタワーで、夜はライトアップされます。\n", + "5. **新宿御苑**:新宿区にある大きな公園で、桜のシーズンには非常に人気があります。\n", + "6. **築地市場**:中央区にある世界最大の魚市場で、寿司や海老の朝食を味わうことができます。\n", + "7. 
**明治神\n" + ] + } + ], + "source": [ + "import torch\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "model_id = \"/home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector_re\"\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", + "model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=\"cpu\")\n", + "\n", + "messages = [\n", + " {\"role\": \"system\", \"content\": \"日本語で返答してください。\"},\n", + " {\"role\": \"user\", \"content\": \"東京のおすすめの観光スポットを教えて下さい\"},\n", + "]\n", + "prompt = tokenizer.apply_chat_template(\n", + " messages, \n", + " tokenize=False, \n", + " add_generation_prompt=True\n", + ")\n", + "\n", + "inputs = tokenizer([prompt], return_tensors=\"pt\")\n", + "\n", + "terminators = [\n", + " tokenizer.eos_token_id,\n", + " tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")\n", + "]\n", + "\n", + "outputs = model.generate(**inputs, \n", + " max_new_tokens=256,\n", + " eos_token_id=terminators,\n", + " do_sample=True,\n", + " temperature=0.6,\n", + " top_p=0.9,\n", + " )\n", + "\n", + "print(tokenizer.decode(outputs[0]))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install -Uqq huggingface-hub\n", + "\n", + "# !huggingface-cli login --token $HF_TOKEN\n", + "\n", + "# tokenizer.push_to_hub(\"Llama-3-70B-japanese-suzume-vector\", use_auth_token=True, private=True)\n", + "# model.push_to_hub(\"Llama-3-70B-japanese-suzume-vector\", use_auth_token=True, private=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}