{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/mmnga\n", "/home/mmnga/hdd/llm-data\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/mmnga/.local/lib/python3.10/site-packages/IPython/core/magics/osm.py:417: UserWarning: This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.\n", " self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n" ] } ], "source": [ "%cd /home/mmnga\n", "!source .venv/bin/activate\n", "%cd /home/mmnga/hdd/llm-data/" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "!pip install -Uqq torch safetensors transformers " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# chat-vectorの作成\n", "model_name_or_path_base = \"/home/mmnga/hdd/llm-data/Meta-Llama-3-8B-Instruct\"\n", "model_name_or_path_target = \"/home/mmnga/hdd/llm-data/suzume-llama-3-8B-japanese\"\n", "save_vector_name_or_path = \"/home/mmnga/hdd/llm-data/llama-3-8B-chat-vector_2\"\n", "\n" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "from safetensors import safe_open\n", "from safetensors.torch import save_file\n", "\n", "import pathlib, os, json, transformers\n", "from string import Template\n", "import torch\n", "\n", "class ChatVectorManager:\n", " def __init__(self, model_name_or_path_base, model_name_or_path_target, save_vector_name_or_path):\n", " self.model_name_or_path_base = model_name_or_path_base\n", " self.model_name_or_path_target = model_name_or_path_target\n", " self.save_vector_name_or_path = save_vector_name_or_path\n", " self.config_base = {}\n", " self.config_target = {}\n", " self.n_layers = 0\n", " self.layer_weight_templates = []\n", " self.base_weight_map = {}\n", " self.target_weight_map = {}\n", " self.current_weight_file_base = \"\"\n", " self.current_weight_file_target = \"\"\n", " self.base_weights = {}\n", " self.target_weights = {}\n", " self.save_weights = {}\n", " self.save_index_weights = {}\n", " self.save_size = 0\n", " self.save_total_size = 0\n", " self.save_byte_size = int(4.9 * 1024 * 1024 * 1024)\n", " self.save_counter = 0\n", "\n", " def load_setitngs(self, layer_num_config_name):\n", "\n", " # load config\n", " with open(self.model_name_or_path_base + \"/config.json\", \"r\") as f:\n", " self.config_base = json.load(f)\n", "\n", " with open(self.model_name_or_path_target + \"/config.json\", \"r\") as f:\n", " self.config_target = json.load(f)\n", "\n", " # load weight map\n", " with open(self.model_name_or_path_base + \"/model.safetensors.index.json\", \"r\") as f:\n", " self.base_weight_map = json.load(f)[\"weight_map\"]\n", "\n", " with open(self.model_name_or_path_target + \"/model.safetensors.index.json\", \"r\") as f:\n", " self.target_weight_map = json.load(f)[\"weight_map\"]\n", "\n", " self.n_layers = int(self.config_base[layer_num_config_name])\n", " \n", "\n", " def add_layer_weight_template_name(self, weight_template_name):\n", " self.layer_weight_templates.append(Template(weight_template_name))\n", "\n", " def load_base_weight(self, weight_name):\n", " if self.current_weight_file_base == self.base_weight_map[weight_name]:\n", " return\n", " else:\n", " file_name = self.base_weight_map[weight_name]\n", " self.base_weights = safe_open(f\"{self.model_name_or_path_base}/{file_name}\", framework=\"pt\")\n", " self.current_weight_file_base = 
file_name\n", "\n", " def load_target_weight(self, weight_name):\n", " if self.current_weight_file_target == self.target_weight_map[weight_name]:\n", " return\n", " else:\n", " file_name = self.target_weight_map[weight_name]\n", " self.target_weights = safe_open(f\"{self.model_name_or_path_target}/{file_name}\", framework=\"pt\")\n", " self.current_weight_file_target = file_name\n", "\n", " def layer_weight_iter(self):\n", " for i in range(self.n_layers):\n", " base_layer_weights = {}\n", " target_layer_weights = {}\n", " for t in self.layer_weight_templates:\n", " weight_name = t.substitute(i=i)\n", " self.load_base_weight(weight_name)\n", " self.load_target_weight(weight_name)\n", " base_layer_weights[weight_name] = self.base_weights.get_tensor(weight_name)\n", " target_layer_weights[weight_name] = self.target_weights.get_tensor(weight_name)\n", "\n", " yield i, weight_name, base_layer_weights[weight_name], target_layer_weights[weight_name]\n", "\n", " def get_weight_byte_size(self, weight):\n", "\n", " if isinstance(weight, torch.Tensor):\n", " weight_byte_size = weight.nelement() * weight.element_size()\n", " else:\n", " weight_byte_size = sum(p.nelement() * p.element_size() for p in weight.parameters())\n", "\n", " return weight_byte_size\n", "\n", "\n", " def save_weights_split(self):\n", " if len(self.save_weights.keys()) == 0:\n", " return \n", "\n", " file_name = f\"{self.save_vector_name_or_path}/model-{self.save_counter:05}.safetensors\"\n", "\n", " for weight_name in self.save_weights.keys():\n", " self.save_index_weights[weight_name] = file_name.split(\"/\")[-1]\n", "\n", " save_file(self.save_weights, file_name, metadata={\"format\":\"pt\"})\n", " self.save_size = 0\n", " self.save_counter += 1\n", " self.save_weights = {}\n", " print(f\"save: {file_name}\")\n", "\n", " def push_weight(self, weight_name, weight):\n", " weight_size = self.get_weight_byte_size(weight)\n", " self.save_weights[weight_name] = weight\n", " self.save_size += weight_size\n", " self.save_total_size += weight_size\n", "\n", " print(f\"vector: {weight_name} {weight_size}\")\n", " if self.save_size > self.save_byte_size:\n", " self.save_weights_split()\n", " \n", " def save_weight_map(self):\n", " new_weight_map = {\n", " \"metadata\": {\n", " \"total_size\": self.save_total_size\n", " },\n", " \"weight_map\": self.save_index_weights\n", " }\n", " with open(f\"{self.save_vector_name_or_path}/model.safetensors.index.json\", \"w\") as f:\n", " json.dump(new_weight_map, f, indent=4)\n", "\n", " print(\"make model.safetensors.index.json\")\n", "\n", " def save_config(self):\n", " with open(f\"{self.save_vector_name_or_path}/config.json\", \"w\") as f:\n", " json.dump(self.config_target, f, indent=4)\n", "\n", " def make_vector(self):\n", "\n", " os.makedirs(self.save_vector_name_or_path, exist_ok=True)\n", "\n", " # 数値が含まれないweight\n", " for weight_name in [k for k in self.target_weight_map if not any(c.isdigit() for c in k)]:\n", " self.load_base_weight(weight_name)\n", " self.load_target_weight(weight_name)\n", " base_weight = self.base_weights.get_tensor(weight_name)\n", " target_weight = self.target_weights.get_tensor(weight_name)\n", " diff = target_weight - base_weight\n", " self.push_weight(weight_name, diff)\n", "\n", " for i, weight_name, base_weight, target_weight in self.layer_weight_iter():\n", " diff = target_weight - base_weight\n", " self.push_weight(weight_name, diff)\n", "\n", " self.save_weights_split()\n", " self.save_weight_map()\n", " self.save_config()\n", " print(\"Done!\")\n" ] }, { 
"cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "vector: lm_head.weight 1050673152\n", "vector: model.embed_tokens.weight 1050673152\n", "vector: model.norm.weight 8192\n", "vector: model.layers.0.input_layernorm.weight 8192\n", "vector: model.layers.0.mlp.down_proj.weight 117440512\n", "vector: model.layers.0.mlp.gate_proj.weight 117440512\n", "vector: model.layers.0.mlp.up_proj.weight 117440512\n", "vector: model.layers.0.post_attention_layernorm.weight 8192\n", "vector: model.layers.0.self_attn.k_proj.weight 8388608\n", "vector: model.layers.0.self_attn.o_proj.weight 33554432\n", "vector: model.layers.0.self_attn.q_proj.weight 33554432\n", "vector: model.layers.0.self_attn.v_proj.weight 8388608\n", "vector: model.layers.1.input_layernorm.weight 8192\n", "vector: model.layers.1.mlp.down_proj.weight 117440512\n", "vector: model.layers.1.mlp.gate_proj.weight 117440512\n", "vector: model.layers.1.mlp.up_proj.weight 117440512\n", "vector: model.layers.1.post_attention_layernorm.weight 8192\n", "vector: model.layers.1.self_attn.k_proj.weight 8388608\n", "vector: model.layers.1.self_attn.o_proj.weight 33554432\n", "vector: model.layers.1.self_attn.q_proj.weight 33554432\n", "vector: model.layers.1.self_attn.v_proj.weight 8388608\n", "vector: model.layers.2.input_layernorm.weight 8192\n", "vector: model.layers.2.mlp.down_proj.weight 117440512\n", "vector: model.layers.2.mlp.gate_proj.weight 117440512\n", "vector: model.layers.2.mlp.up_proj.weight 117440512\n", "vector: model.layers.2.post_attention_layernorm.weight 8192\n", "vector: model.layers.2.self_attn.k_proj.weight 8388608\n", "vector: model.layers.2.self_attn.o_proj.weight 33554432\n", "vector: model.layers.2.self_attn.q_proj.weight 33554432\n", "vector: model.layers.2.self_attn.v_proj.weight 8388608\n", "vector: model.layers.3.input_layernorm.weight 8192\n", "vector: model.layers.3.mlp.down_proj.weight 117440512\n", "vector: model.layers.3.mlp.gate_proj.weight 117440512\n", "vector: model.layers.3.mlp.up_proj.weight 117440512\n", "vector: model.layers.3.post_attention_layernorm.weight 8192\n", "vector: model.layers.3.self_attn.k_proj.weight 8388608\n", "vector: model.layers.3.self_attn.o_proj.weight 33554432\n", "vector: model.layers.3.self_attn.q_proj.weight 33554432\n", "vector: model.layers.3.self_attn.v_proj.weight 8388608\n", "vector: model.layers.4.input_layernorm.weight 8192\n", "vector: model.layers.4.mlp.down_proj.weight 117440512\n", "vector: model.layers.4.mlp.gate_proj.weight 117440512\n", "vector: model.layers.4.mlp.up_proj.weight 117440512\n", "vector: model.layers.4.post_attention_layernorm.weight 8192\n", "vector: model.layers.4.self_attn.k_proj.weight 8388608\n", "vector: model.layers.4.self_attn.o_proj.weight 33554432\n", "vector: model.layers.4.self_attn.q_proj.weight 33554432\n", "vector: model.layers.4.self_attn.v_proj.weight 8388608\n", "vector: model.layers.5.input_layernorm.weight 8192\n", "vector: model.layers.5.mlp.down_proj.weight 117440512\n", "vector: model.layers.5.mlp.gate_proj.weight 117440512\n", "vector: model.layers.5.mlp.up_proj.weight 117440512\n", "vector: model.layers.5.post_attention_layernorm.weight 8192\n", "vector: model.layers.5.self_attn.k_proj.weight 8388608\n", "vector: model.layers.5.self_attn.o_proj.weight 33554432\n", "vector: model.layers.5.self_attn.q_proj.weight 33554432\n", "vector: model.layers.5.self_attn.v_proj.weight 8388608\n", "vector: model.layers.6.input_layernorm.weight 8192\n", "vector: 
model.layers.6.mlp.down_proj.weight 117440512\n", "vector: model.layers.6.mlp.gate_proj.weight 117440512\n", "vector: model.layers.6.mlp.up_proj.weight 117440512\n", "vector: model.layers.6.post_attention_layernorm.weight 8192\n", "vector: model.layers.6.self_attn.k_proj.weight 8388608\n", "vector: model.layers.6.self_attn.o_proj.weight 33554432\n", "vector: model.layers.6.self_attn.q_proj.weight 33554432\n", "vector: model.layers.6.self_attn.v_proj.weight 8388608\n", "vector: model.layers.7.input_layernorm.weight 8192\n", "vector: model.layers.7.mlp.down_proj.weight 117440512\n", "save: /home/mmnga/hdd/llm-data/llama-3-8B-chat-vector_2/model-00000.safetensors\n", "vector: model.layers.7.mlp.gate_proj.weight 117440512\n", "vector: model.layers.7.mlp.up_proj.weight 117440512\n", "vector: model.layers.7.post_attention_layernorm.weight 8192\n", "vector: model.layers.7.self_attn.k_proj.weight 8388608\n", "vector: model.layers.7.self_attn.o_proj.weight 33554432\n", "vector: model.layers.7.self_attn.q_proj.weight 33554432\n", "vector: model.layers.7.self_attn.v_proj.weight 8388608\n", "vector: model.layers.8.input_layernorm.weight 8192\n", "vector: model.layers.8.mlp.down_proj.weight 117440512\n", "vector: model.layers.8.mlp.gate_proj.weight 117440512\n", "vector: model.layers.8.mlp.up_proj.weight 117440512\n", "vector: model.layers.8.post_attention_layernorm.weight 8192\n", "vector: model.layers.8.self_attn.k_proj.weight 8388608\n", "vector: model.layers.8.self_attn.o_proj.weight 33554432\n", "vector: model.layers.8.self_attn.q_proj.weight 33554432\n", "vector: model.layers.8.self_attn.v_proj.weight 8388608\n", "vector: model.layers.9.input_layernorm.weight 8192\n", "vector: model.layers.9.mlp.down_proj.weight 117440512\n", "vector: model.layers.9.mlp.gate_proj.weight 117440512\n", "vector: model.layers.9.mlp.up_proj.weight 117440512\n", "vector: model.layers.9.post_attention_layernorm.weight 8192\n", "vector: model.layers.9.self_attn.k_proj.weight 8388608\n", "vector: model.layers.9.self_attn.o_proj.weight 33554432\n", "vector: model.layers.9.self_attn.q_proj.weight 33554432\n", "vector: model.layers.9.self_attn.v_proj.weight 8388608\n", "vector: model.layers.10.input_layernorm.weight 8192\n", "vector: model.layers.10.mlp.down_proj.weight 117440512\n", "vector: model.layers.10.mlp.gate_proj.weight 117440512\n", "vector: model.layers.10.mlp.up_proj.weight 117440512\n", "vector: model.layers.10.post_attention_layernorm.weight 8192\n", "vector: model.layers.10.self_attn.k_proj.weight 8388608\n", "vector: model.layers.10.self_attn.o_proj.weight 33554432\n", "vector: model.layers.10.self_attn.q_proj.weight 33554432\n", "vector: model.layers.10.self_attn.v_proj.weight 8388608\n", "vector: model.layers.11.input_layernorm.weight 8192\n", "vector: model.layers.11.mlp.down_proj.weight 117440512\n", "vector: model.layers.11.mlp.gate_proj.weight 117440512\n", "vector: model.layers.11.mlp.up_proj.weight 117440512\n", "vector: model.layers.11.post_attention_layernorm.weight 8192\n", "vector: model.layers.11.self_attn.k_proj.weight 8388608\n", "vector: model.layers.11.self_attn.o_proj.weight 33554432\n", "vector: model.layers.11.self_attn.q_proj.weight 33554432\n", "vector: model.layers.11.self_attn.v_proj.weight 8388608\n", "vector: model.layers.12.input_layernorm.weight 8192\n", "vector: model.layers.12.mlp.down_proj.weight 117440512\n", "vector: model.layers.12.mlp.gate_proj.weight 117440512\n", "vector: model.layers.12.mlp.up_proj.weight 117440512\n", "vector: 
model.layers.12.post_attention_layernorm.weight 8192\n", "vector: model.layers.12.self_attn.k_proj.weight 8388608\n", "vector: model.layers.12.self_attn.o_proj.weight 33554432\n", "vector: model.layers.12.self_attn.q_proj.weight 33554432\n", "vector: model.layers.12.self_attn.v_proj.weight 8388608\n", "vector: model.layers.13.input_layernorm.weight 8192\n", "vector: model.layers.13.mlp.down_proj.weight 117440512\n", "vector: model.layers.13.mlp.gate_proj.weight 117440512\n", "vector: model.layers.13.mlp.up_proj.weight 117440512\n", "vector: model.layers.13.post_attention_layernorm.weight 8192\n", "vector: model.layers.13.self_attn.k_proj.weight 8388608\n", "vector: model.layers.13.self_attn.o_proj.weight 33554432\n", "vector: model.layers.13.self_attn.q_proj.weight 33554432\n", "vector: model.layers.13.self_attn.v_proj.weight 8388608\n", "vector: model.layers.14.input_layernorm.weight 8192\n", "vector: model.layers.14.mlp.down_proj.weight 117440512\n", "vector: model.layers.14.mlp.gate_proj.weight 117440512\n", "vector: model.layers.14.mlp.up_proj.weight 117440512\n", "vector: model.layers.14.post_attention_layernorm.weight 8192\n", "vector: model.layers.14.self_attn.k_proj.weight 8388608\n", "vector: model.layers.14.self_attn.o_proj.weight 33554432\n", "vector: model.layers.14.self_attn.q_proj.weight 33554432\n", "vector: model.layers.14.self_attn.v_proj.weight 8388608\n", "vector: model.layers.15.input_layernorm.weight 8192\n", "vector: model.layers.15.mlp.down_proj.weight 117440512\n", "vector: model.layers.15.mlp.gate_proj.weight 117440512\n", "vector: model.layers.15.mlp.up_proj.weight 117440512\n", "vector: model.layers.15.post_attention_layernorm.weight 8192\n", "vector: model.layers.15.self_attn.k_proj.weight 8388608\n", "vector: model.layers.15.self_attn.o_proj.weight 33554432\n", "vector: model.layers.15.self_attn.q_proj.weight 33554432\n", "vector: model.layers.15.self_attn.v_proj.weight 8388608\n", "vector: model.layers.16.input_layernorm.weight 8192\n", "vector: model.layers.16.mlp.down_proj.weight 117440512\n", "vector: model.layers.16.mlp.gate_proj.weight 117440512\n", "vector: model.layers.16.mlp.up_proj.weight 117440512\n", "vector: model.layers.16.post_attention_layernorm.weight 8192\n", "vector: model.layers.16.self_attn.k_proj.weight 8388608\n", "vector: model.layers.16.self_attn.o_proj.weight 33554432\n", "vector: model.layers.16.self_attn.q_proj.weight 33554432\n", "vector: model.layers.16.self_attn.v_proj.weight 8388608\n", "vector: model.layers.17.input_layernorm.weight 8192\n", "vector: model.layers.17.mlp.down_proj.weight 117440512\n", "vector: model.layers.17.mlp.gate_proj.weight 117440512\n", "vector: model.layers.17.mlp.up_proj.weight 117440512\n", "vector: model.layers.17.post_attention_layernorm.weight 8192\n", "vector: model.layers.17.self_attn.k_proj.weight 8388608\n", "vector: model.layers.17.self_attn.o_proj.weight 33554432\n", "vector: model.layers.17.self_attn.q_proj.weight 33554432\n", "vector: model.layers.17.self_attn.v_proj.weight 8388608\n", "vector: model.layers.18.input_layernorm.weight 8192\n", "vector: model.layers.18.mlp.down_proj.weight 117440512\n", "vector: model.layers.18.mlp.gate_proj.weight 117440512\n", "vector: model.layers.18.mlp.up_proj.weight 117440512\n", "vector: model.layers.18.post_attention_layernorm.weight 8192\n", "vector: model.layers.18.self_attn.k_proj.weight 8388608\n", "vector: model.layers.18.self_attn.o_proj.weight 33554432\n", "vector: model.layers.18.self_attn.q_proj.weight 33554432\n", "vector: 
model.layers.18.self_attn.v_proj.weight 8388608\n", "vector: model.layers.19.input_layernorm.weight 8192\n", "vector: model.layers.19.mlp.down_proj.weight 117440512\n", "vector: model.layers.19.mlp.gate_proj.weight 117440512\n", "save: /home/mmnga/hdd/llm-data/llama-3-8B-chat-vector_2/model-00001.safetensors\n", "vector: model.layers.19.mlp.up_proj.weight 117440512\n", "vector: model.layers.19.post_attention_layernorm.weight 8192\n", "vector: model.layers.19.self_attn.k_proj.weight 8388608\n", "vector: model.layers.19.self_attn.o_proj.weight 33554432\n", "vector: model.layers.19.self_attn.q_proj.weight 33554432\n", "vector: model.layers.19.self_attn.v_proj.weight 8388608\n", "vector: model.layers.20.input_layernorm.weight 8192\n", "vector: model.layers.20.mlp.down_proj.weight 117440512\n", "vector: model.layers.20.mlp.gate_proj.weight 117440512\n", "vector: model.layers.20.mlp.up_proj.weight 117440512\n", "vector: model.layers.20.post_attention_layernorm.weight 8192\n", "vector: model.layers.20.self_attn.k_proj.weight 8388608\n", "vector: model.layers.20.self_attn.o_proj.weight 33554432\n", "vector: model.layers.20.self_attn.q_proj.weight 33554432\n", "vector: model.layers.20.self_attn.v_proj.weight 8388608\n", "vector: model.layers.21.input_layernorm.weight 8192\n", "vector: model.layers.21.mlp.down_proj.weight 117440512\n", "vector: model.layers.21.mlp.gate_proj.weight 117440512\n", "vector: model.layers.21.mlp.up_proj.weight 117440512\n", "vector: model.layers.21.post_attention_layernorm.weight 8192\n", "vector: model.layers.21.self_attn.k_proj.weight 8388608\n", "vector: model.layers.21.self_attn.o_proj.weight 33554432\n", "vector: model.layers.21.self_attn.q_proj.weight 33554432\n", "vector: model.layers.21.self_attn.v_proj.weight 8388608\n", "vector: model.layers.22.input_layernorm.weight 8192\n", "vector: model.layers.22.mlp.down_proj.weight 117440512\n", "vector: model.layers.22.mlp.gate_proj.weight 117440512\n", "vector: model.layers.22.mlp.up_proj.weight 117440512\n", "vector: model.layers.22.post_attention_layernorm.weight 8192\n", "vector: model.layers.22.self_attn.k_proj.weight 8388608\n", "vector: model.layers.22.self_attn.o_proj.weight 33554432\n", "vector: model.layers.22.self_attn.q_proj.weight 33554432\n", "vector: model.layers.22.self_attn.v_proj.weight 8388608\n", "vector: model.layers.23.input_layernorm.weight 8192\n", "vector: model.layers.23.mlp.down_proj.weight 117440512\n", "vector: model.layers.23.mlp.gate_proj.weight 117440512\n", "vector: model.layers.23.mlp.up_proj.weight 117440512\n", "vector: model.layers.23.post_attention_layernorm.weight 8192\n", "vector: model.layers.23.self_attn.k_proj.weight 8388608\n", "vector: model.layers.23.self_attn.o_proj.weight 33554432\n", "vector: model.layers.23.self_attn.q_proj.weight 33554432\n", "vector: model.layers.23.self_attn.v_proj.weight 8388608\n", "vector: model.layers.24.input_layernorm.weight 8192\n", "vector: model.layers.24.mlp.down_proj.weight 117440512\n", "vector: model.layers.24.mlp.gate_proj.weight 117440512\n", "vector: model.layers.24.mlp.up_proj.weight 117440512\n", "vector: model.layers.24.post_attention_layernorm.weight 8192\n", "vector: model.layers.24.self_attn.k_proj.weight 8388608\n", "vector: model.layers.24.self_attn.o_proj.weight 33554432\n", "vector: model.layers.24.self_attn.q_proj.weight 33554432\n", "vector: model.layers.24.self_attn.v_proj.weight 8388608\n", "vector: model.layers.25.input_layernorm.weight 8192\n", "vector: model.layers.25.mlp.down_proj.weight 117440512\n", "vector: 
model.layers.25.mlp.gate_proj.weight 117440512\n", "vector: model.layers.25.mlp.up_proj.weight 117440512\n", "vector: model.layers.25.post_attention_layernorm.weight 8192\n", "vector: model.layers.25.self_attn.k_proj.weight 8388608\n", "vector: model.layers.25.self_attn.o_proj.weight 33554432\n", "vector: model.layers.25.self_attn.q_proj.weight 33554432\n", "vector: model.layers.25.self_attn.v_proj.weight 8388608\n", "vector: model.layers.26.input_layernorm.weight 8192\n", "vector: model.layers.26.mlp.down_proj.weight 117440512\n", "vector: model.layers.26.mlp.gate_proj.weight 117440512\n", "vector: model.layers.26.mlp.up_proj.weight 117440512\n", "vector: model.layers.26.post_attention_layernorm.weight 8192\n", "vector: model.layers.26.self_attn.k_proj.weight 8388608\n", "vector: model.layers.26.self_attn.o_proj.weight 33554432\n", "vector: model.layers.26.self_attn.q_proj.weight 33554432\n", "vector: model.layers.26.self_attn.v_proj.weight 8388608\n", "vector: model.layers.27.input_layernorm.weight 8192\n", "vector: model.layers.27.mlp.down_proj.weight 117440512\n", "vector: model.layers.27.mlp.gate_proj.weight 117440512\n", "vector: model.layers.27.mlp.up_proj.weight 117440512\n", "vector: model.layers.27.post_attention_layernorm.weight 8192\n", "vector: model.layers.27.self_attn.k_proj.weight 8388608\n", "vector: model.layers.27.self_attn.o_proj.weight 33554432\n", "vector: model.layers.27.self_attn.q_proj.weight 33554432\n", "vector: model.layers.27.self_attn.v_proj.weight 8388608\n", "vector: model.layers.28.input_layernorm.weight 8192\n", "vector: model.layers.28.mlp.down_proj.weight 117440512\n", "vector: model.layers.28.mlp.gate_proj.weight 117440512\n", "vector: model.layers.28.mlp.up_proj.weight 117440512\n", "vector: model.layers.28.post_attention_layernorm.weight 8192\n", "vector: model.layers.28.self_attn.k_proj.weight 8388608\n", "vector: model.layers.28.self_attn.o_proj.weight 33554432\n", "vector: model.layers.28.self_attn.q_proj.weight 33554432\n", "vector: model.layers.28.self_attn.v_proj.weight 8388608\n", "vector: model.layers.29.input_layernorm.weight 8192\n", "vector: model.layers.29.mlp.down_proj.weight 117440512\n", "vector: model.layers.29.mlp.gate_proj.weight 117440512\n", "vector: model.layers.29.mlp.up_proj.weight 117440512\n", "vector: model.layers.29.post_attention_layernorm.weight 8192\n", "vector: model.layers.29.self_attn.k_proj.weight 8388608\n", "vector: model.layers.29.self_attn.o_proj.weight 33554432\n", "vector: model.layers.29.self_attn.q_proj.weight 33554432\n", "vector: model.layers.29.self_attn.v_proj.weight 8388608\n", "vector: model.layers.30.input_layernorm.weight 8192\n", "vector: model.layers.30.mlp.down_proj.weight 117440512\n", "vector: model.layers.30.mlp.gate_proj.weight 117440512\n", "vector: model.layers.30.mlp.up_proj.weight 117440512\n", "vector: model.layers.30.post_attention_layernorm.weight 8192\n", "vector: model.layers.30.self_attn.k_proj.weight 8388608\n", "vector: model.layers.30.self_attn.o_proj.weight 33554432\n", "vector: model.layers.30.self_attn.q_proj.weight 33554432\n", "vector: model.layers.30.self_attn.v_proj.weight 8388608\n", "vector: model.layers.31.input_layernorm.weight 8192\n", "vector: model.layers.31.mlp.down_proj.weight 117440512\n", "vector: model.layers.31.mlp.gate_proj.weight 117440512\n", "vector: model.layers.31.mlp.up_proj.weight 117440512\n", "save: /home/mmnga/hdd/llm-data/llama-3-8B-chat-vector_2/model-00002.safetensors\n", "vector: model.layers.31.post_attention_layernorm.weight 8192\n", "vector: 
model.layers.31.self_attn.k_proj.weight 8388608\n", "vector: model.layers.31.self_attn.o_proj.weight 33554432\n", "vector: model.layers.31.self_attn.q_proj.weight 33554432\n", "vector: model.layers.31.self_attn.v_proj.weight 8388608\n", "save: /home/mmnga/hdd/llm-data/llama-3-8B-chat-vector_2/model-00003.safetensors\n", "make model.safetensors.index.json\n", "Done!\n" ] } ], "source": [ "cvm = ChatVectorManager(model_name_or_path_base, model_name_or_path_target, save_vector_name_or_path)\n", "cvm.load_setitngs(layer_num_config_name=\"num_hidden_layers\")\n", "cvm.add_layer_weight_template_name(\"model.layers.${i}.input_layernorm.weight\")\n", "cvm.add_layer_weight_template_name(\"model.layers.${i}.mlp.down_proj.weight\")\n", "cvm.add_layer_weight_template_name(\"model.layers.${i}.mlp.gate_proj.weight\")\n", "cvm.add_layer_weight_template_name(\"model.layers.${i}.mlp.up_proj.weight\")\n", "cvm.add_layer_weight_template_name(\"model.layers.${i}.post_attention_layernorm.weight\")\n", "cvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.k_proj.weight\")\n", "cvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.o_proj.weight\")\n", "cvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.q_proj.weight\")\n", "cvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.v_proj.weight\")\n", "cvm.make_vector()\n" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "merge_target_model_path = \"/home/mmnga/hdd/llm-data/Meta-Llama-3-70B-Instruct\"\n", "vector_path = \"/home/mmnga/hdd/llm-data/llama-3-8B-chat-vector_2\"\n", "save_merged_model_path = \"/home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector\"" ] },
{ "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "import torch\n", "import torch.nn.functional as F\n", "from transformers import AutoTokenizer\n", "from safetensors import safe_open\n", "from safetensors.torch import save_file\n", "import os\n", "import json\n", "from string import Template\n", "\n",
"class ExpandChatVectorMerger:\n", "    def __init__(self, merge_target_model_path, vector_path, save_merged_model_path):\n", "        self.merge_target_model_path = merge_target_model_path\n", "        self.vector_path = vector_path\n", "        self.save_merged_model_path = save_merged_model_path\n", "\n", "        self.hold_layers_front = 8 # apply the vector layers as-is from layer 0 up to this layer\n", "        self.hold_layers_later = -8 # apply the vector layers as-is for this many layers counted from the end\n", "        self.apply_layer_map = {} # map of which vector layer is applied to each target layer\n", "        self.config_target = {}\n", "        self.config_vector = {}\n", "        self.n_layers_vector = 0\n", "        self.n_layers = 0\n", "        self.layer_weight_templates = []\n", "\n", "        self.vector_weight_map = {}\n", "        self.target_weight_map = {}\n", "        self.current_weight_file_vector = \"\"\n", "        self.current_weight_file_target = \"\"\n", "\n", "        self.vector_weights = {}\n", "        self.target_weights = {}\n", "        self.save_weights = {}\n", "        self.save_index_weights = {}\n", "        self.save_size = 0\n", "        self.save_total_size = 0\n", "        self.save_byte_size = int(4.9 * 1024 * 1024 * 1024)  # max bytes per output shard (~4.9 GiB)\n", "        self.save_counter = 0\n", "\n",
"    def load_setitngs(self, layer_num_config_name):\n", "\n", "        # load config\n", "        with open(self.vector_path + \"/config.json\", \"r\") as f:\n", "            self.config_vector = json.load(f)\n", "\n", "        with open(self.merge_target_model_path + \"/config.json\", \"r\") as f:\n", "            self.config_target = json.load(f)\n", "\n", "        # load weight map\n", "        with open(self.vector_path + \"/model.safetensors.index.json\", \"r\") as f:\n", "            self.vector_weight_map = json.load(f)[\"weight_map\"]\n", "\n", "        with open(self.merge_target_model_path + \"/model.safetensors.index.json\", \"r\") as f:\n", "            self.target_weight_map = json.load(f)[\"weight_map\"]\n", "\n", "        self.n_layers = int(self.config_target[layer_num_config_name])\n", "        self.n_layers_vector = int(self.config_vector[layer_num_config_name])\n", "        print(\"config_target\", self.n_layers)\n", "        print(\"config_vector\", self.n_layers_vector)\n", "\n",
"    def add_layer_weight_template_name(self, weight_template_name):\n", "        self.layer_weight_templates.append(Template(weight_template_name))\n", "\n",
"    def make_apply_layer_map(self):\n", "        # map target layers to vector layers: keep the first/last blocks 1:1 and stretch the middle\n", "        target_from = self.hold_layers_front\n", "        target_to = self.n_layers + self.hold_layers_later\n", "        vector_to = self.n_layers_vector + self.hold_layers_later\n", "\n", "        expand_count_target = target_to - target_from\n", "        expand_count_vector = (vector_to - target_from) +1\n", "\n", "        print(\"vector_to\", vector_to)\n", "        \n", "\n", "        for i in range(self.n_layers):\n", "            if i < self.hold_layers_front:\n", "                self.apply_layer_map[str(i)] = i\n", "                print(\"front\", i, self.apply_layer_map[str(i)])\n", "            elif i > self.n_layers + self.hold_layers_later:\n", "                self.apply_layer_map[str(i)] = (i - self.n_layers) + self.n_layers_vector\n", "                print(\"later\", i, self.apply_layer_map[str(i)])\n", "            else:\n", "                index_in_vector = int(((i - self.hold_layers_front) / expand_count_target) * expand_count_vector)\n", "                self.apply_layer_map[str(i)] = min(self.hold_layers_front + index_in_vector, vector_to)\n", "                \n", "                print(\"expand\", i, self.apply_layer_map[str(i)])\n", "\n", "\n",
"    def get_merge_weight(self, vector_weight, target_weight):\n", "        # upsample the (8B-sized) chat-vector tensor to the target (70B) weight shape, then add it\n", "        reshaped_tensor = vector_weight.unsqueeze(0).unsqueeze(0)\n", "        \n", "        if len(target_weight.shape) == 2:\n", "            upsampled_tensor = F.interpolate(reshaped_tensor, size=target_weight.shape, mode='bilinear', align_corners=False)\n", "        elif len(target_weight.shape) == 1:\n", "            upsampled_tensor = F.interpolate(reshaped_tensor, size=target_weight.shape, mode='linear', align_corners=False)\n", "\n", "        vw = upsampled_tensor.squeeze(0).squeeze(0)\n", "\n", "        return target_weight + vw\n", "\n",
"    def load_vector_weight(self, weight_name):\n", "        if self.current_weight_file_vector == self.vector_weight_map[weight_name]:\n", "            return\n", "        else:\n", "            file_name = self.vector_weight_map[weight_name]\n", "            self.vector_weights = safe_open(f\"{self.vector_path}/{file_name}\", framework=\"pt\")\n", "            self.current_weight_file_vector = file_name\n", "\n",
"    def load_target_weight(self, weight_name):\n", "        if self.current_weight_file_target == self.target_weight_map[weight_name]:\n", "            return\n", "        else:\n", "            file_name = self.target_weight_map[weight_name]\n", "            self.target_weights = safe_open(f\"{self.merge_target_model_path}/{file_name}\", framework=\"pt\")\n", "            self.current_weight_file_target = file_name\n", "\n",
"    def layer_weight_iter(self):\n", "\n", "        for i in range(self.n_layers):\n", "            target_layer_weights = {}\n", "            vector_layer_weights = {}\n", "\n", "            vector_layer_index = self.apply_layer_map[str(i)]\n", "\n", "            for t in self.layer_weight_templates:\n", "                vector_weight_name = t.substitute(i=vector_layer_index)\n", "                target_weight_name = t.substitute(i=i)\n", "                self.load_vector_weight(vector_weight_name)\n", "                self.load_target_weight(target_weight_name)\n", "                \n", "                vector_layer_weights[vector_weight_name] = self.vector_weights.get_tensor(vector_weight_name)\n", "                target_layer_weights[target_weight_name] = self.target_weights.get_tensor(target_weight_name)\n", "\n", "                yield i, target_weight_name, vector_layer_weights[vector_weight_name], target_layer_weights[target_weight_name]\n", "\n",
"    def get_weight_byte_size(self, weight):\n", "\n", "        if isinstance(weight, torch.Tensor):\n", "            weight_byte_size = weight.nelement() * weight.element_size()\n", "        else:\n", "            weight_byte_size = sum(p.nelement() * p.element_size() for p in weight.parameters())\n", "\n", "        return weight_byte_size\n", "\n", "\n",
"    def save_weights_split(self):\n", "        if len(self.save_weights.keys()) == 0:\n", "            return \n", "\n", "        file_name = f\"{self.save_merged_model_path}/model-{self.save_counter:05}.safetensors\"\n", "\n", "        for weight_name in self.save_weights.keys():\n", "            self.save_index_weights[weight_name] = file_name.split(\"/\")[-1]\n", "\n", "        save_file(self.save_weights, file_name, metadata={\"format\":\"pt\"})\n", "        self.save_size = 0\n", "        self.save_counter += 1\n", "        self.save_weights = {}\n", "        print(f\"save: {file_name}\")\n", "\n",
"    def push_weight(self, weight_name, weight):\n", "        weight_size = self.get_weight_byte_size(weight)\n", "        self.save_weights[weight_name] = weight\n", "        self.save_size += weight_size\n", "        self.save_total_size += weight_size\n", "\n", "        print(f\"vector: {weight_name} {weight_size}\")\n", "        if self.save_size > self.save_byte_size:\n", "            self.save_weights_split()\n", "        \n",
"    def save_weight_map(self):\n", "        new_weight_map = {\n", "            \"metadata\": {\n", "                \"total_size\": self.save_total_size\n", "            },\n", "            \"weight_map\": self.save_index_weights\n", "        }\n", "        with open(f\"{self.save_merged_model_path}/model.safetensors.index.json\", \"w\") as f:\n", "            json.dump(new_weight_map, f, indent=4)\n", "\n", "        print(\"make model.safetensors.index.json\")\n", "\n",
"    def save_config(self):\n", "        with open(f\"{self.save_merged_model_path}/config.json\", \"w\") as f:\n", "            json.dump(self.config_target, f, indent=4)\n", "        \n",
"    def save_tokenizer(self):\n", "        tokenizer = AutoTokenizer.from_pretrained(self.merge_target_model_path)\n", "        tokenizer.save_pretrained(self.save_merged_model_path)\n", "\n",
"    def merge(self):\n", "\n", "        os.makedirs(self.save_merged_model_path, exist_ok=True)\n", "        self.save_tokenizer()\n", "        self.make_apply_layer_map()\n", "\n", "        # weights whose names contain no digits (embeddings, lm_head, final norm)\n", "        for weight_name in [k for k in self.target_weight_map if not any(c.isdigit() for c in k)]:\n", "            self.load_target_weight(weight_name)\n", "            self.load_vector_weight(weight_name)\n", "            target_weight = self.target_weights.get_tensor(weight_name)\n", "            vector_weight = self.vector_weights.get_tensor(weight_name)\n", "            merge_weight = self.get_merge_weight(vector_weight, target_weight)\n", "            self.push_weight(weight_name, merge_weight)\n", "\n", "        # layers\n", "        for i, target_weight_name, vector_weight, target_weight in self.layer_weight_iter():\n", "            merge_weight = self.get_merge_weight(vector_weight, target_weight)\n", "            self.push_weight(target_weight_name, merge_weight)\n", "\n", "        self.save_weights_split()\n", "        self.save_weight_map()\n", "        self.save_config()\n", "        print(\"Done!\")\n" ] },
{ "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "config_target 80\n", "config_vector 32\n", "vector_to 24\n", "front 0 0\n", "front 1 
1\n", "front 2 2\n", "front 3 3\n", "front 4 4\n", "front 5 5\n", "front 6 6\n", "front 7 7\n", "expand 8 8\n", "expand 9 8\n", "expand 10 8\n", "expand 11 8\n", "expand 12 9\n", "expand 13 9\n", "expand 14 9\n", "expand 15 9\n", "expand 16 10\n", "expand 17 10\n", "expand 18 10\n", "expand 19 10\n", "expand 20 11\n", "expand 21 11\n", "expand 22 11\n", "expand 23 11\n", "expand 24 12\n", "expand 25 12\n", "expand 26 12\n", "expand 27 13\n", "expand 28 13\n", "expand 29 13\n", "expand 30 13\n", "expand 31 14\n", "expand 32 14\n", "expand 33 14\n", "expand 34 14\n", "expand 35 15\n", "expand 36 15\n", "expand 37 15\n", "expand 38 15\n", "expand 39 16\n", "expand 40 16\n", "expand 41 16\n", "expand 42 17\n", "expand 43 17\n", "expand 44 17\n", "expand 45 17\n", "expand 46 18\n", "expand 47 18\n", "expand 48 18\n", "expand 49 18\n", "expand 50 19\n", "expand 51 19\n", "expand 52 19\n", "expand 53 19\n", "expand 54 20\n", "expand 55 20\n", "expand 56 20\n", "expand 57 21\n", "expand 58 21\n", "expand 59 21\n", "expand 60 21\n", "expand 61 22\n", "expand 62 22\n", "expand 63 22\n", "expand 64 22\n", "expand 65 23\n", "expand 66 23\n", "expand 67 23\n", "expand 68 23\n", "expand 69 24\n", "expand 70 24\n", "expand 71 24\n", "expand 72 24\n", "later 73 25\n", "later 74 26\n", "later 75 27\n", "later 76 28\n", "later 77 29\n", "later 78 30\n", "later 79 31\n", "vector: lm_head.weight 2101346304\n", "vector: model.embed_tokens.weight 2101346304\n", "vector: model.norm.weight 16384\n", "vector: model.layers.0.input_layernorm.weight 16384\n", "vector: model.layers.0.mlp.down_proj.weight 469762048\n", "vector: model.layers.0.mlp.gate_proj.weight 469762048\n", "vector: model.layers.0.mlp.up_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00000.safetensors\n", "vector: model.layers.0.post_attention_layernorm.weight 16384\n", "vector: model.layers.0.self_attn.k_proj.weight 16777216\n", "vector: model.layers.0.self_attn.o_proj.weight 134217728\n", "vector: model.layers.0.self_attn.q_proj.weight 134217728\n", "vector: model.layers.0.self_attn.v_proj.weight 16777216\n", "vector: model.layers.1.input_layernorm.weight 16384\n", "vector: model.layers.1.mlp.down_proj.weight 469762048\n", "vector: model.layers.1.mlp.gate_proj.weight 469762048\n", "vector: model.layers.1.mlp.up_proj.weight 469762048\n", "vector: model.layers.1.post_attention_layernorm.weight 16384\n", "vector: model.layers.1.self_attn.k_proj.weight 16777216\n", "vector: model.layers.1.self_attn.o_proj.weight 134217728\n", "vector: model.layers.1.self_attn.q_proj.weight 134217728\n", "vector: model.layers.1.self_attn.v_proj.weight 16777216\n", "vector: model.layers.2.input_layernorm.weight 16384\n", "vector: model.layers.2.mlp.down_proj.weight 469762048\n", "vector: model.layers.2.mlp.gate_proj.weight 469762048\n", "vector: model.layers.2.mlp.up_proj.weight 469762048\n", "vector: model.layers.2.post_attention_layernorm.weight 16384\n", "vector: model.layers.2.self_attn.k_proj.weight 16777216\n", "vector: model.layers.2.self_attn.o_proj.weight 134217728\n", "vector: model.layers.2.self_attn.q_proj.weight 134217728\n", "vector: model.layers.2.self_attn.v_proj.weight 16777216\n", "vector: model.layers.3.input_layernorm.weight 16384\n", "vector: model.layers.3.mlp.down_proj.weight 469762048\n", "vector: model.layers.3.mlp.gate_proj.weight 469762048\n", "vector: model.layers.3.mlp.up_proj.weight 469762048\n", "vector: model.layers.3.post_attention_layernorm.weight 16384\n", "vector: 
model.layers.3.self_attn.k_proj.weight 16777216\n", "vector: model.layers.3.self_attn.o_proj.weight 134217728\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00001.safetensors\n", "vector: model.layers.3.self_attn.q_proj.weight 134217728\n", "vector: model.layers.3.self_attn.v_proj.weight 16777216\n", "vector: model.layers.4.input_layernorm.weight 16384\n", "vector: model.layers.4.mlp.down_proj.weight 469762048\n", "vector: model.layers.4.mlp.gate_proj.weight 469762048\n", "vector: model.layers.4.mlp.up_proj.weight 469762048\n", "vector: model.layers.4.post_attention_layernorm.weight 16384\n", "vector: model.layers.4.self_attn.k_proj.weight 16777216\n", "vector: model.layers.4.self_attn.o_proj.weight 134217728\n", "vector: model.layers.4.self_attn.q_proj.weight 134217728\n", "vector: model.layers.4.self_attn.v_proj.weight 16777216\n", "vector: model.layers.5.input_layernorm.weight 16384\n", "vector: model.layers.5.mlp.down_proj.weight 469762048\n", "vector: model.layers.5.mlp.gate_proj.weight 469762048\n", "vector: model.layers.5.mlp.up_proj.weight 469762048\n", "vector: model.layers.5.post_attention_layernorm.weight 16384\n", "vector: model.layers.5.self_attn.k_proj.weight 16777216\n", "vector: model.layers.5.self_attn.o_proj.weight 134217728\n", "vector: model.layers.5.self_attn.q_proj.weight 134217728\n", "vector: model.layers.5.self_attn.v_proj.weight 16777216\n", "vector: model.layers.6.input_layernorm.weight 16384\n", "vector: model.layers.6.mlp.down_proj.weight 469762048\n", "vector: model.layers.6.mlp.gate_proj.weight 469762048\n", "vector: model.layers.6.mlp.up_proj.weight 469762048\n", "vector: model.layers.6.post_attention_layernorm.weight 16384\n", "vector: model.layers.6.self_attn.k_proj.weight 16777216\n", "vector: model.layers.6.self_attn.o_proj.weight 134217728\n", "vector: model.layers.6.self_attn.q_proj.weight 134217728\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00002.safetensors\n", "vector: model.layers.6.self_attn.v_proj.weight 16777216\n", "vector: model.layers.7.input_layernorm.weight 16384\n", "vector: model.layers.7.mlp.down_proj.weight 469762048\n", "vector: model.layers.7.mlp.gate_proj.weight 469762048\n", "vector: model.layers.7.mlp.up_proj.weight 469762048\n", "vector: model.layers.7.post_attention_layernorm.weight 16384\n", "vector: model.layers.7.self_attn.k_proj.weight 16777216\n", "vector: model.layers.7.self_attn.o_proj.weight 134217728\n", "vector: model.layers.7.self_attn.q_proj.weight 134217728\n", "vector: model.layers.7.self_attn.v_proj.weight 16777216\n", "vector: model.layers.8.input_layernorm.weight 16384\n", "vector: model.layers.8.mlp.down_proj.weight 469762048\n", "vector: model.layers.8.mlp.gate_proj.weight 469762048\n", "vector: model.layers.8.mlp.up_proj.weight 469762048\n", "vector: model.layers.8.post_attention_layernorm.weight 16384\n", "vector: model.layers.8.self_attn.k_proj.weight 16777216\n", "vector: model.layers.8.self_attn.o_proj.weight 134217728\n", "vector: model.layers.8.self_attn.q_proj.weight 134217728\n", "vector: model.layers.8.self_attn.v_proj.weight 16777216\n", "vector: model.layers.9.input_layernorm.weight 16384\n", "vector: model.layers.9.mlp.down_proj.weight 469762048\n", "vector: model.layers.9.mlp.gate_proj.weight 469762048\n", "vector: model.layers.9.mlp.up_proj.weight 469762048\n", "vector: model.layers.9.post_attention_layernorm.weight 16384\n", "vector: model.layers.9.self_attn.k_proj.weight 16777216\n", "vector: model.layers.9.self_attn.o_proj.weight 134217728\n", 
"vector: model.layers.9.self_attn.q_proj.weight 134217728\n", "vector: model.layers.9.self_attn.v_proj.weight 16777216\n", "vector: model.layers.10.input_layernorm.weight 16384\n", "vector: model.layers.10.mlp.down_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00003.safetensors\n", "vector: model.layers.10.mlp.gate_proj.weight 469762048\n", "vector: model.layers.10.mlp.up_proj.weight 469762048\n", "vector: model.layers.10.post_attention_layernorm.weight 16384\n", "vector: model.layers.10.self_attn.k_proj.weight 16777216\n", "vector: model.layers.10.self_attn.o_proj.weight 134217728\n", "vector: model.layers.10.self_attn.q_proj.weight 134217728\n", "vector: model.layers.10.self_attn.v_proj.weight 16777216\n", "vector: model.layers.11.input_layernorm.weight 16384\n", "vector: model.layers.11.mlp.down_proj.weight 469762048\n", "vector: model.layers.11.mlp.gate_proj.weight 469762048\n", "vector: model.layers.11.mlp.up_proj.weight 469762048\n", "vector: model.layers.11.post_attention_layernorm.weight 16384\n", "vector: model.layers.11.self_attn.k_proj.weight 16777216\n", "vector: model.layers.11.self_attn.o_proj.weight 134217728\n", "vector: model.layers.11.self_attn.q_proj.weight 134217728\n", "vector: model.layers.11.self_attn.v_proj.weight 16777216\n", "vector: model.layers.12.input_layernorm.weight 16384\n", "vector: model.layers.12.mlp.down_proj.weight 469762048\n", "vector: model.layers.12.mlp.gate_proj.weight 469762048\n", "vector: model.layers.12.mlp.up_proj.weight 469762048\n", "vector: model.layers.12.post_attention_layernorm.weight 16384\n", "vector: model.layers.12.self_attn.k_proj.weight 16777216\n", "vector: model.layers.12.self_attn.o_proj.weight 134217728\n", "vector: model.layers.12.self_attn.q_proj.weight 134217728\n", "vector: model.layers.12.self_attn.v_proj.weight 16777216\n", "vector: model.layers.13.input_layernorm.weight 16384\n", "vector: model.layers.13.mlp.down_proj.weight 469762048\n", "vector: model.layers.13.mlp.gate_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00004.safetensors\n", "vector: model.layers.13.mlp.up_proj.weight 469762048\n", "vector: model.layers.13.post_attention_layernorm.weight 16384\n", "vector: model.layers.13.self_attn.k_proj.weight 16777216\n", "vector: model.layers.13.self_attn.o_proj.weight 134217728\n", "vector: model.layers.13.self_attn.q_proj.weight 134217728\n", "vector: model.layers.13.self_attn.v_proj.weight 16777216\n", "vector: model.layers.14.input_layernorm.weight 16384\n", "vector: model.layers.14.mlp.down_proj.weight 469762048\n", "vector: model.layers.14.mlp.gate_proj.weight 469762048\n", "vector: model.layers.14.mlp.up_proj.weight 469762048\n", "vector: model.layers.14.post_attention_layernorm.weight 16384\n", "vector: model.layers.14.self_attn.k_proj.weight 16777216\n", "vector: model.layers.14.self_attn.o_proj.weight 134217728\n", "vector: model.layers.14.self_attn.q_proj.weight 134217728\n", "vector: model.layers.14.self_attn.v_proj.weight 16777216\n", "vector: model.layers.15.input_layernorm.weight 16384\n", "vector: model.layers.15.mlp.down_proj.weight 469762048\n", "vector: model.layers.15.mlp.gate_proj.weight 469762048\n", "vector: model.layers.15.mlp.up_proj.weight 469762048\n", "vector: model.layers.15.post_attention_layernorm.weight 16384\n", "vector: model.layers.15.self_attn.k_proj.weight 16777216\n", "vector: model.layers.15.self_attn.o_proj.weight 134217728\n", "vector: model.layers.15.self_attn.q_proj.weight 134217728\n", 
"vector: model.layers.15.self_attn.v_proj.weight 16777216\n", "vector: model.layers.16.input_layernorm.weight 16384\n", "vector: model.layers.16.mlp.down_proj.weight 469762048\n", "vector: model.layers.16.mlp.gate_proj.weight 469762048\n", "vector: model.layers.16.mlp.up_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00005.safetensors\n", "vector: model.layers.16.post_attention_layernorm.weight 16384\n", "vector: model.layers.16.self_attn.k_proj.weight 16777216\n", "vector: model.layers.16.self_attn.o_proj.weight 134217728\n", "vector: model.layers.16.self_attn.q_proj.weight 134217728\n", "vector: model.layers.16.self_attn.v_proj.weight 16777216\n", "vector: model.layers.17.input_layernorm.weight 16384\n", "vector: model.layers.17.mlp.down_proj.weight 469762048\n", "vector: model.layers.17.mlp.gate_proj.weight 469762048\n", "vector: model.layers.17.mlp.up_proj.weight 469762048\n", "vector: model.layers.17.post_attention_layernorm.weight 16384\n", "vector: model.layers.17.self_attn.k_proj.weight 16777216\n", "vector: model.layers.17.self_attn.o_proj.weight 134217728\n", "vector: model.layers.17.self_attn.q_proj.weight 134217728\n", "vector: model.layers.17.self_attn.v_proj.weight 16777216\n", "vector: model.layers.18.input_layernorm.weight 16384\n", "vector: model.layers.18.mlp.down_proj.weight 469762048\n", "vector: model.layers.18.mlp.gate_proj.weight 469762048\n", "vector: model.layers.18.mlp.up_proj.weight 469762048\n", "vector: model.layers.18.post_attention_layernorm.weight 16384\n", "vector: model.layers.18.self_attn.k_proj.weight 16777216\n", "vector: model.layers.18.self_attn.o_proj.weight 134217728\n", "vector: model.layers.18.self_attn.q_proj.weight 134217728\n", "vector: model.layers.18.self_attn.v_proj.weight 16777216\n", "vector: model.layers.19.input_layernorm.weight 16384\n", "vector: model.layers.19.mlp.down_proj.weight 469762048\n", "vector: model.layers.19.mlp.gate_proj.weight 469762048\n", "vector: model.layers.19.mlp.up_proj.weight 469762048\n", "vector: model.layers.19.post_attention_layernorm.weight 16384\n", "vector: model.layers.19.self_attn.k_proj.weight 16777216\n", "vector: model.layers.19.self_attn.o_proj.weight 134217728\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00006.safetensors\n", "vector: model.layers.19.self_attn.q_proj.weight 134217728\n", "vector: model.layers.19.self_attn.v_proj.weight 16777216\n", "vector: model.layers.20.input_layernorm.weight 16384\n", "vector: model.layers.20.mlp.down_proj.weight 469762048\n", "vector: model.layers.20.mlp.gate_proj.weight 469762048\n", "vector: model.layers.20.mlp.up_proj.weight 469762048\n", "vector: model.layers.20.post_attention_layernorm.weight 16384\n", "vector: model.layers.20.self_attn.k_proj.weight 16777216\n", "vector: model.layers.20.self_attn.o_proj.weight 134217728\n", "vector: model.layers.20.self_attn.q_proj.weight 134217728\n", "vector: model.layers.20.self_attn.v_proj.weight 16777216\n", "vector: model.layers.21.input_layernorm.weight 16384\n", "vector: model.layers.21.mlp.down_proj.weight 469762048\n", "vector: model.layers.21.mlp.gate_proj.weight 469762048\n", "vector: model.layers.21.mlp.up_proj.weight 469762048\n", "vector: model.layers.21.post_attention_layernorm.weight 16384\n", "vector: model.layers.21.self_attn.k_proj.weight 16777216\n", "vector: model.layers.21.self_attn.o_proj.weight 134217728\n", "vector: model.layers.21.self_attn.q_proj.weight 134217728\n", "vector: model.layers.21.self_attn.v_proj.weight 16777216\n", 
"vector: model.layers.22.input_layernorm.weight 16384\n", "vector: model.layers.22.mlp.down_proj.weight 469762048\n", "vector: model.layers.22.mlp.gate_proj.weight 469762048\n", "vector: model.layers.22.mlp.up_proj.weight 469762048\n", "vector: model.layers.22.post_attention_layernorm.weight 16384\n", "vector: model.layers.22.self_attn.k_proj.weight 16777216\n", "vector: model.layers.22.self_attn.o_proj.weight 134217728\n", "vector: model.layers.22.self_attn.q_proj.weight 134217728\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00007.safetensors\n", "vector: model.layers.22.self_attn.v_proj.weight 16777216\n", "vector: model.layers.23.input_layernorm.weight 16384\n", "vector: model.layers.23.mlp.down_proj.weight 469762048\n", "vector: model.layers.23.mlp.gate_proj.weight 469762048\n", "vector: model.layers.23.mlp.up_proj.weight 469762048\n", "vector: model.layers.23.post_attention_layernorm.weight 16384\n", "vector: model.layers.23.self_attn.k_proj.weight 16777216\n", "vector: model.layers.23.self_attn.o_proj.weight 134217728\n", "vector: model.layers.23.self_attn.q_proj.weight 134217728\n", "vector: model.layers.23.self_attn.v_proj.weight 16777216\n", "vector: model.layers.24.input_layernorm.weight 16384\n", "vector: model.layers.24.mlp.down_proj.weight 469762048\n", "vector: model.layers.24.mlp.gate_proj.weight 469762048\n", "vector: model.layers.24.mlp.up_proj.weight 469762048\n", "vector: model.layers.24.post_attention_layernorm.weight 16384\n", "vector: model.layers.24.self_attn.k_proj.weight 16777216\n", "vector: model.layers.24.self_attn.o_proj.weight 134217728\n", "vector: model.layers.24.self_attn.q_proj.weight 134217728\n", "vector: model.layers.24.self_attn.v_proj.weight 16777216\n", "vector: model.layers.25.input_layernorm.weight 16384\n", "vector: model.layers.25.mlp.down_proj.weight 469762048\n", "vector: model.layers.25.mlp.gate_proj.weight 469762048\n", "vector: model.layers.25.mlp.up_proj.weight 469762048\n", "vector: model.layers.25.post_attention_layernorm.weight 16384\n", "vector: model.layers.25.self_attn.k_proj.weight 16777216\n", "vector: model.layers.25.self_attn.o_proj.weight 134217728\n", "vector: model.layers.25.self_attn.q_proj.weight 134217728\n", "vector: model.layers.25.self_attn.v_proj.weight 16777216\n", "vector: model.layers.26.input_layernorm.weight 16384\n", "vector: model.layers.26.mlp.down_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00008.safetensors\n", "vector: model.layers.26.mlp.gate_proj.weight 469762048\n", "vector: model.layers.26.mlp.up_proj.weight 469762048\n", "vector: model.layers.26.post_attention_layernorm.weight 16384\n", "vector: model.layers.26.self_attn.k_proj.weight 16777216\n", "vector: model.layers.26.self_attn.o_proj.weight 134217728\n", "vector: model.layers.26.self_attn.q_proj.weight 134217728\n", "vector: model.layers.26.self_attn.v_proj.weight 16777216\n", "vector: model.layers.27.input_layernorm.weight 16384\n", "vector: model.layers.27.mlp.down_proj.weight 469762048\n", "vector: model.layers.27.mlp.gate_proj.weight 469762048\n", "vector: model.layers.27.mlp.up_proj.weight 469762048\n", "vector: model.layers.27.post_attention_layernorm.weight 16384\n", "vector: model.layers.27.self_attn.k_proj.weight 16777216\n", "vector: model.layers.27.self_attn.o_proj.weight 134217728\n", "vector: model.layers.27.self_attn.q_proj.weight 134217728\n", "vector: model.layers.27.self_attn.v_proj.weight 16777216\n", "vector: model.layers.28.input_layernorm.weight 16384\n", 
"vector: model.layers.28.mlp.down_proj.weight 469762048\n", "vector: model.layers.28.mlp.gate_proj.weight 469762048\n", "vector: model.layers.28.mlp.up_proj.weight 469762048\n", "vector: model.layers.28.post_attention_layernorm.weight 16384\n", "vector: model.layers.28.self_attn.k_proj.weight 16777216\n", "vector: model.layers.28.self_attn.o_proj.weight 134217728\n", "vector: model.layers.28.self_attn.q_proj.weight 134217728\n", "vector: model.layers.28.self_attn.v_proj.weight 16777216\n", "vector: model.layers.29.input_layernorm.weight 16384\n", "vector: model.layers.29.mlp.down_proj.weight 469762048\n", "vector: model.layers.29.mlp.gate_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00009.safetensors\n", "vector: model.layers.29.mlp.up_proj.weight 469762048\n", "vector: model.layers.29.post_attention_layernorm.weight 16384\n", "vector: model.layers.29.self_attn.k_proj.weight 16777216\n", "vector: model.layers.29.self_attn.o_proj.weight 134217728\n", "vector: model.layers.29.self_attn.q_proj.weight 134217728\n", "vector: model.layers.29.self_attn.v_proj.weight 16777216\n", "vector: model.layers.30.input_layernorm.weight 16384\n", "vector: model.layers.30.mlp.down_proj.weight 469762048\n", "vector: model.layers.30.mlp.gate_proj.weight 469762048\n", "vector: model.layers.30.mlp.up_proj.weight 469762048\n", "vector: model.layers.30.post_attention_layernorm.weight 16384\n", "vector: model.layers.30.self_attn.k_proj.weight 16777216\n", "vector: model.layers.30.self_attn.o_proj.weight 134217728\n", "vector: model.layers.30.self_attn.q_proj.weight 134217728\n", "vector: model.layers.30.self_attn.v_proj.weight 16777216\n", "vector: model.layers.31.input_layernorm.weight 16384\n", "vector: model.layers.31.mlp.down_proj.weight 469762048\n", "vector: model.layers.31.mlp.gate_proj.weight 469762048\n", "vector: model.layers.31.mlp.up_proj.weight 469762048\n", "vector: model.layers.31.post_attention_layernorm.weight 16384\n", "vector: model.layers.31.self_attn.k_proj.weight 16777216\n", "vector: model.layers.31.self_attn.o_proj.weight 134217728\n", "vector: model.layers.31.self_attn.q_proj.weight 134217728\n", "vector: model.layers.31.self_attn.v_proj.weight 16777216\n", "vector: model.layers.32.input_layernorm.weight 16384\n", "vector: model.layers.32.mlp.down_proj.weight 469762048\n", "vector: model.layers.32.mlp.gate_proj.weight 469762048\n", "vector: model.layers.32.mlp.up_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00010.safetensors\n", "vector: model.layers.32.post_attention_layernorm.weight 16384\n", "vector: model.layers.32.self_attn.k_proj.weight 16777216\n", "vector: model.layers.32.self_attn.o_proj.weight 134217728\n", "vector: model.layers.32.self_attn.q_proj.weight 134217728\n", "vector: model.layers.32.self_attn.v_proj.weight 16777216\n", "vector: model.layers.33.input_layernorm.weight 16384\n", "vector: model.layers.33.mlp.down_proj.weight 469762048\n", "vector: model.layers.33.mlp.gate_proj.weight 469762048\n", "vector: model.layers.33.mlp.up_proj.weight 469762048\n", "vector: model.layers.33.post_attention_layernorm.weight 16384\n", "vector: model.layers.33.self_attn.k_proj.weight 16777216\n", "vector: model.layers.33.self_attn.o_proj.weight 134217728\n", "vector: model.layers.33.self_attn.q_proj.weight 134217728\n", "vector: model.layers.33.self_attn.v_proj.weight 16777216\n", "vector: model.layers.34.input_layernorm.weight 16384\n", "vector: model.layers.34.mlp.down_proj.weight 469762048\n", 
"vector: model.layers.34.mlp.gate_proj.weight 469762048\n", "vector: model.layers.34.mlp.up_proj.weight 469762048\n", "vector: model.layers.34.post_attention_layernorm.weight 16384\n", "vector: model.layers.34.self_attn.k_proj.weight 16777216\n", "vector: model.layers.34.self_attn.o_proj.weight 134217728\n", "vector: model.layers.34.self_attn.q_proj.weight 134217728\n", "vector: model.layers.34.self_attn.v_proj.weight 16777216\n", "vector: model.layers.35.input_layernorm.weight 16384\n", "vector: model.layers.35.mlp.down_proj.weight 469762048\n", "vector: model.layers.35.mlp.gate_proj.weight 469762048\n", "vector: model.layers.35.mlp.up_proj.weight 469762048\n", "vector: model.layers.35.post_attention_layernorm.weight 16384\n", "vector: model.layers.35.self_attn.k_proj.weight 16777216\n", "vector: model.layers.35.self_attn.o_proj.weight 134217728\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00011.safetensors\n", "vector: model.layers.35.self_attn.q_proj.weight 134217728\n", "vector: model.layers.35.self_attn.v_proj.weight 16777216\n", "vector: model.layers.36.input_layernorm.weight 16384\n", "vector: model.layers.36.mlp.down_proj.weight 469762048\n", "vector: model.layers.36.mlp.gate_proj.weight 469762048\n", "vector: model.layers.36.mlp.up_proj.weight 469762048\n", "vector: model.layers.36.post_attention_layernorm.weight 16384\n", "vector: model.layers.36.self_attn.k_proj.weight 16777216\n", "vector: model.layers.36.self_attn.o_proj.weight 134217728\n", "vector: model.layers.36.self_attn.q_proj.weight 134217728\n", "vector: model.layers.36.self_attn.v_proj.weight 16777216\n", "vector: model.layers.37.input_layernorm.weight 16384\n", "vector: model.layers.37.mlp.down_proj.weight 469762048\n", "vector: model.layers.37.mlp.gate_proj.weight 469762048\n", "vector: model.layers.37.mlp.up_proj.weight 469762048\n", "vector: model.layers.37.post_attention_layernorm.weight 16384\n", "vector: model.layers.37.self_attn.k_proj.weight 16777216\n", "vector: model.layers.37.self_attn.o_proj.weight 134217728\n", "vector: model.layers.37.self_attn.q_proj.weight 134217728\n", "vector: model.layers.37.self_attn.v_proj.weight 16777216\n", "vector: model.layers.38.input_layernorm.weight 16384\n", "vector: model.layers.38.mlp.down_proj.weight 469762048\n", "vector: model.layers.38.mlp.gate_proj.weight 469762048\n", "vector: model.layers.38.mlp.up_proj.weight 469762048\n", "vector: model.layers.38.post_attention_layernorm.weight 16384\n", "vector: model.layers.38.self_attn.k_proj.weight 16777216\n", "vector: model.layers.38.self_attn.o_proj.weight 134217728\n", "vector: model.layers.38.self_attn.q_proj.weight 134217728\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00012.safetensors\n", "vector: model.layers.38.self_attn.v_proj.weight 16777216\n", "vector: model.layers.39.input_layernorm.weight 16384\n", "vector: model.layers.39.mlp.down_proj.weight 469762048\n", "vector: model.layers.39.mlp.gate_proj.weight 469762048\n", "vector: model.layers.39.mlp.up_proj.weight 469762048\n", "vector: model.layers.39.post_attention_layernorm.weight 16384\n", "vector: model.layers.39.self_attn.k_proj.weight 16777216\n", "vector: model.layers.39.self_attn.o_proj.weight 134217728\n", "vector: model.layers.39.self_attn.q_proj.weight 134217728\n", "vector: model.layers.39.self_attn.v_proj.weight 16777216\n", "vector: model.layers.40.input_layernorm.weight 16384\n", "vector: model.layers.40.mlp.down_proj.weight 469762048\n", "vector: model.layers.40.mlp.gate_proj.weight 469762048\n", 
"vector: model.layers.40.mlp.up_proj.weight 469762048\n", "vector: model.layers.40.post_attention_layernorm.weight 16384\n", "vector: model.layers.40.self_attn.k_proj.weight 16777216\n", "vector: model.layers.40.self_attn.o_proj.weight 134217728\n", "vector: model.layers.40.self_attn.q_proj.weight 134217728\n", "vector: model.layers.40.self_attn.v_proj.weight 16777216\n", "vector: model.layers.41.input_layernorm.weight 16384\n", "vector: model.layers.41.mlp.down_proj.weight 469762048\n", "vector: model.layers.41.mlp.gate_proj.weight 469762048\n", "vector: model.layers.41.mlp.up_proj.weight 469762048\n", "vector: model.layers.41.post_attention_layernorm.weight 16384\n", "vector: model.layers.41.self_attn.k_proj.weight 16777216\n", "vector: model.layers.41.self_attn.o_proj.weight 134217728\n", "vector: model.layers.41.self_attn.q_proj.weight 134217728\n", "vector: model.layers.41.self_attn.v_proj.weight 16777216\n", "vector: model.layers.42.input_layernorm.weight 16384\n", "vector: model.layers.42.mlp.down_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00013.safetensors\n", "vector: model.layers.42.mlp.gate_proj.weight 469762048\n", "vector: model.layers.42.mlp.up_proj.weight 469762048\n", "vector: model.layers.42.post_attention_layernorm.weight 16384\n", "vector: model.layers.42.self_attn.k_proj.weight 16777216\n", "vector: model.layers.42.self_attn.o_proj.weight 134217728\n", "vector: model.layers.42.self_attn.q_proj.weight 134217728\n", "vector: model.layers.42.self_attn.v_proj.weight 16777216\n", "vector: model.layers.43.input_layernorm.weight 16384\n", "vector: model.layers.43.mlp.down_proj.weight 469762048\n", "vector: model.layers.43.mlp.gate_proj.weight 469762048\n", "vector: model.layers.43.mlp.up_proj.weight 469762048\n", "vector: model.layers.43.post_attention_layernorm.weight 16384\n", "vector: model.layers.43.self_attn.k_proj.weight 16777216\n", "vector: model.layers.43.self_attn.o_proj.weight 134217728\n", "vector: model.layers.43.self_attn.q_proj.weight 134217728\n", "vector: model.layers.43.self_attn.v_proj.weight 16777216\n", "vector: model.layers.44.input_layernorm.weight 16384\n", "vector: model.layers.44.mlp.down_proj.weight 469762048\n", "vector: model.layers.44.mlp.gate_proj.weight 469762048\n", "vector: model.layers.44.mlp.up_proj.weight 469762048\n", "vector: model.layers.44.post_attention_layernorm.weight 16384\n", "vector: model.layers.44.self_attn.k_proj.weight 16777216\n", "vector: model.layers.44.self_attn.o_proj.weight 134217728\n", "vector: model.layers.44.self_attn.q_proj.weight 134217728\n", "vector: model.layers.44.self_attn.v_proj.weight 16777216\n", "vector: model.layers.45.input_layernorm.weight 16384\n", "vector: model.layers.45.mlp.down_proj.weight 469762048\n", "vector: model.layers.45.mlp.gate_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00014.safetensors\n", "vector: model.layers.45.mlp.up_proj.weight 469762048\n", "vector: model.layers.45.post_attention_layernorm.weight 16384\n", "vector: model.layers.45.self_attn.k_proj.weight 16777216\n", "vector: model.layers.45.self_attn.o_proj.weight 134217728\n", "vector: model.layers.45.self_attn.q_proj.weight 134217728\n", "vector: model.layers.45.self_attn.v_proj.weight 16777216\n", "vector: model.layers.46.input_layernorm.weight 16384\n", "vector: model.layers.46.mlp.down_proj.weight 469762048\n", "vector: model.layers.46.mlp.gate_proj.weight 469762048\n", "vector: model.layers.46.mlp.up_proj.weight 469762048\n", 
"vector: model.layers.46.post_attention_layernorm.weight 16384\n", "vector: model.layers.46.self_attn.k_proj.weight 16777216\n", "vector: model.layers.46.self_attn.o_proj.weight 134217728\n", "vector: model.layers.46.self_attn.q_proj.weight 134217728\n", "vector: model.layers.46.self_attn.v_proj.weight 16777216\n", "vector: model.layers.47.input_layernorm.weight 16384\n", "vector: model.layers.47.mlp.down_proj.weight 469762048\n", "vector: model.layers.47.mlp.gate_proj.weight 469762048\n", "vector: model.layers.47.mlp.up_proj.weight 469762048\n", "vector: model.layers.47.post_attention_layernorm.weight 16384\n", "vector: model.layers.47.self_attn.k_proj.weight 16777216\n", "vector: model.layers.47.self_attn.o_proj.weight 134217728\n", "vector: model.layers.47.self_attn.q_proj.weight 134217728\n", "vector: model.layers.47.self_attn.v_proj.weight 16777216\n", "vector: model.layers.48.input_layernorm.weight 16384\n", "vector: model.layers.48.mlp.down_proj.weight 469762048\n", "vector: model.layers.48.mlp.gate_proj.weight 469762048\n", "vector: model.layers.48.mlp.up_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00015.safetensors\n", "vector: model.layers.48.post_attention_layernorm.weight 16384\n", "vector: model.layers.48.self_attn.k_proj.weight 16777216\n", "vector: model.layers.48.self_attn.o_proj.weight 134217728\n", "vector: model.layers.48.self_attn.q_proj.weight 134217728\n", "vector: model.layers.48.self_attn.v_proj.weight 16777216\n", "vector: model.layers.49.input_layernorm.weight 16384\n", "vector: model.layers.49.mlp.down_proj.weight 469762048\n", "vector: model.layers.49.mlp.gate_proj.weight 469762048\n", "vector: model.layers.49.mlp.up_proj.weight 469762048\n", "vector: model.layers.49.post_attention_layernorm.weight 16384\n", "vector: model.layers.49.self_attn.k_proj.weight 16777216\n", "vector: model.layers.49.self_attn.o_proj.weight 134217728\n", "vector: model.layers.49.self_attn.q_proj.weight 134217728\n", "vector: model.layers.49.self_attn.v_proj.weight 16777216\n", "vector: model.layers.50.input_layernorm.weight 16384\n", "vector: model.layers.50.mlp.down_proj.weight 469762048\n", "vector: model.layers.50.mlp.gate_proj.weight 469762048\n", "vector: model.layers.50.mlp.up_proj.weight 469762048\n", "vector: model.layers.50.post_attention_layernorm.weight 16384\n", "vector: model.layers.50.self_attn.k_proj.weight 16777216\n", "vector: model.layers.50.self_attn.o_proj.weight 134217728\n", "vector: model.layers.50.self_attn.q_proj.weight 134217728\n", "vector: model.layers.50.self_attn.v_proj.weight 16777216\n", "vector: model.layers.51.input_layernorm.weight 16384\n", "vector: model.layers.51.mlp.down_proj.weight 469762048\n", "vector: model.layers.51.mlp.gate_proj.weight 469762048\n", "vector: model.layers.51.mlp.up_proj.weight 469762048\n", "vector: model.layers.51.post_attention_layernorm.weight 16384\n", "vector: model.layers.51.self_attn.k_proj.weight 16777216\n", "vector: model.layers.51.self_attn.o_proj.weight 134217728\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00016.safetensors\n", "vector: model.layers.51.self_attn.q_proj.weight 134217728\n", "vector: model.layers.51.self_attn.v_proj.weight 16777216\n", "vector: model.layers.52.input_layernorm.weight 16384\n", "vector: model.layers.52.mlp.down_proj.weight 469762048\n", "vector: model.layers.52.mlp.gate_proj.weight 469762048\n", "vector: model.layers.52.mlp.up_proj.weight 469762048\n", "vector: model.layers.52.post_attention_layernorm.weight 
16384\n", "vector: model.layers.52.self_attn.k_proj.weight 16777216\n", "vector: model.layers.52.self_attn.o_proj.weight 134217728\n", "vector: model.layers.52.self_attn.q_proj.weight 134217728\n", "vector: model.layers.52.self_attn.v_proj.weight 16777216\n", "vector: model.layers.53.input_layernorm.weight 16384\n", "vector: model.layers.53.mlp.down_proj.weight 469762048\n", "vector: model.layers.53.mlp.gate_proj.weight 469762048\n", "vector: model.layers.53.mlp.up_proj.weight 469762048\n", "vector: model.layers.53.post_attention_layernorm.weight 16384\n", "vector: model.layers.53.self_attn.k_proj.weight 16777216\n", "vector: model.layers.53.self_attn.o_proj.weight 134217728\n", "vector: model.layers.53.self_attn.q_proj.weight 134217728\n", "vector: model.layers.53.self_attn.v_proj.weight 16777216\n", "vector: model.layers.54.input_layernorm.weight 16384\n", "vector: model.layers.54.mlp.down_proj.weight 469762048\n", "vector: model.layers.54.mlp.gate_proj.weight 469762048\n", "vector: model.layers.54.mlp.up_proj.weight 469762048\n", "vector: model.layers.54.post_attention_layernorm.weight 16384\n", "vector: model.layers.54.self_attn.k_proj.weight 16777216\n", "vector: model.layers.54.self_attn.o_proj.weight 134217728\n", "vector: model.layers.54.self_attn.q_proj.weight 134217728\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00017.safetensors\n", "vector: model.layers.54.self_attn.v_proj.weight 16777216\n", "vector: model.layers.55.input_layernorm.weight 16384\n", "vector: model.layers.55.mlp.down_proj.weight 469762048\n", "vector: model.layers.55.mlp.gate_proj.weight 469762048\n", "vector: model.layers.55.mlp.up_proj.weight 469762048\n", "vector: model.layers.55.post_attention_layernorm.weight 16384\n", "vector: model.layers.55.self_attn.k_proj.weight 16777216\n", "vector: model.layers.55.self_attn.o_proj.weight 134217728\n", "vector: model.layers.55.self_attn.q_proj.weight 134217728\n", "vector: model.layers.55.self_attn.v_proj.weight 16777216\n", "vector: model.layers.56.input_layernorm.weight 16384\n", "vector: model.layers.56.mlp.down_proj.weight 469762048\n", "vector: model.layers.56.mlp.gate_proj.weight 469762048\n", "vector: model.layers.56.mlp.up_proj.weight 469762048\n", "vector: model.layers.56.post_attention_layernorm.weight 16384\n", "vector: model.layers.56.self_attn.k_proj.weight 16777216\n", "vector: model.layers.56.self_attn.o_proj.weight 134217728\n", "vector: model.layers.56.self_attn.q_proj.weight 134217728\n", "vector: model.layers.56.self_attn.v_proj.weight 16777216\n", "vector: model.layers.57.input_layernorm.weight 16384\n", "vector: model.layers.57.mlp.down_proj.weight 469762048\n", "vector: model.layers.57.mlp.gate_proj.weight 469762048\n", "vector: model.layers.57.mlp.up_proj.weight 469762048\n", "vector: model.layers.57.post_attention_layernorm.weight 16384\n", "vector: model.layers.57.self_attn.k_proj.weight 16777216\n", "vector: model.layers.57.self_attn.o_proj.weight 134217728\n", "vector: model.layers.57.self_attn.q_proj.weight 134217728\n", "vector: model.layers.57.self_attn.v_proj.weight 16777216\n", "vector: model.layers.58.input_layernorm.weight 16384\n", "vector: model.layers.58.mlp.down_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00018.safetensors\n", "vector: model.layers.58.mlp.gate_proj.weight 469762048\n", "vector: model.layers.58.mlp.up_proj.weight 469762048\n", "vector: model.layers.58.post_attention_layernorm.weight 16384\n", "vector: model.layers.58.self_attn.k_proj.weight 
16777216\n", "vector: model.layers.58.self_attn.o_proj.weight 134217728\n", "vector: model.layers.58.self_attn.q_proj.weight 134217728\n", "vector: model.layers.58.self_attn.v_proj.weight 16777216\n", "vector: model.layers.59.input_layernorm.weight 16384\n", "vector: model.layers.59.mlp.down_proj.weight 469762048\n", "vector: model.layers.59.mlp.gate_proj.weight 469762048\n", "vector: model.layers.59.mlp.up_proj.weight 469762048\n", "vector: model.layers.59.post_attention_layernorm.weight 16384\n", "vector: model.layers.59.self_attn.k_proj.weight 16777216\n", "vector: model.layers.59.self_attn.o_proj.weight 134217728\n", "vector: model.layers.59.self_attn.q_proj.weight 134217728\n", "vector: model.layers.59.self_attn.v_proj.weight 16777216\n", "vector: model.layers.60.input_layernorm.weight 16384\n", "vector: model.layers.60.mlp.down_proj.weight 469762048\n", "vector: model.layers.60.mlp.gate_proj.weight 469762048\n", "vector: model.layers.60.mlp.up_proj.weight 469762048\n", "vector: model.layers.60.post_attention_layernorm.weight 16384\n", "vector: model.layers.60.self_attn.k_proj.weight 16777216\n", "vector: model.layers.60.self_attn.o_proj.weight 134217728\n", "vector: model.layers.60.self_attn.q_proj.weight 134217728\n", "vector: model.layers.60.self_attn.v_proj.weight 16777216\n", "vector: model.layers.61.input_layernorm.weight 16384\n", "vector: model.layers.61.mlp.down_proj.weight 469762048\n", "vector: model.layers.61.mlp.gate_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00019.safetensors\n", "vector: model.layers.61.mlp.up_proj.weight 469762048\n", "vector: model.layers.61.post_attention_layernorm.weight 16384\n", "vector: model.layers.61.self_attn.k_proj.weight 16777216\n", "vector: model.layers.61.self_attn.o_proj.weight 134217728\n", "vector: model.layers.61.self_attn.q_proj.weight 134217728\n", "vector: model.layers.61.self_attn.v_proj.weight 16777216\n", "vector: model.layers.62.input_layernorm.weight 16384\n", "vector: model.layers.62.mlp.down_proj.weight 469762048\n", "vector: model.layers.62.mlp.gate_proj.weight 469762048\n", "vector: model.layers.62.mlp.up_proj.weight 469762048\n", "vector: model.layers.62.post_attention_layernorm.weight 16384\n", "vector: model.layers.62.self_attn.k_proj.weight 16777216\n", "vector: model.layers.62.self_attn.o_proj.weight 134217728\n", "vector: model.layers.62.self_attn.q_proj.weight 134217728\n", "vector: model.layers.62.self_attn.v_proj.weight 16777216\n", "vector: model.layers.63.input_layernorm.weight 16384\n", "vector: model.layers.63.mlp.down_proj.weight 469762048\n", "vector: model.layers.63.mlp.gate_proj.weight 469762048\n", "vector: model.layers.63.mlp.up_proj.weight 469762048\n", "vector: model.layers.63.post_attention_layernorm.weight 16384\n", "vector: model.layers.63.self_attn.k_proj.weight 16777216\n", "vector: model.layers.63.self_attn.o_proj.weight 134217728\n", "vector: model.layers.63.self_attn.q_proj.weight 134217728\n", "vector: model.layers.63.self_attn.v_proj.weight 16777216\n", "vector: model.layers.64.input_layernorm.weight 16384\n", "vector: model.layers.64.mlp.down_proj.weight 469762048\n", "vector: model.layers.64.mlp.gate_proj.weight 469762048\n", "vector: model.layers.64.mlp.up_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00020.safetensors\n", "vector: model.layers.64.post_attention_layernorm.weight 16384\n", "vector: model.layers.64.self_attn.k_proj.weight 16777216\n", "vector: model.layers.64.self_attn.o_proj.weight 
134217728\n", "vector: model.layers.64.self_attn.q_proj.weight 134217728\n", "vector: model.layers.64.self_attn.v_proj.weight 16777216\n", "vector: model.layers.65.input_layernorm.weight 16384\n", "vector: model.layers.65.mlp.down_proj.weight 469762048\n", "vector: model.layers.65.mlp.gate_proj.weight 469762048\n", "vector: model.layers.65.mlp.up_proj.weight 469762048\n", "vector: model.layers.65.post_attention_layernorm.weight 16384\n", "vector: model.layers.65.self_attn.k_proj.weight 16777216\n", "vector: model.layers.65.self_attn.o_proj.weight 134217728\n", "vector: model.layers.65.self_attn.q_proj.weight 134217728\n", "vector: model.layers.65.self_attn.v_proj.weight 16777216\n", "vector: model.layers.66.input_layernorm.weight 16384\n", "vector: model.layers.66.mlp.down_proj.weight 469762048\n", "vector: model.layers.66.mlp.gate_proj.weight 469762048\n", "vector: model.layers.66.mlp.up_proj.weight 469762048\n", "vector: model.layers.66.post_attention_layernorm.weight 16384\n", "vector: model.layers.66.self_attn.k_proj.weight 16777216\n", "vector: model.layers.66.self_attn.o_proj.weight 134217728\n", "vector: model.layers.66.self_attn.q_proj.weight 134217728\n", "vector: model.layers.66.self_attn.v_proj.weight 16777216\n", "vector: model.layers.67.input_layernorm.weight 16384\n", "vector: model.layers.67.mlp.down_proj.weight 469762048\n", "vector: model.layers.67.mlp.gate_proj.weight 469762048\n", "vector: model.layers.67.mlp.up_proj.weight 469762048\n", "vector: model.layers.67.post_attention_layernorm.weight 16384\n", "vector: model.layers.67.self_attn.k_proj.weight 16777216\n", "vector: model.layers.67.self_attn.o_proj.weight 134217728\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00021.safetensors\n", "vector: model.layers.67.self_attn.q_proj.weight 134217728\n", "vector: model.layers.67.self_attn.v_proj.weight 16777216\n", "vector: model.layers.68.input_layernorm.weight 16384\n", "vector: model.layers.68.mlp.down_proj.weight 469762048\n", "vector: model.layers.68.mlp.gate_proj.weight 469762048\n", "vector: model.layers.68.mlp.up_proj.weight 469762048\n", "vector: model.layers.68.post_attention_layernorm.weight 16384\n", "vector: model.layers.68.self_attn.k_proj.weight 16777216\n", "vector: model.layers.68.self_attn.o_proj.weight 134217728\n", "vector: model.layers.68.self_attn.q_proj.weight 134217728\n", "vector: model.layers.68.self_attn.v_proj.weight 16777216\n", "vector: model.layers.69.input_layernorm.weight 16384\n", "vector: model.layers.69.mlp.down_proj.weight 469762048\n", "vector: model.layers.69.mlp.gate_proj.weight 469762048\n", "vector: model.layers.69.mlp.up_proj.weight 469762048\n", "vector: model.layers.69.post_attention_layernorm.weight 16384\n", "vector: model.layers.69.self_attn.k_proj.weight 16777216\n", "vector: model.layers.69.self_attn.o_proj.weight 134217728\n", "vector: model.layers.69.self_attn.q_proj.weight 134217728\n", "vector: model.layers.69.self_attn.v_proj.weight 16777216\n", "vector: model.layers.70.input_layernorm.weight 16384\n", "vector: model.layers.70.mlp.down_proj.weight 469762048\n", "vector: model.layers.70.mlp.gate_proj.weight 469762048\n", "vector: model.layers.70.mlp.up_proj.weight 469762048\n", "vector: model.layers.70.post_attention_layernorm.weight 16384\n", "vector: model.layers.70.self_attn.k_proj.weight 16777216\n", "vector: model.layers.70.self_attn.o_proj.weight 134217728\n", "vector: model.layers.70.self_attn.q_proj.weight 134217728\n", "save: 
/home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00022.safetensors\n", "vector: model.layers.70.self_attn.v_proj.weight 16777216\n", "vector: model.layers.71.input_layernorm.weight 16384\n", "vector: model.layers.71.mlp.down_proj.weight 469762048\n", "vector: model.layers.71.mlp.gate_proj.weight 469762048\n", "vector: model.layers.71.mlp.up_proj.weight 469762048\n", "vector: model.layers.71.post_attention_layernorm.weight 16384\n", "vector: model.layers.71.self_attn.k_proj.weight 16777216\n", "vector: model.layers.71.self_attn.o_proj.weight 134217728\n", "vector: model.layers.71.self_attn.q_proj.weight 134217728\n", "vector: model.layers.71.self_attn.v_proj.weight 16777216\n", "vector: model.layers.72.input_layernorm.weight 16384\n", "vector: model.layers.72.mlp.down_proj.weight 469762048\n", "vector: model.layers.72.mlp.gate_proj.weight 469762048\n", "vector: model.layers.72.mlp.up_proj.weight 469762048\n", "vector: model.layers.72.post_attention_layernorm.weight 16384\n", "vector: model.layers.72.self_attn.k_proj.weight 16777216\n", "vector: model.layers.72.self_attn.o_proj.weight 134217728\n", "vector: model.layers.72.self_attn.q_proj.weight 134217728\n", "vector: model.layers.72.self_attn.v_proj.weight 16777216\n", "vector: model.layers.73.input_layernorm.weight 16384\n", "vector: model.layers.73.mlp.down_proj.weight 469762048\n", "vector: model.layers.73.mlp.gate_proj.weight 469762048\n", "vector: model.layers.73.mlp.up_proj.weight 469762048\n", "vector: model.layers.73.post_attention_layernorm.weight 16384\n", "vector: model.layers.73.self_attn.k_proj.weight 16777216\n", "vector: model.layers.73.self_attn.o_proj.weight 134217728\n", "vector: model.layers.73.self_attn.q_proj.weight 134217728\n", "vector: model.layers.73.self_attn.v_proj.weight 16777216\n", "vector: model.layers.74.input_layernorm.weight 16384\n", "vector: model.layers.74.mlp.down_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00023.safetensors\n", "vector: model.layers.74.mlp.gate_proj.weight 469762048\n", "vector: model.layers.74.mlp.up_proj.weight 469762048\n", "vector: model.layers.74.post_attention_layernorm.weight 16384\n", "vector: model.layers.74.self_attn.k_proj.weight 16777216\n", "vector: model.layers.74.self_attn.o_proj.weight 134217728\n", "vector: model.layers.74.self_attn.q_proj.weight 134217728\n", "vector: model.layers.74.self_attn.v_proj.weight 16777216\n", "vector: model.layers.75.input_layernorm.weight 16384\n", "vector: model.layers.75.mlp.down_proj.weight 469762048\n", "vector: model.layers.75.mlp.gate_proj.weight 469762048\n", "vector: model.layers.75.mlp.up_proj.weight 469762048\n", "vector: model.layers.75.post_attention_layernorm.weight 16384\n", "vector: model.layers.75.self_attn.k_proj.weight 16777216\n", "vector: model.layers.75.self_attn.o_proj.weight 134217728\n", "vector: model.layers.75.self_attn.q_proj.weight 134217728\n", "vector: model.layers.75.self_attn.v_proj.weight 16777216\n", "vector: model.layers.76.input_layernorm.weight 16384\n", "vector: model.layers.76.mlp.down_proj.weight 469762048\n", "vector: model.layers.76.mlp.gate_proj.weight 469762048\n", "vector: model.layers.76.mlp.up_proj.weight 469762048\n", "vector: model.layers.76.post_attention_layernorm.weight 16384\n", "vector: model.layers.76.self_attn.k_proj.weight 16777216\n", "vector: model.layers.76.self_attn.o_proj.weight 134217728\n", "vector: model.layers.76.self_attn.q_proj.weight 134217728\n", "vector: model.layers.76.self_attn.v_proj.weight 16777216\n", "vector: 
model.layers.77.input_layernorm.weight 16384\n", "vector: model.layers.77.mlp.down_proj.weight 469762048\n", "vector: model.layers.77.mlp.gate_proj.weight 469762048\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00024.safetensors\n", "vector: model.layers.77.mlp.up_proj.weight 469762048\n", "vector: model.layers.77.post_attention_layernorm.weight 16384\n", "vector: model.layers.77.self_attn.k_proj.weight 16777216\n", "vector: model.layers.77.self_attn.o_proj.weight 134217728\n", "vector: model.layers.77.self_attn.q_proj.weight 134217728\n", "vector: model.layers.77.self_attn.v_proj.weight 16777216\n", "vector: model.layers.78.input_layernorm.weight 16384\n", "vector: model.layers.78.mlp.down_proj.weight 469762048\n", "vector: model.layers.78.mlp.gate_proj.weight 469762048\n", "vector: model.layers.78.mlp.up_proj.weight 469762048\n", "vector: model.layers.78.post_attention_layernorm.weight 16384\n", "vector: model.layers.78.self_attn.k_proj.weight 16777216\n", "vector: model.layers.78.self_attn.o_proj.weight 134217728\n", "vector: model.layers.78.self_attn.q_proj.weight 134217728\n", "vector: model.layers.78.self_attn.v_proj.weight 16777216\n", "vector: model.layers.79.input_layernorm.weight 16384\n", "vector: model.layers.79.mlp.down_proj.weight 469762048\n", "vector: model.layers.79.mlp.gate_proj.weight 469762048\n", "vector: model.layers.79.mlp.up_proj.weight 469762048\n", "vector: model.layers.79.post_attention_layernorm.weight 16384\n", "vector: model.layers.79.self_attn.k_proj.weight 16777216\n", "vector: model.layers.79.self_attn.o_proj.weight 134217728\n", "vector: model.layers.79.self_attn.q_proj.weight 134217728\n", "vector: model.layers.79.self_attn.v_proj.weight 16777216\n", "save: /home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector/model-00025.safetensors\n", "make model.safetensors.index.json\n", "Done!\n" ] } ], "source": [ "# Merge the chat vector into the target model (the paths passed below are defined in the cells above)\n", "ecvm = ExpandChatVectorMerger(merge_target_model_path, vector_path, save_merged_model_path)\n", "ecvm.load_setitngs(layer_num_config_name=\"num_hidden_layers\")\n", "print(\"config_target\", ecvm.n_layers)\n", "print(\"config_vector\", ecvm.n_layers_vector)\n", "\n", "# Per-layer weight names to merge\n", "ecvm.add_layer_weight_template_name(\"model.layers.${i}.input_layernorm.weight\")\n", "ecvm.add_layer_weight_template_name(\"model.layers.${i}.mlp.down_proj.weight\")\n", "ecvm.add_layer_weight_template_name(\"model.layers.${i}.mlp.gate_proj.weight\")\n", "ecvm.add_layer_weight_template_name(\"model.layers.${i}.mlp.up_proj.weight\")\n", "ecvm.add_layer_weight_template_name(\"model.layers.${i}.post_attention_layernorm.weight\")\n", "ecvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.k_proj.weight\")\n", "ecvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.o_proj.weight\")\n", "ecvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.q_proj.weight\")\n", "ecvm.add_layer_weight_template_name(\"model.layers.${i}.self_attn.v_proj.weight\")\n", "\n", "# Run the merge and write the merged model shards\n", "ecvm.merge()\n", "\n" ] },
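{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional sanity check (a minimal sketch, assuming ecvm.merge() above wrote a sharded\n", "# safetensors checkpoint with a model.safetensors.index.json into ecvm.save_merged_model_path):\n", "# list the merged shards and their sizes before loading the model with transformers.\n", "import json, os\n", "\n", "merged_dir = ecvm.save_merged_model_path\n", "with open(os.path.join(merged_dir, \"model.safetensors.index.json\")) as f:\n", "    index = json.load(f)\n", "\n", "shards = sorted(set(index[\"weight_map\"].values()))\n", "print(\"total_size (bytes):\", index.get(\"metadata\", {}).get(\"total_size\"))\n", "print(\"num shards:\", len(shards))\n", "for shard in shards:\n", "    print(\" \", shard, os.path.getsize(os.path.join(merged_dir, shard)))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "\n", "model_id = ecvm.save_merged_model_path\n", "re_save_path = \"/home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector_re\"\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=\"cpu\")\n", "\n", "messages = [\n", " {\"role\": \"system\", \"content\": 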
\"日本語で返答してください。\"},\n", " {\"role\": \"user\", \"content\": \"東京のおすすめの観光スポットを教えて下さい\"},\n", "]\n", "prompt = tokenizer.apply_chat_template(\n", " messages, \n", " tokenize=False, \n", " add_generation_prompt=True\n", ")\n", "\n", "inputs = tokenizer([prompt], return_tensors=\"pt\")\n", "\n", "terminators = [\n", " tokenizer.eos_token_id,\n", " tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")\n", "]\n", "\n", "# model.save_pretrained(re_save_path)\n", "# tokenizer.save_pretrained(re_save_path)\n", "\n", "outputs = model.generate(**inputs, \n", " max_new_tokens=256,\n", " eos_token_id=terminators,\n", " do_sample=True,\n", " temperature=0.6,\n", " top_p=0.9,\n", " )\n", "\n", "print(tokenizer.decode(outputs[0]))\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/mmnga/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "/home/mmnga/.local/lib/python3.10/site-packages/torch/cuda/__init__.py:619: UserWarning: Can't initialize NVML\n", " warnings.warn(\"Can't initialize NVML\")\n", "Loading checkpoint shards: 100%|██████████| 30/30 [01:53<00:00, 3.77s/it]\n", "Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\n", "\n", "日本語で返答してください。<|eot_id|><|start_header_id|>user<|end_header_id|>\n", "\n", "東京のおすすめの観光スポットを教えて下さい<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n", "\n", "東京は観光スポットが非常に多く、どれを選ぶか迷ってしまうほどです!以下は、東京のおすすめの観光スポット10選です。\n", "\n", "1. **東京スカイツリー**:東京都心部にある高さ634mの超高層タワーの展望台から、東京のパノラマを眺めることができます。\n", "2. **浅草寺**:浅草区にある古い寺院で、雷門(浅草門)や仲見世通りが有名です。\n", "3. **渋谷スクランブルクロス**:渋谷区にある世界的に有名な交差点で、流行の最先端を感じることができます。\n", "4. **東京タワー**:港区にある高さ333mのタワーで、夜はライトアップされます。\n", "5. **新宿御苑**:新宿区にある大きな公園で、桜のシーズンには非常に人気があります。\n", "6. **築地市場**:中央区にある世界最大の魚市場で、寿司や海老の朝食を味わうことができます。\n", "7. 
**明治神\n" ] } ], "source": [ "import torch\n", "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "\n", "model_id = \"/home/mmnga/hdd/llm-data/Llama-3-70B-suzume-vector_re\"\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", "model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=\"cpu\")\n", "\n", "messages = [\n", " {\"role\": \"system\", \"content\": \"日本語で返答してください。\"},\n", " {\"role\": \"user\", \"content\": \"東京のおすすめの観光スポットを教えて下さい\"},\n", "]\n", "prompt = tokenizer.apply_chat_template(\n", " messages, \n", " tokenize=False, \n", " add_generation_prompt=True\n", ")\n", "\n", "inputs = tokenizer([prompt], return_tensors=\"pt\")\n", "\n", "terminators = [\n", " tokenizer.eos_token_id,\n", " tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")\n", "]\n", "\n", "outputs = model.generate(**inputs, \n", " max_new_tokens=256,\n", " eos_token_id=terminators,\n", " do_sample=True,\n", " temperature=0.6,\n", " top_p=0.9,\n", " )\n", "\n", "print(tokenizer.decode(outputs[0]))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# !pip install -Uqq huggingface-hub\n", "\n", "# !huggingface-cli login --token $HF_TOKEN\n", "\n", "# tokenizer.push_to_hub(\"Llama-3-70B-japanese-suzume-vector\", use_auth_token=True, private=True)\n", "# model.push_to_hub(\"Llama-3-70B-japanese-suzume-vector\", use_auth_token=True, private=True)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }