init

Browse files

Files changed (13) hide show

.gitattributes +36 -0
.gitignore +3 -0
README.md +11 -0
config.json +29 -0
data.txt +14 -0
finetune.ipynb +151 -0
generate_moe.ipynb +328 -0
generation_config.json +6 -0
model.safetensors +3 -0
model_original.safetensors +3 -0
special_tokens_map.json +23 -0
tokenizer.json +0 -0
tokenizer_config.json +32 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,36 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gguf filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@


1	+ .ipynb_checkpoints
2	+ wandb
3	+

README.md ADDED Viewed

	@@ -0,0 +1,11 @@

+---
+license: mit
+---
+# stories15M_MOE
+This model is [ModelCloud/tinyllama-15M-stories](https://huggingface.co/ModelCloud/tinyllama-15M-stories) with its feed-forward (MLP) weights repeated 4 times to make 4 experts.
+The model is meant for testing and is not intended for production use (unless your product is some kind of bedtime storyteller).
+The router weights are initialized randomly.

config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "architectures": [
+    "MixtralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 288,
+  "initializer_range": 0.02,
+  "intermediate_size": 768,
+  "max_position_embeddings": 256,
+  "model_type": "mixtral",
+  "num_attention_heads": 6,
+  "num_experts_per_tok": 2,
+  "num_hidden_layers": 6,
+  "num_key_value_heads": 6,
+  "num_local_experts": 4,
+  "output_router_logits": false,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 1000000.0,
+  "router_aux_loss_coef": 0.02,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.36.0.dev0",
+  "use_cache": true,
+  "vocab_size": 32000
+}

data.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+  From fairest creatures we desire increase,
+  That thereby beauty's rose might never die,
+  But as the riper should by time decease,
+  His tender heir might bear his memory:
+  But thou contracted to thine own bright eyes,
+  Feed'st thy light's flame with self-substantial fuel,
+  Making a famine where abundance lies,
+  Thy self thy foe, to thy sweet self too cruel:
+  Thou that art now the world's fresh ornament,
+  And only herald to the gaudy spring,
+  Within thine own bud buriest thy content,
+  And tender churl mak'st waste in niggarding:
+    Pity the world, or else this glutton be,
+    To eat the world's due, by the grave and thee.

finetune.ipynb ADDED Viewed

	@@ -0,0 +1,151 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a41f141c-b6a8-40d1-b72d-127d028c0592",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
+    "\n",
+    "model_path = os.getcwd()\n",
+    "print(model_path)\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_path, legacy=False)\n",
+    "model = AutoModelForCausalLM.from_pretrained(model_path, use_safetensors=True, local_files_only=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "93e9ec6a-4a57-484f-a1a5-ecb6674e8f77",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#inputs = tokenizer('', return_tensors=\"pt\")\n",
+    "#outputs = model.generate(inputs['input_ids'], max_new_tokens=20, temperature=0)\n",
+    "#print(tokenizer.decode(outputs[0], skip_special_tokens=True))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e570b6db-efa8-4c9f-ac71-573479b00711",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.gradient_checkpointing_enable()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9345e74b-5bef-4cc9-982e-342af69b290a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from peft import LoraConfig\n",
+    "\n",
+    "config = LoraConfig(\n",
+    "    r=32,\n",
+    "    lora_alpha=64,\n",
+    "    target_modules=[\n",
+    "        \"q_proj\",\n",
+    "        \"k_proj\",\n",
+    "        \"v_proj\",\n",
+    "        \"o_proj\",\n",
+    "        \"w1\",\n",
+    "        \"w2\",\n",
+    "        \"w3\",\n",
+    "        \"lm_head\",\n",
+    "    ],\n",
+    "    bias=\"none\",\n",
+    "    lora_dropout=0.05,  # Conventional\n",
+    "    task_type=\"CAUSAL_LM\",\n",
+    ")\n",
+    "\n",
+    "#print(model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "09dd4848-9c7a-4a3b-9887-59652c915cc3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import transformers\n",
+    "from datetime import datetime\n",
+    "\n",
+    "project = \"moe_shakespeare15M\"\n",
+    "run_name = project\n",
+    "output_dir = \"./\" + run_name\n",
+    "\n",
+    "with open(\"data.txt\", \"r\") as f:\n",
+    "    content = f.read()\n",
+    "    tokenized_train_dataset = [\n",
+    "        tokenizer(content)['input_ids']\n",
+    "    ]\n",
+    "\n",
+    "trainer = transformers.Trainer(\n",
+    "    model=model,\n",
+    "    train_dataset=tokenized_train_dataset,\n",
+    "    args=transformers.TrainingArguments(\n",
+    "        output_dir=output_dir,\n",
+    "        warmup_steps=10,\n",
+    "        per_device_train_batch_size=2,\n",
+    "        gradient_accumulation_steps=1,\n",
+    "        gradient_checkpointing=True,\n",
+    "        max_steps=300,\n",
+    "        learning_rate=2.5e-5, # Want a small lr for finetuning\n",
+    "        # fp16=True, \n",
+    "        optim=\"paged_adamw_8bit\",\n",
+    "        # logging_steps=25,              # Report the loss every 25 steps\n",
+    "        # logging_dir=\"./logs\",        # Directory for storing logs\n",
+    "        save_strategy=\"steps\",       # Save a checkpoint every save_steps steps\n",
+    "        save_steps=50,                # Save checkpoints every 50 steps\n",
+    "        # evaluation_strategy=\"steps\", # Evaluate the model every logging step\n",
+    "        # eval_steps=25,               # Evaluate every 25 steps\n",
+    "        # do_eval=True,                # Perform evaluation at the end of training\n",
+    "        report_to=\"none\",           # Disables all logging integrations; set to wandb to log to Weights & Biases\n",
+    "        run_name=f\"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}\"          # Name of the W&B run (optional)\n",
+    "    ),\n",
+    "    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),\n",
+    ")\n",
+    "\n",
+    "model.config.use_cache = False  # silence the warnings. Please re-enable for inference!\n",
+    "trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f0ad783-3f3e-4812-bc4e-026f9aad1435",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

generate_moe.ipynb ADDED Viewed

	@@ -0,0 +1,328 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "66851a9c-d852-4a25-8cc7-1b7c03d1b3c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from safetensors.torch import load_file\n",
+    "import torch\n",
+    "\n",
+    "model = load_file(\"model_original.safetensors\", device=\"cpu\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "6775e2ae-a543-401d-9f81-c450f3eb5910",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "model.embed_tokens.weight\n",
+      "model.layers.0.input_layernorm.weight\n",
+      "model.layers.0.mlp.down_proj.weight\n",
+      "model.layers.0.mlp.gate_proj.weight\n",
+      "model.layers.0.mlp.up_proj.weight\n",
+      "model.layers.0.post_attention_layernorm.weight\n",
+      "model.layers.0.self_attn.k_proj.weight\n",
+      "model.layers.0.self_attn.o_proj.weight\n",
+      "model.layers.0.self_attn.q_proj.weight\n",
+      "model.layers.0.self_attn.v_proj.weight\n",
+      "model.layers.1.input_layernorm.weight\n",
+      "model.layers.1.mlp.down_proj.weight\n",
+      "model.layers.1.mlp.gate_proj.weight\n",
+      "model.layers.1.mlp.up_proj.weight\n",
+      "model.layers.1.post_attention_layernorm.weight\n",
+      "model.layers.1.self_attn.k_proj.weight\n",
+      "model.layers.1.self_attn.o_proj.weight\n",
+      "model.layers.1.self_attn.q_proj.weight\n",
+      "model.layers.1.self_attn.v_proj.weight\n",
+      "model.layers.2.input_layernorm.weight\n",
+      "model.layers.2.mlp.down_proj.weight\n",
+      "model.layers.2.mlp.gate_proj.weight\n",
+      "model.layers.2.mlp.up_proj.weight\n",
+      "model.layers.2.post_attention_layernorm.weight\n",
+      "model.layers.2.self_attn.k_proj.weight\n",
+      "model.layers.2.self_attn.o_proj.weight\n",
+      "model.layers.2.self_attn.q_proj.weight\n",
+      "model.layers.2.self_attn.v_proj.weight\n",
+      "model.layers.3.input_layernorm.weight\n",
+      "model.layers.3.mlp.down_proj.weight\n",
+      "model.layers.3.mlp.gate_proj.weight\n",
+      "model.layers.3.mlp.up_proj.weight\n",
+      "model.layers.3.post_attention_layernorm.weight\n",
+      "model.layers.3.self_attn.k_proj.weight\n",
+      "model.layers.3.self_attn.o_proj.weight\n",
+      "model.layers.3.self_attn.q_proj.weight\n",
+      "model.layers.3.self_attn.v_proj.weight\n",
+      "model.layers.4.input_layernorm.weight\n",
+      "model.layers.4.mlp.down_proj.weight\n",
+      "model.layers.4.mlp.gate_proj.weight\n",
+      "model.layers.4.mlp.up_proj.weight\n",
+      "model.layers.4.post_attention_layernorm.weight\n",
+      "model.layers.4.self_attn.k_proj.weight\n",
+      "model.layers.4.self_attn.o_proj.weight\n",
+      "model.layers.4.self_attn.q_proj.weight\n",
+      "model.layers.4.self_attn.v_proj.weight\n",
+      "model.layers.5.input_layernorm.weight\n",
+      "model.layers.5.mlp.down_proj.weight\n",
+      "model.layers.5.mlp.gate_proj.weight\n",
+      "model.layers.5.mlp.up_proj.weight\n",
+      "model.layers.5.post_attention_layernorm.weight\n",
+      "model.layers.5.self_attn.k_proj.weight\n",
+      "model.layers.5.self_attn.o_proj.weight\n",
+      "model.layers.5.self_attn.q_proj.weight\n",
+      "model.layers.5.self_attn.v_proj.weight\n",
+      "model.norm.weight\n"
+     ]
+    }
+   ],
+   "source": [
+    "for name, tensor in model.items():\n",
+    "    print(name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "8b06f3c7-927d-4148-950c-5e1c93a54b75",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "model.embed_tokens.weight torch.Size([32000, 288])\n",
+      "model.norm.weight torch.Size([288])\n",
+      "lm_head.weight torch.Size([32000, 288])\n",
+      "model.layers.0.input_layernorm.weight torch.Size([288])\n",
+      "model.layers.0.post_attention_layernorm.weight torch.Size([288])\n",
+      "model.layers.0.self_attn.k_proj.weight torch.Size([288, 288])\n",
+      "model.layers.0.self_attn.o_proj.weight torch.Size([288, 288])\n",
+      "model.layers.0.self_attn.q_proj.weight torch.Size([288, 288])\n",
+      "model.layers.0.self_attn.v_proj.weight torch.Size([288, 288])\n",
+      "model.layers.0.block_sparse_moe.gate.weight torch.Size([4, 288])\n",
+      "model.layers.0.block_sparse_moe.experts.0.w1.weight torch.Size([768, 288])\n",
+      "model.layers.0.block_sparse_moe.experts.0.w2.weight torch.Size([288, 768])\n",
+      "model.layers.0.block_sparse_moe.experts.0.w3.weight torch.Size([768, 288])\n",
+      "model.layers.0.block_sparse_moe.experts.1.w1.weight torch.Size([768, 288])\n",
+      "model.layers.0.block_sparse_moe.experts.1.w2.weight torch.Size([288, 768])\n",
+      "model.layers.0.block_sparse_moe.experts.1.w3.weight torch.Size([768, 288])\n",
+      "model.layers.0.block_sparse_moe.experts.2.w1.weight torch.Size([768, 288])\n",
+      "model.layers.0.block_sparse_moe.experts.2.w2.weight torch.Size([288, 768])\n",
+      "model.layers.0.block_sparse_moe.experts.2.w3.weight torch.Size([768, 288])\n",
+      "model.layers.0.block_sparse_moe.experts.3.w1.weight torch.Size([768, 288])\n",
+      "model.layers.0.block_sparse_moe.experts.3.w2.weight torch.Size([288, 768])\n",
+      "model.layers.0.block_sparse_moe.experts.3.w3.weight torch.Size([768, 288])\n",
+      "model.layers.1.input_layernorm.weight torch.Size([288])\n",
+      "model.layers.1.post_attention_layernorm.weight torch.Size([288])\n",
+      "model.layers.1.self_attn.k_proj.weight torch.Size([288, 288])\n",
+      "model.layers.1.self_attn.o_proj.weight torch.Size([288, 288])\n",
+      "model.layers.1.self_attn.q_proj.weight torch.Size([288, 288])\n",
+      "model.layers.1.self_attn.v_proj.weight torch.Size([288, 288])\n",
+      "model.layers.1.block_sparse_moe.gate.weight torch.Size([4, 288])\n",
+      "model.layers.1.block_sparse_moe.experts.0.w1.weight torch.Size([768, 288])\n",
+      "model.layers.1.block_sparse_moe.experts.0.w2.weight torch.Size([288, 768])\n",
+      "model.layers.1.block_sparse_moe.experts.0.w3.weight torch.Size([768, 288])\n",
+      "model.layers.1.block_sparse_moe.experts.1.w1.weight torch.Size([768, 288])\n",
+      "model.layers.1.block_sparse_moe.experts.1.w2.weight torch.Size([288, 768])\n",
+      "model.layers.1.block_sparse_moe.experts.1.w3.weight torch.Size([768, 288])\n",
+      "model.layers.1.block_sparse_moe.experts.2.w1.weight torch.Size([768, 288])\n",
+      "model.layers.1.block_sparse_moe.experts.2.w2.weight torch.Size([288, 768])\n",
+      "model.layers.1.block_sparse_moe.experts.2.w3.weight torch.Size([768, 288])\n",
+      "model.layers.1.block_sparse_moe.experts.3.w1.weight torch.Size([768, 288])\n",
+      "model.layers.1.block_sparse_moe.experts.3.w2.weight torch.Size([288, 768])\n",
+      "model.layers.1.block_sparse_moe.experts.3.w3.weight torch.Size([768, 288])\n",
+      "model.layers.2.input_layernorm.weight torch.Size([288])\n",
+      "model.layers.2.post_attention_layernorm.weight torch.Size([288])\n",
+      "model.layers.2.self_attn.k_proj.weight torch.Size([288, 288])\n",
+      "model.layers.2.self_attn.o_proj.weight torch.Size([288, 288])\n",
+      "model.layers.2.self_attn.q_proj.weight torch.Size([288, 288])\n",
+      "model.layers.2.self_attn.v_proj.weight torch.Size([288, 288])\n",
+      "model.layers.2.block_sparse_moe.gate.weight torch.Size([4, 288])\n",
+      "model.layers.2.block_sparse_moe.experts.0.w1.weight torch.Size([768, 288])\n",
+      "model.layers.2.block_sparse_moe.experts.0.w2.weight torch.Size([288, 768])\n",
+      "model.layers.2.block_sparse_moe.experts.0.w3.weight torch.Size([768, 288])\n",
+      "model.layers.2.block_sparse_moe.experts.1.w1.weight torch.Size([768, 288])\n",
+      "model.layers.2.block_sparse_moe.experts.1.w2.weight torch.Size([288, 768])\n",
+      "model.layers.2.block_sparse_moe.experts.1.w3.weight torch.Size([768, 288])\n",
+      "model.layers.2.block_sparse_moe.experts.2.w1.weight torch.Size([768, 288])\n",
+      "model.layers.2.block_sparse_moe.experts.2.w2.weight torch.Size([288, 768])\n",
+      "model.layers.2.block_sparse_moe.experts.2.w3.weight torch.Size([768, 288])\n",
+      "model.layers.2.block_sparse_moe.experts.3.w1.weight torch.Size([768, 288])\n",
+      "model.layers.2.block_sparse_moe.experts.3.w2.weight torch.Size([288, 768])\n",
+      "model.layers.2.block_sparse_moe.experts.3.w3.weight torch.Size([768, 288])\n",
+      "model.layers.3.input_layernorm.weight torch.Size([288])\n",
+      "model.layers.3.post_attention_layernorm.weight torch.Size([288])\n",
+      "model.layers.3.self_attn.k_proj.weight torch.Size([288, 288])\n",
+      "model.layers.3.self_attn.o_proj.weight torch.Size([288, 288])\n",
+      "model.layers.3.self_attn.q_proj.weight torch.Size([288, 288])\n",
+      "model.layers.3.self_attn.v_proj.weight torch.Size([288, 288])\n",
+      "model.layers.3.block_sparse_moe.gate.weight torch.Size([4, 288])\n",
+      "model.layers.3.block_sparse_moe.experts.0.w1.weight torch.Size([768, 288])\n",
+      "model.layers.3.block_sparse_moe.experts.0.w2.weight torch.Size([288, 768])\n",
+      "model.layers.3.block_sparse_moe.experts.0.w3.weight torch.Size([768, 288])\n",
+      "model.layers.3.block_sparse_moe.experts.1.w1.weight torch.Size([768, 288])\n",
+      "model.layers.3.block_sparse_moe.experts.1.w2.weight torch.Size([288, 768])\n",
+      "model.layers.3.block_sparse_moe.experts.1.w3.weight torch.Size([768, 288])\n",
+      "model.layers.3.block_sparse_moe.experts.2.w1.weight torch.Size([768, 288])\n",
+      "model.layers.3.block_sparse_moe.experts.2.w2.weight torch.Size([288, 768])\n",
+      "model.layers.3.block_sparse_moe.experts.2.w3.weight torch.Size([768, 288])\n",
+      "model.layers.3.block_sparse_moe.experts.3.w1.weight torch.Size([768, 288])\n",
+      "model.layers.3.block_sparse_moe.experts.3.w2.weight torch.Size([288, 768])\n",
+      "model.layers.3.block_sparse_moe.experts.3.w3.weight torch.Size([768, 288])\n",
+      "model.layers.4.input_layernorm.weight torch.Size([288])\n",
+      "model.layers.4.post_attention_layernorm.weight torch.Size([288])\n",
+      "model.layers.4.self_attn.k_proj.weight torch.Size([288, 288])\n",
+      "model.layers.4.self_attn.o_proj.weight torch.Size([288, 288])\n",
+      "model.layers.4.self_attn.q_proj.weight torch.Size([288, 288])\n",
+      "model.layers.4.self_attn.v_proj.weight torch.Size([288, 288])\n",
+      "model.layers.4.block_sparse_moe.gate.weight torch.Size([4, 288])\n",
+      "model.layers.4.block_sparse_moe.experts.0.w1.weight torch.Size([768, 288])\n",
+      "model.layers.4.block_sparse_moe.experts.0.w2.weight torch.Size([288, 768])\n",
+      "model.layers.4.block_sparse_moe.experts.0.w3.weight torch.Size([768, 288])\n",
+      "model.layers.4.block_sparse_moe.experts.1.w1.weight torch.Size([768, 288])\n",
+      "model.layers.4.block_sparse_moe.experts.1.w2.weight torch.Size([288, 768])\n",
+      "model.layers.4.block_sparse_moe.experts.1.w3.weight torch.Size([768, 288])\n",
+      "model.layers.4.block_sparse_moe.experts.2.w1.weight torch.Size([768, 288])\n",
+      "model.layers.4.block_sparse_moe.experts.2.w2.weight torch.Size([288, 768])\n",
+      "model.layers.4.block_sparse_moe.experts.2.w3.weight torch.Size([768, 288])\n",
+      "model.layers.4.block_sparse_moe.experts.3.w1.weight torch.Size([768, 288])\n",
+      "model.layers.4.block_sparse_moe.experts.3.w2.weight torch.Size([288, 768])\n",
+      "model.layers.4.block_sparse_moe.experts.3.w3.weight torch.Size([768, 288])\n",
+      "model.layers.5.input_layernorm.weight torch.Size([288])\n",
+      "model.layers.5.post_attention_layernorm.weight torch.Size([288])\n",
+      "model.layers.5.self_attn.k_proj.weight torch.Size([288, 288])\n",
+      "model.layers.5.self_attn.o_proj.weight torch.Size([288, 288])\n",
+      "model.layers.5.self_attn.q_proj.weight torch.Size([288, 288])\n",
+      "model.layers.5.self_attn.v_proj.weight torch.Size([288, 288])\n",
+      "model.layers.5.block_sparse_moe.gate.weight torch.Size([4, 288])\n",
+      "model.layers.5.block_sparse_moe.experts.0.w1.weight torch.Size([768, 288])\n",
+      "model.layers.5.block_sparse_moe.experts.0.w2.weight torch.Size([288, 768])\n",
+      "model.layers.5.block_sparse_moe.experts.0.w3.weight torch.Size([768, 288])\n",
+      "model.layers.5.block_sparse_moe.experts.1.w1.weight torch.Size([768, 288])\n",
+      "model.layers.5.block_sparse_moe.experts.1.w2.weight torch.Size([288, 768])\n",
+      "model.layers.5.block_sparse_moe.experts.1.w3.weight torch.Size([768, 288])\n",
+      "model.layers.5.block_sparse_moe.experts.2.w1.weight torch.Size([768, 288])\n",
+      "model.layers.5.block_sparse_moe.experts.2.w2.weight torch.Size([288, 768])\n",
+      "model.layers.5.block_sparse_moe.experts.2.w3.weight torch.Size([768, 288])\n",
+      "model.layers.5.block_sparse_moe.experts.3.w1.weight torch.Size([768, 288])\n",
+      "model.layers.5.block_sparse_moe.experts.3.w2.weight torch.Size([288, 768])\n",
+      "model.layers.5.block_sparse_moe.experts.3.w3.weight torch.Size([768, 288])\n"
+     ]
+    }
+   ],
+   "source": [
+    "N_EXPERTS = 4\n",
+    "N_LAYERS = 6\n",
+    "N_FF = 768\n",
+    "N_EMBD = 288\n",
+    "\n",
+    "moe_model = dict()\n",
+    "def copy_tensor(name, new_name = None):\n",
+    "    new_name = name if new_name is None else new_name\n",
+    "    moe_model[new_name] = torch.clone(model[name])\n",
+    "\n",
+    "copy_tensor('model.embed_tokens.weight')\n",
+    "copy_tensor('model.norm.weight')\n",
+    "copy_tensor('model.embed_tokens.weight', 'lm_head.weight')\n",
+    "\n",
+    "torch.manual_seed(0)\n",
+    "for il in range(N_LAYERS):\n",
+    "    copy_tensor(f'model.layers.{il}.input_layernorm.weight')\n",
+    "    copy_tensor(f'model.layers.{il}.post_attention_layernorm.weight')\n",
+    "    copy_tensor(f'model.layers.{il}.self_attn.k_proj.weight')\n",
+    "    copy_tensor(f'model.layers.{il}.self_attn.o_proj.weight')\n",
+    "    copy_tensor(f'model.layers.{il}.self_attn.q_proj.weight')\n",
+    "    copy_tensor(f'model.layers.{il}.self_attn.v_proj.weight')\n",
+    "    moe_model[f'model.layers.{il}.block_sparse_moe.gate.weight'] = torch.rand(N_EXPERTS, N_EMBD)\n",
+    "    for ex in range(N_EXPERTS):\n",
+    "        copy_tensor(f'model.layers.{il}.mlp.gate_proj.weight', f'model.layers.{il}.block_sparse_moe.experts.{ex}.w1.weight')\n",
+    "        copy_tensor(f'model.layers.{il}.mlp.down_proj.weight', f'model.layers.{il}.block_sparse_moe.experts.{ex}.w2.weight')\n",
+    "        copy_tensor(f'model.layers.{il}.mlp.up_proj.weight',   f'model.layers.{il}.block_sparse_moe.experts.{ex}.w3.weight')\n",
+    "\n",
+    "for name, tensor in moe_model.items():\n",
+    "    print(name, tensor.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "19817bec-448f-4619-8772-2b3c77f0a1c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from safetensors.torch import save_file\n",
+    "\n",
+    "save_file(moe_model, \"model.safetensors\", metadata={\"format\": \"pt\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e5bfd2cb-f53b-4285-bf5d-52a6c23779e0",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "e29a4b7e-e390-4d69-857c-02fc6065e33d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "\n",
+    "index_json = {\n",
+    "    \"metadata\": {\n",
+    "        \"total_size\": os.path.getsize(\"model.safetensors\"),\n",
+    "        \"format\": \"safetensors\"\n",
+    "    },\n",
+    "    \"weight_map\": {}\n",
+    "}\n",
+    "\n",
+    "for name, _ in moe_model.items():\n",
+    "    index_json[\"weight_map\"][name] = \"model.safetensors\"\n",
+    "\n",
+    "#with open(\"model.safetensors.index.json\", 'w') as json_file:\n",
+    "#    json.dump(index_json, json_file, indent=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c7e0736c-0139-4808-8943-c9eba5dcfc76",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.36.0.dev0"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dbfa0289f68a8dd721d10eb12d8bd82e098455682027f6f9986ba548913f9082
+size 72744704

model_original.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b9e8d4614e24c89e99502000294af0aab73e9266029357377578e0a504b7f8d9
+size 30389560

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "clean_up_tokenization_spaces": false,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "model_max_length": 2048,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "use_default_system_prompt": true
+}