OpenDungeon
/

bloom-7b1-8bit

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "29d14fe0",
+   "metadata": {
+    "cellId": "hwmcjwsucnwczi4u66ftg",
+    "id": "e13eff4e-c134-4dac-9523-07b297164250"
+   },
+   "source": [
+    "# Example of Quantizing the 7.1-billion-parameter Bloom model to 8-bit weights\n",
+    "\n",
+    "Heavily inspired by [Hivemind's work](https://nbviewer.org/urls/huggingface.co/hivemind/gpt-j-6B-8bit/raw/main/convert-gpt-j.ipynb) and [joaoalvarenga's work](https://huggingface.co/joaoalvarenga/bloom-8bit)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "39f137ae",
+   "metadata": {
+    "cellId": "wg56t50s3la38havqevkme",
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "699e94eb-3ce1-4788-999b-fb6d593ba7e9",
+    "outputId": "764a6719-66d0-4ef7-df2d-4cfda0914f65"
+   },
+   "outputs": [],
+   "source": [
+    "#%pip install transformers==4.20.1\n",
+    "#%pip install bitsandbytes\n",
+    "#%pip install datasets\n",
+    "#%pip install accelerate"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "53e4dd05",
+   "metadata": {
+    "cellId": "aklenvay105v0md7yy679m",
+    "id": "0afea72c-691d-4719-a84a-663f1891af6e"
+   },
+   "source": [
+    "### Load and convert original Bloom structure to 8-bit\n",
+    "\n",
+    "You can load an already compressed 8-bit version of Bloom from [OpenDungeon/bloom-7b1-8bit](https://huggingface.co/OpenDungeon/bloom-7b1-8bit/tree/main) with a small amount of monkey patching. This notebook, however, focuses on compressing Bloom rather than on using the compressed model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e1ca3df9",
+   "metadata": {
+    "cellId": "ktgxcupgtcf8hhh2k1r2ij",
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "xcdQSnYIk12Z",
+    "outputId": "8d0fff65-4d34-41bd-f750-278a35ac9533"
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/dm/.local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (3.0.4) doesn't match a supported version!\n",
+      "  warnings.warn(\"urllib3 ({}) or chardet ({}) doesn't match a supported \"\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "===================================BUG REPORT===================================\n",
+      "Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
+      "For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link\n",
+      "================================================================================\n",
+      "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching /usr/local/cuda/lib64...\n",
+      "WARNING: No libcudart.so found! Install CUDA or the cudatoolkit package (anaconda)!\n",
+      "CUDA SETUP: Loading binary /home/dm/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/dm/.local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/paths.py:27: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('8bitexperiments/f746d450-b748-4d1f-b3e3-9e4fd3f72d6e')}\n",
+      "  warn(\n",
+      "/home/dm/.local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/paths.py:27: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('module'), PosixPath('//matplotlib_inline.backend_inline')}\n",
+      "  warn(\n",
+      "/home/dm/.local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/paths.py:27: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/cuda/lib64')}\n",
+      "  warn(\n",
+      "/home/dm/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:48: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.\n",
+      "  warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "import torch.nn.functional as F\n",
+    "import transformers\n",
+    "from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise\n",
+    "\n",
+    "model_name = \"bigscience/bloom-7b1\"\n",
+    "gpt = transformers.BloomForCausalLM.from_pretrained(model_name, cache_dir=\"mycache\")\n",
+    "tokenizer = transformers.AutoTokenizer.from_pretrained(model_name,  cache_dir=\"mycache\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "b37255b0",
+   "metadata": {
+    "cellId": "wmew4wc0e3pztbva18lggg",
+    "id": "YjLHVyIOkdCH"
+   },
+   "outputs": [],
+   "source": [
+    "def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20):\n",
+    "    assert chunk_size % 4096 == 0\n",
+    "    code = None\n",
+    "    chunks = []\n",
+    "    absmaxes = []\n",
+    "    flat_tensor = matrix.view(-1)\n",
+    "    for i in range((matrix.numel() - 1) // chunk_size + 1):\n",
+    "        input_chunk = flat_tensor[i * chunk_size: (i + 1) * chunk_size].clone()\n",
+    "        quantized_chunk, (absmax_chunk, code) = quantize_blockwise(input_chunk, code=code)\n",
+    "        chunks.append(quantized_chunk)\n",
+    "        absmaxes.append(absmax_chunk)\n",
+    "        \n",
+    "    matrix_i8 = torch.cat(chunks).reshape_as(matrix)\n",
+    "    absmax = torch.cat(absmaxes)\n",
+    "    return matrix_i8, (absmax, code)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5c03f13b",
+   "metadata": {
+    "cellId": "zwcfu5ypstmsusllfldemc",
+    "id": "StJJ6oickpZs"
+   },
+   "outputs": [],
+   "source": [
+    "from typing import Tuple\n",
+    "from torch.cuda.amp import custom_fwd, custom_bwd\n",
+    "\n",
+    "\n",
+    "class DequantizeAndLinear(torch.autograd.Function):\n",
+    "    @staticmethod\n",
+    "    @custom_fwd\n",
+    "    def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,\n",
+    "                absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):\n",
+    "\n",
+    "        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)\n",
+    "        ctx.save_for_backward(input, weights_quantized, absmax, code)\n",
+    "        ctx._has_bias = bias is not None\n",
+    "        return F.linear(input, weights_deq, bias)\n",
+    "\n",
+    "    @staticmethod\n",
+    "    @custom_bwd\n",
+    "    def backward(ctx, grad_output: torch.Tensor):\n",
+    "        assert not ctx.needs_input_grad[1] and not ctx.needs_input_grad[2] and not ctx.needs_input_grad[3]\n",
+    "        input, weights_quantized, absmax, code = ctx.saved_tensors\n",
+    "        # grad_output: [*batch, out_features]\n",
+    "        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)\n",
+    "        grad_input = grad_output @ weights_deq\n",
+    "        grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None\n",
+    "        return grad_input, None, None, None, grad_bias\n",
+    "\n",
+    "\n",
+    "class BNBLinearWithAdapter(nn.Module):\n",
+    "    def __init__(self, weight, absmax, code,  bias=None, adapter_dim=0):\n",
+    "        assert isinstance(bias, nn.Parameter) or bias is None\n",
+    "        super().__init__()\n",
+    "        self.out_features, self.in_features = weight.shape\n",
+    "        self.register_buffer(\"weight\", weight.requires_grad_(False))\n",
+    "        self.register_buffer(\"absmax\", absmax.requires_grad_(False))\n",
+    "        self.register_buffer(\"code\", code.requires_grad_(False))\n",
+    "        self.bias = bias\n",
+    "\n",
+    "        if adapter_dim > 0:\n",
+    "            self.adapter = nn.Sequential(\n",
+    "                nn.Linear(self.in_features, adapter_dim, bias=False),\n",
+    "                nn.Linear(adapter_dim, self.out_features, bias=False),\n",
+    "            )\n",
+    "\n",
+    "            nn.init.zeros_(self.adapter[1].weight)\n",
+    "        else:\n",
+    "            self.adapter = None\n",
+    "\n",
+    "    def forward(self, input):\n",
+    "        out = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)\n",
+    "\n",
+    "        if self.adapter:\n",
+    "            return self.adapter(input) + out\n",
+    "\n",
+    "        return out\n",
+    "\n",
+    "\n",
+    "    @classmethod\n",
+    "    def from_linear(cls, linear: nn.Linear, **kwargs) -> \"FrozenBNBLinear\":\n",
+    "        weights_int8, state = quantize_blockise_lowmemory(linear.weight)\n",
+    "        return cls(weights_int8, *state, linear.bias, **kwargs)\n",
+    "\n",
+    "    def __repr__(self):\n",
+    "        return f\"{self.__class__.__name__}({self.in_features}, {self.out_features})\"\n",
+    "\n",
+    "\n",
+    "class BNBEmbeddingWithAdapter(nn.Module):\n",
+    "    def __init__(self, weight, absmax, code, adapter_dim=0):\n",
+    "        super().__init__()\n",
+    "        self.num_embeddings, self.embedding_dim = weight.shape\n",
+    "        self.register_buffer(\"weight\", weight.requires_grad_(False))\n",
+    "        self.register_buffer(\"absmax\", absmax.requires_grad_(False))\n",
+    "        self.register_buffer(\"code\", code.requires_grad_(False))\n",
+    "\n",
+    "        if adapter_dim > 0:\n",
+    "            self.adapter = nn.Sequential(\n",
+    "                nn.Embedding(self.num_embeddings, adapter_dim),\n",
+    "                nn.Linear(adapter_dim, self.embedding_dim, bias=False),\n",
+    "            )\n",
+    "\n",
+    "            nn.init.zeros_(self.adapter[1].weight)\n",
+    "        else:\n",
+    "            self.adapter = None\n",
+    "\n",
+    "    def forward(self, input, **kwargs):\n",
+    "        with torch.no_grad():\n",
+    "            # note: both quantized weights and input indices are *not* differentiable\n",
+    "            weight_deq = dequantize_blockwise(self.weight, absmax=self.absmax, code=self.code)\n",
+    "            out = F.embedding(input, weight_deq, **kwargs)\n",
+    "        if self.adapter:\n",
+    "            return out + self.adapter(input, **kwargs)\n",
+    "\n",
+    "        return out\n",
+    "\n",
+    "    @classmethod\n",
+    "    def from_embedding(cls, embedding: nn.Embedding, **kwargs) -> \"FrozenBNBEmbedding\":\n",
+    "        weights_int8, state = quantize_blockise_lowmemory(embedding.weight)\n",
+    "        return cls(weights_int8, *state, **kwargs)\n",
+    "\n",
+    "    def __repr__(self):\n",
+    "        return f\"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "92a58957",
+   "metadata": {
+    "cellId": "due8kcyko4fv3vxzrbin3",
+    "id": "6LafYNhlktnt"
+   },
+   "outputs": [],
+   "source": [
+    "def bnbfy_(model, adapter_dim: int = 0):    \n",
+    "    for module in list(model.transformer.h.modules()):\n",
+    "        for name, child in module.named_children():\n",
+    "            if isinstance(child, nn.Linear):\n",
+    "                print(name, child)\n",
+    "                setattr(module, name, BNBLinearWithAdapter.from_linear(child, adapter_dim=adapter_dim))\n",
+    "\n",
+    "            elif isinstance(child, nn.Embedding):\n",
+    "                print(name, child)\n",
+    "                setattr(module, name, BNBEmbeddingWithAdapter.from_embedding(child, adapter_dim=adapter_dim))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "f2d513c6-cd72-411d-9a25-9a21e5c2b87c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "model size: 26966.156MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "#!g1.1\n",
+    "param_size = 0\n",
+    "for param in gpt.parameters():\n",
+    "    param_size += param.nelement() * param.element_size()\n",
+    "buffer_size = 0\n",
+    "for buffer in gpt.buffers():\n",
+    "    buffer_size += buffer.nelement() * buffer.element_size()\n",
+    "\n",
+    "size_all_mb = (param_size + buffer_size) / 1024**2\n",
+    "print('model size: {:.3f}MB'.format(size_all_mb))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "ab52cd5c",
+   "metadata": {
+    "cellId": "5269rte0cil8omgnvcif"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "BloomForCausalLM(\n",
+       "  (transformer): BloomModel(\n",
+       "    (word_embeddings): Embedding(250880, 4096)\n",
+       "    (word_embeddings_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "    (h): ModuleList(\n",
+       "      (0): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (1): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (2): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (3): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (4): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (5): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (6): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (7): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (8): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (9): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (10): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (11): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (12): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (13): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (14): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (15): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (16): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (17): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (18): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (19): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (20): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (21): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (22): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (23): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (24): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (25): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (26): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (27): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (28): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (29): BloomBlock(\n",
+       "        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (self_attention): BloomAttention(\n",
+       "          (query_key_value): Linear(in_features=4096, out_features=12288, bias=True)\n",
+       "          (dense): Linear(in_features=4096, out_features=4096, bias=True)\n",
+       "          (attention_dropout): Dropout(p=0.0, inplace=False)\n",
+       "        )\n",
+       "        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "        (mlp): BloomMLP(\n",
+       "          (dense_h_to_4h): Linear(in_features=4096, out_features=16384, bias=True)\n",
+       "          (gelu_impl): BloomGelu()\n",
+       "          (dense_4h_to_h): Linear(in_features=16384, out_features=4096, bias=True)\n",
+       "        )\n",
+       "      )\n",
+       "    )\n",
+       "    (ln_f): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n",
+       "  )\n",
+       "  (lm_head): Linear(in_features=4096, out_features=250880, bias=False)\n",
+       ")"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "#!g1.1\n",
+    "gpt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "9280b510",
+   "metadata": {
+    "cellId": "a7nstbdt9vo9qikpzvo48c",
+    "id": "jV3pGEGalDwz"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "#!g1.1\n",
+    "bnbfy_(gpt, adapter_dim=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e35305f2",
+   "metadata": {
+    "cellId": "q5jafg9w9x0hg355icd4vo"
+   },
+   "outputs": [],
+   "source": [
+    "#!g1.1\n",
+    "param_size = 0\n",
+    "for param in gpt.parameters():\n",
+    "    param_size += param.nelement() * param.element_size()\n",
+    "buffer_size = 0\n",
+    "for buffer in gpt.buffers():\n",
+    "    buffer_size += buffer.nelement() * buffer.element_size()\n",
+    "\n",
+    "size_all_mb = (param_size + buffer_size) / 1024**2\n",
+    "print('model size: {:.3f}MB'.format(size_all_mb))\n",
+    "gpt.save_pretrained('bloom-7b1-8bit')"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "notebookId": "8f3ce20e-06a1-44f2-9373-2b6424b859a3",
+  "notebookPath": "bloom8bit.ipynb"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

load_bloom.ipynb ADDED Viewed

	@@ -0,0 +1,616 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "e28407ed",
+   "metadata": {
+    "cellId": "b3rvfwilbqlfhl99ot1adn"
+   },
+   "source": [
+    "# Example of fine-tuning the 7.1-billion-parameter Bloom with 8-bit weights\n",
+    "\n",
+    "This notebook shows how to fine-tune Bloom with Low-Rank Adapters (LoRA). It is heavily inspired by [Hivemind's work](https://colab.research.google.com/drive/1ft6wQU0BhqG5PRlwgaZJv2VukKKjU4Es)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5f5a6af2",
+   "metadata": {
+    "cellId": "q43y9u4kj5g2qn01pdohou"
+   },
+   "source": [
+    "### Load and convert original Bloom structure to 8-bit LoRA\n",
+    "\n",
+    "You can load an already-compressed 8-bit version of Bloom from [joaoalvarenga/bloom-8bit](https://huggingface.co/joaoalvarenga/bloom-8bit), but first we need to adapt the original model structure. Some of the following code is adapted from [Hivemind's GPT-J 8-bit fine-tuning notebook](https://colab.research.google.com/drive/1ft6wQU0BhqG5PRlwgaZJv2VukKKjU4Es)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "815f9f31",
+   "metadata": {
+    "cellId": "qwv8mzg52blrc6ghm3x9s"
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/dm/.local/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (3.0.4) doesn't match a supported version!\n",
+      "  warnings.warn(\"urllib3 ({}) or chardet ({}) doesn't match a supported \"\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "===================================BUG REPORT===================================\n",
+      "Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
+      "For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link\n",
+      "================================================================================\n",
+      "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching /usr/local/cuda/lib64...\n",
+      "WARNING: No libcudart.so found! Install CUDA or the cudatoolkit package (anaconda)!\n",
+      "CUDA SETUP: Loading binary /home/dm/.local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/dm/.local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/paths.py:27: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('8bitexperiments/71572fb0-3a48-4729-a863-ee5aa6e60c92')}\n",
+      "  warn(\n",
+      "/home/dm/.local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/paths.py:27: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('//matplotlib_inline.backend_inline'), PosixPath('module')}\n",
+      "  warn(\n",
+      "/home/dm/.local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/paths.py:27: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/cuda/lib64')}\n",
+      "  warn(\n",
+      "/home/dm/.local/lib/python3.8/site-packages/bitsandbytes/cextension.py:48: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.\n",
+      "  warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "import transformers\n",
+    "\n",
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "from torch import nn\n",
+    "from torch.cuda.amp import custom_fwd, custom_bwd\n",
+    "\n",
+    "from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise\n",
+    "\n",
+    "from tqdm.auto import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "640b12f7",
+   "metadata": {
+    "cellId": "b5qu2yccmjrp0n86a0k2"
+   },
+   "outputs": [],
+   "source": [
+    "class FrozenBNBLinear(nn.Module):\n",
+    "    def __init__(self, weight, absmax, code, bias=None):\n",
+    "        assert isinstance(bias, nn.Parameter) or bias is None\n",
+    "        super().__init__()\n",
+    "        self.out_features, self.in_features = weight.shape\n",
+    "        self.register_buffer(\"weight\", weight.requires_grad_(False))\n",
+    "        self.register_buffer(\"absmax\", absmax.requires_grad_(False))\n",
+    "        self.register_buffer(\"code\", code.requires_grad_(False))\n",
+    "        self.adapter = None\n",
+    "        self.bias = bias\n",
+    " \n",
+    "    def forward(self, input):\n",
+    "        output = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)\n",
+    "        if self.adapter:\n",
+    "            output += self.adapter(input)\n",
+    "        return output\n",
+    " \n",
+    "    @classmethod\n",
+    "    def from_linear(cls, linear: nn.Linear) -> \"FrozenBNBLinear\":\n",
+    "        weights_int8, state = quantize_blockise_lowmemory(linear.weight)\n",
+    "        return cls(weights_int8, *state, linear.bias)\n",
+    " \n",
+    "    def __repr__(self):\n",
+    "        return f\"{self.__class__.__name__}({self.in_features}, {self.out_features})\"\n",
+    " \n",
+    " \n",
+    "class DequantizeAndLinear(torch.autograd.Function): \n",
+    "    @staticmethod\n",
+    "    @custom_fwd\n",
+    "    def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,\n",
+    "                absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):\n",
+    "        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)\n",
+    "        ctx.save_for_backward(input, weights_quantized, absmax, code)\n",
+    "        ctx._has_bias = bias is not None\n",
+    "        return F.linear(input, weights_deq, bias)\n",
+    " \n",
+    "    @staticmethod\n",
+    "    @custom_bwd\n",
+    "    def backward(ctx, grad_output: torch.Tensor):\n",
+    "        assert not ctx.needs_input_grad[1] and not ctx.needs_input_grad[2] and not ctx.needs_input_grad[3]\n",
+    "        input, weights_quantized, absmax, code = ctx.saved_tensors\n",
+    "        # grad_output: [*batch, out_features]\n",
+    "        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)\n",
+    "        grad_input = grad_output @ weights_deq\n",
+    "        grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None\n",
+    "        return grad_input, None, None, None, grad_bias\n",
+    " \n",
+    " \n",
+    "class FrozenBNBEmbedding(nn.Module):\n",
+    "    def __init__(self, weight, absmax, code):\n",
+    "        super().__init__()\n",
+    "        self.num_embeddings, self.embedding_dim = weight.shape\n",
+    "        self.register_buffer(\"weight\", weight.requires_grad_(False))\n",
+    "        self.register_buffer(\"absmax\", absmax.requires_grad_(False))\n",
+    "        self.register_buffer(\"code\", code.requires_grad_(False))\n",
+    "        self.adapter = None\n",
+    " \n",
+    "    def forward(self, input, **kwargs):\n",
+    "        with torch.no_grad():\n",
+    "            # note: both quantized weights and input indices are *not* differentiable\n",
+    "            weight_deq = dequantize_blockwise(self.weight, absmax=self.absmax, code=self.code)\n",
+    "            output = F.embedding(input, weight_deq, **kwargs)\n",
+    "        if self.adapter:\n",
+    "            output += self.adapter(input)\n",
+    "        return output \n",
+    " \n",
+    "    @classmethod\n",
+    "    def from_embedding(cls, embedding: nn.Embedding) -> \"FrozenBNBEmbedding\":\n",
+    "        weights_int8, state = quantize_blockise_lowmemory(embedding.weight)\n",
+    "        return cls(weights_int8, *state)\n",
+    " \n",
+    "    def __repr__(self):\n",
+    "        return f\"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})\"\n",
+    " \n",
+    " \n",
+    "def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20):\n",
+    "    assert chunk_size % 4096 == 0\n",
+    "    code = None\n",
+    "    chunks = []\n",
+    "    absmaxes = []\n",
+    "    flat_tensor = matrix.view(-1)\n",
+    "    for i in range((matrix.numel() - 1) // chunk_size + 1):\n",
+    "        input_chunk = flat_tensor[i * chunk_size: (i + 1) * chunk_size].clone()\n",
+    "        quantized_chunk, (absmax_chunk, code) = quantize_blockwise(input_chunk, code=code)\n",
+    "        chunks.append(quantized_chunk)\n",
+    "        absmaxes.append(absmax_chunk)\n",
+    " \n",
+    "    matrix_i8 = torch.cat(chunks).reshape_as(matrix)\n",
+    "    absmax = torch.cat(absmaxes)\n",
+    "    return matrix_i8, (absmax, code)\n",
+    "\n",
+    "\n",
+    "def convert_to_int8(model):\n",
+    "    \"\"\"Convert linear and embedding modules to 8-bit with optional adapters\"\"\"\n",
+    "    for module in list(model.modules()):\n",
+    "        for name, child in module.named_children():\n",
+    "            if isinstance(child, nn.Linear):\n",
+    "                print(name, child)\n",
+    "                setattr( \n",
+    "                    module,\n",
+    "                    name,\n",
+    "                    FrozenBNBLinear(\n",
+    "                        weight=torch.zeros(child.out_features, child.in_features, dtype=torch.uint8),\n",
+    "                        absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),\n",
+    "                        code=torch.zeros(256),\n",
+    "                        bias=child.bias,\n",
+    "                    ),\n",
+    "                )\n",
+    "            elif isinstance(child, nn.Embedding):\n",
+    "                setattr(\n",
+    "                    module,\n",
+    "                    name,\n",
+    "                    FrozenBNBEmbedding(\n",
+    "                        weight=torch.zeros(child.num_embeddings, child.embedding_dim, dtype=torch.uint8),\n",
+    "                        absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),\n",
+    "                        code=torch.zeros(256),\n",
+    "                    )\n",
+    "                )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "e0dbb262",
+   "metadata": {
+    "cellId": "j9ds51fcwxxy0blcplb6"
+   },
+   "outputs": [],
+   "source": [
+    "class BloomBlock(transformers.models.bloom.modeling_bloom.BloomBlock):\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__(config)\n",
+    "        convert_to_int8(self.self_attention)\n",
+    "        convert_to_int8(self.mlp)\n",
+    "\n",
+    "\n",
+    "class BloomModel(transformers.models.bloom.modeling_bloom.BloomModel):\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__(config)\n",
+    "        convert_to_int8(self)\n",
+    "        \n",
+    "\n",
+    "class BloomForCausalLM(transformers.models.bloom.modeling_bloom.BloomForCausalLM):\n",
+    "    def __init__(self, config):\n",
+    "        super().__init__(config)\n",
+    "        convert_to_int8(self)\n",
+    "        \n",
+    "transformers.models.bloom.modeling_bloom.BloomBlock = BloomBlock"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a68bbee4",
+   "metadata": {
+    "cellId": "a5he2q7ulm4wkwqno10wsg"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n",
+      "query_key_value Linear(in_features=4096, out_features=12288, bias=True)\n",
+      "dense Linear(in_features=4096, out_features=4096, bias=True)\n",
+      "dense_h_to_4h Linear(in_features=4096, out_features=16384, bias=True)\n",
+      "dense_4h_to_h Linear(in_features=16384, out_features=4096, bias=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "#!g1.1\n",
+    "# NOTE(review): this import rebinds `BloomForCausalLM` to the stock transformers class,\n",
+    "# replacing the subclass of the same name defined in the previous cell; the BloomBlock\n",
+    "# monkey-patch from that cell stays in effect. Confirm this is the intended loader.\n",
+    "from transformers import BloomForCausalLM, AutoModel\n",
+    "\n",
+    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
+    "tokenizer = transformers.AutoTokenizer.from_pretrained(\"bigscience/bloom-7b1\", cache_dir=\"mycache\")\n",
+    "# `.to()` returns the module itself; assigning the result keeps its repr out of the cell output.\n",
+    "model = BloomForCausalLM.from_pretrained('bloom-8bit-v4.pt').to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11bdaf76",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bare expression: the notebook renders the model's repr as this cell's output.\n",
+    "model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5c9feef8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prompt fed to the model. It ends on an open \"Game Master:\" turn for the model to continue.\n",
+    "# (Opened with exactly three quotes: a fourth one would become the first character of the prompt.)\n",
+    "prefix = \"\"\"It is a fantasy role-play game.\n",
+    "\n",
+    "Game Master: You are John, a wizard living in the kingdom of Larion. You have a staff and a spellbook. You finish your long journey and finally arrive at the ruin you've been looking for. You have come here searching for a mystical spellbook of great power called the book of essence. You look around and see the ancient ruins of an elf tower. The ruins have not been touched for decades. You look at the tower, and you can see a set of stone stairs that seem to lead somewhere deep inside the tower.\n",
+    "Player: I walk upstairs\n",
+    "Game Master: You climb up the stairs in the ruined tower. There is a door on the second floor of the tower, the door seems to be made of enchanted wood.\n",
+    "Player: I ask the door if I may to come in\n",
+    "Game Master: The door sighs open and you walk into the room.\n",
+    "Player: I take a look around\n",
+    "Game Master:\"\"\"\n",
+    "\n",
+    "print(end=prefix)\n",
+    "past_key_values = None  # used to keep track of conversation history (None on the first pass)\n",
+    "# The model was moved to `device` in an earlier cell, so its inputs have to live there as well.\n",
+    "input_dict = tokenizer([prefix], return_tensors='pt', padding=False).to(device)\n",
+    "\n",
+    "output = \"\"\n",
+    "\n",
+    "with torch.inference_mode():\n",
+    "    for _ in range(200):\n",
+    "        outputs = model.forward(**input_dict, use_cache=True, past_key_values=past_key_values)\n",
+    "        last_logits = outputs.logits[0, -1]\n",
+    "\n",
+    "        # Boost the 10 highest logits: other logits are now e^10 times less likely to be chosen.\n",
+    "        last_logits[last_logits.topk(k=10).indices] += 10\n",
+    "\n",
+    "        past_key_values = outputs.past_key_values\n",
+    "        token_ix = torch.multinomial(last_logits.softmax(-1), 1).item()\n",
+    "        token_text = tokenizer.decode([token_ix])\n",
+    "        output += token_text\n",
+    "        # Stop, without printing the current token, once the text generated so far names a speaker.\n",
+    "        if 'player' in output or 'Player' in output or 'Master' in output:\n",
+    "            break\n",
+    "        print(end=token_text, flush=True)\n",
+    "\n",
+    "        # Only the newly sampled token is fed next; earlier context is passed via past_key_values.\n",
+    "        input_dict = dict(input_ids=torch.tensor([[token_ix]], device=device))\n",
+    "print()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7eba8c3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prompt fed to the model. It ends on an open \"Game Master:\" turn for the model to continue.\n",
+    "# (Opened with exactly three quotes: a fourth one would become the first character of the prompt.)\n",
+    "prefix = \"\"\"It is a fantasy role-play game.\n",
+    "\n",
+    "Game Master: You are John, a wizard living in the kingdom of Larion. You have a staff and a spellbook. You finish your long journey and finally arrive at the ruin you've been looking for. You have come here searching for a mystical spellbook of great power called the book of essence. You look around and see the ancient ruins of an elf tower. The ruins have not been touched for decades. You look at the tower, and you can see a set of stone stairs that seem to lead somewhere deep inside the tower.\n",
+    "Player: I walk upstairs\n",
+    "Game Master: You climb up the stairs in the ruined tower. There is a door on the second floor of the tower, the door seems to be made of enchanted wood.\n",
+    "Player: I ask the door if I may to come in\n",
+    "Game Master: The door sighs open and you walk into the room.\n",
+    "Player: I take a look around\n",
+    "Game Master:\"\"\"\n",
+    "\n",
+    "print(end=prefix)\n",
+    "past_key_values = None  # used to keep track of conversation history (None on the first pass)\n",
+    "# The model was moved to `device` in an earlier cell, so its inputs have to live there as well.\n",
+    "input_dict = tokenizer([prefix], return_tensors='pt', padding=False).to(device)\n",
+    "\n",
+    "output = \"\"\n",
+    "\n",
+    "with torch.inference_mode():\n",
+    "    for _ in range(200):\n",
+    "        outputs = model.forward(**input_dict, use_cache=True, past_key_values=past_key_values)\n",
+    "        last_logits = outputs.logits[0, -1]\n",
+    "\n",
+    "        # Boost the 10 highest logits: other logits are now e^10 times less likely to be chosen.\n",
+    "        last_logits[last_logits.topk(k=10).indices] += 10\n",
+    "\n",
+    "        past_key_values = outputs.past_key_values\n",
+    "        token_ix = torch.multinomial(last_logits.softmax(-1), 1).item()\n",
+    "        token_text = tokenizer.decode([token_ix])\n",
+    "        output += token_text\n",
+    "        # Stop, without printing the current token, once the text generated so far names a speaker.\n",
+    "        if 'player' in output or 'Player' in output or 'Master' in output:\n",
+    "            break\n",
+    "        print(end=token_text, flush=True)\n",
+    "\n",
+    "        # Only the newly sampled token is fed next; earlier context is passed via past_key_values.\n",
+    "        input_dict = dict(input_ids=torch.tensor([[token_ix]], device=device))\n",
+    "print()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4db07557",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!g1.1\n",
+    "# Tokenize the prompt and move it to the device the model was placed on in an earlier cell;\n",
+    "# leaving the tensors on the CPU breaks generation when the model sits on the GPU.\n",
+    "prompt = tokenizer(\"A cat sat on a mat and\", return_tensors='pt').to(device)\n",
+    "out = model.generate(**prompt, min_length=10, max_length=10, do_sample=True)\n",
+    "tokenizer.decode(out[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5398feef",
+   "metadata": {
+    "cellId": "uero3zs1ebpefelhzioy2t",
+    "execution_id": "243e4f22-9ad3-412c-98cb-6b01253531c9"
+   },
+   "source": [
+    "### Fine-tune and save model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2fdb897",
+   "metadata": {
+    "cellId": "wmrjusxrcomgqirlydiuj"
+   },
+   "outputs": [],
+   "source": [
+    "#!g1.1\n",
+    "def add_adapters(model, adapter_dim=16):\n",
+    "    \"\"\"Give every frozen 8-bit layer of `model` a two-layer `adapter` sub-module.\n",
+    "\n",
+    "    FrozenBNBLinear layers get Linear(in_features, adapter_dim) -> Linear(adapter_dim, out_features).\n",
+    "    FrozenBNBEmbedding layers get Embedding(num_embeddings, adapter_dim) -> Linear(adapter_dim, embedding_dim).\n",
+    "    The second layer has no bias and zero-initialised weights, so a freshly added adapter outputs zeros.\n",
+    "    Modules of any other type are left untouched. The model is modified in place; nothing is returned.\n",
+    "    \"\"\"\n",
+    "    assert adapter_dim > 0\n",
+    "\n",
+    "    for layer in model.modules():\n",
+    "        if isinstance(layer, FrozenBNBLinear):\n",
+    "            down_proj = nn.Linear(layer.in_features, adapter_dim, bias=False)\n",
+    "            up_proj = nn.Linear(adapter_dim, layer.out_features, bias=False)\n",
+    "        elif isinstance(layer, FrozenBNBEmbedding):\n",
+    "            down_proj = nn.Embedding(layer.num_embeddings, adapter_dim)\n",
+    "            up_proj = nn.Linear(adapter_dim, layer.embedding_dim, bias=False)\n",
+    "        else:\n",
+    "            continue\n",
+    "\n",
+    "        layer.adapter = nn.Sequential(down_proj, up_proj)\n",
+    "        nn.init.zeros_(layer.adapter[1].weight)\n",
+    "\n",
+    "add_adapters(model)\n",
+    "# The adapter layers were just created, so the model is moved to `device` again.\n",
+    "model.to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ceac0236",
+   "metadata": {
+    "cellId": "mzznv2rl07bt6x7aybc1h"
+   },
+   "outputs": [],
+   "source": [
+    "#!g1.1\n",
+    "from datasets import load_dataset\n",
+    "from bitsandbytes.optim import Adam8bit\n",
+    "\n",
+    "model.gradient_checkpointing_enable()\n",
+    "\n",
+    "wikisql = load_dataset(\"wikisql\", streaming=True)\n",
+    "optimizer = Adam8bit(model.parameters(), lr=1e-5)\n",
+    "\n",
+    "with torch.cuda.amp.autocast():\n",
+    "    for row in tqdm(wikisql['train']):\n",
+    "\n",
+    "        # One example per step: the question followed directly by its human-readable SQL.\n",
+    "        batch = tokenizer(row['question'] + row['sql']['human_readable'], truncation=True, max_length=128, return_tensors='pt')\n",
+    "        batch = {k: v.cuda() for k, v in batch.items()}\n",
+    "\n",
+    "        # Run the model being fine-tuned, i.e. the one whose parameters were given to the optimizer.\n",
+    "        out = model.forward(**batch)\n",
+    "\n",
+    "        # Next-token objective: the logits at position t are scored against the token at t + 1.\n",
+    "        loss = F.cross_entropy(out.logits[:, :-1, :].flatten(0, -2), batch['input_ids'][:, 1:].flatten(),\n",
+    "                               reduction='mean')\n",
+    "        print(loss)\n",
+    "        loss.backward()\n",
+    "\n",
+    "        optimizer.step()\n",
+    "        optimizer.zero_grad()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7d1b3b65",
+   "metadata": {
+    "cellId": "mirxlhno0w8wrmaaxj4u7"
+   },
+   "outputs": [],
+   "source": [
+    "#!g1.1\n",
+    "# Save the model (after the fine-tuning cell above) under the local path 'bloom-8bit-fine-tuned'.\n",
+    "model.save_pretrained('bloom-8bit-fine-tuned')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  },
+  "notebookId": "433858c6-d0c2-461b-85f3-1153722e7367",
+  "notebookPath": "untitled.ipynb"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}