Maykeye committed
Commit c9fc3d0
1 Parent(s): 9d689e0

Initial commit (without weights)

Files changed (3)
  1. README.md +31 -0
  2. mambabit.py +127 -0
  3. trainer.ipynb +196 -0
README.md CHANGED
@@ -1,3 +1,34 @@
  ---
  license: apache-2.0
  ---
+
+ MambaBit: a cursed bit-level model with vocab size = 2.
+
+ * 4 layers, vocab size = 2, embedding size = 4096; one float32 vector per bit.
+
+ * Training was done on the first 8030848 bits of Tiny Shakespeare, in 10 hours on a laptop with 16GB VRAM, using batches of 9 sequences of 128*8 = 1024 bits (128 characters) each. Training code is included in trainer.ipynb.
+
+ * To run the model, run `python mambabit.py "As sun raised over"`.
+ Expected output:
+ ```
+ As sun raised over me.
+
+ LEONTES:
+ Now means means me not so much as my father,
+ In the good many lord, and my father come.
+
+ KING RICHARD III:
+ What is my father come and my father,
+ In the good lord, and my father come and before his father.
+
+ GLOUCESTER:
+ Now the goes of men, a
+ ```
+
+ * Bytes are encoded with the most significant bit fed first. E.g. '7' = [0, 0, 1, 1, 0, 1, 1, 1], so the MSB 0 is fed first
+ rather than last, as it would be with the LSB-first order [1, 1, 1, 0, 1, 1, 0, 0]. The intuition is that the leading bits of a byte change less often than the trailing ones, so the model can decide "I think I will produce a digit", then "I think I will produce 7", instead of "so I spat something out; should it be a number? a letter? dunno". A minimal sketch of this encoding is shown below.
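+
+ In plain Python (illustrative only; the real lookup-table version is `string_to_bits` in mambabit.py):
+
+ ```
+ def byte_to_bits_msb_first(byte: int) -> list:
+     # Walk the bit positions from 7 (MSB) down to 0 (LSB).
+     return [(byte >> i) & 1 for i in range(7, -1, -1)]
+
+ assert byte_to_bits_msb_first(ord('7')) == [0, 0, 1, 1, 0, 1, 1, 1]
+ ```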
+
+ * I originally tried BF16, but the model went to NaN (with the default, large LR), or the gradients were so small that the weights didn't change (with a smaller LR). I switched back to F32; however, some layers still initialize their weights scaled down by a factor of 0.001, as I hoped that would stop the model from going to NaN (a sketch of this pattern follows).
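+
+ A minimal sketch of that scaled-down initialization, the same pattern `Encoder` and `Decoder` in mambabit.py use:
+
+ ```
+ import torch.nn as nn
+
+ emb = nn.Embedding(2, 4096)   # n_vocab=2, dim_model=4096
+ emb.weight.data *= 0.001      # shrink the default init to keep early logits tame
+ ```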
mambabit.py ADDED
@@ -0,0 +1,127 @@
+ import torch
+ import torch.nn as nn
+ from torch import Tensor
+ from mamba_ssm.modules.mamba_simple import Mamba
+ from mamba_ssm.utils.generation import InferenceParams
+ from tqdm.auto import tqdm
+ import sys
+
+ dim_model = 4096
+ n_vocab = 2
+ n_layers = 4
+
+
+ @torch.no_grad()
+ def string_to_bits(text: str, _cache=[]) -> Tensor:
+     # Build (and memoize in the mutable default argument) a 256x8 lookup
+     # table mapping every byte value to its bits, most significant bit first.
+     if not _cache:
+         all_values = torch.arange(0, 256)
+         bits = [((all_values & (1 << i)) != 0).int() for i in range(7, -1, -1)]
+         bits_tensor = torch.stack(bits).mT
+         _cache.append(bits_tensor)
+     else:
+         bits_tensor = _cache[0]
+     binary = text.encode()
+     # bytearray makes the buffer writable, which keeps torch.frombuffer quiet.
+     raw = torch.frombuffer(bytearray(binary), dtype=torch.uint8).int()
+     return bits_tensor[raw].long().ravel()
+
+
+ @torch.no_grad()
+ def bits_to_string(bits: Tensor):
+     if bits.dim() == 2:
+         return [bits_to_string(t) for t in bits]
+     assert bits.dim() == 1
+     assert len(bits) % 8 == 0
+     # Weights 128, 64, ..., 1 reassemble each group of 8 bits into a byte.
+     factors = torch.tensor([2**i for i in range(7, -1, -1)]).to(device=bits.device)
+     as_bytes = bits.view(-1, 8)
+     as_bytes = (as_bytes * factors).sum(-1)
+     return ''.join([chr(x) for x in as_bytes])
+
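+ # Quick round-trip sanity check for the two helpers above (illustrative,
+ # not part of the original training/inference path):
+ #   string_to_bits("7")                  -> tensor([0, 0, 1, 1, 0, 1, 1, 1])
+ #   bits_to_string(string_to_bits("hi")) -> "hi"
+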
+ class Encoder(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.emb = nn.Embedding(n_vocab, dim_model)
+         # Scale the init down x0.001 to keep early activations small (see README).
+         self.emb.weight.data *= 0.001
+
+     def forward(self, x):
+         return self.emb(x)
+
+
+ class Decoder(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.norm = nn.LayerNorm(dim_model)
+         self.decoder = nn.Linear(dim_model, n_vocab, bias=False)
+         self.decoder.weight.data *= 0.001
+
+     def forward(self, x):
+         x = self.norm(x)
+         x = self.decoder(x)
+         return x
+
+
+ class MambaBit(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.enc = Encoder()
+         self.layers = nn.ModuleList([Mamba(dim_model) for _ in range(n_layers)])
+         self.dec = Decoder()
+
+     def forward(self, x):
+         x = self.enc(x)
+         for layer in self.layers:
+             x = layer(x)
+         x = self.dec(x)
+         return x
+
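+ # Scale note (illustrative): the embedding and the decoder each hold only
+ # n_vocab * dim_model = 2 * 4096 = 8192 weights; virtually all of the
+ # model's parameters live inside the n_layers=4 Mamba blocks.
+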
+ class MambaBitWithInference(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.enc = Encoder()
+         # layer_idx lets each block find its own cached state in InferenceParams.
+         self.layers = nn.ModuleList([Mamba(dim_model, layer_idx=i) for i in range(n_layers)])
+         self.dec = Decoder()
+
+     def forward(self, x, inference_params=None):
+         x = self.enc(x)
+         for layer in self.layers:
+             x = layer(x, inference_params=inference_params)
+         x = self.dec(x)
+         return x
+
+
+ # Test using the O(N^2) cacheless, stateless algorithm: re-run the whole
+ # sequence through the model for every generated bit.
+ @torch.no_grad()
+ def test_n2(m: MambaBit, prompt: str, chars=10):
+     x = string_to_bits(prompt).cuda()[None]
+     process = chars * 8
+     for _ in tqdm(range(process)):
+         y = m(x)
+         new = y[:, -1:].argmax(-1)
+         x = torch.cat((x, new), 1)
+     return bits_to_string(x)
+
+ # Test using O(N) generation: reuse the recurrent state between steps so
+ # each new bit costs a single forward step instead of a full re-scan.
+ @torch.no_grad()
+ def test_n(m: MambaBitWithInference, prompt: str, chars=10):
+     x = string_to_bits(prompt).cuda()[None]
+     process = chars * 8
+
+     inference_params = InferenceParams(
+         max_seqlen=x.numel() + process,
+         max_batch_size=1)
+
+     # Prefill: run the whole prompt once to populate the per-layer caches.
+     y = m(x, inference_params=inference_params)
+     new = y[:, -1:].argmax(-1)
+     for i in tqdm(range(process)):
+         x = torch.cat((x, new), 1)
+         # Any positive offset switches the Mamba blocks to single-step mode;
+         # keep it equal to the number of bits already processed.
+         inference_params.seqlen_offset = x.shape[1] - 1
+         y = m(new, inference_params=inference_params)
+         new = y[:, -1:].argmax(-1)
+     return bits_to_string(x)
+
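+ # Note on InferenceParams (my reading of mamba_ssm's code, since it is not
+ # well documented): it carries a per-layer state cache keyed by layer_idx,
+ # and once seqlen_offset > 0 each Mamba block takes its single-token step
+ # path, updating the cached conv/SSM state instead of re-scanning the
+ # whole sequence.
+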
+ def run():
+     mamba_bit = MambaBitWithInference().cuda()
+     mamba_bit.load_state_dict(torch.load("mamba_bit.bin"))
+
+     prompt = "FIRST CITIZE" if len(sys.argv) != 2 else sys.argv[1]
+     # test_n2 is O(N^2); test_n is O(N) but leans on the sparsely documented
+     # inference_params machinery.
+     s = test_n(mamba_bit, prompt, chars=256)[0]
+     print(s)
+
+
+ if __name__ == "__main__":
+     run()
trainer.ipynb ADDED
@@ -0,0 +1,196 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import torch\n",
+     "import torch.nn as nn\n",
+     "import torch.nn.functional as F\n",
+     "from torch import Tensor\n",
+     "from pathlib import Path\n",
+     "import random\n",
+     "from tqdm.auto import tqdm\n",
+     "from mamba_ssm.modules.mamba_simple import Mamba\n",
+     "\n",
+     "def model_numel(m: nn.Module):\n",
+     "    return sum(p.numel() for p in m.parameters())"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "raw_txt = Path(\"../shake.txt\").read_text()\n",
+     "total_len = len(raw_txt)\n",
+     "aux_len = int(total_len * 0.05)\n",
+     "\n",
+     "head_txt, test_txt = raw_txt[:-aux_len], raw_txt[-aux_len:]\n",
+     "train_txt, valid_txt = head_txt[:-aux_len], head_txt[-aux_len:]"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "len(train_txt)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from mambabit import string_to_bits, bits_to_string\n",
+     "\n",
+     "train_ds = string_to_bits(train_txt)\n",
+     "valid_ds = string_to_bits(valid_txt)\n",
+     "test_ds = string_to_bits(test_txt)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def random_batches(split: Tensor, n_batch: int, bs: int):\n",
+     "    assert bs % 8 == 0, \"have mercy\"\n",
+     "    # Sample byte-aligned windows of bs bits each.\n",
+     "    max_allowed_pos = len(split) // 8 - bs // 8\n",
+     "\n",
+     "    values = []\n",
+     "    for i in range(n_batch):\n",
+     "        pos = random.randint(0, max_allowed_pos)\n",
+     "        values.append(split[pos*8 : pos*8 + bs])\n",
+     "    return torch.stack(values).cuda()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from mambabit import dim_model, n_vocab, n_layers, MambaBit"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "mamba_bit = MambaBit().cuda()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# Set to False for a from-scratch run (the repo ships without weights).\n",
+     "if True:\n",
+     "    mamba_bit.load_state_dict(torch.load(\"mamba_bit.bin\"))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def train(m: nn.Module,\n",
+     "          n_epoch: int = 100,\n",
+     "          n_batch: int = 4,\n",
+     "          bs: int = 256):\n",
+     "    opt = torch.optim.AdamW(m.parameters(), lr=0.0001, fused=True)\n",
+     "\n",
+     "    for e in (bar := tqdm(range(n_epoch))):\n",
+     "        b = random_batches(train_ds, n_batch, bs)\n",
+     "\n",
+     "        # Next-bit prediction: logits for all but the last position,\n",
+     "        # targets shifted one bit to the left.\n",
+     "        y_pred = m(b)\n",
+     "        y_pred = y_pred[:, :-1].reshape(-1, n_vocab)\n",
+     "        y_true = b[:, 1:].ravel()\n",
+     "\n",
+     "        loss = F.cross_entropy(y_pred, y_true)\n",
+     "        loss.backward()\n",
+     "        opt.step()\n",
+     "        opt.zero_grad()\n",
+     "\n",
+     "        l = loss.item()\n",
+     "        bar.set_description(f\"L:{l:.10f}\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "if True:\n",
+     "    train(mamba_bit, 5000, 9, 8*128)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "torch.save(mamba_bit.state_dict(), \"mamba_bit.bin\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# TEST\n",
+     "@torch.no_grad()\n",
+     "def test(prompt: str, chars=10):\n",
+     "    x = string_to_bits(prompt).cuda()[None]\n",
+     "    process = chars * 8\n",
+     "    for _ in tqdm(range(process)):\n",
+     "        y = mamba_bit(x)\n",
+     "        new = y[:, -1:].argmax(-1)\n",
+     "        x = torch.cat((x, new), 1)\n",
+     "    return bits_to_string(x)\n",
+     "\n",
+     "print(test(\"FIRST CIT\", chars=10))"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "sd",
+    "language": "python",
+    "name": "sd"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.8"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }