jkeisling committed on
Commit
fb24f54
·
1 Parent(s): cdbe788

Initial commit: Port over untracked work in flight

Browse files
.gitignore CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  # Byte-compiled / optimized / DLL files
2
  __pycache__/
3
  *.py[cod]
 
1
+ # Large files
2
+ checkpoints/
3
+ datasets/
4
+
5
  # Byte-compiled / optimized / DLL files
6
  __pycache__/
7
  *.py[cod]
gpt.ipynb ADDED
@@ -0,0 +1,708 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "attachments": {},
5
+ "cell_type": "markdown",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Learn GPT from scratch"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 36,
14
+ "metadata": {},
15
+ "outputs": [],
16
+ "source": [
17
+ "import os\n",
18
+ "\n",
19
+ "# We always start with a dataset to train on. Let's download the tiny shakespeare dataset\n",
20
+ "if not os.path.isfile(\"./datasets/corpora/shakespeare.txt\"):\n",
21
+ " !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt > datasets/corpora/shakespeare.txt"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 10,
27
+ "metadata": {},
28
+ "outputs": [],
29
+ "source": [
30
+ "with open(\"datasets/corpora/shakespeare.txt\", 'r', encoding='utf-8') as f:\n",
31
+ " text = f.read()"
32
+ ]
33
+ },
34
+ {
35
+ "attachments": {},
36
+ "cell_type": "markdown",
37
+ "metadata": {},
38
+ "source": [
39
+ "## Tokenization and dataset creation"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 11,
45
+ "metadata": {},
46
+ "outputs": [
47
+ {
48
+ "data": {
49
+ "text/plain": [
50
+ "<torch._C.Generator at 0x7f7b543cb430>"
51
+ ]
52
+ },
53
+ "execution_count": 11,
54
+ "metadata": {},
55
+ "output_type": "execute_result"
56
+ }
57
+ ],
58
+ "source": [
59
+ "import torch\n",
60
+ "import torch.nn as nn\n",
61
+ "import torch.optim as optim\n",
62
+ "from torch.optim import lr_scheduler\n",
63
+ "import torch.nn.functional as F\n",
64
+ "from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split\n",
65
+ "import pandas as pd\n",
66
+ "import numpy as np\n",
67
+ "import math\n",
68
+ "\n",
69
+ "torch.manual_seed(1337)"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": 12,
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "# Simple dumb ASCII character-level \"encoding\" since all training data is ASCII\n",
79
+ "def encode_text(text):\n",
80
+ " return([ord(t) for t in text])\n",
81
+ "\n",
82
+ "def decode_text(indices):\n",
83
+ " return([chr(x) for x in indices])\n"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": 13,
89
+ "metadata": {},
90
+ "outputs": [],
91
+ "source": [
92
+ "# Tensorify data, put it in dataset\n",
93
+ "data = torch.tensor(encode_text(text), dtype=torch.int32)\n",
94
+ "\n",
95
+ "split_idx = int(0.9 * len(data))\n",
96
+ "train_data = data[:split_idx]\n",
97
+ "test_data = data[split_idx:]"
98
+ ]
99
+ },
100
+ {
101
+ "attachments": {},
102
+ "cell_type": "markdown",
103
+ "metadata": {},
104
+ "source": [
105
+ "We have to make a custom PyTorch dataset class to automatically generate the \"context\" windows at load time. This allows us to avoid keeping these windows around in memory when not in use:"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": 31,
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "class TextDataset(Dataset):\n",
115
+ " def __init__(self, data_tensor, context_size):\n",
116
+ " self.data_tensor = data_tensor\n",
117
+ " self.context_size = context_size\n",
118
+ " \n",
119
+ " def __len__(self):\n",
120
+ " return len(self.data_tensor)\n",
121
+ "\n",
122
+ " def __getitem__(self, index):\n",
123
+ " if index < self.context_size:\n",
124
+ " x = F.pad(self.data_tensor[:index], (self.context_size - index, 0), value=0)\n",
125
+ " else:\n",
126
+ " x = self.data_tensor[index - self.context_size:index]\n",
127
+ " \n",
128
+ " y = self.data_tensor[index]\n",
129
+ " return x, y"
130
+ ]
131
+ },
132
+ {
133
+ "attachments": {},
134
+ "cell_type": "markdown",
135
+ "metadata": {},
136
+ "source": [
137
+ "NOTE 2023-03-25: I think this is bugged, and that's the reason the training loss is so damn high. Testing:"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": 34,
143
+ "metadata": {},
144
+ "outputs": [
145
+ {
146
+ "name": "stdout",
147
+ "output_type": "stream",
148
+ "text": [
149
+ "Step 0:\n",
150
+ "[0, 0, 0, 0, 0, 0, 0, 0]\n",
151
+ "---\n",
152
+ "[0, 0, 0, 0, 0, 0, 0, 70]\n",
153
+ "---\n",
154
+ "['F', 'i']\n",
155
+ "Step 1:\n",
156
+ "[0, 0, 0, 0, 0, 0, 70, 105]\n",
157
+ "---\n",
158
+ "[0, 0, 0, 0, 0, 70, 105, 114]\n",
159
+ "---\n",
160
+ "['r', 's']\n",
161
+ "Step 2:\n",
162
+ "[0, 0, 0, 0, 70, 105, 114, 115]\n",
163
+ "---\n",
164
+ "[0, 0, 0, 70, 105, 114, 115, 116]\n",
165
+ "---\n",
166
+ "['t', ' ']\n",
167
+ "Step 3:\n",
168
+ "[0, 0, 70, 105, 114, 115, 116, 32]\n",
169
+ "---\n",
170
+ "[0, 70, 105, 114, 115, 116, 32, 67]\n",
171
+ "---\n",
172
+ "['C', 'i']\n",
173
+ "Step 4:\n",
174
+ "[70, 105, 114, 115, 116, 32, 67, 105]\n",
175
+ "---\n",
176
+ "[105, 114, 115, 116, 32, 67, 105, 116]\n",
177
+ "---\n",
178
+ "['t', 'i']\n",
179
+ "Step 5:\n",
180
+ "[114, 115, 116, 32, 67, 105, 116, 105]\n",
181
+ "---\n",
182
+ "[115, 116, 32, 67, 105, 116, 105, 122]\n",
183
+ "---\n",
184
+ "['z', 'e']\n"
185
+ ]
186
+ }
187
+ ],
188
+ "source": [
189
+ "train_dataset = TextDataset(train_data, 8)\n",
190
+ "train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=False)\n",
191
+ "\n",
192
+ "step = 0\n",
193
+ "for x, y in train_dataloader:\n",
194
+ " print(f\"Step {step}:\")\n",
195
+ " for b in x.tolist():\n",
196
+ " print(b)\n",
197
+ " print(\"---\")\n",
198
+ "\n",
199
+ " print(decode_text(y.tolist()))\n",
200
+ " step += 1\n",
201
+ " if step > 5:\n",
202
+ " break\n",
203
+ "\n"
204
+ ]
205
+ },
206
+ {
207
+ "attachments": {},
208
+ "cell_type": "markdown",
209
+ "metadata": {},
210
+ "source": [
211
+ "## Attention is all you need (注目こそが必要なすべて)"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": 8,
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": [
220
+ "class MultiheadAttention(nn.Module):\n",
221
+ " def __init__(self, embed_dim, num_heads, dropout=0.0, bias=True, device=None, dtype=None):\n",
222
+ " super(MultiheadAttention, self).__init__()\n",
223
+ "\n",
224
+ " # Save variables\n",
225
+ " self.embed_dim = embed_dim\n",
226
+ " self.num_heads = num_heads\n",
227
+ " self.d_k = embed_dim // num_heads\n",
228
+ "\n",
229
+ " self.Q = nn.Linear(embed_dim, embed_dim, bias=bias)\n",
230
+ " self.K = nn.Linear(embed_dim, embed_dim, bias=bias)\n",
231
+ " self.V = nn.Linear(embed_dim, embed_dim, bias=bias)\n",
232
+ "\n",
233
+ " self.dropout = nn.Dropout(dropout)\n",
234
+ " self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)\n",
235
+ " nn.init.kaiming_normal_(self.out_proj.weight, mode='fan_in', nonlinearity='linear')\n",
236
+ "\n",
237
+ " def forward(self, query, key, value, key_padding_mask=None):\n",
238
+ " batch_size = query.size(0)\n",
239
+ "\n",
240
+ " # Apply linear layers\n",
241
+ " q = self.Q(query) # [B, C, E]\n",
242
+ " k = self.K(key) # [B, C, E]\n",
243
+ " v = self.V(value) # [B, C, E]\n",
244
+ "\n",
245
+ " # Mutate dimensions so the attention matmul can get rid of the inner d_k\n",
246
+ " q = q.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2) # [batch_size, num_heads, C, d_k]\n",
247
+ " k = k.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2) # [batch_size, num_heads, C, d_k]\n",
248
+ " v = v.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2) # [batch_size, num_heads, C, d_k]\n",
249
+ " \n",
250
+ " # Get raw attention scores\n",
251
+ " scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) # [B, num_heads, C, C]\n",
252
+ "\n",
253
+ " # Apply mask, if necessary\n",
254
+ " if key_padding_mask is not None:\n",
255
+ " \"\"\"\n",
256
+ " MAY BE WORTH DEBUGGING\n",
257
+ "\n",
258
+ " if key_padding_mask.dim() == 3:\n",
259
+ " # If the mask is 3D, add an extra dimension for the num_heads\n",
260
+ " key_padding_mask = key_padding_mask.unsqueeze(1) # [batch_size, 1, seq_len, seq_len]\n",
261
+ " else:\n",
262
+ " # If the mask is 2D, add dimensions for the num_heads and the 'query' sequence length\n",
263
+ " key_padding_mask = key_padding_mask.unsqueeze(1).unsqueeze(2) # [batch_size, 1, 1, seq_len]\n",
264
+ " \"\"\"\n",
265
+ " # Apply the mask to attention scores\n",
266
+ " scores = scores.masked_fill(key_padding_mask, float('-inf'))\n",
267
+ "\n",
268
+ " # Normalize the (already scaled) scores into attention weights\n",
269
+ " attn = F.softmax(scores, dim=-1)\n",
270
+ " attn = self.dropout(attn)\n",
271
+ " out = attn @ v # [B, num_heads, C, d_k]\n",
272
+ "\n",
273
+ " # Concat and project\n",
274
+ " # Swap C and num_heads, force memory to coalesce, then fuse back num_heads and d_k together\n",
275
+ " out = out.transpose(1, 2).contiguous().view(batch_size, -1, self.embed_dim)\n",
276
+ " # Project: give attention \"time to think\". Maybe this should be part of a different module but whatever\n",
277
+ " out = self.out_proj(out)\n",
278
+ " return(out)\n",
279
+ "\n"
280
+ ]
281
+ },
282
+ {
283
+ "cell_type": "code",
284
+ "execution_count": 9,
285
+ "metadata": {},
286
+ "outputs": [],
287
+ "source": [
288
+ "class FeedForward(nn.Module):\n",
289
+ " def __init__(self, embed_dim, dropout):\n",
290
+ " super().__init__()\n",
291
+ " self.net = nn.Sequential(\n",
292
+ " nn.Linear(embed_dim, 4 * embed_dim),\n",
293
+ " nn.ReLU(),\n",
294
+ " nn.Dropout(dropout)\n",
295
+ " nn.Linear(4 * embed_dim, embed_dim),\n",
296
+ " )\n",
297
+ "\n",
298
+ " def forward(self, x):\n",
299
+ " return(self.net(x))"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": 10,
305
+ "metadata": {},
306
+ "outputs": [],
307
+ "source": [
308
+ "class Block(nn.Module):\n",
309
+ " \"\"\"Self-attention\"\"\"\n",
310
+ " def __init__(self, embed_dim, num_heads, mask, dropout=0.2):\n",
311
+ " super(Block, self).__init__() \n",
312
+ " self.register_buffer(\"mask\", mask)\n",
313
+ " self.head = MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, dropout=dropout)\n",
314
+ " self.ffwd = FeedForward(embed_dim=embed_dim, dropout=dropout)\n",
315
+ " self.ln1 = nn.LayerNorm(embed_dim)\n",
316
+ " self.ln2 = nn.LayerNorm(embed_dim)\n",
317
+ "\n",
318
+ " def forward(self, x):\n",
319
+ " # Residual connections\n",
320
+ " x = self.ln1(x)\n",
321
+ " x = x + self.head.forward(x, x, x, key_padding_mask=self.mask) \n",
322
+ " out = x + self.ffwd(self.ln2(x))\n",
323
+ " return out\n"
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "code",
328
+ "execution_count": 11,
329
+ "metadata": {},
330
+ "outputs": [],
331
+ "source": [
332
+ "class GPT(nn.Module):\n",
333
+ " def __init__(self, embedding_dim, vocab_size, context_size, lr=1e-3):\n",
334
+ " # Inherit PyTorch stuff\n",
335
+ " super(GPT, self).__init__()\n",
336
+ "\n",
337
+ " # Save variables for later\n",
338
+ " self.embedding_dim = embedding_dim\n",
339
+ " self.output_dim = vocab_size\n",
340
+ " self.context_size = context_size\n",
341
+ "\n",
342
+ " # Initialize layers. Sadly this breaks the whole \"self.layers\" concept but whatever\n",
343
+ " self.tok_embed = nn.Embedding(vocab_size, embedding_dim)\n",
344
+ " self.pos_embed = nn.Embedding(context_size, embedding_dim)\n",
345
+ "\n",
346
+ " NUM_HEADS=6\n",
347
+ " NUM_LAYERS=6\n",
348
+ " \n",
349
+ " mask = torch.tril(torch.ones(self.context_size, self.context_size)).bool()\n",
350
+ " mask = ~mask\n",
351
+ " self.register_buffer(mask)\n",
352
+ "\n",
353
+ " self.blocks = nn.Sequential(\n",
354
+ " *[Block(embed_dim=embedding_dim, num_heads=NUM_HEADS, mask=mask) for _ in range(NUM_LAYERS)],\n",
355
+ " nn.Dropout(0.2)\n",
356
+ " )\n",
357
+ "\n",
358
+ " # Final feed-forward layer from embeddings\n",
359
+ " self.ffwd = nn.Linear(embedding_dim, out_features=vocab_size)\n",
360
+ "\n",
361
+ " def forward(self, x):\n",
362
+ " tok_embed = self.tok_embed(x)\n",
363
+ " tok_embed = tok_embed.view(-1, self.context_size, self.embedding_dim)\n",
364
+ " pos_embed = self.pos_embed(torch.arange(0, self.context_size, device=\"cuda\")).unsqueeze(0)\n",
365
+ " x = tok_embed + pos_embed\n",
366
+ "\n",
367
+ " # The actual attention is all you need here!\n",
368
+ " # B*C*C cutting out the future\n",
369
+ " x = self.blocks(x)\n",
370
+ "\n",
371
+ " preds = self.ffwd(x)\n",
372
+ " return(preds)\n",
373
+ " \n",
374
+ " def infer(self, x):\n",
375
+ " with torch.no_grad():\n",
376
+ " res = self.forward(x)\n",
377
+ " return(res)\n"
378
+ ]
379
+ },
380
+ {
381
+ "attachments": {},
382
+ "cell_type": "markdown",
383
+ "metadata": {},
384
+ "source": [
385
+ "## Training"
386
+ ]
387
+ },
388
+ {
389
+ "cell_type": "code",
390
+ "execution_count": 19,
391
+ "metadata": {},
392
+ "outputs": [],
393
+ "source": [
394
+ "def compute_loss(model, criterion, x, y):\n",
395
+ " logits = model(x)\n",
396
+ " last_logits = logits[:, -1, :]\n",
397
+ " log_probs = nn.LogSoftmax(dim=1)(last_logits)\n",
398
+ " loss = criterion(log_probs, y.view(-1).long())\n",
399
+ " return loss"
400
+ ]
401
+ },
402
+ {
403
+ "cell_type": "code",
404
+ "execution_count": 47,
405
+ "metadata": {},
406
+ "outputs": [],
407
+ "source": [
408
+ "EMBEDDING_NDIM = 384\n",
409
+ "VOCAB_SIZE = 128\n",
410
+ "BATCH_SIZE=64\n",
411
+ "# \"Context window\"\n",
412
+ "BLOCK_SIZE=256\n",
413
+ "LR=1e-3\n",
414
+ "\n",
415
+ "train_dataset = TextDataset(train_data, BLOCK_SIZE)\n",
416
+ "test_dataset = TextDataset(train_data, BLOCK_SIZE)\n",
417
+ "\n",
418
+ "# Janky training code\n",
419
+ "model = GPT(\n",
420
+ " embedding_dim=EMBEDDING_NDIM, \n",
421
+ " vocab_size=VOCAB_SIZE,\n",
422
+ " context_size=BLOCK_SIZE,\n",
423
+ " lr=LR\n",
424
+ " )\n",
425
+ "\n",
426
+ "model = model.to('cuda')\n",
427
+ "optimizer = optim.AdamW(model.parameters(), lr=LR)\n",
428
+ "# TODO Fix this!\n",
429
+ "scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10000, gamma=0.2)\n",
430
+ "criterion = nn.NLLLoss()"
431
+ ]
432
+ },
433
+ {
434
+ "cell_type": "code",
435
+ "execution_count": 50,
436
+ "metadata": {},
437
+ "outputs": [
438
+ {
439
+ "name": "stdout",
440
+ "output_type": "stream",
441
+ "text": [
442
+ "Step 0; loss: 3.3686537742614746\n",
443
+ "Step 100; loss: 3.3535483678181968\n",
444
+ "Step 200; loss: 3.3484479188919067\n",
445
+ "Step 300; loss: 3.344235420227051\n",
446
+ "Step 400; loss: 3.338580369949341\n",
447
+ "Step 500; loss: 3.330465725490025\n",
448
+ "Step 600; loss: 3.333183079957962\n",
449
+ "Step 700; loss: 3.3319032986958823\n",
450
+ "Step 800; loss: 3.332624101638794\n",
451
+ "Step 900; loss: 3.3325188810175117\n",
452
+ "Step 1000; loss: 3.331260542074839\n",
453
+ "Step 1100; loss: 3.3311657355381894\n"
454
+ ]
455
+ },
456
+ {
457
+ "ename": "KeyboardInterrupt",
458
+ "evalue": "",
459
+ "output_type": "error",
460
+ "traceback": [
461
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
462
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
463
+ "\u001b[1;32m/home/ritsuko/projects/ai/micrograd/gpt.ipynb Cell 20\u001b[0m in \u001b[0;36m2\n\u001b[1;32m <a href='vscode-notebook-cell:/home/ritsuko/projects/ai/micrograd/gpt.ipynb#X24sZmlsZQ%3D%3D?line=24'>25</a>\u001b[0m \u001b[39m# Backward pass\u001b[39;00m\n\u001b[1;32m <a href='vscode-notebook-cell:/home/ritsuko/projects/ai/micrograd/gpt.ipynb#X24sZmlsZQ%3D%3D?line=25'>26</a>\u001b[0m optimizer\u001b[39m.\u001b[39mzero_grad()\n\u001b[0;32m---> <a href='vscode-notebook-cell:/home/ritsuko/projects/ai/micrograd/gpt.ipynb#X24sZmlsZQ%3D%3D?line=26'>27</a>\u001b[0m loss\u001b[39m.\u001b[39;49mbackward()\n\u001b[1;32m <a href='vscode-notebook-cell:/home/ritsuko/projects/ai/micrograd/gpt.ipynb#X24sZmlsZQ%3D%3D?line=27'>28</a>\u001b[0m optimizer\u001b[39m.\u001b[39mstep()\n\u001b[1;32m <a href='vscode-notebook-cell:/home/ritsuko/projects/ai/micrograd/gpt.ipynb#X24sZmlsZQ%3D%3D?line=28'>29</a>\u001b[0m scheduler\u001b[39m.\u001b[39mstep()\n",
464
+ "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/_tensor.py:396\u001b[0m, in \u001b[0;36mTensor.backward\u001b[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001b[0m\n\u001b[1;32m 387\u001b[0m \u001b[39mif\u001b[39;00m has_torch_function_unary(\u001b[39mself\u001b[39m):\n\u001b[1;32m 388\u001b[0m \u001b[39mreturn\u001b[39;00m handle_torch_function(\n\u001b[1;32m 389\u001b[0m Tensor\u001b[39m.\u001b[39mbackward,\n\u001b[1;32m 390\u001b[0m (\u001b[39mself\u001b[39m,),\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 394\u001b[0m create_graph\u001b[39m=\u001b[39mcreate_graph,\n\u001b[1;32m 395\u001b[0m inputs\u001b[39m=\u001b[39minputs)\n\u001b[0;32m--> 396\u001b[0m torch\u001b[39m.\u001b[39;49mautograd\u001b[39m.\u001b[39;49mbackward(\u001b[39mself\u001b[39;49m, gradient, retain_graph, create_graph, inputs\u001b[39m=\u001b[39;49minputs)\n",
465
+ "File \u001b[0;32m~/.local/lib/python3.10/site-packages/torch/autograd/__init__.py:173\u001b[0m, in \u001b[0;36mbackward\u001b[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[0m\n\u001b[1;32m 168\u001b[0m retain_graph \u001b[39m=\u001b[39m create_graph\n\u001b[1;32m 170\u001b[0m \u001b[39m# The reason we repeat same the comment below is that\u001b[39;00m\n\u001b[1;32m 171\u001b[0m \u001b[39m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[1;32m 172\u001b[0m \u001b[39m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[0;32m--> 173\u001b[0m Variable\u001b[39m.\u001b[39;49m_execution_engine\u001b[39m.\u001b[39;49mrun_backward( \u001b[39m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[1;32m 174\u001b[0m tensors, grad_tensors_, retain_graph, create_graph, inputs,\n\u001b[1;32m 175\u001b[0m allow_unreachable\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m, accumulate_grad\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n",
466
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
467
+ ]
468
+ }
469
+ ],
470
+ "source": [
471
+ "from torch.utils.\n",
472
+ "EPOCHS = 1\n",
473
+ "STEPS = 5000\n",
474
+ "VAL_INTERVAL = 100\n",
475
+ "\n",
476
+ "losses = []\n",
477
+ "model.train()\n",
478
+ "\n",
479
+ "train_dataloader = DataLoader(\n",
480
+ " train_dataset, \n",
481
+ " batch_size=BATCH_SIZE, \n",
482
+ " shuffle=True, \n",
483
+ " num_workers=4\n",
484
+ ")\n",
485
+ "\n",
486
+ "test_dataloader = DataLoader(test_dataset, batch_size=512, num_workers=4, shuffle=True)\n",
487
+ "\n",
488
+ "step = 0\n",
489
+ "for epoch in range(EPOCHS):\n",
490
+ " for data, target in train_dataloader:\n",
491
+ " data = data.to('cuda')\n",
492
+ " target = target.to('cuda')\n",
493
+ "\n",
494
+ " loss = compute_loss(model, criterion, data, target)\n",
495
+ "\n",
496
+ " # Backward pass\n",
497
+ " optimizer.zero_grad()\n",
498
+ " loss.backward()\n",
499
+ " optimizer.step()\n",
500
+ " scheduler.step()\n",
501
+ "\n",
502
+ " losses.append(loss.cpu().detach().numpy())\n",
503
+ "\n",
504
+ " if step % VAL_INTERVAL == 0:\n",
505
+ " with torch.no_grad():\n",
506
+ " model.eval()\n",
507
+ " for x, y in test_dataloader:\n",
508
+ " x = x.to(\"cuda\")\n",
509
+ " y = y.to(\"cuda\")\n",
510
+ "\n",
511
+ " batch_loss = compute_loss(model, criterion, x, y)\n",
512
+ " total_loss += batch_loss.item() * 512\n",
513
+ " total_samples += 512\n",
514
+ " if total_samples > 10:\n",
515
+ " break\n",
516
+ "\n",
517
+ " average_loss = total_loss / total_samples\n",
518
+ " print(f\"Step {step}; loss: {average_loss}\")\n",
519
+ " model.train()\n",
520
+ "\n",
521
+ " step += 1\n",
522
+ " if step >= STEPS:\n",
523
+ " break\n"
524
+ ]
525
+ },
526
+ {
527
+ "cell_type": "code",
528
+ "execution_count": 15,
529
+ "metadata": {},
530
+ "outputs": [],
531
+ "source": [
532
+ "PATH = \"checkpoints/model.pt\""
533
+ ]
534
+ },
535
+ {
536
+ "cell_type": "code",
537
+ "execution_count": 36,
538
+ "metadata": {},
539
+ "outputs": [],
540
+ "source": [
541
+ "\n",
542
+ "# Store\n",
543
+ "torch.save({\n",
544
+ " 'steps': step,\n",
545
+ " 'model_state_dict': model.state_dict(),\n",
546
+ " 'optimizer_state_dict': optimizer.state_dict(),\n",
547
+ "}, PATH)"
548
+ ]
549
+ },
550
+ {
551
+ "cell_type": "code",
552
+ "execution_count": 18,
553
+ "metadata": {},
554
+ "outputs": [],
555
+ "source": [
556
+ "checkpoint = torch.load(PATH)\n",
557
+ "model.load_state_dict(checkpoint['model_state_dict'])\n",
558
+ "optimizer.load_state_dict(checkpoint['optimizer_state_dict'])"
559
+ ]
560
+ },
561
+ {
562
+ "attachments": {},
563
+ "cell_type": "markdown",
564
+ "metadata": {},
565
+ "source": [
566
+ "Now we test for overfitting:"
567
+ ]
568
+ },
569
+ {
570
+ "cell_type": "code",
571
+ "execution_count": 37,
572
+ "metadata": {},
573
+ "outputs": [
574
+ {
575
+ "data": {
576
+ "text/plain": [
577
+ "2399"
578
+ ]
579
+ },
580
+ "execution_count": 37,
581
+ "metadata": {},
582
+ "output_type": "execute_result"
583
+ }
584
+ ],
585
+ "source": [
586
+ "import gc\n",
587
+ "gc.collect()"
588
+ ]
589
+ },
590
+ {
591
+ "cell_type": "code",
592
+ "execution_count": 51,
593
+ "metadata": {},
594
+ "outputs": [
595
+ {
596
+ "name": "stdout",
597
+ "output_type": "stream",
598
+ "text": [
599
+ "3.4188449382781982\n"
600
+ ]
601
+ }
602
+ ],
603
+ "source": [
604
+ "model.eval()\n",
605
+ "total_loss = 0.0\n",
606
+ "total_samples = 0\n",
607
+ "\n",
608
+ "test_dataloader = DataLoader(test_dataset, batch_size=512, num_workers=4)\n",
609
+ "with torch.no_grad():\n",
610
+ " for x, y in test_dataloader:\n",
611
+ " x = x.to(\"cuda\")\n",
612
+ " y = y.to(\"cuda\")\n",
613
+ "\n",
614
+ " batch_loss = compute_loss(model, criterion, x, y)\n",
615
+ " total_loss += batch_loss.item() * x.size(0)\n",
616
+ " total_samples += x.size(0)\n",
617
+ " if total_samples > 100:\n",
618
+ " break\n",
619
+ "\n",
620
+ " average_loss = total_loss / total_samples\n",
621
+ " print(average_loss)"
622
+ ]
623
+ },
624
+ {
625
+ "cell_type": "code",
626
+ "execution_count": null,
627
+ "metadata": {},
628
+ "outputs": [],
629
+ "source": []
630
+ },
631
+ {
632
+ "attachments": {},
633
+ "cell_type": "markdown",
634
+ "metadata": {},
635
+ "source": [
636
+ "Finally, we generate:"
637
+ ]
638
+ },
639
+ {
640
+ "cell_type": "code",
641
+ "execution_count": 52,
642
+ "metadata": {},
643
+ "outputs": [
644
+ {
645
+ "name": "stdout",
646
+ "output_type": "stream",
647
+ "text": [
648
+ ",n aon mr\n",
649
+ "nr\n",
650
+ "egtel s.mangtVk h\n",
651
+ " -hinSfii ol ihIraddeioi akpshaC.n trU d aamooaa eoeEhl:daoUabo'm-fddE auh hpyHs wv'erstiInnmwt hnAuNu ufl\n",
652
+ "I: rl.T l!eool'lIhl:aynet nna:i yaneehtea hdel\n",
653
+ " hse l;imi\n",
654
+ " hgy f iuto eoh gBum.umhemvt\n",
655
+ "a hFo lNsute oaaenh;byeon"
656
+ ]
657
+ }
658
+ ],
659
+ "source": [
660
+ "g_cuda = torch.Generator(device='cuda')\n",
661
+ "\n",
662
+ "contexts = torch.tensor(encode_text(\"God\"), dtype=torch.int32).to('cuda')\n",
663
+ "GEN_LENGTH=256\n",
664
+ "\n",
665
+ "model.eval()\n",
666
+ "for i in range(GEN_LENGTH):\n",
667
+ " transform = nn.LogSoftmax(1)\n",
668
+ " # What happens if GEN_LENGTH > CONTEXT? don't worry about it\n",
669
+ " #x = F.pad(contexts[:, -BLOCK_SIZE:], (0, BLOCK_SIZE - contexts.size(0)), \"constant\", 0)\n",
670
+ " x = contexts[-BLOCK_SIZE:]\n",
671
+ " x = F.pad(x, (0, BLOCK_SIZE - x.size(0)), \"constant\", 0).unsqueeze(0) # B*T\n",
672
+ " preds = model.infer(x)\n",
673
+ " preds = preds.squeeze(0)\n",
674
+ " probs = torch.softmax(preds, dim=-1)\n",
675
+ "\n",
676
+ " # TODO: Broken because of bug with the trailing 0s. FIX THIS\n",
677
+ " next_char = torch.multinomial(torch.exp(preds[(-1 if i >= BLOCK_SIZE else i), :]), num_samples=1, generator=g_cuda)\n",
678
+ " #context = torch.cat(context, next_char)\n",
679
+ " contexts = torch.cat((contexts, next_char), dim=0)\n",
680
+ " print(decode_text(next_char.cpu().numpy())[-1], end=\"\")\n",
681
+ "\n",
682
+ "#print(\"\".join(decode_text(contexts.cpu().numpy())))"
683
+ ]
684
+ }
685
+ ],
686
+ "metadata": {
687
+ "kernelspec": {
688
+ "display_name": "Python 3",
689
+ "language": "python",
690
+ "name": "python3"
691
+ },
692
+ "language_info": {
693
+ "codemirror_mode": {
694
+ "name": "ipython",
695
+ "version": 3
696
+ },
697
+ "file_extension": ".py",
698
+ "mimetype": "text/x-python",
699
+ "name": "python",
700
+ "nbconvert_exporter": "python",
701
+ "pygments_lexer": "ipython3",
702
+ "version": "3.10.10"
703
+ },
704
+ "orig_nbformat": 4
705
+ },
706
+ "nbformat": 4,
707
+ "nbformat_minor": 2
708
+ }
makemore_bigram.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
makemore_mlp.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
makemore_mlp2.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
makemore_wavenet.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
micrograd.ipynb ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import math\n",
10
+ "import numpy as np\n",
11
+ "import matplotlib.pyplot as plt\n",
12
+ "%matplotlib inline"
13
+ ]
14
+ },
15
+ {
16
+ "attachments": {},
17
+ "cell_type": "markdown",
18
+ "metadata": {},
19
+ "source": [
20
+ "# What are derivatives?"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 3,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "def f(x):\n",
30
+ " \"\"\"Random quadratic\"\"\"\n",
31
+ " return 3*x ** 2 - 4*x + 5"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 4,
37
+ "metadata": {},
38
+ "outputs": [
39
+ {
40
+ "data": {
41
+ "text/plain": [
42
+ "20"
43
+ ]
44
+ },
45
+ "execution_count": 4,
46
+ "metadata": {},
47
+ "output_type": "execute_result"
48
+ }
49
+ ],
50
+ "source": [
51
+ "f(3)"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": 6,
57
+ "metadata": {},
58
+ "outputs": [
59
+ {
60
+ "data": {
61
+ "text/plain": [
62
+ "[<matplotlib.lines.Line2D at 0x7fac624994b0>]"
63
+ ]
64
+ },
65
+ "execution_count": 6,
66
+ "metadata": {},
67
+ "output_type": "execute_result"
68
+ },
69
+ {
70
+ "data": {
71
+ "image/png": "",
72
+ "text/plain": [
73
+ "<Figure size 640x480 with 1 Axes>"
74
+ ]
75
+ },
76
+ "metadata": {},
77
+ "output_type": "display_data"
78
+ }
79
+ ],
80
+ "source": [
81
+ "xs = np.arange(-10, 10, 0.5)\n",
82
+ "ys = f(xs)\n",
83
+ "plt.plot(xs, ys)"
84
+ ]
85
+ },
86
+ {
87
+ "attachments": {},
88
+ "cell_type": "markdown",
89
+ "metadata": {},
90
+ "source": [
91
+ "Now let's implement a derivative in code, since the actual expression is too difficult:"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": 10,
97
+ "metadata": {},
98
+ "outputs": [
99
+ {
100
+ "data": {
101
+ "text/plain": [
102
+ "0.0029999999995311555"
103
+ ]
104
+ },
105
+ "execution_count": 10,
106
+ "metadata": {},
107
+ "output_type": "execute_result"
108
+ }
109
+ ],
110
+ "source": [
111
+ "h = 10 ** -3\n",
112
+ "x = 2/3\n",
113
+ "(f(x + h) - f(x)) / h"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 11,
119
+ "metadata": {},
120
+ "outputs": [
121
+ {
122
+ "name": "stdout",
123
+ "output_type": "stream",
124
+ "text": [
125
+ "4.0\n"
126
+ ]
127
+ }
128
+ ],
129
+ "source": [
130
+ "a = 2.0\n",
131
+ "b = -3.0\n",
132
+ "c = 10.0\n",
133
+ "d = a*b + c\n",
134
+ "print(d)"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 13,
140
+ "metadata": {},
141
+ "outputs": [
142
+ {
143
+ "name": "stdout",
144
+ "output_type": "stream",
145
+ "text": [
146
+ "d1 4.0\n",
147
+ "d2 4.0002\n",
148
+ "slope 2.0000000000042206\n"
149
+ ]
150
+ }
151
+ ],
152
+ "source": [
153
+ "h = 0.0001\n",
154
+ "\n",
155
+ "# Random inputs\n",
156
+ "a = 2.0\n",
157
+ "b = -3.0\n",
158
+ "c= 10.0\n",
159
+ "\n",
160
+ "d1 = a*b + c\n",
161
+ "b += h\n",
162
+ "d2 = a*b + c\n",
163
+ "\n",
164
+ "print('d1', d1)\n",
165
+ "print('d2', d2)\n",
166
+ "print('slope', (d2 - d1) /h)"
167
+ ]
168
+ },
169
+ {
170
+ "attachments": {},
171
+ "cell_type": "markdown",
172
+ "metadata": {},
173
+ "source": [
174
+ "## Value object"
175
+ ]
176
+ },
177
+ {
178
+ "cell_type": "code",
179
+ "execution_count": 24,
180
+ "metadata": {},
181
+ "outputs": [],
182
+ "source": [
183
+ "class Value:\n",
184
+ " def __init__(self, data, _children=(), _op=''):\n",
185
+ " self.data = data\n",
186
+ " # Changing this variable doesn't change the loss\n",
187
+ " self.grad = 0.0\n",
188
+ " self._prev = set(_children)\n",
189
+ " self._op = _op\n",
190
+ " self._backward = lambda: None\n",
191
+ "\n",
192
+ " def __repr__(self):\n",
193
+ " return f\"Value(data={self.data})\"\n",
194
+ "\n",
195
+ " def __add__(self, other):\n",
196
+ " out = Value(self.data + other.data, (self, other), '+')\n",
197
+ " def _backward():\n",
198
+ " self.grad = 1.0 * out.grad\n",
199
+ " other.grad = 1.0 * out.grad\n",
200
+ " out._backward = _backward\n",
201
+ " return out\n",
202
+ "\n",
203
+ " def __mul__(self, other):\n",
204
+ " out = Value(self.data * other.data, (self, other), '*')\n",
205
+ "\n",
206
+ " def _backward(): \n",
207
+ " self.grad = other.data * out.grad\n",
208
+ " other.grad = self.data * out.grad\n",
209
+ " out._backward = _backward\n",
210
+ " return out\n",
211
+ "\n",
212
+ " def tanh(self):\n",
213
+ " n = self.data\n",
214
+ " t = (math.exp(2*n) - 1) / (math.exp (2*n + 1))\n",
215
+ " out = Value(t, (self, ), 'tanh')\n",
216
+ " def _backward():\n",
217
+ " # Local derivative \n",
218
+ " self.grad = (1 - t**2) * out.grad\n",
219
+ " \n",
220
+ " out._backward = _backward\n",
221
+ " return out\n"
222
+ ]
223
+ },
224
+ {
225
+ "attachments": {},
226
+ "cell_type": "markdown",
227
+ "metadata": {},
228
+ "source": [
229
+ "this is the alternative, and I do think it feels a bit sharper"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "code",
234
+ "execution_count": 26,
235
+ "metadata": {},
236
+ "outputs": [
237
+ {
238
+ "data": {
239
+ "text/plain": [
240
+ "'+'"
241
+ ]
242
+ },
243
+ "execution_count": 26,
244
+ "metadata": {},
245
+ "output_type": "execute_result"
246
+ }
247
+ ],
248
+ "source": [
249
+ "a = Value(2.0)\n",
250
+ "b = Value(-3.0)\n",
251
+ "c = Value(10.0)\n",
252
+ "d = a*b + c\n",
253
+ "d._op"
254
+ ]
255
+ }
256
+ ],
257
+ "metadata": {
258
+ "kernelspec": {
259
+ "display_name": "venv",
260
+ "language": "python",
261
+ "name": "python3"
262
+ },
263
+ "language_info": {
264
+ "codemirror_mode": {
265
+ "name": "ipython",
266
+ "version": 3
267
+ },
268
+ "file_extension": ".py",
269
+ "mimetype": "text/x-python",
270
+ "name": "python",
271
+ "nbconvert_exporter": "python",
272
+ "pygments_lexer": "ipython3",
273
+ "version": "3.10.9 (main, Dec 19 2022, 17:35:49) [GCC 12.2.0]"
274
+ },
275
+ "orig_nbformat": 4,
276
+ "vscode": {
277
+ "interpreter": {
278
+ "hash": "632cf67f3dc7ae5e2d87dcad018c0f1b1cd2d0aa8b91fc5df1dd41dbdb2fa25c"
279
+ }
280
+ }
281
+ },
282
+ "nbformat": 4,
283
+ "nbformat_minor": 2
284
+ }
readme.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Neural Networks: Zero to Hero
2
+
3
+ This repository contains my personal implementation of neural network models, based on Andrej Karpathy's [Neural Networks: Zero to Hero](https://www.youtube.com/playlist?list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ) tutorials on the basics of neural nets and transformers. The code includes models for bigram language modeling, CNN-based natural language processing, and transformer models. These models are designed to help me understand the basic concepts of neural networks and their applications in NLP.
4
+
5
+ This code is heavily inspired by Karpathy's own [nn-zero-to-hero code](https://github.com/karpathy/nn-zero-to-hero), but is not a direct fork. The code is written in Jupyter notebooks and uses the PyTorch library for building and training the models. Each model is implemented as a standalone notebook and can be run independently.
6
+
7
+ Please note that this code is not intended for production use (obviously) and should be used for educational purposes only. If you find any bugs or have suggestions for improvement, please feel free to contribute!