In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-10-27 16:11:32-- https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... 

connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-10-27 16:11:36 (734 KB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [2]:
# Read the whole corpus into memory as one string. The encoding is given explicitly
# so the result does not depend on the platform's default text encoding.
with open("input.txt", encoding="utf-8") as f:
    text = f.read()

In [4]:
# Peek at the first 50 characters to sanity-check that the corpus loaded.
text[:50]

'First Citizen:\nBefore we proceed any further, hear'

In [5]:
# The vocabulary is every distinct character that occurs in the corpus, sorted so
# that the character -> index assignment is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)

# First line: all vocabulary characters joined together; second line: their count.
print("".join(chars), vocab_size, sep="\n")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
# Character-level tokenizer: a character's token id is its position in `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = dict(enumerate(chars))  # integer id -> character


def encode(s):
    """Return the list of integer ids for the characters of string `s`.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string obtained by mapping every id in the iterable `l` through `itos`.

    Raises KeyError if an id is not a key of `itos`.
    """
    return "".join(itos[i] for i in l)


print(encode("hi there"))

# Round trip: decoding the encoding must reproduce the input string.
print(decode(encode("hi there")))

[46, 47, 1, 58, 46, 43, 56, 43]
hi there


In [8]:
import torch

# Encode the entire corpus once and keep it as a 1-D tensor of 64-bit integer ids
# (torch.int64 is the same dtype as torch.long).
data = torch.tensor(encode(text), dtype=torch.int64)

# Shape and dtype on the first line, then the first 100 token ids.
print(f"{data.shape} {data.dtype}", data[:100], sep="\n")

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44,
 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63,
 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1,
 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49,
 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47,
 58, 47, 64, 43, 52, 10, 0, 37, 53, 59])


In [9]:
# Hold out the last 10% of the token stream for validation; the first 90% is training data.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [10]:
# Seed PyTorch's global RNG so the batches sampled below are repeatable.
torch.manual_seed(1337)

# batch_size: number of sequences drawn per batch; block_size: tokens per sequence.
batch_size, block_size = 4, 8


def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    Reads the module-level `train_data`, `val_data`, `batch_size` and `block_size`
    at call time. `split == "train"` selects `train_data`; any other value selects
    `val_data`.

    Returns a pair `(x, y)` of stacked windows, each with `batch_size` rows of
    `block_size` tokens, where every row of `y` is the matching row of `x`
    shifted one position to the right in the source sequence.
    """
    source = train_data if split == "train" else val_data

    # randint's upper bound is exclusive, so start + block_size + 1 <= len(source)
    # and the shifted target window never runs past the end.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start : start + block_size])
        targets.append(source[start + 1 : start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)


# Draw one training batch and show inputs and targets with their shapes.
xb, yb = get_batch("train")

print("Inputs:", xb.shape, xb, sep="\n")
print("-----------")
print("Targets:", yb.shape, yb, sep="\n")

Inputs:
torch.Size([4, 8])
tensor([[24, 43, 58, 5, 57, 1, 46, 43],
 [44, 53, 56, 1, 58, 46, 39, 58],
 [52, 58, 1, 58, 46, 39, 58, 1],
 [25, 17, 27, 10, 0, 21, 1, 54]])
-----------
Targets:
torch.Size([4, 8])
tensor([[43, 58, 5, 57, 1, 46, 43, 39],
 [53, 56, 1, 58, 46, 39, 58, 1],
 [58, 1, 58, 46, 39, 58, 1, 46],
 [17, 27, 10, 0, 21, 1, 54, 39]])


In [11]:
import torch.nn as nn
from torch.nn import functional as F


class BigramLanguageModel(nn.Module):
    """First draft of the bigram model: a lookup table from token id to a logits row."""

    def __init__(self, vocab_size):
        super().__init__()
        # Square table: looking up token i yields a vector of vocab_size logits.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets):
        """Return the embedding lookup for `idx`, shaped `idx.shape + (vocab_size,)`.

        `targets` is accepted but not used by this draft.
        """
        return self.token_embedding_table(idx)

In [12]:
# Smoke-test the draft model on the batch sampled above.
m = BigramLanguageModel(vocab_size)
out = m(xb, yb)
print(out.shape)  # (B, T, C) = (4, 8, 65): one row of vocab_size logits per input token

torch.Size([4, 8, 65])


In [15]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are a pure lookup on the current token.

    The only parameters are a (vocab_size, vocab_size) embedding table; looking up
    token ``i`` returns the logits used to predict the token that follows it.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of next-token ids, shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, logits has shape (B, T, C) and loss
            is None. With targets, logits is returned flattened to (B*T, C), the
            layout passed to ``F.cross_entropy``, and loss is a scalar tensor.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # reshape (not view) so non-contiguous inputs, e.g. transposed targets, also work.
        logits = logits.reshape(B * T, C)
        loss = F.cross_entropy(logits, targets.reshape(B * T))
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend each row of ``idx`` by ``max_new_tokens`` sampled tokens.

        Args:
            idx: integer tensor of token ids, shape (B, T), used as the initial context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens) whose first T columns
            are the original ``idx``.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)  # (B, T, C); no targets, so the loss is None
            last_logits = logits[:, -1, :]  # (B, C): only the final position is sampled from
            probs = F.softmax(last_logits, dim=-1)  # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T + 1)

        return idx


# Instantiate the final model and evaluate it on the batch sampled earlier.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)  # (B*T, C) = (32, 65): forward flattens the logits when targets are passed
print(loss)

torch.Size([32, 65])
tensor(4.5262, grad_fn=<NllLossBackward0>)


In [16]:
# Start generation from a single token with id 0 (batch of 1, context length 1).
idx = torch.zeros((1, 1), dtype=torch.long)

# Generate 100 new tokens, take the only row of the batch, and decode it to text.
results = decode(m.generate(idx, max_new_tokens=100)[0].tolist())

print(results)


'JgC.JZWqUkpdtkSpmzjM-,RqzgaN?vC:hgjnAnBZDga-APqGUH!WdCbIb;$DefOYbEvcaKGMmnO'q$KdS-'ZH
.YSqr'X!Q! d;


In [17]:
# AdamW over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [19]:
# Train with larger batches than the 4-sequence demo above; get_batch reads this
# module-level value each time it is called.
batch_size = 32

for steps in range(10000):
    # Drop the gradients from the previous step before computing new ones.
    optimizer.zero_grad(set_to_none=True)

    xb, yb = get_batch("train")
    logits, loss = m(xb, yb)

    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only.
print(loss.item())

2.4206888675689697


In [20]:
# Sample again, now from the trained model, starting from a single token with id 0.
idx = torch.zeros((1, 1), dtype=torch.long)

# Generate 100 new tokens, take the only row of the batch, and decode it to text.
results = decode(m.generate(idx, max_new_tokens=100)[0].tolist())

print(results)


Hou'sy'ting'stis's w ys'stholealy woawhimedy it 'save,
Too:Had wh fo an, ZCENERUCHENar ee onds, th h


In [28]:
# Single-head causal self-attention on random data.
B, T, C = 4, 8, 32  # batch, time steps, channels

x = torch.randn(B, T, C)

head_size = 16
# The three projections are built in the order key, query, value, as before, so
# their random initialisation consumes the RNG in the same sequence.
key, query, value = (nn.Linear(C, head_size, bias=False) for _ in range(3))
k, q, v = key(x), query(x), value(x)  # each (B, T, head_size)

# Scaled dot-product scores: (B, T, 16) @ (B, 16, T) --> (B, T, T)
wei = (q @ k.transpose(-2, -1)) * head_size**-0.5

# Causal mask: position t may only attend to positions <= t. Masked scores become
# -inf so that softmax assigns them exactly zero weight.
tril = torch.ones(T, T).tril()
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = wei.softmax(dim=-1)

out = wei @ v  # (B, T, T) @ (B, T, 16) --> (B, T, 16)

out.shape


torch.Size([4, 8, 16])

In [29]:
# Attention weights of the first batch element: lower-triangular, and each row
# is a softmax output, so it sums to 1.
wei[0]


tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
 [0.3325, 0.6675, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
 [0.3578, 0.2873, 0.3550, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
 [0.2281, 0.1964, 0.2733, 0.3022, 0.0000, 0.0000, 0.0000, 0.0000],
 [0.2851, 0.1588, 0.2068, 0.1436, 0.2057, 0.0000, 0.0000, 0.0000],
 [0.2429, 0.1547, 0.1550, 0.1475, 0.2049, 0.0951, 0.0000, 0.0000],
 [0.1573, 0.1838, 0.1123, 0.1680, 0.1528, 0.1194, 0.1063, 0.0000],
 [0.1139, 0.1704, 0.0766, 0.1134, 0.1600, 0.1466, 0.1228, 0.0963]],
 grad_fn=<SelectBackward0>)