In [1]:
import torch
from contextlib import nullcontext
from bigram_model import BigramLanguageModel
from tokenizer_utils import IntCharTokenizer

In [2]:
import os

In [3]:
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [4]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Name of the mixed-precision dtype: 'bfloat16' when CUDA is present and
# reports bf16 support, 'float16' in every other case.
if device == "cuda" and torch.cuda.is_bf16_supported():
    dtype = 'bfloat16'
else:
    dtype = 'float16'

# Resolve the dtype name to the torch dtype object of the same name.
ptdtype = getattr(torch, dtype)

# Autocast is only set up for non-CPU devices; on CPU the context is a no-op.
if device == 'cpu':
    ctx = nullcontext()
else:
    ctx = torch.amp.autocast(device_type=device, dtype=ptdtype)

# The gradient scaler is requested only when the chosen dtype is float16.
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

In [5]:
from data_utils import *
# Hyperparameters bundled into one dict (later stored inside the checkpoint).
# n_layer, n_head, n_embed, BLOCK_SIZE and dropout are not defined in this
# notebook; presumably they come from the data_utils star import - verify.
model_args = {
    'n_layer': n_layer,
    'n_head': n_head,
    'n_embd': n_embed,
    'block_size': BLOCK_SIZE,
    'bias': False,
    'vocab_size': None,
    'dropout': dropout,
}

In [6]:
from data_utils import *
# Smoke test: draw one training batch, place it on the selected device and
# push it through a freshly constructed model.
xb, yb = get_random_batch('train')
xb, yb = xb.to(device), yb.to(device)

m = BigramLanguageModel(
    vocab_size=65,
    n_embed=n_embed,
    block_size=BLOCK_SIZE,
    num_heads=n_head,
    n_layers=n_layer,
).to(device)

# The model is called with inputs and targets and returns (logits, loss).
logits, loss = m(xb, yb)
print(logits.shape, loss, sep="\n")

torch.Size([128, 256, 65])
tensor(4.3690, device='cuda:0', grad_fn=)


In [7]:

@torch.no_grad()
def estimate_loss(model):
    """Estimate the mean loss of `model` on the 'train' and 'val' splits.

    For each split, `eval_iters` random batches are drawn with
    `get_random_batch(split)` and run through the model inside the
    notebook-level `ctx` context; the per-batch losses are averaged.

    The whole function runs under `torch.no_grad()`, so the evaluation
    forward passes do not record an autograd graph.

    Args:
        model: callable as `model(X, Y)` returning `(logits, loss)`, and
            exposing `eval()` / `train()`.

    Returns:
        dict with keys 'train' and 'val'; each value is a 0-dim tensor holding
        the mean of the sampled losses for that split.

    Side effects:
        Puts the model into eval mode for the duration of the call and always
        switches it back to training mode afterwards, even if a batch raises.
    """
    out = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_random_batch(split)
                with ctx:
                    _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Never leave the caller's model stuck in eval mode.
        model.train()
    return out

In [8]:
# Load the text first, then hand it to the tokenizer loader.
raw_text = load_text()
char_tokenizer = load_int_char_tokenizer(raw_text)

In [9]:
# Report the model size in millions of parameters.
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

# Explicitly select training mode so this cell does not depend on whatever
# mode an earlier cell left the model in.
m.train()

# NOTE(review): the forward pass below does not use the `ctx` autocast context
# or the `scaler` created in the setup cell, while estimate_loss() does run
# under `ctx` - confirm whether training is meant to use mixed precision.

# `step` rather than `iter`, so the builtin iter() is not shadowed for the rest
# of the kernel session.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss(m)
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_random_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)

    # clear old gradients, backpropagate, then apply the update
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


10.788929 M parameters


step 0: train loss 4.3685, val loss 4.3640
step 500: train loss 1.9681, val loss 2.0837
step 1000: train loss 1.5377, val loss 1.7404
step 1500: train loss 1.3802, val loss 1.6101
step 2000: train loss 1.2855, val loss 1.5551
step 2500: train loss 1.2162, val loss 1.5157
step 3000: train loss 1.1617, val loss 1.5088
step 3500: train loss 1.1061, val loss 1.5088
step 4000: train loss 1.0555, val loss 1.5150
step 4500: train loss 1.0086, val loss 1.5385
step 4999: train loss 0.9583, val loss 1.5524


In [10]:
# Bundle everything needed to resume or reload the run into one dict.
checkpoint = {
    'model': m.state_dict(),
    'optimizer': optimizer.state_dict(),
    'model_args': model_args,
    'iter_num': max_iters,
    # NOTE(review): `losses` holds the result of the most recent evaluation,
    # so this is the latest val loss rather than a tracked minimum - confirm
    # that this is what 'best_val_loss' is meant to store.
    'best_val_loss': losses['val'],
}

out_dir = "./nano_gpt_ckpts"
# torch.save does not create missing directories; make sure the target exists
# so the cell also works on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)

print(f"saving checkpoint to {out_dir}")
torch.save(checkpoint, os.path.join(out_dir, 'ckpt_5k_iters.pt'))

saving checkpoint to ./nano_gpt_ckpts


In [11]:
#m2 = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer).to(device)

In [12]:
# generate from the model
# Switch to eval mode so train-only layers (e.g. dropout, if the model has any)
# do not perturb sampling.
m.eval()

# A 1x1 tensor holding token id 0, used as the starting context.
context = torch.zeros((1, 1), dtype=torch.long, device=device)

# Sampling needs no gradients; skip building the autograd graph.
with torch.no_grad():
    generated = m.generate(context, max_new_tokens=2000)

# Decode the first (only) sequence in the batch back to text.
print(char_tokenizer.decode(generated[0].tolist()))


GLOUCESTER: learn, like a nap. Prisoner will to my intents! with my brother! and this bloody makes off flows,--and haste tear'd your roe!--I should not be the other's.---I'ld do hear that be pupy with thear; sweet Montague,--thou as done not--So that they have nage must know,--never speak so many tears,--traightful ner-light,--with'd yet a ping tymp,--which time to stir; now still hurr'd,---water'd honour,--Pray's Coitlinius: the mountake's nobled daughter.' Sir, it is some thee on Rome is sin:--'proud him 'there;' none honest seen; forsweet must be pointed, hurls thee in men; a proud confines, foot, die, gin night, old Ratchard!--Go, good lord!--will'd you not piece, I dare not.' an't; swear by the dog, belike! mother!--How sir!-Spite! Jupiteous put o's!--God leave your lawful coward!'--for I'll dry down, you in death;'--near'---for very 'ven a day.---fa, by; 'twas his mother's disposed;--'I shall make no son,--hard him hear me,--do. Madam, or smother'd wife: and that you may part th

In [20]:
# Rebuild the architecture with the same hyperparameters used for training.
# The model is left on the CPU (no .to(device) here).
m3 = BigramLanguageModel(vocab_size=65, n_embed=n_embed, block_size=BLOCK_SIZE, num_heads=n_head, n_layers=n_layer)

# map_location="cpu" lets a checkpoint that was saved from CUDA tensors be
# loaded on a machine without a GPU; m3 lives on the CPU anyway.
ckpt = torch.load(os.path.join("./nano_gpt_ckpts", "ckpt_5k_iters.pt"), map_location="cpu")
m3.load_state_dict(ckpt['model'])

# A freshly constructed module starts in training mode; switch to eval mode
# for inference. The trailing semicolon suppresses the module repr in the cell.
m3.eval();



In [21]:
# Sample from the reloaded model.
# Make sure train-only layers (e.g. dropout, if the model has any) are
# disabled while sampling.
m3.eval()

# A 1x1 tensor holding token id 0, used as the starting context. No device is
# given, so it is created on the CPU.
context = torch.zeros((1, 1), dtype=torch.long)

# Sampling needs no gradients; skip building the autograd graph.
with torch.no_grad():
    generated = m3.generate(context, max_new_tokens=500)

# Decode the first (only) sequence in the batch back to text.
print(char_tokenizer.decode(generated[0].tolist()))



But Dohor, aged by! At Antigonus. You see his court! For death; a talm every hand, here shall!--So,--O, I, title now point!--Who, this I sem blind--that tark;--come boy?---O pray, peace! May, two here, do not---that I troth:----to villain leave, where was the Gallent--if I look the house,--bold Jour---whether may I go,--Mine son,---as I amiled me pized,--or so fled; 'tis a famouse,--there littenants,--If an either lawful hant ther is gone.' Sicilence, if it wer done! I have twize its sourness. P
