# %% [markdown]
# %% [markdown]
# # Training Shakespeare Himself
# %% [markdown]
# For those who are not part of the ARENA program and are curious as to what this is, this was my first significant AI/ML project! I made components for a decoder-only transformer, and trained it on a corpus consisting of text from Shakespeare. Scroll to the bottom to see some output :)
# %%
import torch as t
import numpy as np
from torch import nn
import fancy_einsum as einsum
import einops
import pandas as pd
# %% [markdown]
# ## transformer functions
# %% [markdown]
# This will be from the transformer components I made earlier this week, but I'll put down optimizations so it can use the GPU.
# And I did just that. The speed improvements are MASSIVE, wow!
# %%
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = t.device("cuda:0" if t.cuda.is_available() else "cpu")
# %%
def multihead_masked_attention(Q: t.Tensor, K: t.Tensor, V: t.Tensor, num_heads: int):
    '''
    Implements multihead masked (causal) attention on the matrices Q, K and V.

    Q: shape (batch, seq_len, nheads*headsize)
    K: shape (batch, seq_len, nheads*headsize)
    V: shape (batch, seq_len, nheads*headsize)

    Return: shape (batch, seq_len, nheads*headsize)

    Each query position may only attend to key positions at or before it.
    '''
    def split_heads(x: t.Tensor) -> t.Tensor:
        # (batch, seq, heads*head_size) -> (batch, heads, seq, head_size).
        # reshape raises if the last dim is not divisible by num_heads.
        return x.reshape(x.shape[0], x.shape[1], num_heads, -1).transpose(1, 2)

    Q, K, V = split_heads(Q), split_heads(K), split_heads(V)

    # scores[b, n, s, k] = <Q[b, n, s, :], K[b, n, k, :]>
    scores = Q @ K.transpose(-1, -2)
    assert scores.shape == t.Size([Q.shape[0], num_heads, Q.shape[2], K.shape[2]])
    scores = scores / Q.shape[-1] ** 0.5

    # Build the mask on the same device as the scores rather than on a global
    # device, so the function works wherever its inputs live.
    future = t.ones(
        scores.shape[-2], scores.shape[-1], dtype=t.bool, device=scores.device
    ).triu(diagonal=1)
    scores = scores.masked_fill(future, float("-inf"))

    weights = t.softmax(scores, dim=-1)
    result = weights @ V  # (batch, heads, seqQ, head_size)
    return result.transpose(1, 2).reshape(result.shape[0], result.shape[2], -1)
# %%
class MultiheadMaskedAttention(nn.Module):
    '''Causal multi-head self-attention with a fused Q/K/V projection.'''
    # Attribute names are kept as-is because they are the state_dict keys.
    WQKV: nn.Linear
    W0: nn.Linear

    def __init__(self, hidden_size: int, num_heads: int):
        # Must run before any submodule is assigned, otherwise nn.Module
        # cannot register the Linear layers as parameters.
        super().__init__()
        if hidden_size % num_heads != 0:
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})"
            )
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_size = hidden_size // num_heads
        # nn.Linear is a weight matrix plus bias that registers itself as
        # trainable parameters; one layer produces Q, K and V in a single matmul.
        self.WQKV = t.nn.Linear(self.hidden_size, 3 * hidden_size)
        self.W0 = t.nn.Linear(self.hidden_size, self.hidden_size)

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, seq, hidden_size)

        Return: shape (batch, seq, hidden_size)
        '''
        x = x.float()  # the Linear layers hold float32 weights
        # Split the fused projection into three hidden_size-wide slices.
        Q, K, V = self.WQKV(x).chunk(3, dim=-1)
        assert Q.shape == K.shape == V.shape == x.shape
        return self.W0(multihead_masked_attention(Q, K, V, self.num_heads))
# %%
from dataclasses import dataclass
@dataclass
class TransformerConfig:
    '''Constants used throughout your decoder-only transformer model.'''
    # The dataclass decorator generates the keyword-argument constructor
    # that the rest of the file relies on.
    num_layers: int
    num_heads: int
    vocab_size: int
    hidden_size: int
    max_seq_len: int
    dropout: float = 0.1
    layer_norm_epsilon: float = 1e-05
# %%
# from yesterday
class PositionalEncoding(nn.Module):
    '''Adds fixed sinusoidal position encodings to a batch of embeddings.

    Even feature columns hold sines, odd columns hold cosines; column pair i
    uses the wavelength 10000 ** (2 * i / embedding_dim).
    '''

    def __init__(self, embedding_dim: int, max_seq_len: int = 5000):
        super().__init__()
        if embedding_dim % 2 != 0:
            raise ValueError(f"embedding_dim must be even, got {embedding_dim}")
        self.dim = embedding_dim
        self.length = max_seq_len

        base = 10000  # hardcoded
        # Build the whole (max_seq_len, embedding_dim) table in one shot
        # instead of one Python call per position.
        positions = np.arange(max_seq_len)[:, None]
        wavelengths = base ** (2 * np.arange(embedding_dim // 2) / embedding_dim)
        angles = positions / wavelengths
        table = np.zeros((max_seq_len, embedding_dim))
        table[:, 0::2] = np.sin(angles)
        table[:, 1::2] = np.cos(angles)

        # A non-persistent buffer follows module.to(...) like a parameter but
        # stays out of the state_dict, so existing checkpoints still load.
        self.register_buffer("multMax", t.tensor(table, dtype=t.float), persistent=False)

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, seq_len, embedding_dim)

        Return: x plus the encodings of the first seq_len positions.
        '''
        # .to(x.device) is a no-op when the module already lives with x.
        return x + self.multMax[:x.shape[1]].to(x.device)
# %%
class MLP(nn.Module):
    '''Position-wise feed-forward block: expand to 4x hidden size, project back.'''

    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        # NOTE(review): the source for this Sequential was truncated after the
        # two Linear layers. The GELU and Dropout entries are reconstructed
        # (two stacked Linear layers with nothing between them collapse into
        # a single linear map) - confirm the layer order against any saved
        # checkpoint, since it fixes the state_dict indices.
        self.layers = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size * 4),
            nn.GELU(),
            nn.Linear(self.hidden_size * 4, self.hidden_size),
            nn.Dropout(config.dropout),
        )

    def forward(self, x: t.Tensor):
        '''
        x: shape (..., hidden_size)

        Return: float32 tensor of the same shape.
        '''
        x = x.float()  # the Linear layers hold float32 weights
        return self.layers(x).float()
# %%
class DecoderBlock(nn.Module):
    '''One transformer layer: attention and MLP sub-blocks, each inside a residual connection.'''

    def __init__(self, config: TransformerConfig):
        super().__init__()
        # NOTE(review): both Sequential bodies were truncated in the source
        # (the MLP one was empty). The MLP and LayerNorm entries below are
        # reconstructed - confirm against any saved checkpoint.
        self.attentionBlock = nn.Sequential(
            MultiheadMaskedAttention(config.hidden_size, config.num_heads),
            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon),
        )
        self.MLP = nn.Sequential(
            MLP(config),
            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon),
        )

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, seq, hidden_size)

        Return: float32 tensor of the same shape.
        '''
        partOne = x + self.attentionBlock(x)
        return (partOne + self.MLP(partOne)).float()
# %%
class DecoderOnlyTransformer(nn.Module):
    '''Token embedding + positional encoding + decoder blocks + unembedding.'''

    def __init__(self, config: TransformerConfig):
        super().__init__()
        self.tokenize = nn.Embedding(config.vocab_size, config.hidden_size).to(device)
        self.positionize = PositionalEncoding(config.hidden_size, config.max_seq_len)
        # NOTE(review): the tail of this Sequential was truncated in the
        # source; the final LayerNorm is reconstructed - confirm against any
        # saved checkpoint.
        self.restModel = nn.Sequential(
            *[DecoderBlock(config) for _ in range(config.num_layers)],
            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon),
        )

    @property
    def unembed(self) -> t.Tensor:
        # NOTE(review): the right-hand side of `self.unembed =` was missing.
        # Reconstructed as the transposed embedding matrix (tied weights);
        # a property always reflects the current embedding parameter.
        return self.tokenize.weight.T

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: integer token ids, assumed shape (batch, seq)

        Return: logits, shape (batch, seq, vocab_size)
        '''
        x = self.tokenize(x)
        x = self.positionize(x)
        toUnembed = self.restModel(x).to(device)
        return toUnembed @ self.unembed
# %% [markdown]
# ## Data Prep
# %% [markdown]
# Make the dataset to parse through all of the words
# %%
import re
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
class CustomTextDataset(Dataset):
    '''Sliding-window next-token dataset over a list of word tokens.

    Item idx is the pair (tokens[idx : idx+seq_len], tokens[idx+1 : idx+seq_len+1]).
    '''

    def __init__(self, words, seq_len, fractionOfWords):
        self.fractionOfWords = fractionOfWords
        self.words = words
        self.setOfWords = set(words)
        self.seq_len = seq_len
        self.max_len = len(self.words) - (self.seq_len + 1)
        self.vocab_size = len(self.setOfWords)
        # Sort once so both lookup tables share the same, reproducible ids.
        vocabulary = sorted(self.setOfWords)
        self.word_to_token = {word: idx for idx, word in enumerate(vocabulary)}
        self.token_to_word = dict(enumerate(vocabulary))
        self.allTokens = t.tensor([self.word_to_token[word] for word in self.words], device=device)
        if self.fractionOfWords > 0.9:
            print("Probably don't do this. Errors may about")

    def __len__(self):
        # Only the leading fraction of the possible windows is exposed.
        return int(self.max_len * self.fractionOfWords)

    def __getitem__(self, idx):
        window = self.allTokens[idx:idx + self.seq_len + 1]
        # The target is the input shifted one position to the left.
        return window[:-1], window[1:]

    def getDataSize(self):
        '''Return the vocabulary size.'''
        return self.vocab_size

    def convertToTokens(self, phrase: list) -> t.Tensor:
        '''Map a list of words to a 1-D tensor of token ids (KeyError on unknown words).'''
        return t.tensor([self.word_to_token[word] for word in phrase], device=device)

    def convertStringToTokenList(self, phrase: str) -> list:
        '''Split a string on word boundaries and map each piece to its token id.

        Raises KeyError if any piece is not in the vocabulary.
        '''
        words = re.split(r"\b", phrase)
        return [self.word_to_token[word] for word in words]

    def convertToText(self, tokens: t.Tensor):
        '''Map a 1-D tensor of token ids back to a list of words.'''
        # NOTE(review): the loop body was missing in the source; reconstructed
        # as a token_to_word lookup - confirm.
        return [self.token_to_word[token] for token in tokens.tolist()]

    def decodeList(self, words: list):
        '''Map a list of integer token ids back to a list of words.'''
        # NOTE(review): the loop body was missing in the source; reconstructed
        # as a token_to_word lookup - confirm.
        return [self.token_to_word[value] for value in words]

    def listToString(self, words: list) -> str:
        '''Concatenate the words with no separator.'''
        return "".join(words)
# %%
# Read the whole corpus; the context manager closes the handle afterwards.
# NOTE(review): the right-hand side of `text =` was missing in the source;
# reconstructed as a full read of the file.
with open("shakespeare.txt") as file:
    text = file.read()
# Splitting on word boundaries keeps whitespace and punctuation as tokens.
words = re.split(r"\b", text)
fractionOfWords = 0.1 # what percent of the corpus to train on
lengthOfSeq = 100
shak = CustomTextDataset(words, lengthOfSeq, fractionOfWords)
# %% [markdown]
# ## Running this data through a transformer
# %%
trainloader = DataLoader(shak, batch_size=32, shuffle=True)
# this specific one trained for 24 minutes and 9 seconds on colab GPU
thisConfig = TransformerConfig(
    num_layers = 4, # 6 layers in the Attention paper
    num_heads = 4, # 8 heads in Attention paper
    vocab_size = trainloader.dataset.getDataSize(), # 37000 tokens in Attention paper (?)
    hidden_size = 512, # recall that this = num_heads * headsize | 512 is the embedding dim used in Attention paper
    max_seq_len = lengthOfSeq,
    dropout = 0.1, # same as Attention paper
)
# %%
use_pretrained = True
# NOTE(review): NUM_EPOCHS was referenced but never defined in the visible
# source; 1 is a placeholder - set it to the intended value.
NUM_EPOCHS = 1

# Model, optimizer and loss are built identically for both branches.
myTransformer = DecoderOnlyTransformer(thisConfig).to(device)
optimizer = t.optim.Adam(myTransformer.parameters(), lr = 1e-3)
criterion = nn.CrossEntropyLoss().to(device)

if use_pretrained:
    print("Using Pre-trained Model!")
    # NOTE(review): the checkpoint path is blank in the source - fill it in.
    myTransformer.load_state_dict(t.load("", map_location=device))
    myTransformer.eval()
else:
    print("Training Model... better hope you got enough GPU!")
    losses = []
    myTransformer.train()
    for epoch in range(1, NUM_EPOCHS + 1):
        for inputs, targets in trainloader:
            outputs = myTransformer(inputs.to(device))
            # Flatten (batch, seq) so every position is one classification
            # example. CrossEntropyLoss takes class indices directly, which
            # avoids a (batch*seq, vocab) one-hot tensor.
            outputs = outputs.flatten(0, 1)
            targets = targets.to(device).flatten()
            loss = criterion(outputs, targets)
            # NOTE(review): the optimisation step was missing from the
            # source; reconstructed as the standard zero_grad/backward/step.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
# %%
# Losses only exist when the model was trained in this session.
if not use_pretrained:
    df = pd.DataFrame(losses)
# %%
# quick test - use the sample method if you wish to actually use the transformer:
# NOTE(review): the list literal was cut off after "\n" in the source; it is
# closed here with the visible elements only.
testPhrase = ["Be", " ", "not", " ", "afraid", " ", "to", " ", "the", " ", "Florentine", "\n"]
input = shak.convertToTokens(testPhrase)
input = input[None, :]  # add a batch dimension
# Most likely token at every position, for the single batch element.
tokens = myTransformer(input).argmax(dim=-1)[0]
# %% [markdown]
# # Sampling
# %%
def apply_sampling_methods(input_ids: t.Tensor, logits: t.Tensor, temperature=1.0, freq_penalty=0.0, top_k=0, top_p=0.0) -> int:
    '''
    Pick the next token id from `logits` using the requested sampling options.

    input_ids: shape (seq, ) - the tokens so far (used by the frequency penalty)
    logits: shape (vocab_size, ) - unnormalized log-probabilities

    Temperature 0 means greedy decoding. Otherwise temperature and frequency
    penalty adjust the logits, then top-k, top-p or plain sampling draws from them.
    (thanks callum for this method)
    '''
    assert input_ids.ndim == 1, "input_ids should be a 1D sequence of token ids"
    assert temperature >= 0, "Temperature should be non-negative"
    assert 0 <= top_p <= 1.0, "Top-p must be a probability"
    assert 0 <= top_k, "Top-k must be non-negative"
    assert top_p == 0 or top_k == 0, "At most one of top-p and top-k supported"

    if temperature == 0:
        return greedy_search(logits)

    if temperature != 1.0:
        logits = apply_temperature(logits, temperature)
    if freq_penalty != 0.0:
        logits = apply_freq_penalty(input_ids, logits, freq_penalty)

    if top_k > 0:
        return sample_top_k(logits, top_k)
    elif top_p > 0:
        return sample_top_p(logits, top_p)
    else:
        return sample_basic(logits)
def sample_tokens(
    model,
    encodeMethod,
    decodeMethod,
    initial_text: str,
    max_tokens_generated = 40,
    **kwargs) -> list:
    '''
    Autoregressively extend `initial_text` until the token limit is reached.

    model: callable mapping (1, seq) token ids to (1, seq, vocab) logits
    encodeMethod: maps the prompt string to a list of token ids
    decodeMethod: maps a list of token ids back to the decoded output
    max_tokens_generated: total length budget, prompt tokens included
    kwargs: forwarded to apply_sampling_methods (temperature, top_k, ...)

    Return: decodeMethod applied to prompt tokens + generated tokens.

    NOTE(review): the model/encodeMethod/decodeMethod parameters were missing
    from the truncated signature; they are restored from the call sites.
    '''
    model.eval()  # disable dropout while generating
    input_ids: list = encodeMethod(initial_text)
    generated_ids = []
    # parameters() is a generator; next() takes its first parameter, whose
    # device tells us where the model lives.
    model_device = next(model.parameters()).device
    tokens_to_generate = max_tokens_generated - len(input_ids)
    with t.no_grad():
        for _ in range(tokens_to_generate):
            new_input_ids = t.tensor(input_ids + generated_ids, dtype=t.int64, device=model_device)
            # Only the logits of the last position predict the next token.
            logits = model(new_input_ids.unsqueeze(0))[0, -1]
            new_token = apply_sampling_methods(new_input_ids, logits, **kwargs)
            # Without this append the context never grows.
            generated_ids.append(new_token)
    return decodeMethod(input_ids + generated_ids)
# quick test:
# NOTE(review): the list literal was cut off after "\n" in the source; it is
# closed here with the visible elements only.
testPhrase = ["Be", " ", "not", " ", "afraid", " ", "to", " ", "the", " ", "Florentine", "\n"]
input = shak.convertToTokens(testPhrase)
# %%
def greedy_search(logits):
    '''
    returns the most likely next token.

    logits: shape (vocab_size, ) - unnormalized log-probabilities

    The result is deterministic; ties are broken by whichever index
    torch.argmax reports, with no extra tie-breaking rule applied here.
    '''
    return t.argmax(logits, dim=-1).item()
# %%
def sample_basic(logits) -> int:
    '''
    samples from the distributions, possibly with temp and freq changes applied

    logits: shape (vocab_size, ) - unnormalized log-probabilities

    return: a sampled token
    '''
    # Categorical normalises the logits itself, so no softmax is needed.
    distribution = t.distributions.categorical.Categorical(logits=logits)
    return distribution.sample().item()
# Empirical check of sample_basic: draw N samples and compare the observed
# frequencies with `probs`. Adding a constant to every log-probability
# leaves the normalised distribution unchanged.
N = 20000
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_basic(unnormalized_logits) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
# NOTE(review): the assertion below is disabled, so "Tests passed!" is
# printed unconditionally.
#t.testing.assert_close(counts, probs, atol=0.01, rtol=0)
print("Tests passed!")
# %%
def apply_freq_penalty(input_ids: t.Tensor, logits: t.Tensor, freq_penalty: float) -> t.Tensor:
    '''
    Lower each token's logit in proportion to how often it already occurred.

    input_ids: shape (seq, )
    logits: shape (vocab_size, )

    Return: shape (vocab_size, )
    '''
    (vocab_size,) = logits.shape
    # occurrences[i] = how many times token i appears in input_ids
    occurrences = t.bincount(input_ids, minlength=vocab_size)
    return logits - freq_penalty * occurrences
# Manual check of apply_freq_penalty on a prompt with many repeated words.
# convertStringToTokenList raises KeyError if any piece of the prompt is not
# in the corpus vocabulary.
bieber_prompt = "And I was like baby, baby, baby, oh Like, baby, baby, baby, no Like, baby, baby, baby, oh I thought you'd always be mine, mine"
input_ids = shak.convertStringToTokenList(bieber_prompt)
logits = t.ones(shak.getDataSize()).to(device)
penalized_logits = apply_freq_penalty(t.tensor(input_ids).to(device), logits, 2.0)
#i believe mine is different!
#assert penalized_logits[5156].item() == -11, "Expected 6 occurrences of ' baby' with leading space"
#assert penalized_logits[14801].item() == -5, "Expected 3 occurrences of ' Baby' with leading space"
#print("Tests passed!")
# NOTE(review): 2037 is a hard-coded token id; which word it maps to depends
# on the vocabulary built from the corpus - confirm it is still the intended one.
print(penalized_logits[2037].item()) # should be low since it was found!
# %%
def apply_temperature(logits: t.Tensor, temperature: float) -> t.Tensor:
    '''
    Divide the logits by a strictly positive temperature.

    Values below 1 sharpen the distribution; values above 1 flatten it.
    '''
    assert temperature > 0, "temp cannot be less than or equal to 0"
    scaled = logits / temperature
    return scaled
# Manual check of apply_temperature at a very low and a very high
# temperature. The prints and assertions are disabled, so this cell only
# computes the two tensors.
logits = t.tensor([1, 2]).log()
cold_logits = apply_temperature(logits, 0.001)
#print('A low temperature "sharpens" or "peaks" the distribution: ', cold_logits)
#t.testing.assert_close(cold_logits, 1000.0 * logits)
hot_logits = apply_temperature(logits, 1000.0)
#print("A high temperature flattens the distribution: ", hot_logits)
#t.testing.assert_close(hot_logits, 0.001 * logits)
#print("Tests passed!")
# %%
# N_RUNS = 1
# your_prompt = "We are the champions, my friends"
# cases = [
# ("High freq penalty", dict(freq_penalty=100.0)),
# ("Negative freq penalty", dict(freq_penalty=-1.0)),
# ("Too hot!", dict(temperature=2.0)),
# ("Pleasantly cool", dict(temperature=0.7)),
# ("Pleasantly warm", dict(temperature=0.9)),
# ("Too cold!", dict(temperature=0.01)),
# ]
# for (name, kwargs) in cases:
# for i in range(N_RUNS):
# output = sample_tokens(myTransformer, shak.convertStringToTokenList,shak.decodeList, your_prompt, max_tokens_generated=24, **kwargs)
# print(f"Sample {i} with: {name} ({kwargs}):")
# print(f"Your model said: {shak.listToString(output)}\n")
# %%
def sample_top_k(logits: t.Tensor, top_k: int) -> int:
    '''
    logits: shape (vocab_size, ) - unnormalized log-probabilities
    top_k: only consider this many of the most likely tokens for sampling

    Return: a sampled token
    '''
    # Never ask for more candidates than the vocabulary holds.
    k = min(top_k, logits.shape[-1])
    best = t.topk(logits, k)
    # Sample among the k surviving logits, then map the position back to a
    # vocabulary id. This is the same distribution as masking the rest with
    # -inf, but it needs no mask tensor, so it runs on the logits' device.
    choice = t.distributions.categorical.Categorical(logits=best.values).sample()
    return best.indices[choice].item()
# Empirical check of sample_top_k: with k = 3 only the three largest entries
# of `probs` should ever be sampled. Reuses N from the earlier check.
k = 3
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_top_k(unnormalized_logits, k) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
# Expected frequencies: zero everything except the last k entries (probs is
# ascending), then renormalise.
expected = probs.clone()
expected[:-k] = 0
expected /= expected.sum()
# NOTE(review): the comparison below is disabled, so nothing is verified.
# print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
# #t.testing.assert_close(counts, expected, atol=0.01, rtol=0)
# print("Tests passed!")
# %%
def sample_top_p(logits: t.Tensor, top_p: float, min_tokens_to_keep: int = 1) -> int:
    '''
    logits: shape (vocab_size, ) - unnormalized log-probabilities
    top_p: keep the smallest set of most likely tokens whose total
        probability reaches this value
    min_tokens_to_keep: lower bound on the number of tokens kept

    Return: a sampled token
    '''
    # find the indices of important logits
    sorted_logits, sorted_indices = t.sort(logits, descending=True)
    cumulative = t.softmax(sorted_logits, dim=-1).cumsum(dim=-1)

    # Tokens whose running total is still below top_p are all kept, plus the
    # one that crosses the threshold.
    num_words_kept = int((cumulative < top_p).sum().item()) + 1
    num_words_kept = max(num_words_kept, min_tokens_to_keep)
    # Clamp to the vocabulary size: rounding can leave the final running
    # total just below top_p, which would otherwise index past the end.
    num_words_kept = min(num_words_kept, logits.shape[-1])

    # Sample among the kept logits and map the position back to a vocabulary
    # id. No mask tensor is needed, so this runs on the logits' device.
    kept_logits = sorted_logits[:num_words_kept]
    choice = t.distributions.categorical.Categorical(logits=kept_logits).sample()
    return sorted_indices[choice].item()
# Empirical checks of sample_top_p. All assertions are disabled, so these
# cells only compute `counts`; N is rebound here.

# Case 1: probabilities (0.2, 0.3, 0.5) with top_p = 0.5.
N = 2000
unnormalized_logits = t.tensor([0.2, 0.3, 0.5]).log() + 2.3456
samples = t.tensor([sample_top_p(unnormalized_logits, 0.5) for _ in range(N)])
counts = t.bincount(samples, minlength=len(unnormalized_logits)) / N
# print("top_p of 0.5 or lower should only return token 2: ", counts)
# assert counts[0] == 0 and counts[1] == 0
# Case 2: same probabilities with top_p just above 0.5.
N = 2000
unnormalized_logits = t.tensor([0.2, 0.3, 0.5]).log() + 2.3456
samples = t.tensor([sample_top_p(unnormalized_logits, 0.50001) for _ in range(N)])
counts = t.bincount(samples, minlength=len(unnormalized_logits)) / N
# print("top_p in (0.5, 0.8] should return tokens 1 and 2: ", counts)
# assert counts[0] == 0
# Case 3: five-token distribution with top_p = 0.71; the expected
# frequencies zero the two smallest entries and renormalise.
N = 4000
top_p = 0.71
probs = t.linspace(0, 0.4, 5)
unnormalized_logits = probs.log() + 1.2345
samples = t.tensor([sample_top_p(unnormalized_logits, top_p) for _ in range(N)])
counts = t.bincount(samples, minlength=len(probs)) / N
expected = probs.clone()
expected[0:2] = 0
expected /= expected.sum()
# print("Checking empirical frequencies (try to increase N if this test fails): ", counts)
#t.testing.assert_close(counts, expected, atol=0.01, rtol=0.0)
# print("All tests passed!")
# %% [markdown]
# # Speak, Shakespeare!
# %%
input = "Death waits at the door"
# NOTE(review): the head of this call was truncated in the source; it is
# reconstructed to mirror the identical call inside speak().
print(shak.listToString(sample_tokens(myTransformer, shak.convertStringToTokenList, shak.decodeList,
                                      input, 80, freq_penalty=0.1, top_k = 10)))
# %% [markdown]
# ## Save the model for future use
# (This was over 20 minutes of GPU computation. Not too shabby!)
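# %%
# NOTE(review): the save call in this cell was truncated to `, "")`. It was
# presumably `t.save(myTransformer.state_dict(), "<checkpoint path>")` -
# restore it with a real path before relying on this cell.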
# %% [markdown]
# # Publish to Gradio
# About a month after making this I realized this should be online. I'll push this to gradio
# %%
import gradio as gr
def speak(input, tokenLength):
    '''
    Gradio callback: continue `input` up to `tokenLength` total tokens.

    Return: the generated text, or an explanatory message when the prompt
    contains a word that is not in the model's vocabulary.
    '''
    print("input: " + input)
    try:
        result = shak.listToString(sample_tokens(myTransformer,shak.convertStringToTokenList,shak.decodeList,
        input, tokenLength,freq_penalty=0.1, top_k = 10))
    except KeyError:
        # NOTE(review): the try/except was missing in the source, so the
        # message was returned unconditionally. Unknown words surface as a
        # KeyError from the vocabulary lookup.
        return "one or more of the words is not compatible with the model; please try a different phrase"
    print("worked! output:")
    return result
# NOTE(review): no `outputs` argument was visible in the source although
# gr.Interface needs one; "text" is assumed because speak() returns a string.
# No launch call is visible either - confirm how the app is started.
model = gr.Interface(fn=speak,
                     inputs=[gr.Textbox(label = "initial text", placeholder="To be or not to be"), gr.Slider(40, 80, step=1, value=80)],
                     outputs="text",
                     title = "speak shakespeare, speak!",
                     description = "a miniature shakespeare, built from scratch by Cody Rushing via a Decoder-Only Transformer trained on shakespeare's works.\n many, but not all, words are tokenizable - if you get an error, try again with different words!")
# %%