Ashish Reddy committed on
Commit
d00fb47
·
1 Parent(s): 2ea3f3e

Add application file

Files changed (6)
  1. .DS_Store +0 -0
  2. deploy.py +33 -0
  3. model.py +64 -0
  4. nanogpt_model.pth +3 -0
  5. requirements.txt +2 -0
  6. train.py +152 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
deploy.py ADDED
@@ -0,0 +1,33 @@
+ import torch
+ import gradio as gr
+
+ from model import Model
+ from train import encoder, decoder
+
+ # Device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Load the trained weights and switch to inference mode
+ model = Model().to(device)
+ model.load_state_dict(torch.load("nanogpt_model.pth", map_location=device))
+ model.eval()
+
+ # Generation function: encode the prompt, sample new tokens, decode back to text
+ def generate_text(prompt, max_tokens):
+     idx = torch.tensor(encoder(prompt), dtype=torch.long, device=device).unsqueeze(0)
+     generated = model.generate(idx, max_new_tokens=int(max_tokens))[0].tolist()
+     return decoder(generated)
+
+ # Gradio interface
+ iface = gr.Interface(
+     fn=generate_text,
+     inputs=[
+         gr.Textbox(lines=2, placeholder="Enter a prompt...", label="Prompt"),
+         gr.Slider(10, 500, value=200, step=10, label="Max Tokens")
+     ],
+     outputs=gr.Textbox(label="Generated Output"),
+     title="🧠 NanoGPT from Scratch",
+     description="A tiny GPT model trained on Shakespeare. Try your luck by giving it a prompt!"
+ )
+
+ iface.launch(share=True)
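
For a quick sanity check of the same generation path without launching the Gradio UI, something along these lines should work from the repository root (a minimal sketch, assuming nanogpt_model.pth and input.txt are present; "ROMEO:" is just an example prompt). generate_text itself is not imported here because importing deploy would also execute iface.launch(share=True) at module level:

import torch
from model import Model
from train import encoder, decoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Model().to(device)
model.load_state_dict(torch.load("nanogpt_model.pth", map_location=device))
model.eval()

# Encode a prompt, sample 100 new characters, decode back to text
idx = torch.tensor(encoder("ROMEO:"), dtype=torch.long, device=device).unsqueeze(0)
print(decoder(model.generate(idx, max_new_tokens=100)[0].tolist()))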
model.py ADDED
@@ -0,0 +1,64 @@
+ import torch, torch.nn as nn, torch.nn.functional as F
+
+ batch_size = 64
+ max_len = 256       # context window length
+ d_model = 384
+ n_layer = 6         # 6 blocks in the decoder
+ n_head = 6
+ d_q = d_model // n_head
+ dropout = 0.2
+ vocab_size = 65
+
+ from block import Block
+
+ class Model(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.token_embedding_table = nn.Embedding(vocab_size, d_model)       # Embedding matrix size: (65, 384)
+         self.positional_embedding_table = nn.Embedding(max_len, d_model)     # Position matrix size: (256, 384)
+         self.blocks = nn.Sequential(*[Block(d_model, n_head) for _ in range(n_layer)])
+         self.ln = nn.LayerNorm(d_model)
+         self.unembedding_matrix_calc = nn.Linear(d_model, vocab_size)
+
+     def forward(self, idx, targets=None):
+         B, S = idx.shape
+
+         tok_emb = self.token_embedding_table(idx)                                      # Embedding shape: (B, S, 384)
+         pos_emb = self.positional_embedding_table(torch.arange(S, device=idx.device))  # Shape: (S, 384)
+         x = tok_emb + pos_emb
+
+         x = self.blocks(x)   # Pass through all 6 transformer blocks, each with 6 attention heads
+         x = self.ln(x)
+
+         logits = self.unembedding_matrix_calc(x)   # (B, S, 384) @ (384, 65) --> (B, S, 65)
+
+         if targets is None:
+             loss = None
+         else:
+             B, S, V = logits.shape
+             logits = logits.view(-1, V)   # (B, S, V) --> (B*S, V)
+             targets = targets.view(-1)    # (B, S) --> (B*S)
+             loss = F.cross_entropy(logits, targets)  # Applies softmax internally; working in log space (summing log-probs) is more numerically stable than multiplying probabilities
+
+         return logits, loss
+
+     def generate(self, idx, max_new_tokens):
+         for _ in range(max_new_tokens):
+             idx_cond = idx[:, -max_len:]        # Crop the context to the last max_len tokens
+             logits, loss = self(idx_cond)
+             logits = logits[:, -1, :]           # Keep only the logits for the last position
+             probs = F.softmax(logits, dim=-1)
+
+             idx_next = torch.multinomial(probs, num_samples=1)   # Sample the next token
+             idx = torch.cat((idx, idx_next), dim=1)
+         return idx
+
+
+ if __name__ == "__main__":
+     model = Model()
+     idx = torch.zeros((batch_size, max_len), dtype=torch.long)
+     logits, loss = model(idx, idx)
+
+     print("Input shape:", idx.shape)
+     print("Output logits shape:", logits.shape)
+     print("Calculated loss:", loss.item())
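
model.py imports Block from block.py, which is not part of this commit. Purely as a reference point, here is a minimal sketch of a compatible pre-norm decoder block, assuming multi-head causal self-attention followed by a feed-forward MLP (the hyperparameters mirror model.py; the actual block.py used to train nanogpt_model.pth may differ, and the saved state dict will only load if the real class and parameter names match):

import torch, torch.nn as nn, torch.nn.functional as F

max_len = 256   # context window, matches model.py
dropout = 0.2

class Head(nn.Module):
    """One head of causal self-attention."""
    def __init__(self, d_model, d_head):
        super().__init__()
        self.key = nn.Linear(d_model, d_head, bias=False)
        self.query = nn.Linear(d_model, d_head, bias=False)
        self.value = nn.Linear(d_model, d_head, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(max_len, max_len)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, S, _ = x.shape
        k, q, v = self.key(x), self.query(x), self.value(x)            # (B, S, d_head) each
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5            # (B, S, S) scaled attention scores
        wei = wei.masked_fill(self.tril[:S, :S] == 0, float("-inf"))   # causal mask: no attending to future positions
        wei = self.dropout(F.softmax(wei, dim=-1))
        return wei @ v                                                 # (B, S, d_head)

class Block(nn.Module):
    """Pre-norm transformer block: multi-head self-attention + MLP, each with a residual connection."""
    def __init__(self, d_model, n_head):
        super().__init__()
        d_head = d_model // n_head
        self.heads = nn.ModuleList([Head(d_model, d_head) for _ in range(n_head)])
        self.proj = nn.Linear(d_model, d_model)
        self.ffwd = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.ReLU(),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout),
        )
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        attn = torch.cat([h(self.ln1(x)) for h in self.heads], dim=-1)   # concat heads back to (B, S, d_model)
        x = x + self.dropout(self.proj(attn))                            # residual around attention
        x = x + self.ffwd(self.ln2(x))                                   # residual around the MLP
        return x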
nanogpt_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:04b000c9b4136c6badf5fd7c6bab668f7fec6b7ffc1838c6d85b9d4ef6a15fce
+ size 52673259
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ torch
+ gradio
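
The Gradio app itself only needs torch and gradio. Running train.py as a script additionally requires wandb and the input.txt corpus; a typical local setup (the extra install is only needed for training) would be:

pip install -r requirements.txt
pip install wandb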
train.py ADDED
@@ -0,0 +1,152 @@
+ import torch, torch.nn as nn, torch.optim as optim, torch.nn.functional as F, time
+
+ batch_size = 64
+ max_len = 256
+ d_model = 384
+ n_layer = 6
+ n_head = 6
+ d_q = d_model // n_head
+ dropout = 0.2
+ vocab_size = 65
+
+ max_iters = 5000
+ eval_interval = 500
+ learning_rate = 3e-4
+ eval_iters = 200
+
+ """
+ ---- Device ----
+ """
+
+ if torch.cuda.is_available():
+     device = torch.device('cuda')
+     print("Using CUDA (GPU)")
+ elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
+     device = torch.device('mps')
+     print("Using MPS (Apple Silicon GPU)")
+ else:
+     device = torch.device('cpu')
+     print("Using CPU")
+
+ """
+ --- Data & Character-Level Tokenizer ---
+ """
+
+ with open('input.txt', 'r', encoding='utf-8') as f:
+     text = f.read()
+
+ chars = sorted(list(set(text)))   # All unique characters in the text
+ vocab_size = len(chars)           # 65 distinct characters in tiny Shakespeare
+
+ stoi = {}
+ itos = {}
+
+ for i, ch in enumerate(chars):
+     stoi[ch] = i    # string --> int
+     itos[i] = ch    # int --> string
+
+ # Take a string and return the list of its characters' indices
+ def encoder(s):
+     res = []
+     for char in s:
+         res.append(stoi[char])
+     return res
+
+ # Take a list of indices and return the corresponding string
+ def decoder(l):
+     res = ""
+     for i in l:
+         res += itos[i]
+     return res
+
+ data = torch.tensor(encoder(text), dtype=torch.long)   # One id per character, so same length as the text
+
+ n = int(0.9 * len(data))
+ train_data = data[:n]   # 90% of the text
+ val_data = data[n:]     # 10% of the text
+
+ def get_batch(split):
+     if split.lower() == 'train':
+         data = train_data
+     else:
+         data = val_data
+
+     ix = torch.randint(len(data) - max_len, (batch_size,))   # batch_size=64 random start offsets in [0, len(data)-max_len)
+
+     x = torch.stack([data[i:i+max_len] for i in ix])       # max_len=256 ids per offset, stacked into shape (64, 256)
+     y = torch.stack([data[i+1:i+max_len+1] for i in ix])   # Targets are shifted by one so cross-entropy can compare each predicted next token with the real one
+
+     return x.to(device), y.to(device)
+
+ """
+ --- Model Training ---
+ """
+
+ if __name__ == "__main__":
+
+     from model import Model
+     # wandb is imported and initialised here so deploy.py can import encoder/decoder without starting a W&B run
+     import wandb
+     wandb.init(
+         project="nano-model-shakespeare-training",
+         config={
+             "learning_rate": learning_rate,
+             "architecture": "decoder-only-model",
+             "dataset": "tinyshakespeare",
+             "d_model": d_model,
+             "n_layer": n_layer,
+             "n_head": n_head,
+             "max_iters": max_iters,
+             "dropout": dropout
+         }
+     )
+
+     model = Model()
+     m = model.to(device)
+
+     optimizer = optim.AdamW(
+         model.parameters(),
+         lr=learning_rate
+     )
+
+     @torch.no_grad()
+     def estimate_loss():
+         out = {}
+         model.eval()
+         for split in ['train', 'val']:
+             losses = torch.zeros(eval_iters)
+             for k in range(eval_iters):
+                 X, Y = get_batch(split)
+                 logits, loss = model(X, Y)
+                 losses[k] = loss.item()
+             out[split] = losses.mean()
+         model.train()
+         return out
+
+     for iter in range(max_iters):
+         if iter % eval_interval == 0 or iter == max_iters - 1:
+             losses = estimate_loss()
+             print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
+
+             wandb.log({
+                 "iter": iter,
+                 "train/loss": losses['train'],
+                 "val/loss": losses['val'],
+                 "lr": learning_rate
+             })
+         iter_start = time.time()
+         xb, yb = get_batch("train")
+         logits, loss = model(xb, yb)
+         optimizer.zero_grad(set_to_none=True)   # Reset gradients so this batch starts from zero
+         loss.backward()                         # Backward pass: compute gradients of the loss w.r.t. every parameter
+         optimizer.step()                        # Nudge the weights along those gradients, scaled by the learning rate
+
+         iter_time = time.time() - iter_start
+         print(f"Iteration {iter} completed in {iter_time:.2f} seconds")
+         wandb.log({"iter_time": iter_time})
+
+     wandb.finish()
+
+     print("Training finished. Saving model state...")
+     torch.save(model.state_dict(), 'nanogpt_model.pth')
+     print("Model saved to nanogpt_model.pth")
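
As a quick check of the data pipeline, the character-level tokenizer and batching can be exercised on their own (a minimal sketch; it assumes input.txt is present, since importing train reads it at module level, and "ROMEO:" is just an example string from the corpus):

from train import encoder, decoder, get_batch

ids = encoder("ROMEO:")            # one integer index per character
assert decoder(ids) == "ROMEO:"    # decoding inverts encoding

xb, yb = get_batch("train")
print(xb.shape, yb.shape)          # torch.Size([64, 256]) for both; each yb row is its xb row shifted one character to the right

Characters that never occur in input.txt are not in stoi, so encoder raises a KeyError for them; prompts typed into the Gradio app are subject to the same constraint.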