slahiri committed on
Commit
3e4f0c5
·
verified ·
1 Parent(s): 95e8771

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. README.md +51 -17
  2. app.py +76 -0
  3. calculator_llm.py +254 -0
  4. config.json +9 -0
  5. model.pt +3 -0
  6. requirements.txt +2 -0
  7. vocab.json +38 -0
README.md CHANGED
@@ -1,34 +1,68 @@
1
  ---
2
- title: Small Calculator Model
3
  emoji: 🧮
4
  colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 6.3.0
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
  ---
12
 
13
- # Calculator LLM
14
 
15
- A tiny LLM that converts English math phrases to answers. Built from scratch to learn how language models work.
16
 
17
- ## What it does
18
 
19
- Converts text like:
20
- ```
21
- "two plus three" -> "five"
22
- "seven minus four" -> "three"
23
- "six times eight" -> "forty eight"
24
- ```
25
 
26
- ## Try it
27
 
28
- Type a math phrase in English (e.g., "seven times eight") and watch the model respond.
29
 
30
- ## Learn More
 
 
 
 
 
31
 
32
- This model is part of an educational tutorial: [Build Your First LLM](https://sid.sh/learn/build-your-first-llm)
33
 
34
- Source code: [GitHub](https://github.com/slahiri/small_calculator_model)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Calculator LLM
3
  emoji: 🧮
4
  colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: "5.9.1"
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
+ # 🧮 Calculator LLM
14
 
15
+ A tiny transformer model (~105K parameters) that solves English math problems.
16
 
17
+ ## Try It
18
 
19
+ Enter a math problem in English like:
20
+ - "two plus three"
21
+ - "seven times eight"
22
+ - "twenty minus five"
 
 
23
 
24
+ The model will output the answer in English!
25
 
26
+ ## Examples
27
 
28
+ | Input | Output |
29
+ |-------|--------|
30
+ | two plus three | five |
31
+ | seven times eight | fifty six |
32
+ | twenty minus five | fifteen |
33
+ | nine times nine | eighty one |
34
 
35
+ ## Built From Scratch
36
 
37
+ This model was built following the tutorial at [sid.sh/learn/build-your-first-llm](https://sid.sh/learn/build-your-first-llm)
38
+
39
+ Same architecture as GPT (attention, feed-forward, transformer blocks), just much smaller!
40
+
41
+ ## Model Details
42
+
43
+ | Property | Value |
44
+ |----------|-------|
45
+ | Parameters | ~105K |
46
+ | Layers | 2 transformer blocks |
47
+ | Embedding | 64 dimensions |
48
+ | Attention Heads | 4 |
49
+ | Vocabulary | 36 tokens |
50
+ | Operations | plus, minus, times |
51
+ | Number Range | 0-99 |
52
+
53
+ ## Architecture
54
+
55
+ This is a decoder-only transformer with:
56
+ - Token embeddings + sinusoidal positional encoding
57
+ - 2 transformer blocks (multi-head attention + feed-forward)
58
+ - Causal masking for autoregressive generation
59
+ - Layer normalization and residual connections
60
+
61
+ ## Training
62
+
63
+ Trained on 5,000 randomly generated math problems for 20 epochs.
64
+
65
+ ## Links
66
+
67
+ - 📚 [Tutorial: Build Your First LLM](https://sid.sh/learn/build-your-first-llm)
68
+ - 💻 [Full Notebook on GitHub](https://github.com/slahiri/blog/blob/main/public/notebooks/full_calculator_llm.ipynb)
app.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Calculator LLM Demo - A tiny transformer that solves English math problems.
3
+ Built from scratch following: https://sid.sh/learn/build-your-first-llm
4
+ """
5
+
6
+ import gradio as gr
7
+ from calculator_llm import load_model, generate
8
+
9
+ # Load the trained model
10
+ print("Loading model...")
11
+ model, tokenizer, vocab = load_model(".")
12
+ print("Model loaded!")
13
+
14
+
15
def solve_math(problem: str) -> str:
    """Solve an English math problem and return only the answer words.

    Args:
        problem: Free-form user text such as ``"two plus three"``. The word
            ``"equals"`` is appended when the normalized text does not
            already end with it.

    Returns:
        The words produced after the prompt, or ``""`` for empty input.
    """
    if not problem or not problem.strip():
        return ""

    # Normalize up front so symbol input ("two + three =") and irregular
    # spacing are judged in the same form the tokenizer will encode.
    problem = tokenizer.normalize(problem)
    if not problem.endswith("equals"):
        problem = problem + " equals"

    # Generate the answer (the returned text starts with the echoed prompt).
    result = generate(model, tokenizer, vocab, problem)

    # Rebuild the echo exactly as generate() decodes it (special tokens such
    # as [UNK] are dropped by decode), then strip it as a prefix. A plain
    # str.replace on the raw input misses whenever the echo differs from it.
    prompt_ids = tokenizer.encode(problem, add_special_tokens=False)
    echoed_prompt = tokenizer.decode(prompt_ids)
    if result.startswith(echoed_prompt):
        return result[len(echoed_prompt):].strip()
    return result.strip()
32
+
33
+
34
+ # Example problems
35
+ examples = [
36
+ ["two plus three"],
37
+ ["seven times eight"],
38
+ ["fifteen minus six"],
39
+ ["twenty plus thirty"],
40
+ ["nine times nine"],
41
+ ["fifty minus twenty five"],
42
+ ["twelve plus seven"],
43
+ ["eighty one minus forty"],
44
+ ]
45
+
46
+ # Create the Gradio interface
47
+ demo = gr.Interface(
48
+ fn=solve_math,
49
+ inputs=gr.Textbox(
50
+ label="Math Problem (in English)",
51
+ placeholder="e.g., two plus three",
52
+ info="Enter numbers 0-99 with operations: plus, minus, times",
53
+ ),
54
+ outputs=gr.Textbox(label="Answer"),
55
+ title="🧮 Calculator LLM",
56
+ description="""
57
+ A tiny transformer model (~105K parameters) that solves English math problems.
58
+
59
+ **Built from scratch** following the tutorial at [sid.sh/learn/build-your-first-llm](https://sid.sh/learn/build-your-first-llm)
60
+
61
+ Same architecture as GPT, just much smaller!
62
+
63
+ | Property | Value |
64
+ |----------|-------|
65
+ | Parameters | ~105K |
66
+ | Layers | 2 |
67
+ | Attention Heads | 4 |
68
+ | Embedding Dim | 64 |
69
+ """,
70
+ examples=examples,
71
+ theme=gr.themes.Soft(),
72
+ cache_examples=True,
73
+ )
74
+
75
+ if __name__ == "__main__":
76
+ demo.launch()
calculator_llm.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Calculator LLM - A tiny transformer for solving English math problems.
3
+ Built from scratch following: https://sid.sh/learn/build-your-first-llm
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import math
10
+ import json
11
+
12
+
13
class PositionalEncoding(nn.Module):
    """Adds positional information to embeddings using sine/cosine waves.

    The table is computed once at construction and registered as the buffer
    ``pe`` with shape ``(1, max_seq_len, embed_dim)``.

    Args:
        embed_dim: Size of the last dimension of the embeddings.
        max_seq_len: Number of positions in the precomputed table.
        dropout: Dropout probability applied after the addition.
    """

    def __init__(self, embed_dim, max_seq_len=512, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim)
        )
        angles = position * div_term

        pe = torch.zeros(max_seq_len, embed_dim)
        pe[:, 0::2] = torch.sin(angles)
        # An odd embed_dim has one fewer odd column than even column, so drop
        # the surplus frequency instead of failing on a shape mismatch. For an
        # even embed_dim the slice is the whole tensor.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])
        self.register_buffer("pe", pe.unsqueeze(0))

    def forward(self, x):
        """Add the first ``x.size(1)`` table rows to ``x`` and apply dropout."""
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)
32
+
33
+
34
class TokenEmbedding(nn.Module):
    """Looks up token vectors, scales them, and adds positional encoding."""

    def __init__(self, vocab_size, embed_dim, max_seq_len, dropout=0.1):
        super().__init__()
        self.embed_dim = embed_dim
        # Embeddings are multiplied by sqrt(embed_dim) before positions are added.
        self.scale = math.sqrt(embed_dim)
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoding = PositionalEncoding(embed_dim, max_seq_len, dropout)

    def forward(self, x):
        """Return the scaled, position-encoded embeddings for token IDs ``x``."""
        scaled = self.token_embedding(x) * self.scale
        return self.pos_encoding(scaled)
48
+
49
+
50
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention mechanism.

    Args:
        embed_dim: Size of the input and output feature dimension.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        dropout: Dropout probability applied to the attention weights.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        # Fail at construction; otherwise the mismatch only surfaces later as
        # an opaque shape error inside ``view`` during the forward pass.
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.scale = math.sqrt(self.head_dim)

    def _split_heads(self, projected):
        """Reshape (batch, seq, embed_dim) to (batch, heads, seq, head_dim)."""
        batch_size, seq_len, _ = projected.shape
        return projected.view(
            batch_size, seq_len, self.num_heads, self.head_dim
        ).transpose(1, 2)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch, seq, embed_dim).
            mask: Optional tensor broadcastable to the score matrix; positions
                where ``mask == 0`` are excluded from attention.

        Returns:
            A tuple ``(output, attn_weights)``: the projected attention output
            of shape (batch, seq, embed_dim) and the post-dropout attention
            weights of shape (batch, heads, seq, seq).
        """
        batch_size, seq_len, _ = x.shape
        Q = self._split_heads(self.q_proj(x))
        K = self._split_heads(self.k_proj(x))
        V = self._split_heads(self.v_proj(x))

        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
        if mask is not None:
            # -inf scores receive exactly zero weight after the softmax.
            scores = scores.masked_fill(mask == 0, float("-inf"))
        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        attn_output = torch.matmul(attn_weights, V)
        # Merge the heads back into a single embed_dim-sized feature axis.
        attn_output = (
            attn_output.transpose(1, 2)
            .contiguous()
            .view(batch_size, seq_len, self.embed_dim)
        )
        return self.out_proj(attn_output), attn_weights
94
+
95
+
96
class FeedForward(nn.Module):
    """Position-wise feed-forward network: linear, ReLU, dropout, linear."""

    def __init__(self, embed_dim, ff_dim, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(embed_dim, ff_dim)
        self.linear2 = nn.Linear(ff_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to ``ff_dim``, apply ReLU and dropout, project back."""
        hidden = F.relu(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)
107
+
108
+
109
class TransformerBlock(nn.Module):
    """One transformer decoder block: self-attention then feed-forward.

    Each sub-layer output goes through dropout, is added back to its input,
    and the sum is layer-normalized.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(embed_dim, num_heads, dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.feed_forward = FeedForward(embed_dim, ff_dim, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Return ``(x, attn_weights)`` after both sub-layers."""
        attended, attn_weights = self.attention(x, mask)
        x = self.norm1(x + self.dropout(attended))
        x = self.norm2(x + self.dropout(self.feed_forward(x)))
        return x, attn_weights
126
+
127
+
128
def create_causal_mask(seq_len):
    """Build a (1, 1, seq_len, seq_len) lower-triangular mask of ones.

    Entry ``[..., i, j]`` is 1 when ``j <= i`` and 0 otherwise, so a position
    cannot attend to later positions.
    """
    lower_triangle = torch.ones(seq_len, seq_len).tril()
    # Two leading singleton axes, added here by indexing with None.
    return lower_triangle[None, None, :, :]
132
+
133
+
134
class CalculatorLLM(nn.Module):
    """A tiny transformer LLM for solving English math problems.

    Token IDs are embedded, passed through ``num_layers`` transformer blocks,
    layer-normalized, and projected to one logit per vocabulary entry.
    """

    def __init__(
        self, vocab_size, embed_dim, num_heads, num_layers, ff_dim, max_seq_len, dropout=0.1
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len
        self.embedding = TokenEmbedding(vocab_size, embed_dim, max_seq_len, dropout)

        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerBlock(embed_dim, num_heads, ff_dim, dropout))
        self.layers = nn.ModuleList(blocks)

        self.norm = nn.LayerNorm(embed_dim)
        self.output_proj = nn.Linear(embed_dim, vocab_size)

    def forward(self, x, mask=None):
        """Return logits for token IDs ``x``.

        When ``mask`` is omitted, a causal mask for the current sequence
        length is created on ``x``'s device.
        """
        if mask is None:
            mask = create_causal_mask(x.size(1)).to(x.device)

        hidden = self.embedding(x)
        for block in self.layers:
            # The per-layer attention weights are not used here.
            hidden, _ = block(hidden, mask)
        return self.output_proj(self.norm(hidden))
162
+
163
+
164
class Tokenizer:
    """Converts text to token IDs and back.

    Args:
        vocab: Mapping from word to integer ID. ``encode`` reads the
            ``"[START]"``, ``"[END]"`` and ``"[UNK]"`` entries.
    """

    _SPECIAL_TOKENS = frozenset({"[PAD]", "[START]", "[END]", "[UNK]"})
    _TENS = (
        "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"
    )
    _ONES = ("one", "two", "three", "four", "five", "six", "seven", "eight", "nine")

    def __init__(self, vocab):
        self.vocab = vocab
        self.id_to_word = {v: k for k, v in vocab.items()}

    def normalize(self, text):
        """Lowercase ``text``, spell out operator symbols, and tidy spacing.

        ``+ - * x =`` become ``plus minus times times equals``. Compound
        numbers written as one word or with a hyphen ("twentyfive",
        "twenty-five") are split into two words.
        """
        import re  # local import: only this method needs regular expressions

        tens = "|".join(self._TENS)
        ones = "|".join(self._ONES)

        text = text.lower().strip()
        # Handle hyphenated compounds before "-" is read as subtraction, so
        # "twenty-five" is the number 25 rather than "twenty minus five".
        text = re.sub(rf"\b({tens})-({ones})\b", r"\1 \2", text)
        text = text.replace("+", " plus ").replace("-", " minus ")
        text = text.replace("*", " times ").replace("=", " equals ")
        # Treat "x" as multiplication only when it is not inside a word; a
        # blanket replace would corrupt "six", "sixteen" and "sixty".
        text = re.sub(r"(?<![a-z])x(?![a-z])", " times ", text)
        text = re.sub(rf"({tens})({ones})", r"\1 \2", text)
        return " ".join(text.split())

    def encode(self, text, add_special_tokens=True):
        """Return token IDs for ``text``; unknown words map to ``[UNK]``.

        With ``add_special_tokens`` the IDs are wrapped in ``[START]`` and
        ``[END]``.
        """
        unk_id = self.vocab["[UNK]"]
        ids = [self.vocab["[START]"]] if add_special_tokens else []
        ids.extend(self.vocab.get(word, unk_id) for word in self.normalize(text).split())
        if add_special_tokens:
            ids.append(self.vocab["[END]"])
        return ids

    def decode(self, ids, skip_special_tokens=True):
        """Return the space-joined words for ``ids``.

        IDs missing from the vocabulary decode to ``[UNK]``. With
        ``skip_special_tokens`` the four special tokens are omitted.
        """
        words = []
        for token_id in ids:
            word = self.id_to_word.get(token_id, "[UNK]")
            if skip_special_tokens and word in self._SPECIAL_TOKENS:
                continue
            words.append(word)
        return " ".join(words)
203
+
204
+
205
def load_model(model_dir, device="cpu"):
    """Load a saved Calculator LLM model.

    Reads ``config.json``, ``vocab.json`` and ``model.pt`` from ``model_dir``.

    Args:
        model_dir: Directory containing the three files.
        device: Device the weights are mapped to and the model is moved to.

    Returns:
        A tuple ``(model, tokenizer, vocab)`` with the model in eval mode.
    """
    import os  # local import: only needed for path joining here

    # Decode the JSON files as UTF-8 explicitly instead of relying on the
    # platform default encoding.
    with open(os.path.join(model_dir, "config.json"), encoding="utf-8") as f:
        config = json.load(f)
    with open(os.path.join(model_dir, "vocab.json"), encoding="utf-8") as f:
        vocab = json.load(f)

    model = CalculatorLLM(
        vocab_size=config["vocab_size"],
        embed_dim=config["embed_dim"],
        num_heads=config["num_heads"],
        num_layers=config["num_layers"],
        ff_dim=config["ff_dim"],
        max_seq_len=config["max_seq_len"],
        dropout=config["dropout"],
    )
    # weights_only=True restricts unpickling to tensors and plain containers.
    state_dict = torch.load(
        os.path.join(model_dir, "model.pt"), map_location=device, weights_only=True
    )
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    tokenizer = Tokenizer(vocab)
    return model, tokenizer, vocab
229
+
230
+
231
def generate(model, tokenizer, vocab, prompt, device="cpu", max_new_tokens=10):
    """Greedily generate text from a prompt.

    Args:
        model: Callable module returning logits of shape (batch, seq, vocab).
            If it has a ``max_seq_len`` attribute, it is never called on a
            longer sequence.
        tokenizer: Object providing ``encode`` and ``decode``.
        vocab: Mapping that contains the ``"[END]"`` token ID.
        prompt: Text to continue.
        device: Device on which the token tensors are created.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded prompt followed by the generated tokens.
    """
    model.eval()
    end_id = vocab["[END]"]
    max_len = getattr(model, "max_seq_len", None)

    # Drop the trailing [END] so the model continues the sequence.
    tokens = tokenizer.encode(prompt, add_special_tokens=True)[:-1]
    input_ids = torch.tensor([tokens], dtype=torch.long, device=device)

    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Stop before feeding more positions than the model supports;
            # a longer input would not match its positional-encoding table.
            if max_len is not None and input_ids.size(1) > max_len:
                break
            logits = model(input_ids)
            next_token = logits[0, -1, :].argmax().item()
            if next_token == end_id:
                break
            next_ids = torch.tensor([[next_token]], dtype=torch.long, device=device)
            input_ids = torch.cat([input_ids, next_ids], dim=1)

    return tokenizer.decode(input_ids[0].tolist())
248
+
249
+
250
+ if __name__ == "__main__":
251
+ # Example usage
252
+ model, tokenizer, vocab = load_model(".")
253
+ result = generate(model, tokenizer, vocab, "two plus three equals")
254
+ print(f"Result: {result}")
config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 36,
3
+ "embed_dim": 64,
4
+ "num_heads": 4,
5
+ "num_layers": 2,
6
+ "ff_dim": 256,
7
+ "max_seq_len": 16,
8
+ "dropout": 0.1
9
+ }
model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c729d0dee5708cb1ee6f63509a3bea2556e0dafdc63e7b80b421932394e226f
3
+ size 434201
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ torch
2
+ gradio
vocab.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "[PAD]": 0,
3
+ "[START]": 1,
4
+ "[END]": 2,
5
+ "[UNK]": 3,
6
+ "zero": 4,
7
+ "one": 5,
8
+ "two": 6,
9
+ "three": 7,
10
+ "four": 8,
11
+ "five": 9,
12
+ "six": 10,
13
+ "seven": 11,
14
+ "eight": 12,
15
+ "nine": 13,
16
+ "ten": 14,
17
+ "eleven": 15,
18
+ "twelve": 16,
19
+ "thirteen": 17,
20
+ "fourteen": 18,
21
+ "fifteen": 19,
22
+ "sixteen": 20,
23
+ "seventeen": 21,
24
+ "eighteen": 22,
25
+ "nineteen": 23,
26
+ "twenty": 24,
27
+ "thirty": 25,
28
+ "forty": 26,
29
+ "fifty": 27,
30
+ "sixty": 28,
31
+ "seventy": 29,
32
+ "eighty": 30,
33
+ "ninety": 31,
34
+ "plus": 32,
35
+ "minus": 33,
36
+ "times": 34,
37
+ "equals": 35
38
+ }