MartialTerran
commited on
Commit
•
b6585a5
1
Parent(s):
96a54b7
Update Gettysburg_GPT2_v1.4final.py
Browse files- Gettysburg_GPT2_v1.4final.py +2 -463
Gettysburg_GPT2_v1.4final.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
# This script trains until the loss drops below 0.001 at around epoch 101; after epoch 110 the loss rises again, and from about epoch 150 it falls again. The next version will report the particular words that are causing the error/loss.
|
2 |
# # The tokenize method now uses the last special token in the self.special_tokens list (which is assumed to be the padding token <pad> in this case) as the default token for unknown words.
|
3 |
#text separate_punctuation focuses solely on separating the defined punctuation marks from words.
|
@@ -6,466 +8,3 @@
|
|
6 |
# The detokenizer does not yet auto-remove spaces preceding punctuations. This is because tokens are defined without leading spaces, and spaces are autoappended to all tokens in detokenizer.
|
7 |
|
8 |
# It's possible to increase training_input_seq_len over epochs. However, directly modifying training_input_seq_len inside the Dataset class after it has been created isn't ideal. A better approach is to control the sequence length during batch creation within the DataLoader, for example with a custom collate_fn.
|
9 |
-
|
10 |
-
|
11 |
-
print("loading libraries")
|
12 |
-
import os # to get filename of this script
|
13 |
-
import datetime
|
14 |
-
import torch
|
15 |
-
import torch.nn as nn
|
16 |
-
import torch.optim as optim
|
17 |
-
from torch.utils.data import Dataset, DataLoader
|
18 |
-
import torch.optim as optim
|
19 |
-
from torch.optim.lr_scheduler import ReduceLROnPlateau # Import the learning rate scheduler
|
20 |
-
import math
|
21 |
-
import inspect
|
22 |
-
#import string # replaced with self.punctuation_list = ['.', ',', '/', '\\', '[', ']', '<', '?', '>', '-']] # Specific list of punctuations
|
23 |
-
print("done loading libraries")
|
24 |
-
|
25 |
-
print("Hardcoding Memorized_Speech = Gettysburg Address") #(for simplicity in this toy example)
# The full training corpus for this toy model. The text is embedded verbatim
# (including its leading/trailing newlines and blank lines between paragraphs)
# because the Tokenizer derives its whole vocabulary from this string.
Memorized_Speech = """
Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, and dedicated to the proposition that all men are created equal.

Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battle-field of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this.

But, in a larger sense, we can not dedicate - we can not consecrate - we can not hallow-this ground. The brave men, living and dead, who struggled here, have consecrated it, far above our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have thus far so nobly advanced. It is rather for us to be here dedicated to the great task remaining before us - that from these honored dead we take increased devotion to that cause for which they gave the last full measure of devotion - that we here highly resolve that these dead shall not have died in vain - that this nation, under God, shall have a new birth of freedom - and that government of the people, by the people, for the people, shall not perish from the earth.
"""
# Echo the corpus size (in characters) and the corpus itself at import time.
print(f'Length of Memorized_Speech = {len(Memorized_Speech)} characters, as follows:')
print(Memorized_Speech)
|
35 |
-
|
36 |
-
|
37 |
-
# Add special tokens here. "<pad>" is also used for unknown words. The carriage-return special token is auto-inserted into the received text before tokenization, but tabs and newlines are not implemented/supported.
|
38 |
-
# Hyperparameters
# The embedding width is named once so that values derived from it (n_inner)
# cannot silently go stale when it is changed.
_N_EMBD = 512
hyperparameters = {
    "vocab_size": 152,  # Estimated vocabulary size for Gettysburg Address + special tokens
    "special_tokens": ["<FreetheLLM>", "<cr>", "<pad>"],
    "n_embd": _N_EMBD,  # Embedding dimension
    "n_layer": 4,  # Number of layers
    "n_head": 16,  # Number of attention heads
    "n_inner": 4 * _N_EMBD,  # Inner dimension of feedforward network (4 times n_embd)
    "max_sequence_len": 264,  # Maximum sequence length
    "epochs": 200,  # Number of training epochs
    "learning_rate": 1e-3,  # [Initial] Learning rate
    "batch_size": 1,  # Batch size (since the dataset is small)
    "dropout": 0.2,  # Dropout probability
}

# More Script/Training parameters:
min_training_input_seq_len = 32  # Length (in tokens) of each training window
Early_stopping_loss = 0.003  # Training stops once the epoch-average loss falls below this

# Threshold above which a single token's loss counts as a "significant error"
# in the (currently disabled) per-token error report of the Trainer.  The
# report uses torch.nn.CrossEntropyLoss(reduction='none') to get one loss value
# per token, reshapes it to the shape of target_seq, and lists every token
# whose loss exceeds this threshold.  Adjust as needed (e.g. 0.5).
Per_token_loss_threshold = 0.5
|
58 |
-
|
59 |
-
def print_with_line(message):
    """Print *message* followed by the source line number of the caller.

    Args:
        message: Text to print; it is followed by " at script line <N>".

    The line number is read from the caller's stack frame.
    ``inspect.currentframe()`` may return ``None`` on Python implementations
    without stack-frame support, in which case "unknown" is printed instead of
    a number rather than raising ``AttributeError``.
    """
    current_frame = inspect.currentframe()  # needs import inspect
    caller_frame = current_frame.f_back if current_frame is not None else None
    line_number = caller_frame.f_lineno if caller_frame is not None else "unknown"
    print(f"{message} at script line {line_number}")
|
63 |
-
|
64 |
-
# --- Tokenizer and Detokenizer ---
|
65 |
-
class Tokenizer:
    """Word-level tokenizer whose vocabulary is derived from one training text.

    The vocabulary is laid out as: the special tokens (in the given order),
    then the punctuation marks found in the text (sorted), then the lowercased
    words found in the text (sorted).  Words that are not in the vocabulary are
    mapped to the last special token (assumed to be the padding token).

    Special tokens occurring in input text are matched verbatim
    (case-sensitively) before punctuation separation and lowercasing; otherwise
    '<' and '>' (both in the punctuation list) would split a token such as
    "<FreetheLLM>" into pieces that are not in the vocabulary.
    """

    def __init__(self, text, special_tokens, vocab_size_hyperparameter):
        """Build the vocabulary from *text*.

        Args:
            text: Training text the vocabulary is derived from.
            special_tokens: Sequence of special token strings.  Index 1 is used
                as the carriage-return token and the last entry as the
                unknown-word/padding token.
            vocab_size_hyperparameter: Estimated vocabulary size; only used to
                print a warning when the calculated size differs.
        """
        self.special_tokens = special_tokens
        self.cr_token = special_tokens[1]
        # Specific list of punctuation marks that become stand-alone tokens.
        self.punctuation_list = ['.', ',', '/', '\\', '[', ']', '<', '?', '>', '-']
        estimated_vocab_size = vocab_size_hyperparameter

        punctuation = set(self.punctuation_list)
        in_text_words = set()
        in_text_punctuations = set()
        for segment, is_special in self._split_on_special_tokens(text):
            if is_special:
                continue  # special tokens are already at the front of the vocabulary
            for candidate in self.separate_punctuation(segment).split():
                cleaned_word = ''.join(c for c in candidate if c not in punctuation)
                if cleaned_word:
                    in_text_words.add(cleaned_word.lower())
                in_text_punctuations.update(c for c in candidate if c in punctuation)

        # Vocab starts with special tokens, then punctuation, then whole words.
        self.vocab = list(self.special_tokens) + sorted(in_text_punctuations) + sorted(in_text_words)
        self.vocab_size = len(self.vocab)  # Calculate vocabulary size dynamically
        # Alert if vocab_size is different from the predefined hyperparameter estimate.
        if self.vocab_size != estimated_vocab_size:
            print(f"Warning: Calculated vocab_size ({self.vocab_size}) differs from estimated size ({estimated_vocab_size}).")

        self.word_to_index = {word: i for i, word in enumerate(self.vocab)}
        self.index_to_word = {i: word for i, word in enumerate(self.vocab)}

    def _split_on_special_tokens(self, text):
        """Split *text* into ``(segment, is_special)`` pairs, in order.

        Carriage returns are first replaced by the carriage-return token.
        Segments flagged ``True`` are exact occurrences of a special token;
        the remaining segments are ordinary text.  Longer special tokens are
        matched first so a token that contains another one is kept whole.
        """
        text = text.replace('\r', f' {self.cr_token} ')
        segments = [(text, False)]
        for special in sorted((s for s in self.special_tokens if s), key=len, reverse=True):
            next_segments = []
            for segment, is_special in segments:
                if is_special:
                    next_segments.append((segment, True))
                    continue
                for position, part in enumerate(segment.split(special)):
                    if position:
                        next_segments.append((special, True))
                    if part:
                        next_segments.append((part, False))
            segments = next_segments
        return segments

    def separate_punctuation(self, text):
        """Return *text* with spaces added around each listed punctuation mark.

        Carriage returns ('\\r') are afterwards replaced by the space-padded
        carriage-return special token.  The replacement happens after the
        punctuation pass so the token's own '<' and '>' are not split.
        """
        for char in self.punctuation_list:
            text = text.replace(char, f' {char} ')
        text = text.replace('\r', f' {self.cr_token} ')
        return text

    def tokenize(self, text):
        """Convert *text* into a list of token ids.

        Special tokens are matched verbatim; everything else is
        punctuation-separated, lowercased and split on whitespace.  Unknown
        words map to the id of the last special token.
        """
        unknown_id = self.word_to_index[self.special_tokens[-1]]
        token_ids = []
        for segment, is_special in self._split_on_special_tokens(text):
            if is_special:
                token_ids.append(self.word_to_index[segment])
                continue
            for word in self.separate_punctuation(segment).lower().split():
                token_ids.append(self.word_to_index.get(word, unknown_id))
        return token_ids

    def detokenize(self, tokens):
        """Join the words for *tokens* with single spaces, skipping unknown ids."""
        return " ".join([self.index_to_word[token] for token in tokens if token in self.index_to_word])
|
127 |
-
|
128 |
-
# --- GPT-2 Model ---
|
129 |
-
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal (lower-triangular) mask.

    Config keys read: "n_embd", "n_head", "max_sequence_len" and, optionally,
    "dropout" (probability for the attention and residual dropout layers;
    defaults to 0.1 when the key is absent).
    """

    def __init__(self, config):
        super().__init__()
        n_embd = config["n_embd"]
        n_head = config["n_head"]
        max_len = config["max_sequence_len"]
        assert n_embd % n_head == 0, "n_embd must be divisible by n_head"
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(n_embd, 3 * n_embd)
        # output projection
        self.c_proj = nn.Linear(n_embd, n_embd)
        # regularization: honour the "dropout" hyperparameter instead of a
        # hard-coded value, so the configured probability is actually applied
        dropout = config.get("dropout", 0.1)
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        self.n_head = n_head
        self.n_embd = n_embd
        # Lower-triangular mask of ones, shaped (1, 1, max_len, max_len) so it
        # broadcasts over batch and head dimensions.
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(max_len, max_len)).view(1, 1, max_len, max_len),
        )

    def forward(self, x):
        """Apply causal self-attention to *x* of shape (B, T, C); returns (B, T, C).

        Raises:
            ValueError: if T exceeds the mask size ("max_sequence_len").
        """
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
        if T > self.bias.size(-1):
            raise ValueError(
                f"Sequence length {T} exceeds max_sequence_len {self.bias.size(-1)}"
            )
        head_size = C // self.n_head

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, head_size).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, head_size).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, head_size).transpose(1, 2)  # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # positions above the diagonal (the future) get -inf so softmax gives them weight 0
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        att = torch.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side

        # output projection
        return self.resid_dropout(self.c_proj(y))
|
165 |
-
|
166 |
-
class Block(nn.Module):
    """One pre-LayerNorm transformer block: attention then MLP, each with a residual.

    Config keys read here: "n_embd", "n_inner" and, optionally, "dropout"
    (probability of the MLP's output dropout; defaults to 0.1 when absent).
    The same config is passed on to CausalSelfAttention.
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config["n_embd"])
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config["n_embd"])
        self.mlp = nn.Sequential(
            nn.Linear(config["n_embd"], config["n_inner"]),
            nn.GELU(),
            nn.Linear(config["n_inner"], config["n_embd"]),
            # honour the "dropout" hyperparameter instead of a hard-coded value
            nn.Dropout(config.get("dropout", 0.1)),
        )

    def forward(self, x):
        """Return *x* after the attention and MLP sub-layers (shape unchanged)."""
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
|
183 |
-
|
184 |
-
class ToyGPT2(nn.Module):
    """Small GPT-2 style language model.

    Config keys read: "vocab_size", "n_embd", "max_sequence_len", "n_layer"
    (plus whatever Block reads from the same config).  The output projection
    shares its weight matrix with the token embedding table.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.token_embedding_table = nn.Embedding(config["vocab_size"], config["n_embd"])
        self.position_embedding_table = nn.Embedding(config["max_sequence_len"], config["n_embd"])
        self.blocks = nn.Sequential(*[Block(config) for _ in range(config["n_layer"])])
        self.ln_f = nn.LayerNorm(config["n_embd"])  # final layer norm
        self.lm_head = nn.Linear(config["n_embd"], config["vocab_size"])

        # Initialize weights to be small for better training
        self.apply(self._init_weights)

        # Tie the weights of the embedding and the output layer
        self.lm_head.weight = self.token_embedding_table.weight

    def _init_weights(self, module):
        """Zero the bias of Linear layers; draw Embedding weights from N(0, 0.02).

        Linear weights keep PyTorch's default initialisation.
        """
        if isinstance(module, nn.Linear) and module.bias is not None:
            torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when *targets* is given, the loss.

        Args:
            idx: (B, T) tensor of token ids.
            targets: Optional (B, T) tensor of target token ids.

        Returns:
            ``(logits, loss)``.  Without targets, logits has shape
            (B, T, vocab_size) and loss is ``None``.  With targets, logits is
            flattened to (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = nn.functional.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    def generate(self, input_ids, max_new_tokens, temperature=1.0):
        """Autoregressively append *max_new_tokens* tokens to *input_ids*.

        Args:
            input_ids: (B, T) tensor of prompt token ids.
            max_new_tokens: Number of tokens to generate.
            temperature: Divisor applied to the logits before sampling.  A
                value of 0 selects greedy decoding (argmax) instead of
                dividing by zero.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the
            generated tokens.

        The model runs in evaluation mode during generation and is put back
        into whichever mode (train/eval) it was in when called, so generating
        from an eval-mode model does not silently re-enable dropout.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():  # Disable gradient calculation during generation
                for _ in range(max_new_tokens):
                    # Limit the context to the last max_sequence_len tokens
                    context = input_ids[:, -self.config["max_sequence_len"]:]
                    logits, _ = self(context)  # No need for loss during generation
                    # Focus on the logits for the last time step (next token prediction)
                    last_logits = logits[:, -1, :]
                    if temperature == 0:
                        next_token = torch.argmax(last_logits, dim=-1, keepdim=True)
                    else:
                        probs = torch.softmax(last_logits / temperature, dim=-1)
                        next_token = torch.multinomial(probs, num_samples=1)
                    # Append next token to input sequence
                    input_ids = torch.cat((input_ids, next_token), dim=1)
        finally:
            self.train(was_training)  # Restore the caller's mode
        return input_ids
|
255 |
-
|
256 |
-
# --- Dataset ---
|
257 |
-
class Dataset(Dataset):
    """Non-overlapping next-token training windows over one tokenized text.

    Each sample is an ``(input, target)`` pair of 1-D tensors of length
    *seq_len*, where the target is the input shifted one token to the right.
    A trailing remainder that cannot fill a whole window is dropped.

    NOTE: this class intentionally keeps the name ``Dataset`` (shadowing the
    imported torch base class after definition) because the rest of the script
    refers to it by that name.
    """

    def __init__(self, data, tokenizer, seq_len):
        """Tokenize *data* with *tokenizer* and build windows of *seq_len* tokens."""
        self.tokenizer = tokenizer
        self.seq_len = seq_len

        print_with_line("# Tokenize the entire data")
        self.tokens = self.tokenizer.tokenize(data)
        print(f"DEBUG: Total tokens: {len(self.tokens)} in Dataset(")

        # Calculate token counts (stored on the object for the vocabulary report)
        self.token_counts = self._calculate_token_counts()

        # Create input-target pairs.  A window starting at i needs tokens up to
        # index i + seq_len (inclusive) for its shifted target, so the last
        # valid start is len(tokens) - seq_len - 1; the range stop is therefore
        # len(tokens) - seq_len.  (A stop of len - seq_len - 1 would drop the
        # final complete window whenever it ends exactly at the last token.)
        self.data = []
        for start in range(0, len(self.tokens) - seq_len, seq_len):
            input_seq = self.tokens[start:start + seq_len]
            target_seq = self.tokens[start + 1:start + seq_len + 1]
            self.data.append((torch.tensor(input_seq), torch.tensor(target_seq)))

        print(f"DEBUG: Number of data samples created in class Dataset(Dataset): {len(self.data)}")

        # Print token-vocabulary information
        print_with_line("# Print token-vocabulary information:")
        self.print_vocabulary_info()

    def _calculate_token_counts(self):
        """Return a dict mapping each token id in ``self.tokens`` to its frequency.

        A progress line is printed every time a count is incremented.
        """
        counts = {}
        for token in self.tokens:
            if token in counts:
                counts[token] += 1
                print(f"token {token} count has been incremented to {counts[token]}")
            else:
                counts[token] = 1
        return counts

    def print_vocabulary_info(self):
        """Print every vocabulary entry with its id and its frequency in the data."""
        print_with_line("# Print token-vocabulary information:")
        for token_id in range(self.tokenizer.vocab_size):
            token = self.tokenizer.index_to_word[token_id]  # token string for this id
            count = self.token_counts.get(token_id, 0)  # counts are keyed by integer id
            print(f" Token {token_id}:'{token}' \t\t occurs {count} times in the dataset")

    def __len__(self):
        """Return the number of training windows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pre-built ``(input, target)`` tensor pair at *idx*."""
        return self.data[idx]
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
# --- Trainer ---
|
310 |
-
class Trainer:
    """Runs the training loop for a model: AdamW + ReduceLROnPlateau + checkpoints."""

    def __init__(self, model, tokenizer, train_loader, hyperparameters, device):
        """Store the training collaborators and create optimizer and scheduler.

        Args:
            model: Module called as ``model(input_seq, targets=target_seq)``
                and returning ``(logits, loss)``.
            tokenizer: Tokenizer kept for reporting purposes.
            train_loader: Iterable of ``(input_seq, target_seq)`` batches.
            hyperparameters: Dict read for "learning_rate" and "epochs"; also
                stored in every checkpoint.
            device: Device the batches are moved to.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.train_loader = train_loader
        self.hyperparameters = hyperparameters
        self.Per_token_loss_threshold = Per_token_loss_threshold  # Assign global to instance
        self.Early_stopping_loss = Early_stopping_loss  # Set Early stopping loss
        self.device = device  # Store the device

        self.optimizer = optim.AdamW(self.model.parameters(), lr=hyperparameters["learning_rate"])
        # mode='min': the monitored quantity (the epoch-average loss) should decrease.
        # factor=0.99: on a plateau the learning rate is multiplied by 0.99 (a 1% cut).
        # patience=10: number of epochs without improvement before the rate is reduced.
        # train() calls self.scheduler.step(average_loss) once per epoch so the
        # scheduler can adjust the learning rate based on the current loss.
        self.scheduler = ReduceLROnPlateau(self.optimizer, mode='min', factor=0.99, patience=10)

    def train(self):
        """Train for up to ``hyperparameters["epochs"]`` epochs.

        After each epoch the average batch loss is printed and passed to the
        scheduler.  Training stops early, saving a checkpoint, once the
        average loss falls below ``self.Early_stopping_loss``.

        Raises:
            ValueError: if the train loader yields no batches.
        """
        num_batches = len(self.train_loader)
        if num_batches == 0:
            raise ValueError("train_loader is empty; nothing to train on")

        self.model.train()  # Set model to training mode
        total_epochs = self.hyperparameters["epochs"]
        for epoch in range(total_epochs):
            total_loss = 0.0
            for input_seq, target_seq in self.train_loader:
                input_seq = input_seq.to(self.device)
                target_seq = target_seq.to(self.device)

                self.optimizer.zero_grad()
                logits, loss = self.model(input_seq, targets=target_seq)  # logits are the raw predictions

                # <DISABLED the non-working feature> (kept for reference as comments
                # rather than as a string literal evaluated on every batch)
                # # Per-token loss calculation (using cross-entropy as an example)
                # loss_fn = torch.nn.CrossEntropyLoss(reduction='none')  # 'none' to get per-token loss
                # per_token_loss = loss_fn(logits.view(-1, logits.size(-1)), target_seq.view(-1))
                # per_token_loss = per_token_loss.view(target_seq.size())  # Reshape to match target_seq shape
                # if loss.item() < 0.01:  # Check loss for current batch
                #     print("Tokens with significant errors (per-token loss > threshold): [feature not working]")
                #     for i in range(target_seq.size(0)):  # Iterate over elements in the batch
                #         for token_idx in range(target_seq.size(1)):
                #             if per_token_loss[i, token_idx] > self.Per_token_loss_threshold:
                #                 target_token_id = target_seq[i, token_idx].item()
                #                 target_word = self.tokenizer.index_to_word[target_token_id]
                #                 print(f" Batch item {i}, Token {token_idx}: Word '{target_word}' (ID: {target_token_id}), Loss: {per_token_loss[i, token_idx].item():.4f}")
                # <DISABLED the non-working feature>

                loss.backward()
                self.optimizer.step()
                total_loss += loss.item()

            average_loss = total_loss / num_batches  # Consider number of batches
            print(f"Epoch {epoch+1}/{total_epochs}, Loss: {average_loss:.4f}")
            # Report against the epoch average (the value just printed), not
            # against whatever the last batch of the epoch happened to produce.
            if average_loss < 0.01:
                print(" LOSS IS BELOW 0.01")
            if average_loss < 0.001:
                print(" LOSS IS BELOW 0.001")

            # Read the learning rate before and after the scheduler step; reading
            # both values after the step could never detect a change.
            previous_lr = self.optimizer.param_groups[0]['lr']
            self.scheduler.step(average_loss)  # Update the scheduler with the current loss
            current_lr = self.optimizer.param_groups[0]['lr']
            if current_lr != previous_lr:
                print(f"Learning rate reduced to {current_lr:.6f}")

            if epoch % 100 == 0:
                print(f"Epoch {epoch + 1}: Current learning rate: {current_lr:.6f}")
                # NOTE(review): indentation was lost in the captured source; the
                # periodic checkpoint is assumed to belong to this every-100-epochs
                # branch rather than to every epoch - confirm against the original.
                self.save_checkpoint(f"model_checkpoint_epoch_{epoch + 1}.pth", epoch, average_loss)

            # Early stopping condition
            if average_loss < self.Early_stopping_loss:
                print(f"Early stopping: Average loss {average_loss:.4f} is below the threshold ({self.Early_stopping_loss}).")
                self.save_checkpoint(f"model_checkpoint_early_stop.pth", epoch, average_loss)
                break  # Exit the training loop

    def save_checkpoint(self, path, epoch, average_loss):
        """Save model/optimizer state under a name derived from *path*.

        The file name is ``<base>_<script file name>_<timestamp><extension>``
        where base/extension come from *path*.  When ``__file__`` is not
        defined (e.g. in an interactive session) "interactive" is used as the
        script name instead of raising ``NameError``.
        """
        script_filename = os.path.basename(globals().get("__file__", "interactive"))
        current_datetime = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

        base_filename, extension = os.path.splitext(path)  # Split original filename
        new_filename = f"{base_filename}_{script_filename}_{current_datetime}{extension}"

        torch.save({
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'loss': average_loss,
            'hyperparameters': self.hyperparameters
        }, new_filename)
|
413 |
-
|
414 |
-
|
415 |
-
# --- Main Execution ---
|
416 |
-
def main():
    """Train the toy GPT-2 on the memorized speech and print two generations."""
    # Determine device (GPU if available, else CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    print_with_line("# Initialize tokenizer")
    # The special_tokens list is defined in the hyperparameters dictionary.
    tokenizer = Tokenizer(Memorized_Speech, hyperparameters["special_tokens"], hyperparameters["vocab_size"])
    print(f"Vocabulary Size: {tokenizer.vocab_size}")

    print_with_line("# Prepare dataset")
    # Common values of min_training_input_seq_len for smaller models or experiments are 32, 64, 128, or 256.
    dataset = Dataset(Memorized_Speech, tokenizer, min_training_input_seq_len)
    train_loader = DataLoader(dataset, batch_size=hyperparameters["batch_size"])

    print_with_line("# Initialize model")
    # Size the embedding/output layers from the tokenizer's real vocabulary:
    # the "vocab_size" hyperparameter is only an estimate, and a too-small
    # value would make token ids index past the embedding table.
    model_config = dict(hyperparameters)
    model_config["vocab_size"] = tokenizer.vocab_size
    print(f"HyperParamters = {model_config}")
    model = ToyGPT2(model_config).to(device)

    print_with_line("# Initialize trainer")
    trainer = Trainer(model, tokenizer, train_loader, model_config, device)

    print_with_line("# Train the model")
    trainer.train()

    print("")  # space
    print_with_line("# --- Inference Examples ---")
    model.eval()

    # Example 1: Recite the Gettysburg Address
    print_with_line("# Example 1: Recite the Gettysburg Address")
    start_text = "four score"
    start_tokens = torch.tensor(tokenizer.tokenize(start_text)).unsqueeze(0).to(device)
    print("Prompt:", start_text)
    # start_tokens has shape (1, T): the prompt length is size(1); len() would
    # return the batch dimension (1) and generate one token too many.
    prompt_length = start_tokens.size(1)
    # Generate a completion for the whole dataset
    generated_tokens = model.generate(start_tokens, max_new_tokens=len(dataset.tokens) - prompt_length, temperature=1.0)
    # Index the single batch row; squeeze() would collapse a one-token sequence to a scalar.
    generated_text = tokenizer.detokenize(generated_tokens[0].tolist())
    print("\nResponse:\n", generated_text)

    print("")  # space
    # Example 2: Free text generation after encountering <FreetheLLM>
    # TODO: eventually request user text that includes only Gettysburg vocabulary.
    print_with_line("# Example 2: Free text generation after encountering <FreetheLLM>")

    start_text = "we here highly resolve that these dead shall not have died in vain and that this nation under god shall have a new "
    special_token = tokenizer.special_tokens[0]  # Get the <FreetheLLM> token
    start_text += special_token  # Append the special token directly to the string
    print("Prompt:", start_text)

    start_tokens = torch.tensor(tokenizer.tokenize(start_text)).unsqueeze(0).to(device)  # Tokenize the combined string

    generated_tokens = model.generate(start_tokens, max_new_tokens=100, temperature=1.0)
    generated_text = tokenizer.detokenize(generated_tokens[0].tolist())
    print("\nFreestyle Generation:\n", generated_text)


if __name__ == "__main__":
    main()
|
|
|
1 |
+
# Replaced with v1.4.2 at https://huggingface.co/MartialTerran/Toy_GPTs_LLMs_for_CPU_Educational/blob/main/Gettysburg_GPT2_v1.4.2.py
|
2 |
+
|
3 |
#This script runs and computes loss down to under 0.001 at epoch 101, then after epoch 110 the loss rises up again. Then at epoch 150 the loss goes downward again. Next version will report the particular words that are causing the error/loss.
|
4 |
# # The tokenize method now uses the last special token in the self.special_tokens list (which is assumed to be the padding token <pad> in this case) as the default token for unknown words.
|
5 |
#text separate_punctuation focuses solely on separating the defined punctuation marks from words.
|
|
|
8 |
# The detokenizer does not yet auto-remove spaces preceding punctuations. This is because tokens are defined without leading spaces, and spaces are autoappended to all tokens in detokenizer.
|
9 |
|
10 |
# It's possible to increase training_input_seq_len over epochs. However, directly modifying training_input_seq_len inside the Dataset class after it's created isn't ideal. A better approach is to control the sequence length during batch creation within the DataLoader. You can achieve this using a custom collate_fn ?
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|