shng2025 committed · Commit 3bb4876 · Parent(s): 4d63b0c

step 15000

config.json CHANGED
@@ -1,5 +1,5 @@
 {
-  "_name_or_path": "gpt2",
+  "_name_or_path": "./",
   "activation_function": "gelu_new",
   "architectures": [
     "GPT2LMHeadModel"
gptesla_training.py ADDED
@@ -0,0 +1,287 @@
import os

import datasets
import transformers
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, set_seed
from transformers.optimization import get_scheduler
from datasets import load_dataset, DownloadConfig

import torch
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch.optim import AdamW

import logging
import wandb
from huggingface_hub import Repository, create_branch
from accelerate import Accelerator
from argparse import Namespace


# Disable tokenizers parallelism so forked DataLoader workers don't warn or deadlock
os.environ["TOKENIZERS_PARALLELISM"] = "false"

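# Persist the non-model training state (optimizer, LR scheduler, completed
# step count, and the W&B run identity) so an interrupted run can resume
# mid-schedule; model weights are saved separately via save_pretrained().
# A resume path (a sketch, not part of this script) would torch.load() this
# file and load_state_dict() the optimizer and scheduler before training.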
def save_checkpoint_state():

    dir_name = "./torch_checkpoint"
    os.makedirs(dir_name, exist_ok=True)

    checkpoint = {
        "lr_scheduler": lr_scheduler.state_dict(),
        "completed_steps": completed_steps,
        "run_name": run_name,
        "optimizer": optimizer.state_dict(),
        "run_id": wandb_id,
    }
    torch.save(checkpoint, os.path.join(dir_name, "latest_checkpoint.pth"))

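# Packs a stream of raw text examples into fixed-length token blocks: examples
# are buffered, tokenized in one batch, joined with the EOS token as separator,
# and sliced into seq_length-sized chunks. input_characters is a heuristic
# character budget: with the defaults, 1024 tokens * 3.6 chars/token * 1024
# sequences ≈ 3.8M characters are accumulated before each tokenization pass.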
class ConstantLengthDataset(IterableDataset):

    def __init__(
        self,
        tokenizer,
        dataset,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.input_characters = seq_length * chars_per_token * num_of_sequences

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.input_characters:
                    m = f"Buffer full: {buffer_len}>={self.input_characters:.0f}"
                    # print(m)
                    break
                try:
                    m = f"Fill buffer: {buffer_len}<{self.input_characters:.0f}"
                    # print(m)
                    buffer.append(next(iterator)["content"])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    # iterator = iter(self.dataset)
                    more_examples = False
                    break

            all_token_ids = []
            tokenized_inputs = self.tokenizer(buffer, truncation=False)
            for tokenized_input in tokenized_inputs["input_ids"]:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])

            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)

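# Per-process logging: every rank writes its own log/debug_<rank>.log, but
# W&B and TensorBoard are initialized on the main process only, so metrics
# are reported exactly once per run.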
def setup_logging(project_name):
    logger = logging.getLogger(__name__)

    dir_name = "./log"
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
        print(f"Directory '{dir_name}' was created.")
    else:
        print(f"Directory '{dir_name}' already exists.")

    # Configure logging: each process writes to its own file and to stderr
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
        handlers=[
            logging.FileHandler(f"log/debug_{accelerator.process_index}.log"),
            logging.StreamHandler(),
        ],
    )
    if accelerator.is_main_process:  # we only want to set up logging once
        wandb.init(project=project_name, config=args, dir="./../")
        run_name = wandb.run.name
        wandb_id = wandb.run.id
        tb_writer = SummaryWriter()
        tb_writer.add_hparams(vars(args), {"0": 0})
        logger.setLevel(logging.INFO)
        datasets.utils.logging.set_verbosity_debug()
        transformers.utils.logging.set_verbosity_info()
    else:
        tb_writer = None
        run_name = ""
        wandb_id = ""
        logger.setLevel(logging.ERROR)
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
    return logger, tb_writer, run_name, wandb_id

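# Both splits stream from the Hub rather than downloading in full; only the
# training stream is shuffled, using a fixed-size buffer (approximate
# shuffling, the standard approach for IterableDataset streams).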
def create_dataloaders(dataset_name):
    train_data = load_dataset(dataset_name + "-train", split="train", streaming=True)
    train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=args.seed)
    valid_data = load_dataset(dataset_name + "-valid", split="validation", streaming=True)

    train_dataset = ConstantLengthDataset(tokenizer, train_data, seq_length=args.seq_length)
    valid_dataset = ConstantLengthDataset(tokenizer, valid_data, seq_length=args.seq_length)

    train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size, num_workers=96)
    eval_dataloader = DataLoader(valid_dataset, batch_size=args.valid_batch_size, num_workers=1)
    return train_dataloader, eval_dataloader


def log_metrics(step, metrics):
    logger.info(f"Step {step}: {metrics}")
    if accelerator.is_main_process:
        wandb.log(metrics)
        for k, v in metrics.items():
            tb_writer.add_scalar(k, v, step)

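# Biases and LayerNorm weights are excluded from weight decay, the usual
# practice for transformer training; only the weight matrices are regularized.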
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": args.weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

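# Evaluation: average the causal-LM loss over (up to max_eval_steps of) the
# validation stream, gathering per-device losses across processes;
# perplexity is exp(mean loss).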
def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch, labels=batch)
        loss = outputs.loss.repeat(args.valid_batch_size)
        losses.append(accelerator.gather(loss))
        if args.max_eval_steps > 0 and step >= args.max_eval_steps:
            break
    loss = torch.mean(torch.cat(losses))

    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = torch.tensor(float("inf"))

    return loss.item(), perplexity.item()

# Accelerator
accelerator = Accelerator(dispatch_batches=True)
acc_state = {str(k): str(v) for k, v in accelerator.state.__dict__.items()}

project_name = "shng2025/gptesla-small"
dataset_name = "shng2025/gptesla"

# GPTesla 111M-parameter setup; the original values are kept in the trailing
# comments, adjusted here where needed to lighten the training requirements.
config = {
    "train_batch_size": 12,  # 12
    "valid_batch_size": 12,  # 12
    "weight_decay": 0.1,
    "shuffle_buffer": 1000,
    "learning_rate": 5e-4,  # 5e-4
    "lr_scheduler_type": "cosine",
    "num_warmup_steps": 700,  # 2000
    "gradient_accumulation_steps": 1,  # 1
    "max_train_steps": 150000,  # 150000
    "max_eval_steps": 10,
    "seq_length": 1024,
    "seed": 1,
    "save_checkpoint_steps": 15000,  # 15000
}

args = Namespace(**config, **acc_state)
samples_per_step = accelerator.state.num_processes * args.train_batch_size
set_seed(args.seed)
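# Throughput per optimizer step scales with world size: e.g., on a
# hypothetical 4-GPU node, 4 processes * 12 sequences = 48 sequences
# (~49K tokens at seq_length=1024) are consumed per step.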

# Logging
logger, tb_writer, run_name, wandb_id = setup_logging(project_name.split("/")[1])
logger.info(accelerator.state)

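# Each run pushes to its own branch of the model repo, named after the W&B
# run, so checkpoints from different runs never overwrite one another.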
# Load model and tokenizer
if accelerator.is_main_process:
    new_branch_name = run_name
    create_branch(project_name, repo_type="model", branch=new_branch_name)
    hf_repo = Repository("./", clone_from=project_name, revision=run_name)

model = AutoModelForCausalLM.from_pretrained("./")  # , gradient_checkpointing=True)
tokenizer = AutoTokenizer.from_pretrained("./")

# Load dataset and dataloader
train_dataloader, eval_dataloader = create_dataloaders(dataset_name)

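# Cosine schedule: the LR warms up linearly for num_warmup_steps, then decays
# along a cosine curve over the remaining max_train_steps.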
# Prepare the optimizer and learning rate scheduler
optimizer = AdamW(get_grouped_params(model), lr=args.learning_rate)
lr_scheduler = get_scheduler(
    name=args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=args.num_warmup_steps,
    num_training_steps=args.max_train_steps,
)


def get_lr():
    return optimizer.param_groups[0]["lr"]


# Prepare everything with our `accelerator` (order of args is not important)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

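# Training loop: the loss is divided by gradient_accumulation_steps so the
# accumulated micro-batches sum to one full-batch gradient; the optimizer,
# LR schedule, and completed_steps advance only on accumulation boundaries.
# Every save_checkpoint_steps steps the model is evaluated, saved, and pushed
# to the run's branch on the Hub.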
# Train model
model.train()
completed_steps = 0
for step, batch in enumerate(train_dataloader, start=1):
    loss = model(batch, labels=batch).loss
    log_metrics(
        step,
        {
            "lr": get_lr(),
            "samples": step * samples_per_step,
            "steps": completed_steps,
            "loss/train": loss.item(),
        },
    )
    loss = loss / args.gradient_accumulation_steps
    accelerator.backward(loss)
    if step % args.gradient_accumulation_steps == 0:
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        completed_steps += 1
    if step % args.save_checkpoint_steps == 0:
        logger.info("Evaluating and saving model checkpoint")
        eval_loss, perplexity = evaluate()
        log_metrics(step, {"loss/eval": eval_loss, "perplexity": perplexity})
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        if accelerator.is_main_process:
            save_checkpoint_state()
            unwrapped_model.save_pretrained("./")
            hf_repo.push_to_hub(commit_message=f"step {step}")
        model.train()
    if completed_steps >= args.max_train_steps:
        break


# Evaluate and save the last checkpoint
logger.info("Evaluating and saving model after training")
eval_loss, perplexity = evaluate()
log_metrics(step, {"loss/eval": eval_loss, "perplexity": perplexity})
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
if accelerator.is_main_process:
    unwrapped_model.save_pretrained("./")
    hf_repo.push_to_hub(commit_message="final model")
log/debug_0.log CHANGED
The diff for this file is too large to render. See raw diff
 
log/debug_1.log ADDED
File without changes
log/debug_2.log ADDED
File without changes
log/debug_3.log ADDED
File without changes
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3e5bfbaf0ff37c9cbbbf37dfdf848aaee9b060a6a19f3c79d815806112c6c2f1
+oid sha256:94549a0641d7eda26739eaac88ae774ce97f49e21ff95a1cd267255f2bc81e87
 size 444048000
runs/Jul25_11-24-47_lab/1721906687.039178/events.out.tfevents.1721906687.lab.4458.1 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf2f94498dcf1e03056f6d1c8813e294171cd0898fc26522e0593a109e085543
+size 1702
runs/Jul25_11-24-47_lab/events.out.tfevents.1721906687.lab.4458.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93e883d74a609102810569d729019c689cf47caca4a80ee3a62d5ce77961522c
+size 2699677
torch_checkpoint/latest_checkpoint.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a744f386c14e5fbf5bcd5b959fd05c4f89b61600d67eb2608366bd33abc9cb1b
+size 888193914