mlopez6132 committed on
Commit 8267d00 · verified · 1 Parent(s): d733e66

Upload zerogpu_training.py with huggingface_hub

Files changed (1)
  1. zerogpu_training.py +379 -0
zerogpu_training.py ADDED
@@ -0,0 +1,379 @@
"""
ZeroGPU Training Script for Nano-Coder
Optimized for Hugging Face's ZeroGPU free compute (H200, 70GB)
"""

import os
import time
import math
import pickle
from contextlib import nullcontext

import numpy as np
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

from model import GPTConfig, GPT

# Hugging Face specific imports
from huggingface_hub import HfApi, login
import wandb

# -----------------------------------------------------------------------------
# Configuration optimized for ZeroGPU (H200, 70GB, FREE)
# I/O
out_dir = 'out-nano-coder-zerogpu'
eval_interval = 100  # Frequent evaluation for monitoring
log_interval = 5
eval_iters = 20
eval_only = False
always_save_checkpoint = True
init_from = 'scratch'

# wandb logging - enabled for ZeroGPU
wandb_log = True
wandb_project = 'nano-coder-zerogpu'
wandb_run_name = 'nano-coder-zerogpu-training'

# data
dataset = 'python-codes-25k'
gradient_accumulation_steps = 2 * 8  # Optimized for H200
batch_size = 48  # Larger batch size for H200 efficiency
block_size = 1024  # Full context length

# model - optimized for ZeroGPU H200
n_layer = 12  # Full model
n_head = 12  # Full model
n_embd = 768  # Full model
dropout = 0.1
bias = False

# optimizer - optimized for H200
learning_rate = 6e-4  # Standard GPT-2 learning rate
max_iters = 10000  # More iterations for ZeroGPU
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0

# learning rate decay
decay_lr = True
warmup_iters = 1000
lr_decay_iters = 10000
min_lr = 6e-5

# DDP settings
backend = 'nccl'

# system
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
compile = True

# HF specific
hf_repo_id = "mlopez6132/nano-coder-zerogpu"  # ZeroGPU repo
push_to_hub = True

# ZeroGPU specific - no time limits!
print("🚀 ZEROGPU TRAINING - NO TIME LIMITS!")

# -----------------------------------------------------------------------------
config_keys = [k for k, v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
exec(open('configurator.py').read())
config = {k: globals()[k] for k in config_keys}
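# Any config value above can be overridden at launch time through configurator.py
# (assuming the stock nanoGPT-style `--key=value` mechanism), e.g.:
#
#   python zerogpu_training.py --batch_size=32 --max_iters=5000 --compile=False
#
# The flag names are simply the variables defined above; the values shown here
# are illustrative.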

# -----------------------------------------------------------------------------

# HF setup
if push_to_hub:
    # Check if HF_TOKEN environment variable is set
    if os.environ.get('HF_TOKEN'):
        login(token=os.environ.get('HF_TOKEN'))
    else:
        # Try to login without token (will use cached credentials)
        try:
            login()
        except Exception as e:
            print(f"Warning: Could not login to HF Hub: {e}")
            print("Continuing without HF Hub upload...")
            push_to_hub = False

if push_to_hub:
    api = HfApi()
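# To let the script authenticate with the Hub, export a write-scoped token before
# launching (or add it as a Space secret named HF_TOKEN), e.g.:
#
#   export HF_TOKEN=hf_...   # illustrative placeholder token
#
# Without a valid token, the login above falls back to cached credentials and, if
# that also fails, push_to_hub is disabled and training continues locally.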

# various inits, derived attributes, I/O setup
ddp = int(os.environ.get('RANK', -1)) != -1
if ddp:
    init_process_group(backend=backend)
    ddp_rank = int(os.environ['RANK'])
    ddp_local_rank = int(os.environ['LOCAL_RANK'])
    ddp_world_size = int(os.environ['WORLD_SIZE'])
    device = f'cuda:{ddp_local_rank}'
    torch.cuda.set_device(device)
    master_process = ddp_rank == 0
    seed_offset = ddp_rank
    assert gradient_accumulation_steps % ddp_world_size == 0
    gradient_accumulation_steps //= ddp_world_size
else:
    master_process = True
    seed_offset = 0
    ddp_world_size = 1

tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")
print("ZEROGPU H200 TRAINING - NO TIME LIMITS!")
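# Worked example with the defaults above on a single GPU:
# gradient_accumulation_steps = 16, ddp_world_size = 1, batch_size = 48,
# block_size = 1024, so each optimizer step processes
# 16 * 1 * 48 * 1024 = 786,432 tokens.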

if master_process:
    os.makedirs(out_dir, exist_ok=True)

torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device_type = 'cuda' if 'cuda' in device else 'cpu'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

# data loader
data_dir = os.path.join('data', dataset)
def get_batch(split):
    if split == 'train':
        data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
    else:
        data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
    if device_type == 'cuda':
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y
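# get_batch expects data/python-codes-25k/train.bin and val.bin to be flat arrays
# of uint16 token ids, as produced by a nanoGPT-style prepare step. A minimal
# sketch of that step (hypothetical prepare.py, assuming GPT-2 BPE via tiktoken):
#
#   import numpy as np, tiktoken
#   enc = tiktoken.get_encoding("gpt2")
#   ids = enc.encode_ordinary(open("corpus.txt").read())
#   np.array(ids, dtype=np.uint16).tofile("data/python-codes-25k/train.bin")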

# init these up here, can override if init_from='resume'
iter_num = 0
best_val_loss = 1e9

# attempt to derive vocab_size from the dataset
meta_path = os.path.join(data_dir, 'meta.pkl')
meta_vocab_size = None
if os.path.exists(meta_path):
    with open(meta_path, 'rb') as f:
        meta = pickle.load(f)
    meta_vocab_size = meta['vocab_size']
    print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")

# model init
model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
                  bias=bias, vocab_size=None, dropout=dropout)

if init_from == 'scratch':
    print("Initializing a new nano-coder model from scratch (ZEROGPU)")
    if meta_vocab_size is None:
        print("defaulting to GPT-2 vocab_size of 50304 (50257 rounded up for efficiency)")
    model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
elif init_from == 'resume':
    print(f"Resuming training from {out_dir}")
    ckpt_path = os.path.join(out_dir, 'ckpt.pt')
    checkpoint = torch.load(ckpt_path, map_location=device)
    checkpoint_model_args = checkpoint['model_args']
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = checkpoint_model_args[k]
    gptconf = GPTConfig(**model_args)
    model = GPT(gptconf)
    state_dict = checkpoint['model']
    unwanted_prefix = '_orig_mod.'
    for k, v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
    model.load_state_dict(state_dict)
    iter_num = checkpoint['iter_num']
    best_val_loss = checkpoint['best_val_loss']
elif init_from.startswith('gpt2'):
    print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
    override_args = dict(dropout=dropout)
    model = GPT.from_pretrained(init_from, override_args)
    for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
        model_args[k] = getattr(model.config, k)

if block_size < model.config.block_size:
    model.crop_block_size(block_size)
    model_args['block_size'] = block_size

model.to(device)

# initialize a GradScaler
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
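# Note: the scaler is only active for float16; with bfloat16 (the default on H200)
# enabled=False makes every scaler call below a no-op, so the loop behaves like
# plain unscaled backprop. Newer PyTorch releases also expose the non-deprecated
# spelling torch.amp.GradScaler('cuda', enabled=...), if the deprecation warning
# from torch.cuda.amp matters to you.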

# optimizer
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
if init_from == 'resume':
    optimizer.load_state_dict(checkpoint['optimizer'])
checkpoint = None

# compile the model
if compile:
    print("compiling the model... (takes a ~minute)")
    unoptimized_model = model
    model = torch.compile(model)

# wrap model into DDP container
if ddp:
    model = DDP(model, device_ids=[ddp_local_rank])
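# Single-GPU runs can launch the script directly; for multi-GPU DDP the RANK /
# LOCAL_RANK / WORLD_SIZE variables read earlier are normally set by torchrun,
# e.g. (illustrative, for an 8-GPU node):
#
#   torchrun --standalone --nproc_per_node=8 zerogpu_training.py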

# helps estimate an arbitrarily accurate loss over either split using many batches
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            with ctx:
                logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# learning rate decay scheduler (cosine with warmup)
def get_lr(it):
    if it < warmup_iters:
        return learning_rate * (it + 1) / (warmup_iters + 1)
    if it > lr_decay_iters:
        return min_lr
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (learning_rate - min_lr)
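# Worked example with the settings above (learning_rate=6e-4, min_lr=6e-5,
# warmup_iters=1000, lr_decay_iters=10000):
#   it = 0      -> ~6e-7 (linear warmup just starting)
#   it = 1000   -> 6e-4 (warmup done, cosine decay begins)
#   it = 5500   -> decay_ratio = 0.5, coeff = 0.5, lr = 6e-5 + 0.5*(6e-4 - 6e-5) = 3.3e-4
#   it >= 10000 -> 6e-5 (floor)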

# logging
if wandb_log and master_process:
    wandb.init(project=wandb_project, name=wandb_run_name, config=config)
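# wandb needs credentials on the machine running the job: either run `wandb login`
# once, or set the WANDB_API_KEY environment variable (e.g. as a Space secret).
# Set wandb_log = False to skip experiment tracking entirely.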

# HF checkpoint upload function
def upload_checkpoint_to_hf(checkpoint_path, iter_num):
    if push_to_hub and master_process:
        try:
            # Create a unique filename
            filename = f"checkpoint_iter_{iter_num}.pt"
            file_path = os.path.join(out_dir, filename)

            # Copy checkpoint with new name
            import shutil
            shutil.copy2(checkpoint_path, file_path)

            # Upload to HF
            api.upload_file(
                path_or_fileobj=file_path,
                path_in_repo=filename,
                repo_id=hf_repo_id,
                repo_type="model"
            )
            print(f"Uploaded checkpoint to HF: {filename}")

            # Clean up local copy
            os.remove(file_path)
        except Exception as e:
            print(f"Failed to upload checkpoint: {e}")
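# upload_file assumes the target repo already exists and the token has write
# access. If the repo might not exist yet, it could be created up front, e.g.
# (a sketch using HfApi.create_repo):
#
#   if push_to_hub and master_process:
#       api.create_repo(repo_id=hf_repo_id, repo_type="model", exist_ok=True)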

# training loop
print("Starting ZEROGPU H200 nano-coder training...")
X, Y = get_batch('train')
t0 = time.time()
start_time = t0  # wall-clock start for the total-time summary printed at the end
local_iter_num = 0
raw_model = model.module if ddp else model
running_mfu = -1.0

while True:
    # determine and set the learning rate for this iteration
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # evaluate the loss on train/val sets and write checkpoints
    if iter_num % eval_interval == 0 and master_process:
        losses = estimate_loss()
        print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if wandb_log:
            wandb.log({
                "iter": iter_num,
                "train/loss": losses['train'],
                "val/loss": losses['val'],
                "lr": lr,
                "mfu": running_mfu*100,
            })
        if losses['val'] < best_val_loss or always_save_checkpoint:
            best_val_loss = losses['val']
            if iter_num > 0:
                checkpoint = {
                    'model': raw_model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'model_args': model_args,
                    'iter_num': iter_num,
                    'best_val_loss': best_val_loss,
                    'config': config,
                }
                checkpoint_path = os.path.join(out_dir, 'ckpt.pt')
                print(f"saving checkpoint to {out_dir}")
                torch.save(checkpoint, checkpoint_path)

                # Upload to HF every 1000 iterations
                if iter_num % 1000 == 0:
                    upload_checkpoint_to_hf(checkpoint_path, iter_num)
    if iter_num == 0 and eval_only:
        break

    # forward backward update
    for micro_step in range(gradient_accumulation_steps):
        if ddp:
            model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
        with ctx:
            logits, loss = model(X, Y)
            loss = loss / gradient_accumulation_steps
        X, Y = get_batch('train')
        scaler.scale(loss).backward()

    # clip the gradient
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

    # step the optimizer and scaler
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad(set_to_none=True)

    # timing and logging
    t1 = time.time()
    dt = t1 - t0
    t0 = t1
    if iter_num % log_interval == 0 and master_process:
        lossf = loss.item() * gradient_accumulation_steps
        if local_iter_num >= 5:
            mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
            running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
        print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
    iter_num += 1
    local_iter_num += 1

    # termination conditions
    if iter_num > max_iters:
        break

if ddp:
    destroy_process_group()

# Final upload
if push_to_hub and master_process:
    upload_checkpoint_to_hf(os.path.join(out_dir, 'ckpt.pt'), 'final')

total_time = time.time() - start_time
print("\n🎉 ZEROGPU H200 TRAINING COMPLETED!")
print(f"Total training time: {total_time/60:.1f} minutes")
print(f"Total iterations: {iter_num}")
print(f"Final validation loss: {best_val_loss:.4f}")
print(f"Model saved to: {out_dir}")
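# To continue a stopped run from the last saved ckpt.pt, relaunch with the resume
# path enabled (an illustrative override via the configurator mechanism above):
#
#   python zerogpu_training.py --init_from=resume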