Overriding config with config/finetune_shakespeare.py:
import time

out_dir = 'out-shakespeare'
eval_interval = 5
eval_iters = 40
wandb_log = False # feel free to turn on
wandb_project = 'shakespeare'
wandb_run_name = 'ft-' + str(time.time())

dataset = 'shakespeare'
init_from = 'gpt2' # this is the smallest GPT-2 model (124M parameters)

# only save checkpoints if the validation loss improves
always_save_checkpoint = False

# the number of examples per iter:
# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
batch_size = 1
gradient_accumulation_steps = 32
max_iters = 120

# finetune at constant LR
learning_rate = 3e-5
decay_lr = False

Initializing from OpenAI GPT-2 weights: gpt2
loading weights from pretrained gpt: gpt2
forcing vocab_size=50257, block_size=1024, bias=True
overriding dropout rate to 0.0
number of parameters: 123.65M
Downloading (…)lve/main/config.json: 100% 665/665 [00:00<00:00, 98.3kB/s]
Downloading pytorch_model.bin: 100% 548M/548M [00:05<00:00, 92.8MB/s]
Downloading (…)neration_config.json: 100% 124/124 [00:00<00:00, 19.2kB/s]
using fused AdamW: True
compiling the model... (takes a ~minute)
[2023-03-21 14:22:50,795] torch._inductor.utils: [WARNING] make_fallback(aten.addmv): a decomposition exists, we should switch to it
step 0: train loss 3.4423, val loss 3.0369
iter 0: loss 3.2863, time 77202.23ms, mfu -100.00%
iter 1: loss 2.7469, time 22529.17ms, mfu -100.00%
iter 2: loss 3.7087, time 23101.21ms, mfu -100.00%
iter 3: loss 3.6040, time 23363.38ms, mfu -100.00%
iter 4: loss 2.6769, time 23118.49ms, mfu -100.00%
step 5: train loss 3.4339, val loss 2.9363
saving checkpoint to out-shakespeare
iter 5: loss 3.1141, time 30621.41ms, mfu 2.35%
iter 6: loss 3.3365, time 23426.49ms, mfu 2.42%
iter 7: loss 3.8965, time 23144.13ms, mfu 2.49%
iter 8: loss 3.4058, time 23061.69ms, mfu 2.55%
iter 9: loss 3.2569, time 23230.68ms, mfu 2.60%
step 10: train loss 3.2385, val loss 2.9982
iter 10: loss 3.1935, time 25160.57ms, mfu 2.63%
iter 11: loss 3.9526, time 23125.77ms, mfu 2.68%
iter 12: loss 2.4570, time 23136.22ms, mfu 2.72%
iter 13: loss 3.5092, time 23120.81ms, mfu 2.76%
iter 14: loss 3.4771, time 23226.29ms, mfu 2.79%
step 15: train loss 2.9026, val loss 2.8705
saving checkpoint to out-shakespeare
iter 15: loss 3.4825, time 30931.56ms, mfu 2.75%
iter 16: loss 3.3583, time 23307.64ms, mfu 2.78%
iter 17: loss 2.2991, time 23143.53ms, mfu 2.81%
iter 18: loss 3.2513, time 23131.39ms, mfu 2.84%
iter 19: loss 2.9859, time 23160.12ms, mfu 2.87%
step 20: train loss 2.9491, val loss 2.7808
saving checkpoint to out-shakespeare
iter 20: loss 3.0525, time 30909.27ms, mfu 2.81%
iter 21: loss 2.9295, time 23294.73ms, mfu 2.84%
iter 22: loss 2.2879, time 23094.34ms, mfu 2.87%
iter 23: loss 1.8019, time 23103.56ms, mfu 2.89%
iter 24: loss 3.4942, time 23172.01ms, mfu 2.91%
step 25: train loss 2.8004, val loss 2.8107
iter 25: loss 2.2264, time 25127.64ms, mfu 2.91%
iter 26: loss 3.4194, time 23174.40ms, mfu 2.93%
iter 27: loss 2.8144, time 23152.02ms, mfu 2.94%
iter 28: loss 3.0488, time 23133.18ms, mfu 2.96%
iter 29: loss 3.1027, time 23085.89ms, mfu 2.98%
step 30: train loss 2.6644, val loss 2.6210
saving checkpoint to out-shakespeare
iter 30: loss 2.4424, time 31309.61ms, mfu 2.91%
iter 31: loss 3.0193, time 23415.64ms, mfu 2.92%
iter 32: loss 2.8735, time 23054.64ms, mfu 2.94%
iter 33: loss 2.9842, time 23053.71ms, mfu 2.96%
iter 34: loss 2.8148, time 23136.92ms, mfu 2.97%
step 35: train loss 2.8676, val loss 2.5965
saving checkpoint to out-shakespeare
iter 35: loss 2.8556, time 31228.61ms, mfu 2.91%
iter 36: loss 2.1186, time 23332.51ms, mfu 2.92%
iter 37: loss 2.4768, time 23039.16ms, mfu 2.94%
iter 38: loss 2.7992, time 23035.59ms, mfu 2.96%
iter 39: loss 2.7109, time 23218.08ms, mfu 2.97%
step 40: train loss 2.5840, val loss 2.6467
iter 40: loss 3.0349, time 25092.98ms, mfu 2.96%
iter 41: loss 2.8766, time 23084.39ms, mfu 2.98%
iter 42: loss 2.5366, time 23099.15ms, mfu 2.99%
iter 43: loss 2.7461, time 23183.70ms, mfu 3.00%
iter 44: loss 1.4962, time 23190.74ms, mfu 3.01%
step 45: train loss 2.6357, val loss 2.6529
iter 45: loss 2.1228, time 25011.92ms, mfu 3.00%
iter 46: loss 1.9382, time 23127.95ms, mfu 3.01%
iter 47: loss 1.7129, time 23168.21ms, mfu 3.02%
iter 48: loss 2.4555, time 23162.14ms, mfu 3.03%
iter 49: loss 1.3368, time 23152.22ms, mfu 3.03%
step 50: train loss 2.3167, val loss 2.6496
iter 50: loss 2.3815, time 24969.84ms, mfu 3.02%
iter 51: loss 1.5433, time 23013.56ms, mfu 3.03%
iter 52: loss 2.5276, time 22951.87ms, mfu 3.04%
iter 53: loss 2.0912, time 22989.47ms, mfu 3.05%
iter 54: loss 1.6236, time 23016.77ms, mfu 3.06%
step 55: train loss 2.2718, val loss 2.6701
iter 55: loss 0.9116, time 24910.16ms, mfu 3.04%
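For context, a log like the one above is what the stock nanoGPT training script prints; the run that produced it would look roughly like the following sketch (assuming the repo's own prepare and train entry points, with the Shakespeare data tokenized first):

```
python data/shakespeare/prepare.py
python train.py config/finetune_shakespeare.py
```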
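About the mfu column: it reports model flops utilization, i.e. achieved flops per second as a fraction of a hardware peak (stock nanoGPT uses the A100 bfloat16 peak of 312 TFLOPS), with flops per token approximated as 6N + 12·L·H·Q·T following the PaLM paper; the -100.00% entries in the first few iterations are just a placeholder printed before measurement begins. A minimal sketch of the calculation (my own function and parameter names, not nanoGPT's exact code):

```python
def estimate_mfu(n_params, n_layer, n_head, head_dim, block_size,
                 seqs_per_iter, dt, peak_flops=312e12):
    """Model flops utilization: achieved flops/sec over peak flops/sec.

    n_params:      non-embedding parameter count (123.65M for this run)
    seqs_per_iter: sequences processed per iteration; here batch_size *
                   gradient_accumulation_steps = 1 * 32 = 32
    dt:            measured wall-clock seconds per iteration
    peak_flops:    312e12 is A100 bfloat16 peak; adjust for other GPUs
    """
    L, H, Q, T = n_layer, n_head, head_dim, block_size
    # PaLM (Appendix B) approximation for transformer flops per token
    flops_per_token = 6 * n_params + 12 * L * H * Q * T
    # each sequence is T tokens, forward + backward, seqs_per_iter times
    flops_per_iter = flops_per_token * T * seqs_per_iter
    return (flops_per_iter / dt) / peak_flops
```

The absolute reading depends on the peak-FLOPS constant assumed for the GPU in use, so MFU figures are only comparable across runs that share that constant.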
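Because always_save_checkpoint = False, a checkpoint is written only when the validation loss improves: saves happen at steps 5, 15, 20, 30 and 35 and then stop, since val loss bottoms out at 2.5965 (step 35) and drifts back up while train loss keeps falling, the expected overfitting on a ~300K-token dataset. out-shakespeare therefore ends up holding the best-val-loss weights; to generate text from them, something like the standard nanoGPT sampling invocation should work:

```
python sample.py --out_dir=out-shakespeare
```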