Overriding config with config/finetune_shakespeare.py: import time out_dir = 'out-shakespeare' eval_interval = 5 eval_iters = 40 wandb_log = False # feel free to turn on wandb_project = 'shakespeare' wandb_run_name = 'ft-' + str(time.time()) dataset = 'shakespeare' init_from = 'gpt2' # this is the smallest GPT-2 model (~124M params; the largest is 'gpt2-xl') # only save checkpoints if the validation loss improves always_save_checkpoint = False # the number of examples per iter: # 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter # shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters batch_size = 1 gradient_accumulation_steps = 32 max_iters = 1000 # finetune at constant LR learning_rate = 3e-5 decay_lr = False Initializing from OpenAI GPT-2 weights: gpt2 loading weights from pretrained gpt: gpt2 forcing vocab_size=50257, block_size=1024, bias=True overriding dropout rate to 0.0 number of parameters: 123.65M using fused AdamW: True compiling the model... (takes a ~minute) [2023-03-20 21:31:13,957] torch._inductor.utils: [WARNING] make_fallback(aten.addmv): a decomposition exists, we should switch to it step 0: train loss 4.1871, val loss 4.0326 iter 0: loss 4.8126, time 53610.16ms, mfu -100.00% iter 1: loss 3.8469, time 22853.81ms, mfu -100.00% iter 2: loss 4.1342, time 23058.41ms, mfu -100.00% iter 3: loss 4.2060, time 23164.17ms, mfu -100.00% iter 4: loss 4.6711, time 23070.16ms, mfu -100.00% step 5: train loss 4.3096, val loss 3.9636 saving checkpoint to out-shakespeare iter 5: loss 3.4577, time 30970.06ms, mfu 2.32% iter 6: loss 2.9587, time 23298.83ms, mfu 2.40% iter 7: loss 3.2116, time 23132.08ms, mfu 2.47% iter 8: loss 3.4900, time 23106.50ms, mfu 2.53% iter 9: loss 3.8003, time 23125.60ms, mfu 2.59% step 10: train loss 3.6215, val loss 3.4816 saving checkpoint to out-shakespeare iter 10: loss 3.6364, time 30978.89ms, mfu 2.56% iter 11: loss 3.4725, time 23263.91ms, mfu 2.61% iter 12: loss 3.4080, time 23053.16ms, mfu 2.67% iter 13: loss 3.9510, time 23091.76ms, mfu 2.71% iter 14: 
loss 3.6421, time 23142.46ms, mfu 2.75% step 15: train loss 3.5292, val loss 3.2960 saving checkpoint to out-shakespeare iter 15: loss 3.2916, time 31036.47ms, mfu 2.71% iter 16: loss 3.8844, time 23232.40ms, mfu 2.74% iter 17: loss 3.2954, time 23076.36ms, mfu 2.78% iter 18: loss 2.9807, time 23073.19ms, mfu 2.81% iter 19: loss 3.4524, time 23090.94ms, mfu 2.84% step 20: train loss 3.4621, val loss 3.3625 iter 20: loss 3.3737, time 25115.53ms, mfu 2.85% iter 21: loss 3.6565, time 23165.72ms, mfu 2.87% iter 22: loss 3.3047, time 23174.77ms, mfu 2.89% iter 23: loss 3.8091, time 23135.82ms, mfu 2.92% iter 24: loss 3.1955, time 23097.90ms, mfu 2.94% step 25: train loss 3.5139, val loss 3.2854 saving checkpoint to out-shakespeare iter 25: loss 3.8481, time 30838.74ms, mfu 2.87% iter 26: loss 3.2716, time 23304.59ms, mfu 2.90% iter 27: loss 3.3729, time 23056.31ms, mfu 2.92% iter 28: loss 3.3545, time 23107.46ms, mfu 2.94% iter 29: loss 2.7101, time 23209.45ms, mfu 2.95% step 30: train loss 3.3706, val loss 3.2958 iter 30: loss 3.0968, time 25123.31ms, mfu 2.94% iter 31: loss 2.9495, time 23116.72ms, mfu 2.96% iter 32: loss 3.0179, time 23101.19ms, mfu 2.97% iter 33: loss 2.9648, time 23117.17ms, mfu 2.99% iter 34: loss 3.6522, time 23132.76ms, mfu 3.00% step 35: train loss 3.3923, val loss 3.2125 saving checkpoint to out-shakespeare iter 35: loss 3.2469, time 31079.08ms, mfu 2.93% iter 36: loss 3.1450, time 23273.02ms, mfu 2.95% iter 37: loss 3.4624, time 23046.04ms, mfu 2.96% iter 38: loss 3.4371, time 23102.73ms, mfu 2.98% iter 39: loss 3.3130, time 23178.65ms, mfu 2.99% step 40: train loss 3.3233, val loss 3.2543 iter 40: loss 3.0743, time 25069.68ms, mfu 2.98% iter 41: loss 3.1269, time 23084.39ms, mfu 2.99% iter 42: loss 3.6785, time 23076.30ms, mfu 3.00% iter 43: loss 3.3787, time 23075.87ms, mfu 3.01% iter 44: loss 3.2637, time 23098.68ms, mfu 3.02% step 45: train loss 3.1971, val loss 3.2642 iter 45: loss 3.1861, time 25003.67ms, mfu 3.01% iter 46: loss 3.4037, 
time 23106.62ms, mfu 3.02% iter 47: loss 3.4947, time 23109.37ms, mfu 3.03% iter 48: loss 3.3276, time 23098.50ms, mfu 3.04% iter 49: loss 2.9062, time 23171.38ms, mfu 3.04% step 50: train loss 3.2188, val loss 3.2460 iter 50: loss 3.5280, time 25111.46ms, mfu 3.02% iter 51: loss 3.5470, time 23143.40ms, mfu 3.03% iter 52: loss 3.1881, time 23109.22ms, mfu 3.04% iter 53: loss 3.4332, time 23083.68ms, mfu 3.05% iter 54: loss 3.1956, time 23117.10ms, mfu 3.05% step 55: train loss 3.2902, val loss 3.1846 saving checkpoint to out-shakespeare iter 55: loss 3.4816, time 31132.51ms, mfu 2.98% iter 56: loss 3.2971, time 23207.94ms, mfu 2.99% iter 57: loss 2.9543, time 23064.74ms, mfu 3.00% iter 58: loss 2.8729, time 23093.16ms, mfu 3.01% iter 59: loss 3.0883, time 23129.34ms, mfu 3.02% step 60: train loss 3.1288, val loss 3.1545 saving checkpoint to out-shakespeare iter 60: loss 3.7098, time 31022.27ms, mfu 2.95% iter 61: loss 3.4157, time 23229.02ms, mfu 2.97% iter 62: loss 3.0020, time 23059.02ms, mfu 2.98% iter 63: loss 3.0751, time 23063.51ms, mfu 2.99% iter 64: loss 2.9081, time 23134.60ms, mfu 3.01% step 65: train loss 3.2254, val loss 3.1772 iter 65: loss 3.3802, time 25114.58ms, mfu 2.99% iter 66: loss 3.1073, time 23118.96ms, mfu 3.00% iter 67: loss 3.1010, time 23081.32ms, mfu 3.01% iter 68: loss 3.2594, time 23058.54ms, mfu 3.02% iter 69: loss 3.4402, time 23062.45ms, mfu 3.03% step 70: train loss 3.1511, val loss 3.2315 iter 70: loss 3.4094, time 24967.39ms, mfu 3.02% iter 71: loss 3.0997, time 23070.28ms, mfu 3.03% iter 72: loss 2.1573, time 23072.48ms, mfu 3.04% iter 73: loss 3.3926, time 23060.80ms, mfu 3.04% iter 74: loss 3.2284, time 23080.48ms, mfu 3.05% step 75: train loss 3.1102, val loss 3.1017 saving checkpoint to out-shakespeare iter 75: loss 3.3760, time 31003.52ms, mfu 2.98% iter 76: loss 3.3387, time 23207.33ms, mfu 2.99% iter 77: loss 2.9299, time 23040.87ms, mfu 3.00% iter 78: loss 2.9623, time 23069.43ms, mfu 3.01% iter 79: loss 3.0674, time 
23111.04ms, mfu 3.02% step 80: train loss 3.0574, val loss 3.2178 iter 80: loss 2.6808, time 25072.69ms, mfu 3.01% iter 81: loss 2.7986, time 23144.88ms, mfu 3.02% iter 82: loss 2.9121, time 23094.25ms, mfu 3.03% iter 83: loss 2.7153, time 23114.27ms, mfu 3.03% iter 84: loss 2.8444, time 23089.41ms, mfu 3.04% step 85: train loss 2.9855, val loss 3.2298 iter 85: loss 3.0517, time 25033.77ms, mfu 3.03% iter 86: loss 2.5920, time 23088.89ms, mfu 3.03% iter 87: loss 3.1241, time 23084.88ms, mfu 3.04% iter 88: loss 2.5355, time 23070.40ms, mfu 3.05% iter 89: loss 3.4543, time 23060.05ms, mfu 3.06% step 90: train loss 3.0426, val loss 3.2664 iter 90: loss 3.3099, time 24997.54ms, mfu 3.04% iter 91: loss 2.8099, time 23108.94ms, mfu 3.04% iter 92: loss 3.2419, time 23103.54ms, mfu 3.05% iter 93: loss 3.4718, time 23089.71ms, mfu 3.06% iter 94: loss 3.0708, time 23137.11ms, mfu 3.06% step 95: train loss 3.0225, val loss 3.2529 iter 95: loss 2.8545, time 25072.26ms, mfu 3.04% iter 96: loss 3.3059, time 23120.57ms, mfu 3.05% iter 97: loss 2.7528, time 23111.60ms, mfu 3.06% iter 98: loss 3.1788, time 23106.26ms, mfu 3.06% iter 99: loss 2.9023, time 23103.06ms, mfu 3.07% step 100: train loss 2.9153, val loss 3.2140 iter 100: loss 3.0090, time 24968.37ms, mfu 3.05% iter 101: loss 3.0753, time 23093.87ms, mfu 3.05% iter 102: loss 3.1295, time 23108.81ms, mfu 3.06% iter 103: loss 2.9033, time 23136.51ms, mfu 3.06% iter 104: loss 3.1117, time 23127.17ms, mfu 3.07% step 105: train loss 2.9402, val loss 3.2071 iter 105: loss 2.8862, time 25050.88ms, mfu 3.05% iter 106: loss 2.6040, time 23141.23ms, mfu 3.05% iter 107: loss 3.1831, time 23146.47ms, mfu 3.06% iter 108: loss 3.1619, time 23078.47ms, mfu 3.06% iter 109: loss 3.0995, time 23098.26ms, mfu 3.07% step 110: train loss 2.7568, val loss 3.2857 iter 110: loss 3.0392, time 24959.72ms, mfu 3.05% iter 111: loss 3.1982, time 23121.36ms, mfu 3.06% iter 112: loss 3.1794, time 23124.92ms, mfu 3.06% iter 113: loss 2.8230, time 
23138.96ms, mfu 3.07% iter 114: loss 2.2634, time 23121.12ms, mfu 3.07% step 115: train loss 2.8576, val loss 3.2603 iter 115: loss 3.0414, time 24960.16ms, mfu 3.05% iter 116: loss 2.2827, time 23077.89ms, mfu 3.06% iter 117: loss 2.5435, time 23054.11ms, mfu 3.06%