Overriding config with config/finetune_shakespeare.py:

import time

out_dir = 'out-shakespeare'
eval_interval = 5
eval_iters = 40
wandb_log = False # feel free to turn on
wandb_project = 'shakespeare'
wandb_run_name = 'ft-' + str(time.time())

dataset = 'shakespeare'
init_from = 'gpt2' # the smallest GPT-2 model (124M parameters)

# only save checkpoints if the validation loss improves
always_save_checkpoint = False

# the number of examples per iter:
# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter
# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters
batch_size = 1
gradient_accumulation_steps = 32
max_iters = 1000

# finetune at constant LR
learning_rate = 3e-5
decay_lr = False
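
For context: this is console output from nanoGPT's train.py. After tokenizing the dataset once with data/shakespeare/prepare.py, the run is launched as "python train.py config/finetune_shakespeare.py". train.py applies a config file like the one echoed above by executing it over its own globals, so each assignment simply overrides a default. A minimal sketch of that pattern, with hypothetical names rather than the verbatim configurator:

import sys

# defaults, normally defined near the top of the training script
out_dir = 'out'
learning_rate = 6e-4
decay_lr = True

for arg in sys.argv[1:]:
    if not arg.startswith('--'):
        # treat the argument as a config file: echo it, then exec it in
        # our globals so its assignments override the defaults above
        print(f"Overriding config with {arg}:")
        source = open(arg).read()
        print(source)
        exec(source)

With the overrides applied, initialization proceeds: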

Initializing from OpenAI GPT-2 weights: gpt2
loading weights from pretrained gpt: gpt2
forcing vocab_size=50257, block_size=1024, bias=True
overriding dropout rate to 0.0
number of parameters: 123.65M
using fused AdamW: True
compiling the model... (takes a ~minute)
[2023-03-20 21:31:13,957] torch._inductor.utils: [WARNING] make_fallback(aten.addmv): a decomposition exists, we should switch to it
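
From here the training loop runs. Every eval_interval = 5 iterations the script estimates train/val loss over eval_iters = 40 batches and prints a "step" line; because always_save_checkpoint = False, a checkpoint is written only when validation loss improves. A self-contained paraphrase of that logic (names assumed, not the verbatim train.py):

import os
import torch

eval_interval = 5
always_save_checkpoint = False
out_dir = 'out-shakespeare'
best_val_loss = float('inf')

def maybe_eval_and_checkpoint(iter_num, estimate_loss, checkpoint):
    """Evaluate on an eval step; save only if validation loss improved."""
    global best_val_loss
    if iter_num % eval_interval != 0:
        return
    # estimate_loss() returns e.g. {'train': ..., 'val': ...},
    # each a mean over eval_iters batches of the respective split
    losses = estimate_loss()
    print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    if losses['val'] < best_val_loss or always_save_checkpoint:
        best_val_loss = losses['val']
        print(f"saving checkpoint to {out_dir}")
        os.makedirs(out_dir, exist_ok=True)
        torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))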

step 0: train loss 4.1871, val loss 4.0326
iter 0: loss 4.8126, time 53610.16ms, mfu -100.00%
iter 1: loss 3.8469, time 22853.81ms, mfu -100.00%
iter 2: loss 4.1342, time 23058.41ms, mfu -100.00%
iter 3: loss 4.2060, time 23164.17ms, mfu -100.00%
iter 4: loss 4.6711, time 23070.16ms, mfu -100.00%
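
The mfu column is model flops utilization: achieved FLOPs as a fraction of a reference peak (nanoGPT measures against the A100's 312 TFLOPS bfloat16 peak). During the first few iterations, while timing is still warming up, it is printed as the sentinel -100.00%; real values start at iter 5 below. The estimate uses the standard approximation of 6N + 12*L*H*Q*T FLOPs per token; a sketch in that style (argument names assumed):

def estimate_mfu(n_params, n_layer, n_head, head_dim, block_size,
                 fwdbwd_per_iter, dt, peak_flops=312e12):
    """Fraction of peak FLOPS achieved; dt is seconds per iteration."""
    # 6*N for the parameters plus 12*L*H*Q*T for attention, per token
    flops_per_token = 6 * n_params + 12 * n_layer * n_head * head_dim * block_size
    # block_size tokens per sequence, fwdbwd_per_iter sequences per iteration
    flops_per_iter = flops_per_token * block_size * fwdbwd_per_iter
    return (flops_per_iter / dt) / peak_flops

The low single-digit percentages in this log simply mean the hardware used here delivers a small fraction of A100-class throughput.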

step 5: train loss 4.3096, val loss 3.9636
saving checkpoint to out-shakespeare
iter 5: loss 3.4577, time 30970.06ms, mfu 2.32%
iter 6: loss 2.9587, time 23298.83ms, mfu 2.40%
iter 7: loss 3.2116, time 23132.08ms, mfu 2.47%
iter 8: loss 3.4900, time 23106.50ms, mfu 2.53%
iter 9: loss 3.8003, time 23125.60ms, mfu 2.59%
step 10: train loss 3.6215, val loss 3.4816
saving checkpoint to out-shakespeare
iter 10: loss 3.6364, time 30978.89ms, mfu 2.56%
iter 11: loss 3.4725, time 23263.91ms, mfu 2.61%
iter 12: loss 3.4080, time 23053.16ms, mfu 2.67%
iter 13: loss 3.9510, time 23091.76ms, mfu 2.71%
iter 14: loss 3.6421, time 23142.46ms, mfu 2.75%
step 15: train loss 3.5292, val loss 3.2960
saving checkpoint to out-shakespeare
iter 15: loss 3.2916, time 31036.47ms, mfu 2.71%
iter 16: loss 3.8844, time 23232.40ms, mfu 2.74%
iter 17: loss 3.2954, time 23076.36ms, mfu 2.78%
iter 18: loss 2.9807, time 23073.19ms, mfu 2.81%
iter 19: loss 3.4524, time 23090.94ms, mfu 2.84%
step 20: train loss 3.4621, val loss 3.3625
iter 20: loss 3.3737, time 25115.53ms, mfu 2.85%
iter 21: loss 3.6565, time 23165.72ms, mfu 2.87%
iter 22: loss 3.3047, time 23174.77ms, mfu 2.89%
iter 23: loss 3.8091, time 23135.82ms, mfu 2.92%
iter 24: loss 3.1955, time 23097.90ms, mfu 2.94%
step 25: train loss 3.5139, val loss 3.2854
saving checkpoint to out-shakespeare
iter 25: loss 3.8481, time 30838.74ms, mfu 2.87%
iter 26: loss 3.2716, time 23304.59ms, mfu 2.90%
iter 27: loss 3.3729, time 23056.31ms, mfu 2.92%
iter 28: loss 3.3545, time 23107.46ms, mfu 2.94%
iter 29: loss 2.7101, time 23209.45ms, mfu 2.95%
step 30: train loss 3.3706, val loss 3.2958
iter 30: loss 3.0968, time 25123.31ms, mfu 2.94%
iter 31: loss 2.9495, time 23116.72ms, mfu 2.96%
iter 32: loss 3.0179, time 23101.19ms, mfu 2.97%
iter 33: loss 2.9648, time 23117.17ms, mfu 2.99%
iter 34: loss 3.6522, time 23132.76ms, mfu 3.00%
step 35: train loss 3.3923, val loss 3.2125
saving checkpoint to out-shakespeare
iter 35: loss 3.2469, time 31079.08ms, mfu 2.93%
iter 36: loss 3.1450, time 23273.02ms, mfu 2.95%
iter 37: loss 3.4624, time 23046.04ms, mfu 2.96%
iter 38: loss 3.4371, time 23102.73ms, mfu 2.98%
iter 39: loss 3.3130, time 23178.65ms, mfu 2.99%
step 40: train loss 3.3233, val loss 3.2543
iter 40: loss 3.0743, time 25069.68ms, mfu 2.98%
iter 41: loss 3.1269, time 23084.39ms, mfu 2.99%
iter 42: loss 3.6785, time 23076.30ms, mfu 3.00%
iter 43: loss 3.3787, time 23075.87ms, mfu 3.01%
iter 44: loss 3.2637, time 23098.68ms, mfu 3.02%
step 45: train loss 3.1971, val loss 3.2642
iter 45: loss 3.1861, time 25003.67ms, mfu 3.01%
iter 46: loss 3.4037, time 23106.62ms, mfu 3.02%
iter 47: loss 3.4947, time 23109.37ms, mfu 3.03%
iter 48: loss 3.3276, time 23098.50ms, mfu 3.04%
iter 49: loss 2.9062, time 23171.38ms, mfu 3.04%
step 50: train loss 3.2188, val loss 3.2460
iter 50: loss 3.5280, time 25111.46ms, mfu 3.02%
iter 51: loss 3.5470, time 23143.40ms, mfu 3.03%
iter 52: loss 3.1881, time 23109.22ms, mfu 3.04%
iter 53: loss 3.4332, time 23083.68ms, mfu 3.05%
iter 54: loss 3.1956, time 23117.10ms, mfu 3.05%
step 55: train loss 3.2902, val loss 3.1846
saving checkpoint to out-shakespeare
iter 55: loss 3.4816, time 31132.51ms, mfu 2.98%
iter 56: loss 3.2971, time 23207.94ms, mfu 2.99%
iter 57: loss 2.9543, time 23064.74ms, mfu 3.00%
iter 58: loss 2.8729, time 23093.16ms, mfu 3.01%
iter 59: loss 3.0883, time 23129.34ms, mfu 3.02%
step 60: train loss 3.1288, val loss 3.1545
saving checkpoint to out-shakespeare
iter 60: loss 3.7098, time 31022.27ms, mfu 2.95%
iter 61: loss 3.4157, time 23229.02ms, mfu 2.97%
iter 62: loss 3.0020, time 23059.02ms, mfu 2.98%
iter 63: loss 3.0751, time 23063.51ms, mfu 2.99%
iter 64: loss 2.9081, time 23134.60ms, mfu 3.01%
step 65: train loss 3.2254, val loss 3.1772
iter 65: loss 3.3802, time 25114.58ms, mfu 2.99%
iter 66: loss 3.1073, time 23118.96ms, mfu 3.00%
iter 67: loss 3.1010, time 23081.32ms, mfu 3.01%
iter 68: loss 3.2594, time 23058.54ms, mfu 3.02%
iter 69: loss 3.4402, time 23062.45ms, mfu 3.03%
step 70: train loss 3.1511, val loss 3.2315
iter 70: loss 3.4094, time 24967.39ms, mfu 3.02%
iter 71: loss 3.0997, time 23070.28ms, mfu 3.03%
iter 72: loss 2.1573, time 23072.48ms, mfu 3.04%
iter 73: loss 3.3926, time 23060.80ms, mfu 3.04%
iter 74: loss 3.2284, time 23080.48ms, mfu 3.05%
step 75: train loss 3.1102, val loss 3.1017
saving checkpoint to out-shakespeare
iter 75: loss 3.3760, time 31003.52ms, mfu 2.98%
iter 76: loss 3.3387, time 23207.33ms, mfu 2.99%
iter 77: loss 2.9299, time 23040.87ms, mfu 3.00%
iter 78: loss 2.9623, time 23069.43ms, mfu 3.01%
iter 79: loss 3.0674, time 23111.04ms, mfu 3.02%
step 80: train loss 3.0574, val loss 3.2178
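
Note the trajectory here: validation loss bottoms out at 3.1017 at step 75 and drifts upward from this point on, even as train loss keeps falling. At 32,768 tokens per iteration against a ~302K-token dataset (about 9.2 iterations per epoch), the model has already seen the data roughly eight times by step 75 and is starting to overfit. This is exactly what always_save_checkpoint = False guards against: only the improving evals write a checkpoint, so the step-75 weights are preserved.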

iter 80: loss 2.6808, time 25072.69ms, mfu 3.01%
iter 81: loss 2.7986, time 23144.88ms, mfu 3.02%
iter 82: loss 2.9121, time 23094.25ms, mfu 3.03%
iter 83: loss 2.7153, time 23114.27ms, mfu 3.03%
iter 84: loss 2.8444, time 23089.41ms, mfu 3.04%
step 85: train loss 2.9855, val loss 3.2298
iter 85: loss 3.0517, time 25033.77ms, mfu 3.03%
iter 86: loss 2.5920, time 23088.89ms, mfu 3.03%
iter 87: loss 3.1241, time 23084.88ms, mfu 3.04%
iter 88: loss 2.5355, time 23070.40ms, mfu 3.05%
iter 89: loss 3.4543, time 23060.05ms, mfu 3.06%
step 90: train loss 3.0426, val loss 3.2664
iter 90: loss 3.3099, time 24997.54ms, mfu 3.04%
iter 91: loss 2.8099, time 23108.94ms, mfu 3.04%
iter 92: loss 3.2419, time 23103.54ms, mfu 3.05%
iter 93: loss 3.4718, time 23089.71ms, mfu 3.06%
iter 94: loss 3.0708, time 23137.11ms, mfu 3.06%
step 95: train loss 3.0225, val loss 3.2529
iter 95: loss 2.8545, time 25072.26ms, mfu 3.04%
iter 96: loss 3.3059, time 23120.57ms, mfu 3.05%
iter 97: loss 2.7528, time 23111.60ms, mfu 3.06%
iter 98: loss 3.1788, time 23106.26ms, mfu 3.06%
iter 99: loss 2.9023, time 23103.06ms, mfu 3.07%
step 100: train loss 2.9153, val loss 3.2140
iter 100: loss 3.0090, time 24968.37ms, mfu 3.05%
iter 101: loss 3.0753, time 23093.87ms, mfu 3.05%
iter 102: loss 3.1295, time 23108.81ms, mfu 3.06%
iter 103: loss 2.9033, time 23136.51ms, mfu 3.06%
iter 104: loss 3.1117, time 23127.17ms, mfu 3.07%
step 105: train loss 2.9402, val loss 3.2071
iter 105: loss 2.8862, time 25050.88ms, mfu 3.05%
iter 106: loss 2.6040, time 23141.23ms, mfu 3.05%
iter 107: loss 3.1831, time 23146.47ms, mfu 3.06%
iter 108: loss 3.1619, time 23078.47ms, mfu 3.06%
iter 109: loss 3.0995, time 23098.26ms, mfu 3.07%
step 110: train loss 2.7568, val loss 3.2857
iter 110: loss 3.0392, time 24959.72ms, mfu 3.05%
iter 111: loss 3.1982, time 23121.36ms, mfu 3.06%
iter 112: loss 3.1794, time 23124.92ms, mfu 3.06%
iter 113: loss 2.8230, time 23138.96ms, mfu 3.07%
iter 114: loss 2.2634, time 23121.12ms, mfu 3.07%
step 115: train loss 2.8576, val loss 3.2603
iter 115: loss 3.0414, time 24960.16ms, mfu 3.05%
iter 116: loss 2.2827, time 23077.89ms, mfu 3.06%
iter 117: loss 2.5435, time 23054.11ms, mfu 3.06%
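
The output is truncated here at iter 117; with max_iters = 1000 a full run would continue. As of the last line shown, the best checkpoint (lowest validation loss) is the one written at step 75 to out-shakespeare. In a stock nanoGPT checkout you can then sample from the finetuned model with "python sample.py --out_dir=out-shakespeare".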