|
Overriding config with config/finetune_shakespeare.py: |
|
import time |
|
|
|
out_dir = 'out-shakespeare' |
|
eval_interval = 5 |
|
eval_iters = 40 |
|
wandb_log = False # feel free to turn on |
|
wandb_project = 'shakespeare' |
|
wandb_run_name = 'ft-' + str(time.time()) |
|
|
|
dataset = 'shakespeare' |
|
init_from = 'gpt2' # this is the smallest GPT-2 model (124M params) |
|
|
|
# only save checkpoints if the validation loss improves |
|
always_save_checkpoint = False |
|
|
|
# the number of examples per iter: |
|
# 1 batch_size * 32 grad_accum * 1024 tokens = 32,768 tokens/iter |
|
# shakespeare has 301,966 tokens, so 1 epoch ~= 9.2 iters |
|
batch_size = 1 |
|
gradient_accumulation_steps = 32 |
|
max_iters = 300 |
|
|
|
# finetune at constant LR |
|
learning_rate = 3e-5 |
|
decay_lr = False |
|
|
|
Initializing from OpenAI GPT-2 weights: gpt2 |
|
loading weights from pretrained gpt: gpt2 |
|
forcing vocab_size=50257, block_size=1024, bias=True |
|
overriding dropout rate to 0.0 |
|
number of parameters: 123.65M |
|
Downloading (…)lve/main/config.json: 100% 665/665 [00:00<00:00, 88.4kB/s] |
|
Downloading pytorch_model.bin: 100% 548M/548M [00:01<00:00, 289MB/s] |
|
Downloading (…)neration_config.json: 100% 124/124 [00:00<00:00, 22.5kB/s] |
|
using fused AdamW: True |
|
compiling the model... (takes a ~minute) |
|
[2023-03-21 06:17:18,366] torch._inductor.utils: [WARNING] make_fallback(aten.addmv): a decomposition exists, we should switch to it |
|
step 0: train loss 3.3086, val loss 3.2349 |
|
iter 0: loss 3.4443, time 75907.68ms, mfu -100.00% |
|
iter 1: loss 3.6624, time 23156.16ms, mfu -100.00% |
|
iter 2: loss 4.4039, time 23248.46ms, mfu -100.00% |
|
iter 3: loss 3.2693, time 22877.27ms, mfu -100.00% |
|
iter 4: loss 3.4597, time 22906.52ms, mfu -100.00% |
|
step 5: train loss 3.2166, val loss 3.2212 |
|
saving checkpoint to out-shakespeare |
|
iter 5: loss 3.2885, time 30843.38ms, mfu 2.33% |
|
iter 6: loss 3.2423, time 23117.67ms, mfu 2.41% |
|
iter 7: loss 3.2239, time 23014.83ms, mfu 2.48% |
|
iter 8: loss 3.3878, time 23083.71ms, mfu 2.54% |
|
iter 9: loss 3.0245, time 23127.68ms, mfu 2.60% |
|
step 10: train loss 3.1367, val loss 3.0886 |
|
saving checkpoint to out-shakespeare |
|
iter 10: loss 3.2588, time 31026.66ms, mfu 2.57% |
|
iter 11: loss 2.7963, time 23215.41ms, mfu 2.62% |
|
iter 12: loss 3.0799, time 23045.69ms, mfu 2.67% |
|
iter 13: loss 3.0391, time 23081.70ms, mfu 2.72% |
|
iter 14: loss 2.9285, time 23144.99ms, mfu 2.76% |
|
step 15: train loss 3.0614, val loss 3.0357 |
|
saving checkpoint to out-shakespeare |
|
iter 15: loss 2.9088, time 31131.17ms, mfu 2.71% |
|
iter 16: loss 2.8854, time 23203.33ms, mfu 2.75% |
|
iter 17: loss 2.8941, time 23045.51ms, mfu 2.79% |
|
iter 18: loss 3.1116, time 23058.43ms, mfu 2.82% |
|
iter 19: loss 3.1542, time 23076.86ms, mfu 2.85% |
|
step 20: train loss 2.9382, val loss 2.9662 |
|
saving checkpoint to out-shakespeare |
|
iter 20: loss 2.8674, time 30800.95ms, mfu 2.80% |
|
iter 21: loss 3.0158, time 23210.44ms, mfu 2.83% |
|
iter 22: loss 3.0376, time 23028.93ms, mfu 2.86% |
|
iter 23: loss 2.5614, time 23053.57ms, mfu 2.88% |
|
iter 24: loss 3.0086, time 23135.53ms, mfu 2.90% |
|
step 25: train loss 2.9386, val loss 2.9689 |
|
iter 25: loss 2.8633, time 25037.75ms, mfu 2.90% |
|
iter 26: loss 3.2887, time 23087.04ms, mfu 2.92% |
|
iter 27: loss 2.7507, time 23061.28ms, mfu 2.94% |
|
iter 28: loss 3.0676, time 23047.93ms, mfu 2.96% |
|
iter 29: loss 2.7316, time 23042.36ms, mfu 2.98% |
|
step 30: train loss 2.9721, val loss 2.9042 |
|
saving checkpoint to out-shakespeare |
|
iter 30: loss 2.7163, time 30867.03ms, mfu 2.91% |
|
iter 31: loss 2.9423, time 23225.75ms, mfu 2.93% |
|
iter 32: loss 2.9405, time 23012.47ms, mfu 2.95% |
|
iter 33: loss 2.9208, time 23059.76ms, mfu 2.96% |
|
iter 34: loss 2.9996, time 23121.13ms, mfu 2.98% |
|
step 35: train loss 2.9496, val loss 2.8374 |
|
saving checkpoint to out-shakespeare |
|
iter 35: loss 2.8072, time 31122.96ms, mfu 2.91% |
|
iter 36: loss 2.9798, time 23209.16ms, mfu 2.93% |
|
iter 37: loss 2.8476, time 23019.32ms, mfu 2.95% |
|
iter 38: loss 2.7276, time 23056.09ms, mfu 2.97% |
|
iter 39: loss 2.8636, time 23101.19ms, mfu 2.98% |
|
step 40: train loss 2.8282, val loss 2.9073 |
|
iter 40: loss 2.7667, time 25022.64ms, mfu 2.97% |
|
iter 41: loss 2.6111, time 23100.99ms, mfu 2.98% |
|
iter 42: loss 3.1776, time 23107.88ms, mfu 3.00% |
|
iter 43: loss 2.7963, time 23090.82ms, mfu 3.01% |
|
iter 44: loss 3.2658, time 23084.78ms, mfu 3.02% |
|
step 45: train loss 2.8171, val loss 2.8487 |
|
iter 45: loss 3.0523, time 24981.39ms, mfu 3.00% |
|
iter 46: loss 2.6204, time 23087.28ms, mfu 3.01% |
|
iter 47: loss 2.8938, time 23081.95ms, mfu 3.02% |
|
iter 48: loss 3.1726, time 23092.57ms, mfu 3.03% |
|
iter 49: loss 3.7836, time 23077.55ms, mfu 3.04% |
|
step 50: train loss 2.8675, val loss 2.7787 |
|
saving checkpoint to out-shakespeare |
|
iter 50: loss 3.0882, time 30881.37ms, mfu 2.97% |
|
iter 51: loss 2.8358, time 23200.14ms, mfu 2.98% |
|
iter 52: loss 2.9847, time 23008.69ms, mfu 3.00% |
|
iter 53: loss 3.1992, time 23066.07ms, mfu 3.01% |
|
iter 54: loss 2.4085, time 23118.93ms, mfu 3.02% |
|
step 55: train loss 2.8049, val loss 2.7507 |
|
saving checkpoint to out-shakespeare |
|
iter 55: loss 2.9964, time 31115.78ms, mfu 2.95% |
|
iter 56: loss 2.9647, time 23212.73ms, mfu 2.96% |
|
iter 57: loss 2.8880, time 23003.95ms, mfu 2.98% |
|
iter 58: loss 2.8726, time 23053.90ms, mfu 2.99% |
|
iter 59: loss 2.6470, time 23124.33ms, mfu 3.00% |
|
step 60: train loss 2.8041, val loss 2.8827 |
|
iter 60: loss 2.8115, time 24978.80ms, mfu 2.99% |
|
iter 61: loss 2.6765, time 23058.07ms, mfu 3.00% |
|
iter 62: loss 2.6801, time 23052.27ms, mfu 3.01% |
|
iter 63: loss 3.4295, time 23048.58ms, mfu 3.03% |
|
iter 64: loss 2.5933, time 23062.70ms, mfu 3.03% |
|
step 65: train loss 2.7894, val loss 2.7606 |
|
iter 65: loss 2.5231, time 24991.85ms, mfu 3.02% |
|
iter 66: loss 2.8913, time 23099.31ms, mfu 3.03% |
|
iter 67: loss 2.9515, time 23106.81ms, mfu 3.04% |
|
iter 68: loss 2.8017, time 23098.12ms, mfu 3.04% |
|
iter 69: loss 2.7759, time 23110.16ms, mfu 3.05% |
|
step 70: train loss 2.8044, val loss 2.8498 |
|
iter 70: loss 2.9694, time 25009.31ms, mfu 3.03% |
|
iter 71: loss 3.3238, time 23090.32ms, mfu 3.04% |
|
iter 72: loss 2.6931, time 23086.35ms, mfu 3.05% |
|
iter 73: loss 2.6097, time 23085.74ms, mfu 3.05% |
|
iter 74: loss 2.1781, time 23096.25ms, mfu 3.06% |
|
step 75: train loss 2.7755, val loss 2.6869 |
|
saving checkpoint to out-shakespeare |
|
iter 75: loss 2.9208, time 30879.90ms, mfu 2.99% |
|
iter 76: loss 2.7619, time 23186.69ms, mfu 3.00% |
|
iter 77: loss 2.8394, time 23017.46ms, mfu 3.01% |
|
iter 78: loss 2.5907, time 23049.26ms, mfu 3.02% |
|
iter 79: loss 2.5660, time 23102.38ms, mfu 3.03% |
|
step 80: train loss 2.7759, val loss 2.7603 |
|
iter 80: loss 2.6889, time 25011.13ms, mfu 3.01% |
|
iter 81: loss 2.6940, time 23088.64ms, mfu 3.02% |
|
iter 82: loss 2.6596, time 23050.35ms, mfu 3.03% |
|
iter 83: loss 2.7638, time 23066.22ms, mfu 3.04% |
|
iter 84: loss 2.6515, time 23059.01ms, mfu 3.05% |
|
step 85: train loss 2.7404, val loss 2.7290 |
|
iter 85: loss 3.1829, time 24970.26ms, mfu 3.03% |
|
iter 86: loss 2.5451, time 23052.03ms, mfu 3.04% |
|
iter 87: loss 2.4363, time 23051.53ms, mfu 3.05% |
|
iter 88: loss 2.8023, time 23039.12ms, mfu 3.05% |
|
iter 89: loss 2.4755, time 23044.45ms, mfu 3.06% |
|
step 90: train loss 2.7140, val loss 2.7692 |
|
iter 90: loss 2.7225, time 24960.52ms, mfu 3.04% |
|
iter 91: loss 2.4655, time 23037.54ms, mfu 3.05% |
|
iter 92: loss 2.5291, time 23029.37ms, mfu 3.06% |
|
iter 93: loss 2.7720, time 23032.99ms, mfu 3.06% |
|
iter 94: loss 2.7614, time 23039.50ms, mfu 3.07% |
|
step 95: train loss 2.7932, val loss 2.7953 |
|
iter 95: loss 2.6881, time 24974.66ms, mfu 3.05% |
|
iter 96: loss 2.9315, time 23044.89ms, mfu 3.06% |
|
iter 97: loss 2.7099, time 23035.52ms, mfu 3.06% |
|
iter 98: loss 2.6858, time 23036.10ms, mfu 3.07% |
|
iter 99: loss 2.5341, time 23048.24ms, mfu 3.07% |
|
step 100: train loss 2.6788, val loss 2.8138 |
|
iter 100: loss 2.7993, time 25008.37ms, mfu 3.05% |
|
iter 101: loss 2.5996, time 23052.62ms, mfu 3.06% |
|
iter 102: loss 2.7768, time 23059.09ms, mfu 3.07% |
|
iter 103: loss 2.6378, time 23046.82ms, mfu 3.07% |
|
iter 104: loss 2.7511, time 23043.40ms, mfu 3.08% |
|
step 105: train loss 2.7542, val loss 2.6568 |
|
saving checkpoint to out-shakespeare |
|
iter 105: loss 2.6596, time 31000.96ms, mfu 3.00% |
|
iter 106: loss 2.8566, time 23195.71ms, mfu 3.01% |
|
iter 107: loss 2.6284, time 22995.46ms, mfu 3.02% |
|
iter 108: loss 2.6670, time 23031.45ms, mfu 3.03% |
|
iter 109: loss 2.4732, time 23093.11ms, mfu 3.04% |
|
step 110: train loss 2.7094, val loss 2.6684 |
|
iter 110: loss 2.5577, time 25028.10ms, mfu 3.02% |
|
iter 111: loss 2.9250, time 23089.98ms, mfu 3.03% |
|
iter 112: loss 2.6274, time 23072.14ms, mfu 3.04% |
|
iter 113: loss 2.5337, time 23078.52ms, mfu 3.05% |
|
iter 114: loss 2.7248, time 23061.41ms, mfu 3.05% |
|
step 115: train loss 2.7062, val loss 2.7398 |
|
iter 115: loss 2.7654, time 24968.79ms, mfu 3.04% |
|
iter 116: loss 2.6394, time 23049.91ms, mfu 3.04% |
|
iter 117: loss 2.5259, time 23068.72ms, mfu 3.05% |
|
iter 118: loss 2.8312, time 23061.73ms, mfu 3.06% |
|
iter 119: loss 2.6137, time 23049.41ms, mfu 3.06% |
|
step 120: train loss 2.6704, val loss 2.7120 |
|
iter 120: loss 2.6794, time 24958.89ms, mfu 3.05% |
|
iter 121: loss 2.7400, time 23040.45ms, mfu 3.05% |
|
iter 122: loss 2.6322, time 23047.61ms, mfu 3.06% |
|
iter 123: loss 2.4416, time 23062.33ms, mfu 3.06% |
|
iter 124: loss 2.6756, time 23048.99ms, mfu 3.07% |
|
step 125: train loss 2.5866, val loss 2.6882 |
|
iter 125: loss 2.6490, time 24950.30ms, mfu 3.05% |
|
iter 126: loss 2.5888, time 23027.86ms, mfu 3.06% |
|
iter 127: loss 2.3960, time 23012.31ms, mfu 3.06% |
|
iter 128: loss 2.6581, time 23025.51ms, mfu 3.07% |
|
iter 129: loss 2.6202, time 23042.65ms, mfu 3.07% |
|
step 130: train loss 2.6151, val loss 2.6532 |
|
saving checkpoint to out-shakespeare |
|
iter 130: loss 2.8148, time 31009.76ms, mfu 3.00% |