{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9982905982905983, "eval_steps": 500, "global_step": 2631, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011396011396011397, "grad_norm": 1.1062365284901086, "learning_rate": 5e-06, "loss": 0.7552, "step": 10 }, { "epoch": 0.022792022792022793, "grad_norm": 1.023752202077829, "learning_rate": 5e-06, "loss": 0.7051, "step": 20 }, { "epoch": 0.03418803418803419, "grad_norm": 1.0176219936883037, "learning_rate": 5e-06, "loss": 0.6821, "step": 30 }, { "epoch": 0.045584045584045586, "grad_norm": 0.7858042820189418, "learning_rate": 5e-06, "loss": 0.6856, "step": 40 }, { "epoch": 0.05698005698005698, "grad_norm": 0.7933715158385674, "learning_rate": 5e-06, "loss": 0.6637, "step": 50 }, { "epoch": 0.06837606837606838, "grad_norm": 0.8447246301495516, "learning_rate": 5e-06, "loss": 0.6622, "step": 60 }, { "epoch": 0.07977207977207977, "grad_norm": 0.5473425796189046, "learning_rate": 5e-06, "loss": 0.6663, "step": 70 }, { "epoch": 0.09116809116809117, "grad_norm": 0.4937146235526688, "learning_rate": 5e-06, "loss": 0.6571, "step": 80 }, { "epoch": 0.10256410256410256, "grad_norm": 0.44446776727990156, "learning_rate": 5e-06, "loss": 0.6638, "step": 90 }, { "epoch": 0.11396011396011396, "grad_norm": 0.41849443527281166, "learning_rate": 5e-06, "loss": 0.6579, "step": 100 }, { "epoch": 0.12535612535612536, "grad_norm": 0.45940085033829986, "learning_rate": 5e-06, "loss": 0.6526, "step": 110 }, { "epoch": 0.13675213675213677, "grad_norm": 0.45931809899313636, "learning_rate": 5e-06, "loss": 0.6369, "step": 120 }, { "epoch": 0.14814814814814814, "grad_norm": 0.40591863341923856, "learning_rate": 5e-06, "loss": 0.6341, "step": 130 }, { "epoch": 0.15954415954415954, "grad_norm": 0.42649491292164343, "learning_rate": 5e-06, "loss": 0.6573, "step": 140 }, { "epoch": 0.17094017094017094, "grad_norm": 0.42419739622977437, "learning_rate": 5e-06, "loss": 0.6449, "step": 150 }, { "epoch": 0.18233618233618235, "grad_norm": 0.41999442196069786, "learning_rate": 5e-06, "loss": 0.6558, "step": 160 }, { "epoch": 0.19373219373219372, "grad_norm": 0.4637200181201795, "learning_rate": 5e-06, "loss": 0.6287, "step": 170 }, { "epoch": 0.20512820512820512, "grad_norm": 0.41547705456707573, "learning_rate": 5e-06, "loss": 0.6439, "step": 180 }, { "epoch": 0.21652421652421652, "grad_norm": 0.43068607713697277, "learning_rate": 5e-06, "loss": 0.6396, "step": 190 }, { "epoch": 0.22792022792022792, "grad_norm": 0.4469322057262852, "learning_rate": 5e-06, "loss": 0.6356, "step": 200 }, { "epoch": 0.23931623931623933, "grad_norm": 0.4680911563203023, "learning_rate": 5e-06, "loss": 0.6306, "step": 210 }, { "epoch": 0.25071225071225073, "grad_norm": 0.4095294859092795, "learning_rate": 5e-06, "loss": 0.6277, "step": 220 }, { "epoch": 0.2621082621082621, "grad_norm": 0.45759740926828324, "learning_rate": 5e-06, "loss": 0.6339, "step": 230 }, { "epoch": 0.27350427350427353, "grad_norm": 0.4140379707131278, "learning_rate": 5e-06, "loss": 0.641, "step": 240 }, { "epoch": 0.2849002849002849, "grad_norm": 0.4150377896672994, "learning_rate": 5e-06, "loss": 0.6372, "step": 250 }, { "epoch": 0.2962962962962963, "grad_norm": 0.402341599576737, "learning_rate": 5e-06, "loss": 0.6403, "step": 260 }, { "epoch": 0.3076923076923077, "grad_norm": 0.42585340932157245, "learning_rate": 5e-06, "loss": 0.6415, "step": 270 }, { "epoch": 0.3190883190883191, "grad_norm": 0.45653778556147656, "learning_rate": 5e-06, "loss": 0.6399, "step": 280 }, { "epoch": 0.33048433048433046, "grad_norm": 0.4195393934267986, "learning_rate": 5e-06, "loss": 0.6336, "step": 290 }, { "epoch": 0.3418803418803419, "grad_norm": 0.46802670579447797, "learning_rate": 5e-06, "loss": 0.6337, "step": 300 }, { "epoch": 0.35327635327635326, "grad_norm": 0.4203687337846972, "learning_rate": 5e-06, "loss": 0.637, "step": 310 }, { "epoch": 0.3646723646723647, "grad_norm": 0.421822849143681, "learning_rate": 5e-06, "loss": 0.64, "step": 320 }, { "epoch": 0.37606837606837606, "grad_norm": 0.4283596513144174, "learning_rate": 5e-06, "loss": 0.6252, "step": 330 }, { "epoch": 0.38746438746438744, "grad_norm": 0.42324206057009117, "learning_rate": 5e-06, "loss": 0.636, "step": 340 }, { "epoch": 0.39886039886039887, "grad_norm": 0.4062746526152, "learning_rate": 5e-06, "loss": 0.6387, "step": 350 }, { "epoch": 0.41025641025641024, "grad_norm": 0.43787430045384385, "learning_rate": 5e-06, "loss": 0.6338, "step": 360 }, { "epoch": 0.42165242165242167, "grad_norm": 0.4067600081663935, "learning_rate": 5e-06, "loss": 0.6337, "step": 370 }, { "epoch": 0.43304843304843305, "grad_norm": 0.405651618692542, "learning_rate": 5e-06, "loss": 0.6235, "step": 380 }, { "epoch": 0.4444444444444444, "grad_norm": 0.39893273449497857, "learning_rate": 5e-06, "loss": 0.6305, "step": 390 }, { "epoch": 0.45584045584045585, "grad_norm": 0.4181843535226299, "learning_rate": 5e-06, "loss": 0.6424, "step": 400 }, { "epoch": 0.4672364672364672, "grad_norm": 0.42805555885189545, "learning_rate": 5e-06, "loss": 0.6319, "step": 410 }, { "epoch": 0.47863247863247865, "grad_norm": 0.44895050300003103, "learning_rate": 5e-06, "loss": 0.6332, "step": 420 }, { "epoch": 0.49002849002849, "grad_norm": 0.3919946319959885, "learning_rate": 5e-06, "loss": 0.6328, "step": 430 }, { "epoch": 0.5014245014245015, "grad_norm": 0.415410598131448, "learning_rate": 5e-06, "loss": 0.6402, "step": 440 }, { "epoch": 0.5128205128205128, "grad_norm": 0.4393590856709396, "learning_rate": 5e-06, "loss": 0.6306, "step": 450 }, { "epoch": 0.5242165242165242, "grad_norm": 0.43301735874135633, "learning_rate": 5e-06, "loss": 0.6344, "step": 460 }, { "epoch": 0.5356125356125356, "grad_norm": 0.44449051507968934, "learning_rate": 5e-06, "loss": 0.6231, "step": 470 }, { "epoch": 0.5470085470085471, "grad_norm": 0.41409814672813067, "learning_rate": 5e-06, "loss": 0.6341, "step": 480 }, { "epoch": 0.5584045584045584, "grad_norm": 0.404549684025027, "learning_rate": 5e-06, "loss": 0.6351, "step": 490 }, { "epoch": 0.5698005698005698, "grad_norm": 0.4180598818867765, "learning_rate": 5e-06, "loss": 0.6309, "step": 500 }, { "epoch": 0.5811965811965812, "grad_norm": 0.40656312646305987, "learning_rate": 5e-06, "loss": 0.6331, "step": 510 }, { "epoch": 0.5925925925925926, "grad_norm": 0.40247877044565616, "learning_rate": 5e-06, "loss": 0.6303, "step": 520 }, { "epoch": 0.603988603988604, "grad_norm": 0.4186724709073127, "learning_rate": 5e-06, "loss": 0.6295, "step": 530 }, { "epoch": 0.6153846153846154, "grad_norm": 0.3956067792496914, "learning_rate": 5e-06, "loss": 0.629, "step": 540 }, { "epoch": 0.6267806267806267, "grad_norm": 0.43010520803632213, "learning_rate": 5e-06, "loss": 0.6418, "step": 550 }, { "epoch": 0.6381766381766382, "grad_norm": 0.4242582783709579, "learning_rate": 5e-06, "loss": 0.6365, "step": 560 }, { "epoch": 0.6495726495726496, "grad_norm": 0.4574479642511814, "learning_rate": 5e-06, "loss": 0.6298, "step": 570 }, { "epoch": 0.6609686609686609, "grad_norm": 0.3999462091117723, "learning_rate": 5e-06, "loss": 0.6264, "step": 580 }, { "epoch": 0.6723646723646723, "grad_norm": 0.43650664891174007, "learning_rate": 5e-06, "loss": 0.6338, "step": 590 }, { "epoch": 0.6837606837606838, "grad_norm": 0.4209881207979195, "learning_rate": 5e-06, "loss": 0.6185, "step": 600 }, { "epoch": 0.6951566951566952, "grad_norm": 0.4356837089917804, "learning_rate": 5e-06, "loss": 0.6285, "step": 610 }, { "epoch": 0.7065527065527065, "grad_norm": 0.4267755900128707, "learning_rate": 5e-06, "loss": 0.6249, "step": 620 }, { "epoch": 0.717948717948718, "grad_norm": 0.4252749404036598, "learning_rate": 5e-06, "loss": 0.6297, "step": 630 }, { "epoch": 0.7293447293447294, "grad_norm": 0.43616986641525424, "learning_rate": 5e-06, "loss": 0.624, "step": 640 }, { "epoch": 0.7407407407407407, "grad_norm": 0.4164486549654651, "learning_rate": 5e-06, "loss": 0.629, "step": 650 }, { "epoch": 0.7521367521367521, "grad_norm": 0.476343190261518, "learning_rate": 5e-06, "loss": 0.6177, "step": 660 }, { "epoch": 0.7635327635327636, "grad_norm": 0.40486827396324065, "learning_rate": 5e-06, "loss": 0.6261, "step": 670 }, { "epoch": 0.7749287749287749, "grad_norm": 0.4212351136466915, "learning_rate": 5e-06, "loss": 0.6304, "step": 680 }, { "epoch": 0.7863247863247863, "grad_norm": 0.41575901401793347, "learning_rate": 5e-06, "loss": 0.6398, "step": 690 }, { "epoch": 0.7977207977207977, "grad_norm": 0.4285454155969582, "learning_rate": 5e-06, "loss": 0.6319, "step": 700 }, { "epoch": 0.8091168091168092, "grad_norm": 0.40726171067131095, "learning_rate": 5e-06, "loss": 0.6314, "step": 710 }, { "epoch": 0.8205128205128205, "grad_norm": 0.41168149111216795, "learning_rate": 5e-06, "loss": 0.6243, "step": 720 }, { "epoch": 0.8319088319088319, "grad_norm": 0.435567753751087, "learning_rate": 5e-06, "loss": 0.6226, "step": 730 }, { "epoch": 0.8433048433048433, "grad_norm": 0.43940850789677355, "learning_rate": 5e-06, "loss": 0.6208, "step": 740 }, { "epoch": 0.8547008547008547, "grad_norm": 0.4188384621992378, "learning_rate": 5e-06, "loss": 0.6338, "step": 750 }, { "epoch": 0.8660968660968661, "grad_norm": 0.3960108041735021, "learning_rate": 5e-06, "loss": 0.6337, "step": 760 }, { "epoch": 0.8774928774928775, "grad_norm": 0.40675640823017994, "learning_rate": 5e-06, "loss": 0.6296, "step": 770 }, { "epoch": 0.8888888888888888, "grad_norm": 0.43353876595216656, "learning_rate": 5e-06, "loss": 0.6357, "step": 780 }, { "epoch": 0.9002849002849003, "grad_norm": 0.43992543662793077, "learning_rate": 5e-06, "loss": 0.6333, "step": 790 }, { "epoch": 0.9116809116809117, "grad_norm": 0.41627535741522503, "learning_rate": 5e-06, "loss": 0.6384, "step": 800 }, { "epoch": 0.9230769230769231, "grad_norm": 0.4274496512159185, "learning_rate": 5e-06, "loss": 0.6309, "step": 810 }, { "epoch": 0.9344729344729344, "grad_norm": 0.5000942948514508, "learning_rate": 5e-06, "loss": 0.6323, "step": 820 }, { "epoch": 0.9458689458689459, "grad_norm": 0.39649163621370453, "learning_rate": 5e-06, "loss": 0.6117, "step": 830 }, { "epoch": 0.9572649572649573, "grad_norm": 0.45128894713654466, "learning_rate": 5e-06, "loss": 0.6258, "step": 840 }, { "epoch": 0.9686609686609686, "grad_norm": 0.4053334632337957, "learning_rate": 5e-06, "loss": 0.6334, "step": 850 }, { "epoch": 0.98005698005698, "grad_norm": 0.4570308695791834, "learning_rate": 5e-06, "loss": 0.6299, "step": 860 }, { "epoch": 0.9914529914529915, "grad_norm": 0.4142729888175128, "learning_rate": 5e-06, "loss": 0.6134, "step": 870 }, { "epoch": 0.9994301994301994, "eval_loss": 0.622437059879303, "eval_runtime": 442.3461, "eval_samples_per_second": 26.728, "eval_steps_per_second": 0.418, "step": 877 }, { "epoch": 1.002849002849003, "grad_norm": 0.467206811021719, "learning_rate": 5e-06, "loss": 0.6384, "step": 880 }, { "epoch": 1.0142450142450143, "grad_norm": 0.4575873633037112, "learning_rate": 5e-06, "loss": 0.5855, "step": 890 }, { "epoch": 1.0256410256410255, "grad_norm": 0.4094192073196508, "learning_rate": 5e-06, "loss": 0.5924, "step": 900 }, { "epoch": 1.037037037037037, "grad_norm": 0.41727147235729756, "learning_rate": 5e-06, "loss": 0.5882, "step": 910 }, { "epoch": 1.0484330484330484, "grad_norm": 0.40097390374474684, "learning_rate": 5e-06, "loss": 0.5834, "step": 920 }, { "epoch": 1.0598290598290598, "grad_norm": 0.3988722663272877, "learning_rate": 5e-06, "loss": 0.5875, "step": 930 }, { "epoch": 1.0712250712250713, "grad_norm": 0.409835543782938, "learning_rate": 5e-06, "loss": 0.578, "step": 940 }, { "epoch": 1.0826210826210827, "grad_norm": 0.4348656181993297, "learning_rate": 5e-06, "loss": 0.5945, "step": 950 }, { "epoch": 1.0940170940170941, "grad_norm": 0.4560769367527893, "learning_rate": 5e-06, "loss": 0.591, "step": 960 }, { "epoch": 1.1054131054131053, "grad_norm": 0.3987301391233058, "learning_rate": 5e-06, "loss": 0.5947, "step": 970 }, { "epoch": 1.1168091168091168, "grad_norm": 0.4310263093448157, "learning_rate": 5e-06, "loss": 0.5989, "step": 980 }, { "epoch": 1.1282051282051282, "grad_norm": 0.3988555704488419, "learning_rate": 5e-06, "loss": 0.5883, "step": 990 }, { "epoch": 1.1396011396011396, "grad_norm": 0.41694498325264395, "learning_rate": 5e-06, "loss": 0.5857, "step": 1000 }, { "epoch": 1.150997150997151, "grad_norm": 0.4261280155159663, "learning_rate": 5e-06, "loss": 0.5846, "step": 1010 }, { "epoch": 1.1623931623931625, "grad_norm": 0.4090258551630524, "learning_rate": 5e-06, "loss": 0.5862, "step": 1020 }, { "epoch": 1.173789173789174, "grad_norm": 0.39703392125897946, "learning_rate": 5e-06, "loss": 0.5828, "step": 1030 }, { "epoch": 1.1851851851851851, "grad_norm": 0.4171232168803472, "learning_rate": 5e-06, "loss": 0.5969, "step": 1040 }, { "epoch": 1.1965811965811965, "grad_norm": 0.3986677142839061, "learning_rate": 5e-06, "loss": 0.5849, "step": 1050 }, { "epoch": 1.207977207977208, "grad_norm": 0.4210046425391405, "learning_rate": 5e-06, "loss": 0.5866, "step": 1060 }, { "epoch": 1.2193732193732194, "grad_norm": 0.4497366233089093, "learning_rate": 5e-06, "loss": 0.5963, "step": 1070 }, { "epoch": 1.2307692307692308, "grad_norm": 0.43086405644231185, "learning_rate": 5e-06, "loss": 0.5872, "step": 1080 }, { "epoch": 1.242165242165242, "grad_norm": 0.4519037391850927, "learning_rate": 5e-06, "loss": 0.5952, "step": 1090 }, { "epoch": 1.2535612535612537, "grad_norm": 0.41349582244683747, "learning_rate": 5e-06, "loss": 0.5903, "step": 1100 }, { "epoch": 1.264957264957265, "grad_norm": 0.3837938001947666, "learning_rate": 5e-06, "loss": 0.5989, "step": 1110 }, { "epoch": 1.2763532763532763, "grad_norm": 0.38645298038964926, "learning_rate": 5e-06, "loss": 0.583, "step": 1120 }, { "epoch": 1.2877492877492878, "grad_norm": 0.39026828874261793, "learning_rate": 5e-06, "loss": 0.5938, "step": 1130 }, { "epoch": 1.2991452991452992, "grad_norm": 0.48601873116831096, "learning_rate": 5e-06, "loss": 0.5805, "step": 1140 }, { "epoch": 1.3105413105413106, "grad_norm": 0.4496341989317277, "learning_rate": 5e-06, "loss": 0.5862, "step": 1150 }, { "epoch": 1.3219373219373218, "grad_norm": 0.43314588815183497, "learning_rate": 5e-06, "loss": 0.5883, "step": 1160 }, { "epoch": 1.3333333333333333, "grad_norm": 0.4373497446033339, "learning_rate": 5e-06, "loss": 0.5833, "step": 1170 }, { "epoch": 1.3447293447293447, "grad_norm": 0.4061985333964508, "learning_rate": 5e-06, "loss": 0.5824, "step": 1180 }, { "epoch": 1.3561253561253561, "grad_norm": 0.4144234110159319, "learning_rate": 5e-06, "loss": 0.6004, "step": 1190 }, { "epoch": 1.3675213675213675, "grad_norm": 0.4373239103878606, "learning_rate": 5e-06, "loss": 0.5818, "step": 1200 }, { "epoch": 1.378917378917379, "grad_norm": 0.4210723366091624, "learning_rate": 5e-06, "loss": 0.5859, "step": 1210 }, { "epoch": 1.3903133903133904, "grad_norm": 0.4052006957338942, "learning_rate": 5e-06, "loss": 0.5906, "step": 1220 }, { "epoch": 1.4017094017094016, "grad_norm": 0.4292623892695985, "learning_rate": 5e-06, "loss": 0.5927, "step": 1230 }, { "epoch": 1.413105413105413, "grad_norm": 0.4232783608596394, "learning_rate": 5e-06, "loss": 0.5956, "step": 1240 }, { "epoch": 1.4245014245014245, "grad_norm": 0.43895695326546535, "learning_rate": 5e-06, "loss": 0.6033, "step": 1250 }, { "epoch": 1.435897435897436, "grad_norm": 0.4349281709940867, "learning_rate": 5e-06, "loss": 0.5825, "step": 1260 }, { "epoch": 1.4472934472934473, "grad_norm": 0.4124297881341476, "learning_rate": 5e-06, "loss": 0.5842, "step": 1270 }, { "epoch": 1.4586894586894588, "grad_norm": 0.4103899829789082, "learning_rate": 5e-06, "loss": 0.5851, "step": 1280 }, { "epoch": 1.4700854700854702, "grad_norm": 0.4187405725906187, "learning_rate": 5e-06, "loss": 0.5815, "step": 1290 }, { "epoch": 1.4814814814814814, "grad_norm": 0.4335514785672904, "learning_rate": 5e-06, "loss": 0.5893, "step": 1300 }, { "epoch": 1.4928774928774928, "grad_norm": 0.4095416189258966, "learning_rate": 5e-06, "loss": 0.5812, "step": 1310 }, { "epoch": 1.5042735042735043, "grad_norm": 0.4327159045023668, "learning_rate": 5e-06, "loss": 0.5872, "step": 1320 }, { "epoch": 1.5156695156695157, "grad_norm": 0.42071355477765043, "learning_rate": 5e-06, "loss": 0.5894, "step": 1330 }, { "epoch": 1.5270655270655271, "grad_norm": 0.41206489314047035, "learning_rate": 5e-06, "loss": 0.5948, "step": 1340 }, { "epoch": 1.5384615384615383, "grad_norm": 0.4743925797235051, "learning_rate": 5e-06, "loss": 0.5755, "step": 1350 }, { "epoch": 1.54985754985755, "grad_norm": 0.40595707601991954, "learning_rate": 5e-06, "loss": 0.5892, "step": 1360 }, { "epoch": 1.5612535612535612, "grad_norm": 0.3947481991815675, "learning_rate": 5e-06, "loss": 0.5784, "step": 1370 }, { "epoch": 1.5726495726495726, "grad_norm": 0.38785299206305784, "learning_rate": 5e-06, "loss": 0.5864, "step": 1380 }, { "epoch": 1.584045584045584, "grad_norm": 0.4149251097325744, "learning_rate": 5e-06, "loss": 0.6075, "step": 1390 }, { "epoch": 1.5954415954415955, "grad_norm": 0.43637706913229096, "learning_rate": 5e-06, "loss": 0.5944, "step": 1400 }, { "epoch": 1.606837606837607, "grad_norm": 0.4169030325172147, "learning_rate": 5e-06, "loss": 0.5915, "step": 1410 }, { "epoch": 1.618233618233618, "grad_norm": 0.40313516644365976, "learning_rate": 5e-06, "loss": 0.5817, "step": 1420 }, { "epoch": 1.6296296296296298, "grad_norm": 0.39503546583616356, "learning_rate": 5e-06, "loss": 0.5983, "step": 1430 }, { "epoch": 1.641025641025641, "grad_norm": 0.3985527253164314, "learning_rate": 5e-06, "loss": 0.5826, "step": 1440 }, { "epoch": 1.6524216524216524, "grad_norm": 0.4244488951950044, "learning_rate": 5e-06, "loss": 0.5889, "step": 1450 }, { "epoch": 1.6638176638176638, "grad_norm": 0.39693307887587553, "learning_rate": 5e-06, "loss": 0.5859, "step": 1460 }, { "epoch": 1.6752136752136753, "grad_norm": 0.3849679459506633, "learning_rate": 5e-06, "loss": 0.5806, "step": 1470 }, { "epoch": 1.6866096866096867, "grad_norm": 0.4201985578364686, "learning_rate": 5e-06, "loss": 0.5866, "step": 1480 }, { "epoch": 1.698005698005698, "grad_norm": 0.42432125023319545, "learning_rate": 5e-06, "loss": 0.5803, "step": 1490 }, { "epoch": 1.7094017094017095, "grad_norm": 0.40730542273295467, "learning_rate": 5e-06, "loss": 0.5913, "step": 1500 }, { "epoch": 1.7207977207977208, "grad_norm": 0.42837414750466624, "learning_rate": 5e-06, "loss": 0.5795, "step": 1510 }, { "epoch": 1.7321937321937322, "grad_norm": 0.44083872834956234, "learning_rate": 5e-06, "loss": 0.5888, "step": 1520 }, { "epoch": 1.7435897435897436, "grad_norm": 0.41133942102181764, "learning_rate": 5e-06, "loss": 0.5865, "step": 1530 }, { "epoch": 1.7549857549857548, "grad_norm": 0.40260223356507924, "learning_rate": 5e-06, "loss": 0.5816, "step": 1540 }, { "epoch": 1.7663817663817665, "grad_norm": 0.4054088563875919, "learning_rate": 5e-06, "loss": 0.5952, "step": 1550 }, { "epoch": 1.7777777777777777, "grad_norm": 0.4218451424068199, "learning_rate": 5e-06, "loss": 0.5879, "step": 1560 }, { "epoch": 1.7891737891737893, "grad_norm": 0.4423529568236007, "learning_rate": 5e-06, "loss": 0.5907, "step": 1570 }, { "epoch": 1.8005698005698005, "grad_norm": 0.41215301182035746, "learning_rate": 5e-06, "loss": 0.5841, "step": 1580 }, { "epoch": 1.811965811965812, "grad_norm": 0.4555696841177031, "learning_rate": 5e-06, "loss": 0.5849, "step": 1590 }, { "epoch": 1.8233618233618234, "grad_norm": 0.41997083905529, "learning_rate": 5e-06, "loss": 0.5712, "step": 1600 }, { "epoch": 1.8347578347578346, "grad_norm": 0.40350765403827904, "learning_rate": 5e-06, "loss": 0.5773, "step": 1610 }, { "epoch": 1.8461538461538463, "grad_norm": 0.41505233462990104, "learning_rate": 5e-06, "loss": 0.5828, "step": 1620 }, { "epoch": 1.8575498575498575, "grad_norm": 0.4094044224106121, "learning_rate": 5e-06, "loss": 0.577, "step": 1630 }, { "epoch": 1.868945868945869, "grad_norm": 0.3989458077194491, "learning_rate": 5e-06, "loss": 0.5852, "step": 1640 }, { "epoch": 1.8803418803418803, "grad_norm": 0.3968449176678109, "learning_rate": 5e-06, "loss": 0.5765, "step": 1650 }, { "epoch": 1.8917378917378918, "grad_norm": 0.3975827713442406, "learning_rate": 5e-06, "loss": 0.5941, "step": 1660 }, { "epoch": 1.9031339031339032, "grad_norm": 0.4591167052806216, "learning_rate": 5e-06, "loss": 0.5958, "step": 1670 }, { "epoch": 1.9145299145299144, "grad_norm": 0.4763985809192953, "learning_rate": 5e-06, "loss": 0.5822, "step": 1680 }, { "epoch": 1.925925925925926, "grad_norm": 0.40816873290685, "learning_rate": 5e-06, "loss": 0.591, "step": 1690 }, { "epoch": 1.9373219373219372, "grad_norm": 0.43451011164507114, "learning_rate": 5e-06, "loss": 0.5866, "step": 1700 }, { "epoch": 1.9487179487179487, "grad_norm": 0.42502005410583105, "learning_rate": 5e-06, "loss": 0.5812, "step": 1710 }, { "epoch": 1.96011396011396, "grad_norm": 0.3868140358085357, "learning_rate": 5e-06, "loss": 0.5952, "step": 1720 }, { "epoch": 1.9715099715099715, "grad_norm": 0.4233434645527226, "learning_rate": 5e-06, "loss": 0.5905, "step": 1730 }, { "epoch": 1.982905982905983, "grad_norm": 0.46128367957303146, "learning_rate": 5e-06, "loss": 0.5835, "step": 1740 }, { "epoch": 1.9943019943019942, "grad_norm": 0.41962900843595113, "learning_rate": 5e-06, "loss": 0.5823, "step": 1750 }, { "epoch": 2.0, "eval_loss": 0.6174917817115784, "eval_runtime": 442.5819, "eval_samples_per_second": 26.714, "eval_steps_per_second": 0.418, "step": 1755 }, { "epoch": 2.005698005698006, "grad_norm": 0.43638289381677664, "learning_rate": 5e-06, "loss": 0.6003, "step": 1760 }, { "epoch": 2.017094017094017, "grad_norm": 0.4032954694771035, "learning_rate": 5e-06, "loss": 0.5295, "step": 1770 }, { "epoch": 2.0284900284900287, "grad_norm": 0.3978342138531873, "learning_rate": 5e-06, "loss": 0.5396, "step": 1780 }, { "epoch": 2.03988603988604, "grad_norm": 0.3941941742542143, "learning_rate": 5e-06, "loss": 0.5498, "step": 1790 }, { "epoch": 2.051282051282051, "grad_norm": 0.40614413388153375, "learning_rate": 5e-06, "loss": 0.5485, "step": 1800 }, { "epoch": 2.0626780626780628, "grad_norm": 0.4062005374187212, "learning_rate": 5e-06, "loss": 0.5443, "step": 1810 }, { "epoch": 2.074074074074074, "grad_norm": 0.38632662394247547, "learning_rate": 5e-06, "loss": 0.5499, "step": 1820 }, { "epoch": 2.0854700854700856, "grad_norm": 0.3877796238652637, "learning_rate": 5e-06, "loss": 0.5472, "step": 1830 }, { "epoch": 2.096866096866097, "grad_norm": 0.4031396151639763, "learning_rate": 5e-06, "loss": 0.5557, "step": 1840 }, { "epoch": 2.1082621082621085, "grad_norm": 0.3758020305089208, "learning_rate": 5e-06, "loss": 0.5423, "step": 1850 }, { "epoch": 2.1196581196581197, "grad_norm": 0.46333515136342907, "learning_rate": 5e-06, "loss": 0.5556, "step": 1860 }, { "epoch": 2.131054131054131, "grad_norm": 0.3990721210469113, "learning_rate": 5e-06, "loss": 0.5464, "step": 1870 }, { "epoch": 2.1424501424501425, "grad_norm": 0.41896529664740606, "learning_rate": 5e-06, "loss": 0.5459, "step": 1880 }, { "epoch": 2.1538461538461537, "grad_norm": 0.40224292638674486, "learning_rate": 5e-06, "loss": 0.5432, "step": 1890 }, { "epoch": 2.1652421652421654, "grad_norm": 0.3703829495333715, "learning_rate": 5e-06, "loss": 0.5434, "step": 1900 }, { "epoch": 2.1766381766381766, "grad_norm": 0.4195807512147461, "learning_rate": 5e-06, "loss": 0.548, "step": 1910 }, { "epoch": 2.1880341880341883, "grad_norm": 0.42078014349068604, "learning_rate": 5e-06, "loss": 0.551, "step": 1920 }, { "epoch": 2.1994301994301995, "grad_norm": 0.39550870444336733, "learning_rate": 5e-06, "loss": 0.5487, "step": 1930 }, { "epoch": 2.2108262108262107, "grad_norm": 0.403560752581769, "learning_rate": 5e-06, "loss": 0.5518, "step": 1940 }, { "epoch": 2.2222222222222223, "grad_norm": 0.4148295164570796, "learning_rate": 5e-06, "loss": 0.5455, "step": 1950 }, { "epoch": 2.2336182336182335, "grad_norm": 0.37681071283125916, "learning_rate": 5e-06, "loss": 0.5371, "step": 1960 }, { "epoch": 2.245014245014245, "grad_norm": 0.4085602540294654, "learning_rate": 5e-06, "loss": 0.548, "step": 1970 }, { "epoch": 2.2564102564102564, "grad_norm": 0.42666262080387535, "learning_rate": 5e-06, "loss": 0.5489, "step": 1980 }, { "epoch": 2.267806267806268, "grad_norm": 0.413370914720578, "learning_rate": 5e-06, "loss": 0.5452, "step": 1990 }, { "epoch": 2.2792022792022792, "grad_norm": 0.3924258676572947, "learning_rate": 5e-06, "loss": 0.5506, "step": 2000 }, { "epoch": 2.2905982905982905, "grad_norm": 0.4347195110430224, "learning_rate": 5e-06, "loss": 0.5495, "step": 2010 }, { "epoch": 2.301994301994302, "grad_norm": 0.40213883875930767, "learning_rate": 5e-06, "loss": 0.5447, "step": 2020 }, { "epoch": 2.3133903133903133, "grad_norm": 0.42546941310471453, "learning_rate": 5e-06, "loss": 0.5533, "step": 2030 }, { "epoch": 2.324786324786325, "grad_norm": 0.40042388002057316, "learning_rate": 5e-06, "loss": 0.5493, "step": 2040 }, { "epoch": 2.336182336182336, "grad_norm": 0.40985989196559397, "learning_rate": 5e-06, "loss": 0.5484, "step": 2050 }, { "epoch": 2.347578347578348, "grad_norm": 0.4262197347046128, "learning_rate": 5e-06, "loss": 0.5561, "step": 2060 }, { "epoch": 2.358974358974359, "grad_norm": 0.4079997903297647, "learning_rate": 5e-06, "loss": 0.5479, "step": 2070 }, { "epoch": 2.3703703703703702, "grad_norm": 0.4171995256710412, "learning_rate": 5e-06, "loss": 0.5482, "step": 2080 }, { "epoch": 2.381766381766382, "grad_norm": 0.4067288627883757, "learning_rate": 5e-06, "loss": 0.5495, "step": 2090 }, { "epoch": 2.393162393162393, "grad_norm": 0.39812759486187826, "learning_rate": 5e-06, "loss": 0.5475, "step": 2100 }, { "epoch": 2.4045584045584047, "grad_norm": 0.4252046487226247, "learning_rate": 5e-06, "loss": 0.564, "step": 2110 }, { "epoch": 2.415954415954416, "grad_norm": 0.385246050290494, "learning_rate": 5e-06, "loss": 0.5495, "step": 2120 }, { "epoch": 2.427350427350427, "grad_norm": 0.4086146276427414, "learning_rate": 5e-06, "loss": 0.56, "step": 2130 }, { "epoch": 2.438746438746439, "grad_norm": 0.40396684063143223, "learning_rate": 5e-06, "loss": 0.5592, "step": 2140 }, { "epoch": 2.45014245014245, "grad_norm": 0.40575491064321195, "learning_rate": 5e-06, "loss": 0.5633, "step": 2150 }, { "epoch": 2.4615384615384617, "grad_norm": 0.4073296395669543, "learning_rate": 5e-06, "loss": 0.5488, "step": 2160 }, { "epoch": 2.472934472934473, "grad_norm": 0.43882905338245753, "learning_rate": 5e-06, "loss": 0.5513, "step": 2170 }, { "epoch": 2.484330484330484, "grad_norm": 0.4031322481681622, "learning_rate": 5e-06, "loss": 0.5554, "step": 2180 }, { "epoch": 2.4957264957264957, "grad_norm": 0.42227630442588826, "learning_rate": 5e-06, "loss": 0.5574, "step": 2190 }, { "epoch": 2.5071225071225074, "grad_norm": 0.4277624308363176, "learning_rate": 5e-06, "loss": 0.5629, "step": 2200 }, { "epoch": 2.5185185185185186, "grad_norm": 0.40043500855114567, "learning_rate": 5e-06, "loss": 0.5444, "step": 2210 }, { "epoch": 2.52991452991453, "grad_norm": 0.427445344908136, "learning_rate": 5e-06, "loss": 0.5591, "step": 2220 }, { "epoch": 2.5413105413105415, "grad_norm": 0.4197028690010052, "learning_rate": 5e-06, "loss": 0.5513, "step": 2230 }, { "epoch": 2.5527065527065527, "grad_norm": 0.41806376493939207, "learning_rate": 5e-06, "loss": 0.5501, "step": 2240 }, { "epoch": 2.564102564102564, "grad_norm": 0.4080384204790527, "learning_rate": 5e-06, "loss": 0.5488, "step": 2250 }, { "epoch": 2.5754985754985755, "grad_norm": 0.4339972064470789, "learning_rate": 5e-06, "loss": 0.5534, "step": 2260 }, { "epoch": 2.5868945868945867, "grad_norm": 0.4139873128656014, "learning_rate": 5e-06, "loss": 0.5509, "step": 2270 }, { "epoch": 2.5982905982905984, "grad_norm": 0.39593523779791756, "learning_rate": 5e-06, "loss": 0.5515, "step": 2280 }, { "epoch": 2.6096866096866096, "grad_norm": 0.3887745966959367, "learning_rate": 5e-06, "loss": 0.5567, "step": 2290 }, { "epoch": 2.6210826210826212, "grad_norm": 0.3899940114191536, "learning_rate": 5e-06, "loss": 0.5429, "step": 2300 }, { "epoch": 2.6324786324786325, "grad_norm": 0.4176311832860518, "learning_rate": 5e-06, "loss": 0.553, "step": 2310 }, { "epoch": 2.6438746438746437, "grad_norm": 0.46727727994302587, "learning_rate": 5e-06, "loss": 0.5524, "step": 2320 }, { "epoch": 2.6552706552706553, "grad_norm": 0.4368321834367039, "learning_rate": 5e-06, "loss": 0.5552, "step": 2330 }, { "epoch": 2.6666666666666665, "grad_norm": 0.4479324367839254, "learning_rate": 5e-06, "loss": 0.5534, "step": 2340 }, { "epoch": 2.678062678062678, "grad_norm": 0.41411545835899133, "learning_rate": 5e-06, "loss": 0.5467, "step": 2350 }, { "epoch": 2.6894586894586894, "grad_norm": 0.4201299885965421, "learning_rate": 5e-06, "loss": 0.5565, "step": 2360 }, { "epoch": 2.700854700854701, "grad_norm": 0.40978702073303064, "learning_rate": 5e-06, "loss": 0.5444, "step": 2370 }, { "epoch": 2.7122507122507122, "grad_norm": 0.4233459449335634, "learning_rate": 5e-06, "loss": 0.5563, "step": 2380 }, { "epoch": 2.7236467236467234, "grad_norm": 0.4159458912952842, "learning_rate": 5e-06, "loss": 0.5551, "step": 2390 }, { "epoch": 2.735042735042735, "grad_norm": 0.41425606346483057, "learning_rate": 5e-06, "loss": 0.5539, "step": 2400 }, { "epoch": 2.7464387464387463, "grad_norm": 0.4166133827092343, "learning_rate": 5e-06, "loss": 0.5588, "step": 2410 }, { "epoch": 2.757834757834758, "grad_norm": 0.4263688845736852, "learning_rate": 5e-06, "loss": 0.5575, "step": 2420 }, { "epoch": 2.769230769230769, "grad_norm": 0.41269883049053624, "learning_rate": 5e-06, "loss": 0.5471, "step": 2430 }, { "epoch": 2.780626780626781, "grad_norm": 0.3894335667283599, "learning_rate": 5e-06, "loss": 0.5468, "step": 2440 }, { "epoch": 2.792022792022792, "grad_norm": 0.40933546113606567, "learning_rate": 5e-06, "loss": 0.5501, "step": 2450 }, { "epoch": 2.8034188034188032, "grad_norm": 0.39714648665213204, "learning_rate": 5e-06, "loss": 0.5444, "step": 2460 }, { "epoch": 2.814814814814815, "grad_norm": 0.40517136322070096, "learning_rate": 5e-06, "loss": 0.5601, "step": 2470 }, { "epoch": 2.826210826210826, "grad_norm": 0.44447910033491683, "learning_rate": 5e-06, "loss": 0.5623, "step": 2480 }, { "epoch": 2.8376068376068377, "grad_norm": 0.388103652560322, "learning_rate": 5e-06, "loss": 0.5543, "step": 2490 }, { "epoch": 2.849002849002849, "grad_norm": 0.40171877838716236, "learning_rate": 5e-06, "loss": 0.562, "step": 2500 }, { "epoch": 2.8603988603988606, "grad_norm": 0.41856657884436094, "learning_rate": 5e-06, "loss": 0.5536, "step": 2510 }, { "epoch": 2.871794871794872, "grad_norm": 0.4173395435456696, "learning_rate": 5e-06, "loss": 0.5539, "step": 2520 }, { "epoch": 2.883190883190883, "grad_norm": 0.39093712576995243, "learning_rate": 5e-06, "loss": 0.5601, "step": 2530 }, { "epoch": 2.8945868945868947, "grad_norm": 0.4255070470787294, "learning_rate": 5e-06, "loss": 0.5557, "step": 2540 }, { "epoch": 2.905982905982906, "grad_norm": 0.45247644117965885, "learning_rate": 5e-06, "loss": 0.5528, "step": 2550 }, { "epoch": 2.9173789173789175, "grad_norm": 0.41729192613775734, "learning_rate": 5e-06, "loss": 0.5416, "step": 2560 }, { "epoch": 2.9287749287749287, "grad_norm": 0.3959874387272076, "learning_rate": 5e-06, "loss": 0.5471, "step": 2570 }, { "epoch": 2.9401709401709404, "grad_norm": 0.40279780924522723, "learning_rate": 5e-06, "loss": 0.5438, "step": 2580 }, { "epoch": 2.9515669515669516, "grad_norm": 0.41492112649690777, "learning_rate": 5e-06, "loss": 0.5533, "step": 2590 }, { "epoch": 2.962962962962963, "grad_norm": 0.4072236941032463, "learning_rate": 5e-06, "loss": 0.5446, "step": 2600 }, { "epoch": 2.9743589743589745, "grad_norm": 0.3967690970697916, "learning_rate": 5e-06, "loss": 0.556, "step": 2610 }, { "epoch": 2.9857549857549857, "grad_norm": 0.4004788690287786, "learning_rate": 5e-06, "loss": 0.5571, "step": 2620 }, { "epoch": 2.9971509971509973, "grad_norm": 0.39905352277311656, "learning_rate": 5e-06, "loss": 0.5443, "step": 2630 }, { "epoch": 2.9982905982905983, "eval_loss": 0.6213015913963318, "eval_runtime": 442.3632, "eval_samples_per_second": 26.727, "eval_steps_per_second": 0.418, "step": 2631 }, { "epoch": 2.9982905982905983, "step": 2631, "total_flos": 2758364765356032.0, "train_loss": 0.5923774672614808, "train_runtime": 70850.8498, "train_samples_per_second": 9.511, "train_steps_per_second": 0.037 } ], "logging_steps": 10, "max_steps": 2631, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2758364765356032.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }