{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9994378864530633, "eval_steps": 500, "global_step": 2001, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014989694584972832, "grad_norm": 12.777873823078783, "learning_rate": 1.6611295681063126e-07, "loss": 0.902, "step": 10 }, { "epoch": 0.029979389169945664, "grad_norm": 7.3435241316074045, "learning_rate": 3.322259136212625e-07, "loss": 0.8565, "step": 20 }, { "epoch": 0.044969083754918496, "grad_norm": 3.9849563392808864, "learning_rate": 4.983388704318938e-07, "loss": 0.7765, "step": 30 }, { "epoch": 0.05995877833989133, "grad_norm": 1.8633726457542599, "learning_rate": 6.64451827242525e-07, "loss": 0.7102, "step": 40 }, { "epoch": 0.07494847292486416, "grad_norm": 1.4954062046250922, "learning_rate": 8.305647840531563e-07, "loss": 0.6762, "step": 50 }, { "epoch": 0.08993816750983699, "grad_norm": 1.3750012507878848, "learning_rate": 9.966777408637875e-07, "loss": 0.6471, "step": 60 }, { "epoch": 0.10492786209480982, "grad_norm": 1.4482820464395114, "learning_rate": 1.1627906976744188e-06, "loss": 0.6264, "step": 70 }, { "epoch": 0.11991755667978266, "grad_norm": 1.862315554960826, "learning_rate": 1.32890365448505e-06, "loss": 0.6131, "step": 80 }, { "epoch": 0.13490725126475547, "grad_norm": 1.6724167400141936, "learning_rate": 1.4950166112956813e-06, "loss": 0.6011, "step": 90 }, { "epoch": 0.14989694584972832, "grad_norm": 1.6824102308369775, "learning_rate": 1.6611295681063126e-06, "loss": 0.5888, "step": 100 }, { "epoch": 0.16488664043470114, "grad_norm": 1.7516516157128537, "learning_rate": 1.8272425249169438e-06, "loss": 0.5836, "step": 110 }, { "epoch": 0.17987633501967398, "grad_norm": 1.623499220073038, "learning_rate": 1.993355481727575e-06, "loss": 0.5767, "step": 120 }, { "epoch": 0.1948660296046468, "grad_norm": 2.0314714470085358, "learning_rate": 2.1594684385382063e-06, "loss": 0.5689, "step": 130 }, { "epoch": 0.20985572418961965, "grad_norm": 1.6951555702146335, "learning_rate": 2.3255813953488376e-06, "loss": 0.5694, "step": 140 }, { "epoch": 0.22484541877459246, "grad_norm": 1.894679433473508, "learning_rate": 2.4916943521594684e-06, "loss": 0.5576, "step": 150 }, { "epoch": 0.2398351133595653, "grad_norm": 1.7422683371902348, "learning_rate": 2.6578073089701e-06, "loss": 0.5556, "step": 160 }, { "epoch": 0.25482480794453816, "grad_norm": 1.9008699070898252, "learning_rate": 2.8239202657807313e-06, "loss": 0.5521, "step": 170 }, { "epoch": 0.26981450252951095, "grad_norm": 1.844172367351659, "learning_rate": 2.9900332225913626e-06, "loss": 0.5471, "step": 180 }, { "epoch": 0.2848041971144838, "grad_norm": 1.785147460867048, "learning_rate": 3.156146179401994e-06, "loss": 0.5444, "step": 190 }, { "epoch": 0.29979389169945664, "grad_norm": 1.8204441132637552, "learning_rate": 3.322259136212625e-06, "loss": 0.5385, "step": 200 }, { "epoch": 0.31478358628442943, "grad_norm": 1.849611308520918, "learning_rate": 3.4883720930232564e-06, "loss": 0.5351, "step": 210 }, { "epoch": 0.3297732808694023, "grad_norm": 1.8876145088553256, "learning_rate": 3.6544850498338876e-06, "loss": 0.5363, "step": 220 }, { "epoch": 0.3447629754543751, "grad_norm": 2.4886914406463867, "learning_rate": 3.8205980066445185e-06, "loss": 0.5334, "step": 230 }, { "epoch": 0.35975267003934797, "grad_norm": 2.421754684568758, "learning_rate": 3.98671096345515e-06, "loss": 0.5316, "step": 240 }, { "epoch": 0.37474236462432076, "grad_norm": 1.6687380919697075, "learning_rate": 4.152823920265781e-06, "loss": 0.5268, "step": 250 }, { "epoch": 0.3897320592092936, "grad_norm": 1.8499077931578278, "learning_rate": 4.318936877076413e-06, "loss": 0.5289, "step": 260 }, { "epoch": 0.40472175379426645, "grad_norm": 2.4628946829032587, "learning_rate": 4.4850498338870435e-06, "loss": 0.5235, "step": 270 }, { "epoch": 0.4197114483792393, "grad_norm": 2.204174864640197, "learning_rate": 4.651162790697675e-06, "loss": 0.5265, "step": 280 }, { "epoch": 0.4347011429642121, "grad_norm": 2.366488200680214, "learning_rate": 4.817275747508306e-06, "loss": 0.5193, "step": 290 }, { "epoch": 0.44969083754918493, "grad_norm": 2.8012926552797257, "learning_rate": 4.983388704318937e-06, "loss": 0.5156, "step": 300 }, { "epoch": 0.4646805321341578, "grad_norm": 4.139795722954455, "learning_rate": 4.999654230307899e-06, "loss": 0.5177, "step": 310 }, { "epoch": 0.4796702267191306, "grad_norm": 3.952786519090627, "learning_rate": 4.998459099844914e-06, "loss": 0.512, "step": 320 }, { "epoch": 0.4946599213041034, "grad_norm": 2.9745265076009946, "learning_rate": 4.996410747888184e-06, "loss": 0.5104, "step": 330 }, { "epoch": 0.5096496158890763, "grad_norm": 3.279789637235632, "learning_rate": 4.993509873948025e-06, "loss": 0.5083, "step": 340 }, { "epoch": 0.5246393104740491, "grad_norm": 3.4046254908112803, "learning_rate": 4.989757468670231e-06, "loss": 0.5049, "step": 350 }, { "epoch": 0.5396290050590219, "grad_norm": 2.5862130652191, "learning_rate": 4.985154813497765e-06, "loss": 0.5093, "step": 360 }, { "epoch": 0.5546186996439948, "grad_norm": 1.8501758264267991, "learning_rate": 4.979703480233156e-06, "loss": 0.5034, "step": 370 }, { "epoch": 0.5696083942289676, "grad_norm": 2.2116554860251667, "learning_rate": 4.973405330501715e-06, "loss": 0.5023, "step": 380 }, { "epoch": 0.5845980888139404, "grad_norm": 2.2510169713465586, "learning_rate": 4.966262515115808e-06, "loss": 0.4999, "step": 390 }, { "epoch": 0.5995877833989133, "grad_norm": 2.6499432191306025, "learning_rate": 4.958277473340345e-06, "loss": 0.5043, "step": 400 }, { "epoch": 0.6145774779838861, "grad_norm": 1.8449367407295139, "learning_rate": 4.949452932059777e-06, "loss": 0.5012, "step": 410 }, { "epoch": 0.6295671725688589, "grad_norm": 1.9215837970775265, "learning_rate": 4.939791904846869e-06, "loss": 0.4969, "step": 420 }, { "epoch": 0.6445568671538318, "grad_norm": 2.1023812126184387, "learning_rate": 4.929297690933561e-06, "loss": 0.4976, "step": 430 }, { "epoch": 0.6595465617388045, "grad_norm": 2.042184330044649, "learning_rate": 4.917973874084293e-06, "loss": 0.494, "step": 440 }, { "epoch": 0.6745362563237775, "grad_norm": 2.517472777976901, "learning_rate": 4.905824321372143e-06, "loss": 0.4946, "step": 450 }, { "epoch": 0.6895259509087502, "grad_norm": 1.6569332477157788, "learning_rate": 4.892853181858234e-06, "loss": 0.4883, "step": 460 }, { "epoch": 0.704515645493723, "grad_norm": 1.771900963881147, "learning_rate": 4.879064885174825e-06, "loss": 0.4873, "step": 470 }, { "epoch": 0.7195053400786959, "grad_norm": 1.927208700693383, "learning_rate": 4.8644641400126035e-06, "loss": 0.4922, "step": 480 }, { "epoch": 0.7344950346636687, "grad_norm": 1.732154103024004, "learning_rate": 4.849055932512666e-06, "loss": 0.4911, "step": 490 }, { "epoch": 0.7494847292486415, "grad_norm": 1.7314920101435785, "learning_rate": 4.832845524563758e-06, "loss": 0.4884, "step": 500 }, { "epoch": 0.7644744238336144, "grad_norm": 2.0936474922460304, "learning_rate": 4.815838452005344e-06, "loss": 0.4941, "step": 510 }, { "epoch": 0.7794641184185872, "grad_norm": 2.075312501604676, "learning_rate": 4.798040522737122e-06, "loss": 0.4873, "step": 520 }, { "epoch": 0.7944538130035601, "grad_norm": 1.6868419550348641, "learning_rate": 4.779457814735632e-06, "loss": 0.4841, "step": 530 }, { "epoch": 0.8094435075885329, "grad_norm": 1.6894890638345692, "learning_rate": 4.760096673978634e-06, "loss": 0.4824, "step": 540 }, { "epoch": 0.8244332021735057, "grad_norm": 1.7041316420782846, "learning_rate": 4.73996371227796e-06, "loss": 0.4872, "step": 550 }, { "epoch": 0.8394228967584786, "grad_norm": 1.5724665047442083, "learning_rate": 4.719065805021584e-06, "loss": 0.4839, "step": 560 }, { "epoch": 0.8544125913434514, "grad_norm": 1.493096241789178, "learning_rate": 4.697410088825692e-06, "loss": 0.4832, "step": 570 }, { "epoch": 0.8694022859284242, "grad_norm": 2.031583132991568, "learning_rate": 4.675003959097525e-06, "loss": 0.4767, "step": 580 }, { "epoch": 0.8843919805133971, "grad_norm": 1.4245462788100998, "learning_rate": 4.65185506750986e-06, "loss": 0.4782, "step": 590 }, { "epoch": 0.8993816750983699, "grad_norm": 1.5793606614864946, "learning_rate": 4.627971319387965e-06, "loss": 0.4795, "step": 600 }, { "epoch": 0.9143713696833426, "grad_norm": 1.4355119503255267, "learning_rate": 4.603360871009946e-06, "loss": 0.4774, "step": 610 }, { "epoch": 0.9293610642683156, "grad_norm": 1.52114987838548, "learning_rate": 4.578032126821377e-06, "loss": 0.4771, "step": 620 }, { "epoch": 0.9443507588532883, "grad_norm": 1.5400335880344016, "learning_rate": 4.5519937365651945e-06, "loss": 0.4785, "step": 630 }, { "epoch": 0.9593404534382612, "grad_norm": 1.6430985109756742, "learning_rate": 4.525254592327817e-06, "loss": 0.4712, "step": 640 }, { "epoch": 0.974330148023234, "grad_norm": 1.4624345774601337, "learning_rate": 4.497823825502507e-06, "loss": 0.4746, "step": 650 }, { "epoch": 0.9893198426082068, "grad_norm": 1.9756966886927916, "learning_rate": 4.469710803671004e-06, "loss": 0.4707, "step": 660 }, { "epoch": 0.9998126288176878, "eval_loss": 0.058926939964294434, "eval_runtime": 456.5855, "eval_samples_per_second": 39.373, "eval_steps_per_second": 0.615, "step": 667 }, { "epoch": 1.0043095371931796, "grad_norm": 2.6844759304769448, "learning_rate": 4.440925127404512e-06, "loss": 0.4526, "step": 670 }, { "epoch": 1.0192992317781526, "grad_norm": 1.6812914263129781, "learning_rate": 4.411476626985102e-06, "loss": 0.3995, "step": 680 }, { "epoch": 1.0342889263631254, "grad_norm": 1.6671947730035994, "learning_rate": 4.381375359048691e-06, "loss": 0.3918, "step": 690 }, { "epoch": 1.0492786209480982, "grad_norm": 1.541048196484826, "learning_rate": 4.350631603150702e-06, "loss": 0.3933, "step": 700 }, { "epoch": 1.064268315533071, "grad_norm": 1.7329710214895389, "learning_rate": 4.319255858255612e-06, "loss": 0.3901, "step": 710 }, { "epoch": 1.0792580101180438, "grad_norm": 1.7281442329309784, "learning_rate": 4.287258839151557e-06, "loss": 0.3926, "step": 720 }, { "epoch": 1.0942477047030166, "grad_norm": 1.4548547056485654, "learning_rate": 4.254651472791249e-06, "loss": 0.3964, "step": 730 }, { "epoch": 1.1092373992879896, "grad_norm": 1.5263837537606915, "learning_rate": 4.221444894560429e-06, "loss": 0.3915, "step": 740 }, { "epoch": 1.1242270938729624, "grad_norm": 1.8283686921084465, "learning_rate": 4.1876504444751406e-06, "loss": 0.393, "step": 750 }, { "epoch": 1.1392167884579352, "grad_norm": 1.9745034884067223, "learning_rate": 4.15327966330913e-06, "loss": 0.3933, "step": 760 }, { "epoch": 1.154206483042908, "grad_norm": 2.0006637742508473, "learning_rate": 4.1183442886526784e-06, "loss": 0.3927, "step": 770 }, { "epoch": 1.1691961776278808, "grad_norm": 2.398957927996368, "learning_rate": 4.082856250904224e-06, "loss": 0.3946, "step": 780 }, { "epoch": 1.1841858722128538, "grad_norm": 2.1339474482590086, "learning_rate": 4.04682766919615e-06, "loss": 0.3912, "step": 790 }, { "epoch": 1.1991755667978266, "grad_norm": 2.0801601343322735, "learning_rate": 4.010270847256104e-06, "loss": 0.3872, "step": 800 }, { "epoch": 1.2141652613827993, "grad_norm": 2.241507677816275, "learning_rate": 3.973198269205286e-06, "loss": 0.3911, "step": 810 }, { "epoch": 1.2291549559677721, "grad_norm": 1.9465898484553392, "learning_rate": 3.9356225952951384e-06, "loss": 0.3934, "step": 820 }, { "epoch": 1.244144650552745, "grad_norm": 1.73525671044772, "learning_rate": 3.8975566575838805e-06, "loss": 0.3905, "step": 830 }, { "epoch": 1.259134345137718, "grad_norm": 1.8854981866970535, "learning_rate": 3.859013455554366e-06, "loss": 0.385, "step": 840 }, { "epoch": 1.2741240397226907, "grad_norm": 1.9676037916759839, "learning_rate": 3.82000615167478e-06, "loss": 0.3874, "step": 850 }, { "epoch": 1.2891137343076635, "grad_norm": 1.6949021870149008, "learning_rate": 3.780548066903661e-06, "loss": 0.3879, "step": 860 }, { "epoch": 1.3041034288926363, "grad_norm": 1.847929926533703, "learning_rate": 3.740652676140812e-06, "loss": 0.3923, "step": 870 }, { "epoch": 1.319093123477609, "grad_norm": 1.8219949958766242, "learning_rate": 3.700333603625629e-06, "loss": 0.3871, "step": 880 }, { "epoch": 1.334082818062582, "grad_norm": 2.1571236655174233, "learning_rate": 3.65960461828444e-06, "loss": 0.3915, "step": 890 }, { "epoch": 1.3490725126475547, "grad_norm": 2.028728831427641, "learning_rate": 3.6184796290284267e-06, "loss": 0.3815, "step": 900 }, { "epoch": 1.3640622072325277, "grad_norm": 1.907400343061468, "learning_rate": 3.5769726800037462e-06, "loss": 0.3915, "step": 910 }, { "epoch": 1.3790519018175005, "grad_norm": 2.1829895016343897, "learning_rate": 3.5350979457954638e-06, "loss": 0.39, "step": 920 }, { "epoch": 1.3940415964024733, "grad_norm": 1.886862579732133, "learning_rate": 3.4928697265869516e-06, "loss": 0.3872, "step": 930 }, { "epoch": 1.409031290987446, "grad_norm": 1.819745393510499, "learning_rate": 3.4503024432763838e-06, "loss": 0.3867, "step": 940 }, { "epoch": 1.4240209855724189, "grad_norm": 1.8714086663251082, "learning_rate": 3.4074106325520133e-06, "loss": 0.3879, "step": 950 }, { "epoch": 1.4390106801573919, "grad_norm": 1.4361408847037231, "learning_rate": 3.3642089419279107e-06, "loss": 0.386, "step": 960 }, { "epoch": 1.4540003747423647, "grad_norm": 1.477381601060273, "learning_rate": 3.3207121247418456e-06, "loss": 0.3888, "step": 970 }, { "epoch": 1.4689900693273374, "grad_norm": 1.4721765084901293, "learning_rate": 3.2769350351170405e-06, "loss": 0.3867, "step": 980 }, { "epoch": 1.4839797639123102, "grad_norm": 1.4925458845511457, "learning_rate": 3.232892622889497e-06, "loss": 0.3891, "step": 990 }, { "epoch": 1.498969458497283, "grad_norm": 1.4566645795724527, "learning_rate": 3.1885999285026438e-06, "loss": 0.3881, "step": 1000 }, { "epoch": 1.513959153082256, "grad_norm": 1.5822180202872997, "learning_rate": 3.1440720778710365e-06, "loss": 0.3866, "step": 1010 }, { "epoch": 1.5289488476672288, "grad_norm": 1.4373920553624056, "learning_rate": 3.0993242772148753e-06, "loss": 0.3881, "step": 1020 }, { "epoch": 1.5439385422522016, "grad_norm": 1.381099707101281, "learning_rate": 3.054371807867088e-06, "loss": 0.3888, "step": 1030 }, { "epoch": 1.5589282368371744, "grad_norm": 1.3759750364038397, "learning_rate": 3.0092300210547775e-06, "loss": 0.3869, "step": 1040 }, { "epoch": 1.5739179314221472, "grad_norm": 1.4366787735561277, "learning_rate": 2.9639143326567847e-06, "loss": 0.3865, "step": 1050 }, { "epoch": 1.5889076260071202, "grad_norm": 1.369352819988061, "learning_rate": 2.918440217939185e-06, "loss": 0.3862, "step": 1060 }, { "epoch": 1.6038973205920928, "grad_norm": 1.5096071928255457, "learning_rate": 2.872823206270494e-06, "loss": 0.3894, "step": 1070 }, { "epoch": 1.6188870151770658, "grad_norm": 1.4303634190845425, "learning_rate": 2.8270788758184076e-06, "loss": 0.3864, "step": 1080 }, { "epoch": 1.6338767097620386, "grad_norm": 1.4207147216727412, "learning_rate": 2.7812228482298674e-06, "loss": 0.3876, "step": 1090 }, { "epoch": 1.6488664043470114, "grad_norm": 1.3978109627955357, "learning_rate": 2.7352707832962865e-06, "loss": 0.3882, "step": 1100 }, { "epoch": 1.6638560989319844, "grad_norm": 1.4425647311953784, "learning_rate": 2.689238373605742e-06, "loss": 0.3847, "step": 1110 }, { "epoch": 1.678845793516957, "grad_norm": 1.3945982747038834, "learning_rate": 2.6431413391839746e-06, "loss": 0.3848, "step": 1120 }, { "epoch": 1.69383548810193, "grad_norm": 1.4307341663030686, "learning_rate": 2.5969954221260076e-06, "loss": 0.3823, "step": 1130 }, { "epoch": 1.7088251826869028, "grad_norm": 1.4833837652546649, "learning_rate": 2.550816381220243e-06, "loss": 0.3846, "step": 1140 }, { "epoch": 1.7238148772718755, "grad_norm": 1.36133901438266, "learning_rate": 2.5046199865668455e-06, "loss": 0.3831, "step": 1150 }, { "epoch": 1.7388045718568486, "grad_norm": 1.4439860851050665, "learning_rate": 2.458422014192267e-06, "loss": 0.3811, "step": 1160 }, { "epoch": 1.7537942664418211, "grad_norm": 1.4534544264872151, "learning_rate": 2.412238240661751e-06, "loss": 0.3803, "step": 1170 }, { "epoch": 1.7687839610267941, "grad_norm": 1.4387117913666696, "learning_rate": 2.366084437691651e-06, "loss": 0.3824, "step": 1180 }, { "epoch": 1.783773655611767, "grad_norm": 1.5378659951687177, "learning_rate": 2.319976366763399e-06, "loss": 0.3826, "step": 1190 }, { "epoch": 1.7987633501967397, "grad_norm": 1.483205097472637, "learning_rate": 2.2739297737409762e-06, "loss": 0.3835, "step": 1200 }, { "epoch": 1.8137530447817127, "grad_norm": 1.3300069022725933, "learning_rate": 2.2279603834937174e-06, "loss": 0.3812, "step": 1210 }, { "epoch": 1.8287427393666853, "grad_norm": 1.3109923177187024, "learning_rate": 2.1820838945262777e-06, "loss": 0.3812, "step": 1220 }, { "epoch": 1.8437324339516583, "grad_norm": 1.2956220619232373, "learning_rate": 2.1363159736176124e-06, "loss": 0.3835, "step": 1230 }, { "epoch": 1.858722128536631, "grad_norm": 1.346364686268756, "learning_rate": 2.090672250470785e-06, "loss": 0.3801, "step": 1240 }, { "epoch": 1.873711823121604, "grad_norm": 1.3161532319413825, "learning_rate": 2.0451683123754335e-06, "loss": 0.3817, "step": 1250 }, { "epoch": 1.8887015177065767, "grad_norm": 1.4484501985394258, "learning_rate": 1.9998196988847323e-06, "loss": 0.3805, "step": 1260 }, { "epoch": 1.9036912122915495, "grad_norm": 1.4224251098150107, "learning_rate": 1.9546418965086444e-06, "loss": 0.3793, "step": 1270 }, { "epoch": 1.9186809068765225, "grad_norm": 1.3230244176007022, "learning_rate": 1.9096503334252925e-06, "loss": 0.3789, "step": 1280 }, { "epoch": 1.9336706014614953, "grad_norm": 1.2689730119223448, "learning_rate": 1.864860374212254e-06, "loss": 0.3756, "step": 1290 }, { "epoch": 1.948660296046468, "grad_norm": 1.5073419751839903, "learning_rate": 1.820287314599567e-06, "loss": 0.3774, "step": 1300 }, { "epoch": 1.9636499906314409, "grad_norm": 1.372795270016621, "learning_rate": 1.775946376246256e-06, "loss": 0.3788, "step": 1310 }, { "epoch": 1.9786396852164136, "grad_norm": 1.3705608316142892, "learning_rate": 1.7318527015421476e-06, "loss": 0.3788, "step": 1320 }, { "epoch": 1.9936293798013867, "grad_norm": 1.3066140520804252, "learning_rate": 1.6880213484367574e-06, "loss": 0.3775, "step": 1330 }, { "epoch": 1.9996252576353757, "eval_loss": 0.055769529193639755, "eval_runtime": 447.0633, "eval_samples_per_second": 40.211, "eval_steps_per_second": 0.629, "step": 1334 }, { "epoch": 2.0086190743863592, "grad_norm": 2.4434187554731905, "learning_rate": 1.6444672852970194e-06, "loss": 0.3289, "step": 1340 }, { "epoch": 2.0236087689713322, "grad_norm": 1.6962075463721413, "learning_rate": 1.6012053857955943e-06, "loss": 0.2871, "step": 1350 }, { "epoch": 2.0385984635563053, "grad_norm": 1.4546619427160516, "learning_rate": 1.5582504238315293e-06, "loss": 0.2842, "step": 1360 }, { "epoch": 2.053588158141278, "grad_norm": 1.4039067144529458, "learning_rate": 1.5156170684849844e-06, "loss": 0.2841, "step": 1370 }, { "epoch": 2.068577852726251, "grad_norm": 1.4415471956114632, "learning_rate": 1.4733198790077518e-06, "loss": 0.2841, "step": 1380 }, { "epoch": 2.0835675473112234, "grad_norm": 1.3650988765023055, "learning_rate": 1.4313732998512927e-06, "loss": 0.2841, "step": 1390 }, { "epoch": 2.0985572418961964, "grad_norm": 1.334414412623332, "learning_rate": 1.389791655733959e-06, "loss": 0.283, "step": 1400 }, { "epoch": 2.1135469364811694, "grad_norm": 1.455857794359506, "learning_rate": 1.3485891467491247e-06, "loss": 0.2822, "step": 1410 }, { "epoch": 2.128536631066142, "grad_norm": 1.4035493431707644, "learning_rate": 1.3077798435158615e-06, "loss": 0.2858, "step": 1420 }, { "epoch": 2.143526325651115, "grad_norm": 1.3923539809440533, "learning_rate": 1.2673776823738385e-06, "loss": 0.2832, "step": 1430 }, { "epoch": 2.1585160202360876, "grad_norm": 1.4708372198575632, "learning_rate": 1.2273964606240718e-06, "loss": 0.2849, "step": 1440 }, { "epoch": 2.1735057148210606, "grad_norm": 1.3821731614793016, "learning_rate": 1.1878498318171641e-06, "loss": 0.2814, "step": 1450 }, { "epoch": 2.188495409406033, "grad_norm": 1.3685254042104387, "learning_rate": 1.1487513010906232e-06, "loss": 0.2832, "step": 1460 }, { "epoch": 2.203485103991006, "grad_norm": 1.4477117467489642, "learning_rate": 1.1101142205568734e-06, "loss": 0.2849, "step": 1470 }, { "epoch": 2.218474798575979, "grad_norm": 1.3837837548677747, "learning_rate": 1.0719517847435218e-06, "loss": 0.2866, "step": 1480 }, { "epoch": 2.2334644931609517, "grad_norm": 1.3463627570256054, "learning_rate": 1.0342770260874365e-06, "loss": 0.2857, "step": 1490 }, { "epoch": 2.2484541877459248, "grad_norm": 1.3928362831606316, "learning_rate": 9.971028104841869e-07, "loss": 0.2817, "step": 1500 }, { "epoch": 2.2634438823308973, "grad_norm": 1.3461504649063996, "learning_rate": 9.604418328943447e-07, "loss": 0.2821, "step": 1510 }, { "epoch": 2.2784335769158703, "grad_norm": 1.4687885214881415, "learning_rate": 9.243066130081713e-07, "loss": 0.2835, "step": 1520 }, { "epoch": 2.2934232715008434, "grad_norm": 1.3886295267035924, "learning_rate": 8.887094909701547e-07, "loss": 0.2834, "step": 1530 }, { "epoch": 2.308412966085816, "grad_norm": 1.407350206723039, "learning_rate": 8.536626231648595e-07, "loss": 0.2842, "step": 1540 }, { "epoch": 2.323402660670789, "grad_norm": 1.4132800816314468, "learning_rate": 8.191779780655354e-07, "loss": 0.2832, "step": 1550 }, { "epoch": 2.3383923552557615, "grad_norm": 1.3897894960759776, "learning_rate": 7.852673321468918e-07, "loss": 0.2843, "step": 1560 }, { "epoch": 2.3533820498407345, "grad_norm": 1.3674616825887331, "learning_rate": 7.519422658634443e-07, "loss": 0.2823, "step": 1570 }, { "epoch": 2.3683717444257075, "grad_norm": 1.3431475279285507, "learning_rate": 7.19214159694799e-07, "loss": 0.2828, "step": 1580 }, { "epoch": 2.38336143901068, "grad_norm": 1.36945515422048, "learning_rate": 6.870941902592307e-07, "loss": 0.2825, "step": 1590 }, { "epoch": 2.398351133595653, "grad_norm": 1.3520131185837208, "learning_rate": 6.555933264968753e-07, "loss": 0.2822, "step": 1600 }, { "epoch": 2.4133408281806257, "grad_norm": 1.3647629196713102, "learning_rate": 6.247223259238511e-07, "loss": 0.2836, "step": 1610 }, { "epoch": 2.4283305227655987, "grad_norm": 1.3570825621596334, "learning_rate": 5.944917309585721e-07, "loss": 0.2826, "step": 1620 }, { "epoch": 2.4433202173505713, "grad_norm": 1.3656854475785716, "learning_rate": 5.649118653215243e-07, "loss": 0.2812, "step": 1630 }, { "epoch": 2.4583099119355443, "grad_norm": 1.363047695272134, "learning_rate": 5.359928305097245e-07, "loss": 0.2831, "step": 1640 }, { "epoch": 2.4732996065205173, "grad_norm": 1.3184093693943801, "learning_rate": 5.077445023470676e-07, "loss": 0.2811, "step": 1650 }, { "epoch": 2.48828930110549, "grad_norm": 1.3684794544811556, "learning_rate": 4.801765276117437e-07, "loss": 0.2808, "step": 1660 }, { "epoch": 2.503278995690463, "grad_norm": 1.3593878648901627, "learning_rate": 4.5329832074186953e-07, "loss": 0.2794, "step": 1670 }, { "epoch": 2.518268690275436, "grad_norm": 1.3880945115426904, "learning_rate": 4.271190606204681e-07, "loss": 0.2826, "step": 1680 }, { "epoch": 2.5332583848604084, "grad_norm": 1.3996837354689144, "learning_rate": 4.016476874408867e-07, "loss": 0.2828, "step": 1690 }, { "epoch": 2.5482480794453815, "grad_norm": 1.3109833857184636, "learning_rate": 3.768928996537319e-07, "loss": 0.2803, "step": 1700 }, { "epoch": 2.563237774030354, "grad_norm": 1.3073342543118396, "learning_rate": 3.528631509963562e-07, "loss": 0.2799, "step": 1710 }, { "epoch": 2.578227468615327, "grad_norm": 1.2885685486893088, "learning_rate": 3.2956664760591143e-07, "loss": 0.2829, "step": 1720 }, { "epoch": 2.5932171632002996, "grad_norm": 1.290934296805027, "learning_rate": 3.0701134521696373e-07, "loss": 0.2832, "step": 1730 }, { "epoch": 2.6082068577852726, "grad_norm": 1.3206650202687746, "learning_rate": 2.852049464446158e-07, "loss": 0.2815, "step": 1740 }, { "epoch": 2.6231965523702456, "grad_norm": 1.3086968161796841, "learning_rate": 2.641548981540712e-07, "loss": 0.2818, "step": 1750 }, { "epoch": 2.638186246955218, "grad_norm": 1.3204039831254113, "learning_rate": 2.4386838891753695e-07, "loss": 0.2804, "step": 1760 }, { "epoch": 2.653175941540191, "grad_norm": 1.2730494032898578, "learning_rate": 2.2435234655933363e-07, "loss": 0.2802, "step": 1770 }, { "epoch": 2.668165636125164, "grad_norm": 1.3271281759783882, "learning_rate": 2.0561343579004716e-07, "loss": 0.2824, "step": 1780 }, { "epoch": 2.683155330710137, "grad_norm": 1.272138432855408, "learning_rate": 1.8765805593053855e-07, "loss": 0.2818, "step": 1790 }, { "epoch": 2.6981450252951094, "grad_norm": 1.3350408814644847, "learning_rate": 1.7049233872658084e-07, "loss": 0.2852, "step": 1800 }, { "epoch": 2.7131347198800824, "grad_norm": 1.2658770429218549, "learning_rate": 1.5412214625487336e-07, "loss": 0.2824, "step": 1810 }, { "epoch": 2.7281244144650554, "grad_norm": 1.274527251752829, "learning_rate": 1.3855306892114867e-07, "loss": 0.2814, "step": 1820 }, { "epoch": 2.743114109050028, "grad_norm": 1.3482622846337229, "learning_rate": 1.2379042355105314e-07, "loss": 0.2774, "step": 1830 }, { "epoch": 2.758103803635001, "grad_norm": 1.2728958780405912, "learning_rate": 1.0983925157445674e-07, "loss": 0.2836, "step": 1840 }, { "epoch": 2.773093498219974, "grad_norm": 1.3207497967530821, "learning_rate": 9.670431730380847e-08, "loss": 0.2821, "step": 1850 }, { "epoch": 2.7880831928049465, "grad_norm": 1.2622090696282842, "learning_rate": 8.439010630712841e-08, "loss": 0.2816, "step": 1860 }, { "epoch": 2.8030728873899196, "grad_norm": 1.2844690751523753, "learning_rate": 7.29008238761919e-08, "loss": 0.2808, "step": 1870 }, { "epoch": 2.818062581974892, "grad_norm": 1.2941614154033414, "learning_rate": 6.224039359042284e-08, "loss": 0.2819, "step": 1880 }, { "epoch": 2.833052276559865, "grad_norm": 1.2887376421660586, "learning_rate": 5.2412455976998125e-08, "loss": 0.2815, "step": 1890 }, { "epoch": 2.8480419711448377, "grad_norm": 1.3081913600486883, "learning_rate": 4.342036726760895e-08, "loss": 0.2823, "step": 1900 }, { "epoch": 2.8630316657298107, "grad_norm": 1.288972689881046, "learning_rate": 3.5267198252312286e-08, "loss": 0.2847, "step": 1910 }, { "epoch": 2.8780213603147837, "grad_norm": 1.270248787618671, "learning_rate": 2.795573323085721e-08, "loss": 0.2794, "step": 1920 }, { "epoch": 2.8930110548997563, "grad_norm": 1.2837754103434724, "learning_rate": 2.148846906185109e-08, "loss": 0.2798, "step": 1930 }, { "epoch": 2.9080007494847293, "grad_norm": 1.2838868602793667, "learning_rate": 1.586761431008249e-08, "loss": 0.2798, "step": 1940 }, { "epoch": 2.9229904440697023, "grad_norm": 1.3051039321280986, "learning_rate": 1.109508849230001e-08, "loss": 0.281, "step": 1950 }, { "epoch": 2.937980138654675, "grad_norm": 1.3082478451744641, "learning_rate": 7.172521421698331e-09, "loss": 0.2805, "step": 1960 }, { "epoch": 2.952969833239648, "grad_norm": 1.2997068680343178, "learning_rate": 4.101252651338428e-09, "loss": 0.2816, "step": 1970 }, { "epoch": 2.9679595278246205, "grad_norm": 1.280634823351821, "learning_rate": 1.8823310166918297e-09, "loss": 0.2806, "step": 1980 }, { "epoch": 2.9829492224095935, "grad_norm": 1.2887159666168426, "learning_rate": 5.165142774640752e-10, "loss": 0.28, "step": 1990 }, { "epoch": 2.997938916994566, "grad_norm": 1.2890732808929788, "learning_rate": 4.26885882032213e-12, "loss": 0.2791, "step": 2000 }, { "epoch": 2.9994378864530633, "eval_loss": 0.058169443160295486, "eval_runtime": 450.8985, "eval_samples_per_second": 39.869, "eval_steps_per_second": 0.623, "step": 2001 }, { "epoch": 2.9994378864530633, "step": 2001, "total_flos": 3351540148469760.0, "train_loss": 0.4023907120408921, "train_runtime": 64801.3882, "train_samples_per_second": 15.812, "train_steps_per_second": 0.031 } ], "logging_steps": 10, "max_steps": 2001, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3351540148469760.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }