{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9995767195767196, "eval_steps": 500, "global_step": 2362, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008465608465608466, "grad_norm": 1.2536740494709757, "learning_rate": 9.999557744171606e-05, "loss": 1.6294, "step": 10 }, { "epoch": 0.016931216931216932, "grad_norm": 1.0805568114986361, "learning_rate": 9.998231054922511e-05, "loss": 0.9236, "step": 20 }, { "epoch": 0.025396825396825397, "grad_norm": 0.8782195431244562, "learning_rate": 9.996020166947136e-05, "loss": 0.7603, "step": 30 }, { "epoch": 0.033862433862433865, "grad_norm": 0.9267321940966723, "learning_rate": 9.99292547135672e-05, "loss": 0.6731, "step": 40 }, { "epoch": 0.042328042328042326, "grad_norm": 0.8701878795418962, "learning_rate": 9.988947515610124e-05, "loss": 0.6137, "step": 50 }, { "epoch": 0.050793650793650794, "grad_norm": 1.2272042644235537, "learning_rate": 9.984087003416997e-05, "loss": 0.6346, "step": 60 }, { "epoch": 0.05925925925925926, "grad_norm": 1.0686141559390336, "learning_rate": 9.978344794613277e-05, "loss": 0.6107, "step": 70 }, { "epoch": 0.06772486772486773, "grad_norm": 0.9849077305305056, "learning_rate": 9.971721905009086e-05, "loss": 0.5656, "step": 80 }, { "epoch": 0.0761904761904762, "grad_norm": 1.2368756418600437, "learning_rate": 9.964219506209039e-05, "loss": 0.5726, "step": 90 }, { "epoch": 0.08465608465608465, "grad_norm": 1.0374346188421604, "learning_rate": 9.955838925404971e-05, "loss": 0.5894, "step": 100 }, { "epoch": 0.09312169312169312, "grad_norm": 1.258972872704355, "learning_rate": 9.946581645141167e-05, "loss": 0.534, "step": 110 }, { "epoch": 0.10158730158730159, "grad_norm": 1.0466165338353233, "learning_rate": 9.936449303052086e-05, "loss": 0.5235, "step": 120 }, { "epoch": 0.11005291005291006, "grad_norm": 0.9956790587338181, "learning_rate": 9.925443691572668e-05, "loss": 0.5208, "step": 130 }, { "epoch": 0.11851851851851852, "grad_norm": 1.2599033904063475, "learning_rate": 9.91356675762124e-05, "loss": 0.5355, "step": 140 }, { "epoch": 0.12698412698412698, "grad_norm": 1.0353580902393011, "learning_rate": 9.900820602255108e-05, "loss": 0.5361, "step": 150 }, { "epoch": 0.13544973544973546, "grad_norm": 0.9863947603039183, "learning_rate": 9.88720748029887e-05, "loss": 0.5105, "step": 160 }, { "epoch": 0.1439153439153439, "grad_norm": 0.9272966295135331, "learning_rate": 9.87272979994554e-05, "loss": 0.5286, "step": 170 }, { "epoch": 0.1523809523809524, "grad_norm": 0.9357784657446446, "learning_rate": 9.857390122330522e-05, "loss": 0.4913, "step": 180 }, { "epoch": 0.16084656084656085, "grad_norm": 0.9706862272089483, "learning_rate": 9.84119116107855e-05, "loss": 0.5358, "step": 190 }, { "epoch": 0.1693121693121693, "grad_norm": 1.037354134026295, "learning_rate": 9.82413578182364e-05, "loss": 0.5511, "step": 200 }, { "epoch": 0.17777777777777778, "grad_norm": 1.2141517165745577, "learning_rate": 9.806227001702135e-05, "loss": 0.5099, "step": 210 }, { "epoch": 0.18624338624338624, "grad_norm": 1.0430093698701248, "learning_rate": 9.787467988818997e-05, "loss": 0.5019, "step": 220 }, { "epoch": 0.19470899470899472, "grad_norm": 1.0478269154568778, "learning_rate": 9.767862061687337e-05, "loss": 0.5014, "step": 230 }, { "epoch": 0.20317460317460317, "grad_norm": 1.0602153985712557, "learning_rate": 9.747412688641372e-05, "loss": 0.4933, "step": 240 }, { "epoch": 0.21164021164021163, "grad_norm": 1.0783975799525436, "learning_rate": 9.726123487222871e-05, "loss": 0.4762, "step": 250 }, { "epoch": 0.2201058201058201, "grad_norm": 1.008218163473069, "learning_rate": 9.703998223541196e-05, "loss": 0.4843, "step": 260 }, { "epoch": 0.22857142857142856, "grad_norm": 0.9852689728116106, "learning_rate": 9.681040811607075e-05, "loss": 0.4603, "step": 270 }, { "epoch": 0.23703703703703705, "grad_norm": 1.0930039587507208, "learning_rate": 9.6572553126402e-05, "loss": 0.4859, "step": 280 }, { "epoch": 0.2455026455026455, "grad_norm": 0.998867899511094, "learning_rate": 9.632645934350792e-05, "loss": 0.4764, "step": 290 }, { "epoch": 0.25396825396825395, "grad_norm": 1.1270572507755159, "learning_rate": 9.607217030195242e-05, "loss": 0.5125, "step": 300 }, { "epoch": 0.2624338624338624, "grad_norm": 1.0417587165454332, "learning_rate": 9.58097309860598e-05, "loss": 0.4753, "step": 310 }, { "epoch": 0.2708994708994709, "grad_norm": 1.0092231428385308, "learning_rate": 9.553918782195688e-05, "loss": 0.4654, "step": 320 }, { "epoch": 0.27936507936507937, "grad_norm": 1.019655360850703, "learning_rate": 9.526058866936013e-05, "loss": 0.4897, "step": 330 }, { "epoch": 0.2878306878306878, "grad_norm": 1.2136875720692115, "learning_rate": 9.497398281310914e-05, "loss": 0.4698, "step": 340 }, { "epoch": 0.2962962962962963, "grad_norm": 1.0156909584259968, "learning_rate": 9.467942095444809e-05, "loss": 0.4635, "step": 350 }, { "epoch": 0.3047619047619048, "grad_norm": 1.0330410864779689, "learning_rate": 9.43769552020565e-05, "loss": 0.4955, "step": 360 }, { "epoch": 0.31322751322751324, "grad_norm": 1.1555703560002792, "learning_rate": 9.406663906283109e-05, "loss": 0.4779, "step": 370 }, { "epoch": 0.3216931216931217, "grad_norm": 1.0300804678514643, "learning_rate": 9.374852743242037e-05, "loss": 0.4581, "step": 380 }, { "epoch": 0.33015873015873015, "grad_norm": 1.0577163168213004, "learning_rate": 9.34226765855134e-05, "loss": 0.4578, "step": 390 }, { "epoch": 0.3386243386243386, "grad_norm": 1.1528354628813937, "learning_rate": 9.308914416588468e-05, "loss": 0.4688, "step": 400 }, { "epoch": 0.3470899470899471, "grad_norm": 1.041087172048898, "learning_rate": 9.27479891761968e-05, "loss": 0.4782, "step": 410 }, { "epoch": 0.35555555555555557, "grad_norm": 0.9791427857125372, "learning_rate": 9.239927196756279e-05, "loss": 0.4488, "step": 420 }, { "epoch": 0.364021164021164, "grad_norm": 0.9683401420738389, "learning_rate": 9.204305422886987e-05, "loss": 0.4631, "step": 430 }, { "epoch": 0.3724867724867725, "grad_norm": 1.0489264170989323, "learning_rate": 9.167939897586647e-05, "loss": 0.4493, "step": 440 }, { "epoch": 0.38095238095238093, "grad_norm": 1.0133114072958438, "learning_rate": 9.130837054001463e-05, "loss": 0.461, "step": 450 }, { "epoch": 0.38941798941798944, "grad_norm": 1.0851755168134365, "learning_rate": 9.09300345571097e-05, "loss": 0.469, "step": 460 }, { "epoch": 0.3978835978835979, "grad_norm": 1.1009516736250549, "learning_rate": 9.054445795566907e-05, "loss": 0.4504, "step": 470 }, { "epoch": 0.40634920634920635, "grad_norm": 1.0034292869828323, "learning_rate": 9.015170894509244e-05, "loss": 0.4498, "step": 480 }, { "epoch": 0.4148148148148148, "grad_norm": 1.0546642511020514, "learning_rate": 8.975185700359542e-05, "loss": 0.4334, "step": 490 }, { "epoch": 0.42328042328042326, "grad_norm": 0.9211704485733719, "learning_rate": 8.93449728659187e-05, "loss": 0.4535, "step": 500 }, { "epoch": 0.43174603174603177, "grad_norm": 0.9075636796189445, "learning_rate": 8.893112851081478e-05, "loss": 0.4149, "step": 510 }, { "epoch": 0.4402116402116402, "grad_norm": 0.9394762728420766, "learning_rate": 8.851039714831492e-05, "loss": 0.452, "step": 520 }, { "epoch": 0.4486772486772487, "grad_norm": 1.0202540982390778, "learning_rate": 8.8082853206778e-05, "loss": 0.4491, "step": 530 }, { "epoch": 0.45714285714285713, "grad_norm": 1.0209784880401114, "learning_rate": 8.764857231972408e-05, "loss": 0.4374, "step": 540 }, { "epoch": 0.4656084656084656, "grad_norm": 1.004488049664675, "learning_rate": 8.72076313124545e-05, "loss": 0.459, "step": 550 }, { "epoch": 0.4740740740740741, "grad_norm": 1.0341305920498378, "learning_rate": 8.676010818846145e-05, "loss": 0.4271, "step": 560 }, { "epoch": 0.48253968253968255, "grad_norm": 1.0664004827817963, "learning_rate": 8.630608211562891e-05, "loss": 0.4622, "step": 570 }, { "epoch": 0.491005291005291, "grad_norm": 0.9934436898635304, "learning_rate": 8.584563341222765e-05, "loss": 0.4432, "step": 580 }, { "epoch": 0.49947089947089945, "grad_norm": 0.9857548315821839, "learning_rate": 8.537884353270677e-05, "loss": 0.4639, "step": 590 }, { "epoch": 0.5079365079365079, "grad_norm": 1.0178863758450196, "learning_rate": 8.490579505328424e-05, "loss": 0.4138, "step": 600 }, { "epoch": 0.5164021164021164, "grad_norm": 1.038252359047521, "learning_rate": 8.442657165733886e-05, "loss": 0.419, "step": 610 }, { "epoch": 0.5248677248677248, "grad_norm": 0.8829314461573528, "learning_rate": 8.394125812060666e-05, "loss": 0.4057, "step": 620 }, { "epoch": 0.5333333333333333, "grad_norm": 1.0470304353319828, "learning_rate": 8.344994029618374e-05, "loss": 0.4478, "step": 630 }, { "epoch": 0.5417989417989418, "grad_norm": 1.0281770671480395, "learning_rate": 8.295270509933862e-05, "loss": 0.4382, "step": 640 }, { "epoch": 0.5502645502645502, "grad_norm": 0.8764644053490013, "learning_rate": 8.24496404921369e-05, "loss": 0.4377, "step": 650 }, { "epoch": 0.5587301587301587, "grad_norm": 0.9473160233886304, "learning_rate": 8.19408354678804e-05, "loss": 0.4468, "step": 660 }, { "epoch": 0.5671957671957671, "grad_norm": 1.0853088610186052, "learning_rate": 8.142638003536413e-05, "loss": 0.43, "step": 670 }, { "epoch": 0.5756613756613757, "grad_norm": 0.9386265978912677, "learning_rate": 8.090636520295348e-05, "loss": 0.4475, "step": 680 }, { "epoch": 0.5841269841269842, "grad_norm": 1.0740222193078277, "learning_rate": 8.038088296248462e-05, "loss": 0.4424, "step": 690 }, { "epoch": 0.5925925925925926, "grad_norm": 1.0857350308205733, "learning_rate": 7.9850026272991e-05, "loss": 0.4036, "step": 700 }, { "epoch": 0.6010582010582011, "grad_norm": 0.9193145888168105, "learning_rate": 7.93138890442586e-05, "loss": 0.4511, "step": 710 }, { "epoch": 0.6095238095238096, "grad_norm": 0.9551109099001511, "learning_rate": 7.877256612021312e-05, "loss": 0.3774, "step": 720 }, { "epoch": 0.617989417989418, "grad_norm": 0.9570542406982319, "learning_rate": 7.822615326214183e-05, "loss": 0.4371, "step": 730 }, { "epoch": 0.6264550264550265, "grad_norm": 1.0153420855979158, "learning_rate": 7.767474713175321e-05, "loss": 0.4628, "step": 740 }, { "epoch": 0.6349206349206349, "grad_norm": 1.0695763456638891, "learning_rate": 7.711844527407724e-05, "loss": 0.4238, "step": 750 }, { "epoch": 0.6433862433862434, "grad_norm": 1.0044619876891803, "learning_rate": 7.655734610020952e-05, "loss": 0.4296, "step": 760 }, { "epoch": 0.6518518518518519, "grad_norm": 0.9458827543873214, "learning_rate": 7.599154886990199e-05, "loss": 0.396, "step": 770 }, { "epoch": 0.6603174603174603, "grad_norm": 1.074091533938013, "learning_rate": 7.542115367400376e-05, "loss": 0.4035, "step": 780 }, { "epoch": 0.6687830687830688, "grad_norm": 1.0152819418160979, "learning_rate": 7.484626141675481e-05, "loss": 0.4135, "step": 790 }, { "epoch": 0.6772486772486772, "grad_norm": 0.8781718355357042, "learning_rate": 7.426697379793572e-05, "loss": 0.401, "step": 800 }, { "epoch": 0.6857142857142857, "grad_norm": 0.9320133553311277, "learning_rate": 7.368339329487677e-05, "loss": 0.4257, "step": 810 }, { "epoch": 0.6941798941798942, "grad_norm": 1.0257232524871047, "learning_rate": 7.309562314432952e-05, "loss": 0.4096, "step": 820 }, { "epoch": 0.7026455026455026, "grad_norm": 1.000399097214694, "learning_rate": 7.250376732420387e-05, "loss": 0.4217, "step": 830 }, { "epoch": 0.7111111111111111, "grad_norm": 0.8946747414376721, "learning_rate": 7.190793053517424e-05, "loss": 0.4391, "step": 840 }, { "epoch": 0.7195767195767195, "grad_norm": 1.059007250343128, "learning_rate": 7.13082181821577e-05, "loss": 0.4477, "step": 850 }, { "epoch": 0.728042328042328, "grad_norm": 0.9750233036191156, "learning_rate": 7.070473635566768e-05, "loss": 0.4111, "step": 860 }, { "epoch": 0.7365079365079366, "grad_norm": 1.2069014928768744, "learning_rate": 7.009759181304619e-05, "loss": 0.385, "step": 870 }, { "epoch": 0.744973544973545, "grad_norm": 0.9675202014376196, "learning_rate": 6.948689195957829e-05, "loss": 0.4303, "step": 880 }, { "epoch": 0.7534391534391535, "grad_norm": 1.0273630814532357, "learning_rate": 6.887274482949185e-05, "loss": 0.4197, "step": 890 }, { "epoch": 0.7619047619047619, "grad_norm": 0.8936119385994451, "learning_rate": 6.825525906684593e-05, "loss": 0.428, "step": 900 }, { "epoch": 0.7703703703703704, "grad_norm": 0.9839762657140783, "learning_rate": 6.763454390631156e-05, "loss": 0.4318, "step": 910 }, { "epoch": 0.7788359788359789, "grad_norm": 0.9938930890241121, "learning_rate": 6.701070915384776e-05, "loss": 0.3992, "step": 920 }, { "epoch": 0.7873015873015873, "grad_norm": 1.0647555531218702, "learning_rate": 6.638386516727656e-05, "loss": 0.4055, "step": 930 }, { "epoch": 0.7957671957671958, "grad_norm": 0.9510971038124535, "learning_rate": 6.575412283676063e-05, "loss": 0.3999, "step": 940 }, { "epoch": 0.8042328042328042, "grad_norm": 0.9903300056996712, "learning_rate": 6.512159356518638e-05, "loss": 0.3947, "step": 950 }, { "epoch": 0.8126984126984127, "grad_norm": 0.9769472788235855, "learning_rate": 6.448638924845662e-05, "loss": 0.4052, "step": 960 }, { "epoch": 0.8211640211640212, "grad_norm": 1.0442770249636188, "learning_rate": 6.384862225569584e-05, "loss": 0.3906, "step": 970 }, { "epoch": 0.8296296296296296, "grad_norm": 1.1520118825182077, "learning_rate": 6.320840540937196e-05, "loss": 0.3942, "step": 980 }, { "epoch": 0.8380952380952381, "grad_norm": 0.973580565603432, "learning_rate": 6.256585196533763e-05, "loss": 0.3976, "step": 990 }, { "epoch": 0.8465608465608465, "grad_norm": 1.0010129765253053, "learning_rate": 6.192107559279513e-05, "loss": 0.4132, "step": 1000 }, { "epoch": 0.855026455026455, "grad_norm": 1.0317834662297176, "learning_rate": 6.127419035418798e-05, "loss": 0.4006, "step": 1010 }, { "epoch": 0.8634920634920635, "grad_norm": 1.033103546662518, "learning_rate": 6.0625310685023006e-05, "loss": 0.4015, "step": 1020 }, { "epoch": 0.8719576719576719, "grad_norm": 0.9814116522619839, "learning_rate": 5.9974551373626456e-05, "loss": 0.4042, "step": 1030 }, { "epoch": 0.8804232804232804, "grad_norm": 0.9434073558115307, "learning_rate": 5.932202754083765e-05, "loss": 0.4155, "step": 1040 }, { "epoch": 0.8888888888888888, "grad_norm": 0.9305038165748942, "learning_rate": 5.8667854619643926e-05, "loss": 0.409, "step": 1050 }, { "epoch": 0.8973544973544973, "grad_norm": 0.9961689454529794, "learning_rate": 5.8012148334760077e-05, "loss": 0.4094, "step": 1060 }, { "epoch": 0.9058201058201059, "grad_norm": 1.1061454998502234, "learning_rate": 5.735502468215663e-05, "loss": 0.3936, "step": 1070 }, { "epoch": 0.9142857142857143, "grad_norm": 0.9207561081731734, "learning_rate": 5.669659990853975e-05, "loss": 0.4096, "step": 1080 }, { "epoch": 0.9227513227513228, "grad_norm": 1.105503110809865, "learning_rate": 5.603699049078685e-05, "loss": 0.3951, "step": 1090 }, { "epoch": 0.9312169312169312, "grad_norm": 1.0603861460237525, "learning_rate": 5.537631311534176e-05, "loss": 0.3881, "step": 1100 }, { "epoch": 0.9396825396825397, "grad_norm": 0.9136553453309776, "learning_rate": 5.4714684657572456e-05, "loss": 0.4021, "step": 1110 }, { "epoch": 0.9481481481481482, "grad_norm": 1.0267048307590627, "learning_rate": 5.40522221610956e-05, "loss": 0.4026, "step": 1120 }, { "epoch": 0.9566137566137566, "grad_norm": 1.1963746364272148, "learning_rate": 5.338904281707128e-05, "loss": 0.3758, "step": 1130 }, { "epoch": 0.9650793650793651, "grad_norm": 0.9462817093916603, "learning_rate": 5.272526394347156e-05, "loss": 0.4041, "step": 1140 }, { "epoch": 0.9735449735449735, "grad_norm": 0.9508846912892318, "learning_rate": 5.2061002964326655e-05, "loss": 0.4102, "step": 1150 }, { "epoch": 0.982010582010582, "grad_norm": 0.9999646043761261, "learning_rate": 5.139637738895243e-05, "loss": 0.3942, "step": 1160 }, { "epoch": 0.9904761904761905, "grad_norm": 0.9017617882068949, "learning_rate": 5.0731504791162645e-05, "loss": 0.3873, "step": 1170 }, { "epoch": 0.9989417989417989, "grad_norm": 1.0252778174062445, "learning_rate": 5.006650278846991e-05, "loss": 0.3813, "step": 1180 }, { "epoch": 1.0074074074074073, "grad_norm": 1.0599548491574322, "learning_rate": 4.9401489021278844e-05, "loss": 0.3662, "step": 1190 }, { "epoch": 1.0158730158730158, "grad_norm": 1.218509380892567, "learning_rate": 4.873658113207526e-05, "loss": 0.3422, "step": 1200 }, { "epoch": 1.0243386243386243, "grad_norm": 1.0795258633356928, "learning_rate": 4.807189674461489e-05, "loss": 0.3386, "step": 1210 }, { "epoch": 1.0328042328042328, "grad_norm": 0.9886890483794867, "learning_rate": 4.740755344311549e-05, "loss": 0.3307, "step": 1220 }, { "epoch": 1.0412698412698413, "grad_norm": 1.050177935297077, "learning_rate": 4.6743668751455935e-05, "loss": 0.3137, "step": 1230 }, { "epoch": 1.0497354497354496, "grad_norm": 1.0381034500976596, "learning_rate": 4.6080360112385914e-05, "loss": 0.3381, "step": 1240 }, { "epoch": 1.0582010582010581, "grad_norm": 1.2730980092196418, "learning_rate": 4.5417744866750096e-05, "loss": 0.3166, "step": 1250 }, { "epoch": 1.0666666666666667, "grad_norm": 1.2352467183360096, "learning_rate": 4.475594023273024e-05, "loss": 0.3264, "step": 1260 }, { "epoch": 1.0751322751322752, "grad_norm": 1.0837500411632388, "learning_rate": 4.4095063285108975e-05, "loss": 0.3207, "step": 1270 }, { "epoch": 1.0835978835978837, "grad_norm": 1.0653199477181592, "learning_rate": 4.343523093455909e-05, "loss": 0.3232, "step": 1280 }, { "epoch": 1.0920634920634922, "grad_norm": 1.2017002126624405, "learning_rate": 4.277655990696171e-05, "loss": 0.3189, "step": 1290 }, { "epoch": 1.1005291005291005, "grad_norm": 1.2016108585957874, "learning_rate": 4.211916672275722e-05, "loss": 0.3305, "step": 1300 }, { "epoch": 1.108994708994709, "grad_norm": 0.9929918539213213, "learning_rate": 4.14631676763325e-05, "loss": 0.3061, "step": 1310 }, { "epoch": 1.1174603174603175, "grad_norm": 1.1653749824747823, "learning_rate": 4.080867881544826e-05, "loss": 0.3182, "step": 1320 }, { "epoch": 1.125925925925926, "grad_norm": 1.2857542752538433, "learning_rate": 4.0155815920709825e-05, "loss": 0.3234, "step": 1330 }, { "epoch": 1.1343915343915345, "grad_norm": 1.1562023839114728, "learning_rate": 3.950469448508532e-05, "loss": 0.3143, "step": 1340 }, { "epoch": 1.1428571428571428, "grad_norm": 1.1140077209285717, "learning_rate": 3.885542969347472e-05, "loss": 0.2943, "step": 1350 }, { "epoch": 1.1513227513227513, "grad_norm": 1.3253628791792769, "learning_rate": 3.8208136402333314e-05, "loss": 0.3452, "step": 1360 }, { "epoch": 1.1597883597883598, "grad_norm": 1.1124786786593033, "learning_rate": 3.756292911935339e-05, "loss": 0.3371, "step": 1370 }, { "epoch": 1.1682539682539683, "grad_norm": 1.199051505503681, "learning_rate": 3.691992198320753e-05, "loss": 0.3602, "step": 1380 }, { "epoch": 1.1767195767195768, "grad_norm": 1.1816565721598766, "learning_rate": 3.627922874335716e-05, "loss": 0.3441, "step": 1390 }, { "epoch": 1.1851851851851851, "grad_norm": 1.1528028837948154, "learning_rate": 3.564096273993012e-05, "loss": 0.3354, "step": 1400 }, { "epoch": 1.1936507936507936, "grad_norm": 1.1065642388382553, "learning_rate": 3.5005236883670436e-05, "loss": 0.3161, "step": 1410 }, { "epoch": 1.2021164021164021, "grad_norm": 1.2712527045188007, "learning_rate": 3.437216363596418e-05, "loss": 0.3304, "step": 1420 }, { "epoch": 1.2105820105820106, "grad_norm": 1.14541451693123, "learning_rate": 3.37418549889448e-05, "loss": 0.3302, "step": 1430 }, { "epoch": 1.2190476190476192, "grad_norm": 1.0859928515093296, "learning_rate": 3.3114422445681425e-05, "loss": 0.3107, "step": 1440 }, { "epoch": 1.2275132275132274, "grad_norm": 1.2559757962594507, "learning_rate": 3.2489977000453745e-05, "loss": 0.3424, "step": 1450 }, { "epoch": 1.235978835978836, "grad_norm": 1.1302399638013196, "learning_rate": 3.1868629119116794e-05, "loss": 0.3225, "step": 1460 }, { "epoch": 1.2444444444444445, "grad_norm": 1.182346821657365, "learning_rate": 3.1250488719559383e-05, "loss": 0.3232, "step": 1470 }, { "epoch": 1.252910052910053, "grad_norm": 1.2327829899094018, "learning_rate": 3.0635665152259295e-05, "loss": 0.3288, "step": 1480 }, { "epoch": 1.2613756613756615, "grad_norm": 1.1858722427961588, "learning_rate": 3.002426718093897e-05, "loss": 0.3309, "step": 1490 }, { "epoch": 1.2698412698412698, "grad_norm": 1.0878551421788079, "learning_rate": 2.9416402963324896e-05, "loss": 0.3339, "step": 1500 }, { "epoch": 1.2783068783068783, "grad_norm": 1.3522545182756271, "learning_rate": 2.881218003201437e-05, "loss": 0.3191, "step": 1510 }, { "epoch": 1.2867724867724868, "grad_norm": 1.1604905763699422, "learning_rate": 2.821170527545254e-05, "loss": 0.3191, "step": 1520 }, { "epoch": 1.2952380952380953, "grad_norm": 1.0896975112303264, "learning_rate": 2.7615084919023794e-05, "loss": 0.3278, "step": 1530 }, { "epoch": 1.3037037037037038, "grad_norm": 1.2783898410901267, "learning_rate": 2.7022424506260113e-05, "loss": 0.3552, "step": 1540 }, { "epoch": 1.312169312169312, "grad_norm": 1.2412629592249576, "learning_rate": 2.643382888017022e-05, "loss": 0.3177, "step": 1550 }, { "epoch": 1.3206349206349206, "grad_norm": 1.182083170394083, "learning_rate": 2.5849402164692593e-05, "loss": 0.352, "step": 1560 }, { "epoch": 1.3291005291005291, "grad_norm": 1.2384377434425395, "learning_rate": 2.5269247746275716e-05, "loss": 0.3186, "step": 1570 }, { "epoch": 1.3375661375661376, "grad_norm": 1.1317133426240273, "learning_rate": 2.4693468255588752e-05, "loss": 0.3143, "step": 1580 }, { "epoch": 1.3460317460317461, "grad_norm": 1.127659032580244, "learning_rate": 2.412216554936595e-05, "loss": 0.3043, "step": 1590 }, { "epoch": 1.3544973544973544, "grad_norm": 1.1270117058544764, "learning_rate": 2.3555440692387947e-05, "loss": 0.3122, "step": 1600 }, { "epoch": 1.362962962962963, "grad_norm": 1.1012994404894896, "learning_rate": 2.2993393939603186e-05, "loss": 0.2964, "step": 1610 }, { "epoch": 1.3714285714285714, "grad_norm": 1.0779235542307226, "learning_rate": 2.2436124718392565e-05, "loss": 0.3206, "step": 1620 }, { "epoch": 1.37989417989418, "grad_norm": 1.169194411783028, "learning_rate": 2.1883731610980518e-05, "loss": 0.3057, "step": 1630 }, { "epoch": 1.3883597883597885, "grad_norm": 1.073218506562628, "learning_rate": 2.1336312336995572e-05, "loss": 0.3196, "step": 1640 }, { "epoch": 1.3968253968253967, "grad_norm": 1.1448212371915947, "learning_rate": 2.0793963736183507e-05, "loss": 0.3023, "step": 1650 }, { "epoch": 1.4052910052910053, "grad_norm": 1.3562963055025201, "learning_rate": 2.025678175127623e-05, "loss": 0.3344, "step": 1660 }, { "epoch": 1.4137566137566138, "grad_norm": 1.2735177478315347, "learning_rate": 1.972486141101922e-05, "loss": 0.3224, "step": 1670 }, { "epoch": 1.4222222222222223, "grad_norm": 1.1826426943582573, "learning_rate": 1.9198296813360777e-05, "loss": 0.3279, "step": 1680 }, { "epoch": 1.4306878306878308, "grad_norm": 1.1291458767171278, "learning_rate": 1.8677181108805823e-05, "loss": 0.3126, "step": 1690 }, { "epoch": 1.439153439153439, "grad_norm": 1.2660744922449119, "learning_rate": 1.8161606483937433e-05, "loss": 0.3335, "step": 1700 }, { "epoch": 1.4476190476190476, "grad_norm": 1.0518342184370524, "learning_rate": 1.7651664145108725e-05, "loss": 0.3149, "step": 1710 }, { "epoch": 1.456084656084656, "grad_norm": 1.2016784882503588, "learning_rate": 1.7147444302308273e-05, "loss": 0.3243, "step": 1720 }, { "epoch": 1.4645502645502646, "grad_norm": 1.1815033136893196, "learning_rate": 1.6649036153201797e-05, "loss": 0.3219, "step": 1730 }, { "epoch": 1.4730158730158731, "grad_norm": 1.1655388711655206, "learning_rate": 1.6156527867352845e-05, "loss": 0.3187, "step": 1740 }, { "epoch": 1.4814814814814814, "grad_norm": 1.2317123234341782, "learning_rate": 1.5670006570625396e-05, "loss": 0.2999, "step": 1750 }, { "epoch": 1.48994708994709, "grad_norm": 1.2768357746714913, "learning_rate": 1.5189558329771092e-05, "loss": 0.3237, "step": 1760 }, { "epoch": 1.4984126984126984, "grad_norm": 1.1451972982515424, "learning_rate": 1.4715268137203852e-05, "loss": 0.296, "step": 1770 }, { "epoch": 1.506878306878307, "grad_norm": 1.2388164845125278, "learning_rate": 1.4247219895964458e-05, "loss": 0.337, "step": 1780 }, { "epoch": 1.5153439153439154, "grad_norm": 1.196613109479644, "learning_rate": 1.3785496404877967e-05, "loss": 0.3117, "step": 1790 }, { "epoch": 1.5238095238095237, "grad_norm": 1.0738735120315672, "learning_rate": 1.3330179343906424e-05, "loss": 0.3236, "step": 1800 }, { "epoch": 1.5322751322751322, "grad_norm": 1.1319149412117087, "learning_rate": 1.2881349259699416e-05, "loss": 0.309, "step": 1810 }, { "epoch": 1.5407407407407407, "grad_norm": 1.1664826422000727, "learning_rate": 1.2439085551345209e-05, "loss": 0.3251, "step": 1820 }, { "epoch": 1.5492063492063493, "grad_norm": 1.1591594193359072, "learning_rate": 1.2003466456324907e-05, "loss": 0.2985, "step": 1830 }, { "epoch": 1.5576719576719578, "grad_norm": 1.2020574172279288, "learning_rate": 1.1574569036671978e-05, "loss": 0.3269, "step": 1840 }, { "epoch": 1.566137566137566, "grad_norm": 1.1386243506460356, "learning_rate": 1.1152469165339885e-05, "loss": 0.305, "step": 1850 }, { "epoch": 1.5746031746031746, "grad_norm": 1.085874089650789, "learning_rate": 1.0737241512779928e-05, "loss": 0.3173, "step": 1860 }, { "epoch": 1.583068783068783, "grad_norm": 1.197846294368878, "learning_rate": 1.0328959533731886e-05, "loss": 0.3167, "step": 1870 }, { "epoch": 1.5915343915343916, "grad_norm": 1.1368052438876846, "learning_rate": 9.927695454229713e-06, "loss": 0.2983, "step": 1880 }, { "epoch": 1.6, "grad_norm": 1.121353872740907, "learning_rate": 9.533520258824557e-06, "loss": 0.3074, "step": 1890 }, { "epoch": 1.6084656084656084, "grad_norm": 1.3293469389715453, "learning_rate": 9.146503678027451e-06, "loss": 0.312, "step": 1900 }, { "epoch": 1.6169312169312169, "grad_norm": 1.1026868387450763, "learning_rate": 8.766714175973817e-06, "loss": 0.2977, "step": 1910 }, { "epoch": 1.6253968253968254, "grad_norm": 1.1989702128623307, "learning_rate": 8.394218938311982e-06, "loss": 0.3064, "step": 1920 }, { "epoch": 1.633862433862434, "grad_norm": 1.2736653421212538, "learning_rate": 8.029083860317915e-06, "loss": 0.3128, "step": 1930 }, { "epoch": 1.6423280423280424, "grad_norm": 1.1811892666540489, "learning_rate": 7.671373535238168e-06, "loss": 0.3331, "step": 1940 }, { "epoch": 1.6507936507936507, "grad_norm": 1.251659796122848, "learning_rate": 7.3211512428632115e-06, "loss": 0.3035, "step": 1950 }, { "epoch": 1.6592592592592592, "grad_norm": 1.0585624571230603, "learning_rate": 6.978478938333055e-06, "loss": 0.2911, "step": 1960 }, { "epoch": 1.6677248677248677, "grad_norm": 1.0285014490372995, "learning_rate": 6.643417241177252e-06, "loss": 0.299, "step": 1970 }, { "epoch": 1.6761904761904762, "grad_norm": 1.2077574437861864, "learning_rate": 6.3160254245911825e-06, "loss": 0.3086, "step": 1980 }, { "epoch": 1.6846560846560847, "grad_norm": 1.1940524793802747, "learning_rate": 5.9963614049504625e-06, "loss": 0.2972, "step": 1990 }, { "epoch": 1.693121693121693, "grad_norm": 1.2956482047870082, "learning_rate": 5.684481731565433e-06, "loss": 0.2978, "step": 2000 }, { "epoch": 1.7015873015873015, "grad_norm": 1.07352837871318, "learning_rate": 5.380441576677409e-06, "loss": 0.302, "step": 2010 }, { "epoch": 1.71005291005291, "grad_norm": 1.1429522648599941, "learning_rate": 5.0842947256986014e-06, "loss": 0.3171, "step": 2020 }, { "epoch": 1.7185185185185186, "grad_norm": 1.2683501315969479, "learning_rate": 4.796093567697413e-06, "loss": 0.3059, "step": 2030 }, { "epoch": 1.726984126984127, "grad_norm": 1.134598484064839, "learning_rate": 4.515889086130565e-06, "loss": 0.311, "step": 2040 }, { "epoch": 1.7354497354497354, "grad_norm": 1.2457646181888378, "learning_rate": 4.243730849824101e-06, "loss": 0.2849, "step": 2050 }, { "epoch": 1.7439153439153439, "grad_norm": 1.2046362639691561, "learning_rate": 3.979667004204535e-06, "loss": 0.2962, "step": 2060 }, { "epoch": 1.7523809523809524, "grad_norm": 1.1899110108249271, "learning_rate": 3.7237442627817753e-06, "loss": 0.3174, "step": 2070 }, { "epoch": 1.7608465608465609, "grad_norm": 1.1957979006872952, "learning_rate": 3.4760078988854305e-06, "loss": 0.2868, "step": 2080 }, { "epoch": 1.7693121693121694, "grad_norm": 1.0992839797811276, "learning_rate": 3.236501737655845e-06, "loss": 0.3062, "step": 2090 }, { "epoch": 1.7777777777777777, "grad_norm": 1.2094059824739993, "learning_rate": 3.005268148291296e-06, "loss": 0.3024, "step": 2100 }, { "epoch": 1.7862433862433862, "grad_norm": 1.2877529846662792, "learning_rate": 2.7823480365528345e-06, "loss": 0.3109, "step": 2110 }, { "epoch": 1.7947089947089947, "grad_norm": 1.0626267084104033, "learning_rate": 2.5677808375279443e-06, "loss": 0.2933, "step": 2120 }, { "epoch": 1.8031746031746032, "grad_norm": 1.2728324345583901, "learning_rate": 2.3616045086543637e-06, "loss": 0.313, "step": 2130 }, { "epoch": 1.8116402116402117, "grad_norm": 1.1679924754572557, "learning_rate": 2.1638555230053315e-06, "loss": 0.3177, "step": 2140 }, { "epoch": 1.82010582010582, "grad_norm": 1.2780416859602457, "learning_rate": 1.974568862837445e-06, "loss": 0.3161, "step": 2150 }, { "epoch": 1.8285714285714287, "grad_norm": 1.1373367274657828, "learning_rate": 1.7937780134021808e-06, "loss": 0.3064, "step": 2160 }, { "epoch": 1.837037037037037, "grad_norm": 1.371545482098203, "learning_rate": 1.62151495702228e-06, "loss": 0.2827, "step": 2170 }, { "epoch": 1.8455026455026455, "grad_norm": 1.2124638052571406, "learning_rate": 1.4578101674340372e-06, "loss": 0.2928, "step": 2180 }, { "epoch": 1.853968253968254, "grad_norm": 1.3282762207991974, "learning_rate": 1.3026926043963717e-06, "loss": 0.312, "step": 2190 }, { "epoch": 1.8624338624338623, "grad_norm": 1.2169277319430767, "learning_rate": 1.1561897085678341e-06, "loss": 0.3267, "step": 2200 }, { "epoch": 1.870899470899471, "grad_norm": 1.283700664779273, "learning_rate": 1.0183273966522356e-06, "loss": 0.302, "step": 2210 }, { "epoch": 1.8793650793650793, "grad_norm": 1.1764816717143525, "learning_rate": 8.891300568139638e-07, "loss": 0.2905, "step": 2220 }, { "epoch": 1.8878306878306879, "grad_norm": 1.1049909165614302, "learning_rate": 7.686205443636407e-07, "loss": 0.3135, "step": 2230 }, { "epoch": 1.8962962962962964, "grad_norm": 1.1865755254945776, "learning_rate": 6.568201777149729e-07, "loss": 0.2974, "step": 2240 }, { "epoch": 1.9047619047619047, "grad_norm": 1.2726871875485801, "learning_rate": 5.537487346134629e-07, "loss": 0.2736, "step": 2250 }, { "epoch": 1.9132275132275134, "grad_norm": 1.1770066603089386, "learning_rate": 4.594244486376964e-07, "loss": 0.3008, "step": 2260 }, { "epoch": 1.9216931216931217, "grad_norm": 1.1745505888575511, "learning_rate": 3.738640059737619e-07, "loss": 0.317, "step": 2270 }, { "epoch": 1.9301587301587302, "grad_norm": 1.2261734009974063, "learning_rate": 2.9708254246344494e-07, "loss": 0.295, "step": 2280 }, { "epoch": 1.9386243386243387, "grad_norm": 1.2436623050786833, "learning_rate": 2.2909364092663643e-07, "loss": 0.3023, "step": 2290 }, { "epoch": 1.947089947089947, "grad_norm": 1.2153390698718014, "learning_rate": 1.699093287585274e-07, "loss": 0.3012, "step": 2300 }, { "epoch": 1.9555555555555557, "grad_norm": 1.21126029855381, "learning_rate": 1.1954007580192718e-07, "loss": 0.3165, "step": 2310 }, { "epoch": 1.964021164021164, "grad_norm": 1.1700440086578683, "learning_rate": 7.799479249510633e-08, "loss": 0.3153, "step": 2320 }, { "epoch": 1.9724867724867725, "grad_norm": 1.2008513089503834, "learning_rate": 4.528082829553526e-08, "loss": 0.2764, "step": 2330 }, { "epoch": 1.980952380952381, "grad_norm": 1.096133202492879, "learning_rate": 2.14039703797575e-08, "loss": 0.3095, "step": 2340 }, { "epoch": 1.9894179894179893, "grad_norm": 1.1416782493668638, "learning_rate": 6.36844261959757e-09, "loss": 0.2999, "step": 2350 }, { "epoch": 1.997883597883598, "grad_norm": 1.2624410334213494, "learning_rate": 1.7690483496979326e-10, "loss": 0.3005, "step": 2360 }, { "epoch": 1.9995767195767196, "step": 2362, "total_flos": 434751562907648.0, "train_loss": 0.3921900775147938, "train_runtime": 21626.3872, "train_samples_per_second": 0.874, "train_steps_per_second": 0.109 } ], "logging_steps": 10, "max_steps": 2362, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 434751562907648.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }