{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6700827749310209, "eval_steps": 500, "global_step": 1700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.999997870262096e-05, "loss": 2.714, "step": 5 }, { "epoch": 0.0, "learning_rate": 1.999991481057455e-05, "loss": 2.7679, "step": 10 }, { "epoch": 0.01, "learning_rate": 1.9999808324132915e-05, "loss": 2.6887, "step": 15 }, { "epoch": 0.01, "learning_rate": 1.999965924374964e-05, "loss": 2.6986, "step": 20 }, { "epoch": 0.01, "learning_rate": 1.999946757005972e-05, "loss": 2.7536, "step": 25 }, { "epoch": 0.01, "learning_rate": 1.9999233303879592e-05, "loss": 2.7516, "step": 30 }, { "epoch": 0.01, "learning_rate": 1.99989564462071e-05, "loss": 2.6283, "step": 35 }, { "epoch": 0.02, "learning_rate": 1.999863699822152e-05, "loss": 2.708, "step": 40 }, { "epoch": 0.02, "learning_rate": 1.9998274961283523e-05, "loss": 2.6932, "step": 45 }, { "epoch": 0.02, "learning_rate": 1.9997870336935207e-05, "loss": 2.6321, "step": 50 }, { "epoch": 0.02, "learning_rate": 1.9997423126900056e-05, "loss": 2.7129, "step": 55 }, { "epoch": 0.02, "learning_rate": 1.9996933333082945e-05, "loss": 2.6179, "step": 60 }, { "epoch": 0.03, "learning_rate": 1.9996400957570148e-05, "loss": 2.6052, "step": 65 }, { "epoch": 0.03, "learning_rate": 1.99958260026293e-05, "loss": 2.7054, "step": 70 }, { "epoch": 0.03, "learning_rate": 1.9995208470709405e-05, "loss": 2.6568, "step": 75 }, { "epoch": 0.03, "learning_rate": 1.9994548364440836e-05, "loss": 2.6436, "step": 80 }, { "epoch": 0.03, "learning_rate": 1.999384568663529e-05, "loss": 2.7091, "step": 85 }, { "epoch": 0.04, "learning_rate": 1.9993100440285805e-05, "loss": 2.5557, "step": 90 }, { "epoch": 0.04, "learning_rate": 1.999231262856675e-05, "loss": 2.5945, "step": 95 }, { "epoch": 0.04, "learning_rate": 1.999148225483378e-05, "loss": 2.4729, "step": 100 }, { "epoch": 0.04, "learning_rate": 1.9990609322623854e-05, "loss": 2.6724, "step": 105 }, { "epoch": 0.04, "learning_rate": 1.9989693835655205e-05, "loss": 2.5272, "step": 110 }, { "epoch": 0.05, "learning_rate": 1.9988735797827336e-05, "loss": 2.6145, "step": 115 }, { "epoch": 0.05, "learning_rate": 1.9987735213220975e-05, "loss": 2.6772, "step": 120 }, { "epoch": 0.05, "learning_rate": 1.9986692086098095e-05, "loss": 2.6388, "step": 125 }, { "epoch": 0.05, "learning_rate": 1.998560642090187e-05, "loss": 2.4973, "step": 130 }, { "epoch": 0.05, "learning_rate": 1.998447822225666e-05, "loss": 2.5338, "step": 135 }, { "epoch": 0.06, "learning_rate": 1.9983307494968e-05, "loss": 2.6469, "step": 140 }, { "epoch": 0.06, "learning_rate": 1.9982094244022582e-05, "loss": 2.6008, "step": 145 }, { "epoch": 0.06, "learning_rate": 1.9980838474588214e-05, "loss": 2.5936, "step": 150 }, { "epoch": 0.06, "learning_rate": 1.9979540192013814e-05, "loss": 2.5862, "step": 155 }, { "epoch": 0.06, "learning_rate": 1.997819940182939e-05, "loss": 2.568, "step": 160 }, { "epoch": 0.07, "learning_rate": 1.9976816109746e-05, "loss": 2.5343, "step": 165 }, { "epoch": 0.07, "learning_rate": 1.9975390321655745e-05, "loss": 2.5575, "step": 170 }, { "epoch": 0.07, "learning_rate": 1.9973922043631737e-05, "loss": 2.5399, "step": 175 }, { "epoch": 0.07, "learning_rate": 1.9972411281928068e-05, "loss": 2.6394, "step": 180 }, { "epoch": 0.07, "learning_rate": 1.9970858042979794e-05, "loss": 2.5378, "step": 185 }, { "epoch": 0.07, "learning_rate": 1.9969262333402893e-05, "loss": 2.5373, "step": 190 }, { "epoch": 0.08, "learning_rate": 1.9967624159994262e-05, "loss": 2.5414, "step": 195 }, { "epoch": 0.08, "learning_rate": 1.9965943529731646e-05, "loss": 2.5606, "step": 200 }, { "epoch": 0.08, "learning_rate": 1.9964220449773664e-05, "loss": 2.5154, "step": 205 }, { "epoch": 0.08, "learning_rate": 1.9962454927459723e-05, "loss": 2.5468, "step": 210 }, { "epoch": 0.08, "learning_rate": 1.9960646970310027e-05, "loss": 2.5137, "step": 215 }, { "epoch": 0.09, "learning_rate": 1.9958796586025527e-05, "loss": 2.5453, "step": 220 }, { "epoch": 0.09, "learning_rate": 1.9956903782487885e-05, "loss": 2.5325, "step": 225 }, { "epoch": 0.09, "learning_rate": 1.9954968567759456e-05, "loss": 2.5054, "step": 230 }, { "epoch": 0.09, "learning_rate": 1.9952990950083236e-05, "loss": 2.5139, "step": 235 }, { "epoch": 0.09, "learning_rate": 1.995097093788285e-05, "loss": 2.5097, "step": 240 }, { "epoch": 0.1, "learning_rate": 1.994890853976248e-05, "loss": 2.5546, "step": 245 }, { "epoch": 0.1, "learning_rate": 1.994680376450686e-05, "loss": 2.4609, "step": 250 }, { "epoch": 0.1, "learning_rate": 1.994465662108124e-05, "loss": 2.5063, "step": 255 }, { "epoch": 0.1, "learning_rate": 1.9942467118631322e-05, "loss": 2.5162, "step": 260 }, { "epoch": 0.1, "learning_rate": 1.994023526648323e-05, "loss": 2.5317, "step": 265 }, { "epoch": 0.11, "learning_rate": 1.9937961074143492e-05, "loss": 2.4621, "step": 270 }, { "epoch": 0.11, "learning_rate": 1.9935644551298976e-05, "loss": 2.5879, "step": 275 }, { "epoch": 0.11, "learning_rate": 1.993328570781685e-05, "loss": 2.4742, "step": 280 }, { "epoch": 0.11, "learning_rate": 1.993088455374456e-05, "loss": 2.4661, "step": 285 }, { "epoch": 0.11, "learning_rate": 1.992844109930975e-05, "loss": 2.4943, "step": 290 }, { "epoch": 0.12, "learning_rate": 1.9925955354920265e-05, "loss": 2.4322, "step": 295 }, { "epoch": 0.12, "learning_rate": 1.9923427331164072e-05, "loss": 2.484, "step": 300 }, { "epoch": 0.12, "learning_rate": 1.9920857038809223e-05, "loss": 2.5334, "step": 305 }, { "epoch": 0.12, "learning_rate": 1.991824448880382e-05, "loss": 2.6554, "step": 310 }, { "epoch": 0.12, "learning_rate": 1.9915589692275955e-05, "loss": 2.4749, "step": 315 }, { "epoch": 0.13, "learning_rate": 1.991289266053367e-05, "loss": 2.5076, "step": 320 }, { "epoch": 0.13, "learning_rate": 1.9910153405064904e-05, "loss": 2.4635, "step": 325 }, { "epoch": 0.13, "learning_rate": 1.990737193753745e-05, "loss": 2.5443, "step": 330 }, { "epoch": 0.13, "learning_rate": 1.9904548269798906e-05, "loss": 2.6012, "step": 335 }, { "epoch": 0.13, "learning_rate": 1.990168241387662e-05, "loss": 2.4818, "step": 340 }, { "epoch": 0.14, "learning_rate": 1.9898774381977618e-05, "loss": 2.4717, "step": 345 }, { "epoch": 0.14, "learning_rate": 1.989582418648861e-05, "loss": 2.4756, "step": 350 }, { "epoch": 0.14, "learning_rate": 1.9892831839975874e-05, "loss": 2.4868, "step": 355 }, { "epoch": 0.14, "learning_rate": 1.9889797355185237e-05, "loss": 2.5652, "step": 360 }, { "epoch": 0.14, "learning_rate": 1.9886720745042017e-05, "loss": 2.3907, "step": 365 }, { "epoch": 0.15, "learning_rate": 1.988360202265096e-05, "loss": 2.4984, "step": 370 }, { "epoch": 0.15, "learning_rate": 1.9880441201296186e-05, "loss": 2.5654, "step": 375 }, { "epoch": 0.15, "learning_rate": 1.987723829444114e-05, "loss": 2.581, "step": 380 }, { "epoch": 0.15, "learning_rate": 1.9873993315728523e-05, "loss": 2.5189, "step": 385 }, { "epoch": 0.15, "learning_rate": 1.987070627898025e-05, "loss": 2.491, "step": 390 }, { "epoch": 0.16, "learning_rate": 1.9867377198197367e-05, "loss": 2.4875, "step": 395 }, { "epoch": 0.16, "learning_rate": 1.9864006087560016e-05, "loss": 2.4828, "step": 400 }, { "epoch": 0.16, "learning_rate": 1.9860592961427358e-05, "loss": 2.5298, "step": 405 }, { "epoch": 0.16, "learning_rate": 1.9857137834337527e-05, "loss": 2.4873, "step": 410 }, { "epoch": 0.16, "learning_rate": 1.985364072100755e-05, "loss": 2.5371, "step": 415 }, { "epoch": 0.17, "learning_rate": 1.98501016363333e-05, "loss": 2.4167, "step": 420 }, { "epoch": 0.17, "learning_rate": 1.9846520595389415e-05, "loss": 2.4312, "step": 425 }, { "epoch": 0.17, "learning_rate": 1.984289761342926e-05, "loss": 2.4471, "step": 430 }, { "epoch": 0.17, "learning_rate": 1.9839232705884836e-05, "loss": 2.4971, "step": 435 }, { "epoch": 0.17, "learning_rate": 1.9835525888366727e-05, "loss": 2.5206, "step": 440 }, { "epoch": 0.18, "learning_rate": 1.9831777176664035e-05, "loss": 2.5067, "step": 445 }, { "epoch": 0.18, "learning_rate": 1.9827986586744302e-05, "loss": 2.4903, "step": 450 }, { "epoch": 0.18, "learning_rate": 1.982415413475346e-05, "loss": 2.4391, "step": 455 }, { "epoch": 0.18, "learning_rate": 1.9820279837015742e-05, "loss": 2.4358, "step": 460 }, { "epoch": 0.18, "learning_rate": 1.981636371003363e-05, "loss": 2.4809, "step": 465 }, { "epoch": 0.19, "learning_rate": 1.9812405770487763e-05, "loss": 2.516, "step": 470 }, { "epoch": 0.19, "learning_rate": 1.9808406035236897e-05, "loss": 2.4091, "step": 475 }, { "epoch": 0.19, "learning_rate": 1.9804364521317806e-05, "loss": 2.5363, "step": 480 }, { "epoch": 0.19, "learning_rate": 1.9800281245945217e-05, "loss": 2.51, "step": 485 }, { "epoch": 0.19, "learning_rate": 1.9796156226511747e-05, "loss": 2.5049, "step": 490 }, { "epoch": 0.2, "learning_rate": 1.9791989480587815e-05, "loss": 2.4722, "step": 495 }, { "epoch": 0.2, "learning_rate": 1.978778102592157e-05, "loss": 2.4111, "step": 500 }, { "epoch": 0.2, "learning_rate": 1.9783530880438832e-05, "loss": 2.4078, "step": 505 }, { "epoch": 0.2, "learning_rate": 1.9779239062242988e-05, "loss": 2.3981, "step": 510 }, { "epoch": 0.2, "learning_rate": 1.9774905589614935e-05, "loss": 2.5459, "step": 515 }, { "epoch": 0.2, "learning_rate": 1.977053048101299e-05, "loss": 2.4432, "step": 520 }, { "epoch": 0.21, "learning_rate": 1.976611375507283e-05, "loss": 2.5329, "step": 525 }, { "epoch": 0.21, "learning_rate": 1.9761655430607384e-05, "loss": 2.5107, "step": 530 }, { "epoch": 0.21, "learning_rate": 1.975715552660678e-05, "loss": 2.4389, "step": 535 }, { "epoch": 0.21, "learning_rate": 1.9752614062238256e-05, "loss": 2.4939, "step": 540 }, { "epoch": 0.21, "learning_rate": 1.974803105684606e-05, "loss": 2.5349, "step": 545 }, { "epoch": 0.22, "learning_rate": 1.9743406529951403e-05, "loss": 2.5268, "step": 550 }, { "epoch": 0.22, "learning_rate": 1.9738740501252337e-05, "loss": 2.4477, "step": 555 }, { "epoch": 0.22, "learning_rate": 1.9734032990623702e-05, "loss": 2.4597, "step": 560 }, { "epoch": 0.22, "learning_rate": 1.972928401811703e-05, "loss": 2.4938, "step": 565 }, { "epoch": 0.22, "learning_rate": 1.9724493603960443e-05, "loss": 2.4756, "step": 570 }, { "epoch": 0.23, "learning_rate": 1.9719661768558604e-05, "loss": 2.3664, "step": 575 }, { "epoch": 0.23, "learning_rate": 1.9714788532492595e-05, "loss": 2.5166, "step": 580 }, { "epoch": 0.23, "learning_rate": 1.9709873916519853e-05, "loss": 2.4143, "step": 585 }, { "epoch": 0.23, "learning_rate": 1.9704917941574053e-05, "loss": 2.4666, "step": 590 }, { "epoch": 0.23, "learning_rate": 1.9699920628765065e-05, "loss": 2.4617, "step": 595 }, { "epoch": 0.24, "learning_rate": 1.969488199937881e-05, "loss": 2.4912, "step": 600 }, { "epoch": 0.24, "learning_rate": 1.9689802074877216e-05, "loss": 2.4386, "step": 605 }, { "epoch": 0.24, "learning_rate": 1.9684680876898096e-05, "loss": 2.5002, "step": 610 }, { "epoch": 0.24, "learning_rate": 1.967951842725507e-05, "loss": 2.5668, "step": 615 }, { "epoch": 0.24, "learning_rate": 1.9674314747937462e-05, "loss": 2.4955, "step": 620 }, { "epoch": 0.25, "learning_rate": 1.9669069861110225e-05, "loss": 2.5317, "step": 625 }, { "epoch": 0.25, "learning_rate": 1.9663783789113827e-05, "loss": 2.4709, "step": 630 }, { "epoch": 0.25, "learning_rate": 1.9658456554464157e-05, "loss": 2.4066, "step": 635 }, { "epoch": 0.25, "learning_rate": 1.9653088179852448e-05, "loss": 2.4462, "step": 640 }, { "epoch": 0.25, "learning_rate": 1.9647678688145163e-05, "loss": 2.5093, "step": 645 }, { "epoch": 0.26, "learning_rate": 1.9642228102383894e-05, "loss": 2.4634, "step": 650 }, { "epoch": 0.26, "learning_rate": 1.9636736445785288e-05, "loss": 2.424, "step": 655 }, { "epoch": 0.26, "learning_rate": 1.963120374174092e-05, "loss": 2.4968, "step": 660 }, { "epoch": 0.26, "learning_rate": 1.9625630013817204e-05, "loss": 2.3947, "step": 665 }, { "epoch": 0.26, "learning_rate": 1.9620015285755306e-05, "loss": 2.3954, "step": 670 }, { "epoch": 0.27, "learning_rate": 1.961435958147102e-05, "loss": 2.3764, "step": 675 }, { "epoch": 0.27, "learning_rate": 1.9608662925054684e-05, "loss": 2.386, "step": 680 }, { "epoch": 0.27, "learning_rate": 1.960292534077107e-05, "loss": 2.4218, "step": 685 }, { "epoch": 0.27, "learning_rate": 1.9597146853059273e-05, "loss": 2.4479, "step": 690 }, { "epoch": 0.27, "learning_rate": 1.959132748653263e-05, "loss": 2.4301, "step": 695 }, { "epoch": 0.28, "learning_rate": 1.9585467265978585e-05, "loss": 2.5069, "step": 700 }, { "epoch": 0.28, "learning_rate": 1.957956621635861e-05, "loss": 2.4144, "step": 705 }, { "epoch": 0.28, "learning_rate": 1.9573624362808078e-05, "loss": 2.4491, "step": 710 }, { "epoch": 0.28, "learning_rate": 1.9567641730636174e-05, "loss": 2.4723, "step": 715 }, { "epoch": 0.28, "learning_rate": 1.956161834532578e-05, "loss": 2.4839, "step": 720 }, { "epoch": 0.29, "learning_rate": 1.955555423253335e-05, "loss": 2.4881, "step": 725 }, { "epoch": 0.29, "learning_rate": 1.9549449418088832e-05, "loss": 2.5076, "step": 730 }, { "epoch": 0.29, "learning_rate": 1.9543303927995536e-05, "loss": 2.4704, "step": 735 }, { "epoch": 0.29, "learning_rate": 1.9537117788430024e-05, "loss": 2.3958, "step": 740 }, { "epoch": 0.29, "learning_rate": 1.953089102574201e-05, "loss": 2.5256, "step": 745 }, { "epoch": 0.3, "learning_rate": 1.9524623666454243e-05, "loss": 2.4658, "step": 750 }, { "epoch": 0.3, "learning_rate": 1.951831573726238e-05, "loss": 2.4636, "step": 755 }, { "epoch": 0.3, "learning_rate": 1.9511967265034904e-05, "loss": 2.4249, "step": 760 }, { "epoch": 0.3, "learning_rate": 1.9505578276812964e-05, "loss": 2.5412, "step": 765 }, { "epoch": 0.3, "learning_rate": 1.9499148799810314e-05, "loss": 2.4355, "step": 770 }, { "epoch": 0.31, "learning_rate": 1.949267886141315e-05, "loss": 2.5814, "step": 775 }, { "epoch": 0.31, "learning_rate": 1.948616848918002e-05, "loss": 2.5195, "step": 780 }, { "epoch": 0.31, "learning_rate": 1.9479617710841693e-05, "loss": 2.5177, "step": 785 }, { "epoch": 0.31, "learning_rate": 1.9473026554301057e-05, "loss": 2.4418, "step": 790 }, { "epoch": 0.31, "learning_rate": 1.946639504763298e-05, "loss": 2.5187, "step": 795 }, { "epoch": 0.32, "learning_rate": 1.94597232190842e-05, "loss": 2.479, "step": 800 }, { "epoch": 0.32, "learning_rate": 1.9453011097073217e-05, "loss": 2.4451, "step": 805 }, { "epoch": 0.32, "learning_rate": 1.9446258710190152e-05, "loss": 2.5426, "step": 810 }, { "epoch": 0.32, "learning_rate": 1.9439466087196627e-05, "loss": 2.4299, "step": 815 }, { "epoch": 0.32, "learning_rate": 1.943263325702566e-05, "loss": 2.5317, "step": 820 }, { "epoch": 0.33, "learning_rate": 1.9425760248781525e-05, "loss": 2.4384, "step": 825 }, { "epoch": 0.33, "learning_rate": 1.9418847091739638e-05, "loss": 2.4445, "step": 830 }, { "epoch": 0.33, "learning_rate": 1.9411893815346418e-05, "loss": 2.4872, "step": 835 }, { "epoch": 0.33, "learning_rate": 1.9404900449219178e-05, "loss": 2.5123, "step": 840 }, { "epoch": 0.33, "learning_rate": 1.9397867023146e-05, "loss": 2.3569, "step": 845 }, { "epoch": 0.34, "learning_rate": 1.9390793567085585e-05, "loss": 2.467, "step": 850 }, { "epoch": 0.34, "learning_rate": 1.9383680111167146e-05, "loss": 2.4097, "step": 855 }, { "epoch": 0.34, "learning_rate": 1.937652668569028e-05, "loss": 2.4653, "step": 860 }, { "epoch": 0.34, "learning_rate": 1.9369333321124832e-05, "loss": 2.4164, "step": 865 }, { "epoch": 0.34, "learning_rate": 1.936210004811076e-05, "loss": 2.3906, "step": 870 }, { "epoch": 0.34, "learning_rate": 1.9354826897458016e-05, "loss": 2.4433, "step": 875 }, { "epoch": 0.35, "learning_rate": 1.9347513900146412e-05, "loss": 2.42, "step": 880 }, { "epoch": 0.35, "learning_rate": 1.9340161087325483e-05, "loss": 2.4606, "step": 885 }, { "epoch": 0.35, "learning_rate": 1.9332768490314354e-05, "loss": 2.4242, "step": 890 }, { "epoch": 0.35, "learning_rate": 1.9325336140601612e-05, "loss": 2.461, "step": 895 }, { "epoch": 0.35, "learning_rate": 1.931786406984518e-05, "loss": 2.3949, "step": 900 }, { "epoch": 0.36, "learning_rate": 1.9310352309872153e-05, "loss": 2.4449, "step": 905 }, { "epoch": 0.36, "learning_rate": 1.9302800892678693e-05, "loss": 2.4864, "step": 910 }, { "epoch": 0.36, "learning_rate": 1.9295209850429884e-05, "loss": 2.5432, "step": 915 }, { "epoch": 0.36, "learning_rate": 1.9287579215459585e-05, "loss": 2.4357, "step": 920 }, { "epoch": 0.36, "learning_rate": 1.9279909020270294e-05, "loss": 2.4625, "step": 925 }, { "epoch": 0.37, "learning_rate": 1.9272199297533027e-05, "loss": 2.3937, "step": 930 }, { "epoch": 0.37, "learning_rate": 1.9264450080087163e-05, "loss": 2.5051, "step": 935 }, { "epoch": 0.37, "learning_rate": 1.9256661400940303e-05, "loss": 2.4894, "step": 940 }, { "epoch": 0.37, "learning_rate": 1.9248833293268144e-05, "loss": 2.4117, "step": 945 }, { "epoch": 0.37, "learning_rate": 1.9240965790414312e-05, "loss": 2.4892, "step": 950 }, { "epoch": 0.38, "learning_rate": 1.923305892589025e-05, "loss": 2.4059, "step": 955 }, { "epoch": 0.38, "learning_rate": 1.9225112733375057e-05, "loss": 2.3975, "step": 960 }, { "epoch": 0.38, "learning_rate": 1.9217127246715344e-05, "loss": 2.4867, "step": 965 }, { "epoch": 0.38, "learning_rate": 1.92091024999251e-05, "loss": 2.4757, "step": 970 }, { "epoch": 0.38, "learning_rate": 1.9201038527185546e-05, "loss": 2.4164, "step": 975 }, { "epoch": 0.39, "learning_rate": 1.919293536284497e-05, "loss": 2.4024, "step": 980 }, { "epoch": 0.39, "learning_rate": 1.9184793041418607e-05, "loss": 2.4706, "step": 985 }, { "epoch": 0.39, "learning_rate": 1.917661159758848e-05, "loss": 2.465, "step": 990 }, { "epoch": 0.39, "learning_rate": 1.9168391066203248e-05, "loss": 2.555, "step": 995 }, { "epoch": 0.39, "learning_rate": 1.9160131482278068e-05, "loss": 2.4465, "step": 1000 }, { "epoch": 0.4, "learning_rate": 1.9151832880994438e-05, "loss": 2.4014, "step": 1005 }, { "epoch": 0.4, "learning_rate": 1.914349529770005e-05, "loss": 2.4358, "step": 1010 }, { "epoch": 0.4, "learning_rate": 1.9135118767908637e-05, "loss": 2.402, "step": 1015 }, { "epoch": 0.4, "learning_rate": 1.9126703327299822e-05, "loss": 2.3952, "step": 1020 }, { "epoch": 0.4, "learning_rate": 1.9118249011718975e-05, "loss": 2.4051, "step": 1025 }, { "epoch": 0.41, "learning_rate": 1.9109755857177053e-05, "loss": 2.4198, "step": 1030 }, { "epoch": 0.41, "learning_rate": 1.910122389985043e-05, "loss": 2.4414, "step": 1035 }, { "epoch": 0.41, "learning_rate": 1.909265317608078e-05, "loss": 2.4688, "step": 1040 }, { "epoch": 0.41, "learning_rate": 1.9084043722374895e-05, "loss": 2.3766, "step": 1045 }, { "epoch": 0.41, "learning_rate": 1.907539557540453e-05, "loss": 2.3966, "step": 1050 }, { "epoch": 0.42, "learning_rate": 1.9066708772006262e-05, "loss": 2.5027, "step": 1055 }, { "epoch": 0.42, "learning_rate": 1.9057983349181316e-05, "loss": 2.4904, "step": 1060 }, { "epoch": 0.42, "learning_rate": 1.904921934409542e-05, "loss": 2.4296, "step": 1065 }, { "epoch": 0.42, "learning_rate": 1.9040416794078648e-05, "loss": 2.4294, "step": 1070 }, { "epoch": 0.42, "learning_rate": 1.903157573662524e-05, "loss": 2.4442, "step": 1075 }, { "epoch": 0.43, "learning_rate": 1.902269620939347e-05, "loss": 2.382, "step": 1080 }, { "epoch": 0.43, "learning_rate": 1.901377825020547e-05, "loss": 2.5004, "step": 1085 }, { "epoch": 0.43, "learning_rate": 1.9004821897047067e-05, "loss": 2.3837, "step": 1090 }, { "epoch": 0.43, "learning_rate": 1.899582718806764e-05, "loss": 2.3654, "step": 1095 }, { "epoch": 0.43, "learning_rate": 1.8986794161579927e-05, "loss": 2.4624, "step": 1100 }, { "epoch": 0.44, "learning_rate": 1.8977722856059886e-05, "loss": 2.4875, "step": 1105 }, { "epoch": 0.44, "learning_rate": 1.8968613310146527e-05, "loss": 2.4575, "step": 1110 }, { "epoch": 0.44, "learning_rate": 1.8959465562641738e-05, "loss": 2.337, "step": 1115 }, { "epoch": 0.44, "learning_rate": 1.895027965251013e-05, "loss": 2.4485, "step": 1120 }, { "epoch": 0.44, "learning_rate": 1.8941055618878864e-05, "loss": 2.5231, "step": 1125 }, { "epoch": 0.45, "learning_rate": 1.8931793501037483e-05, "loss": 2.3996, "step": 1130 }, { "epoch": 0.45, "learning_rate": 1.8922493338437765e-05, "loss": 2.4486, "step": 1135 }, { "epoch": 0.45, "learning_rate": 1.8913155170693514e-05, "loss": 2.5302, "step": 1140 }, { "epoch": 0.45, "learning_rate": 1.8903779037580442e-05, "loss": 2.4209, "step": 1145 }, { "epoch": 0.45, "learning_rate": 1.8894364979035956e-05, "loss": 2.3894, "step": 1150 }, { "epoch": 0.46, "learning_rate": 1.8884913035159008e-05, "loss": 2.3971, "step": 1155 }, { "epoch": 0.46, "learning_rate": 1.8875423246209928e-05, "loss": 2.3819, "step": 1160 }, { "epoch": 0.46, "learning_rate": 1.8865895652610244e-05, "loss": 2.4543, "step": 1165 }, { "epoch": 0.46, "learning_rate": 1.8856330294942506e-05, "loss": 2.4273, "step": 1170 }, { "epoch": 0.46, "learning_rate": 1.8846727213950125e-05, "loss": 2.4266, "step": 1175 }, { "epoch": 0.47, "learning_rate": 1.8837086450537195e-05, "loss": 2.4436, "step": 1180 }, { "epoch": 0.47, "learning_rate": 1.8827408045768308e-05, "loss": 2.5767, "step": 1185 }, { "epoch": 0.47, "learning_rate": 1.8817692040868404e-05, "loss": 2.3555, "step": 1190 }, { "epoch": 0.47, "learning_rate": 1.880793847722256e-05, "loss": 2.4157, "step": 1195 }, { "epoch": 0.47, "learning_rate": 1.8798147396375855e-05, "loss": 2.4256, "step": 1200 }, { "epoch": 0.47, "learning_rate": 1.8788318840033155e-05, "loss": 2.529, "step": 1205 }, { "epoch": 0.48, "learning_rate": 1.8778452850058957e-05, "loss": 2.4668, "step": 1210 }, { "epoch": 0.48, "learning_rate": 1.8768549468477212e-05, "loss": 2.4191, "step": 1215 }, { "epoch": 0.48, "learning_rate": 1.875860873747113e-05, "loss": 2.3992, "step": 1220 }, { "epoch": 0.48, "learning_rate": 1.8748630699383016e-05, "loss": 2.4485, "step": 1225 }, { "epoch": 0.48, "learning_rate": 1.8738615396714083e-05, "loss": 2.3803, "step": 1230 }, { "epoch": 0.49, "learning_rate": 1.8728562872124264e-05, "loss": 2.428, "step": 1235 }, { "epoch": 0.49, "learning_rate": 1.8718473168432054e-05, "loss": 2.4404, "step": 1240 }, { "epoch": 0.49, "learning_rate": 1.8708346328614297e-05, "loss": 2.5451, "step": 1245 }, { "epoch": 0.49, "learning_rate": 1.869818239580602e-05, "loss": 2.4906, "step": 1250 }, { "epoch": 0.49, "learning_rate": 1.8687981413300246e-05, "loss": 2.4486, "step": 1255 }, { "epoch": 0.5, "learning_rate": 1.8677743424547824e-05, "loss": 2.4106, "step": 1260 }, { "epoch": 0.5, "learning_rate": 1.8667468473157212e-05, "loss": 2.3316, "step": 1265 }, { "epoch": 0.5, "learning_rate": 1.865715660289432e-05, "loss": 2.4518, "step": 1270 }, { "epoch": 0.5, "learning_rate": 1.8646807857682308e-05, "loss": 2.4549, "step": 1275 }, { "epoch": 0.5, "learning_rate": 1.8636422281601406e-05, "loss": 2.4411, "step": 1280 }, { "epoch": 0.51, "learning_rate": 1.8625999918888726e-05, "loss": 2.509, "step": 1285 }, { "epoch": 0.51, "learning_rate": 1.8615540813938063e-05, "loss": 2.4455, "step": 1290 }, { "epoch": 0.51, "learning_rate": 1.860504501129973e-05, "loss": 2.5486, "step": 1295 }, { "epoch": 0.51, "learning_rate": 1.8594512555680338e-05, "loss": 2.4134, "step": 1300 }, { "epoch": 0.51, "learning_rate": 1.8583943491942635e-05, "loss": 2.483, "step": 1305 }, { "epoch": 0.52, "learning_rate": 1.8573337865105285e-05, "loss": 2.4329, "step": 1310 }, { "epoch": 0.52, "learning_rate": 1.8562695720342704e-05, "loss": 2.3614, "step": 1315 }, { "epoch": 0.52, "learning_rate": 1.8552017102984842e-05, "loss": 2.4119, "step": 1320 }, { "epoch": 0.52, "learning_rate": 1.854130205851702e-05, "loss": 2.4654, "step": 1325 }, { "epoch": 0.52, "learning_rate": 1.853055063257971e-05, "loss": 2.3784, "step": 1330 }, { "epoch": 0.53, "learning_rate": 1.8519762870968344e-05, "loss": 2.4059, "step": 1335 }, { "epoch": 0.53, "learning_rate": 1.8508938819633138e-05, "loss": 2.3768, "step": 1340 }, { "epoch": 0.53, "learning_rate": 1.8498078524678874e-05, "loss": 2.3345, "step": 1345 }, { "epoch": 0.53, "learning_rate": 1.8487182032364714e-05, "loss": 2.4273, "step": 1350 }, { "epoch": 0.53, "learning_rate": 1.8476249389104007e-05, "loss": 2.3947, "step": 1355 }, { "epoch": 0.54, "learning_rate": 1.8465280641464085e-05, "loss": 2.4527, "step": 1360 }, { "epoch": 0.54, "learning_rate": 1.8454275836166052e-05, "loss": 2.3342, "step": 1365 }, { "epoch": 0.54, "learning_rate": 1.8443235020084624e-05, "loss": 2.4361, "step": 1370 }, { "epoch": 0.54, "learning_rate": 1.843215824024788e-05, "loss": 2.502, "step": 1375 }, { "epoch": 0.54, "learning_rate": 1.84210455438371e-05, "loss": 2.4718, "step": 1380 }, { "epoch": 0.55, "learning_rate": 1.8409896978186547e-05, "loss": 2.428, "step": 1385 }, { "epoch": 0.55, "learning_rate": 1.8398712590783258e-05, "loss": 2.4213, "step": 1390 }, { "epoch": 0.55, "learning_rate": 1.838749242926687e-05, "loss": 2.3812, "step": 1395 }, { "epoch": 0.55, "learning_rate": 1.8376236541429386e-05, "loss": 2.4458, "step": 1400 }, { "epoch": 0.55, "learning_rate": 1.836494497521499e-05, "loss": 2.4359, "step": 1405 }, { "epoch": 0.56, "learning_rate": 1.835361777871983e-05, "loss": 2.3377, "step": 1410 }, { "epoch": 0.56, "learning_rate": 1.8342255000191832e-05, "loss": 2.414, "step": 1415 }, { "epoch": 0.56, "learning_rate": 1.8330856688030474e-05, "loss": 2.4517, "step": 1420 }, { "epoch": 0.56, "learning_rate": 1.8319422890786586e-05, "loss": 2.375, "step": 1425 }, { "epoch": 0.56, "learning_rate": 1.830795365716216e-05, "loss": 2.3515, "step": 1430 }, { "epoch": 0.57, "learning_rate": 1.829644903601011e-05, "loss": 2.4428, "step": 1435 }, { "epoch": 0.57, "learning_rate": 1.8284909076334094e-05, "loss": 2.4186, "step": 1440 }, { "epoch": 0.57, "learning_rate": 1.8273333827288294e-05, "loss": 2.4604, "step": 1445 }, { "epoch": 0.57, "learning_rate": 1.8261723338177204e-05, "loss": 2.4712, "step": 1450 }, { "epoch": 0.57, "learning_rate": 1.825007765845542e-05, "loss": 2.4355, "step": 1455 }, { "epoch": 0.58, "learning_rate": 1.823839683772743e-05, "loss": 2.4399, "step": 1460 }, { "epoch": 0.58, "learning_rate": 1.822668092574741e-05, "loss": 2.4747, "step": 1465 }, { "epoch": 0.58, "learning_rate": 1.8214929972419004e-05, "loss": 2.3503, "step": 1470 }, { "epoch": 0.58, "learning_rate": 1.820314402779511e-05, "loss": 2.42, "step": 1475 }, { "epoch": 0.58, "learning_rate": 1.819132314207768e-05, "loss": 2.3705, "step": 1480 }, { "epoch": 0.59, "learning_rate": 1.8179467365617486e-05, "loss": 2.3828, "step": 1485 }, { "epoch": 0.59, "learning_rate": 1.816757674891392e-05, "loss": 2.4663, "step": 1490 }, { "epoch": 0.59, "learning_rate": 1.8155651342614784e-05, "loss": 2.4991, "step": 1495 }, { "epoch": 0.59, "learning_rate": 1.8143691197516048e-05, "loss": 2.3769, "step": 1500 }, { "epoch": 0.59, "learning_rate": 1.813169636456167e-05, "loss": 2.4001, "step": 1505 }, { "epoch": 0.6, "learning_rate": 1.811966689484334e-05, "loss": 2.4054, "step": 1510 }, { "epoch": 0.6, "learning_rate": 1.8107602839600306e-05, "loss": 2.4524, "step": 1515 }, { "epoch": 0.6, "learning_rate": 1.8095504250219103e-05, "loss": 2.423, "step": 1520 }, { "epoch": 0.6, "learning_rate": 1.80833711782334e-05, "loss": 2.5138, "step": 1525 }, { "epoch": 0.6, "learning_rate": 1.8071203675323708e-05, "loss": 2.3669, "step": 1530 }, { "epoch": 0.61, "learning_rate": 1.8059001793317215e-05, "loss": 2.3542, "step": 1535 }, { "epoch": 0.61, "learning_rate": 1.8046765584187544e-05, "loss": 2.3358, "step": 1540 }, { "epoch": 0.61, "learning_rate": 1.803449510005453e-05, "loss": 2.3739, "step": 1545 }, { "epoch": 0.61, "learning_rate": 1.8022190393184008e-05, "loss": 2.4784, "step": 1550 }, { "epoch": 0.61, "learning_rate": 1.8009851515987573e-05, "loss": 2.3919, "step": 1555 }, { "epoch": 0.61, "learning_rate": 1.7997478521022378e-05, "loss": 2.4998, "step": 1560 }, { "epoch": 0.62, "learning_rate": 1.7985071460990894e-05, "loss": 2.4079, "step": 1565 }, { "epoch": 0.62, "learning_rate": 1.7972630388740696e-05, "loss": 2.3714, "step": 1570 }, { "epoch": 0.62, "learning_rate": 1.7960155357264224e-05, "loss": 2.4316, "step": 1575 }, { "epoch": 0.62, "learning_rate": 1.7947646419698578e-05, "loss": 2.3991, "step": 1580 }, { "epoch": 0.62, "learning_rate": 1.793510362932527e-05, "loss": 2.4027, "step": 1585 }, { "epoch": 0.63, "learning_rate": 1.7922527039570022e-05, "loss": 2.443, "step": 1590 }, { "epoch": 0.63, "learning_rate": 1.7909916704002506e-05, "loss": 2.3988, "step": 1595 }, { "epoch": 0.63, "learning_rate": 1.7897272676336143e-05, "loss": 2.5286, "step": 1600 }, { "epoch": 0.63, "learning_rate": 1.788459501042786e-05, "loss": 2.4104, "step": 1605 }, { "epoch": 0.63, "learning_rate": 1.7871883760277872e-05, "loss": 2.3934, "step": 1610 }, { "epoch": 0.64, "learning_rate": 1.785913898002944e-05, "loss": 2.4028, "step": 1615 }, { "epoch": 0.64, "learning_rate": 1.784636072396865e-05, "loss": 2.3815, "step": 1620 }, { "epoch": 0.64, "learning_rate": 1.783354904652417e-05, "loss": 2.3652, "step": 1625 }, { "epoch": 0.64, "learning_rate": 1.7820704002267034e-05, "loss": 2.4487, "step": 1630 }, { "epoch": 0.64, "learning_rate": 1.7807825645910396e-05, "loss": 2.5422, "step": 1635 }, { "epoch": 0.65, "learning_rate": 1.77949140323093e-05, "loss": 2.3982, "step": 1640 }, { "epoch": 0.65, "learning_rate": 1.7781969216460458e-05, "loss": 2.3603, "step": 1645 }, { "epoch": 0.65, "learning_rate": 1.7768991253501993e-05, "loss": 2.3725, "step": 1650 }, { "epoch": 0.65, "learning_rate": 1.775598019871323e-05, "loss": 2.4896, "step": 1655 }, { "epoch": 0.65, "learning_rate": 1.7742936107514442e-05, "loss": 2.4789, "step": 1660 }, { "epoch": 0.66, "learning_rate": 1.7729859035466617e-05, "loss": 2.4811, "step": 1665 }, { "epoch": 0.66, "learning_rate": 1.7716749038271225e-05, "loss": 2.4251, "step": 1670 }, { "epoch": 0.66, "learning_rate": 1.770360617176999e-05, "loss": 2.4285, "step": 1675 }, { "epoch": 0.66, "learning_rate": 1.7690430491944625e-05, "loss": 2.4722, "step": 1680 }, { "epoch": 0.66, "learning_rate": 1.7677222054916627e-05, "loss": 2.384, "step": 1685 }, { "epoch": 0.67, "learning_rate": 1.7663980916947007e-05, "loss": 2.3686, "step": 1690 }, { "epoch": 0.67, "learning_rate": 1.7650707134436075e-05, "loss": 2.4773, "step": 1695 }, { "epoch": 0.67, "learning_rate": 1.7637400763923187e-05, "loss": 2.3632, "step": 1700 } ], "logging_steps": 5, "max_steps": 7611, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 2.290424685330432e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }