{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 1040, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009615384615384616, "grad_norm": 4.21875, "learning_rate": 1.9230769230769234e-06, "loss": 3.0033, "step": 1 }, { "epoch": 0.04807692307692308, "grad_norm": 4.0, "learning_rate": 9.615384615384616e-06, "loss": 3.0098, "step": 5 }, { "epoch": 0.09615384615384616, "grad_norm": 3.15625, "learning_rate": 1.923076923076923e-05, "loss": 2.9448, "step": 10 }, { "epoch": 0.14423076923076922, "grad_norm": 4.15625, "learning_rate": 2.8846153846153845e-05, "loss": 2.7074, "step": 15 }, { "epoch": 0.19230769230769232, "grad_norm": 1.8984375, "learning_rate": 3.846153846153846e-05, "loss": 2.4264, "step": 20 }, { "epoch": 0.2403846153846154, "grad_norm": 3.75, "learning_rate": 4.8076923076923084e-05, "loss": 2.1972, "step": 25 }, { "epoch": 0.28846153846153844, "grad_norm": 1.3671875, "learning_rate": 5.769230769230769e-05, "loss": 1.9962, "step": 30 }, { "epoch": 0.33653846153846156, "grad_norm": 5.53125, "learning_rate": 6.730769230769232e-05, "loss": 1.8086, "step": 35 }, { "epoch": 0.38461538461538464, "grad_norm": 0.65625, "learning_rate": 7.692307692307693e-05, "loss": 1.6425, "step": 40 }, { "epoch": 0.4326923076923077, "grad_norm": 0.60546875, "learning_rate": 8.653846153846155e-05, "loss": 1.5058, "step": 45 }, { "epoch": 0.4807692307692308, "grad_norm": 0.5625, "learning_rate": 9.615384615384617e-05, "loss": 1.401, "step": 50 }, { "epoch": 0.5288461538461539, "grad_norm": 0.59765625, "learning_rate": 0.00010576923076923077, "loss": 1.3223, "step": 55 }, { "epoch": 0.5769230769230769, "grad_norm": 0.2265625, "learning_rate": 0.00011538461538461538, "loss": 1.2631, "step": 60 }, { "epoch": 0.625, "grad_norm": 0.7578125, "learning_rate": 0.000125, "loss": 1.2263, "step": 65 }, { "epoch": 0.6730769230769231, "grad_norm": 0.66796875, "learning_rate": 0.00013461538461538464, "loss": 1.1954, "step": 70 }, { "epoch": 0.7211538461538461, "grad_norm": 0.57421875, "learning_rate": 0.00014423076923076924, "loss": 1.1671, "step": 75 }, { "epoch": 0.7692307692307693, "grad_norm": 0.1904296875, "learning_rate": 0.00015384615384615385, "loss": 1.1505, "step": 80 }, { "epoch": 0.8173076923076923, "grad_norm": 0.88671875, "learning_rate": 0.00016346153846153846, "loss": 1.1211, "step": 85 }, { "epoch": 0.8653846153846154, "grad_norm": 0.26953125, "learning_rate": 0.0001730769230769231, "loss": 1.1226, "step": 90 }, { "epoch": 0.9134615384615384, "grad_norm": 0.330078125, "learning_rate": 0.0001826923076923077, "loss": 1.114, "step": 95 }, { "epoch": 0.9615384615384616, "grad_norm": 0.28515625, "learning_rate": 0.00019230769230769233, "loss": 1.0978, "step": 100 }, { "epoch": 1.0, "eval_loss": 2.483147621154785, "eval_runtime": 0.4801, "eval_samples_per_second": 20.827, "eval_steps_per_second": 2.083, "step": 104 }, { "epoch": 1.0096153846153846, "grad_norm": 0.640625, "learning_rate": 0.0001999994367286727, "loss": 1.0851, "step": 105 }, { "epoch": 1.0576923076923077, "grad_norm": 0.263671875, "learning_rate": 0.00019997972289848503, "loss": 1.0778, "step": 110 }, { "epoch": 1.1057692307692308, "grad_norm": 0.91015625, "learning_rate": 0.00019993185184710165, "loss": 1.0708, "step": 115 }, { "epoch": 1.1538461538461537, "grad_norm": 1.484375, "learning_rate": 0.00019985583705641418, "loss": 1.0615, "step": 120 }, { "epoch": 1.2019230769230769, "grad_norm": 0.625, "learning_rate": 0.00019975169993441627, "loss": 1.0549, "step": 125 }, { "epoch": 1.25, "grad_norm": 0.416015625, "learning_rate": 0.00019961946980917456, "loss": 1.039, "step": 130 }, { "epoch": 1.2980769230769231, "grad_norm": 0.3359375, "learning_rate": 0.0001994591839205691, "loss": 1.0429, "step": 135 }, { "epoch": 1.3461538461538463, "grad_norm": 0.478515625, "learning_rate": 0.0001992708874098054, "loss": 1.0433, "step": 140 }, { "epoch": 1.3942307692307692, "grad_norm": 0.40234375, "learning_rate": 0.00019905463330670143, "loss": 1.0285, "step": 145 }, { "epoch": 1.4423076923076923, "grad_norm": 0.37109375, "learning_rate": 0.0001988104825147528, "loss": 1.0263, "step": 150 }, { "epoch": 1.4903846153846154, "grad_norm": 0.3671875, "learning_rate": 0.0001985385037939806, "loss": 1.0219, "step": 155 }, { "epoch": 1.5384615384615383, "grad_norm": 0.3125, "learning_rate": 0.00019823877374156647, "loss": 1.0197, "step": 160 }, { "epoch": 1.5865384615384617, "grad_norm": 0.291015625, "learning_rate": 0.00019791137677028082, "loss": 1.0135, "step": 165 }, { "epoch": 1.6346153846153846, "grad_norm": 0.298828125, "learning_rate": 0.00019755640508470942, "loss": 1.0121, "step": 170 }, { "epoch": 1.6826923076923077, "grad_norm": 0.419921875, "learning_rate": 0.00019717395865528602, "loss": 1.0105, "step": 175 }, { "epoch": 1.7307692307692308, "grad_norm": 0.55078125, "learning_rate": 0.00019676414519013781, "loss": 1.0087, "step": 180 }, { "epoch": 1.7788461538461537, "grad_norm": 0.79296875, "learning_rate": 0.00019632708010475165, "loss": 1.0026, "step": 185 }, { "epoch": 1.8269230769230769, "grad_norm": 0.7265625, "learning_rate": 0.00019586288648946947, "loss": 1.0078, "step": 190 }, { "epoch": 1.875, "grad_norm": 0.21484375, "learning_rate": 0.0001953716950748227, "loss": 1.0, "step": 195 }, { "epoch": 1.9230769230769231, "grad_norm": 0.30078125, "learning_rate": 0.00019485364419471454, "loss": 0.9957, "step": 200 }, { "epoch": 1.9711538461538463, "grad_norm": 0.59375, "learning_rate": 0.0001943088797474612, "loss": 0.9985, "step": 205 }, { "epoch": 2.0, "eval_loss": 2.4666388034820557, "eval_runtime": 0.4867, "eval_samples_per_second": 20.548, "eval_steps_per_second": 2.055, "step": 208 }, { "epoch": 2.019230769230769, "grad_norm": 0.38671875, "learning_rate": 0.00019373755515470254, "loss": 0.9897, "step": 210 }, { "epoch": 2.0673076923076925, "grad_norm": 0.453125, "learning_rate": 0.00019313983131819407, "loss": 0.9717, "step": 215 }, { "epoch": 2.1153846153846154, "grad_norm": 0.279296875, "learning_rate": 0.00019251587657449236, "loss": 0.9697, "step": 220 }, { "epoch": 2.1634615384615383, "grad_norm": 0.33984375, "learning_rate": 0.0001918658666475465, "loss": 0.9697, "step": 225 }, { "epoch": 2.2115384615384617, "grad_norm": 0.5625, "learning_rate": 0.00019118998459920902, "loss": 0.9735, "step": 230 }, { "epoch": 2.2596153846153846, "grad_norm": 0.28515625, "learning_rate": 0.0001904884207776804, "loss": 0.9675, "step": 235 }, { "epoch": 2.3076923076923075, "grad_norm": 0.41015625, "learning_rate": 0.0001897613727639014, "loss": 0.9654, "step": 240 }, { "epoch": 2.355769230769231, "grad_norm": 0.25, "learning_rate": 0.00018900904531590846, "loss": 0.9655, "step": 245 }, { "epoch": 2.4038461538461537, "grad_norm": 0.341796875, "learning_rate": 0.0001882316503111678, "loss": 0.9653, "step": 250 }, { "epoch": 2.451923076923077, "grad_norm": 0.81640625, "learning_rate": 0.00018742940668690464, "loss": 0.9729, "step": 255 }, { "epoch": 2.5, "grad_norm": 0.33203125, "learning_rate": 0.00018660254037844388, "loss": 0.9619, "step": 260 }, { "epoch": 2.5480769230769234, "grad_norm": 0.30859375, "learning_rate": 0.00018575128425558023, "loss": 0.9702, "step": 265 }, { "epoch": 2.5961538461538463, "grad_norm": 0.48828125, "learning_rate": 0.00018487587805699526, "loss": 0.964, "step": 270 }, { "epoch": 2.644230769230769, "grad_norm": 0.546875, "learning_rate": 0.0001839765683227398, "loss": 0.9603, "step": 275 }, { "epoch": 2.6923076923076925, "grad_norm": 0.455078125, "learning_rate": 0.00018305360832480117, "loss": 0.9593, "step": 280 }, { "epoch": 2.7403846153846154, "grad_norm": 0.2890625, "learning_rate": 0.00018210725799577439, "loss": 0.963, "step": 285 }, { "epoch": 2.7884615384615383, "grad_norm": 0.76953125, "learning_rate": 0.00018113778385565733, "loss": 0.9626, "step": 290 }, { "epoch": 2.8365384615384617, "grad_norm": 1.515625, "learning_rate": 0.00018014545893679115, "loss": 0.9655, "step": 295 }, { "epoch": 2.8846153846153846, "grad_norm": 0.44921875, "learning_rate": 0.0001791305627069662, "loss": 0.9599, "step": 300 }, { "epoch": 2.9326923076923075, "grad_norm": 0.310546875, "learning_rate": 0.00017809338099071577, "loss": 0.9557, "step": 305 }, { "epoch": 2.980769230769231, "grad_norm": 0.326171875, "learning_rate": 0.00017703420588881946, "loss": 0.9543, "step": 310 }, { "epoch": 3.0, "eval_loss": 2.4561338424682617, "eval_runtime": 0.4747, "eval_samples_per_second": 21.065, "eval_steps_per_second": 2.107, "step": 312 }, { "epoch": 3.0288461538461537, "grad_norm": 0.2451171875, "learning_rate": 0.0001759533356960391, "loss": 0.946, "step": 315 }, { "epoch": 3.076923076923077, "grad_norm": 0.2255859375, "learning_rate": 0.00017485107481711012, "loss": 0.9384, "step": 320 }, { "epoch": 3.125, "grad_norm": 0.2470703125, "learning_rate": 0.0001737277336810124, "loss": 0.9299, "step": 325 }, { "epoch": 3.173076923076923, "grad_norm": 0.2255859375, "learning_rate": 0.00017258362865354426, "loss": 0.9322, "step": 330 }, { "epoch": 3.2211538461538463, "grad_norm": 0.2890625, "learning_rate": 0.00017141908194822446, "loss": 0.9328, "step": 335 }, { "epoch": 3.269230769230769, "grad_norm": 0.28125, "learning_rate": 0.00017023442153554777, "loss": 0.9307, "step": 340 }, { "epoch": 3.3173076923076925, "grad_norm": 0.2177734375, "learning_rate": 0.00016902998105061844, "loss": 0.9273, "step": 345 }, { "epoch": 3.3653846153846154, "grad_norm": 0.265625, "learning_rate": 0.0001678060996991891, "loss": 0.9324, "step": 350 }, { "epoch": 3.4134615384615383, "grad_norm": 0.234375, "learning_rate": 0.00016656312216213034, "loss": 0.9233, "step": 355 }, { "epoch": 3.4615384615384617, "grad_norm": 0.25390625, "learning_rate": 0.0001653013984983585, "loss": 0.925, "step": 360 }, { "epoch": 3.5096153846153846, "grad_norm": 0.34765625, "learning_rate": 0.00016402128404624882, "loss": 0.9351, "step": 365 }, { "epoch": 3.5576923076923075, "grad_norm": 0.349609375, "learning_rate": 0.00016272313932356162, "loss": 0.9266, "step": 370 }, { "epoch": 3.605769230769231, "grad_norm": 0.251953125, "learning_rate": 0.0001614073299259101, "loss": 0.9259, "step": 375 }, { "epoch": 3.6538461538461537, "grad_norm": 0.2109375, "learning_rate": 0.0001600742264237979, "loss": 0.9322, "step": 380 }, { "epoch": 3.7019230769230766, "grad_norm": 0.296875, "learning_rate": 0.0001587242042582554, "loss": 0.9327, "step": 385 }, { "epoch": 3.75, "grad_norm": 0.265625, "learning_rate": 0.0001573576436351046, "loss": 0.9308, "step": 390 }, { "epoch": 3.7980769230769234, "grad_norm": 0.3984375, "learning_rate": 0.00015597492941788222, "loss": 0.9265, "step": 395 }, { "epoch": 3.8461538461538463, "grad_norm": 0.41015625, "learning_rate": 0.00015457645101945046, "loss": 0.9273, "step": 400 }, { "epoch": 3.894230769230769, "grad_norm": 0.25, "learning_rate": 0.00015316260229232727, "loss": 0.9285, "step": 405 }, { "epoch": 3.9423076923076925, "grad_norm": 0.2138671875, "learning_rate": 0.00015173378141776568, "loss": 0.9243, "step": 410 }, { "epoch": 3.9903846153846154, "grad_norm": 0.2236328125, "learning_rate": 0.00015029039079361448, "loss": 0.92, "step": 415 }, { "epoch": 4.0, "eval_loss": 2.4799063205718994, "eval_runtime": 0.4817, "eval_samples_per_second": 20.76, "eval_steps_per_second": 2.076, "step": 416 }, { "epoch": 4.038461538461538, "grad_norm": 0.310546875, "learning_rate": 0.00014883283692099112, "loss": 0.9122, "step": 420 }, { "epoch": 4.086538461538462, "grad_norm": 0.279296875, "learning_rate": 0.00014736153028979893, "loss": 0.9007, "step": 425 }, { "epoch": 4.134615384615385, "grad_norm": 0.25390625, "learning_rate": 0.00014587688526312143, "loss": 0.9098, "step": 430 }, { "epoch": 4.1826923076923075, "grad_norm": 0.26171875, "learning_rate": 0.00014437931996052518, "loss": 0.8987, "step": 435 }, { "epoch": 4.230769230769231, "grad_norm": 0.255859375, "learning_rate": 0.00014286925614030542, "loss": 0.9044, "step": 440 }, { "epoch": 4.278846153846154, "grad_norm": 0.27734375, "learning_rate": 0.00014134711908070631, "loss": 0.8988, "step": 445 }, { "epoch": 4.326923076923077, "grad_norm": 0.298828125, "learning_rate": 0.0001398133374601501, "loss": 0.9037, "step": 450 }, { "epoch": 4.375, "grad_norm": 0.248046875, "learning_rate": 0.000138268343236509, "loss": 0.9034, "step": 455 }, { "epoch": 4.423076923076923, "grad_norm": 0.25, "learning_rate": 0.00013671257152545277, "loss": 0.8994, "step": 460 }, { "epoch": 4.471153846153846, "grad_norm": 0.265625, "learning_rate": 0.00013514646047790775, "loss": 0.9013, "step": 465 }, { "epoch": 4.519230769230769, "grad_norm": 0.23828125, "learning_rate": 0.0001335704511566605, "loss": 0.9069, "step": 470 }, { "epoch": 4.5673076923076925, "grad_norm": 0.28125, "learning_rate": 0.00013198498741214166, "loss": 0.9028, "step": 475 }, { "epoch": 4.615384615384615, "grad_norm": 0.271484375, "learning_rate": 0.0001303905157574247, "loss": 0.9059, "step": 480 }, { "epoch": 4.663461538461538, "grad_norm": 0.265625, "learning_rate": 0.00012878748524247462, "loss": 0.9004, "step": 485 }, { "epoch": 4.711538461538462, "grad_norm": 0.333984375, "learning_rate": 0.00012717634732768243, "loss": 0.905, "step": 490 }, { "epoch": 4.759615384615385, "grad_norm": 0.2734375, "learning_rate": 0.0001255575557567207, "loss": 0.9035, "step": 495 }, { "epoch": 4.8076923076923075, "grad_norm": 0.26953125, "learning_rate": 0.0001239315664287558, "loss": 0.8974, "step": 500 }, { "epoch": 4.855769230769231, "grad_norm": 0.271484375, "learning_rate": 0.00012229883727005365, "loss": 0.9034, "step": 505 }, { "epoch": 4.903846153846154, "grad_norm": 0.2431640625, "learning_rate": 0.00012065982810501404, "loss": 0.9056, "step": 510 }, { "epoch": 4.951923076923077, "grad_norm": 0.25390625, "learning_rate": 0.00011901500052667068, "loss": 0.902, "step": 515 }, { "epoch": 5.0, "grad_norm": 0.267578125, "learning_rate": 0.00011736481776669306, "loss": 0.9016, "step": 520 }, { "epoch": 5.0, "eval_loss": 2.4989540576934814, "eval_runtime": 0.4791, "eval_samples_per_second": 20.875, "eval_steps_per_second": 2.087, "step": 520 }, { "epoch": 5.048076923076923, "grad_norm": 0.25, "learning_rate": 0.00011570974456492678, "loss": 0.8816, "step": 525 }, { "epoch": 5.096153846153846, "grad_norm": 0.2578125, "learning_rate": 0.00011405024703850929, "loss": 0.8777, "step": 530 }, { "epoch": 5.144230769230769, "grad_norm": 0.2265625, "learning_rate": 0.00011238679255059752, "loss": 0.8809, "step": 535 }, { "epoch": 5.1923076923076925, "grad_norm": 0.2421875, "learning_rate": 0.00011071984957874479, "loss": 0.882, "step": 540 }, { "epoch": 5.240384615384615, "grad_norm": 0.236328125, "learning_rate": 0.0001090498875829638, "loss": 0.8787, "step": 545 }, { "epoch": 5.288461538461538, "grad_norm": 0.2333984375, "learning_rate": 0.00010737737687351284, "loss": 0.8806, "step": 550 }, { "epoch": 5.336538461538462, "grad_norm": 0.2275390625, "learning_rate": 0.00010570278847844275, "loss": 0.8859, "step": 555 }, { "epoch": 5.384615384615385, "grad_norm": 0.2578125, "learning_rate": 0.00010402659401094152, "loss": 0.8832, "step": 560 }, { "epoch": 5.4326923076923075, "grad_norm": 0.267578125, "learning_rate": 0.00010234926553651422, "loss": 0.8803, "step": 565 }, { "epoch": 5.480769230769231, "grad_norm": 0.259765625, "learning_rate": 0.00010067127544003563, "loss": 0.882, "step": 570 }, { "epoch": 5.528846153846154, "grad_norm": 0.2275390625, "learning_rate": 9.899309629271246e-05, "loss": 0.8797, "step": 575 }, { "epoch": 5.576923076923077, "grad_norm": 0.2734375, "learning_rate": 9.73152007189939e-05, "loss": 0.8818, "step": 580 }, { "epoch": 5.625, "grad_norm": 0.3828125, "learning_rate": 9.563806126346642e-05, "loss": 0.8864, "step": 585 }, { "epoch": 5.673076923076923, "grad_norm": 0.2451171875, "learning_rate": 9.396215025777139e-05, "loss": 0.8876, "step": 590 }, { "epoch": 5.721153846153846, "grad_norm": 0.279296875, "learning_rate": 9.22879396875828e-05, "loss": 0.8843, "step": 595 }, { "epoch": 5.769230769230769, "grad_norm": 0.294921875, "learning_rate": 9.061590105968208e-05, "loss": 0.8832, "step": 600 }, { "epoch": 5.8173076923076925, "grad_norm": 0.359375, "learning_rate": 8.894650526916803e-05, "loss": 0.884, "step": 605 }, { "epoch": 5.865384615384615, "grad_norm": 0.388671875, "learning_rate": 8.728022246683894e-05, "loss": 0.8814, "step": 610 }, { "epoch": 5.913461538461538, "grad_norm": 0.33203125, "learning_rate": 8.561752192678443e-05, "loss": 0.8826, "step": 615 }, { "epoch": 5.961538461538462, "grad_norm": 0.328125, "learning_rate": 8.395887191422397e-05, "loss": 0.8871, "step": 620 }, { "epoch": 6.0, "eval_loss": 2.5249791145324707, "eval_runtime": 0.4816, "eval_samples_per_second": 20.766, "eval_steps_per_second": 2.077, "step": 624 }, { "epoch": 6.009615384615385, "grad_norm": 0.234375, "learning_rate": 8.23047395536298e-05, "loss": 0.8778, "step": 625 }, { "epoch": 6.0576923076923075, "grad_norm": 0.2373046875, "learning_rate": 8.065559069717088e-05, "loss": 0.8638, "step": 630 }, { "epoch": 6.105769230769231, "grad_norm": 0.248046875, "learning_rate": 7.901188979351526e-05, "loss": 0.8612, "step": 635 }, { "epoch": 6.153846153846154, "grad_norm": 0.2431640625, "learning_rate": 7.73740997570278e-05, "loss": 0.8638, "step": 640 }, { "epoch": 6.201923076923077, "grad_norm": 0.224609375, "learning_rate": 7.574268183739989e-05, "loss": 0.866, "step": 645 }, { "epoch": 6.25, "grad_norm": 0.236328125, "learning_rate": 7.411809548974792e-05, "loss": 0.8673, "step": 650 }, { "epoch": 6.298076923076923, "grad_norm": 0.263671875, "learning_rate": 7.250079824521743e-05, "loss": 0.87, "step": 655 }, { "epoch": 6.346153846153846, "grad_norm": 0.30859375, "learning_rate": 7.089124558212871e-05, "loss": 0.8634, "step": 660 }, { "epoch": 6.394230769230769, "grad_norm": 0.23828125, "learning_rate": 6.928989079770094e-05, "loss": 0.8688, "step": 665 }, { "epoch": 6.4423076923076925, "grad_norm": 0.234375, "learning_rate": 6.769718488039023e-05, "loss": 0.864, "step": 670 }, { "epoch": 6.490384615384615, "grad_norm": 0.236328125, "learning_rate": 6.611357638287823e-05, "loss": 0.8698, "step": 675 }, { "epoch": 6.538461538461538, "grad_norm": 0.2431640625, "learning_rate": 6.453951129574644e-05, "loss": 0.865, "step": 680 }, { "epoch": 6.586538461538462, "grad_norm": 0.23046875, "learning_rate": 6.297543292187215e-05, "loss": 0.874, "step": 685 }, { "epoch": 6.634615384615385, "grad_norm": 0.2294921875, "learning_rate": 6.142178175158149e-05, "loss": 0.8694, "step": 690 }, { "epoch": 6.6826923076923075, "grad_norm": 0.2421875, "learning_rate": 5.9878995338594224e-05, "loss": 0.8688, "step": 695 }, { "epoch": 6.730769230769231, "grad_norm": 0.23046875, "learning_rate": 5.834750817679606e-05, "loss": 0.8671, "step": 700 }, { "epoch": 6.778846153846154, "grad_norm": 0.240234375, "learning_rate": 5.682775157787213e-05, "loss": 0.8666, "step": 705 }, { "epoch": 6.826923076923077, "grad_norm": 0.2158203125, "learning_rate": 5.5320153549837415e-05, "loss": 0.8623, "step": 710 }, { "epoch": 6.875, "grad_norm": 0.21484375, "learning_rate": 5.382513867649663e-05, "loss": 0.8677, "step": 715 }, { "epoch": 6.923076923076923, "grad_norm": 0.2138671875, "learning_rate": 5.234312799786921e-05, "loss": 0.8654, "step": 720 }, { "epoch": 6.971153846153846, "grad_norm": 0.208984375, "learning_rate": 5.087453889161229e-05, "loss": 0.8635, "step": 725 }, { "epoch": 7.0, "eval_loss": 2.5363287925720215, "eval_runtime": 0.4813, "eval_samples_per_second": 20.776, "eval_steps_per_second": 2.078, "step": 728 }, { "epoch": 7.019230769230769, "grad_norm": 0.2373046875, "learning_rate": 4.9419784955474524e-05, "loss": 0.8611, "step": 730 }, { "epoch": 7.0673076923076925, "grad_norm": 0.2333984375, "learning_rate": 4.797927589081509e-05, "loss": 0.8528, "step": 735 }, { "epoch": 7.115384615384615, "grad_norm": 0.24609375, "learning_rate": 4.6553417387219886e-05, "loss": 0.858, "step": 740 }, { "epoch": 7.163461538461538, "grad_norm": 0.240234375, "learning_rate": 4.514261100824709e-05, "loss": 0.8525, "step": 745 }, { "epoch": 7.211538461538462, "grad_norm": 0.2373046875, "learning_rate": 4.374725407833532e-05, "loss": 0.8593, "step": 750 }, { "epoch": 7.259615384615385, "grad_norm": 0.244140625, "learning_rate": 4.236773957090548e-05, "loss": 0.8563, "step": 755 }, { "epoch": 7.3076923076923075, "grad_norm": 0.259765625, "learning_rate": 4.100445599768774e-05, "loss": 0.8562, "step": 760 }, { "epoch": 7.355769230769231, "grad_norm": 0.2177734375, "learning_rate": 3.96577872993053e-05, "loss": 0.8574, "step": 765 }, { "epoch": 7.403846153846154, "grad_norm": 0.224609375, "learning_rate": 3.832811273714569e-05, "loss": 0.85, "step": 770 }, { "epoch": 7.451923076923077, "grad_norm": 0.2158203125, "learning_rate": 3.701580678654925e-05, "loss": 0.8537, "step": 775 }, { "epoch": 7.5, "grad_norm": 0.25390625, "learning_rate": 3.5721239031346066e-05, "loss": 0.8579, "step": 780 }, { "epoch": 7.548076923076923, "grad_norm": 0.2099609375, "learning_rate": 3.4444774059770536e-05, "loss": 0.8567, "step": 785 }, { "epoch": 7.596153846153846, "grad_norm": 0.2177734375, "learning_rate": 3.318677136178228e-05, "loss": 0.8515, "step": 790 }, { "epoch": 7.644230769230769, "grad_norm": 0.224609375, "learning_rate": 3.1947585227823394e-05, "loss": 0.8519, "step": 795 }, { "epoch": 7.6923076923076925, "grad_norm": 0.2216796875, "learning_rate": 3.072756464904006e-05, "loss": 0.8611, "step": 800 }, { "epoch": 7.740384615384615, "grad_norm": 0.21484375, "learning_rate": 2.9527053218996037e-05, "loss": 0.855, "step": 805 }, { "epoch": 7.788461538461538, "grad_norm": 0.220703125, "learning_rate": 2.8346389036906828e-05, "loss": 0.8529, "step": 810 }, { "epoch": 7.836538461538462, "grad_norm": 0.21484375, "learning_rate": 2.7185904612421176e-05, "loss": 0.8506, "step": 815 }, { "epoch": 7.884615384615385, "grad_norm": 0.2236328125, "learning_rate": 2.6045926771976303e-05, "loss": 0.8563, "step": 820 }, { "epoch": 7.9326923076923075, "grad_norm": 0.2109375, "learning_rate": 2.492677656675414e-05, "loss": 0.861, "step": 825 }, { "epoch": 7.980769230769231, "grad_norm": 0.2177734375, "learning_rate": 2.382876918226409e-05, "loss": 0.8535, "step": 830 }, { "epoch": 8.0, "eval_loss": 2.5545527935028076, "eval_runtime": 0.4771, "eval_samples_per_second": 20.959, "eval_steps_per_second": 2.096, "step": 832 }, { "epoch": 8.028846153846153, "grad_norm": 0.2158203125, "learning_rate": 2.2752213849577188e-05, "loss": 0.8505, "step": 835 }, { "epoch": 8.076923076923077, "grad_norm": 0.2177734375, "learning_rate": 2.1697413758237784e-05, "loss": 0.8473, "step": 840 }, { "epoch": 8.125, "grad_norm": 0.220703125, "learning_rate": 2.0664665970876496e-05, "loss": 0.8438, "step": 845 }, { "epoch": 8.173076923076923, "grad_norm": 0.212890625, "learning_rate": 1.965426133954854e-05, "loss": 0.8551, "step": 850 }, { "epoch": 8.221153846153847, "grad_norm": 0.2119140625, "learning_rate": 1.8666484423821373e-05, "loss": 0.8528, "step": 855 }, { "epoch": 8.26923076923077, "grad_norm": 0.2236328125, "learning_rate": 1.7701613410634365e-05, "loss": 0.848, "step": 860 }, { "epoch": 8.317307692307692, "grad_norm": 0.216796875, "learning_rate": 1.6759920035953093e-05, "loss": 0.8475, "step": 865 }, { "epoch": 8.365384615384615, "grad_norm": 0.208984375, "learning_rate": 1.584166950824061e-05, "loss": 0.8535, "step": 870 }, { "epoch": 8.413461538461538, "grad_norm": 0.2119140625, "learning_rate": 1.4947120433767047e-05, "loss": 0.8508, "step": 875 }, { "epoch": 8.461538461538462, "grad_norm": 0.212890625, "learning_rate": 1.4076524743778319e-05, "loss": 0.8459, "step": 880 }, { "epoch": 8.509615384615385, "grad_norm": 0.21484375, "learning_rate": 1.3230127623545064e-05, "loss": 0.8481, "step": 885 }, { "epoch": 8.557692307692308, "grad_norm": 0.2109375, "learning_rate": 1.2408167443311214e-05, "loss": 0.8508, "step": 890 }, { "epoch": 8.60576923076923, "grad_norm": 0.212890625, "learning_rate": 1.1610875691161915e-05, "loss": 0.8489, "step": 895 }, { "epoch": 8.653846153846153, "grad_norm": 0.2060546875, "learning_rate": 1.083847690782972e-05, "loss": 0.8423, "step": 900 }, { "epoch": 8.701923076923077, "grad_norm": 0.22265625, "learning_rate": 1.0091188623457415e-05, "loss": 0.8562, "step": 905 }, { "epoch": 8.75, "grad_norm": 0.2138671875, "learning_rate": 9.369221296335006e-06, "loss": 0.8515, "step": 910 }, { "epoch": 8.798076923076923, "grad_norm": 0.2138671875, "learning_rate": 8.672778253628621e-06, "loss": 0.8497, "step": 915 }, { "epoch": 8.846153846153847, "grad_norm": 0.208984375, "learning_rate": 8.002055634117578e-06, "loss": 0.8469, "step": 920 }, { "epoch": 8.89423076923077, "grad_norm": 0.208984375, "learning_rate": 7.357242332955916e-06, "loss": 0.8509, "step": 925 }, { "epoch": 8.942307692307692, "grad_norm": 0.208984375, "learning_rate": 6.738519948473976e-06, "loss": 0.8545, "step": 930 }, { "epoch": 8.990384615384615, "grad_norm": 0.205078125, "learning_rate": 6.146062731035129e-06, "loss": 0.845, "step": 935 }, { "epoch": 9.0, "eval_loss": 2.5566022396087646, "eval_runtime": 0.4819, "eval_samples_per_second": 20.752, "eval_steps_per_second": 2.075, "step": 936 }, { "epoch": 9.038461538461538, "grad_norm": 0.2138671875, "learning_rate": 5.580037533961546e-06, "loss": 0.8439, "step": 940 }, { "epoch": 9.086538461538462, "grad_norm": 0.2060546875, "learning_rate": 5.040603766543594e-06, "loss": 0.8496, "step": 945 }, { "epoch": 9.134615384615385, "grad_norm": 0.208984375, "learning_rate": 4.527913349145441e-06, "loss": 0.8483, "step": 950 }, { "epoch": 9.182692307692308, "grad_norm": 0.2060546875, "learning_rate": 4.042110670419763e-06, "loss": 0.8508, "step": 955 }, { "epoch": 9.23076923076923, "grad_norm": 0.205078125, "learning_rate": 3.5833325466437694e-06, "loss": 0.8508, "step": 960 }, { "epoch": 9.278846153846153, "grad_norm": 0.2021484375, "learning_rate": 3.1517081831876737e-06, "loss": 0.8437, "step": 965 }, { "epoch": 9.326923076923077, "grad_norm": 0.212890625, "learning_rate": 2.7473591381266708e-06, "loss": 0.845, "step": 970 }, { "epoch": 9.375, "grad_norm": 0.2060546875, "learning_rate": 2.3703992880066638e-06, "loss": 0.8428, "step": 975 }, { "epoch": 9.423076923076923, "grad_norm": 0.20703125, "learning_rate": 2.0209347957732328e-06, "loss": 0.8495, "step": 980 }, { "epoch": 9.471153846153847, "grad_norm": 0.2138671875, "learning_rate": 1.6990640808730696e-06, "loss": 0.846, "step": 985 }, { "epoch": 9.51923076923077, "grad_norm": 0.2109375, "learning_rate": 1.404877791536141e-06, "loss": 0.8475, "step": 990 }, { "epoch": 9.567307692307692, "grad_norm": 0.203125, "learning_rate": 1.1384587792465872e-06, "loss": 0.8478, "step": 995 }, { "epoch": 9.615384615384615, "grad_norm": 0.21875, "learning_rate": 8.998820754091531e-07, "loss": 0.844, "step": 1000 }, { "epoch": 9.663461538461538, "grad_norm": 0.2060546875, "learning_rate": 6.892148702183133e-07, "loss": 0.8478, "step": 1005 }, { "epoch": 9.711538461538462, "grad_norm": 0.2060546875, "learning_rate": 5.065164937354428e-07, "loss": 0.8459, "step": 1010 }, { "epoch": 9.759615384615385, "grad_norm": 0.2041015625, "learning_rate": 3.5183839917972697e-07, "loss": 0.8475, "step": 1015 }, { "epoch": 9.807692307692308, "grad_norm": 0.2216796875, "learning_rate": 2.2522414843748618e-07, "loss": 0.8533, "step": 1020 }, { "epoch": 9.85576923076923, "grad_norm": 0.205078125, "learning_rate": 1.2670939979384512e-07, "loss": 0.8547, "step": 1025 }, { "epoch": 9.903846153846153, "grad_norm": 0.2080078125, "learning_rate": 5.632189789027687e-08, "loss": 0.8463, "step": 1030 }, { "epoch": 9.951923076923077, "grad_norm": 0.2060546875, "learning_rate": 1.4081465910975588e-08, "loss": 0.8418, "step": 1035 }, { "epoch": 10.0, "grad_norm": 0.2080078125, "learning_rate": 0.0, "loss": 0.853, "step": 1040 }, { "epoch": 10.0, "eval_loss": 2.5572547912597656, "eval_runtime": 0.4824, "eval_samples_per_second": 20.728, "eval_steps_per_second": 2.073, "step": 1040 }, { "epoch": 10.0, "step": 1040, "total_flos": 1.6530423313701274e+18, "train_loss": 0.977329820394516, "train_runtime": 7126.3412, "train_samples_per_second": 18.665, "train_steps_per_second": 0.146 } ], "logging_steps": 5, "max_steps": 1040, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6530423313701274e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }