{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.993815708101423, "eval_steps": 500, "global_step": 8080, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012368583797155227, "grad_norm": 6.28125, "learning_rate": 2.4752475247524754e-07, "loss": 2.8093, "step": 1 }, { "epoch": 0.006184291898577613, "grad_norm": 4.125, "learning_rate": 1.2376237623762377e-06, "loss": 2.8212, "step": 5 }, { "epoch": 0.012368583797155226, "grad_norm": 4.03125, "learning_rate": 2.4752475247524753e-06, "loss": 2.7914, "step": 10 }, { "epoch": 0.01855287569573284, "grad_norm": 3.421875, "learning_rate": 3.7128712871287128e-06, "loss": 2.7819, "step": 15 }, { "epoch": 0.024737167594310452, "grad_norm": 3.546875, "learning_rate": 4.950495049504951e-06, "loss": 2.8066, "step": 20 }, { "epoch": 0.030921459492888066, "grad_norm": 6.25, "learning_rate": 6.1881188118811885e-06, "loss": 2.7294, "step": 25 }, { "epoch": 0.03710575139146568, "grad_norm": 2.4375, "learning_rate": 7.4257425742574256e-06, "loss": 2.668, "step": 30 }, { "epoch": 0.04329004329004329, "grad_norm": 13.375, "learning_rate": 8.663366336633663e-06, "loss": 2.6216, "step": 35 }, { "epoch": 0.049474335188620905, "grad_norm": 3.4375, "learning_rate": 9.900990099009901e-06, "loss": 2.5458, "step": 40 }, { "epoch": 0.055658627087198514, "grad_norm": 3.484375, "learning_rate": 1.113861386138614e-05, "loss": 2.4339, "step": 45 }, { "epoch": 0.06184291898577613, "grad_norm": 2.140625, "learning_rate": 1.2376237623762377e-05, "loss": 2.3575, "step": 50 }, { "epoch": 0.06802721088435375, "grad_norm": 1.125, "learning_rate": 1.3613861386138616e-05, "loss": 2.2852, "step": 55 }, { "epoch": 0.07421150278293136, "grad_norm": 1.3671875, "learning_rate": 1.4851485148514851e-05, "loss": 2.2098, "step": 60 }, { "epoch": 0.08039579468150897, "grad_norm": 6.09375, "learning_rate": 1.608910891089109e-05, "loss": 2.1279, "step": 65 }, { "epoch": 0.08658008658008658, "grad_norm": 1.625, "learning_rate": 1.7326732673267325e-05, "loss": 2.0726, "step": 70 }, { "epoch": 0.09276437847866419, "grad_norm": 1.2578125, "learning_rate": 1.8564356435643564e-05, "loss": 1.9797, "step": 75 }, { "epoch": 0.09894867037724181, "grad_norm": 1.09375, "learning_rate": 1.9801980198019803e-05, "loss": 1.9154, "step": 80 }, { "epoch": 0.10513296227581942, "grad_norm": 1.9140625, "learning_rate": 2.103960396039604e-05, "loss": 1.8348, "step": 85 }, { "epoch": 0.11131725417439703, "grad_norm": 1.890625, "learning_rate": 2.227722772277228e-05, "loss": 1.7348, "step": 90 }, { "epoch": 0.11750154607297464, "grad_norm": 1.296875, "learning_rate": 2.3514851485148515e-05, "loss": 1.705, "step": 95 }, { "epoch": 0.12368583797155226, "grad_norm": 1.875, "learning_rate": 2.4752475247524754e-05, "loss": 1.643, "step": 100 }, { "epoch": 0.12987012987012986, "grad_norm": 0.5703125, "learning_rate": 2.5990099009900993e-05, "loss": 1.5904, "step": 105 }, { "epoch": 0.1360544217687075, "grad_norm": 2.078125, "learning_rate": 2.722772277227723e-05, "loss": 1.537, "step": 110 }, { "epoch": 0.1422387136672851, "grad_norm": 0.546875, "learning_rate": 2.8465346534653464e-05, "loss": 1.4979, "step": 115 }, { "epoch": 0.14842300556586271, "grad_norm": 0.474609375, "learning_rate": 2.9702970297029702e-05, "loss": 1.4599, "step": 120 }, { "epoch": 0.15460729746444032, "grad_norm": 0.68359375, "learning_rate": 3.094059405940594e-05, "loss": 1.4298, "step": 125 }, { "epoch": 0.16079158936301793, "grad_norm": 0.57421875, "learning_rate": 3.217821782178218e-05, "loss": 1.3996, "step": 130 }, { "epoch": 0.16697588126159554, "grad_norm": 0.431640625, "learning_rate": 3.341584158415842e-05, "loss": 1.3705, "step": 135 }, { "epoch": 0.17316017316017315, "grad_norm": 0.482421875, "learning_rate": 3.465346534653465e-05, "loss": 1.3501, "step": 140 }, { "epoch": 0.17934446505875076, "grad_norm": 0.373046875, "learning_rate": 3.589108910891089e-05, "loss": 1.3185, "step": 145 }, { "epoch": 0.18552875695732837, "grad_norm": 0.376953125, "learning_rate": 3.712871287128713e-05, "loss": 1.3105, "step": 150 }, { "epoch": 0.191713048855906, "grad_norm": 0.734375, "learning_rate": 3.8366336633663367e-05, "loss": 1.2914, "step": 155 }, { "epoch": 0.19789734075448362, "grad_norm": 0.423828125, "learning_rate": 3.9603960396039605e-05, "loss": 1.2799, "step": 160 }, { "epoch": 0.20408163265306123, "grad_norm": 0.4765625, "learning_rate": 4.0841584158415844e-05, "loss": 1.2651, "step": 165 }, { "epoch": 0.21026592455163884, "grad_norm": 0.5078125, "learning_rate": 4.207920792079208e-05, "loss": 1.2502, "step": 170 }, { "epoch": 0.21645021645021645, "grad_norm": 0.486328125, "learning_rate": 4.331683168316832e-05, "loss": 1.2377, "step": 175 }, { "epoch": 0.22263450834879406, "grad_norm": 0.46484375, "learning_rate": 4.455445544554456e-05, "loss": 1.2192, "step": 180 }, { "epoch": 0.22881880024737167, "grad_norm": 0.5078125, "learning_rate": 4.57920792079208e-05, "loss": 1.2274, "step": 185 }, { "epoch": 0.23500309214594928, "grad_norm": 0.578125, "learning_rate": 4.702970297029703e-05, "loss": 1.2065, "step": 190 }, { "epoch": 0.24118738404452691, "grad_norm": 0.380859375, "learning_rate": 4.826732673267327e-05, "loss": 1.1975, "step": 195 }, { "epoch": 0.24737167594310452, "grad_norm": 0.35546875, "learning_rate": 4.950495049504951e-05, "loss": 1.188, "step": 200 }, { "epoch": 0.2535559678416821, "grad_norm": 0.376953125, "learning_rate": 5.074257425742575e-05, "loss": 1.1719, "step": 205 }, { "epoch": 0.2597402597402597, "grad_norm": 0.5078125, "learning_rate": 5.1980198019801986e-05, "loss": 1.187, "step": 210 }, { "epoch": 0.2659245516388373, "grad_norm": 0.439453125, "learning_rate": 5.3217821782178224e-05, "loss": 1.1684, "step": 215 }, { "epoch": 0.272108843537415, "grad_norm": 0.490234375, "learning_rate": 5.445544554455446e-05, "loss": 1.1803, "step": 220 }, { "epoch": 0.2782931354359926, "grad_norm": 0.46875, "learning_rate": 5.56930693069307e-05, "loss": 1.1543, "step": 225 }, { "epoch": 0.2844774273345702, "grad_norm": 0.486328125, "learning_rate": 5.693069306930693e-05, "loss": 1.1618, "step": 230 }, { "epoch": 0.2906617192331478, "grad_norm": 0.376953125, "learning_rate": 5.8168316831683166e-05, "loss": 1.1529, "step": 235 }, { "epoch": 0.29684601113172543, "grad_norm": 0.4765625, "learning_rate": 5.9405940594059404e-05, "loss": 1.1397, "step": 240 }, { "epoch": 0.30303030303030304, "grad_norm": 0.462890625, "learning_rate": 6.064356435643564e-05, "loss": 1.1355, "step": 245 }, { "epoch": 0.30921459492888065, "grad_norm": 0.57421875, "learning_rate": 6.188118811881188e-05, "loss": 1.1376, "step": 250 }, { "epoch": 0.31539888682745826, "grad_norm": 0.83203125, "learning_rate": 6.311881188118812e-05, "loss": 1.1359, "step": 255 }, { "epoch": 0.32158317872603587, "grad_norm": 0.64453125, "learning_rate": 6.435643564356436e-05, "loss": 1.1293, "step": 260 }, { "epoch": 0.3277674706246135, "grad_norm": 0.78125, "learning_rate": 6.55940594059406e-05, "loss": 1.117, "step": 265 }, { "epoch": 0.3339517625231911, "grad_norm": 0.54296875, "learning_rate": 6.683168316831684e-05, "loss": 1.1217, "step": 270 }, { "epoch": 0.3401360544217687, "grad_norm": 0.62109375, "learning_rate": 6.806930693069308e-05, "loss": 1.1266, "step": 275 }, { "epoch": 0.3463203463203463, "grad_norm": 0.90234375, "learning_rate": 6.93069306930693e-05, "loss": 1.1214, "step": 280 }, { "epoch": 0.3525046382189239, "grad_norm": 0.4921875, "learning_rate": 7.054455445544554e-05, "loss": 1.1032, "step": 285 }, { "epoch": 0.3586889301175015, "grad_norm": 0.76171875, "learning_rate": 7.178217821782178e-05, "loss": 1.1243, "step": 290 }, { "epoch": 0.36487322201607914, "grad_norm": 0.455078125, "learning_rate": 7.301980198019802e-05, "loss": 1.1129, "step": 295 }, { "epoch": 0.37105751391465674, "grad_norm": 0.4609375, "learning_rate": 7.425742574257426e-05, "loss": 1.0952, "step": 300 }, { "epoch": 0.3772418058132344, "grad_norm": 0.453125, "learning_rate": 7.54950495049505e-05, "loss": 1.1083, "step": 305 }, { "epoch": 0.383426097711812, "grad_norm": 0.50390625, "learning_rate": 7.673267326732673e-05, "loss": 1.0979, "step": 310 }, { "epoch": 0.38961038961038963, "grad_norm": 0.494140625, "learning_rate": 7.797029702970297e-05, "loss": 1.1026, "step": 315 }, { "epoch": 0.39579468150896724, "grad_norm": 0.443359375, "learning_rate": 7.920792079207921e-05, "loss": 1.0854, "step": 320 }, { "epoch": 0.40197897340754485, "grad_norm": 0.55078125, "learning_rate": 8.044554455445545e-05, "loss": 1.0892, "step": 325 }, { "epoch": 0.40816326530612246, "grad_norm": 1.1640625, "learning_rate": 8.168316831683169e-05, "loss": 1.0795, "step": 330 }, { "epoch": 0.41434755720470007, "grad_norm": 0.67578125, "learning_rate": 8.292079207920793e-05, "loss": 1.099, "step": 335 }, { "epoch": 0.4205318491032777, "grad_norm": 0.65625, "learning_rate": 8.415841584158417e-05, "loss": 1.087, "step": 340 }, { "epoch": 0.4267161410018553, "grad_norm": 0.48828125, "learning_rate": 8.53960396039604e-05, "loss": 1.0812, "step": 345 }, { "epoch": 0.4329004329004329, "grad_norm": 0.69921875, "learning_rate": 8.663366336633664e-05, "loss": 1.0831, "step": 350 }, { "epoch": 0.4390847247990105, "grad_norm": 0.5390625, "learning_rate": 8.787128712871288e-05, "loss": 1.072, "step": 355 }, { "epoch": 0.4452690166975881, "grad_norm": 0.67578125, "learning_rate": 8.910891089108912e-05, "loss": 1.0684, "step": 360 }, { "epoch": 0.4514533085961657, "grad_norm": 0.625, "learning_rate": 9.034653465346536e-05, "loss": 1.0639, "step": 365 }, { "epoch": 0.45763760049474334, "grad_norm": 1.234375, "learning_rate": 9.15841584158416e-05, "loss": 1.0802, "step": 370 }, { "epoch": 0.46382189239332094, "grad_norm": 0.78515625, "learning_rate": 9.282178217821784e-05, "loss": 1.0749, "step": 375 }, { "epoch": 0.47000618429189855, "grad_norm": 1.03125, "learning_rate": 9.405940594059406e-05, "loss": 1.0647, "step": 380 }, { "epoch": 0.47619047619047616, "grad_norm": 0.78515625, "learning_rate": 9.52970297029703e-05, "loss": 1.0707, "step": 385 }, { "epoch": 0.48237476808905383, "grad_norm": 1.0703125, "learning_rate": 9.653465346534654e-05, "loss": 1.0695, "step": 390 }, { "epoch": 0.48855905998763144, "grad_norm": 0.671875, "learning_rate": 9.777227722772278e-05, "loss": 1.0599, "step": 395 }, { "epoch": 0.49474335188620905, "grad_norm": 0.62890625, "learning_rate": 9.900990099009902e-05, "loss": 1.0522, "step": 400 }, { "epoch": 0.5009276437847866, "grad_norm": 0.4453125, "learning_rate": 0.00010024752475247526, "loss": 1.0553, "step": 405 }, { "epoch": 0.5071119356833642, "grad_norm": 0.482421875, "learning_rate": 0.0001014851485148515, "loss": 1.0575, "step": 410 }, { "epoch": 0.5132962275819418, "grad_norm": 0.57421875, "learning_rate": 0.00010272277227722773, "loss": 1.0644, "step": 415 }, { "epoch": 0.5194805194805194, "grad_norm": 1.015625, "learning_rate": 0.00010396039603960397, "loss": 1.0479, "step": 420 }, { "epoch": 0.525664811379097, "grad_norm": 0.60546875, "learning_rate": 0.00010519801980198021, "loss": 1.0548, "step": 425 }, { "epoch": 0.5318491032776747, "grad_norm": 0.56640625, "learning_rate": 0.00010643564356435645, "loss": 1.0569, "step": 430 }, { "epoch": 0.5380333951762524, "grad_norm": 0.486328125, "learning_rate": 0.00010767326732673269, "loss": 1.0541, "step": 435 }, { "epoch": 0.54421768707483, "grad_norm": 0.4765625, "learning_rate": 0.00010891089108910893, "loss": 1.0588, "step": 440 }, { "epoch": 0.5504019789734076, "grad_norm": 0.55078125, "learning_rate": 0.00011014851485148517, "loss": 1.0609, "step": 445 }, { "epoch": 0.5565862708719852, "grad_norm": 0.6171875, "learning_rate": 0.0001113861386138614, "loss": 1.0463, "step": 450 }, { "epoch": 0.5627705627705628, "grad_norm": 0.51953125, "learning_rate": 0.00011262376237623762, "loss": 1.0491, "step": 455 }, { "epoch": 0.5689548546691404, "grad_norm": 0.455078125, "learning_rate": 0.00011386138613861385, "loss": 1.0472, "step": 460 }, { "epoch": 0.575139146567718, "grad_norm": 0.55859375, "learning_rate": 0.00011509900990099009, "loss": 1.0456, "step": 465 }, { "epoch": 0.5813234384662956, "grad_norm": 0.515625, "learning_rate": 0.00011633663366336633, "loss": 1.0289, "step": 470 }, { "epoch": 0.5875077303648732, "grad_norm": 0.447265625, "learning_rate": 0.00011757425742574257, "loss": 1.04, "step": 475 }, { "epoch": 0.5936920222634509, "grad_norm": 0.5234375, "learning_rate": 0.00011881188118811881, "loss": 1.0322, "step": 480 }, { "epoch": 0.5998763141620285, "grad_norm": 0.4609375, "learning_rate": 0.00012004950495049505, "loss": 1.0225, "step": 485 }, { "epoch": 0.6060606060606061, "grad_norm": 0.451171875, "learning_rate": 0.00012128712871287129, "loss": 1.0438, "step": 490 }, { "epoch": 0.6122448979591837, "grad_norm": 0.6484375, "learning_rate": 0.00012252475247524753, "loss": 1.0431, "step": 495 }, { "epoch": 0.6184291898577613, "grad_norm": 0.65234375, "learning_rate": 0.00012376237623762376, "loss": 1.0262, "step": 500 }, { "epoch": 0.6246134817563389, "grad_norm": 0.60546875, "learning_rate": 0.000125, "loss": 1.0338, "step": 505 }, { "epoch": 0.6307977736549165, "grad_norm": 0.65625, "learning_rate": 0.00012623762376237624, "loss": 1.0423, "step": 510 }, { "epoch": 0.6369820655534941, "grad_norm": 0.3984375, "learning_rate": 0.00012747524752475248, "loss": 1.0428, "step": 515 }, { "epoch": 0.6431663574520717, "grad_norm": 0.57421875, "learning_rate": 0.00012871287128712872, "loss": 1.0311, "step": 520 }, { "epoch": 0.6493506493506493, "grad_norm": 0.5859375, "learning_rate": 0.00012995049504950496, "loss": 1.0284, "step": 525 }, { "epoch": 0.655534941249227, "grad_norm": 0.5546875, "learning_rate": 0.0001311881188118812, "loss": 1.0259, "step": 530 }, { "epoch": 0.6617192331478046, "grad_norm": 0.6171875, "learning_rate": 0.00013242574257425743, "loss": 1.0183, "step": 535 }, { "epoch": 0.6679035250463822, "grad_norm": 1.1015625, "learning_rate": 0.00013366336633663367, "loss": 1.0342, "step": 540 }, { "epoch": 0.6740878169449598, "grad_norm": 0.64453125, "learning_rate": 0.0001349009900990099, "loss": 1.0192, "step": 545 }, { "epoch": 0.6802721088435374, "grad_norm": 0.74609375, "learning_rate": 0.00013613861386138615, "loss": 1.0251, "step": 550 }, { "epoch": 0.686456400742115, "grad_norm": 1.0078125, "learning_rate": 0.0001373762376237624, "loss": 1.0268, "step": 555 }, { "epoch": 0.6926406926406926, "grad_norm": 0.7265625, "learning_rate": 0.0001386138613861386, "loss": 1.0152, "step": 560 }, { "epoch": 0.6988249845392702, "grad_norm": 0.703125, "learning_rate": 0.00013985148514851484, "loss": 1.0283, "step": 565 }, { "epoch": 0.7050092764378478, "grad_norm": 0.53125, "learning_rate": 0.00014108910891089108, "loss": 1.0177, "step": 570 }, { "epoch": 0.7111935683364254, "grad_norm": 0.58203125, "learning_rate": 0.00014232673267326732, "loss": 1.0115, "step": 575 }, { "epoch": 0.717377860235003, "grad_norm": 0.5546875, "learning_rate": 0.00014356435643564356, "loss": 1.0195, "step": 580 }, { "epoch": 0.7235621521335807, "grad_norm": 0.5703125, "learning_rate": 0.0001448019801980198, "loss": 1.0104, "step": 585 }, { "epoch": 0.7297464440321583, "grad_norm": 0.6171875, "learning_rate": 0.00014603960396039603, "loss": 1.0054, "step": 590 }, { "epoch": 0.7359307359307359, "grad_norm": 0.76953125, "learning_rate": 0.00014727722772277227, "loss": 1.0204, "step": 595 }, { "epoch": 0.7421150278293135, "grad_norm": 0.447265625, "learning_rate": 0.0001485148514851485, "loss": 0.9982, "step": 600 }, { "epoch": 0.7482993197278912, "grad_norm": 0.498046875, "learning_rate": 0.00014975247524752475, "loss": 1.0143, "step": 605 }, { "epoch": 0.7544836116264688, "grad_norm": 0.6171875, "learning_rate": 0.000150990099009901, "loss": 1.0077, "step": 610 }, { "epoch": 0.7606679035250464, "grad_norm": 0.515625, "learning_rate": 0.00015222772277227723, "loss": 1.0123, "step": 615 }, { "epoch": 0.766852195423624, "grad_norm": 0.69921875, "learning_rate": 0.00015346534653465347, "loss": 1.0123, "step": 620 }, { "epoch": 0.7730364873222016, "grad_norm": 0.4765625, "learning_rate": 0.0001547029702970297, "loss": 1.012, "step": 625 }, { "epoch": 0.7792207792207793, "grad_norm": 0.423828125, "learning_rate": 0.00015594059405940594, "loss": 1.0055, "step": 630 }, { "epoch": 0.7854050711193569, "grad_norm": 0.42578125, "learning_rate": 0.00015717821782178218, "loss": 1.0076, "step": 635 }, { "epoch": 0.7915893630179345, "grad_norm": 0.89453125, "learning_rate": 0.00015841584158415842, "loss": 1.007, "step": 640 }, { "epoch": 0.7977736549165121, "grad_norm": 0.462890625, "learning_rate": 0.00015965346534653466, "loss": 1.004, "step": 645 }, { "epoch": 0.8039579468150897, "grad_norm": 0.4765625, "learning_rate": 0.0001608910891089109, "loss": 1.0002, "step": 650 }, { "epoch": 0.8101422387136673, "grad_norm": 0.412109375, "learning_rate": 0.00016212871287128714, "loss": 0.9985, "step": 655 }, { "epoch": 0.8163265306122449, "grad_norm": 0.392578125, "learning_rate": 0.00016336633663366338, "loss": 0.9997, "step": 660 }, { "epoch": 0.8225108225108225, "grad_norm": 0.421875, "learning_rate": 0.00016460396039603961, "loss": 0.9955, "step": 665 }, { "epoch": 0.8286951144094001, "grad_norm": 0.490234375, "learning_rate": 0.00016584158415841585, "loss": 0.9878, "step": 670 }, { "epoch": 0.8348794063079777, "grad_norm": 0.439453125, "learning_rate": 0.0001670792079207921, "loss": 1.0049, "step": 675 }, { "epoch": 0.8410636982065554, "grad_norm": 0.70703125, "learning_rate": 0.00016831683168316833, "loss": 1.0004, "step": 680 }, { "epoch": 0.847247990105133, "grad_norm": 0.74609375, "learning_rate": 0.00016955445544554457, "loss": 0.9907, "step": 685 }, { "epoch": 0.8534322820037106, "grad_norm": 0.6015625, "learning_rate": 0.0001707920792079208, "loss": 0.992, "step": 690 }, { "epoch": 0.8596165739022882, "grad_norm": 0.671875, "learning_rate": 0.00017202970297029705, "loss": 0.9973, "step": 695 }, { "epoch": 0.8658008658008658, "grad_norm": 0.47265625, "learning_rate": 0.00017326732673267329, "loss": 1.0015, "step": 700 }, { "epoch": 0.8719851576994434, "grad_norm": 0.55859375, "learning_rate": 0.00017450495049504952, "loss": 0.9928, "step": 705 }, { "epoch": 0.878169449598021, "grad_norm": 0.51171875, "learning_rate": 0.00017574257425742576, "loss": 0.9891, "step": 710 }, { "epoch": 0.8843537414965986, "grad_norm": 0.376953125, "learning_rate": 0.000176980198019802, "loss": 0.9895, "step": 715 }, { "epoch": 0.8905380333951762, "grad_norm": 0.53515625, "learning_rate": 0.00017821782178217824, "loss": 0.9783, "step": 720 }, { "epoch": 0.8967223252937538, "grad_norm": 0.9375, "learning_rate": 0.00017945544554455448, "loss": 0.9943, "step": 725 }, { "epoch": 0.9029066171923315, "grad_norm": 0.63671875, "learning_rate": 0.00018069306930693072, "loss": 0.9906, "step": 730 }, { "epoch": 0.9090909090909091, "grad_norm": 0.416015625, "learning_rate": 0.00018193069306930696, "loss": 0.9906, "step": 735 }, { "epoch": 0.9152752009894867, "grad_norm": 0.5234375, "learning_rate": 0.0001831683168316832, "loss": 0.9826, "step": 740 }, { "epoch": 0.9214594928880643, "grad_norm": 0.470703125, "learning_rate": 0.00018440594059405943, "loss": 0.9912, "step": 745 }, { "epoch": 0.9276437847866419, "grad_norm": 0.4609375, "learning_rate": 0.00018564356435643567, "loss": 0.9796, "step": 750 }, { "epoch": 0.9338280766852195, "grad_norm": 0.53125, "learning_rate": 0.0001868811881188119, "loss": 0.9801, "step": 755 }, { "epoch": 0.9400123685837971, "grad_norm": 0.5234375, "learning_rate": 0.00018811881188118812, "loss": 0.9776, "step": 760 }, { "epoch": 0.9461966604823747, "grad_norm": 0.380859375, "learning_rate": 0.00018935643564356436, "loss": 0.9785, "step": 765 }, { "epoch": 0.9523809523809523, "grad_norm": 0.40625, "learning_rate": 0.0001905940594059406, "loss": 0.9771, "step": 770 }, { "epoch": 0.95856524427953, "grad_norm": 0.44140625, "learning_rate": 0.00019183168316831684, "loss": 0.9786, "step": 775 }, { "epoch": 0.9647495361781077, "grad_norm": 0.443359375, "learning_rate": 0.00019306930693069308, "loss": 0.9706, "step": 780 }, { "epoch": 0.9709338280766853, "grad_norm": 0.4296875, "learning_rate": 0.00019430693069306932, "loss": 0.9776, "step": 785 }, { "epoch": 0.9771181199752629, "grad_norm": 0.486328125, "learning_rate": 0.00019554455445544556, "loss": 0.9755, "step": 790 }, { "epoch": 0.9833024118738405, "grad_norm": 0.42578125, "learning_rate": 0.0001967821782178218, "loss": 0.9723, "step": 795 }, { "epoch": 0.9894867037724181, "grad_norm": 0.609375, "learning_rate": 0.00019801980198019803, "loss": 0.9878, "step": 800 }, { "epoch": 0.9956709956709957, "grad_norm": 0.4609375, "learning_rate": 0.00019925742574257427, "loss": 0.9714, "step": 805 }, { "epoch": 0.9993815708101422, "eval_loss": 2.4534544944763184, "eval_runtime": 0.806, "eval_samples_per_second": 12.408, "eval_steps_per_second": 1.241, "step": 808 }, { "epoch": 1.0018552875695732, "grad_norm": 0.423828125, "learning_rate": 0.0001999999626730957, "loss": 0.9646, "step": 810 }, { "epoch": 1.008039579468151, "grad_norm": 0.5, "learning_rate": 0.000199999542745742, "loss": 0.9632, "step": 815 }, { "epoch": 1.0142238713667284, "grad_norm": 0.5390625, "learning_rate": 0.00019999865623437013, "loss": 0.9571, "step": 820 }, { "epoch": 1.0204081632653061, "grad_norm": 0.4609375, "learning_rate": 0.00019999730314311637, "loss": 0.9559, "step": 825 }, { "epoch": 1.0265924551638836, "grad_norm": 0.4140625, "learning_rate": 0.0001999954834782941, "loss": 0.9832, "step": 830 }, { "epoch": 1.0327767470624614, "grad_norm": 0.431640625, "learning_rate": 0.00019999319724839354, "loss": 0.9583, "step": 835 }, { "epoch": 1.0389610389610389, "grad_norm": 0.486328125, "learning_rate": 0.000199990444464082, "loss": 0.9696, "step": 840 }, { "epoch": 1.0451453308596166, "grad_norm": 0.73046875, "learning_rate": 0.0001999872251382036, "loss": 0.9661, "step": 845 }, { "epoch": 1.051329622758194, "grad_norm": 0.54296875, "learning_rate": 0.00019998353928577919, "loss": 0.9668, "step": 850 }, { "epoch": 1.0575139146567718, "grad_norm": 0.5078125, "learning_rate": 0.00019997938692400648, "loss": 0.9591, "step": 855 }, { "epoch": 1.0636982065553493, "grad_norm": 0.34375, "learning_rate": 0.00019997476807225985, "loss": 0.9564, "step": 860 }, { "epoch": 1.069882498453927, "grad_norm": 0.369140625, "learning_rate": 0.0001999696827520902, "loss": 0.9541, "step": 865 }, { "epoch": 1.0760667903525047, "grad_norm": 0.375, "learning_rate": 0.00019996413098722493, "loss": 0.9466, "step": 870 }, { "epoch": 1.0822510822510822, "grad_norm": 0.38671875, "learning_rate": 0.00019995811280356778, "loss": 0.9512, "step": 875 }, { "epoch": 1.08843537414966, "grad_norm": 0.37109375, "learning_rate": 0.00019995162822919883, "loss": 0.9392, "step": 880 }, { "epoch": 1.0946196660482375, "grad_norm": 0.412109375, "learning_rate": 0.00019994467729437412, "loss": 0.9599, "step": 885 }, { "epoch": 1.1008039579468152, "grad_norm": 0.439453125, "learning_rate": 0.00019993726003152582, "loss": 0.9506, "step": 890 }, { "epoch": 1.1069882498453927, "grad_norm": 0.412109375, "learning_rate": 0.0001999293764752618, "loss": 0.9553, "step": 895 }, { "epoch": 1.1131725417439704, "grad_norm": 0.453125, "learning_rate": 0.00019992102666236566, "loss": 0.9369, "step": 900 }, { "epoch": 1.119356833642548, "grad_norm": 0.451171875, "learning_rate": 0.00019991221063179652, "loss": 0.9516, "step": 905 }, { "epoch": 1.1255411255411256, "grad_norm": 0.40625, "learning_rate": 0.00019990292842468868, "loss": 0.954, "step": 910 }, { "epoch": 1.1317254174397031, "grad_norm": 0.396484375, "learning_rate": 0.00019989318008435165, "loss": 0.9495, "step": 915 }, { "epoch": 1.1379097093382808, "grad_norm": 0.484375, "learning_rate": 0.00019988296565626987, "loss": 0.957, "step": 920 }, { "epoch": 1.1440940012368583, "grad_norm": 0.53515625, "learning_rate": 0.00019987228518810244, "loss": 0.9508, "step": 925 }, { "epoch": 1.150278293135436, "grad_norm": 0.455078125, "learning_rate": 0.0001998611387296829, "loss": 0.9433, "step": 930 }, { "epoch": 1.1564625850340136, "grad_norm": 0.36328125, "learning_rate": 0.00019984952633301915, "loss": 0.9576, "step": 935 }, { "epoch": 1.1626468769325913, "grad_norm": 0.408203125, "learning_rate": 0.00019983744805229296, "loss": 0.9433, "step": 940 }, { "epoch": 1.1688311688311688, "grad_norm": 0.435546875, "learning_rate": 0.00019982490394385995, "loss": 0.9551, "step": 945 }, { "epoch": 1.1750154607297465, "grad_norm": 0.4921875, "learning_rate": 0.00019981189406624922, "loss": 0.953, "step": 950 }, { "epoch": 1.181199752628324, "grad_norm": 0.384765625, "learning_rate": 0.00019979841848016298, "loss": 0.9393, "step": 955 }, { "epoch": 1.1873840445269017, "grad_norm": 0.392578125, "learning_rate": 0.00019978447724847652, "loss": 0.9523, "step": 960 }, { "epoch": 1.1935683364254792, "grad_norm": 0.49609375, "learning_rate": 0.0001997700704362377, "loss": 0.9481, "step": 965 }, { "epoch": 1.199752628324057, "grad_norm": 0.400390625, "learning_rate": 0.00019975519811066663, "loss": 0.95, "step": 970 }, { "epoch": 1.2059369202226344, "grad_norm": 0.46875, "learning_rate": 0.0001997398603411556, "loss": 0.9416, "step": 975 }, { "epoch": 1.2121212121212122, "grad_norm": 0.388671875, "learning_rate": 0.0001997240571992685, "loss": 0.9443, "step": 980 }, { "epoch": 1.2183055040197897, "grad_norm": 0.4140625, "learning_rate": 0.00019970778875874056, "loss": 0.9414, "step": 985 }, { "epoch": 1.2244897959183674, "grad_norm": 0.416015625, "learning_rate": 0.00019969105509547812, "loss": 0.939, "step": 990 }, { "epoch": 1.2306740878169449, "grad_norm": 0.45703125, "learning_rate": 0.00019967385628755812, "loss": 0.9453, "step": 995 }, { "epoch": 1.2368583797155226, "grad_norm": 0.3828125, "learning_rate": 0.0001996561924152278, "loss": 0.935, "step": 1000 }, { "epoch": 1.2430426716141, "grad_norm": 0.345703125, "learning_rate": 0.0001996380635609044, "loss": 0.9314, "step": 1005 }, { "epoch": 1.2492269635126778, "grad_norm": 0.4765625, "learning_rate": 0.00019961946980917456, "loss": 0.9376, "step": 1010 }, { "epoch": 1.2554112554112553, "grad_norm": 0.40234375, "learning_rate": 0.00019960041124679421, "loss": 0.941, "step": 1015 }, { "epoch": 1.261595547309833, "grad_norm": 0.443359375, "learning_rate": 0.00019958088796268793, "loss": 0.9455, "step": 1020 }, { "epoch": 1.2677798392084108, "grad_norm": 3.625, "learning_rate": 0.00019956090004794868, "loss": 0.9533, "step": 1025 }, { "epoch": 1.2739641311069883, "grad_norm": 0.99609375, "learning_rate": 0.0001995404475958373, "loss": 0.9513, "step": 1030 }, { "epoch": 1.2801484230055657, "grad_norm": 0.765625, "learning_rate": 0.00019951953070178208, "loss": 0.9381, "step": 1035 }, { "epoch": 1.2863327149041435, "grad_norm": 0.447265625, "learning_rate": 0.00019949814946337838, "loss": 0.9484, "step": 1040 }, { "epoch": 1.2925170068027212, "grad_norm": 0.609375, "learning_rate": 0.00019947630398038811, "loss": 0.9378, "step": 1045 }, { "epoch": 1.2987012987012987, "grad_norm": 0.384765625, "learning_rate": 0.00019945399435473922, "loss": 0.9462, "step": 1050 }, { "epoch": 1.3048855905998762, "grad_norm": 0.373046875, "learning_rate": 0.00019943122069052534, "loss": 0.9385, "step": 1055 }, { "epoch": 1.311069882498454, "grad_norm": 0.4765625, "learning_rate": 0.00019940798309400526, "loss": 0.9374, "step": 1060 }, { "epoch": 1.3172541743970316, "grad_norm": 0.4140625, "learning_rate": 0.0001993842816736024, "loss": 0.9305, "step": 1065 }, { "epoch": 1.3234384662956091, "grad_norm": 0.419921875, "learning_rate": 0.00019936011653990426, "loss": 0.9379, "step": 1070 }, { "epoch": 1.3296227581941866, "grad_norm": 0.3359375, "learning_rate": 0.00019933548780566202, "loss": 0.9347, "step": 1075 }, { "epoch": 1.3358070500927643, "grad_norm": 0.39453125, "learning_rate": 0.00019931039558578997, "loss": 0.9397, "step": 1080 }, { "epoch": 1.341991341991342, "grad_norm": 0.55859375, "learning_rate": 0.00019928483999736492, "loss": 0.946, "step": 1085 }, { "epoch": 1.3481756338899196, "grad_norm": 0.349609375, "learning_rate": 0.00019925882115962568, "loss": 0.9304, "step": 1090 }, { "epoch": 1.3543599257884973, "grad_norm": 0.58984375, "learning_rate": 0.00019923233919397258, "loss": 0.9405, "step": 1095 }, { "epoch": 1.3605442176870748, "grad_norm": 0.365234375, "learning_rate": 0.0001992053942239668, "loss": 0.9394, "step": 1100 }, { "epoch": 1.3667285095856525, "grad_norm": 0.41796875, "learning_rate": 0.0001991779863753298, "loss": 0.9427, "step": 1105 }, { "epoch": 1.37291280148423, "grad_norm": 0.375, "learning_rate": 0.00019915011577594286, "loss": 0.935, "step": 1110 }, { "epoch": 1.3790970933828077, "grad_norm": 0.380859375, "learning_rate": 0.00019912178255584632, "loss": 0.9321, "step": 1115 }, { "epoch": 1.3852813852813852, "grad_norm": 0.349609375, "learning_rate": 0.00019909298684723904, "loss": 0.9222, "step": 1120 }, { "epoch": 1.391465677179963, "grad_norm": 0.412109375, "learning_rate": 0.00019906372878447784, "loss": 0.9359, "step": 1125 }, { "epoch": 1.3976499690785404, "grad_norm": 0.466796875, "learning_rate": 0.00019903400850407676, "loss": 0.94, "step": 1130 }, { "epoch": 1.4038342609771182, "grad_norm": 0.369140625, "learning_rate": 0.00019900382614470652, "loss": 0.9289, "step": 1135 }, { "epoch": 1.4100185528756957, "grad_norm": 0.361328125, "learning_rate": 0.00019897318184719385, "loss": 0.9355, "step": 1140 }, { "epoch": 1.4162028447742734, "grad_norm": 0.34375, "learning_rate": 0.00019894207575452076, "loss": 0.9321, "step": 1145 }, { "epoch": 1.4223871366728509, "grad_norm": 0.369140625, "learning_rate": 0.000198910508011824, "loss": 0.9298, "step": 1150 }, { "epoch": 1.4285714285714286, "grad_norm": 0.427734375, "learning_rate": 0.0001988784787663943, "loss": 0.9316, "step": 1155 }, { "epoch": 1.434755720470006, "grad_norm": 0.41015625, "learning_rate": 0.00019884598816767563, "loss": 0.9288, "step": 1160 }, { "epoch": 1.4409400123685838, "grad_norm": 0.380859375, "learning_rate": 0.00019881303636726466, "loss": 0.927, "step": 1165 }, { "epoch": 1.4471243042671613, "grad_norm": 0.38671875, "learning_rate": 0.00019877962351890993, "loss": 0.9287, "step": 1170 }, { "epoch": 1.453308596165739, "grad_norm": 0.369140625, "learning_rate": 0.0001987457497785112, "loss": 0.927, "step": 1175 }, { "epoch": 1.4594928880643168, "grad_norm": 0.369140625, "learning_rate": 0.00019871141530411853, "loss": 0.9161, "step": 1180 }, { "epoch": 1.4656771799628943, "grad_norm": 0.357421875, "learning_rate": 0.00019867662025593194, "loss": 0.9287, "step": 1185 }, { "epoch": 1.4718614718614718, "grad_norm": 0.369140625, "learning_rate": 0.0001986413647963003, "loss": 0.93, "step": 1190 }, { "epoch": 1.4780457637600495, "grad_norm": 0.33203125, "learning_rate": 0.00019860564908972064, "loss": 0.9287, "step": 1195 }, { "epoch": 1.4842300556586272, "grad_norm": 0.3828125, "learning_rate": 0.00019856947330283752, "loss": 0.9222, "step": 1200 }, { "epoch": 1.4904143475572047, "grad_norm": 0.359375, "learning_rate": 0.0001985328376044422, "loss": 0.9219, "step": 1205 }, { "epoch": 1.4965986394557822, "grad_norm": 0.349609375, "learning_rate": 0.00019849574216547171, "loss": 0.9223, "step": 1210 }, { "epoch": 1.50278293135436, "grad_norm": 0.333984375, "learning_rate": 0.00019845818715900822, "loss": 0.9213, "step": 1215 }, { "epoch": 1.5089672232529376, "grad_norm": 0.431640625, "learning_rate": 0.00019842017276027832, "loss": 0.921, "step": 1220 }, { "epoch": 1.5151515151515151, "grad_norm": 0.396484375, "learning_rate": 0.00019838169914665178, "loss": 0.9168, "step": 1225 }, { "epoch": 1.5213358070500926, "grad_norm": 0.416015625, "learning_rate": 0.00019834276649764124, "loss": 0.9215, "step": 1230 }, { "epoch": 1.5275200989486704, "grad_norm": 0.380859375, "learning_rate": 0.000198303374994901, "loss": 0.9073, "step": 1235 }, { "epoch": 1.533704390847248, "grad_norm": 0.333984375, "learning_rate": 0.00019826352482222638, "loss": 0.9226, "step": 1240 }, { "epoch": 1.5398886827458256, "grad_norm": 0.3515625, "learning_rate": 0.00019822321616555277, "loss": 0.9128, "step": 1245 }, { "epoch": 1.546072974644403, "grad_norm": 0.361328125, "learning_rate": 0.0001981824492129548, "loss": 0.918, "step": 1250 }, { "epoch": 1.5522572665429808, "grad_norm": 0.38671875, "learning_rate": 0.00019814122415464535, "loss": 0.924, "step": 1255 }, { "epoch": 1.5584415584415585, "grad_norm": 0.365234375, "learning_rate": 0.0001980995411829749, "loss": 0.9257, "step": 1260 }, { "epoch": 1.564625850340136, "grad_norm": 0.3515625, "learning_rate": 0.00019805740049243042, "loss": 0.9307, "step": 1265 }, { "epoch": 1.5708101422387135, "grad_norm": 0.333984375, "learning_rate": 0.0001980148022796345, "loss": 0.9338, "step": 1270 }, { "epoch": 1.5769944341372912, "grad_norm": 0.3515625, "learning_rate": 0.0001979717467433446, "loss": 0.9194, "step": 1275 }, { "epoch": 1.583178726035869, "grad_norm": 0.34375, "learning_rate": 0.00019792823408445174, "loss": 0.9285, "step": 1280 }, { "epoch": 1.5893630179344465, "grad_norm": 0.359375, "learning_rate": 0.00019788426450598006, "loss": 0.9142, "step": 1285 }, { "epoch": 1.595547309833024, "grad_norm": 0.37109375, "learning_rate": 0.0001978398382130855, "loss": 0.9154, "step": 1290 }, { "epoch": 1.601731601731602, "grad_norm": 0.46484375, "learning_rate": 0.00019779495541305498, "loss": 0.9218, "step": 1295 }, { "epoch": 1.6079158936301794, "grad_norm": 0.365234375, "learning_rate": 0.00019774961631530545, "loss": 0.9219, "step": 1300 }, { "epoch": 1.614100185528757, "grad_norm": 0.365234375, "learning_rate": 0.00019770382113138283, "loss": 0.9121, "step": 1305 }, { "epoch": 1.6202844774273346, "grad_norm": 0.359375, "learning_rate": 0.00019765757007496115, "loss": 0.9229, "step": 1310 }, { "epoch": 1.6264687693259123, "grad_norm": 0.328125, "learning_rate": 0.0001976108633618414, "loss": 0.9164, "step": 1315 }, { "epoch": 1.6326530612244898, "grad_norm": 0.4765625, "learning_rate": 0.00019756370120995066, "loss": 0.9068, "step": 1320 }, { "epoch": 1.6388373531230673, "grad_norm": 0.36328125, "learning_rate": 0.00019751608383934097, "loss": 0.9179, "step": 1325 }, { "epoch": 1.645021645021645, "grad_norm": 0.52734375, "learning_rate": 0.00019746801147218842, "loss": 0.9315, "step": 1330 }, { "epoch": 1.6512059369202228, "grad_norm": 0.365234375, "learning_rate": 0.00019741948433279197, "loss": 0.9172, "step": 1335 }, { "epoch": 1.6573902288188003, "grad_norm": 0.376953125, "learning_rate": 0.0001973705026475726, "loss": 0.9281, "step": 1340 }, { "epoch": 1.6635745207173778, "grad_norm": 0.353515625, "learning_rate": 0.00019732106664507203, "loss": 0.9371, "step": 1345 }, { "epoch": 1.6697588126159555, "grad_norm": 0.43359375, "learning_rate": 0.0001972711765559518, "loss": 0.9252, "step": 1350 }, { "epoch": 1.6759431045145332, "grad_norm": 0.34375, "learning_rate": 0.00019722083261299216, "loss": 0.9219, "step": 1355 }, { "epoch": 1.6821273964131107, "grad_norm": 0.33203125, "learning_rate": 0.00019717003505109095, "loss": 0.9137, "step": 1360 }, { "epoch": 1.6883116883116882, "grad_norm": 0.330078125, "learning_rate": 0.00019711878410726263, "loss": 0.9163, "step": 1365 }, { "epoch": 1.694495980210266, "grad_norm": 0.361328125, "learning_rate": 0.00019706708002063694, "loss": 0.9065, "step": 1370 }, { "epoch": 1.7006802721088436, "grad_norm": 0.3828125, "learning_rate": 0.00019701492303245802, "loss": 0.9216, "step": 1375 }, { "epoch": 1.7068645640074211, "grad_norm": 0.431640625, "learning_rate": 0.00019696231338608316, "loss": 0.9122, "step": 1380 }, { "epoch": 1.7130488559059986, "grad_norm": 0.421875, "learning_rate": 0.00019690925132698165, "loss": 0.9267, "step": 1385 }, { "epoch": 1.7192331478045764, "grad_norm": 0.49609375, "learning_rate": 0.00019685573710273376, "loss": 0.9166, "step": 1390 }, { "epoch": 1.725417439703154, "grad_norm": 0.447265625, "learning_rate": 0.0001968017709630294, "loss": 0.9188, "step": 1395 }, { "epoch": 1.7316017316017316, "grad_norm": 0.38671875, "learning_rate": 0.0001967473531596671, "loss": 0.923, "step": 1400 }, { "epoch": 1.737786023500309, "grad_norm": 0.345703125, "learning_rate": 0.00019669248394655283, "loss": 0.9151, "step": 1405 }, { "epoch": 1.7439703153988868, "grad_norm": 0.345703125, "learning_rate": 0.00019663716357969874, "loss": 0.9187, "step": 1410 }, { "epoch": 1.7501546072974645, "grad_norm": 0.392578125, "learning_rate": 0.00019658139231722198, "loss": 0.9111, "step": 1415 }, { "epoch": 1.756338899196042, "grad_norm": 0.353515625, "learning_rate": 0.00019652517041934356, "loss": 0.9076, "step": 1420 }, { "epoch": 1.7625231910946195, "grad_norm": 0.322265625, "learning_rate": 0.00019646849814838706, "loss": 0.9199, "step": 1425 }, { "epoch": 1.7687074829931972, "grad_norm": 0.46875, "learning_rate": 0.00019641137576877744, "loss": 0.9179, "step": 1430 }, { "epoch": 1.774891774891775, "grad_norm": 0.337890625, "learning_rate": 0.0001963538035470398, "loss": 0.9118, "step": 1435 }, { "epoch": 1.7810760667903525, "grad_norm": 0.359375, "learning_rate": 0.0001962957817517982, "loss": 0.917, "step": 1440 }, { "epoch": 1.78726035868893, "grad_norm": 0.306640625, "learning_rate": 0.00019623731065377426, "loss": 0.9092, "step": 1445 }, { "epoch": 1.7934446505875077, "grad_norm": 0.349609375, "learning_rate": 0.00019617839052578603, "loss": 0.924, "step": 1450 }, { "epoch": 1.7996289424860854, "grad_norm": 0.3515625, "learning_rate": 0.0001961190216427467, "loss": 0.9108, "step": 1455 }, { "epoch": 1.805813234384663, "grad_norm": 0.404296875, "learning_rate": 0.00019605920428166323, "loss": 0.9035, "step": 1460 }, { "epoch": 1.8119975262832406, "grad_norm": 0.341796875, "learning_rate": 0.00019599893872163514, "loss": 0.906, "step": 1465 }, { "epoch": 1.8181818181818183, "grad_norm": 0.33203125, "learning_rate": 0.00019593822524385316, "loss": 0.9139, "step": 1470 }, { "epoch": 1.8243661100803958, "grad_norm": 0.337890625, "learning_rate": 0.00019587706413159804, "loss": 0.9043, "step": 1475 }, { "epoch": 1.8305504019789733, "grad_norm": 0.4140625, "learning_rate": 0.000195815455670239, "loss": 0.9131, "step": 1480 }, { "epoch": 1.836734693877551, "grad_norm": 0.365234375, "learning_rate": 0.00019575340014723263, "loss": 0.914, "step": 1485 }, { "epoch": 1.8429189857761288, "grad_norm": 0.40234375, "learning_rate": 0.0001956908978521214, "loss": 0.9206, "step": 1490 }, { "epoch": 1.8491032776747063, "grad_norm": 0.380859375, "learning_rate": 0.00019562794907653235, "loss": 0.9122, "step": 1495 }, { "epoch": 1.8552875695732838, "grad_norm": 0.328125, "learning_rate": 0.00019556455411417573, "loss": 0.9059, "step": 1500 }, { "epoch": 1.8614718614718615, "grad_norm": 0.3515625, "learning_rate": 0.00019550071326084368, "loss": 0.8951, "step": 1505 }, { "epoch": 1.8676561533704392, "grad_norm": 0.3828125, "learning_rate": 0.0001954364268144088, "loss": 0.9186, "step": 1510 }, { "epoch": 1.8738404452690167, "grad_norm": 0.310546875, "learning_rate": 0.0001953716950748227, "loss": 0.9037, "step": 1515 }, { "epoch": 1.8800247371675942, "grad_norm": 0.326171875, "learning_rate": 0.00019530651834411474, "loss": 0.9029, "step": 1520 }, { "epoch": 1.886209029066172, "grad_norm": 0.310546875, "learning_rate": 0.00019524089692639053, "loss": 0.9071, "step": 1525 }, { "epoch": 1.8923933209647497, "grad_norm": 0.345703125, "learning_rate": 0.00019517483112783054, "loss": 0.9062, "step": 1530 }, { "epoch": 1.8985776128633272, "grad_norm": 0.33984375, "learning_rate": 0.00019510832125668853, "loss": 0.9028, "step": 1535 }, { "epoch": 1.9047619047619047, "grad_norm": 0.33984375, "learning_rate": 0.00019504136762329047, "loss": 0.9107, "step": 1540 }, { "epoch": 1.9109461966604824, "grad_norm": 0.421875, "learning_rate": 0.00019497397054003265, "loss": 0.9097, "step": 1545 }, { "epoch": 1.91713048855906, "grad_norm": 0.400390625, "learning_rate": 0.00019490613032138062, "loss": 0.9222, "step": 1550 }, { "epoch": 1.9233147804576376, "grad_norm": 0.3359375, "learning_rate": 0.00019483784728386737, "loss": 0.8979, "step": 1555 }, { "epoch": 1.929499072356215, "grad_norm": 0.369140625, "learning_rate": 0.0001947691217460921, "loss": 0.9016, "step": 1560 }, { "epoch": 1.9356833642547928, "grad_norm": 0.349609375, "learning_rate": 0.0001946999540287187, "loss": 0.9059, "step": 1565 }, { "epoch": 1.9418676561533705, "grad_norm": 0.365234375, "learning_rate": 0.0001946303444544741, "loss": 0.9139, "step": 1570 }, { "epoch": 1.948051948051948, "grad_norm": 0.34375, "learning_rate": 0.000194560293348147, "loss": 0.9062, "step": 1575 }, { "epoch": 1.9542362399505255, "grad_norm": 0.44921875, "learning_rate": 0.00019448980103658613, "loss": 0.9007, "step": 1580 }, { "epoch": 1.9604205318491033, "grad_norm": 0.314453125, "learning_rate": 0.00019441886784869885, "loss": 0.9022, "step": 1585 }, { "epoch": 1.966604823747681, "grad_norm": 0.427734375, "learning_rate": 0.00019434749411544958, "loss": 0.9035, "step": 1590 }, { "epoch": 1.9727891156462585, "grad_norm": 0.4765625, "learning_rate": 0.00019427568016985828, "loss": 0.8953, "step": 1595 }, { "epoch": 1.978973407544836, "grad_norm": 0.515625, "learning_rate": 0.0001942034263469989, "loss": 0.901, "step": 1600 }, { "epoch": 1.9851576994434137, "grad_norm": 0.41015625, "learning_rate": 0.00019413073298399778, "loss": 0.9065, "step": 1605 }, { "epoch": 1.9913419913419914, "grad_norm": 0.3515625, "learning_rate": 0.00019405760042003203, "loss": 0.9068, "step": 1610 }, { "epoch": 1.997526283240569, "grad_norm": 0.39453125, "learning_rate": 0.00019398402899632812, "loss": 0.8916, "step": 1615 }, { "epoch": 2.0, "eval_loss": 2.4784626960754395, "eval_runtime": 0.5361, "eval_samples_per_second": 18.652, "eval_steps_per_second": 1.865, "step": 1617 }, { "epoch": 2.0037105751391464, "grad_norm": 0.361328125, "learning_rate": 0.0001939100190561601, "loss": 0.9017, "step": 1620 }, { "epoch": 2.0098948670377244, "grad_norm": 0.3359375, "learning_rate": 0.00019383557094484807, "loss": 0.8759, "step": 1625 }, { "epoch": 2.016079158936302, "grad_norm": 0.32421875, "learning_rate": 0.00019376068500975667, "loss": 0.8808, "step": 1630 }, { "epoch": 2.0222634508348794, "grad_norm": 0.3203125, "learning_rate": 0.00019368536160029327, "loss": 0.8838, "step": 1635 }, { "epoch": 2.028447742733457, "grad_norm": 0.357421875, "learning_rate": 0.00019360960106790643, "loss": 0.8802, "step": 1640 }, { "epoch": 2.034632034632035, "grad_norm": 0.349609375, "learning_rate": 0.0001935334037660844, "loss": 0.8811, "step": 1645 }, { "epoch": 2.0408163265306123, "grad_norm": 0.369140625, "learning_rate": 0.00019345677005035315, "loss": 0.8814, "step": 1650 }, { "epoch": 2.04700061842919, "grad_norm": 0.427734375, "learning_rate": 0.00019337970027827504, "loss": 0.8767, "step": 1655 }, { "epoch": 2.0531849103277673, "grad_norm": 0.44140625, "learning_rate": 0.00019330219480944694, "loss": 0.875, "step": 1660 }, { "epoch": 2.0593692022263452, "grad_norm": 0.400390625, "learning_rate": 0.0001932242540054986, "loss": 0.8844, "step": 1665 }, { "epoch": 2.0655534941249227, "grad_norm": 0.333984375, "learning_rate": 0.00019314587823009103, "loss": 0.8775, "step": 1670 }, { "epoch": 2.0717377860235002, "grad_norm": 0.345703125, "learning_rate": 0.00019306706784891477, "loss": 0.8856, "step": 1675 }, { "epoch": 2.0779220779220777, "grad_norm": 0.361328125, "learning_rate": 0.00019298782322968815, "loss": 0.8767, "step": 1680 }, { "epoch": 2.0841063698206557, "grad_norm": 0.388671875, "learning_rate": 0.00019290814474215556, "loss": 0.8707, "step": 1685 }, { "epoch": 2.090290661719233, "grad_norm": 0.380859375, "learning_rate": 0.0001928280327580858, "loss": 0.8751, "step": 1690 }, { "epoch": 2.0964749536178107, "grad_norm": 0.357421875, "learning_rate": 0.00019274748765127028, "loss": 0.8709, "step": 1695 }, { "epoch": 2.102659245516388, "grad_norm": 0.337890625, "learning_rate": 0.00019266650979752136, "loss": 0.8805, "step": 1700 }, { "epoch": 2.108843537414966, "grad_norm": 0.4453125, "learning_rate": 0.00019258509957467042, "loss": 0.8732, "step": 1705 }, { "epoch": 2.1150278293135436, "grad_norm": 0.439453125, "learning_rate": 0.00019250325736256633, "loss": 0.8902, "step": 1710 }, { "epoch": 2.121212121212121, "grad_norm": 0.32421875, "learning_rate": 0.00019242098354307354, "loss": 0.8804, "step": 1715 }, { "epoch": 2.1273964131106986, "grad_norm": 0.34765625, "learning_rate": 0.00019233827850007027, "loss": 0.8706, "step": 1720 }, { "epoch": 2.1335807050092765, "grad_norm": 0.357421875, "learning_rate": 0.00019225514261944678, "loss": 0.8682, "step": 1725 }, { "epoch": 2.139764996907854, "grad_norm": 0.345703125, "learning_rate": 0.0001921715762891036, "loss": 0.8767, "step": 1730 }, { "epoch": 2.1459492888064315, "grad_norm": 0.38671875, "learning_rate": 0.00019208757989894965, "loss": 0.8747, "step": 1735 }, { "epoch": 2.1521335807050095, "grad_norm": 0.384765625, "learning_rate": 0.00019200315384090044, "loss": 0.8601, "step": 1740 }, { "epoch": 2.158317872603587, "grad_norm": 0.43359375, "learning_rate": 0.0001919182985088763, "loss": 0.8821, "step": 1745 }, { "epoch": 2.1645021645021645, "grad_norm": 0.390625, "learning_rate": 0.00019183301429880043, "loss": 0.8812, "step": 1750 }, { "epoch": 2.170686456400742, "grad_norm": 0.330078125, "learning_rate": 0.00019174730160859715, "loss": 0.8707, "step": 1755 }, { "epoch": 2.17687074829932, "grad_norm": 0.306640625, "learning_rate": 0.00019166116083819002, "loss": 0.8756, "step": 1760 }, { "epoch": 2.1830550401978974, "grad_norm": 0.31640625, "learning_rate": 0.00019157459238949991, "loss": 0.8796, "step": 1765 }, { "epoch": 2.189239332096475, "grad_norm": 0.345703125, "learning_rate": 0.00019148759666644325, "loss": 0.8795, "step": 1770 }, { "epoch": 2.1954236239950524, "grad_norm": 0.35546875, "learning_rate": 0.00019140017407493, "loss": 0.8804, "step": 1775 }, { "epoch": 2.2016079158936304, "grad_norm": 0.345703125, "learning_rate": 0.00019131232502286188, "loss": 0.8748, "step": 1780 }, { "epoch": 2.207792207792208, "grad_norm": 0.431640625, "learning_rate": 0.00019122404992013043, "loss": 0.8694, "step": 1785 }, { "epoch": 2.2139764996907854, "grad_norm": 0.30859375, "learning_rate": 0.00019113534917861502, "loss": 0.8806, "step": 1790 }, { "epoch": 2.220160791589363, "grad_norm": 0.361328125, "learning_rate": 0.00019104622321218105, "loss": 0.8618, "step": 1795 }, { "epoch": 2.226345083487941, "grad_norm": 0.390625, "learning_rate": 0.0001909566724366779, "loss": 0.88, "step": 1800 }, { "epoch": 2.2325293753865183, "grad_norm": 0.3828125, "learning_rate": 0.0001908666972699371, "loss": 0.887, "step": 1805 }, { "epoch": 2.238713667285096, "grad_norm": 0.36328125, "learning_rate": 0.00019077629813177036, "loss": 0.8812, "step": 1810 }, { "epoch": 2.2448979591836733, "grad_norm": 0.337890625, "learning_rate": 0.00019068547544396754, "loss": 0.8777, "step": 1815 }, { "epoch": 2.2510822510822512, "grad_norm": 0.32421875, "learning_rate": 0.00019059422963029464, "loss": 0.8711, "step": 1820 }, { "epoch": 2.2572665429808287, "grad_norm": 0.353515625, "learning_rate": 0.00019050256111649206, "loss": 0.8878, "step": 1825 }, { "epoch": 2.2634508348794062, "grad_norm": 0.33203125, "learning_rate": 0.00019041047033027236, "loss": 0.8706, "step": 1830 }, { "epoch": 2.2696351267779837, "grad_norm": 0.333984375, "learning_rate": 0.0001903179577013184, "loss": 0.8893, "step": 1835 }, { "epoch": 2.2758194186765617, "grad_norm": 0.337890625, "learning_rate": 0.00019022502366128135, "loss": 0.8747, "step": 1840 }, { "epoch": 2.282003710575139, "grad_norm": 0.33203125, "learning_rate": 0.00019013166864377851, "loss": 0.8791, "step": 1845 }, { "epoch": 2.2881880024737167, "grad_norm": 0.3359375, "learning_rate": 0.00019003789308439148, "loss": 0.8871, "step": 1850 }, { "epoch": 2.2943722943722946, "grad_norm": 0.33203125, "learning_rate": 0.00018994369742066403, "loss": 0.8811, "step": 1855 }, { "epoch": 2.300556586270872, "grad_norm": 0.375, "learning_rate": 0.0001898490820921001, "loss": 0.8721, "step": 1860 }, { "epoch": 2.3067408781694496, "grad_norm": 0.3359375, "learning_rate": 0.00018975404754016165, "loss": 0.8729, "step": 1865 }, { "epoch": 2.312925170068027, "grad_norm": 0.392578125, "learning_rate": 0.00018965859420826684, "loss": 0.8826, "step": 1870 }, { "epoch": 2.3191094619666046, "grad_norm": 0.369140625, "learning_rate": 0.00018956272254178763, "loss": 0.8646, "step": 1875 }, { "epoch": 2.3252937538651826, "grad_norm": 0.375, "learning_rate": 0.00018946643298804793, "loss": 0.8784, "step": 1880 }, { "epoch": 2.33147804576376, "grad_norm": 0.396484375, "learning_rate": 0.00018936972599632151, "loss": 0.8744, "step": 1885 }, { "epoch": 2.3376623376623376, "grad_norm": 0.39453125, "learning_rate": 0.00018927260201782978, "loss": 0.878, "step": 1890 }, { "epoch": 2.3438466295609155, "grad_norm": 0.349609375, "learning_rate": 0.00018917506150573977, "loss": 0.8791, "step": 1895 }, { "epoch": 2.350030921459493, "grad_norm": 0.380859375, "learning_rate": 0.00018907710491516199, "loss": 0.8831, "step": 1900 }, { "epoch": 2.3562152133580705, "grad_norm": 0.34375, "learning_rate": 0.0001889787327031483, "loss": 0.8786, "step": 1905 }, { "epoch": 2.362399505256648, "grad_norm": 0.486328125, "learning_rate": 0.0001888799453286899, "loss": 0.8768, "step": 1910 }, { "epoch": 2.3685837971552255, "grad_norm": 0.4453125, "learning_rate": 0.00018878074325271498, "loss": 0.8767, "step": 1915 }, { "epoch": 2.3747680890538034, "grad_norm": 0.482421875, "learning_rate": 0.00018868112693808665, "loss": 0.8725, "step": 1920 }, { "epoch": 2.380952380952381, "grad_norm": 0.333984375, "learning_rate": 0.00018858109684960082, "loss": 0.8594, "step": 1925 }, { "epoch": 2.3871366728509584, "grad_norm": 0.34765625, "learning_rate": 0.0001884806534539841, "loss": 0.8748, "step": 1930 }, { "epoch": 2.3933209647495364, "grad_norm": 0.32421875, "learning_rate": 0.0001883797972198914, "loss": 0.8782, "step": 1935 }, { "epoch": 2.399505256648114, "grad_norm": 0.330078125, "learning_rate": 0.00018827852861790398, "loss": 0.8716, "step": 1940 }, { "epoch": 2.4056895485466914, "grad_norm": 0.3359375, "learning_rate": 0.00018817684812052712, "loss": 0.8684, "step": 1945 }, { "epoch": 2.411873840445269, "grad_norm": 0.333984375, "learning_rate": 0.00018807475620218788, "loss": 0.8726, "step": 1950 }, { "epoch": 2.418058132343847, "grad_norm": 0.427734375, "learning_rate": 0.0001879722533392331, "loss": 0.8738, "step": 1955 }, { "epoch": 2.4242424242424243, "grad_norm": 0.412109375, "learning_rate": 0.00018786934000992688, "loss": 0.872, "step": 1960 }, { "epoch": 2.430426716141002, "grad_norm": 0.384765625, "learning_rate": 0.0001877660166944486, "loss": 0.8765, "step": 1965 }, { "epoch": 2.4366110080395793, "grad_norm": 0.33203125, "learning_rate": 0.00018766228387489048, "loss": 0.8849, "step": 1970 }, { "epoch": 2.4427952999381572, "grad_norm": 0.404296875, "learning_rate": 0.0001875581420352556, "loss": 0.8766, "step": 1975 }, { "epoch": 2.4489795918367347, "grad_norm": 0.341796875, "learning_rate": 0.00018745359166145523, "loss": 0.8724, "step": 1980 }, { "epoch": 2.4551638837353122, "grad_norm": 0.376953125, "learning_rate": 0.00018734863324130702, "loss": 0.8675, "step": 1985 }, { "epoch": 2.4613481756338897, "grad_norm": 0.408203125, "learning_rate": 0.00018724326726453244, "loss": 0.8771, "step": 1990 }, { "epoch": 2.4675324675324677, "grad_norm": 0.4140625, "learning_rate": 0.00018713749422275447, "loss": 0.8745, "step": 1995 }, { "epoch": 2.473716759431045, "grad_norm": 0.3984375, "learning_rate": 0.00018703131460949554, "loss": 0.8707, "step": 2000 }, { "epoch": 2.4799010513296227, "grad_norm": 0.376953125, "learning_rate": 0.000186924728920175, "loss": 0.8764, "step": 2005 }, { "epoch": 2.4860853432282, "grad_norm": 0.349609375, "learning_rate": 0.0001868177376521069, "loss": 0.8817, "step": 2010 }, { "epoch": 2.492269635126778, "grad_norm": 0.404296875, "learning_rate": 0.0001867103413044977, "loss": 0.8771, "step": 2015 }, { "epoch": 2.4984539270253556, "grad_norm": 0.314453125, "learning_rate": 0.00018660254037844388, "loss": 0.8693, "step": 2020 }, { "epoch": 2.504638218923933, "grad_norm": 0.396484375, "learning_rate": 0.00018649433537692964, "loss": 0.8803, "step": 2025 }, { "epoch": 2.5108225108225106, "grad_norm": 0.392578125, "learning_rate": 0.00018638572680482448, "loss": 0.8728, "step": 2030 }, { "epoch": 2.5170068027210886, "grad_norm": 0.36328125, "learning_rate": 0.00018627671516888104, "loss": 0.8724, "step": 2035 }, { "epoch": 2.523191094619666, "grad_norm": 0.380859375, "learning_rate": 0.0001861673009777325, "loss": 0.8683, "step": 2040 }, { "epoch": 2.5293753865182436, "grad_norm": 0.32421875, "learning_rate": 0.0001860574847418903, "loss": 0.8693, "step": 2045 }, { "epoch": 2.5355596784168215, "grad_norm": 0.326171875, "learning_rate": 0.00018594726697374175, "loss": 0.8809, "step": 2050 }, { "epoch": 2.541743970315399, "grad_norm": 0.328125, "learning_rate": 0.00018583664818754776, "loss": 0.8744, "step": 2055 }, { "epoch": 2.5479282622139765, "grad_norm": 0.326171875, "learning_rate": 0.0001857256288994402, "loss": 0.8833, "step": 2060 }, { "epoch": 2.554112554112554, "grad_norm": 0.30859375, "learning_rate": 0.00018561420962741977, "loss": 0.8742, "step": 2065 }, { "epoch": 2.5602968460111315, "grad_norm": 0.333984375, "learning_rate": 0.00018550239089135334, "loss": 0.8714, "step": 2070 }, { "epoch": 2.5664811379097094, "grad_norm": 0.46484375, "learning_rate": 0.00018539017321297162, "loss": 0.8716, "step": 2075 }, { "epoch": 2.572665429808287, "grad_norm": 0.34765625, "learning_rate": 0.00018527755711586678, "loss": 0.8731, "step": 2080 }, { "epoch": 2.5788497217068644, "grad_norm": 0.408203125, "learning_rate": 0.00018516454312548995, "loss": 0.8722, "step": 2085 }, { "epoch": 2.5850340136054424, "grad_norm": 0.3671875, "learning_rate": 0.0001850511317691487, "loss": 0.8711, "step": 2090 }, { "epoch": 2.59121830550402, "grad_norm": 0.322265625, "learning_rate": 0.00018493732357600478, "loss": 0.8695, "step": 2095 }, { "epoch": 2.5974025974025974, "grad_norm": 0.3671875, "learning_rate": 0.0001848231190770714, "loss": 0.8641, "step": 2100 }, { "epoch": 2.603586889301175, "grad_norm": 0.3203125, "learning_rate": 0.00018470851880521098, "loss": 0.8726, "step": 2105 }, { "epoch": 2.6097711811997524, "grad_norm": 0.31640625, "learning_rate": 0.0001845935232951325, "loss": 0.8671, "step": 2110 }, { "epoch": 2.6159554730983303, "grad_norm": 0.392578125, "learning_rate": 0.00018447813308338908, "loss": 0.8691, "step": 2115 }, { "epoch": 2.622139764996908, "grad_norm": 0.34375, "learning_rate": 0.00018436234870837547, "loss": 0.8645, "step": 2120 }, { "epoch": 2.6283240568954853, "grad_norm": 0.353515625, "learning_rate": 0.00018424617071032557, "loss": 0.8724, "step": 2125 }, { "epoch": 2.6345083487940633, "grad_norm": 0.328125, "learning_rate": 0.00018412959963130975, "loss": 0.8703, "step": 2130 }, { "epoch": 2.6406926406926408, "grad_norm": 0.353515625, "learning_rate": 0.00018401263601523259, "loss": 0.868, "step": 2135 }, { "epoch": 2.6468769325912183, "grad_norm": 0.33984375, "learning_rate": 0.00018389528040783012, "loss": 0.8662, "step": 2140 }, { "epoch": 2.6530612244897958, "grad_norm": 0.314453125, "learning_rate": 0.00018377753335666733, "loss": 0.8641, "step": 2145 }, { "epoch": 2.6592455163883733, "grad_norm": 0.3359375, "learning_rate": 0.00018365939541113566, "loss": 0.8635, "step": 2150 }, { "epoch": 2.665429808286951, "grad_norm": 0.34375, "learning_rate": 0.0001835408671224504, "loss": 0.8702, "step": 2155 }, { "epoch": 2.6716141001855287, "grad_norm": 0.37109375, "learning_rate": 0.00018342194904364813, "loss": 0.8679, "step": 2160 }, { "epoch": 2.6777983920841066, "grad_norm": 0.31640625, "learning_rate": 0.00018330264172958415, "loss": 0.8634, "step": 2165 }, { "epoch": 2.683982683982684, "grad_norm": 0.337890625, "learning_rate": 0.00018318294573692985, "loss": 0.8745, "step": 2170 }, { "epoch": 2.6901669758812616, "grad_norm": 0.3359375, "learning_rate": 0.00018306286162417015, "loss": 0.8697, "step": 2175 }, { "epoch": 2.696351267779839, "grad_norm": 0.359375, "learning_rate": 0.00018294238995160094, "loss": 0.8625, "step": 2180 }, { "epoch": 2.7025355596784166, "grad_norm": 0.47265625, "learning_rate": 0.00018282153128132628, "loss": 0.8762, "step": 2185 }, { "epoch": 2.7087198515769946, "grad_norm": 0.361328125, "learning_rate": 0.00018270028617725607, "loss": 0.883, "step": 2190 }, { "epoch": 2.714904143475572, "grad_norm": 0.3828125, "learning_rate": 0.00018257865520510312, "loss": 0.8819, "step": 2195 }, { "epoch": 2.7210884353741496, "grad_norm": 0.345703125, "learning_rate": 0.00018245663893238075, "loss": 0.8659, "step": 2200 }, { "epoch": 2.7272727272727275, "grad_norm": 0.384765625, "learning_rate": 0.00018233423792839992, "loss": 0.868, "step": 2205 }, { "epoch": 2.733457019171305, "grad_norm": 0.314453125, "learning_rate": 0.00018221145276426683, "loss": 0.8671, "step": 2210 }, { "epoch": 2.7396413110698825, "grad_norm": 0.30859375, "learning_rate": 0.00018208828401288004, "loss": 0.8668, "step": 2215 }, { "epoch": 2.74582560296846, "grad_norm": 0.373046875, "learning_rate": 0.00018196473224892784, "loss": 0.8662, "step": 2220 }, { "epoch": 2.7520098948670375, "grad_norm": 0.33984375, "learning_rate": 0.00018184079804888572, "loss": 0.8663, "step": 2225 }, { "epoch": 2.7581941867656155, "grad_norm": 0.328125, "learning_rate": 0.00018171648199101346, "loss": 0.8639, "step": 2230 }, { "epoch": 2.764378478664193, "grad_norm": 0.330078125, "learning_rate": 0.00018159178465535256, "loss": 0.8757, "step": 2235 }, { "epoch": 2.7705627705627704, "grad_norm": 0.392578125, "learning_rate": 0.00018146670662372354, "loss": 0.8566, "step": 2240 }, { "epoch": 2.7767470624613484, "grad_norm": 0.3046875, "learning_rate": 0.00018134124847972316, "loss": 0.8673, "step": 2245 }, { "epoch": 2.782931354359926, "grad_norm": 0.32421875, "learning_rate": 0.00018121541080872176, "loss": 0.8619, "step": 2250 }, { "epoch": 2.7891156462585034, "grad_norm": 0.298828125, "learning_rate": 0.00018108919419786046, "loss": 0.8684, "step": 2255 }, { "epoch": 2.795299938157081, "grad_norm": 0.345703125, "learning_rate": 0.0001809625992360485, "loss": 0.8708, "step": 2260 }, { "epoch": 2.8014842300556584, "grad_norm": 0.345703125, "learning_rate": 0.0001808356265139605, "loss": 0.8784, "step": 2265 }, { "epoch": 2.8076685219542363, "grad_norm": 0.31640625, "learning_rate": 0.00018070827662403349, "loss": 0.8718, "step": 2270 }, { "epoch": 2.813852813852814, "grad_norm": 0.33984375, "learning_rate": 0.0001805805501604645, "loss": 0.8626, "step": 2275 }, { "epoch": 2.8200371057513913, "grad_norm": 0.310546875, "learning_rate": 0.0001804524477192075, "loss": 0.8692, "step": 2280 }, { "epoch": 2.8262213976499693, "grad_norm": 0.310546875, "learning_rate": 0.00018032396989797072, "loss": 0.8676, "step": 2285 }, { "epoch": 2.8324056895485468, "grad_norm": 0.333984375, "learning_rate": 0.0001801951172962139, "loss": 0.8754, "step": 2290 }, { "epoch": 2.8385899814471243, "grad_norm": 0.298828125, "learning_rate": 0.0001800658905151454, "loss": 0.8706, "step": 2295 }, { "epoch": 2.8447742733457018, "grad_norm": 0.3203125, "learning_rate": 0.0001799362901577196, "loss": 0.8658, "step": 2300 }, { "epoch": 2.8509585652442793, "grad_norm": 0.326171875, "learning_rate": 0.0001798063168286337, "loss": 0.8768, "step": 2305 }, { "epoch": 2.857142857142857, "grad_norm": 0.3203125, "learning_rate": 0.0001796759711343253, "loss": 0.8665, "step": 2310 }, { "epoch": 2.8633271490414347, "grad_norm": 0.359375, "learning_rate": 0.00017954525368296933, "loss": 0.8761, "step": 2315 }, { "epoch": 2.869511440940012, "grad_norm": 0.392578125, "learning_rate": 0.00017941416508447536, "loss": 0.8671, "step": 2320 }, { "epoch": 2.87569573283859, "grad_norm": 0.333984375, "learning_rate": 0.0001792827059504846, "loss": 0.8687, "step": 2325 }, { "epoch": 2.8818800247371676, "grad_norm": 0.376953125, "learning_rate": 0.0001791508768943672, "loss": 0.874, "step": 2330 }, { "epoch": 2.888064316635745, "grad_norm": 0.32421875, "learning_rate": 0.00017901867853121925, "loss": 0.8737, "step": 2335 }, { "epoch": 2.8942486085343226, "grad_norm": 0.33203125, "learning_rate": 0.00017888611147786002, "loss": 0.871, "step": 2340 }, { "epoch": 2.9004329004329006, "grad_norm": 0.365234375, "learning_rate": 0.000178753176352829, "loss": 0.8551, "step": 2345 }, { "epoch": 2.906617192331478, "grad_norm": 0.34765625, "learning_rate": 0.00017861987377638312, "loss": 0.8738, "step": 2350 }, { "epoch": 2.9128014842300556, "grad_norm": 0.357421875, "learning_rate": 0.0001784862043704937, "loss": 0.8611, "step": 2355 }, { "epoch": 2.9189857761286335, "grad_norm": 0.318359375, "learning_rate": 0.00017835216875884368, "loss": 0.8659, "step": 2360 }, { "epoch": 2.925170068027211, "grad_norm": 0.349609375, "learning_rate": 0.0001782177675668247, "loss": 0.8627, "step": 2365 }, { "epoch": 2.9313543599257885, "grad_norm": 0.296875, "learning_rate": 0.00017808300142153406, "loss": 0.8658, "step": 2370 }, { "epoch": 2.937538651824366, "grad_norm": 0.294921875, "learning_rate": 0.00017794787095177196, "loss": 0.8727, "step": 2375 }, { "epoch": 2.9437229437229435, "grad_norm": 0.3203125, "learning_rate": 0.00017781237678803847, "loss": 0.868, "step": 2380 }, { "epoch": 2.9499072356215215, "grad_norm": 0.322265625, "learning_rate": 0.00017767651956253054, "loss": 0.8638, "step": 2385 }, { "epoch": 2.956091527520099, "grad_norm": 0.318359375, "learning_rate": 0.00017754029990913926, "loss": 0.8746, "step": 2390 }, { "epoch": 2.9622758194186765, "grad_norm": 0.330078125, "learning_rate": 0.00017740371846344655, "loss": 0.8681, "step": 2395 }, { "epoch": 2.9684601113172544, "grad_norm": 0.384765625, "learning_rate": 0.00017726677586272263, "loss": 0.8708, "step": 2400 }, { "epoch": 2.974644403215832, "grad_norm": 0.296875, "learning_rate": 0.00017712947274592267, "loss": 0.8712, "step": 2405 }, { "epoch": 2.9808286951144094, "grad_norm": 0.341796875, "learning_rate": 0.00017699180975368396, "loss": 0.8614, "step": 2410 }, { "epoch": 2.987012987012987, "grad_norm": 0.3125, "learning_rate": 0.00017685378752832305, "loss": 0.8642, "step": 2415 }, { "epoch": 2.9931972789115644, "grad_norm": 0.31640625, "learning_rate": 0.00017671540671383243, "loss": 0.8755, "step": 2420 }, { "epoch": 2.9993815708101423, "grad_norm": 0.3203125, "learning_rate": 0.00017657666795587788, "loss": 0.8752, "step": 2425 }, { "epoch": 2.9993815708101423, "eval_loss": 2.514435291290283, "eval_runtime": 0.9539, "eval_samples_per_second": 10.484, "eval_steps_per_second": 1.048, "step": 2425 }, { "epoch": 3.00556586270872, "grad_norm": 0.330078125, "learning_rate": 0.00017643757190179523, "loss": 0.8414, "step": 2430 }, { "epoch": 3.0117501546072973, "grad_norm": 0.322265625, "learning_rate": 0.00017629811920058733, "loss": 0.8423, "step": 2435 }, { "epoch": 3.0179344465058753, "grad_norm": 0.310546875, "learning_rate": 0.0001761583105029213, "loss": 0.8393, "step": 2440 }, { "epoch": 3.0241187384044528, "grad_norm": 0.328125, "learning_rate": 0.00017601814646112506, "loss": 0.838, "step": 2445 }, { "epoch": 3.0303030303030303, "grad_norm": 0.318359375, "learning_rate": 0.00017587762772918467, "loss": 0.8458, "step": 2450 }, { "epoch": 3.0364873222016078, "grad_norm": 0.333984375, "learning_rate": 0.00017573675496274102, "loss": 0.8339, "step": 2455 }, { "epoch": 3.0426716141001857, "grad_norm": 0.3125, "learning_rate": 0.00017559552881908695, "loss": 0.8413, "step": 2460 }, { "epoch": 3.048855905998763, "grad_norm": 0.337890625, "learning_rate": 0.00017545394995716418, "loss": 0.8363, "step": 2465 }, { "epoch": 3.0550401978973407, "grad_norm": 0.357421875, "learning_rate": 0.00017531201903755994, "loss": 0.8377, "step": 2470 }, { "epoch": 3.061224489795918, "grad_norm": 0.330078125, "learning_rate": 0.00017516973672250432, "loss": 0.8415, "step": 2475 }, { "epoch": 3.067408781694496, "grad_norm": 0.37890625, "learning_rate": 0.00017502710367586687, "loss": 0.8313, "step": 2480 }, { "epoch": 3.0735930735930737, "grad_norm": 0.365234375, "learning_rate": 0.0001748841205631537, "loss": 0.8271, "step": 2485 }, { "epoch": 3.079777365491651, "grad_norm": 0.34765625, "learning_rate": 0.0001747407880515041, "loss": 0.8222, "step": 2490 }, { "epoch": 3.0859616573902287, "grad_norm": 0.357421875, "learning_rate": 0.0001745971068096878, "loss": 0.8444, "step": 2495 }, { "epoch": 3.0921459492888066, "grad_norm": 0.306640625, "learning_rate": 0.0001744530775081015, "loss": 0.8366, "step": 2500 }, { "epoch": 3.098330241187384, "grad_norm": 0.341796875, "learning_rate": 0.0001743087008187661, "loss": 0.8482, "step": 2505 }, { "epoch": 3.1045145330859616, "grad_norm": 0.33203125, "learning_rate": 0.00017416397741532315, "loss": 0.8387, "step": 2510 }, { "epoch": 3.110698824984539, "grad_norm": 0.35546875, "learning_rate": 0.00017401890797303206, "loss": 0.8376, "step": 2515 }, { "epoch": 3.116883116883117, "grad_norm": 0.3671875, "learning_rate": 0.00017387349316876666, "loss": 0.8405, "step": 2520 }, { "epoch": 3.1230674087816945, "grad_norm": 0.326171875, "learning_rate": 0.0001737277336810124, "loss": 0.8484, "step": 2525 }, { "epoch": 3.129251700680272, "grad_norm": 0.3203125, "learning_rate": 0.00017358163018986282, "loss": 0.8368, "step": 2530 }, { "epoch": 3.1354359925788495, "grad_norm": 0.3515625, "learning_rate": 0.00017343518337701658, "loss": 0.8367, "step": 2535 }, { "epoch": 3.1416202844774275, "grad_norm": 0.330078125, "learning_rate": 0.0001732883939257742, "loss": 0.8324, "step": 2540 }, { "epoch": 3.147804576376005, "grad_norm": 0.318359375, "learning_rate": 0.000173141262521035, "loss": 0.8393, "step": 2545 }, { "epoch": 3.1539888682745825, "grad_norm": 0.353515625, "learning_rate": 0.00017299378984929366, "loss": 0.8502, "step": 2550 }, { "epoch": 3.16017316017316, "grad_norm": 0.33203125, "learning_rate": 0.0001728459765986373, "loss": 0.8363, "step": 2555 }, { "epoch": 3.166357452071738, "grad_norm": 0.3359375, "learning_rate": 0.00017269782345874203, "loss": 0.8427, "step": 2560 }, { "epoch": 3.1725417439703154, "grad_norm": 0.337890625, "learning_rate": 0.00017254933112086996, "loss": 0.8413, "step": 2565 }, { "epoch": 3.178726035868893, "grad_norm": 0.369140625, "learning_rate": 0.0001724005002778657, "loss": 0.8452, "step": 2570 }, { "epoch": 3.1849103277674704, "grad_norm": 0.326171875, "learning_rate": 0.00017225133162415338, "loss": 0.8458, "step": 2575 }, { "epoch": 3.1910946196660483, "grad_norm": 0.3203125, "learning_rate": 0.00017210182585573327, "loss": 0.8419, "step": 2580 }, { "epoch": 3.197278911564626, "grad_norm": 0.337890625, "learning_rate": 0.00017195198367017862, "loss": 0.8457, "step": 2585 }, { "epoch": 3.2034632034632033, "grad_norm": 0.380859375, "learning_rate": 0.00017180180576663228, "loss": 0.8353, "step": 2590 }, { "epoch": 3.2096474953617813, "grad_norm": 0.37890625, "learning_rate": 0.00017165129284580353, "loss": 0.8453, "step": 2595 }, { "epoch": 3.215831787260359, "grad_norm": 0.330078125, "learning_rate": 0.00017150044560996488, "loss": 0.8424, "step": 2600 }, { "epoch": 3.2220160791589363, "grad_norm": 0.345703125, "learning_rate": 0.0001713492647629486, "loss": 0.836, "step": 2605 }, { "epoch": 3.228200371057514, "grad_norm": 0.34375, "learning_rate": 0.00017119775101014358, "loss": 0.8443, "step": 2610 }, { "epoch": 3.2343846629560917, "grad_norm": 0.32421875, "learning_rate": 0.00017104590505849206, "loss": 0.8454, "step": 2615 }, { "epoch": 3.2405689548546692, "grad_norm": 0.333984375, "learning_rate": 0.00017089372761648616, "loss": 0.8446, "step": 2620 }, { "epoch": 3.2467532467532467, "grad_norm": 0.36328125, "learning_rate": 0.00017074121939416478, "loss": 0.8403, "step": 2625 }, { "epoch": 3.2529375386518242, "grad_norm": 0.330078125, "learning_rate": 0.00017058838110311017, "loss": 0.8529, "step": 2630 }, { "epoch": 3.259121830550402, "grad_norm": 0.318359375, "learning_rate": 0.0001704352134564446, "loss": 0.8456, "step": 2635 }, { "epoch": 3.2653061224489797, "grad_norm": 0.318359375, "learning_rate": 0.00017028171716882714, "loss": 0.842, "step": 2640 }, { "epoch": 3.271490414347557, "grad_norm": 0.3125, "learning_rate": 0.00017012789295645016, "loss": 0.841, "step": 2645 }, { "epoch": 3.2776747062461347, "grad_norm": 0.330078125, "learning_rate": 0.00016997374153703625, "loss": 0.8483, "step": 2650 }, { "epoch": 3.2838589981447126, "grad_norm": 0.349609375, "learning_rate": 0.00016981926362983442, "loss": 0.8406, "step": 2655 }, { "epoch": 3.29004329004329, "grad_norm": 0.33203125, "learning_rate": 0.00016966445995561727, "loss": 0.8497, "step": 2660 }, { "epoch": 3.2962275819418676, "grad_norm": 0.3359375, "learning_rate": 0.00016950933123667733, "loss": 0.8411, "step": 2665 }, { "epoch": 3.302411873840445, "grad_norm": 0.318359375, "learning_rate": 0.00016935387819682376, "loss": 0.8346, "step": 2670 }, { "epoch": 3.308596165739023, "grad_norm": 0.36328125, "learning_rate": 0.0001691981015613788, "loss": 0.8417, "step": 2675 }, { "epoch": 3.3147804576376005, "grad_norm": 0.34375, "learning_rate": 0.0001690420020571747, "loss": 0.839, "step": 2680 }, { "epoch": 3.320964749536178, "grad_norm": 0.31640625, "learning_rate": 0.00016888558041255015, "loss": 0.8469, "step": 2685 }, { "epoch": 3.3271490414347555, "grad_norm": 0.341796875, "learning_rate": 0.0001687288373573469, "loss": 0.8449, "step": 2690 }, { "epoch": 3.3333333333333335, "grad_norm": 0.3359375, "learning_rate": 0.00016857177362290625, "loss": 0.8458, "step": 2695 }, { "epoch": 3.339517625231911, "grad_norm": 0.330078125, "learning_rate": 0.00016841438994206595, "loss": 0.8357, "step": 2700 }, { "epoch": 3.3457019171304885, "grad_norm": 0.3515625, "learning_rate": 0.00016825668704915643, "loss": 0.8378, "step": 2705 }, { "epoch": 3.3518862090290664, "grad_norm": 0.33203125, "learning_rate": 0.0001680986656799975, "loss": 0.8388, "step": 2710 }, { "epoch": 3.358070500927644, "grad_norm": 0.32421875, "learning_rate": 0.00016794032657189504, "loss": 0.838, "step": 2715 }, { "epoch": 3.3642547928262214, "grad_norm": 0.33203125, "learning_rate": 0.00016778167046363734, "loss": 0.8372, "step": 2720 }, { "epoch": 3.370439084724799, "grad_norm": 0.310546875, "learning_rate": 0.00016762269809549184, "loss": 0.8398, "step": 2725 }, { "epoch": 3.3766233766233764, "grad_norm": 0.306640625, "learning_rate": 0.00016746341020920167, "loss": 0.8402, "step": 2730 }, { "epoch": 3.3828076685219544, "grad_norm": 0.341796875, "learning_rate": 0.00016730380754798198, "loss": 0.8319, "step": 2735 }, { "epoch": 3.388991960420532, "grad_norm": 0.318359375, "learning_rate": 0.0001671438908565167, "loss": 0.8434, "step": 2740 }, { "epoch": 3.3951762523191094, "grad_norm": 0.3125, "learning_rate": 0.000166983660880955, "loss": 0.8417, "step": 2745 }, { "epoch": 3.4013605442176873, "grad_norm": 0.33984375, "learning_rate": 0.00016682311836890766, "loss": 0.8294, "step": 2750 }, { "epoch": 3.407544836116265, "grad_norm": 0.384765625, "learning_rate": 0.00016666226406944395, "loss": 0.84, "step": 2755 }, { "epoch": 3.4137291280148423, "grad_norm": 0.388671875, "learning_rate": 0.00016650109873308765, "loss": 0.8378, "step": 2760 }, { "epoch": 3.41991341991342, "grad_norm": 0.33984375, "learning_rate": 0.0001663396231118139, "loss": 0.8374, "step": 2765 }, { "epoch": 3.4260977118119973, "grad_norm": 0.40234375, "learning_rate": 0.00016617783795904565, "loss": 0.8331, "step": 2770 }, { "epoch": 3.4322820037105752, "grad_norm": 0.349609375, "learning_rate": 0.00016601574402965, "loss": 0.8289, "step": 2775 }, { "epoch": 3.4384662956091527, "grad_norm": 0.326171875, "learning_rate": 0.00016585334207993476, "loss": 0.8351, "step": 2780 }, { "epoch": 3.4446505875077302, "grad_norm": 0.3203125, "learning_rate": 0.0001656906328676449, "loss": 0.8474, "step": 2785 }, { "epoch": 3.450834879406308, "grad_norm": 0.333984375, "learning_rate": 0.00016552761715195918, "loss": 0.8373, "step": 2790 }, { "epoch": 3.4570191713048857, "grad_norm": 0.32421875, "learning_rate": 0.00016536429569348623, "loss": 0.8314, "step": 2795 }, { "epoch": 3.463203463203463, "grad_norm": 0.3046875, "learning_rate": 0.00016520066925426144, "loss": 0.8397, "step": 2800 }, { "epoch": 3.4693877551020407, "grad_norm": 0.310546875, "learning_rate": 0.0001650367385977431, "loss": 0.8393, "step": 2805 }, { "epoch": 3.4755720470006186, "grad_norm": 0.296875, "learning_rate": 0.00016487250448880893, "loss": 0.8369, "step": 2810 }, { "epoch": 3.481756338899196, "grad_norm": 0.330078125, "learning_rate": 0.00016470796769375257, "loss": 0.8336, "step": 2815 }, { "epoch": 3.4879406307977736, "grad_norm": 0.31640625, "learning_rate": 0.0001645431289802799, "loss": 0.8415, "step": 2820 }, { "epoch": 3.494124922696351, "grad_norm": 0.330078125, "learning_rate": 0.0001643779891175055, "loss": 0.8482, "step": 2825 }, { "epoch": 3.500309214594929, "grad_norm": 0.400390625, "learning_rate": 0.00016421254887594917, "loss": 0.8354, "step": 2830 }, { "epoch": 3.5064935064935066, "grad_norm": 0.447265625, "learning_rate": 0.00016404680902753214, "loss": 0.8423, "step": 2835 }, { "epoch": 3.512677798392084, "grad_norm": 0.33203125, "learning_rate": 0.00016388077034557355, "loss": 0.8334, "step": 2840 }, { "epoch": 3.5188620902906615, "grad_norm": 0.373046875, "learning_rate": 0.00016371443360478692, "loss": 0.8298, "step": 2845 }, { "epoch": 3.5250463821892395, "grad_norm": 0.369140625, "learning_rate": 0.0001635477995812765, "loss": 0.839, "step": 2850 }, { "epoch": 3.531230674087817, "grad_norm": 0.349609375, "learning_rate": 0.0001633808690525335, "loss": 0.8398, "step": 2855 }, { "epoch": 3.5374149659863945, "grad_norm": 0.345703125, "learning_rate": 0.00016321364279743266, "loss": 0.8429, "step": 2860 }, { "epoch": 3.5435992578849724, "grad_norm": 0.3671875, "learning_rate": 0.00016304612159622855, "loss": 0.8458, "step": 2865 }, { "epoch": 3.54978354978355, "grad_norm": 0.390625, "learning_rate": 0.00016287830623055188, "loss": 0.8421, "step": 2870 }, { "epoch": 3.5559678416821274, "grad_norm": 0.3203125, "learning_rate": 0.0001627101974834059, "loss": 0.8496, "step": 2875 }, { "epoch": 3.562152133580705, "grad_norm": 0.3359375, "learning_rate": 0.00016254179613916278, "loss": 0.8466, "step": 2880 }, { "epoch": 3.5683364254792824, "grad_norm": 0.322265625, "learning_rate": 0.00016237310298355986, "loss": 0.8342, "step": 2885 }, { "epoch": 3.5745207173778604, "grad_norm": 0.318359375, "learning_rate": 0.00016220411880369601, "loss": 0.8486, "step": 2890 }, { "epoch": 3.580705009276438, "grad_norm": 0.341796875, "learning_rate": 0.00016203484438802806, "loss": 0.8414, "step": 2895 }, { "epoch": 3.5868893011750154, "grad_norm": 0.310546875, "learning_rate": 0.00016186528052636692, "loss": 0.8345, "step": 2900 }, { "epoch": 3.5930735930735933, "grad_norm": 0.31640625, "learning_rate": 0.00016169542800987418, "loss": 0.8275, "step": 2905 }, { "epoch": 3.599257884972171, "grad_norm": 0.365234375, "learning_rate": 0.0001615252876310581, "loss": 0.8339, "step": 2910 }, { "epoch": 3.6054421768707483, "grad_norm": 0.349609375, "learning_rate": 0.00016135486018377008, "loss": 0.8399, "step": 2915 }, { "epoch": 3.611626468769326, "grad_norm": 0.388671875, "learning_rate": 0.0001611841464632011, "loss": 0.8415, "step": 2920 }, { "epoch": 3.6178107606679033, "grad_norm": 0.3125, "learning_rate": 0.0001610131472658777, "loss": 0.8321, "step": 2925 }, { "epoch": 3.6239950525664812, "grad_norm": 0.345703125, "learning_rate": 0.00016084186338965843, "loss": 0.8377, "step": 2930 }, { "epoch": 3.6301793444650587, "grad_norm": 0.345703125, "learning_rate": 0.00016067029563373013, "loss": 0.8448, "step": 2935 }, { "epoch": 3.6363636363636362, "grad_norm": 0.33203125, "learning_rate": 0.00016049844479860422, "loss": 0.8353, "step": 2940 }, { "epoch": 3.642547928262214, "grad_norm": 0.3046875, "learning_rate": 0.00016032631168611284, "loss": 0.8304, "step": 2945 }, { "epoch": 3.6487322201607917, "grad_norm": 0.333984375, "learning_rate": 0.00016015389709940538, "loss": 0.8326, "step": 2950 }, { "epoch": 3.654916512059369, "grad_norm": 0.34765625, "learning_rate": 0.0001599812018429443, "loss": 0.8412, "step": 2955 }, { "epoch": 3.6611008039579467, "grad_norm": 0.314453125, "learning_rate": 0.0001598082267225018, "loss": 0.8449, "step": 2960 }, { "epoch": 3.667285095856524, "grad_norm": 0.36328125, "learning_rate": 0.00015963497254515581, "loss": 0.8423, "step": 2965 }, { "epoch": 3.673469387755102, "grad_norm": 0.3359375, "learning_rate": 0.00015946144011928638, "loss": 0.8301, "step": 2970 }, { "epoch": 3.6796536796536796, "grad_norm": 0.3359375, "learning_rate": 0.0001592876302545718, "loss": 0.8345, "step": 2975 }, { "epoch": 3.685837971552257, "grad_norm": 0.326171875, "learning_rate": 0.0001591135437619847, "loss": 0.8339, "step": 2980 }, { "epoch": 3.692022263450835, "grad_norm": 0.306640625, "learning_rate": 0.00015893918145378866, "loss": 0.8481, "step": 2985 }, { "epoch": 3.6982065553494126, "grad_norm": 0.314453125, "learning_rate": 0.000158764544143534, "loss": 0.8328, "step": 2990 }, { "epoch": 3.70439084724799, "grad_norm": 0.3125, "learning_rate": 0.0001585896326460543, "loss": 0.8415, "step": 2995 }, { "epoch": 3.7105751391465676, "grad_norm": 0.314453125, "learning_rate": 0.0001584144477774623, "loss": 0.8354, "step": 3000 }, { "epoch": 3.716759431045145, "grad_norm": 0.33203125, "learning_rate": 0.00015823899035514639, "loss": 0.8496, "step": 3005 }, { "epoch": 3.722943722943723, "grad_norm": 0.3671875, "learning_rate": 0.00015806326119776663, "loss": 0.8391, "step": 3010 }, { "epoch": 3.7291280148423005, "grad_norm": 0.333984375, "learning_rate": 0.00015788726112525085, "loss": 0.8435, "step": 3015 }, { "epoch": 3.7353123067408784, "grad_norm": 0.328125, "learning_rate": 0.00015771099095879108, "loss": 0.8401, "step": 3020 }, { "epoch": 3.741496598639456, "grad_norm": 0.33203125, "learning_rate": 0.0001575344515208395, "loss": 0.8375, "step": 3025 }, { "epoch": 3.7476808905380334, "grad_norm": 0.322265625, "learning_rate": 0.0001573576436351046, "loss": 0.8404, "step": 3030 }, { "epoch": 3.753865182436611, "grad_norm": 0.30859375, "learning_rate": 0.00015718056812654763, "loss": 0.8402, "step": 3035 }, { "epoch": 3.7600494743351884, "grad_norm": 0.328125, "learning_rate": 0.00015700322582137827, "loss": 0.8328, "step": 3040 }, { "epoch": 3.7662337662337664, "grad_norm": 0.33984375, "learning_rate": 0.00015682561754705123, "loss": 0.8342, "step": 3045 }, { "epoch": 3.772418058132344, "grad_norm": 0.349609375, "learning_rate": 0.0001566477441322621, "loss": 0.8472, "step": 3050 }, { "epoch": 3.7786023500309214, "grad_norm": 0.361328125, "learning_rate": 0.0001564696064069436, "loss": 0.8341, "step": 3055 }, { "epoch": 3.7847866419294993, "grad_norm": 0.3515625, "learning_rate": 0.00015629120520226165, "loss": 0.8415, "step": 3060 }, { "epoch": 3.790970933828077, "grad_norm": 0.42578125, "learning_rate": 0.0001561125413506116, "loss": 0.8399, "step": 3065 }, { "epoch": 3.7971552257266543, "grad_norm": 0.361328125, "learning_rate": 0.00015593361568561428, "loss": 0.8374, "step": 3070 }, { "epoch": 3.803339517625232, "grad_norm": 0.330078125, "learning_rate": 0.000155754429042112, "loss": 0.8355, "step": 3075 }, { "epoch": 3.8095238095238093, "grad_norm": 0.345703125, "learning_rate": 0.00015557498225616487, "loss": 0.8246, "step": 3080 }, { "epoch": 3.8157081014223873, "grad_norm": 0.349609375, "learning_rate": 0.0001553952761650467, "loss": 0.8422, "step": 3085 }, { "epoch": 3.8218923933209648, "grad_norm": 0.35546875, "learning_rate": 0.00015521531160724126, "loss": 0.8327, "step": 3090 }, { "epoch": 3.8280766852195423, "grad_norm": 0.341796875, "learning_rate": 0.0001550350894224382, "loss": 0.8387, "step": 3095 }, { "epoch": 3.83426097711812, "grad_norm": 0.345703125, "learning_rate": 0.0001548546104515294, "loss": 0.8329, "step": 3100 }, { "epoch": 3.8404452690166977, "grad_norm": 0.337890625, "learning_rate": 0.0001546738755366046, "loss": 0.8404, "step": 3105 }, { "epoch": 3.846629560915275, "grad_norm": 0.318359375, "learning_rate": 0.00015449288552094796, "loss": 0.8376, "step": 3110 }, { "epoch": 3.8528138528138527, "grad_norm": 0.34375, "learning_rate": 0.00015431164124903382, "loss": 0.8304, "step": 3115 }, { "epoch": 3.85899814471243, "grad_norm": 0.30859375, "learning_rate": 0.00015413014356652286, "loss": 0.833, "step": 3120 }, { "epoch": 3.865182436611008, "grad_norm": 0.33984375, "learning_rate": 0.00015394839332025811, "loss": 0.8451, "step": 3125 }, { "epoch": 3.8713667285095856, "grad_norm": 0.33203125, "learning_rate": 0.00015376639135826107, "loss": 0.8373, "step": 3130 }, { "epoch": 3.877551020408163, "grad_norm": 0.328125, "learning_rate": 0.00015358413852972766, "loss": 0.8392, "step": 3135 }, { "epoch": 3.883735312306741, "grad_norm": 0.31640625, "learning_rate": 0.0001534016356850244, "loss": 0.8352, "step": 3140 }, { "epoch": 3.8899196042053186, "grad_norm": 0.38671875, "learning_rate": 0.00015321888367568422, "loss": 0.8344, "step": 3145 }, { "epoch": 3.896103896103896, "grad_norm": 0.341796875, "learning_rate": 0.00015303588335440274, "loss": 0.8408, "step": 3150 }, { "epoch": 3.9022881880024736, "grad_norm": 0.3359375, "learning_rate": 0.00015285263557503407, "loss": 0.8328, "step": 3155 }, { "epoch": 3.908472479901051, "grad_norm": 0.345703125, "learning_rate": 0.000152669141192587, "loss": 0.8454, "step": 3160 }, { "epoch": 3.914656771799629, "grad_norm": 0.326171875, "learning_rate": 0.00015248540106322094, "loss": 0.8234, "step": 3165 }, { "epoch": 3.9208410636982065, "grad_norm": 0.330078125, "learning_rate": 0.00015230141604424181, "loss": 0.8391, "step": 3170 }, { "epoch": 3.927025355596784, "grad_norm": 0.408203125, "learning_rate": 0.0001521171869940983, "loss": 0.8347, "step": 3175 }, { "epoch": 3.933209647495362, "grad_norm": 0.330078125, "learning_rate": 0.0001519327147723776, "loss": 0.8289, "step": 3180 }, { "epoch": 3.9393939393939394, "grad_norm": 0.3125, "learning_rate": 0.0001517480002398016, "loss": 0.8342, "step": 3185 }, { "epoch": 3.945578231292517, "grad_norm": 0.3125, "learning_rate": 0.00015156304425822267, "loss": 0.8406, "step": 3190 }, { "epoch": 3.9517625231910944, "grad_norm": 0.322265625, "learning_rate": 0.00015137784769061986, "loss": 0.8361, "step": 3195 }, { "epoch": 3.9579468150896724, "grad_norm": 0.34375, "learning_rate": 0.00015119241140109467, "loss": 0.8391, "step": 3200 }, { "epoch": 3.96413110698825, "grad_norm": 0.373046875, "learning_rate": 0.00015100673625486716, "loss": 0.8312, "step": 3205 }, { "epoch": 3.9703153988868274, "grad_norm": 0.3203125, "learning_rate": 0.00015082082311827183, "loss": 0.8381, "step": 3210 }, { "epoch": 3.9764996907854053, "grad_norm": 0.345703125, "learning_rate": 0.00015063467285875365, "loss": 0.8402, "step": 3215 }, { "epoch": 3.982683982683983, "grad_norm": 0.32421875, "learning_rate": 0.000150448286344864, "loss": 0.827, "step": 3220 }, { "epoch": 3.9888682745825603, "grad_norm": 0.326171875, "learning_rate": 0.00015026166444625646, "loss": 0.8326, "step": 3225 }, { "epoch": 3.995052566481138, "grad_norm": 0.359375, "learning_rate": 0.000150074808033683, "loss": 0.8424, "step": 3230 }, { "epoch": 4.0, "eval_loss": 2.5590126514434814, "eval_runtime": 0.5375, "eval_samples_per_second": 18.605, "eval_steps_per_second": 1.861, "step": 3234 }, { "epoch": 4.001236858379715, "grad_norm": 0.3671875, "learning_rate": 0.00014988771797898976, "loss": 0.8301, "step": 3235 }, { "epoch": 4.007421150278293, "grad_norm": 0.322265625, "learning_rate": 0.00014970039515511304, "loss": 0.8174, "step": 3240 }, { "epoch": 4.01360544217687, "grad_norm": 0.486328125, "learning_rate": 0.00014951284043607517, "loss": 0.7991, "step": 3245 }, { "epoch": 4.019789734075449, "grad_norm": 0.353515625, "learning_rate": 0.00014932505469698052, "loss": 0.8198, "step": 3250 }, { "epoch": 4.025974025974026, "grad_norm": 0.33984375, "learning_rate": 0.00014913703881401134, "loss": 0.8031, "step": 3255 }, { "epoch": 4.032158317872604, "grad_norm": 0.376953125, "learning_rate": 0.0001489487936644237, "loss": 0.8086, "step": 3260 }, { "epoch": 4.038342609771181, "grad_norm": 0.33203125, "learning_rate": 0.00014876032012654336, "loss": 0.8114, "step": 3265 }, { "epoch": 4.044526901669759, "grad_norm": 0.39453125, "learning_rate": 0.00014857161907976183, "loss": 0.8151, "step": 3270 }, { "epoch": 4.050711193568336, "grad_norm": 0.318359375, "learning_rate": 0.00014838269140453198, "loss": 0.8019, "step": 3275 }, { "epoch": 4.056895485466914, "grad_norm": 0.3671875, "learning_rate": 0.00014819353798236427, "loss": 0.7974, "step": 3280 }, { "epoch": 4.063079777365492, "grad_norm": 0.337890625, "learning_rate": 0.00014800415969582227, "loss": 0.8166, "step": 3285 }, { "epoch": 4.06926406926407, "grad_norm": 0.38671875, "learning_rate": 0.00014781455742851892, "loss": 0.8066, "step": 3290 }, { "epoch": 4.075448361162647, "grad_norm": 0.37109375, "learning_rate": 0.00014762473206511207, "loss": 0.8136, "step": 3295 }, { "epoch": 4.081632653061225, "grad_norm": 0.357421875, "learning_rate": 0.00014743468449130063, "loss": 0.8167, "step": 3300 }, { "epoch": 4.087816944959802, "grad_norm": 0.32421875, "learning_rate": 0.00014724441559382028, "loss": 0.809, "step": 3305 }, { "epoch": 4.09400123685838, "grad_norm": 0.408203125, "learning_rate": 0.0001470539262604393, "loss": 0.8024, "step": 3310 }, { "epoch": 4.100185528756957, "grad_norm": 0.337890625, "learning_rate": 0.00014686321737995454, "loss": 0.8148, "step": 3315 }, { "epoch": 4.106369820655535, "grad_norm": 0.322265625, "learning_rate": 0.0001466722898421873, "loss": 0.8127, "step": 3320 }, { "epoch": 4.112554112554113, "grad_norm": 0.357421875, "learning_rate": 0.0001464811445379789, "loss": 0.8008, "step": 3325 }, { "epoch": 4.1187384044526905, "grad_norm": 0.3125, "learning_rate": 0.00014628978235918695, "loss": 0.8145, "step": 3330 }, { "epoch": 4.124922696351268, "grad_norm": 0.3515625, "learning_rate": 0.0001460982041986809, "loss": 0.8009, "step": 3335 }, { "epoch": 4.1311069882498455, "grad_norm": 0.34375, "learning_rate": 0.00014590641095033787, "loss": 0.8056, "step": 3340 }, { "epoch": 4.137291280148423, "grad_norm": 0.34375, "learning_rate": 0.00014571440350903857, "loss": 0.8189, "step": 3345 }, { "epoch": 4.1434755720470005, "grad_norm": 0.359375, "learning_rate": 0.00014552218277066314, "loss": 0.8103, "step": 3350 }, { "epoch": 4.149659863945578, "grad_norm": 0.3359375, "learning_rate": 0.00014532974963208704, "loss": 0.8073, "step": 3355 }, { "epoch": 4.1558441558441555, "grad_norm": 0.330078125, "learning_rate": 0.00014513710499117647, "loss": 0.8158, "step": 3360 }, { "epoch": 4.162028447742734, "grad_norm": 0.365234375, "learning_rate": 0.00014494424974678476, "loss": 0.8058, "step": 3365 }, { "epoch": 4.168212739641311, "grad_norm": 0.43359375, "learning_rate": 0.00014475118479874774, "loss": 0.8117, "step": 3370 }, { "epoch": 4.174397031539889, "grad_norm": 0.353515625, "learning_rate": 0.00014455791104787976, "loss": 0.814, "step": 3375 }, { "epoch": 4.180581323438466, "grad_norm": 0.33984375, "learning_rate": 0.0001443644293959693, "loss": 0.8154, "step": 3380 }, { "epoch": 4.186765615337044, "grad_norm": 0.326171875, "learning_rate": 0.00014417074074577502, "loss": 0.8108, "step": 3385 }, { "epoch": 4.192949907235621, "grad_norm": 0.318359375, "learning_rate": 0.0001439768460010213, "loss": 0.8166, "step": 3390 }, { "epoch": 4.199134199134199, "grad_norm": 0.322265625, "learning_rate": 0.00014378274606639422, "loss": 0.8002, "step": 3395 }, { "epoch": 4.205318491032776, "grad_norm": 0.3125, "learning_rate": 0.00014358844184753712, "loss": 0.8058, "step": 3400 }, { "epoch": 4.211502782931355, "grad_norm": 0.330078125, "learning_rate": 0.00014339393425104663, "loss": 0.8011, "step": 3405 }, { "epoch": 4.217687074829932, "grad_norm": 0.37109375, "learning_rate": 0.00014319922418446824, "loss": 0.8159, "step": 3410 }, { "epoch": 4.22387136672851, "grad_norm": 0.37109375, "learning_rate": 0.0001430043125562922, "loss": 0.8065, "step": 3415 }, { "epoch": 4.230055658627087, "grad_norm": 0.337890625, "learning_rate": 0.00014280920027594907, "loss": 0.8171, "step": 3420 }, { "epoch": 4.236239950525665, "grad_norm": 0.3984375, "learning_rate": 0.00014261388825380586, "loss": 0.8245, "step": 3425 }, { "epoch": 4.242424242424242, "grad_norm": 0.33984375, "learning_rate": 0.00014241837740116132, "loss": 0.8015, "step": 3430 }, { "epoch": 4.24860853432282, "grad_norm": 0.341796875, "learning_rate": 0.00014222266863024206, "loss": 0.8169, "step": 3435 }, { "epoch": 4.254792826221397, "grad_norm": 0.349609375, "learning_rate": 0.00014202676285419812, "loss": 0.8034, "step": 3440 }, { "epoch": 4.260977118119976, "grad_norm": 0.3359375, "learning_rate": 0.00014183066098709865, "loss": 0.8083, "step": 3445 }, { "epoch": 4.267161410018553, "grad_norm": 0.330078125, "learning_rate": 0.00014163436394392786, "loss": 0.801, "step": 3450 }, { "epoch": 4.273345701917131, "grad_norm": 0.3359375, "learning_rate": 0.00014143787264058055, "loss": 0.8035, "step": 3455 }, { "epoch": 4.279529993815708, "grad_norm": 0.353515625, "learning_rate": 0.00014124118799385796, "loss": 0.8152, "step": 3460 }, { "epoch": 4.285714285714286, "grad_norm": 0.349609375, "learning_rate": 0.00014104431092146338, "loss": 0.8258, "step": 3465 }, { "epoch": 4.291898577612863, "grad_norm": 0.41796875, "learning_rate": 0.000140847242341998, "loss": 0.818, "step": 3470 }, { "epoch": 4.298082869511441, "grad_norm": 0.357421875, "learning_rate": 0.00014064998317495647, "loss": 0.8159, "step": 3475 }, { "epoch": 4.304267161410019, "grad_norm": 0.34765625, "learning_rate": 0.0001404525343407228, "loss": 0.8164, "step": 3480 }, { "epoch": 4.3104514533085965, "grad_norm": 0.326171875, "learning_rate": 0.00014025489676056587, "loss": 0.8083, "step": 3485 }, { "epoch": 4.316635745207174, "grad_norm": 0.349609375, "learning_rate": 0.00014005707135663527, "loss": 0.8148, "step": 3490 }, { "epoch": 4.3228200371057515, "grad_norm": 0.34375, "learning_rate": 0.00013985905905195697, "loss": 0.8236, "step": 3495 }, { "epoch": 4.329004329004329, "grad_norm": 0.392578125, "learning_rate": 0.0001396608607704289, "loss": 0.8153, "step": 3500 }, { "epoch": 4.3351886209029065, "grad_norm": 0.34375, "learning_rate": 0.00013946247743681686, "loss": 0.8156, "step": 3505 }, { "epoch": 4.341372912801484, "grad_norm": 0.349609375, "learning_rate": 0.00013926390997674997, "loss": 0.8181, "step": 3510 }, { "epoch": 4.3475572047000615, "grad_norm": 0.318359375, "learning_rate": 0.00013906515931671651, "loss": 0.809, "step": 3515 }, { "epoch": 4.35374149659864, "grad_norm": 0.361328125, "learning_rate": 0.00013886622638405952, "loss": 0.8045, "step": 3520 }, { "epoch": 4.359925788497217, "grad_norm": 0.3515625, "learning_rate": 0.00013866711210697256, "loss": 0.8105, "step": 3525 }, { "epoch": 4.366110080395795, "grad_norm": 0.328125, "learning_rate": 0.00013846781741449525, "loss": 0.8076, "step": 3530 }, { "epoch": 4.372294372294372, "grad_norm": 0.337890625, "learning_rate": 0.000138268343236509, "loss": 0.8137, "step": 3535 }, { "epoch": 4.37847866419295, "grad_norm": 0.349609375, "learning_rate": 0.0001380686905037327, "loss": 0.8085, "step": 3540 }, { "epoch": 4.384662956091527, "grad_norm": 0.330078125, "learning_rate": 0.00013786886014771843, "loss": 0.8151, "step": 3545 }, { "epoch": 4.390847247990105, "grad_norm": 0.330078125, "learning_rate": 0.00013766885310084688, "loss": 0.8075, "step": 3550 }, { "epoch": 4.397031539888682, "grad_norm": 0.31640625, "learning_rate": 0.00013746867029632324, "loss": 0.8081, "step": 3555 }, { "epoch": 4.403215831787261, "grad_norm": 0.322265625, "learning_rate": 0.00013726831266817278, "loss": 0.8067, "step": 3560 }, { "epoch": 4.409400123685838, "grad_norm": 0.34765625, "learning_rate": 0.00013706778115123646, "loss": 0.8127, "step": 3565 }, { "epoch": 4.415584415584416, "grad_norm": 0.35546875, "learning_rate": 0.0001368670766811665, "loss": 0.8151, "step": 3570 }, { "epoch": 4.421768707482993, "grad_norm": 0.326171875, "learning_rate": 0.00013666620019442223, "loss": 0.8074, "step": 3575 }, { "epoch": 4.427952999381571, "grad_norm": 0.32421875, "learning_rate": 0.00013646515262826552, "loss": 0.8165, "step": 3580 }, { "epoch": 4.434137291280148, "grad_norm": 0.3515625, "learning_rate": 0.00013626393492075645, "loss": 0.8115, "step": 3585 }, { "epoch": 4.440321583178726, "grad_norm": 0.375, "learning_rate": 0.00013606254801074895, "loss": 0.8113, "step": 3590 }, { "epoch": 4.446505875077303, "grad_norm": 0.3515625, "learning_rate": 0.0001358609928378865, "loss": 0.8035, "step": 3595 }, { "epoch": 4.452690166975882, "grad_norm": 0.3359375, "learning_rate": 0.0001356592703425976, "loss": 0.8097, "step": 3600 }, { "epoch": 4.458874458874459, "grad_norm": 0.33203125, "learning_rate": 0.00013545738146609145, "loss": 0.8187, "step": 3605 }, { "epoch": 4.465058750773037, "grad_norm": 0.345703125, "learning_rate": 0.00013525532715035366, "loss": 0.8031, "step": 3610 }, { "epoch": 4.471243042671614, "grad_norm": 0.328125, "learning_rate": 0.00013505310833814168, "loss": 0.8138, "step": 3615 }, { "epoch": 4.477427334570192, "grad_norm": 0.326171875, "learning_rate": 0.00013485072597298038, "loss": 0.8119, "step": 3620 }, { "epoch": 4.483611626468769, "grad_norm": 0.34375, "learning_rate": 0.00013464818099915798, "loss": 0.8212, "step": 3625 }, { "epoch": 4.489795918367347, "grad_norm": 0.3515625, "learning_rate": 0.00013444547436172117, "loss": 0.807, "step": 3630 }, { "epoch": 4.495980210265925, "grad_norm": 0.34765625, "learning_rate": 0.00013424260700647115, "loss": 0.809, "step": 3635 }, { "epoch": 4.5021645021645025, "grad_norm": 0.33984375, "learning_rate": 0.00013403957987995882, "loss": 0.8157, "step": 3640 }, { "epoch": 4.50834879406308, "grad_norm": 0.341796875, "learning_rate": 0.00013383639392948072, "loss": 0.8213, "step": 3645 }, { "epoch": 4.5145330859616575, "grad_norm": 0.361328125, "learning_rate": 0.00013363305010307425, "loss": 0.809, "step": 3650 }, { "epoch": 4.520717377860235, "grad_norm": 0.34375, "learning_rate": 0.00013342954934951365, "loss": 0.8138, "step": 3655 }, { "epoch": 4.5269016697588125, "grad_norm": 0.36328125, "learning_rate": 0.00013322589261830517, "loss": 0.8177, "step": 3660 }, { "epoch": 4.53308596165739, "grad_norm": 0.34765625, "learning_rate": 0.00013302208085968296, "loss": 0.8145, "step": 3665 }, { "epoch": 4.5392702535559675, "grad_norm": 0.326171875, "learning_rate": 0.0001328181150246045, "loss": 0.8063, "step": 3670 }, { "epoch": 4.545454545454545, "grad_norm": 0.34765625, "learning_rate": 0.00013261399606474605, "loss": 0.8136, "step": 3675 }, { "epoch": 4.551638837353123, "grad_norm": 0.384765625, "learning_rate": 0.00013240972493249847, "loss": 0.8117, "step": 3680 }, { "epoch": 4.557823129251701, "grad_norm": 0.376953125, "learning_rate": 0.00013220530258096252, "loss": 0.8164, "step": 3685 }, { "epoch": 4.564007421150278, "grad_norm": 0.34765625, "learning_rate": 0.0001320007299639446, "loss": 0.8153, "step": 3690 }, { "epoch": 4.570191713048856, "grad_norm": 0.3515625, "learning_rate": 0.00013179600803595224, "loss": 0.8148, "step": 3695 }, { "epoch": 4.576376004947433, "grad_norm": 0.314453125, "learning_rate": 0.00013159113775218964, "loss": 0.7987, "step": 3700 }, { "epoch": 4.582560296846011, "grad_norm": 0.3203125, "learning_rate": 0.00013138612006855307, "loss": 0.8191, "step": 3705 }, { "epoch": 4.588744588744589, "grad_norm": 0.328125, "learning_rate": 0.0001311809559416267, "loss": 0.8048, "step": 3710 }, { "epoch": 4.594928880643167, "grad_norm": 0.32421875, "learning_rate": 0.00013097564632867794, "loss": 0.8151, "step": 3715 }, { "epoch": 4.601113172541744, "grad_norm": 0.32421875, "learning_rate": 0.00013077019218765305, "loss": 0.8083, "step": 3720 }, { "epoch": 4.607297464440322, "grad_norm": 0.310546875, "learning_rate": 0.00013056459447717252, "loss": 0.8142, "step": 3725 }, { "epoch": 4.613481756338899, "grad_norm": 0.341796875, "learning_rate": 0.00013035885415652685, "loss": 0.8166, "step": 3730 }, { "epoch": 4.619666048237477, "grad_norm": 0.349609375, "learning_rate": 0.00013015297218567186, "loss": 0.8147, "step": 3735 }, { "epoch": 4.625850340136054, "grad_norm": 0.333984375, "learning_rate": 0.00012994694952522435, "loss": 0.819, "step": 3740 }, { "epoch": 4.632034632034632, "grad_norm": 0.330078125, "learning_rate": 0.0001297407871364575, "loss": 0.8188, "step": 3745 }, { "epoch": 4.638218923933209, "grad_norm": 0.349609375, "learning_rate": 0.00012953448598129643, "loss": 0.8137, "step": 3750 }, { "epoch": 4.644403215831788, "grad_norm": 0.322265625, "learning_rate": 0.0001293280470223138, "loss": 0.8157, "step": 3755 }, { "epoch": 4.650587507730365, "grad_norm": 0.3125, "learning_rate": 0.00012912147122272523, "loss": 0.8122, "step": 3760 }, { "epoch": 4.656771799628943, "grad_norm": 0.384765625, "learning_rate": 0.00012891475954638474, "loss": 0.8164, "step": 3765 }, { "epoch": 4.66295609152752, "grad_norm": 0.416015625, "learning_rate": 0.0001287079129577804, "loss": 0.8045, "step": 3770 }, { "epoch": 4.669140383426098, "grad_norm": 0.345703125, "learning_rate": 0.00012850093242202978, "loss": 0.814, "step": 3775 }, { "epoch": 4.675324675324675, "grad_norm": 0.35546875, "learning_rate": 0.00012829381890487536, "loss": 0.8187, "step": 3780 }, { "epoch": 4.681508967223253, "grad_norm": 0.33984375, "learning_rate": 0.00012808657337268014, "loss": 0.8224, "step": 3785 }, { "epoch": 4.687693259121831, "grad_norm": 0.328125, "learning_rate": 0.00012787919679242306, "loss": 0.8082, "step": 3790 }, { "epoch": 4.6938775510204085, "grad_norm": 0.318359375, "learning_rate": 0.00012767169013169457, "loss": 0.8258, "step": 3795 }, { "epoch": 4.700061842918986, "grad_norm": 0.349609375, "learning_rate": 0.00012746405435869198, "loss": 0.8099, "step": 3800 }, { "epoch": 4.7062461348175635, "grad_norm": 0.341796875, "learning_rate": 0.00012725629044221505, "loss": 0.8123, "step": 3805 }, { "epoch": 4.712430426716141, "grad_norm": 0.318359375, "learning_rate": 0.00012704839935166143, "loss": 0.8137, "step": 3810 }, { "epoch": 4.7186147186147185, "grad_norm": 0.33984375, "learning_rate": 0.00012684038205702222, "loss": 0.805, "step": 3815 }, { "epoch": 4.724799010513296, "grad_norm": 0.3125, "learning_rate": 0.00012663223952887723, "loss": 0.8148, "step": 3820 }, { "epoch": 4.7309833024118735, "grad_norm": 0.3046875, "learning_rate": 0.00012642397273839075, "loss": 0.8067, "step": 3825 }, { "epoch": 4.737167594310451, "grad_norm": 0.330078125, "learning_rate": 0.0001262155826573067, "loss": 0.8045, "step": 3830 }, { "epoch": 4.743351886209029, "grad_norm": 0.3125, "learning_rate": 0.00012600707025794443, "loss": 0.813, "step": 3835 }, { "epoch": 4.749536178107607, "grad_norm": 0.3515625, "learning_rate": 0.0001257984365131938, "loss": 0.8037, "step": 3840 }, { "epoch": 4.755720470006184, "grad_norm": 0.34765625, "learning_rate": 0.0001255896823965111, "loss": 0.8107, "step": 3845 }, { "epoch": 4.761904761904762, "grad_norm": 0.33984375, "learning_rate": 0.00012538080888191408, "loss": 0.8004, "step": 3850 }, { "epoch": 4.768089053803339, "grad_norm": 0.326171875, "learning_rate": 0.00012517181694397762, "loss": 0.8151, "step": 3855 }, { "epoch": 4.774273345701917, "grad_norm": 0.35546875, "learning_rate": 0.00012496270755782914, "loss": 0.819, "step": 3860 }, { "epoch": 4.780457637600494, "grad_norm": 0.369140625, "learning_rate": 0.0001247534816991441, "loss": 0.8118, "step": 3865 }, { "epoch": 4.786641929499073, "grad_norm": 0.384765625, "learning_rate": 0.00012454414034414142, "loss": 0.8119, "step": 3870 }, { "epoch": 4.79282622139765, "grad_norm": 0.3671875, "learning_rate": 0.00012433468446957887, "loss": 0.8172, "step": 3875 }, { "epoch": 4.799010513296228, "grad_norm": 0.37109375, "learning_rate": 0.00012412511505274844, "loss": 0.8091, "step": 3880 }, { "epoch": 4.805194805194805, "grad_norm": 0.34765625, "learning_rate": 0.00012391543307147212, "loss": 0.8139, "step": 3885 }, { "epoch": 4.811379097093383, "grad_norm": 0.3515625, "learning_rate": 0.00012370563950409703, "loss": 0.7987, "step": 3890 }, { "epoch": 4.81756338899196, "grad_norm": 0.32421875, "learning_rate": 0.0001234957353294908, "loss": 0.8144, "step": 3895 }, { "epoch": 4.823747680890538, "grad_norm": 0.3359375, "learning_rate": 0.00012328572152703725, "loss": 0.8085, "step": 3900 }, { "epoch": 4.829931972789115, "grad_norm": 0.361328125, "learning_rate": 0.00012307559907663175, "loss": 0.8178, "step": 3905 }, { "epoch": 4.836116264687694, "grad_norm": 0.333984375, "learning_rate": 0.00012286536895867654, "loss": 0.8146, "step": 3910 }, { "epoch": 4.842300556586271, "grad_norm": 0.33203125, "learning_rate": 0.00012265503215407627, "loss": 0.8224, "step": 3915 }, { "epoch": 4.848484848484849, "grad_norm": 0.36328125, "learning_rate": 0.00012244458964423327, "loss": 0.8146, "step": 3920 }, { "epoch": 4.854669140383426, "grad_norm": 0.33203125, "learning_rate": 0.00012223404241104317, "loss": 0.8034, "step": 3925 }, { "epoch": 4.860853432282004, "grad_norm": 0.345703125, "learning_rate": 0.00012202339143689023, "loss": 0.8128, "step": 3930 }, { "epoch": 4.867037724180581, "grad_norm": 0.388671875, "learning_rate": 0.00012181263770464273, "loss": 0.8148, "step": 3935 }, { "epoch": 4.873222016079159, "grad_norm": 0.37109375, "learning_rate": 0.00012160178219764837, "loss": 0.806, "step": 3940 }, { "epoch": 4.879406307977737, "grad_norm": 0.373046875, "learning_rate": 0.00012139082589972972, "loss": 0.8123, "step": 3945 }, { "epoch": 4.8855905998763145, "grad_norm": 0.328125, "learning_rate": 0.00012117976979517973, "loss": 0.8089, "step": 3950 }, { "epoch": 4.891774891774892, "grad_norm": 0.341796875, "learning_rate": 0.00012096861486875693, "loss": 0.8188, "step": 3955 }, { "epoch": 4.8979591836734695, "grad_norm": 0.357421875, "learning_rate": 0.0001207573621056809, "loss": 0.8057, "step": 3960 }, { "epoch": 4.904143475572047, "grad_norm": 0.37890625, "learning_rate": 0.00012054601249162783, "loss": 0.8163, "step": 3965 }, { "epoch": 4.9103277674706245, "grad_norm": 0.330078125, "learning_rate": 0.00012033456701272576, "loss": 0.8103, "step": 3970 }, { "epoch": 4.916512059369202, "grad_norm": 0.34375, "learning_rate": 0.00012012302665555002, "loss": 0.8099, "step": 3975 }, { "epoch": 4.9226963512677795, "grad_norm": 0.376953125, "learning_rate": 0.00011991139240711857, "loss": 0.814, "step": 3980 }, { "epoch": 4.928880643166357, "grad_norm": 0.345703125, "learning_rate": 0.00011969966525488753, "loss": 0.8105, "step": 3985 }, { "epoch": 4.935064935064935, "grad_norm": 0.328125, "learning_rate": 0.00011948784618674653, "loss": 0.8127, "step": 3990 }, { "epoch": 4.941249226963513, "grad_norm": 0.34375, "learning_rate": 0.00011927593619101391, "loss": 0.8136, "step": 3995 }, { "epoch": 4.94743351886209, "grad_norm": 0.353515625, "learning_rate": 0.00011906393625643244, "loss": 0.811, "step": 4000 }, { "epoch": 4.953617810760668, "grad_norm": 0.34765625, "learning_rate": 0.0001188518473721644, "loss": 0.8143, "step": 4005 }, { "epoch": 4.959802102659245, "grad_norm": 0.310546875, "learning_rate": 0.00011863967052778721, "loss": 0.8092, "step": 4010 }, { "epoch": 4.965986394557823, "grad_norm": 0.32421875, "learning_rate": 0.0001184274067132886, "loss": 0.8225, "step": 4015 }, { "epoch": 4.9721706864564, "grad_norm": 0.3359375, "learning_rate": 0.00011821505691906216, "loss": 0.8057, "step": 4020 }, { "epoch": 4.978354978354979, "grad_norm": 0.322265625, "learning_rate": 0.00011800262213590261, "loss": 0.8183, "step": 4025 }, { "epoch": 4.984539270253556, "grad_norm": 0.35546875, "learning_rate": 0.0001177901033550012, "loss": 0.8163, "step": 4030 }, { "epoch": 4.990723562152134, "grad_norm": 0.353515625, "learning_rate": 0.00011757750156794118, "loss": 0.8127, "step": 4035 }, { "epoch": 4.996907854050711, "grad_norm": 0.349609375, "learning_rate": 0.00011736481776669306, "loss": 0.8173, "step": 4040 }, { "epoch": 4.999381570810142, "eval_loss": 2.6021482944488525, "eval_runtime": 0.8279, "eval_samples_per_second": 12.079, "eval_steps_per_second": 1.208, "step": 4042 }, { "epoch": 5.003092145949289, "grad_norm": 0.318359375, "learning_rate": 0.00011715205294360994, "loss": 0.8044, "step": 4045 }, { "epoch": 5.009276437847866, "grad_norm": 0.328125, "learning_rate": 0.00011693920809142305, "loss": 0.7861, "step": 4050 }, { "epoch": 5.015460729746444, "grad_norm": 0.330078125, "learning_rate": 0.00011672628420323699, "loss": 0.7971, "step": 4055 }, { "epoch": 5.021645021645021, "grad_norm": 0.34375, "learning_rate": 0.00011651328227252517, "loss": 0.7921, "step": 4060 }, { "epoch": 5.0278293135436, "grad_norm": 0.369140625, "learning_rate": 0.00011630020329312507, "loss": 0.7904, "step": 4065 }, { "epoch": 5.034013605442177, "grad_norm": 0.3515625, "learning_rate": 0.00011608704825923369, "loss": 0.7837, "step": 4070 }, { "epoch": 5.040197897340755, "grad_norm": 0.326171875, "learning_rate": 0.00011587381816540292, "loss": 0.7822, "step": 4075 }, { "epoch": 5.046382189239332, "grad_norm": 0.32421875, "learning_rate": 0.00011566051400653486, "loss": 0.7833, "step": 4080 }, { "epoch": 5.05256648113791, "grad_norm": 0.33203125, "learning_rate": 0.00011544713677787715, "loss": 0.7926, "step": 4085 }, { "epoch": 5.058750773036487, "grad_norm": 0.33203125, "learning_rate": 0.00011523368747501839, "loss": 0.784, "step": 4090 }, { "epoch": 5.064935064935065, "grad_norm": 0.3671875, "learning_rate": 0.00011502016709388348, "loss": 0.781, "step": 4095 }, { "epoch": 5.071119356833642, "grad_norm": 0.357421875, "learning_rate": 0.00011480657663072896, "loss": 0.7901, "step": 4100 }, { "epoch": 5.0773036487322205, "grad_norm": 0.392578125, "learning_rate": 0.00011459291708213836, "loss": 0.8018, "step": 4105 }, { "epoch": 5.083487940630798, "grad_norm": 0.330078125, "learning_rate": 0.00011437918944501749, "loss": 0.7841, "step": 4110 }, { "epoch": 5.0896722325293755, "grad_norm": 0.34765625, "learning_rate": 0.00011416539471658994, "loss": 0.7773, "step": 4115 }, { "epoch": 5.095856524427953, "grad_norm": 0.33984375, "learning_rate": 0.00011395153389439233, "loss": 0.7806, "step": 4120 }, { "epoch": 5.1020408163265305, "grad_norm": 0.361328125, "learning_rate": 0.0001137376079762696, "loss": 0.7873, "step": 4125 }, { "epoch": 5.108225108225108, "grad_norm": 0.32421875, "learning_rate": 0.00011352361796037047, "loss": 0.7771, "step": 4130 }, { "epoch": 5.1144094001236855, "grad_norm": 0.333984375, "learning_rate": 0.00011330956484514274, "loss": 0.7894, "step": 4135 }, { "epoch": 5.120593692022264, "grad_norm": 0.33984375, "learning_rate": 0.00011309544962932862, "loss": 0.7922, "step": 4140 }, { "epoch": 5.126777983920841, "grad_norm": 0.33203125, "learning_rate": 0.00011288127331195998, "loss": 0.7946, "step": 4145 }, { "epoch": 5.132962275819419, "grad_norm": 0.32421875, "learning_rate": 0.00011266703689235394, "loss": 0.7818, "step": 4150 }, { "epoch": 5.139146567717996, "grad_norm": 0.326171875, "learning_rate": 0.00011245274137010791, "loss": 0.7851, "step": 4155 }, { "epoch": 5.145330859616574, "grad_norm": 0.353515625, "learning_rate": 0.00011223838774509514, "loss": 0.7875, "step": 4160 }, { "epoch": 5.151515151515151, "grad_norm": 0.33984375, "learning_rate": 0.00011202397701745994, "loss": 0.7865, "step": 4165 }, { "epoch": 5.157699443413729, "grad_norm": 0.326171875, "learning_rate": 0.00011180951018761314, "loss": 0.8001, "step": 4170 }, { "epoch": 5.163883735312306, "grad_norm": 0.365234375, "learning_rate": 0.00011159498825622718, "loss": 0.7827, "step": 4175 }, { "epoch": 5.170068027210885, "grad_norm": 0.3671875, "learning_rate": 0.00011138041222423177, "loss": 0.7928, "step": 4180 }, { "epoch": 5.176252319109462, "grad_norm": 0.33984375, "learning_rate": 0.00011116578309280887, "loss": 0.7891, "step": 4185 }, { "epoch": 5.18243661100804, "grad_norm": 0.3359375, "learning_rate": 0.00011095110186338835, "loss": 0.795, "step": 4190 }, { "epoch": 5.188620902906617, "grad_norm": 0.359375, "learning_rate": 0.00011073636953764306, "loss": 0.7843, "step": 4195 }, { "epoch": 5.194805194805195, "grad_norm": 0.3359375, "learning_rate": 0.00011052158711748434, "loss": 0.7896, "step": 4200 }, { "epoch": 5.200989486703772, "grad_norm": 0.376953125, "learning_rate": 0.00011030675560505717, "loss": 0.785, "step": 4205 }, { "epoch": 5.20717377860235, "grad_norm": 0.337890625, "learning_rate": 0.00011009187600273566, "loss": 0.7892, "step": 4210 }, { "epoch": 5.213358070500927, "grad_norm": 0.3359375, "learning_rate": 0.00010987694931311827, "loss": 0.7908, "step": 4215 }, { "epoch": 5.219542362399506, "grad_norm": 0.345703125, "learning_rate": 0.0001096619765390232, "loss": 0.7925, "step": 4220 }, { "epoch": 5.225726654298083, "grad_norm": 0.369140625, "learning_rate": 0.00010944695868348359, "loss": 0.7926, "step": 4225 }, { "epoch": 5.231910946196661, "grad_norm": 0.3828125, "learning_rate": 0.00010923189674974301, "loss": 0.7941, "step": 4230 }, { "epoch": 5.238095238095238, "grad_norm": 0.357421875, "learning_rate": 0.00010901679174125066, "loss": 0.7873, "step": 4235 }, { "epoch": 5.244279529993816, "grad_norm": 0.353515625, "learning_rate": 0.00010880164466165674, "loss": 0.7951, "step": 4240 }, { "epoch": 5.250463821892393, "grad_norm": 0.3515625, "learning_rate": 0.00010858645651480768, "loss": 0.7936, "step": 4245 }, { "epoch": 5.256648113790971, "grad_norm": 0.345703125, "learning_rate": 0.00010837122830474158, "loss": 0.7933, "step": 4250 }, { "epoch": 5.262832405689548, "grad_norm": 0.6015625, "learning_rate": 0.00010815596103568353, "loss": 0.7982, "step": 4255 }, { "epoch": 5.2690166975881265, "grad_norm": 0.34375, "learning_rate": 0.00010794065571204072, "loss": 0.7757, "step": 4260 }, { "epoch": 5.275200989486704, "grad_norm": 0.34375, "learning_rate": 0.00010772531333839801, "loss": 0.7929, "step": 4265 }, { "epoch": 5.2813852813852815, "grad_norm": 0.375, "learning_rate": 0.0001075099349195131, "loss": 0.7827, "step": 4270 }, { "epoch": 5.287569573283859, "grad_norm": 0.408203125, "learning_rate": 0.00010729452146031183, "loss": 0.794, "step": 4275 }, { "epoch": 5.2937538651824365, "grad_norm": 0.369140625, "learning_rate": 0.00010707907396588361, "loss": 0.7956, "step": 4280 }, { "epoch": 5.299938157081014, "grad_norm": 0.330078125, "learning_rate": 0.00010686359344147658, "loss": 0.7739, "step": 4285 }, { "epoch": 5.3061224489795915, "grad_norm": 0.33203125, "learning_rate": 0.00010664808089249305, "loss": 0.7824, "step": 4290 }, { "epoch": 5.312306740878169, "grad_norm": 0.369140625, "learning_rate": 0.00010643253732448475, "loss": 0.7853, "step": 4295 }, { "epoch": 5.318491032776747, "grad_norm": 0.3359375, "learning_rate": 0.00010621696374314807, "loss": 0.7836, "step": 4300 }, { "epoch": 5.324675324675325, "grad_norm": 0.373046875, "learning_rate": 0.0001060013611543195, "loss": 0.7848, "step": 4305 }, { "epoch": 5.330859616573902, "grad_norm": 0.3828125, "learning_rate": 0.00010578573056397085, "loss": 0.7916, "step": 4310 }, { "epoch": 5.33704390847248, "grad_norm": 0.341796875, "learning_rate": 0.00010557007297820468, "loss": 0.7931, "step": 4315 }, { "epoch": 5.343228200371057, "grad_norm": 0.333984375, "learning_rate": 0.0001053543894032493, "loss": 0.7895, "step": 4320 }, { "epoch": 5.349412492269635, "grad_norm": 0.34375, "learning_rate": 0.00010513868084545446, "loss": 0.7927, "step": 4325 }, { "epoch": 5.355596784168212, "grad_norm": 0.318359375, "learning_rate": 0.00010492294831128641, "loss": 0.7862, "step": 4330 }, { "epoch": 5.361781076066791, "grad_norm": 0.3359375, "learning_rate": 0.00010470719280732333, "loss": 0.7832, "step": 4335 }, { "epoch": 5.367965367965368, "grad_norm": 0.3671875, "learning_rate": 0.00010449141534025045, "loss": 0.786, "step": 4340 }, { "epoch": 5.374149659863946, "grad_norm": 0.328125, "learning_rate": 0.00010427561691685557, "loss": 0.7869, "step": 4345 }, { "epoch": 5.380333951762523, "grad_norm": 0.337890625, "learning_rate": 0.00010405979854402425, "loss": 0.7865, "step": 4350 }, { "epoch": 5.386518243661101, "grad_norm": 0.349609375, "learning_rate": 0.00010384396122873515, "loss": 0.7944, "step": 4355 }, { "epoch": 5.392702535559678, "grad_norm": 0.326171875, "learning_rate": 0.00010362810597805526, "loss": 0.7856, "step": 4360 }, { "epoch": 5.398886827458256, "grad_norm": 0.3671875, "learning_rate": 0.00010341223379913534, "loss": 0.7869, "step": 4365 }, { "epoch": 5.405071119356833, "grad_norm": 0.38671875, "learning_rate": 0.00010319634569920504, "loss": 0.7841, "step": 4370 }, { "epoch": 5.411255411255412, "grad_norm": 0.328125, "learning_rate": 0.00010298044268556832, "loss": 0.7895, "step": 4375 }, { "epoch": 5.417439703153989, "grad_norm": 0.359375, "learning_rate": 0.00010276452576559879, "loss": 0.7905, "step": 4380 }, { "epoch": 5.423623995052567, "grad_norm": 0.3359375, "learning_rate": 0.0001025485959467349, "loss": 0.7868, "step": 4385 }, { "epoch": 5.429808286951144, "grad_norm": 0.34375, "learning_rate": 0.00010233265423647523, "loss": 0.7945, "step": 4390 }, { "epoch": 5.435992578849722, "grad_norm": 0.357421875, "learning_rate": 0.00010211670164237392, "loss": 0.7897, "step": 4395 }, { "epoch": 5.442176870748299, "grad_norm": 0.34375, "learning_rate": 0.00010190073917203589, "loss": 0.7897, "step": 4400 }, { "epoch": 5.448361162646877, "grad_norm": 0.333984375, "learning_rate": 0.0001016847678331121, "loss": 0.7912, "step": 4405 }, { "epoch": 5.454545454545454, "grad_norm": 0.345703125, "learning_rate": 0.00010146878863329492, "loss": 0.7858, "step": 4410 }, { "epoch": 5.4607297464440325, "grad_norm": 0.32421875, "learning_rate": 0.00010125280258031335, "loss": 0.7962, "step": 4415 }, { "epoch": 5.46691403834261, "grad_norm": 0.35546875, "learning_rate": 0.00010103681068192845, "loss": 0.7914, "step": 4420 }, { "epoch": 5.4730983302411875, "grad_norm": 0.384765625, "learning_rate": 0.00010082081394592851, "loss": 0.7891, "step": 4425 }, { "epoch": 5.479282622139765, "grad_norm": 0.3515625, "learning_rate": 0.00010060481338012435, "loss": 0.7881, "step": 4430 }, { "epoch": 5.4854669140383425, "grad_norm": 0.341796875, "learning_rate": 0.0001003888099923447, "loss": 0.796, "step": 4435 }, { "epoch": 5.49165120593692, "grad_norm": 0.349609375, "learning_rate": 0.00010017280479043147, "loss": 0.8005, "step": 4440 }, { "epoch": 5.4978354978354975, "grad_norm": 0.359375, "learning_rate": 9.995679878223505e-05, "loss": 0.7964, "step": 4445 }, { "epoch": 5.504019789734075, "grad_norm": 0.330078125, "learning_rate": 9.97407929756095e-05, "loss": 0.7767, "step": 4450 }, { "epoch": 5.510204081632653, "grad_norm": 0.35546875, "learning_rate": 9.952478837840803e-05, "loss": 0.7961, "step": 4455 }, { "epoch": 5.516388373531231, "grad_norm": 0.357421875, "learning_rate": 9.930878599847821e-05, "loss": 0.795, "step": 4460 }, { "epoch": 5.522572665429808, "grad_norm": 0.34765625, "learning_rate": 9.909278684365718e-05, "loss": 0.7861, "step": 4465 }, { "epoch": 5.528756957328386, "grad_norm": 0.34765625, "learning_rate": 9.887679192176712e-05, "loss": 0.7909, "step": 4470 }, { "epoch": 5.534941249226963, "grad_norm": 0.361328125, "learning_rate": 9.866080224061038e-05, "loss": 0.7894, "step": 4475 }, { "epoch": 5.541125541125541, "grad_norm": 0.337890625, "learning_rate": 9.844481880796491e-05, "loss": 0.7904, "step": 4480 }, { "epoch": 5.547309833024118, "grad_norm": 0.33203125, "learning_rate": 9.822884263157957e-05, "loss": 0.7901, "step": 4485 }, { "epoch": 5.553494124922697, "grad_norm": 0.33984375, "learning_rate": 9.801287471916919e-05, "loss": 0.8019, "step": 4490 }, { "epoch": 5.559678416821274, "grad_norm": 0.330078125, "learning_rate": 9.779691607841019e-05, "loss": 0.7863, "step": 4495 }, { "epoch": 5.565862708719852, "grad_norm": 0.35546875, "learning_rate": 9.758096771693573e-05, "loss": 0.7933, "step": 4500 }, { "epoch": 5.572047000618429, "grad_norm": 0.337890625, "learning_rate": 9.736503064233086e-05, "loss": 0.7874, "step": 4505 }, { "epoch": 5.578231292517007, "grad_norm": 0.341796875, "learning_rate": 9.714910586212816e-05, "loss": 0.7938, "step": 4510 }, { "epoch": 5.584415584415584, "grad_norm": 0.349609375, "learning_rate": 9.693319438380266e-05, "loss": 0.7906, "step": 4515 }, { "epoch": 5.590599876314162, "grad_norm": 0.357421875, "learning_rate": 9.671729721476746e-05, "loss": 0.7907, "step": 4520 }, { "epoch": 5.596784168212739, "grad_norm": 0.349609375, "learning_rate": 9.650141536236889e-05, "loss": 0.7948, "step": 4525 }, { "epoch": 5.602968460111317, "grad_norm": 0.330078125, "learning_rate": 9.628554983388173e-05, "loss": 0.7899, "step": 4530 }, { "epoch": 5.609152752009895, "grad_norm": 0.33984375, "learning_rate": 9.606970163650465e-05, "loss": 0.7881, "step": 4535 }, { "epoch": 5.615337043908473, "grad_norm": 0.341796875, "learning_rate": 9.585387177735547e-05, "loss": 0.7821, "step": 4540 }, { "epoch": 5.62152133580705, "grad_norm": 0.3359375, "learning_rate": 9.563806126346642e-05, "loss": 0.7883, "step": 4545 }, { "epoch": 5.627705627705628, "grad_norm": 0.3359375, "learning_rate": 9.542227110177945e-05, "loss": 0.7947, "step": 4550 }, { "epoch": 5.633889919604205, "grad_norm": 0.341796875, "learning_rate": 9.520650229914157e-05, "loss": 0.7849, "step": 4555 }, { "epoch": 5.640074211502783, "grad_norm": 0.326171875, "learning_rate": 9.499075586230013e-05, "loss": 0.7893, "step": 4560 }, { "epoch": 5.646258503401361, "grad_norm": 0.326171875, "learning_rate": 9.477503279789817e-05, "loss": 0.7945, "step": 4565 }, { "epoch": 5.6524427952999385, "grad_norm": 0.3359375, "learning_rate": 9.455933411246958e-05, "loss": 0.7949, "step": 4570 }, { "epoch": 5.658627087198516, "grad_norm": 0.36328125, "learning_rate": 9.43436608124346e-05, "loss": 0.7806, "step": 4575 }, { "epoch": 5.6648113790970935, "grad_norm": 0.359375, "learning_rate": 9.412801390409497e-05, "loss": 0.7862, "step": 4580 }, { "epoch": 5.670995670995671, "grad_norm": 0.349609375, "learning_rate": 9.391239439362928e-05, "loss": 0.7911, "step": 4585 }, { "epoch": 5.6771799628942485, "grad_norm": 0.3359375, "learning_rate": 9.369680328708836e-05, "loss": 0.7926, "step": 4590 }, { "epoch": 5.683364254792826, "grad_norm": 0.373046875, "learning_rate": 9.348124159039036e-05, "loss": 0.7925, "step": 4595 }, { "epoch": 5.6895485466914035, "grad_norm": 0.357421875, "learning_rate": 9.326571030931637e-05, "loss": 0.7949, "step": 4600 }, { "epoch": 5.695732838589981, "grad_norm": 0.330078125, "learning_rate": 9.305021044950552e-05, "loss": 0.7953, "step": 4605 }, { "epoch": 5.701917130488559, "grad_norm": 0.330078125, "learning_rate": 9.283474301645026e-05, "loss": 0.7858, "step": 4610 }, { "epoch": 5.708101422387137, "grad_norm": 0.33203125, "learning_rate": 9.261930901549181e-05, "loss": 0.7911, "step": 4615 }, { "epoch": 5.714285714285714, "grad_norm": 0.337890625, "learning_rate": 9.240390945181543e-05, "loss": 0.7924, "step": 4620 }, { "epoch": 5.720470006184292, "grad_norm": 0.341796875, "learning_rate": 9.21885453304456e-05, "loss": 0.7896, "step": 4625 }, { "epoch": 5.726654298082869, "grad_norm": 0.330078125, "learning_rate": 9.197321765624152e-05, "loss": 0.7911, "step": 4630 }, { "epoch": 5.732838589981447, "grad_norm": 0.353515625, "learning_rate": 9.175792743389227e-05, "loss": 0.788, "step": 4635 }, { "epoch": 5.739022881880024, "grad_norm": 0.341796875, "learning_rate": 9.154267566791223e-05, "loss": 0.804, "step": 4640 }, { "epoch": 5.745207173778603, "grad_norm": 0.341796875, "learning_rate": 9.132746336263632e-05, "loss": 0.7906, "step": 4645 }, { "epoch": 5.75139146567718, "grad_norm": 0.359375, "learning_rate": 9.111229152221535e-05, "loss": 0.7834, "step": 4650 }, { "epoch": 5.757575757575758, "grad_norm": 0.328125, "learning_rate": 9.089716115061135e-05, "loss": 0.7897, "step": 4655 }, { "epoch": 5.763760049474335, "grad_norm": 0.33203125, "learning_rate": 9.068207325159284e-05, "loss": 0.7916, "step": 4660 }, { "epoch": 5.769944341372913, "grad_norm": 0.33203125, "learning_rate": 9.046702882873016e-05, "loss": 0.787, "step": 4665 }, { "epoch": 5.77612863327149, "grad_norm": 0.337890625, "learning_rate": 9.02520288853908e-05, "loss": 0.7902, "step": 4670 }, { "epoch": 5.782312925170068, "grad_norm": 0.337890625, "learning_rate": 9.003707442473479e-05, "loss": 0.793, "step": 4675 }, { "epoch": 5.788497217068645, "grad_norm": 0.357421875, "learning_rate": 8.982216644970979e-05, "loss": 0.7871, "step": 4680 }, { "epoch": 5.794681508967223, "grad_norm": 0.37890625, "learning_rate": 8.960730596304664e-05, "loss": 0.7816, "step": 4685 }, { "epoch": 5.800865800865801, "grad_norm": 0.376953125, "learning_rate": 8.939249396725467e-05, "loss": 0.7936, "step": 4690 }, { "epoch": 5.807050092764379, "grad_norm": 0.345703125, "learning_rate": 8.917773146461692e-05, "loss": 0.783, "step": 4695 }, { "epoch": 5.813234384662956, "grad_norm": 0.361328125, "learning_rate": 8.896301945718541e-05, "loss": 0.7901, "step": 4700 }, { "epoch": 5.819418676561534, "grad_norm": 0.330078125, "learning_rate": 8.87483589467767e-05, "loss": 0.7946, "step": 4705 }, { "epoch": 5.825602968460111, "grad_norm": 0.333984375, "learning_rate": 8.853375093496699e-05, "loss": 0.7934, "step": 4710 }, { "epoch": 5.831787260358689, "grad_norm": 0.369140625, "learning_rate": 8.831919642308756e-05, "loss": 0.7853, "step": 4715 }, { "epoch": 5.837971552257266, "grad_norm": 0.388671875, "learning_rate": 8.810469641222001e-05, "loss": 0.795, "step": 4720 }, { "epoch": 5.8441558441558445, "grad_norm": 0.35546875, "learning_rate": 8.789025190319169e-05, "loss": 0.7956, "step": 4725 }, { "epoch": 5.850340136054422, "grad_norm": 0.36328125, "learning_rate": 8.767586389657098e-05, "loss": 0.795, "step": 4730 }, { "epoch": 5.8565244279529995, "grad_norm": 0.365234375, "learning_rate": 8.746153339266269e-05, "loss": 0.7876, "step": 4735 }, { "epoch": 5.862708719851577, "grad_norm": 0.34765625, "learning_rate": 8.724726139150318e-05, "loss": 0.7828, "step": 4740 }, { "epoch": 5.8688930117501545, "grad_norm": 0.369140625, "learning_rate": 8.7033048892856e-05, "loss": 0.7989, "step": 4745 }, { "epoch": 5.875077303648732, "grad_norm": 0.369140625, "learning_rate": 8.6818896896207e-05, "loss": 0.7921, "step": 4750 }, { "epoch": 5.8812615955473095, "grad_norm": 0.3515625, "learning_rate": 8.66048064007597e-05, "loss": 0.7923, "step": 4755 }, { "epoch": 5.887445887445887, "grad_norm": 0.357421875, "learning_rate": 8.639077840543077e-05, "loss": 0.796, "step": 4760 }, { "epoch": 5.893630179344465, "grad_norm": 0.3359375, "learning_rate": 8.617681390884512e-05, "loss": 0.7927, "step": 4765 }, { "epoch": 5.899814471243043, "grad_norm": 0.330078125, "learning_rate": 8.596291390933147e-05, "loss": 0.7888, "step": 4770 }, { "epoch": 5.90599876314162, "grad_norm": 0.330078125, "learning_rate": 8.574907940491767e-05, "loss": 0.7926, "step": 4775 }, { "epoch": 5.912183055040198, "grad_norm": 0.341796875, "learning_rate": 8.553531139332582e-05, "loss": 0.7997, "step": 4780 }, { "epoch": 5.918367346938775, "grad_norm": 0.337890625, "learning_rate": 8.532161087196791e-05, "loss": 0.7912, "step": 4785 }, { "epoch": 5.924551638837353, "grad_norm": 0.341796875, "learning_rate": 8.510797883794097e-05, "loss": 0.794, "step": 4790 }, { "epoch": 5.93073593073593, "grad_norm": 0.341796875, "learning_rate": 8.489441628802246e-05, "loss": 0.7995, "step": 4795 }, { "epoch": 5.936920222634509, "grad_norm": 0.33984375, "learning_rate": 8.468092421866573e-05, "loss": 0.7934, "step": 4800 }, { "epoch": 5.943104514533086, "grad_norm": 0.326171875, "learning_rate": 8.446750362599513e-05, "loss": 0.7781, "step": 4805 }, { "epoch": 5.949288806431664, "grad_norm": 0.328125, "learning_rate": 8.425415550580162e-05, "loss": 0.784, "step": 4810 }, { "epoch": 5.955473098330241, "grad_norm": 0.3359375, "learning_rate": 8.4040880853538e-05, "loss": 0.7875, "step": 4815 }, { "epoch": 5.961657390228819, "grad_norm": 0.33984375, "learning_rate": 8.382768066431425e-05, "loss": 0.7874, "step": 4820 }, { "epoch": 5.967841682127396, "grad_norm": 0.349609375, "learning_rate": 8.361455593289292e-05, "loss": 0.7898, "step": 4825 }, { "epoch": 5.974025974025974, "grad_norm": 0.345703125, "learning_rate": 8.340150765368452e-05, "loss": 0.7973, "step": 4830 }, { "epoch": 5.980210265924551, "grad_norm": 0.349609375, "learning_rate": 8.318853682074278e-05, "loss": 0.7854, "step": 4835 }, { "epoch": 5.986394557823129, "grad_norm": 0.345703125, "learning_rate": 8.297564442776014e-05, "loss": 0.7794, "step": 4840 }, { "epoch": 5.992578849721707, "grad_norm": 0.333984375, "learning_rate": 8.276283146806304e-05, "loss": 0.8008, "step": 4845 }, { "epoch": 5.998763141620285, "grad_norm": 0.333984375, "learning_rate": 8.255009893460724e-05, "loss": 0.7949, "step": 4850 }, { "epoch": 6.0, "eval_loss": 2.6445889472961426, "eval_runtime": 0.538, "eval_samples_per_second": 18.587, "eval_steps_per_second": 1.859, "step": 4851 }, { "epoch": 6.004947433518862, "grad_norm": 0.337890625, "learning_rate": 8.233744781997329e-05, "loss": 0.7779, "step": 4855 }, { "epoch": 6.01113172541744, "grad_norm": 0.33984375, "learning_rate": 8.212487911636184e-05, "loss": 0.7803, "step": 4860 }, { "epoch": 6.017316017316017, "grad_norm": 0.34375, "learning_rate": 8.191239381558904e-05, "loss": 0.779, "step": 4865 }, { "epoch": 6.023500309214595, "grad_norm": 0.3515625, "learning_rate": 8.169999290908188e-05, "loss": 0.7566, "step": 4870 }, { "epoch": 6.029684601113172, "grad_norm": 0.3359375, "learning_rate": 8.148767738787355e-05, "loss": 0.7617, "step": 4875 }, { "epoch": 6.035868893011751, "grad_norm": 0.341796875, "learning_rate": 8.127544824259889e-05, "loss": 0.7754, "step": 4880 }, { "epoch": 6.042053184910328, "grad_norm": 0.32421875, "learning_rate": 8.106330646348972e-05, "loss": 0.7757, "step": 4885 }, { "epoch": 6.0482374768089056, "grad_norm": 0.341796875, "learning_rate": 8.085125304037018e-05, "loss": 0.7626, "step": 4890 }, { "epoch": 6.054421768707483, "grad_norm": 0.361328125, "learning_rate": 8.063928896265217e-05, "loss": 0.7734, "step": 4895 }, { "epoch": 6.0606060606060606, "grad_norm": 0.376953125, "learning_rate": 8.042741521933071e-05, "loss": 0.7789, "step": 4900 }, { "epoch": 6.066790352504638, "grad_norm": 0.361328125, "learning_rate": 8.021563279897938e-05, "loss": 0.7702, "step": 4905 }, { "epoch": 6.0729746444032155, "grad_norm": 0.34765625, "learning_rate": 8.000394268974563e-05, "loss": 0.7761, "step": 4910 }, { "epoch": 6.079158936301793, "grad_norm": 0.3671875, "learning_rate": 7.979234587934616e-05, "loss": 0.7661, "step": 4915 }, { "epoch": 6.085343228200371, "grad_norm": 0.361328125, "learning_rate": 7.958084335506239e-05, "loss": 0.7652, "step": 4920 }, { "epoch": 6.091527520098949, "grad_norm": 0.3359375, "learning_rate": 7.936943610373584e-05, "loss": 0.7751, "step": 4925 }, { "epoch": 6.097711811997526, "grad_norm": 0.337890625, "learning_rate": 7.915812511176347e-05, "loss": 0.7661, "step": 4930 }, { "epoch": 6.103896103896104, "grad_norm": 0.33984375, "learning_rate": 7.894691136509305e-05, "loss": 0.7664, "step": 4935 }, { "epoch": 6.110080395794681, "grad_norm": 0.37109375, "learning_rate": 7.873579584921869e-05, "loss": 0.7732, "step": 4940 }, { "epoch": 6.116264687693259, "grad_norm": 0.3515625, "learning_rate": 7.852477954917618e-05, "loss": 0.7684, "step": 4945 }, { "epoch": 6.122448979591836, "grad_norm": 0.34765625, "learning_rate": 7.831386344953836e-05, "loss": 0.7782, "step": 4950 }, { "epoch": 6.128633271490414, "grad_norm": 0.34765625, "learning_rate": 7.810304853441051e-05, "loss": 0.7699, "step": 4955 }, { "epoch": 6.134817563388992, "grad_norm": 0.34765625, "learning_rate": 7.789233578742582e-05, "loss": 0.7796, "step": 4960 }, { "epoch": 6.14100185528757, "grad_norm": 0.357421875, "learning_rate": 7.768172619174086e-05, "loss": 0.771, "step": 4965 }, { "epoch": 6.147186147186147, "grad_norm": 0.349609375, "learning_rate": 7.747122073003075e-05, "loss": 0.7766, "step": 4970 }, { "epoch": 6.153370439084725, "grad_norm": 0.345703125, "learning_rate": 7.72608203844849e-05, "loss": 0.7733, "step": 4975 }, { "epoch": 6.159554730983302, "grad_norm": 0.341796875, "learning_rate": 7.705052613680211e-05, "loss": 0.7719, "step": 4980 }, { "epoch": 6.16573902288188, "grad_norm": 0.35546875, "learning_rate": 7.684033896818627e-05, "loss": 0.7711, "step": 4985 }, { "epoch": 6.171923314780457, "grad_norm": 0.345703125, "learning_rate": 7.663025985934158e-05, "loss": 0.775, "step": 4990 }, { "epoch": 6.178107606679035, "grad_norm": 0.330078125, "learning_rate": 7.642028979046807e-05, "loss": 0.7742, "step": 4995 }, { "epoch": 6.184291898577613, "grad_norm": 0.361328125, "learning_rate": 7.6210429741257e-05, "loss": 0.7785, "step": 5000 }, { "epoch": 6.190476190476191, "grad_norm": 0.33203125, "learning_rate": 7.600068069088634e-05, "loss": 0.7774, "step": 5005 }, { "epoch": 6.196660482374768, "grad_norm": 0.330078125, "learning_rate": 7.579104361801605e-05, "loss": 0.7792, "step": 5010 }, { "epoch": 6.202844774273346, "grad_norm": 0.3515625, "learning_rate": 7.558151950078376e-05, "loss": 0.7627, "step": 5015 }, { "epoch": 6.209029066171923, "grad_norm": 0.353515625, "learning_rate": 7.537210931679987e-05, "loss": 0.7706, "step": 5020 }, { "epoch": 6.215213358070501, "grad_norm": 0.3359375, "learning_rate": 7.516281404314341e-05, "loss": 0.7667, "step": 5025 }, { "epoch": 6.221397649969078, "grad_norm": 0.3359375, "learning_rate": 7.495363465635708e-05, "loss": 0.7715, "step": 5030 }, { "epoch": 6.227581941867657, "grad_norm": 0.3359375, "learning_rate": 7.474457213244293e-05, "loss": 0.7713, "step": 5035 }, { "epoch": 6.233766233766234, "grad_norm": 0.361328125, "learning_rate": 7.453562744685778e-05, "loss": 0.7678, "step": 5040 }, { "epoch": 6.239950525664812, "grad_norm": 0.353515625, "learning_rate": 7.432680157450857e-05, "loss": 0.771, "step": 5045 }, { "epoch": 6.246134817563389, "grad_norm": 0.353515625, "learning_rate": 7.411809548974792e-05, "loss": 0.7679, "step": 5050 }, { "epoch": 6.252319109461967, "grad_norm": 0.359375, "learning_rate": 7.390951016636952e-05, "loss": 0.7661, "step": 5055 }, { "epoch": 6.258503401360544, "grad_norm": 0.359375, "learning_rate": 7.370104657760361e-05, "loss": 0.7689, "step": 5060 }, { "epoch": 6.264687693259122, "grad_norm": 0.34375, "learning_rate": 7.34927056961124e-05, "loss": 0.7744, "step": 5065 }, { "epoch": 6.270871985157699, "grad_norm": 0.3359375, "learning_rate": 7.328448849398558e-05, "loss": 0.7728, "step": 5070 }, { "epoch": 6.2770562770562774, "grad_norm": 0.33203125, "learning_rate": 7.307639594273581e-05, "loss": 0.7649, "step": 5075 }, { "epoch": 6.283240568954855, "grad_norm": 0.34375, "learning_rate": 7.286842901329412e-05, "loss": 0.7695, "step": 5080 }, { "epoch": 6.289424860853432, "grad_norm": 0.341796875, "learning_rate": 7.266058867600537e-05, "loss": 0.7775, "step": 5085 }, { "epoch": 6.29560915275201, "grad_norm": 0.330078125, "learning_rate": 7.245287590062384e-05, "loss": 0.775, "step": 5090 }, { "epoch": 6.301793444650587, "grad_norm": 0.345703125, "learning_rate": 7.224529165630856e-05, "loss": 0.7714, "step": 5095 }, { "epoch": 6.307977736549165, "grad_norm": 0.365234375, "learning_rate": 7.203783691161883e-05, "loss": 0.7614, "step": 5100 }, { "epoch": 6.314162028447742, "grad_norm": 0.361328125, "learning_rate": 7.183051263450983e-05, "loss": 0.7715, "step": 5105 }, { "epoch": 6.32034632034632, "grad_norm": 0.345703125, "learning_rate": 7.162331979232783e-05, "loss": 0.7729, "step": 5110 }, { "epoch": 6.326530612244898, "grad_norm": 0.34375, "learning_rate": 7.1416259351806e-05, "loss": 0.7706, "step": 5115 }, { "epoch": 6.332714904143476, "grad_norm": 0.34375, "learning_rate": 7.12093322790597e-05, "loss": 0.7749, "step": 5120 }, { "epoch": 6.338899196042053, "grad_norm": 0.35546875, "learning_rate": 7.100253953958195e-05, "loss": 0.7782, "step": 5125 }, { "epoch": 6.345083487940631, "grad_norm": 0.36328125, "learning_rate": 7.079588209823906e-05, "loss": 0.7707, "step": 5130 }, { "epoch": 6.351267779839208, "grad_norm": 0.357421875, "learning_rate": 7.058936091926603e-05, "loss": 0.7723, "step": 5135 }, { "epoch": 6.357452071737786, "grad_norm": 0.349609375, "learning_rate": 7.038297696626206e-05, "loss": 0.7736, "step": 5140 }, { "epoch": 6.363636363636363, "grad_norm": 0.33984375, "learning_rate": 7.017673120218615e-05, "loss": 0.7765, "step": 5145 }, { "epoch": 6.369820655534941, "grad_norm": 0.34375, "learning_rate": 6.99706245893524e-05, "loss": 0.7703, "step": 5150 }, { "epoch": 6.376004947433519, "grad_norm": 0.341796875, "learning_rate": 6.976465808942576e-05, "loss": 0.7717, "step": 5155 }, { "epoch": 6.382189239332097, "grad_norm": 0.341796875, "learning_rate": 6.955883266341741e-05, "loss": 0.773, "step": 5160 }, { "epoch": 6.388373531230674, "grad_norm": 0.345703125, "learning_rate": 6.935314927168026e-05, "loss": 0.781, "step": 5165 }, { "epoch": 6.394557823129252, "grad_norm": 0.3515625, "learning_rate": 6.914760887390452e-05, "loss": 0.766, "step": 5170 }, { "epoch": 6.400742115027829, "grad_norm": 0.392578125, "learning_rate": 6.894221242911329e-05, "loss": 0.7808, "step": 5175 }, { "epoch": 6.406926406926407, "grad_norm": 0.33203125, "learning_rate": 6.873696089565786e-05, "loss": 0.769, "step": 5180 }, { "epoch": 6.413110698824984, "grad_norm": 0.333984375, "learning_rate": 6.853185523121348e-05, "loss": 0.7708, "step": 5185 }, { "epoch": 6.419294990723563, "grad_norm": 0.33984375, "learning_rate": 6.832689639277484e-05, "loss": 0.7689, "step": 5190 }, { "epoch": 6.42547928262214, "grad_norm": 0.357421875, "learning_rate": 6.812208533665141e-05, "loss": 0.7797, "step": 5195 }, { "epoch": 6.431663574520718, "grad_norm": 0.337890625, "learning_rate": 6.791742301846326e-05, "loss": 0.7686, "step": 5200 }, { "epoch": 6.437847866419295, "grad_norm": 0.3359375, "learning_rate": 6.77129103931364e-05, "loss": 0.7735, "step": 5205 }, { "epoch": 6.444032158317873, "grad_norm": 0.369140625, "learning_rate": 6.750854841489842e-05, "loss": 0.7708, "step": 5210 }, { "epoch": 6.45021645021645, "grad_norm": 0.3359375, "learning_rate": 6.730433803727407e-05, "loss": 0.765, "step": 5215 }, { "epoch": 6.456400742115028, "grad_norm": 0.361328125, "learning_rate": 6.710028021308061e-05, "loss": 0.7662, "step": 5220 }, { "epoch": 6.462585034013605, "grad_norm": 0.34765625, "learning_rate": 6.689637589442361e-05, "loss": 0.7692, "step": 5225 }, { "epoch": 6.4687693259121835, "grad_norm": 0.35546875, "learning_rate": 6.669262603269246e-05, "loss": 0.7819, "step": 5230 }, { "epoch": 6.474953617810761, "grad_norm": 0.34375, "learning_rate": 6.64890315785557e-05, "loss": 0.7696, "step": 5235 }, { "epoch": 6.4811379097093385, "grad_norm": 0.3359375, "learning_rate": 6.62855934819569e-05, "loss": 0.7709, "step": 5240 }, { "epoch": 6.487322201607916, "grad_norm": 0.330078125, "learning_rate": 6.608231269211002e-05, "loss": 0.7673, "step": 5245 }, { "epoch": 6.4935064935064934, "grad_norm": 0.34375, "learning_rate": 6.587919015749511e-05, "loss": 0.7745, "step": 5250 }, { "epoch": 6.499690785405071, "grad_norm": 0.333984375, "learning_rate": 6.56762268258538e-05, "loss": 0.7677, "step": 5255 }, { "epoch": 6.5058750773036484, "grad_norm": 0.353515625, "learning_rate": 6.547342364418481e-05, "loss": 0.7729, "step": 5260 }, { "epoch": 6.512059369202227, "grad_norm": 0.337890625, "learning_rate": 6.527078155873978e-05, "loss": 0.7756, "step": 5265 }, { "epoch": 6.518243661100804, "grad_norm": 0.3359375, "learning_rate": 6.506830151501861e-05, "loss": 0.7707, "step": 5270 }, { "epoch": 6.524427952999382, "grad_norm": 0.341796875, "learning_rate": 6.486598445776513e-05, "loss": 0.7715, "step": 5275 }, { "epoch": 6.530612244897959, "grad_norm": 0.33203125, "learning_rate": 6.466383133096267e-05, "loss": 0.7745, "step": 5280 }, { "epoch": 6.536796536796537, "grad_norm": 0.33984375, "learning_rate": 6.446184307782978e-05, "loss": 0.7744, "step": 5285 }, { "epoch": 6.542980828695114, "grad_norm": 0.3515625, "learning_rate": 6.426002064081565e-05, "loss": 0.779, "step": 5290 }, { "epoch": 6.549165120593692, "grad_norm": 0.345703125, "learning_rate": 6.405836496159585e-05, "loss": 0.7714, "step": 5295 }, { "epoch": 6.555349412492269, "grad_norm": 0.349609375, "learning_rate": 6.385687698106781e-05, "loss": 0.775, "step": 5300 }, { "epoch": 6.561533704390847, "grad_norm": 0.34375, "learning_rate": 6.365555763934656e-05, "loss": 0.7724, "step": 5305 }, { "epoch": 6.567717996289425, "grad_norm": 0.34765625, "learning_rate": 6.345440787576031e-05, "loss": 0.777, "step": 5310 }, { "epoch": 6.573902288188003, "grad_norm": 0.33203125, "learning_rate": 6.3253428628846e-05, "loss": 0.7853, "step": 5315 }, { "epoch": 6.58008658008658, "grad_norm": 0.349609375, "learning_rate": 6.305262083634488e-05, "loss": 0.7741, "step": 5320 }, { "epoch": 6.586270871985158, "grad_norm": 0.345703125, "learning_rate": 6.285198543519835e-05, "loss": 0.7777, "step": 5325 }, { "epoch": 6.592455163883735, "grad_norm": 0.3359375, "learning_rate": 6.265152336154345e-05, "loss": 0.7692, "step": 5330 }, { "epoch": 6.598639455782313, "grad_norm": 0.33203125, "learning_rate": 6.245123555070838e-05, "loss": 0.7723, "step": 5335 }, { "epoch": 6.60482374768089, "grad_norm": 0.353515625, "learning_rate": 6.225112293720836e-05, "loss": 0.7705, "step": 5340 }, { "epoch": 6.611008039579469, "grad_norm": 0.330078125, "learning_rate": 6.205118645474115e-05, "loss": 0.7708, "step": 5345 }, { "epoch": 6.617192331478046, "grad_norm": 0.345703125, "learning_rate": 6.18514270361827e-05, "loss": 0.7716, "step": 5350 }, { "epoch": 6.623376623376624, "grad_norm": 0.34765625, "learning_rate": 6.165184561358275e-05, "loss": 0.7692, "step": 5355 }, { "epoch": 6.629560915275201, "grad_norm": 0.35546875, "learning_rate": 6.145244311816063e-05, "loss": 0.768, "step": 5360 }, { "epoch": 6.635745207173779, "grad_norm": 0.337890625, "learning_rate": 6.125322048030072e-05, "loss": 0.7651, "step": 5365 }, { "epoch": 6.641929499072356, "grad_norm": 0.357421875, "learning_rate": 6.105417862954828e-05, "loss": 0.7685, "step": 5370 }, { "epoch": 6.648113790970934, "grad_norm": 0.3515625, "learning_rate": 6.0855318494605004e-05, "loss": 0.7773, "step": 5375 }, { "epoch": 6.654298082869511, "grad_norm": 0.33984375, "learning_rate": 6.065664100332478e-05, "loss": 0.7813, "step": 5380 }, { "epoch": 6.660482374768089, "grad_norm": 0.341796875, "learning_rate": 6.045814708270925e-05, "loss": 0.7759, "step": 5385 }, { "epoch": 6.666666666666667, "grad_norm": 0.3359375, "learning_rate": 6.025983765890353e-05, "loss": 0.7689, "step": 5390 }, { "epoch": 6.6728509585652445, "grad_norm": 0.34765625, "learning_rate": 6.0061713657191976e-05, "loss": 0.7735, "step": 5395 }, { "epoch": 6.679035250463822, "grad_norm": 0.345703125, "learning_rate": 5.986377600199371e-05, "loss": 0.7647, "step": 5400 }, { "epoch": 6.6852195423623995, "grad_norm": 0.34375, "learning_rate": 5.9666025616858475e-05, "loss": 0.7687, "step": 5405 }, { "epoch": 6.691403834260977, "grad_norm": 0.330078125, "learning_rate": 5.946846342446214e-05, "loss": 0.7685, "step": 5410 }, { "epoch": 6.6975881261595545, "grad_norm": 0.3359375, "learning_rate": 5.927109034660251e-05, "loss": 0.7727, "step": 5415 }, { "epoch": 6.703772418058133, "grad_norm": 0.33203125, "learning_rate": 5.907390730419507e-05, "loss": 0.7711, "step": 5420 }, { "epoch": 6.70995670995671, "grad_norm": 0.349609375, "learning_rate": 5.887691521726859e-05, "loss": 0.769, "step": 5425 }, { "epoch": 6.716141001855288, "grad_norm": 0.33984375, "learning_rate": 5.868011500496084e-05, "loss": 0.7663, "step": 5430 }, { "epoch": 6.722325293753865, "grad_norm": 0.33984375, "learning_rate": 5.848350758551437e-05, "loss": 0.7712, "step": 5435 }, { "epoch": 6.728509585652443, "grad_norm": 0.333984375, "learning_rate": 5.828709387627218e-05, "loss": 0.7715, "step": 5440 }, { "epoch": 6.73469387755102, "grad_norm": 0.345703125, "learning_rate": 5.80908747936735e-05, "loss": 0.77, "step": 5445 }, { "epoch": 6.740878169449598, "grad_norm": 0.359375, "learning_rate": 5.789485125324926e-05, "loss": 0.786, "step": 5450 }, { "epoch": 6.747062461348175, "grad_norm": 0.337890625, "learning_rate": 5.7699024169618256e-05, "loss": 0.778, "step": 5455 }, { "epoch": 6.753246753246753, "grad_norm": 0.3359375, "learning_rate": 5.750339445648252e-05, "loss": 0.7707, "step": 5460 }, { "epoch": 6.759431045145331, "grad_norm": 0.3515625, "learning_rate": 5.7307963026623146e-05, "loss": 0.7772, "step": 5465 }, { "epoch": 6.765615337043909, "grad_norm": 0.33984375, "learning_rate": 5.7112730791896207e-05, "loss": 0.7712, "step": 5470 }, { "epoch": 6.771799628942486, "grad_norm": 0.359375, "learning_rate": 5.691769866322825e-05, "loss": 0.772, "step": 5475 }, { "epoch": 6.777983920841064, "grad_norm": 0.34375, "learning_rate": 5.6722867550612116e-05, "loss": 0.7651, "step": 5480 }, { "epoch": 6.784168212739641, "grad_norm": 0.345703125, "learning_rate": 5.652823836310288e-05, "loss": 0.7676, "step": 5485 }, { "epoch": 6.790352504638219, "grad_norm": 0.330078125, "learning_rate": 5.633381200881335e-05, "loss": 0.7677, "step": 5490 }, { "epoch": 6.796536796536796, "grad_norm": 0.359375, "learning_rate": 5.613958939490995e-05, "loss": 0.7768, "step": 5495 }, { "epoch": 6.802721088435375, "grad_norm": 0.34375, "learning_rate": 5.5945571427608526e-05, "loss": 0.7766, "step": 5500 }, { "epoch": 6.808905380333952, "grad_norm": 0.337890625, "learning_rate": 5.575175901216999e-05, "loss": 0.7739, "step": 5505 }, { "epoch": 6.81508967223253, "grad_norm": 0.345703125, "learning_rate": 5.555815305289631e-05, "loss": 0.7766, "step": 5510 }, { "epoch": 6.821273964131107, "grad_norm": 0.353515625, "learning_rate": 5.536475445312606e-05, "loss": 0.7739, "step": 5515 }, { "epoch": 6.827458256029685, "grad_norm": 0.34765625, "learning_rate": 5.5171564115230254e-05, "loss": 0.7771, "step": 5520 }, { "epoch": 6.833642547928262, "grad_norm": 0.34375, "learning_rate": 5.4978582940608356e-05, "loss": 0.7792, "step": 5525 }, { "epoch": 6.83982683982684, "grad_norm": 0.37109375, "learning_rate": 5.4785811829683764e-05, "loss": 0.7781, "step": 5530 }, { "epoch": 6.846011131725417, "grad_norm": 0.365234375, "learning_rate": 5.459325168189977e-05, "loss": 0.772, "step": 5535 }, { "epoch": 6.852195423623995, "grad_norm": 0.33203125, "learning_rate": 5.4400903395715366e-05, "loss": 0.7666, "step": 5540 }, { "epoch": 6.858379715522573, "grad_norm": 0.345703125, "learning_rate": 5.4208767868600986e-05, "loss": 0.7778, "step": 5545 }, { "epoch": 6.8645640074211505, "grad_norm": 0.337890625, "learning_rate": 5.401684599703445e-05, "loss": 0.7673, "step": 5550 }, { "epoch": 6.870748299319728, "grad_norm": 0.34765625, "learning_rate": 5.382513867649663e-05, "loss": 0.7766, "step": 5555 }, { "epoch": 6.8769325912183055, "grad_norm": 0.349609375, "learning_rate": 5.363364680146725e-05, "loss": 0.771, "step": 5560 }, { "epoch": 6.883116883116883, "grad_norm": 0.349609375, "learning_rate": 5.3442371265420995e-05, "loss": 0.7721, "step": 5565 }, { "epoch": 6.8893011750154605, "grad_norm": 0.34375, "learning_rate": 5.325131296082298e-05, "loss": 0.7745, "step": 5570 }, { "epoch": 6.895485466914038, "grad_norm": 0.33984375, "learning_rate": 5.306047277912479e-05, "loss": 0.7693, "step": 5575 }, { "epoch": 6.901669758812616, "grad_norm": 0.35546875, "learning_rate": 5.286985161076029e-05, "loss": 0.7751, "step": 5580 }, { "epoch": 6.907854050711194, "grad_norm": 0.345703125, "learning_rate": 5.2679450345141416e-05, "loss": 0.7807, "step": 5585 }, { "epoch": 6.914038342609771, "grad_norm": 0.33984375, "learning_rate": 5.248926987065417e-05, "loss": 0.7848, "step": 5590 }, { "epoch": 6.920222634508349, "grad_norm": 0.34375, "learning_rate": 5.2299311074654265e-05, "loss": 0.7677, "step": 5595 }, { "epoch": 6.926406926406926, "grad_norm": 0.3515625, "learning_rate": 5.210957484346314e-05, "loss": 0.7877, "step": 5600 }, { "epoch": 6.932591218305504, "grad_norm": 0.373046875, "learning_rate": 5.192006206236382e-05, "loss": 0.7775, "step": 5605 }, { "epoch": 6.938775510204081, "grad_norm": 0.341796875, "learning_rate": 5.173077361559665e-05, "loss": 0.7645, "step": 5610 }, { "epoch": 6.944959802102659, "grad_norm": 0.34765625, "learning_rate": 5.154171038635534e-05, "loss": 0.7685, "step": 5615 }, { "epoch": 6.951144094001237, "grad_norm": 0.349609375, "learning_rate": 5.135287325678271e-05, "loss": 0.7718, "step": 5620 }, { "epoch": 6.957328385899815, "grad_norm": 0.34765625, "learning_rate": 5.116426310796663e-05, "loss": 0.7705, "step": 5625 }, { "epoch": 6.963512677798392, "grad_norm": 0.33984375, "learning_rate": 5.0975880819936004e-05, "loss": 0.7672, "step": 5630 }, { "epoch": 6.96969696969697, "grad_norm": 0.341796875, "learning_rate": 5.078772727165646e-05, "loss": 0.7737, "step": 5635 }, { "epoch": 6.975881261595547, "grad_norm": 0.34765625, "learning_rate": 5.059980334102637e-05, "loss": 0.7793, "step": 5640 }, { "epoch": 6.982065553494125, "grad_norm": 0.337890625, "learning_rate": 5.041210990487286e-05, "loss": 0.7691, "step": 5645 }, { "epoch": 6.988249845392702, "grad_norm": 0.36328125, "learning_rate": 5.022464783894744e-05, "loss": 0.7773, "step": 5650 }, { "epoch": 6.994434137291281, "grad_norm": 0.34375, "learning_rate": 5.0037418017922125e-05, "loss": 0.7732, "step": 5655 }, { "epoch": 6.999381570810142, "eval_loss": 2.678618907928467, "eval_runtime": 0.8072, "eval_samples_per_second": 12.388, "eval_steps_per_second": 1.239, "step": 5659 }, { "epoch": 7.000618429189858, "grad_norm": 0.328125, "learning_rate": 4.985042131538545e-05, "loss": 0.7716, "step": 5660 }, { "epoch": 7.006802721088436, "grad_norm": 0.341796875, "learning_rate": 4.966365860383798e-05, "loss": 0.7436, "step": 5665 }, { "epoch": 7.012987012987013, "grad_norm": 0.349609375, "learning_rate": 4.9477130754688775e-05, "loss": 0.7644, "step": 5670 }, { "epoch": 7.019171304885591, "grad_norm": 0.34375, "learning_rate": 4.92908386382509e-05, "loss": 0.7701, "step": 5675 }, { "epoch": 7.025355596784168, "grad_norm": 0.341796875, "learning_rate": 4.9104783123737566e-05, "loss": 0.7624, "step": 5680 }, { "epoch": 7.031539888682746, "grad_norm": 0.33984375, "learning_rate": 4.891896507925808e-05, "loss": 0.7688, "step": 5685 }, { "epoch": 7.037724180581323, "grad_norm": 0.353515625, "learning_rate": 4.873338537181368e-05, "loss": 0.7593, "step": 5690 }, { "epoch": 7.0439084724799015, "grad_norm": 0.359375, "learning_rate": 4.854804486729355e-05, "loss": 0.7599, "step": 5695 }, { "epoch": 7.050092764378479, "grad_norm": 0.34765625, "learning_rate": 4.836294443047088e-05, "loss": 0.7649, "step": 5700 }, { "epoch": 7.0562770562770565, "grad_norm": 0.345703125, "learning_rate": 4.817808492499866e-05, "loss": 0.7533, "step": 5705 }, { "epoch": 7.062461348175634, "grad_norm": 0.34375, "learning_rate": 4.7993467213405706e-05, "loss": 0.7545, "step": 5710 }, { "epoch": 7.0686456400742115, "grad_norm": 0.345703125, "learning_rate": 4.780909215709273e-05, "loss": 0.7542, "step": 5715 }, { "epoch": 7.074829931972789, "grad_norm": 0.345703125, "learning_rate": 4.762496061632814e-05, "loss": 0.7556, "step": 5720 }, { "epoch": 7.0810142238713665, "grad_norm": 0.333984375, "learning_rate": 4.744107345024432e-05, "loss": 0.7599, "step": 5725 }, { "epoch": 7.087198515769944, "grad_norm": 0.345703125, "learning_rate": 4.725743151683325e-05, "loss": 0.7591, "step": 5730 }, { "epoch": 7.093382807668522, "grad_norm": 0.337890625, "learning_rate": 4.707403567294275e-05, "loss": 0.7558, "step": 5735 }, { "epoch": 7.0995670995671, "grad_norm": 0.345703125, "learning_rate": 4.689088677427249e-05, "loss": 0.7574, "step": 5740 }, { "epoch": 7.105751391465677, "grad_norm": 0.341796875, "learning_rate": 4.670798567536986e-05, "loss": 0.7602, "step": 5745 }, { "epoch": 7.111935683364255, "grad_norm": 0.33984375, "learning_rate": 4.652533322962597e-05, "loss": 0.7612, "step": 5750 }, { "epoch": 7.118119975262832, "grad_norm": 0.357421875, "learning_rate": 4.6342930289271925e-05, "loss": 0.7663, "step": 5755 }, { "epoch": 7.12430426716141, "grad_norm": 0.361328125, "learning_rate": 4.6160777705374524e-05, "loss": 0.7658, "step": 5760 }, { "epoch": 7.130488559059987, "grad_norm": 0.349609375, "learning_rate": 4.597887632783258e-05, "loss": 0.7526, "step": 5765 }, { "epoch": 7.136672850958565, "grad_norm": 0.34765625, "learning_rate": 4.579722700537268e-05, "loss": 0.7562, "step": 5770 }, { "epoch": 7.142857142857143, "grad_norm": 0.33984375, "learning_rate": 4.561583058554537e-05, "loss": 0.7587, "step": 5775 }, { "epoch": 7.149041434755721, "grad_norm": 0.357421875, "learning_rate": 4.543468791472131e-05, "loss": 0.7509, "step": 5780 }, { "epoch": 7.155225726654298, "grad_norm": 0.341796875, "learning_rate": 4.525379983808706e-05, "loss": 0.7658, "step": 5785 }, { "epoch": 7.161410018552876, "grad_norm": 0.341796875, "learning_rate": 4.5073167199641367e-05, "loss": 0.766, "step": 5790 }, { "epoch": 7.167594310451453, "grad_norm": 0.345703125, "learning_rate": 4.489279084219108e-05, "loss": 0.7609, "step": 5795 }, { "epoch": 7.173778602350031, "grad_norm": 0.349609375, "learning_rate": 4.471267160734731e-05, "loss": 0.7531, "step": 5800 }, { "epoch": 7.179962894248608, "grad_norm": 0.33984375, "learning_rate": 4.453281033552142e-05, "loss": 0.7633, "step": 5805 }, { "epoch": 7.186147186147186, "grad_norm": 0.34375, "learning_rate": 4.43532078659213e-05, "loss": 0.7722, "step": 5810 }, { "epoch": 7.192331478045764, "grad_norm": 0.33984375, "learning_rate": 4.4173865036547105e-05, "loss": 0.7576, "step": 5815 }, { "epoch": 7.198515769944342, "grad_norm": 0.3515625, "learning_rate": 4.399478268418771e-05, "loss": 0.7573, "step": 5820 }, { "epoch": 7.204700061842919, "grad_norm": 0.34375, "learning_rate": 4.3815961644416536e-05, "loss": 0.7523, "step": 5825 }, { "epoch": 7.210884353741497, "grad_norm": 0.353515625, "learning_rate": 4.36374027515878e-05, "loss": 0.7605, "step": 5830 }, { "epoch": 7.217068645640074, "grad_norm": 0.33984375, "learning_rate": 4.3459106838832566e-05, "loss": 0.7668, "step": 5835 }, { "epoch": 7.223252937538652, "grad_norm": 0.34375, "learning_rate": 4.328107473805487e-05, "loss": 0.7547, "step": 5840 }, { "epoch": 7.229437229437229, "grad_norm": 0.34765625, "learning_rate": 4.3103307279927804e-05, "loss": 0.7549, "step": 5845 }, { "epoch": 7.235621521335807, "grad_norm": 0.33203125, "learning_rate": 4.2925805293889786e-05, "loss": 0.7616, "step": 5850 }, { "epoch": 7.241805813234385, "grad_norm": 0.33984375, "learning_rate": 4.274856960814045e-05, "loss": 0.7548, "step": 5855 }, { "epoch": 7.2479901051329625, "grad_norm": 0.341796875, "learning_rate": 4.257160104963696e-05, "loss": 0.7668, "step": 5860 }, { "epoch": 7.25417439703154, "grad_norm": 0.361328125, "learning_rate": 4.2394900444090134e-05, "loss": 0.7602, "step": 5865 }, { "epoch": 7.2603586889301175, "grad_norm": 0.341796875, "learning_rate": 4.2218468615960484e-05, "loss": 0.7551, "step": 5870 }, { "epoch": 7.266542980828695, "grad_norm": 0.3515625, "learning_rate": 4.204230638845458e-05, "loss": 0.7587, "step": 5875 }, { "epoch": 7.2727272727272725, "grad_norm": 0.345703125, "learning_rate": 4.1866414583520877e-05, "loss": 0.7574, "step": 5880 }, { "epoch": 7.27891156462585, "grad_norm": 0.3359375, "learning_rate": 4.169079402184618e-05, "loss": 0.7554, "step": 5885 }, { "epoch": 7.285095856524428, "grad_norm": 0.359375, "learning_rate": 4.1515445522851784e-05, "loss": 0.7646, "step": 5890 }, { "epoch": 7.291280148423006, "grad_norm": 0.35546875, "learning_rate": 4.134036990468946e-05, "loss": 0.7644, "step": 5895 }, { "epoch": 7.297464440321583, "grad_norm": 0.3359375, "learning_rate": 4.1165567984237764e-05, "loss": 0.7581, "step": 5900 }, { "epoch": 7.303648732220161, "grad_norm": 0.337890625, "learning_rate": 4.0991040577098316e-05, "loss": 0.7615, "step": 5905 }, { "epoch": 7.309833024118738, "grad_norm": 0.359375, "learning_rate": 4.081678849759181e-05, "loss": 0.759, "step": 5910 }, { "epoch": 7.316017316017316, "grad_norm": 0.359375, "learning_rate": 4.064281255875429e-05, "loss": 0.7666, "step": 5915 }, { "epoch": 7.322201607915893, "grad_norm": 0.34765625, "learning_rate": 4.046911357233343e-05, "loss": 0.7573, "step": 5920 }, { "epoch": 7.328385899814471, "grad_norm": 0.34765625, "learning_rate": 4.0295692348784586e-05, "loss": 0.7538, "step": 5925 }, { "epoch": 7.334570191713049, "grad_norm": 0.3515625, "learning_rate": 4.0122549697267244e-05, "loss": 0.7705, "step": 5930 }, { "epoch": 7.340754483611627, "grad_norm": 0.34375, "learning_rate": 3.994968642564101e-05, "loss": 0.7636, "step": 5935 }, { "epoch": 7.346938775510204, "grad_norm": 0.34375, "learning_rate": 3.977710334046193e-05, "loss": 0.7644, "step": 5940 }, { "epoch": 7.353123067408782, "grad_norm": 0.35546875, "learning_rate": 3.960480124697885e-05, "loss": 0.7771, "step": 5945 }, { "epoch": 7.359307359307359, "grad_norm": 0.34375, "learning_rate": 3.943278094912946e-05, "loss": 0.7582, "step": 5950 }, { "epoch": 7.365491651205937, "grad_norm": 0.3515625, "learning_rate": 3.926104324953658e-05, "loss": 0.7585, "step": 5955 }, { "epoch": 7.371675943104514, "grad_norm": 0.34375, "learning_rate": 3.9089588949504655e-05, "loss": 0.7614, "step": 5960 }, { "epoch": 7.377860235003092, "grad_norm": 0.345703125, "learning_rate": 3.891841884901557e-05, "loss": 0.7694, "step": 5965 }, { "epoch": 7.38404452690167, "grad_norm": 0.33984375, "learning_rate": 3.874753374672542e-05, "loss": 0.7549, "step": 5970 }, { "epoch": 7.390228818800248, "grad_norm": 0.345703125, "learning_rate": 3.857693443996038e-05, "loss": 0.7721, "step": 5975 }, { "epoch": 7.396413110698825, "grad_norm": 0.3359375, "learning_rate": 3.840662172471315e-05, "loss": 0.762, "step": 5980 }, { "epoch": 7.402597402597403, "grad_norm": 0.3359375, "learning_rate": 3.8236596395639354e-05, "loss": 0.7569, "step": 5985 }, { "epoch": 7.40878169449598, "grad_norm": 0.349609375, "learning_rate": 3.806685924605361e-05, "loss": 0.7617, "step": 5990 }, { "epoch": 7.414965986394558, "grad_norm": 0.33984375, "learning_rate": 3.7897411067925894e-05, "loss": 0.7598, "step": 5995 }, { "epoch": 7.421150278293135, "grad_norm": 0.34765625, "learning_rate": 3.772825265187802e-05, "loss": 0.7624, "step": 6000 }, { "epoch": 7.427334570191713, "grad_norm": 0.353515625, "learning_rate": 3.755938478717968e-05, "loss": 0.7614, "step": 6005 }, { "epoch": 7.433518862090291, "grad_norm": 0.3515625, "learning_rate": 3.739080826174498e-05, "loss": 0.7665, "step": 6010 }, { "epoch": 7.4397031539888685, "grad_norm": 0.345703125, "learning_rate": 3.722252386212862e-05, "loss": 0.7694, "step": 6015 }, { "epoch": 7.445887445887446, "grad_norm": 0.33984375, "learning_rate": 3.705453237352227e-05, "loss": 0.7687, "step": 6020 }, { "epoch": 7.4520717377860235, "grad_norm": 0.341796875, "learning_rate": 3.688683457975103e-05, "loss": 0.7684, "step": 6025 }, { "epoch": 7.458256029684601, "grad_norm": 0.3359375, "learning_rate": 3.6719431263269533e-05, "loss": 0.7564, "step": 6030 }, { "epoch": 7.4644403215831785, "grad_norm": 0.3515625, "learning_rate": 3.655232320515844e-05, "loss": 0.7689, "step": 6035 }, { "epoch": 7.470624613481756, "grad_norm": 0.341796875, "learning_rate": 3.638551118512089e-05, "loss": 0.7683, "step": 6040 }, { "epoch": 7.476808905380334, "grad_norm": 0.359375, "learning_rate": 3.621899598147863e-05, "loss": 0.7605, "step": 6045 }, { "epoch": 7.482993197278912, "grad_norm": 0.33984375, "learning_rate": 3.605277837116854e-05, "loss": 0.7638, "step": 6050 }, { "epoch": 7.489177489177489, "grad_norm": 0.34375, "learning_rate": 3.588685912973896e-05, "loss": 0.7569, "step": 6055 }, { "epoch": 7.495361781076067, "grad_norm": 0.34765625, "learning_rate": 3.5721239031346066e-05, "loss": 0.7588, "step": 6060 }, { "epoch": 7.501546072974644, "grad_norm": 0.357421875, "learning_rate": 3.555591884875038e-05, "loss": 0.7602, "step": 6065 }, { "epoch": 7.507730364873222, "grad_norm": 0.34765625, "learning_rate": 3.539089935331294e-05, "loss": 0.7616, "step": 6070 }, { "epoch": 7.513914656771799, "grad_norm": 0.345703125, "learning_rate": 3.52261813149918e-05, "loss": 0.764, "step": 6075 }, { "epoch": 7.520098948670377, "grad_norm": 0.333984375, "learning_rate": 3.506176550233863e-05, "loss": 0.7588, "step": 6080 }, { "epoch": 7.526283240568954, "grad_norm": 0.3515625, "learning_rate": 3.4897652682494776e-05, "loss": 0.7579, "step": 6085 }, { "epoch": 7.532467532467533, "grad_norm": 0.3515625, "learning_rate": 3.473384362118794e-05, "loss": 0.7637, "step": 6090 }, { "epoch": 7.53865182436611, "grad_norm": 0.345703125, "learning_rate": 3.457033908272852e-05, "loss": 0.7657, "step": 6095 }, { "epoch": 7.544836116264688, "grad_norm": 0.34765625, "learning_rate": 3.440713983000601e-05, "loss": 0.7603, "step": 6100 }, { "epoch": 7.551020408163265, "grad_norm": 0.341796875, "learning_rate": 3.424424662448559e-05, "loss": 0.7621, "step": 6105 }, { "epoch": 7.557204700061843, "grad_norm": 0.349609375, "learning_rate": 3.4081660226204357e-05, "loss": 0.7672, "step": 6110 }, { "epoch": 7.56338899196042, "grad_norm": 0.3359375, "learning_rate": 3.3919381393767925e-05, "loss": 0.7577, "step": 6115 }, { "epoch": 7.569573283858999, "grad_norm": 0.337890625, "learning_rate": 3.3757410884346894e-05, "loss": 0.7561, "step": 6120 }, { "epoch": 7.575757575757576, "grad_norm": 0.34765625, "learning_rate": 3.3595749453673206e-05, "loss": 0.765, "step": 6125 }, { "epoch": 7.581941867656154, "grad_norm": 0.359375, "learning_rate": 3.34343978560367e-05, "loss": 0.7675, "step": 6130 }, { "epoch": 7.588126159554731, "grad_norm": 0.353515625, "learning_rate": 3.3273356844281613e-05, "loss": 0.7637, "step": 6135 }, { "epoch": 7.594310451453309, "grad_norm": 0.357421875, "learning_rate": 3.3112627169802946e-05, "loss": 0.767, "step": 6140 }, { "epoch": 7.600494743351886, "grad_norm": 0.3671875, "learning_rate": 3.295220958254317e-05, "loss": 0.7645, "step": 6145 }, { "epoch": 7.606679035250464, "grad_norm": 0.341796875, "learning_rate": 3.2792104830988515e-05, "loss": 0.7654, "step": 6150 }, { "epoch": 7.612863327149041, "grad_norm": 0.3515625, "learning_rate": 3.2632313662165525e-05, "loss": 0.7671, "step": 6155 }, { "epoch": 7.619047619047619, "grad_norm": 0.341796875, "learning_rate": 3.2472836821637744e-05, "loss": 0.7625, "step": 6160 }, { "epoch": 7.625231910946197, "grad_norm": 0.34375, "learning_rate": 3.231367505350199e-05, "loss": 0.7662, "step": 6165 }, { "epoch": 7.6314162028447745, "grad_norm": 0.3515625, "learning_rate": 3.2154829100385e-05, "loss": 0.7527, "step": 6170 }, { "epoch": 7.637600494743352, "grad_norm": 0.337890625, "learning_rate": 3.1996299703440095e-05, "loss": 0.7665, "step": 6175 }, { "epoch": 7.6437847866419295, "grad_norm": 0.33984375, "learning_rate": 3.1838087602343344e-05, "loss": 0.7641, "step": 6180 }, { "epoch": 7.649969078540507, "grad_norm": 0.333984375, "learning_rate": 3.1680193535290626e-05, "loss": 0.7638, "step": 6185 }, { "epoch": 7.6561533704390845, "grad_norm": 0.345703125, "learning_rate": 3.1522618238993725e-05, "loss": 0.7636, "step": 6190 }, { "epoch": 7.662337662337662, "grad_norm": 0.337890625, "learning_rate": 3.1365362448677146e-05, "loss": 0.7629, "step": 6195 }, { "epoch": 7.66852195423624, "grad_norm": 0.345703125, "learning_rate": 3.120842689807468e-05, "loss": 0.7709, "step": 6200 }, { "epoch": 7.674706246134818, "grad_norm": 0.353515625, "learning_rate": 3.105181231942584e-05, "loss": 0.7639, "step": 6205 }, { "epoch": 7.680890538033395, "grad_norm": 0.3359375, "learning_rate": 3.089551944347255e-05, "loss": 0.7675, "step": 6210 }, { "epoch": 7.687074829931973, "grad_norm": 0.341796875, "learning_rate": 3.0739548999455805e-05, "loss": 0.7611, "step": 6215 }, { "epoch": 7.69325912183055, "grad_norm": 0.349609375, "learning_rate": 3.058390171511196e-05, "loss": 0.7589, "step": 6220 }, { "epoch": 7.699443413729128, "grad_norm": 0.353515625, "learning_rate": 3.0428578316669798e-05, "loss": 0.7629, "step": 6225 }, { "epoch": 7.705627705627705, "grad_norm": 0.34765625, "learning_rate": 3.0273579528846762e-05, "loss": 0.7581, "step": 6230 }, { "epoch": 7.711811997526283, "grad_norm": 0.34375, "learning_rate": 3.0118906074845678e-05, "loss": 0.761, "step": 6235 }, { "epoch": 7.71799628942486, "grad_norm": 0.34765625, "learning_rate": 2.996455867635155e-05, "loss": 0.7692, "step": 6240 }, { "epoch": 7.724180581323439, "grad_norm": 0.345703125, "learning_rate": 2.9810538053527914e-05, "loss": 0.7696, "step": 6245 }, { "epoch": 7.730364873222016, "grad_norm": 0.3515625, "learning_rate": 2.9656844925013637e-05, "loss": 0.7705, "step": 6250 }, { "epoch": 7.736549165120594, "grad_norm": 0.353515625, "learning_rate": 2.9503480007919648e-05, "loss": 0.7645, "step": 6255 }, { "epoch": 7.742733457019171, "grad_norm": 0.345703125, "learning_rate": 2.9350444017825385e-05, "loss": 0.7537, "step": 6260 }, { "epoch": 7.748917748917749, "grad_norm": 0.3515625, "learning_rate": 2.919773766877556e-05, "loss": 0.7634, "step": 6265 }, { "epoch": 7.755102040816326, "grad_norm": 0.349609375, "learning_rate": 2.9045361673276872e-05, "loss": 0.7648, "step": 6270 }, { "epoch": 7.761286332714904, "grad_norm": 0.349609375, "learning_rate": 2.8893316742294562e-05, "loss": 0.7568, "step": 6275 }, { "epoch": 7.767470624613482, "grad_norm": 0.341796875, "learning_rate": 2.874160358524931e-05, "loss": 0.7604, "step": 6280 }, { "epoch": 7.77365491651206, "grad_norm": 0.345703125, "learning_rate": 2.8590222910013655e-05, "loss": 0.7556, "step": 6285 }, { "epoch": 7.779839208410637, "grad_norm": 0.35546875, "learning_rate": 2.8439175422908824e-05, "loss": 0.7591, "step": 6290 }, { "epoch": 7.786023500309215, "grad_norm": 0.33984375, "learning_rate": 2.828846182870155e-05, "loss": 0.7548, "step": 6295 }, { "epoch": 7.792207792207792, "grad_norm": 0.34375, "learning_rate": 2.8138082830600554e-05, "loss": 0.7589, "step": 6300 }, { "epoch": 7.79839208410637, "grad_norm": 0.337890625, "learning_rate": 2.798803913025343e-05, "loss": 0.7604, "step": 6305 }, { "epoch": 7.804576376004947, "grad_norm": 0.337890625, "learning_rate": 2.7838331427743282e-05, "loss": 0.7524, "step": 6310 }, { "epoch": 7.810760667903525, "grad_norm": 0.3359375, "learning_rate": 2.76889604215855e-05, "loss": 0.7664, "step": 6315 }, { "epoch": 7.816944959802103, "grad_norm": 0.349609375, "learning_rate": 2.753992680872457e-05, "loss": 0.7627, "step": 6320 }, { "epoch": 7.8231292517006805, "grad_norm": 0.3359375, "learning_rate": 2.739123128453066e-05, "loss": 0.7592, "step": 6325 }, { "epoch": 7.829313543599258, "grad_norm": 0.357421875, "learning_rate": 2.7242874542796482e-05, "loss": 0.7581, "step": 6330 }, { "epoch": 7.8354978354978355, "grad_norm": 0.3359375, "learning_rate": 2.7094857275734076e-05, "loss": 0.7632, "step": 6335 }, { "epoch": 7.841682127396413, "grad_norm": 0.345703125, "learning_rate": 2.6947180173971508e-05, "loss": 0.7633, "step": 6340 }, { "epoch": 7.8478664192949905, "grad_norm": 0.361328125, "learning_rate": 2.6799843926549685e-05, "loss": 0.7628, "step": 6345 }, { "epoch": 7.854050711193568, "grad_norm": 0.3359375, "learning_rate": 2.665284922091912e-05, "loss": 0.7538, "step": 6350 }, { "epoch": 7.860235003092146, "grad_norm": 0.34375, "learning_rate": 2.6506196742936717e-05, "loss": 0.7672, "step": 6355 }, { "epoch": 7.866419294990724, "grad_norm": 0.33984375, "learning_rate": 2.6359887176862718e-05, "loss": 0.7593, "step": 6360 }, { "epoch": 7.872603586889301, "grad_norm": 0.349609375, "learning_rate": 2.621392120535724e-05, "loss": 0.7619, "step": 6365 }, { "epoch": 7.878787878787879, "grad_norm": 0.35546875, "learning_rate": 2.6068299509477266e-05, "loss": 0.7687, "step": 6370 }, { "epoch": 7.884972170686456, "grad_norm": 0.34375, "learning_rate": 2.5923022768673532e-05, "loss": 0.7602, "step": 6375 }, { "epoch": 7.891156462585034, "grad_norm": 0.341796875, "learning_rate": 2.577809166078716e-05, "loss": 0.7664, "step": 6380 }, { "epoch": 7.897340754483611, "grad_norm": 0.34375, "learning_rate": 2.5633506862046607e-05, "loss": 0.7607, "step": 6385 }, { "epoch": 7.903525046382189, "grad_norm": 0.3515625, "learning_rate": 2.548926904706459e-05, "loss": 0.7651, "step": 6390 }, { "epoch": 7.909709338280766, "grad_norm": 0.349609375, "learning_rate": 2.5345378888834714e-05, "loss": 0.7706, "step": 6395 }, { "epoch": 7.915893630179345, "grad_norm": 0.333984375, "learning_rate": 2.5201837058728505e-05, "loss": 0.757, "step": 6400 }, { "epoch": 7.922077922077922, "grad_norm": 0.3515625, "learning_rate": 2.5058644226492346e-05, "loss": 0.7556, "step": 6405 }, { "epoch": 7.9282622139765, "grad_norm": 0.337890625, "learning_rate": 2.4915801060244092e-05, "loss": 0.7614, "step": 6410 }, { "epoch": 7.934446505875077, "grad_norm": 0.337890625, "learning_rate": 2.4773308226470238e-05, "loss": 0.7577, "step": 6415 }, { "epoch": 7.940630797773655, "grad_norm": 0.345703125, "learning_rate": 2.4631166390022574e-05, "loss": 0.7622, "step": 6420 }, { "epoch": 7.946815089672232, "grad_norm": 0.34765625, "learning_rate": 2.4489376214115212e-05, "loss": 0.7594, "step": 6425 }, { "epoch": 7.95299938157081, "grad_norm": 0.33984375, "learning_rate": 2.4347938360321566e-05, "loss": 0.7637, "step": 6430 }, { "epoch": 7.959183673469388, "grad_norm": 0.3515625, "learning_rate": 2.4206853488570957e-05, "loss": 0.7606, "step": 6435 }, { "epoch": 7.965367965367966, "grad_norm": 0.34765625, "learning_rate": 2.4066122257145894e-05, "loss": 0.7591, "step": 6440 }, { "epoch": 7.971552257266543, "grad_norm": 0.330078125, "learning_rate": 2.392574532267886e-05, "loss": 0.7565, "step": 6445 }, { "epoch": 7.977736549165121, "grad_norm": 0.341796875, "learning_rate": 2.3785723340149134e-05, "loss": 0.7602, "step": 6450 }, { "epoch": 7.983920841063698, "grad_norm": 0.3515625, "learning_rate": 2.3646056962879946e-05, "loss": 0.765, "step": 6455 }, { "epoch": 7.990105132962276, "grad_norm": 0.349609375, "learning_rate": 2.3506746842535242e-05, "loss": 0.7504, "step": 6460 }, { "epoch": 7.996289424860853, "grad_norm": 0.337890625, "learning_rate": 2.336779362911674e-05, "loss": 0.7605, "step": 6465 }, { "epoch": 8.0, "eval_loss": 2.6913065910339355, "eval_runtime": 0.5419, "eval_samples_per_second": 18.454, "eval_steps_per_second": 1.845, "step": 6468 }, { "epoch": 8.00247371675943, "grad_norm": 0.345703125, "learning_rate": 2.3229197970960924e-05, "loss": 0.7614, "step": 6470 }, { "epoch": 8.008658008658008, "grad_norm": 0.3359375, "learning_rate": 2.309096051473597e-05, "loss": 0.7476, "step": 6475 }, { "epoch": 8.014842300556586, "grad_norm": 0.3515625, "learning_rate": 2.295308190543859e-05, "loss": 0.7644, "step": 6480 }, { "epoch": 8.021026592455163, "grad_norm": 0.337890625, "learning_rate": 2.2815562786391387e-05, "loss": 0.7602, "step": 6485 }, { "epoch": 8.02721088435374, "grad_norm": 0.3359375, "learning_rate": 2.26784037992395e-05, "loss": 0.7602, "step": 6490 }, { "epoch": 8.03339517625232, "grad_norm": 0.361328125, "learning_rate": 2.2541605583947724e-05, "loss": 0.7652, "step": 6495 }, { "epoch": 8.039579468150897, "grad_norm": 0.33984375, "learning_rate": 2.2405168778797646e-05, "loss": 0.7472, "step": 6500 }, { "epoch": 8.045763760049475, "grad_norm": 0.359375, "learning_rate": 2.226909402038446e-05, "loss": 0.7592, "step": 6505 }, { "epoch": 8.051948051948052, "grad_norm": 0.353515625, "learning_rate": 2.2133381943614207e-05, "loss": 0.7577, "step": 6510 }, { "epoch": 8.05813234384663, "grad_norm": 0.34765625, "learning_rate": 2.1998033181700617e-05, "loss": 0.7583, "step": 6515 }, { "epoch": 8.064316635745207, "grad_norm": 0.33984375, "learning_rate": 2.1863048366162208e-05, "loss": 0.7609, "step": 6520 }, { "epoch": 8.070500927643785, "grad_norm": 0.337890625, "learning_rate": 2.17284281268195e-05, "loss": 0.7582, "step": 6525 }, { "epoch": 8.076685219542362, "grad_norm": 0.35546875, "learning_rate": 2.159417309179189e-05, "loss": 0.7651, "step": 6530 }, { "epoch": 8.08286951144094, "grad_norm": 0.353515625, "learning_rate": 2.1460283887494724e-05, "loss": 0.76, "step": 6535 }, { "epoch": 8.089053803339517, "grad_norm": 0.34765625, "learning_rate": 2.1326761138636553e-05, "loss": 0.7578, "step": 6540 }, { "epoch": 8.095238095238095, "grad_norm": 0.341796875, "learning_rate": 2.1193605468216005e-05, "loss": 0.7515, "step": 6545 }, { "epoch": 8.101422387136672, "grad_norm": 0.345703125, "learning_rate": 2.106081749751897e-05, "loss": 0.7533, "step": 6550 }, { "epoch": 8.10760667903525, "grad_norm": 0.33984375, "learning_rate": 2.092839784611579e-05, "loss": 0.7621, "step": 6555 }, { "epoch": 8.113790970933827, "grad_norm": 0.34375, "learning_rate": 2.0796347131858186e-05, "loss": 0.7519, "step": 6560 }, { "epoch": 8.119975262832405, "grad_norm": 0.337890625, "learning_rate": 2.0664665970876496e-05, "loss": 0.7582, "step": 6565 }, { "epoch": 8.126159554730984, "grad_norm": 0.337890625, "learning_rate": 2.05333549775768e-05, "loss": 0.7682, "step": 6570 }, { "epoch": 8.132343846629562, "grad_norm": 0.341796875, "learning_rate": 2.0402414764637978e-05, "loss": 0.7644, "step": 6575 }, { "epoch": 8.13852813852814, "grad_norm": 0.35546875, "learning_rate": 2.027184594300898e-05, "loss": 0.7534, "step": 6580 }, { "epoch": 8.144712430426717, "grad_norm": 0.33984375, "learning_rate": 2.0141649121905827e-05, "loss": 0.7579, "step": 6585 }, { "epoch": 8.150896722325294, "grad_norm": 0.345703125, "learning_rate": 2.0011824908808808e-05, "loss": 0.7619, "step": 6590 }, { "epoch": 8.157081014223872, "grad_norm": 0.34765625, "learning_rate": 1.9882373909459795e-05, "loss": 0.7595, "step": 6595 }, { "epoch": 8.16326530612245, "grad_norm": 0.33984375, "learning_rate": 1.9753296727859195e-05, "loss": 0.7598, "step": 6600 }, { "epoch": 8.169449598021027, "grad_norm": 0.337890625, "learning_rate": 1.962459396626326e-05, "loss": 0.759, "step": 6605 }, { "epoch": 8.175633889919604, "grad_norm": 0.341796875, "learning_rate": 1.9496266225181248e-05, "loss": 0.7556, "step": 6610 }, { "epoch": 8.181818181818182, "grad_norm": 0.341796875, "learning_rate": 1.936831410337261e-05, "loss": 0.7477, "step": 6615 }, { "epoch": 8.18800247371676, "grad_norm": 0.34375, "learning_rate": 1.9240738197844278e-05, "loss": 0.7617, "step": 6620 }, { "epoch": 8.194186765615337, "grad_norm": 0.345703125, "learning_rate": 1.9113539103847723e-05, "loss": 0.7535, "step": 6625 }, { "epoch": 8.200371057513914, "grad_norm": 0.341796875, "learning_rate": 1.89867174148763e-05, "loss": 0.7577, "step": 6630 }, { "epoch": 8.206555349412492, "grad_norm": 0.33984375, "learning_rate": 1.886027372266247e-05, "loss": 0.7597, "step": 6635 }, { "epoch": 8.21273964131107, "grad_norm": 0.337890625, "learning_rate": 1.8734208617174988e-05, "loss": 0.7588, "step": 6640 }, { "epoch": 8.218923933209647, "grad_norm": 0.333984375, "learning_rate": 1.860852268661616e-05, "loss": 0.7583, "step": 6645 }, { "epoch": 8.225108225108226, "grad_norm": 0.357421875, "learning_rate": 1.8483216517419142e-05, "loss": 0.7556, "step": 6650 }, { "epoch": 8.231292517006803, "grad_norm": 0.34765625, "learning_rate": 1.835829069424515e-05, "loss": 0.7524, "step": 6655 }, { "epoch": 8.237476808905381, "grad_norm": 0.34375, "learning_rate": 1.8233745799980817e-05, "loss": 0.7563, "step": 6660 }, { "epoch": 8.243661100803958, "grad_norm": 0.345703125, "learning_rate": 1.810958241573535e-05, "loss": 0.747, "step": 6665 }, { "epoch": 8.249845392702536, "grad_norm": 0.34765625, "learning_rate": 1.7985801120837865e-05, "loss": 0.7622, "step": 6670 }, { "epoch": 8.256029684601113, "grad_norm": 0.33984375, "learning_rate": 1.7862402492834806e-05, "loss": 0.76, "step": 6675 }, { "epoch": 8.262213976499691, "grad_norm": 0.349609375, "learning_rate": 1.773938710748706e-05, "loss": 0.7635, "step": 6680 }, { "epoch": 8.268398268398268, "grad_norm": 0.345703125, "learning_rate": 1.761675553876736e-05, "loss": 0.7445, "step": 6685 }, { "epoch": 8.274582560296846, "grad_norm": 0.34375, "learning_rate": 1.7494508358857677e-05, "loss": 0.7611, "step": 6690 }, { "epoch": 8.280766852195423, "grad_norm": 0.3359375, "learning_rate": 1.737264613814633e-05, "loss": 0.753, "step": 6695 }, { "epoch": 8.286951144094001, "grad_norm": 0.34375, "learning_rate": 1.7251169445225657e-05, "loss": 0.7568, "step": 6700 }, { "epoch": 8.293135435992578, "grad_norm": 0.337890625, "learning_rate": 1.713007884688904e-05, "loss": 0.7483, "step": 6705 }, { "epoch": 8.299319727891156, "grad_norm": 0.341796875, "learning_rate": 1.700937490812844e-05, "loss": 0.7579, "step": 6710 }, { "epoch": 8.305504019789733, "grad_norm": 0.341796875, "learning_rate": 1.6889058192131734e-05, "loss": 0.7635, "step": 6715 }, { "epoch": 8.311688311688311, "grad_norm": 0.34765625, "learning_rate": 1.676912926028007e-05, "loss": 0.7617, "step": 6720 }, { "epoch": 8.317872603586888, "grad_norm": 0.349609375, "learning_rate": 1.664958867214519e-05, "loss": 0.7611, "step": 6725 }, { "epoch": 8.324056895485468, "grad_norm": 0.345703125, "learning_rate": 1.6530436985486996e-05, "loss": 0.7589, "step": 6730 }, { "epoch": 8.330241187384045, "grad_norm": 0.341796875, "learning_rate": 1.6411674756250663e-05, "loss": 0.7638, "step": 6735 }, { "epoch": 8.336425479282623, "grad_norm": 0.34765625, "learning_rate": 1.6293302538564382e-05, "loss": 0.7526, "step": 6740 }, { "epoch": 8.3426097711812, "grad_norm": 0.365234375, "learning_rate": 1.617532088473651e-05, "loss": 0.7587, "step": 6745 }, { "epoch": 8.348794063079778, "grad_norm": 0.35546875, "learning_rate": 1.6057730345253065e-05, "loss": 0.7523, "step": 6750 }, { "epoch": 8.354978354978355, "grad_norm": 0.349609375, "learning_rate": 1.594053146877529e-05, "loss": 0.7557, "step": 6755 }, { "epoch": 8.361162646876933, "grad_norm": 0.341796875, "learning_rate": 1.5823724802136865e-05, "loss": 0.7588, "step": 6760 }, { "epoch": 8.36734693877551, "grad_norm": 0.349609375, "learning_rate": 1.570731089034151e-05, "loss": 0.7627, "step": 6765 }, { "epoch": 8.373531230674088, "grad_norm": 0.345703125, "learning_rate": 1.5591290276560466e-05, "loss": 0.7688, "step": 6770 }, { "epoch": 8.379715522572665, "grad_norm": 0.345703125, "learning_rate": 1.5475663502129822e-05, "loss": 0.7577, "step": 6775 }, { "epoch": 8.385899814471243, "grad_norm": 0.33984375, "learning_rate": 1.536043110654809e-05, "loss": 0.755, "step": 6780 }, { "epoch": 8.39208410636982, "grad_norm": 0.34765625, "learning_rate": 1.5245593627473675e-05, "loss": 0.7617, "step": 6785 }, { "epoch": 8.398268398268398, "grad_norm": 0.33984375, "learning_rate": 1.5131151600722337e-05, "loss": 0.7438, "step": 6790 }, { "epoch": 8.404452690166975, "grad_norm": 0.33984375, "learning_rate": 1.5017105560264755e-05, "loss": 0.7559, "step": 6795 }, { "epoch": 8.410636982065553, "grad_norm": 0.34765625, "learning_rate": 1.4903456038223939e-05, "loss": 0.7504, "step": 6800 }, { "epoch": 8.416821273964132, "grad_norm": 0.3359375, "learning_rate": 1.4790203564872818e-05, "loss": 0.7544, "step": 6805 }, { "epoch": 8.42300556586271, "grad_norm": 0.341796875, "learning_rate": 1.4677348668631763e-05, "loss": 0.7612, "step": 6810 }, { "epoch": 8.429189857761287, "grad_norm": 0.35546875, "learning_rate": 1.45648918760661e-05, "loss": 0.7613, "step": 6815 }, { "epoch": 8.435374149659864, "grad_norm": 0.34375, "learning_rate": 1.4452833711883628e-05, "loss": 0.7519, "step": 6820 }, { "epoch": 8.441558441558442, "grad_norm": 0.3359375, "learning_rate": 1.4341174698932224e-05, "loss": 0.7503, "step": 6825 }, { "epoch": 8.44774273345702, "grad_norm": 0.34375, "learning_rate": 1.4229915358197377e-05, "loss": 0.7562, "step": 6830 }, { "epoch": 8.453927025355597, "grad_norm": 0.33984375, "learning_rate": 1.4119056208799786e-05, "loss": 0.7516, "step": 6835 }, { "epoch": 8.460111317254174, "grad_norm": 0.33984375, "learning_rate": 1.4008597767992871e-05, "loss": 0.7588, "step": 6840 }, { "epoch": 8.466295609152752, "grad_norm": 0.3515625, "learning_rate": 1.389854055116041e-05, "loss": 0.7607, "step": 6845 }, { "epoch": 8.47247990105133, "grad_norm": 0.34375, "learning_rate": 1.3788885071814172e-05, "loss": 0.757, "step": 6850 }, { "epoch": 8.478664192949907, "grad_norm": 0.34375, "learning_rate": 1.3679631841591411e-05, "loss": 0.7535, "step": 6855 }, { "epoch": 8.484848484848484, "grad_norm": 0.34375, "learning_rate": 1.3570781370252582e-05, "loss": 0.7512, "step": 6860 }, { "epoch": 8.491032776747062, "grad_norm": 0.34765625, "learning_rate": 1.3462334165678902e-05, "loss": 0.7506, "step": 6865 }, { "epoch": 8.49721706864564, "grad_norm": 0.36328125, "learning_rate": 1.3354290733869979e-05, "loss": 0.7614, "step": 6870 }, { "epoch": 8.503401360544217, "grad_norm": 0.34765625, "learning_rate": 1.3246651578941572e-05, "loss": 0.7618, "step": 6875 }, { "epoch": 8.509585652442794, "grad_norm": 0.3359375, "learning_rate": 1.3139417203123027e-05, "loss": 0.7581, "step": 6880 }, { "epoch": 8.515769944341374, "grad_norm": 0.3359375, "learning_rate": 1.3032588106755084e-05, "loss": 0.7652, "step": 6885 }, { "epoch": 8.521954236239951, "grad_norm": 0.341796875, "learning_rate": 1.2926164788287543e-05, "loss": 0.7585, "step": 6890 }, { "epoch": 8.528138528138529, "grad_norm": 0.34375, "learning_rate": 1.2820147744276866e-05, "loss": 0.7428, "step": 6895 }, { "epoch": 8.534322820037106, "grad_norm": 0.337890625, "learning_rate": 1.2714537469383858e-05, "loss": 0.7608, "step": 6900 }, { "epoch": 8.540507111935684, "grad_norm": 0.33984375, "learning_rate": 1.2609334456371514e-05, "loss": 0.7555, "step": 6905 }, { "epoch": 8.546691403834261, "grad_norm": 0.337890625, "learning_rate": 1.2504539196102439e-05, "loss": 0.753, "step": 6910 }, { "epoch": 8.552875695732839, "grad_norm": 0.337890625, "learning_rate": 1.240015217753685e-05, "loss": 0.7556, "step": 6915 }, { "epoch": 8.559059987631416, "grad_norm": 0.341796875, "learning_rate": 1.2296173887730123e-05, "loss": 0.755, "step": 6920 }, { "epoch": 8.565244279529994, "grad_norm": 0.337890625, "learning_rate": 1.2192604811830532e-05, "loss": 0.7499, "step": 6925 }, { "epoch": 8.571428571428571, "grad_norm": 0.337890625, "learning_rate": 1.2089445433077073e-05, "loss": 0.7544, "step": 6930 }, { "epoch": 8.577612863327149, "grad_norm": 0.34375, "learning_rate": 1.1986696232797101e-05, "loss": 0.7542, "step": 6935 }, { "epoch": 8.583797155225726, "grad_norm": 0.33984375, "learning_rate": 1.1884357690404158e-05, "loss": 0.7524, "step": 6940 }, { "epoch": 8.589981447124304, "grad_norm": 0.34765625, "learning_rate": 1.178243028339574e-05, "loss": 0.7568, "step": 6945 }, { "epoch": 8.596165739022881, "grad_norm": 0.345703125, "learning_rate": 1.1680914487350959e-05, "loss": 0.7597, "step": 6950 }, { "epoch": 8.602350030921459, "grad_norm": 0.337890625, "learning_rate": 1.1579810775928502e-05, "loss": 0.7645, "step": 6955 }, { "epoch": 8.608534322820038, "grad_norm": 0.349609375, "learning_rate": 1.1479119620864276e-05, "loss": 0.7513, "step": 6960 }, { "epoch": 8.614718614718615, "grad_norm": 0.3359375, "learning_rate": 1.1378841491969239e-05, "loss": 0.7533, "step": 6965 }, { "epoch": 8.620902906617193, "grad_norm": 0.333984375, "learning_rate": 1.1278976857127311e-05, "loss": 0.7549, "step": 6970 }, { "epoch": 8.62708719851577, "grad_norm": 0.33984375, "learning_rate": 1.117952618229301e-05, "loss": 0.7588, "step": 6975 }, { "epoch": 8.633271490414348, "grad_norm": 0.333984375, "learning_rate": 1.1080489931489391e-05, "loss": 0.7599, "step": 6980 }, { "epoch": 8.639455782312925, "grad_norm": 0.341796875, "learning_rate": 1.0981868566805942e-05, "loss": 0.765, "step": 6985 }, { "epoch": 8.645640074211503, "grad_norm": 0.349609375, "learning_rate": 1.0883662548396257e-05, "loss": 0.757, "step": 6990 }, { "epoch": 8.65182436611008, "grad_norm": 0.34765625, "learning_rate": 1.0785872334476033e-05, "loss": 0.7657, "step": 6995 }, { "epoch": 8.658008658008658, "grad_norm": 0.337890625, "learning_rate": 1.0688498381320855e-05, "loss": 0.7525, "step": 7000 }, { "epoch": 8.664192949907235, "grad_norm": 0.3515625, "learning_rate": 1.0591541143264084e-05, "loss": 0.7522, "step": 7005 }, { "epoch": 8.670377241805813, "grad_norm": 0.34765625, "learning_rate": 1.049500107269481e-05, "loss": 0.7616, "step": 7010 }, { "epoch": 8.67656153370439, "grad_norm": 0.3515625, "learning_rate": 1.0398878620055618e-05, "loss": 0.7494, "step": 7015 }, { "epoch": 8.682745825602968, "grad_norm": 0.349609375, "learning_rate": 1.0303174233840528e-05, "loss": 0.7502, "step": 7020 }, { "epoch": 8.688930117501545, "grad_norm": 0.34765625, "learning_rate": 1.0207888360592998e-05, "loss": 0.7725, "step": 7025 }, { "epoch": 8.695114409400123, "grad_norm": 0.34765625, "learning_rate": 1.0113021444903726e-05, "loss": 0.7546, "step": 7030 }, { "epoch": 8.7012987012987, "grad_norm": 0.349609375, "learning_rate": 1.0018573929408526e-05, "loss": 0.746, "step": 7035 }, { "epoch": 8.70748299319728, "grad_norm": 0.34375, "learning_rate": 9.924546254786493e-06, "loss": 0.7605, "step": 7040 }, { "epoch": 8.713667285095857, "grad_norm": 0.3359375, "learning_rate": 9.83093885975771e-06, "loss": 0.7406, "step": 7045 }, { "epoch": 8.719851576994435, "grad_norm": 0.341796875, "learning_rate": 9.737752181081338e-06, "loss": 0.7543, "step": 7050 }, { "epoch": 8.726035868893012, "grad_norm": 0.361328125, "learning_rate": 9.644986653553512e-06, "loss": 0.7606, "step": 7055 }, { "epoch": 8.73222016079159, "grad_norm": 0.345703125, "learning_rate": 9.552642710005299e-06, "loss": 0.7609, "step": 7060 }, { "epoch": 8.738404452690167, "grad_norm": 0.337890625, "learning_rate": 9.460720781300814e-06, "loss": 0.7606, "step": 7065 }, { "epoch": 8.744588744588745, "grad_norm": 0.345703125, "learning_rate": 9.369221296335006e-06, "loss": 0.7521, "step": 7070 }, { "epoch": 8.750773036487322, "grad_norm": 0.33984375, "learning_rate": 9.278144682031809e-06, "loss": 0.7593, "step": 7075 }, { "epoch": 8.7569573283859, "grad_norm": 0.333984375, "learning_rate": 9.187491363342093e-06, "loss": 0.7523, "step": 7080 }, { "epoch": 8.763141620284477, "grad_norm": 0.353515625, "learning_rate": 9.097261763241694e-06, "loss": 0.7608, "step": 7085 }, { "epoch": 8.769325912183055, "grad_norm": 0.345703125, "learning_rate": 9.0074563027294e-06, "loss": 0.7652, "step": 7090 }, { "epoch": 8.775510204081632, "grad_norm": 0.359375, "learning_rate": 8.918075400825098e-06, "loss": 0.7634, "step": 7095 }, { "epoch": 8.78169449598021, "grad_norm": 0.34765625, "learning_rate": 8.829119474567671e-06, "loss": 0.7596, "step": 7100 }, { "epoch": 8.787878787878787, "grad_norm": 0.34375, "learning_rate": 8.740588939013173e-06, "loss": 0.7522, "step": 7105 }, { "epoch": 8.794063079777365, "grad_norm": 0.333984375, "learning_rate": 8.652484207232803e-06, "loss": 0.7516, "step": 7110 }, { "epoch": 8.800247371675944, "grad_norm": 0.33984375, "learning_rate": 8.564805690311029e-06, "loss": 0.7628, "step": 7115 }, { "epoch": 8.806431663574521, "grad_norm": 0.34765625, "learning_rate": 8.47755379734373e-06, "loss": 0.7541, "step": 7120 }, { "epoch": 8.812615955473099, "grad_norm": 0.359375, "learning_rate": 8.390728935436088e-06, "loss": 0.7502, "step": 7125 }, { "epoch": 8.818800247371676, "grad_norm": 0.345703125, "learning_rate": 8.304331509700891e-06, "loss": 0.7605, "step": 7130 }, { "epoch": 8.824984539270254, "grad_norm": 0.345703125, "learning_rate": 8.218361923256601e-06, "loss": 0.7572, "step": 7135 }, { "epoch": 8.831168831168831, "grad_norm": 0.36328125, "learning_rate": 8.132820577225387e-06, "loss": 0.7471, "step": 7140 }, { "epoch": 8.837353123067409, "grad_norm": 0.341796875, "learning_rate": 8.047707870731291e-06, "loss": 0.7698, "step": 7145 }, { "epoch": 8.843537414965986, "grad_norm": 0.34375, "learning_rate": 7.963024200898462e-06, "loss": 0.755, "step": 7150 }, { "epoch": 8.849721706864564, "grad_norm": 0.341796875, "learning_rate": 7.878769962849141e-06, "loss": 0.7533, "step": 7155 }, { "epoch": 8.855905998763141, "grad_norm": 0.345703125, "learning_rate": 7.794945549701993e-06, "loss": 0.7536, "step": 7160 }, { "epoch": 8.862090290661719, "grad_norm": 0.337890625, "learning_rate": 7.711551352570056e-06, "loss": 0.7605, "step": 7165 }, { "epoch": 8.868274582560296, "grad_norm": 0.34765625, "learning_rate": 7.6285877605591135e-06, "loss": 0.7442, "step": 7170 }, { "epoch": 8.874458874458874, "grad_norm": 0.33984375, "learning_rate": 7.546055160765819e-06, "loss": 0.7463, "step": 7175 }, { "epoch": 8.880643166357451, "grad_norm": 0.3359375, "learning_rate": 7.463953938275858e-06, "loss": 0.7583, "step": 7180 }, { "epoch": 8.886827458256029, "grad_norm": 0.333984375, "learning_rate": 7.382284476162127e-06, "loss": 0.758, "step": 7185 }, { "epoch": 8.893011750154606, "grad_norm": 0.3359375, "learning_rate": 7.3010471554830766e-06, "loss": 0.7482, "step": 7190 }, { "epoch": 8.899196042053186, "grad_norm": 0.357421875, "learning_rate": 7.220242355280771e-06, "loss": 0.7571, "step": 7195 }, { "epoch": 8.905380333951763, "grad_norm": 0.357421875, "learning_rate": 7.1398704525792e-06, "loss": 0.7541, "step": 7200 }, { "epoch": 8.91156462585034, "grad_norm": 0.3359375, "learning_rate": 7.0599318223825925e-06, "loss": 0.752, "step": 7205 }, { "epoch": 8.917748917748918, "grad_norm": 0.34375, "learning_rate": 6.980426837673437e-06, "loss": 0.7591, "step": 7210 }, { "epoch": 8.923933209647496, "grad_norm": 0.345703125, "learning_rate": 6.901355869411053e-06, "loss": 0.7568, "step": 7215 }, { "epoch": 8.930117501546073, "grad_norm": 0.333984375, "learning_rate": 6.8227192865295995e-06, "loss": 0.752, "step": 7220 }, { "epoch": 8.93630179344465, "grad_norm": 0.333984375, "learning_rate": 6.744517455936483e-06, "loss": 0.7447, "step": 7225 }, { "epoch": 8.942486085343228, "grad_norm": 0.34375, "learning_rate": 6.666750742510619e-06, "loss": 0.7554, "step": 7230 }, { "epoch": 8.948670377241806, "grad_norm": 0.341796875, "learning_rate": 6.589419509100736e-06, "loss": 0.7593, "step": 7235 }, { "epoch": 8.954854669140383, "grad_norm": 0.341796875, "learning_rate": 6.512524116523633e-06, "loss": 0.7571, "step": 7240 }, { "epoch": 8.96103896103896, "grad_norm": 0.33984375, "learning_rate": 6.436064923562601e-06, "loss": 0.7516, "step": 7245 }, { "epoch": 8.967223252937538, "grad_norm": 0.341796875, "learning_rate": 6.360042286965595e-06, "loss": 0.7647, "step": 7250 }, { "epoch": 8.973407544836116, "grad_norm": 0.3515625, "learning_rate": 6.284456561443763e-06, "loss": 0.7573, "step": 7255 }, { "epoch": 8.979591836734693, "grad_norm": 0.359375, "learning_rate": 6.209308099669597e-06, "loss": 0.7641, "step": 7260 }, { "epoch": 8.98577612863327, "grad_norm": 0.3515625, "learning_rate": 6.134597252275409e-06, "loss": 0.7687, "step": 7265 }, { "epoch": 8.99196042053185, "grad_norm": 0.34375, "learning_rate": 6.0603243678516995e-06, "loss": 0.7573, "step": 7270 }, { "epoch": 8.998144712430427, "grad_norm": 0.333984375, "learning_rate": 5.9864897929454374e-06, "loss": 0.7532, "step": 7275 }, { "epoch": 8.999381570810142, "eval_loss": 2.69946551322937, "eval_runtime": 0.8232, "eval_samples_per_second": 12.147, "eval_steps_per_second": 1.215, "step": 7276 }, { "epoch": 9.004329004329005, "grad_norm": 0.33984375, "learning_rate": 5.913093872058528e-06, "loss": 0.7539, "step": 7280 }, { "epoch": 9.010513296227582, "grad_norm": 0.333984375, "learning_rate": 5.84013694764618e-06, "loss": 0.7551, "step": 7285 }, { "epoch": 9.01669758812616, "grad_norm": 0.3515625, "learning_rate": 5.767619360115295e-06, "loss": 0.7517, "step": 7290 }, { "epoch": 9.022881880024737, "grad_norm": 0.361328125, "learning_rate": 5.695541447822905e-06, "loss": 0.7582, "step": 7295 }, { "epoch": 9.029066171923315, "grad_norm": 0.349609375, "learning_rate": 5.623903547074549e-06, "loss": 0.768, "step": 7300 }, { "epoch": 9.035250463821892, "grad_norm": 0.337890625, "learning_rate": 5.552705992122742e-06, "loss": 0.7567, "step": 7305 }, { "epoch": 9.04143475572047, "grad_norm": 0.33984375, "learning_rate": 5.481949115165452e-06, "loss": 0.7477, "step": 7310 }, { "epoch": 9.047619047619047, "grad_norm": 0.34765625, "learning_rate": 5.41163324634445e-06, "loss": 0.7538, "step": 7315 }, { "epoch": 9.053803339517625, "grad_norm": 0.345703125, "learning_rate": 5.341758713743828e-06, "loss": 0.7573, "step": 7320 }, { "epoch": 9.059987631416202, "grad_norm": 0.33203125, "learning_rate": 5.272325843388504e-06, "loss": 0.7519, "step": 7325 }, { "epoch": 9.06617192331478, "grad_norm": 0.345703125, "learning_rate": 5.2033349592426335e-06, "loss": 0.7595, "step": 7330 }, { "epoch": 9.072356215213357, "grad_norm": 0.357421875, "learning_rate": 5.134786383208112e-06, "loss": 0.7615, "step": 7335 }, { "epoch": 9.078540507111935, "grad_norm": 0.337890625, "learning_rate": 5.066680435123106e-06, "loss": 0.7608, "step": 7340 }, { "epoch": 9.084724799010512, "grad_norm": 0.341796875, "learning_rate": 4.9990174327605225e-06, "loss": 0.7508, "step": 7345 }, { "epoch": 9.090909090909092, "grad_norm": 0.3359375, "learning_rate": 4.931797691826601e-06, "loss": 0.7466, "step": 7350 }, { "epoch": 9.09709338280767, "grad_norm": 0.345703125, "learning_rate": 4.865021525959323e-06, "loss": 0.7615, "step": 7355 }, { "epoch": 9.103277674706247, "grad_norm": 0.353515625, "learning_rate": 4.798689246727006e-06, "loss": 0.7569, "step": 7360 }, { "epoch": 9.109461966604824, "grad_norm": 0.333984375, "learning_rate": 4.732801163626921e-06, "loss": 0.7551, "step": 7365 }, { "epoch": 9.115646258503402, "grad_norm": 0.3515625, "learning_rate": 4.667357584083721e-06, "loss": 0.7565, "step": 7370 }, { "epoch": 9.12183055040198, "grad_norm": 0.33984375, "learning_rate": 4.602358813448093e-06, "loss": 0.7618, "step": 7375 }, { "epoch": 9.128014842300557, "grad_norm": 0.35546875, "learning_rate": 4.537805154995278e-06, "loss": 0.7622, "step": 7380 }, { "epoch": 9.134199134199134, "grad_norm": 0.3359375, "learning_rate": 4.473696909923719e-06, "loss": 0.7535, "step": 7385 }, { "epoch": 9.140383426097712, "grad_norm": 0.341796875, "learning_rate": 4.4100343773536225e-06, "loss": 0.7599, "step": 7390 }, { "epoch": 9.14656771799629, "grad_norm": 0.337890625, "learning_rate": 4.346817854325535e-06, "loss": 0.7546, "step": 7395 }, { "epoch": 9.152752009894867, "grad_norm": 0.33984375, "learning_rate": 4.2840476357989825e-06, "loss": 0.7529, "step": 7400 }, { "epoch": 9.158936301793444, "grad_norm": 0.330078125, "learning_rate": 4.221724014651151e-06, "loss": 0.7502, "step": 7405 }, { "epoch": 9.165120593692022, "grad_norm": 0.341796875, "learning_rate": 4.159847281675411e-06, "loss": 0.7637, "step": 7410 }, { "epoch": 9.1713048855906, "grad_norm": 0.34375, "learning_rate": 4.098417725580006e-06, "loss": 0.7642, "step": 7415 }, { "epoch": 9.177489177489177, "grad_norm": 0.333984375, "learning_rate": 4.037435632986786e-06, "loss": 0.7588, "step": 7420 }, { "epoch": 9.183673469387756, "grad_norm": 0.34375, "learning_rate": 3.976901288429691e-06, "loss": 0.7466, "step": 7425 }, { "epoch": 9.189857761286333, "grad_norm": 0.349609375, "learning_rate": 3.916814974353633e-06, "loss": 0.7612, "step": 7430 }, { "epoch": 9.196042053184911, "grad_norm": 0.34375, "learning_rate": 3.857176971113019e-06, "loss": 0.758, "step": 7435 }, { "epoch": 9.202226345083488, "grad_norm": 0.345703125, "learning_rate": 3.797987556970495e-06, "loss": 0.761, "step": 7440 }, { "epoch": 9.208410636982066, "grad_norm": 0.337890625, "learning_rate": 3.7392470080957033e-06, "loss": 0.7517, "step": 7445 }, { "epoch": 9.214594928880643, "grad_norm": 0.34375, "learning_rate": 3.6809555985639068e-06, "loss": 0.7572, "step": 7450 }, { "epoch": 9.220779220779221, "grad_norm": 0.357421875, "learning_rate": 3.6231136003547106e-06, "loss": 0.7568, "step": 7455 }, { "epoch": 9.226963512677798, "grad_norm": 0.3359375, "learning_rate": 3.565721283350931e-06, "loss": 0.7516, "step": 7460 }, { "epoch": 9.233147804576376, "grad_norm": 0.34765625, "learning_rate": 3.5087789153371187e-06, "loss": 0.7611, "step": 7465 }, { "epoch": 9.239332096474953, "grad_norm": 0.341796875, "learning_rate": 3.452286761998491e-06, "loss": 0.7555, "step": 7470 }, { "epoch": 9.245516388373531, "grad_norm": 0.337890625, "learning_rate": 3.396245086919636e-06, "loss": 0.749, "step": 7475 }, { "epoch": 9.251700680272108, "grad_norm": 0.33984375, "learning_rate": 3.3406541515832003e-06, "loss": 0.7539, "step": 7480 }, { "epoch": 9.257884972170686, "grad_norm": 0.34375, "learning_rate": 3.2855142153688457e-06, "loss": 0.7569, "step": 7485 }, { "epoch": 9.264069264069263, "grad_norm": 0.34375, "learning_rate": 3.2308255355518403e-06, "loss": 0.7589, "step": 7490 }, { "epoch": 9.270253555967841, "grad_norm": 0.33984375, "learning_rate": 3.1765883673019914e-06, "loss": 0.7575, "step": 7495 }, { "epoch": 9.276437847866418, "grad_norm": 0.345703125, "learning_rate": 3.1228029636824475e-06, "loss": 0.7598, "step": 7500 }, { "epoch": 9.282622139764998, "grad_norm": 0.3359375, "learning_rate": 3.0694695756484205e-06, "loss": 0.7557, "step": 7505 }, { "epoch": 9.288806431663575, "grad_norm": 0.34765625, "learning_rate": 3.0165884520461316e-06, "loss": 0.7577, "step": 7510 }, { "epoch": 9.294990723562153, "grad_norm": 0.34375, "learning_rate": 2.96415983961158e-06, "loss": 0.7531, "step": 7515 }, { "epoch": 9.30117501546073, "grad_norm": 0.34765625, "learning_rate": 2.912183982969385e-06, "loss": 0.7611, "step": 7520 }, { "epoch": 9.307359307359308, "grad_norm": 0.337890625, "learning_rate": 2.860661124631725e-06, "loss": 0.7458, "step": 7525 }, { "epoch": 9.313543599257885, "grad_norm": 0.34375, "learning_rate": 2.809591504997111e-06, "loss": 0.7474, "step": 7530 }, { "epoch": 9.319727891156463, "grad_norm": 0.341796875, "learning_rate": 2.7589753623493142e-06, "loss": 0.758, "step": 7535 }, { "epoch": 9.32591218305504, "grad_norm": 0.33203125, "learning_rate": 2.708812932856253e-06, "loss": 0.7467, "step": 7540 }, { "epoch": 9.332096474953618, "grad_norm": 0.33984375, "learning_rate": 2.6591044505688833e-06, "loss": 0.7613, "step": 7545 }, { "epoch": 9.338280766852195, "grad_norm": 0.357421875, "learning_rate": 2.6098501474200787e-06, "loss": 0.7486, "step": 7550 }, { "epoch": 9.344465058750773, "grad_norm": 0.33984375, "learning_rate": 2.561050253223618e-06, "loss": 0.7575, "step": 7555 }, { "epoch": 9.35064935064935, "grad_norm": 0.3515625, "learning_rate": 2.5127049956730207e-06, "loss": 0.7614, "step": 7560 }, { "epoch": 9.356833642547928, "grad_norm": 0.3359375, "learning_rate": 2.4648146003405925e-06, "loss": 0.7606, "step": 7565 }, { "epoch": 9.363017934446505, "grad_norm": 0.341796875, "learning_rate": 2.4173792906762804e-06, "loss": 0.7555, "step": 7570 }, { "epoch": 9.369202226345083, "grad_norm": 0.337890625, "learning_rate": 2.3703992880066638e-06, "loss": 0.7457, "step": 7575 }, { "epoch": 9.375386518243662, "grad_norm": 0.337890625, "learning_rate": 2.3238748115339324e-06, "loss": 0.7602, "step": 7580 }, { "epoch": 9.38157081014224, "grad_norm": 0.345703125, "learning_rate": 2.277806078334843e-06, "loss": 0.7638, "step": 7585 }, { "epoch": 9.387755102040817, "grad_norm": 0.33984375, "learning_rate": 2.232193303359742e-06, "loss": 0.7501, "step": 7590 }, { "epoch": 9.393939393939394, "grad_norm": 0.353515625, "learning_rate": 2.1870366994315106e-06, "loss": 0.759, "step": 7595 }, { "epoch": 9.400123685837972, "grad_norm": 0.34375, "learning_rate": 2.1423364772445887e-06, "loss": 0.7654, "step": 7600 }, { "epoch": 9.40630797773655, "grad_norm": 0.3359375, "learning_rate": 2.0980928453640637e-06, "loss": 0.7575, "step": 7605 }, { "epoch": 9.412492269635127, "grad_norm": 0.34375, "learning_rate": 2.0543060102245717e-06, "loss": 0.7584, "step": 7610 }, { "epoch": 9.418676561533704, "grad_norm": 0.33203125, "learning_rate": 2.0109761761294087e-06, "loss": 0.7539, "step": 7615 }, { "epoch": 9.424860853432282, "grad_norm": 0.34765625, "learning_rate": 1.968103545249611e-06, "loss": 0.7491, "step": 7620 }, { "epoch": 9.43104514533086, "grad_norm": 0.337890625, "learning_rate": 1.9256883176229202e-06, "loss": 0.7501, "step": 7625 }, { "epoch": 9.437229437229437, "grad_norm": 0.341796875, "learning_rate": 1.8837306911529184e-06, "loss": 0.7571, "step": 7630 }, { "epoch": 9.443413729128014, "grad_norm": 0.341796875, "learning_rate": 1.8422308616080853e-06, "loss": 0.7482, "step": 7635 }, { "epoch": 9.449598021026592, "grad_norm": 0.34375, "learning_rate": 1.8011890226208527e-06, "loss": 0.7693, "step": 7640 }, { "epoch": 9.45578231292517, "grad_norm": 0.34765625, "learning_rate": 1.760605365686785e-06, "loss": 0.7588, "step": 7645 }, { "epoch": 9.461966604823747, "grad_norm": 0.34375, "learning_rate": 1.7204800801636e-06, "loss": 0.7564, "step": 7650 }, { "epoch": 9.468150896722324, "grad_norm": 0.34765625, "learning_rate": 1.6808133532703163e-06, "loss": 0.7676, "step": 7655 }, { "epoch": 9.474335188620904, "grad_norm": 0.341796875, "learning_rate": 1.6416053700863964e-06, "loss": 0.754, "step": 7660 }, { "epoch": 9.480519480519481, "grad_norm": 0.3515625, "learning_rate": 1.602856313550849e-06, "loss": 0.7572, "step": 7665 }, { "epoch": 9.486703772418059, "grad_norm": 0.33203125, "learning_rate": 1.5645663644614172e-06, "loss": 0.7688, "step": 7670 }, { "epoch": 9.492888064316636, "grad_norm": 0.3515625, "learning_rate": 1.5267357014737027e-06, "loss": 0.7603, "step": 7675 }, { "epoch": 9.499072356215214, "grad_norm": 0.3359375, "learning_rate": 1.489364501100332e-06, "loss": 0.7567, "step": 7680 }, { "epoch": 9.505256648113791, "grad_norm": 0.33984375, "learning_rate": 1.4524529377101358e-06, "loss": 0.7631, "step": 7685 }, { "epoch": 9.511440940012369, "grad_norm": 0.34765625, "learning_rate": 1.4160011835273934e-06, "loss": 0.7482, "step": 7690 }, { "epoch": 9.517625231910946, "grad_norm": 0.345703125, "learning_rate": 1.3800094086309112e-06, "loss": 0.7595, "step": 7695 }, { "epoch": 9.523809523809524, "grad_norm": 0.33984375, "learning_rate": 1.344477780953346e-06, "loss": 0.7519, "step": 7700 }, { "epoch": 9.529993815708101, "grad_norm": 0.34375, "learning_rate": 1.3094064662803385e-06, "loss": 0.7549, "step": 7705 }, { "epoch": 9.536178107606679, "grad_norm": 0.341796875, "learning_rate": 1.274795628249792e-06, "loss": 0.7529, "step": 7710 }, { "epoch": 9.542362399505256, "grad_norm": 0.34375, "learning_rate": 1.2406454283510948e-06, "loss": 0.7552, "step": 7715 }, { "epoch": 9.548546691403834, "grad_norm": 0.333984375, "learning_rate": 1.2069560259243328e-06, "loss": 0.749, "step": 7720 }, { "epoch": 9.554730983302411, "grad_norm": 0.33984375, "learning_rate": 1.173727578159589e-06, "loss": 0.7532, "step": 7725 }, { "epoch": 9.560915275200989, "grad_norm": 0.349609375, "learning_rate": 1.1409602400962227e-06, "loss": 0.7513, "step": 7730 }, { "epoch": 9.567099567099568, "grad_norm": 0.341796875, "learning_rate": 1.1086541646220693e-06, "loss": 0.7514, "step": 7735 }, { "epoch": 9.573283858998145, "grad_norm": 0.333984375, "learning_rate": 1.076809502472831e-06, "loss": 0.7645, "step": 7740 }, { "epoch": 9.579468150896723, "grad_norm": 0.333984375, "learning_rate": 1.0454264022312644e-06, "loss": 0.7566, "step": 7745 }, { "epoch": 9.5856524427953, "grad_norm": 0.353515625, "learning_rate": 1.014505010326583e-06, "loss": 0.7652, "step": 7750 }, { "epoch": 9.591836734693878, "grad_norm": 0.3515625, "learning_rate": 9.840454710337122e-07, "loss": 0.7521, "step": 7755 }, { "epoch": 9.598021026592455, "grad_norm": 0.3359375, "learning_rate": 9.540479264726676e-07, "loss": 0.7529, "step": 7760 }, { "epoch": 9.604205318491033, "grad_norm": 0.3359375, "learning_rate": 9.245125166078005e-07, "loss": 0.7574, "step": 7765 }, { "epoch": 9.61038961038961, "grad_norm": 0.34375, "learning_rate": 8.954393792472649e-07, "loss": 0.7483, "step": 7770 }, { "epoch": 9.616573902288188, "grad_norm": 0.341796875, "learning_rate": 8.668286500422951e-07, "loss": 0.7509, "step": 7775 }, { "epoch": 9.622758194186765, "grad_norm": 0.33984375, "learning_rate": 8.386804624865851e-07, "loss": 0.7564, "step": 7780 }, { "epoch": 9.628942486085343, "grad_norm": 0.34765625, "learning_rate": 8.109949479156886e-07, "loss": 0.7577, "step": 7785 }, { "epoch": 9.63512677798392, "grad_norm": 0.345703125, "learning_rate": 7.837722355063637e-07, "loss": 0.7574, "step": 7790 }, { "epoch": 9.641311069882498, "grad_norm": 0.341796875, "learning_rate": 7.570124522760402e-07, "loss": 0.7579, "step": 7795 }, { "epoch": 9.647495361781075, "grad_norm": 0.341796875, "learning_rate": 7.307157230821426e-07, "loss": 0.7525, "step": 7800 }, { "epoch": 9.653679653679653, "grad_norm": 0.33984375, "learning_rate": 7.048821706215792e-07, "loss": 0.7549, "step": 7805 }, { "epoch": 9.65986394557823, "grad_norm": 0.333984375, "learning_rate": 6.7951191543012e-07, "loss": 0.7468, "step": 7810 }, { "epoch": 9.666048237476808, "grad_norm": 0.337890625, "learning_rate": 6.546050758818756e-07, "loss": 0.7565, "step": 7815 }, { "epoch": 9.672232529375387, "grad_norm": 0.3515625, "learning_rate": 6.301617681886863e-07, "loss": 0.7556, "step": 7820 }, { "epoch": 9.678416821273965, "grad_norm": 0.33203125, "learning_rate": 6.061821063996665e-07, "loss": 0.7546, "step": 7825 }, { "epoch": 9.684601113172542, "grad_norm": 0.34375, "learning_rate": 5.826662024005835e-07, "loss": 0.7565, "step": 7830 }, { "epoch": 9.69078540507112, "grad_norm": 0.341796875, "learning_rate": 5.596141659133913e-07, "loss": 0.7576, "step": 7835 }, { "epoch": 9.696969696969697, "grad_norm": 0.337890625, "learning_rate": 5.370261044956971e-07, "loss": 0.7583, "step": 7840 }, { "epoch": 9.703153988868275, "grad_norm": 0.345703125, "learning_rate": 5.149021235402729e-07, "loss": 0.7512, "step": 7845 }, { "epoch": 9.709338280766852, "grad_norm": 0.337890625, "learning_rate": 4.932423262745456e-07, "loss": 0.7478, "step": 7850 }, { "epoch": 9.71552257266543, "grad_norm": 0.345703125, "learning_rate": 4.7204681376014084e-07, "loss": 0.7614, "step": 7855 }, { "epoch": 9.721706864564007, "grad_norm": 0.34765625, "learning_rate": 4.5131568489236166e-07, "loss": 0.7562, "step": 7860 }, { "epoch": 9.727891156462585, "grad_norm": 0.365234375, "learning_rate": 4.3104903639981097e-07, "loss": 0.7494, "step": 7865 }, { "epoch": 9.734075448361162, "grad_norm": 0.337890625, "learning_rate": 4.112469628438365e-07, "loss": 0.7569, "step": 7870 }, { "epoch": 9.74025974025974, "grad_norm": 0.341796875, "learning_rate": 3.919095566181974e-07, "loss": 0.7585, "step": 7875 }, { "epoch": 9.746444032158317, "grad_norm": 0.3515625, "learning_rate": 3.73036907948543e-07, "loss": 0.7532, "step": 7880 }, { "epoch": 9.752628324056895, "grad_norm": 0.3359375, "learning_rate": 3.546291048920347e-07, "loss": 0.7544, "step": 7885 }, { "epoch": 9.758812615955474, "grad_norm": 0.34375, "learning_rate": 3.366862333369358e-07, "loss": 0.7473, "step": 7890 }, { "epoch": 9.764996907854051, "grad_norm": 0.337890625, "learning_rate": 3.192083770021892e-07, "loss": 0.7525, "step": 7895 }, { "epoch": 9.771181199752629, "grad_norm": 0.34765625, "learning_rate": 3.0219561743707326e-07, "loss": 0.7504, "step": 7900 }, { "epoch": 9.777365491651206, "grad_norm": 0.3515625, "learning_rate": 2.856480340207579e-07, "loss": 0.7495, "step": 7905 }, { "epoch": 9.783549783549784, "grad_norm": 0.337890625, "learning_rate": 2.6956570396197143e-07, "loss": 0.7595, "step": 7910 }, { "epoch": 9.789734075448361, "grad_norm": 0.3359375, "learning_rate": 2.539487022986453e-07, "loss": 0.7584, "step": 7915 }, { "epoch": 9.795918367346939, "grad_norm": 0.34375, "learning_rate": 2.3879710189753656e-07, "loss": 0.7476, "step": 7920 }, { "epoch": 9.802102659245516, "grad_norm": 0.34375, "learning_rate": 2.2411097345392818e-07, "loss": 0.7463, "step": 7925 }, { "epoch": 9.808286951144094, "grad_norm": 0.337890625, "learning_rate": 2.098903854912515e-07, "loss": 0.7549, "step": 7930 }, { "epoch": 9.814471243042671, "grad_norm": 0.34375, "learning_rate": 1.9613540436080878e-07, "loss": 0.7567, "step": 7935 }, { "epoch": 9.820655534941249, "grad_norm": 0.353515625, "learning_rate": 1.8284609424142895e-07, "loss": 0.7578, "step": 7940 }, { "epoch": 9.826839826839826, "grad_norm": 0.33984375, "learning_rate": 1.7002251713920114e-07, "loss": 0.7542, "step": 7945 }, { "epoch": 9.833024118738404, "grad_norm": 0.345703125, "learning_rate": 1.5766473288715278e-07, "loss": 0.7565, "step": 7950 }, { "epoch": 9.839208410636981, "grad_norm": 0.3515625, "learning_rate": 1.457727991449942e-07, "loss": 0.7649, "step": 7955 }, { "epoch": 9.845392702535559, "grad_norm": 0.345703125, "learning_rate": 1.3434677139885222e-07, "loss": 0.753, "step": 7960 }, { "epoch": 9.851576994434136, "grad_norm": 0.349609375, "learning_rate": 1.2338670296097034e-07, "loss": 0.7556, "step": 7965 }, { "epoch": 9.857761286332714, "grad_norm": 0.349609375, "learning_rate": 1.1289264496953111e-07, "loss": 0.7629, "step": 7970 }, { "epoch": 9.863945578231293, "grad_norm": 0.345703125, "learning_rate": 1.0286464638834536e-07, "loss": 0.7587, "step": 7975 }, { "epoch": 9.87012987012987, "grad_norm": 0.34765625, "learning_rate": 9.330275400666332e-08, "loss": 0.7626, "step": 7980 }, { "epoch": 9.876314162028448, "grad_norm": 0.33984375, "learning_rate": 8.420701243895268e-08, "loss": 0.7528, "step": 7985 }, { "epoch": 9.882498453927026, "grad_norm": 0.34765625, "learning_rate": 7.557746412468758e-08, "loss": 0.7496, "step": 7990 }, { "epoch": 9.888682745825603, "grad_norm": 0.3359375, "learning_rate": 6.741414932813773e-08, "loss": 0.7468, "step": 7995 }, { "epoch": 9.89486703772418, "grad_norm": 0.34765625, "learning_rate": 5.971710613821291e-08, "loss": 0.7611, "step": 8000 }, { "epoch": 9.901051329622758, "grad_norm": 0.333984375, "learning_rate": 5.248637046824101e-08, "loss": 0.753, "step": 8005 }, { "epoch": 9.907235621521336, "grad_norm": 0.33984375, "learning_rate": 4.572197605583473e-08, "loss": 0.7515, "step": 8010 }, { "epoch": 9.913419913419913, "grad_norm": 0.365234375, "learning_rate": 3.9423954462713964e-08, "loss": 0.7673, "step": 8015 }, { "epoch": 9.91960420531849, "grad_norm": 0.3359375, "learning_rate": 3.359233507459481e-08, "loss": 0.7515, "step": 8020 }, { "epoch": 9.925788497217068, "grad_norm": 0.349609375, "learning_rate": 2.8227145100989672e-08, "loss": 0.7569, "step": 8025 }, { "epoch": 9.931972789115646, "grad_norm": 0.341796875, "learning_rate": 2.3328409575129608e-08, "loss": 0.7495, "step": 8030 }, { "epoch": 9.938157081014223, "grad_norm": 0.341796875, "learning_rate": 1.8896151353853253e-08, "loss": 0.7516, "step": 8035 }, { "epoch": 9.9443413729128, "grad_norm": 0.3515625, "learning_rate": 1.4930391117451426e-08, "loss": 0.7551, "step": 8040 }, { "epoch": 9.95052566481138, "grad_norm": 0.34375, "learning_rate": 1.1431147369611595e-08, "loss": 0.7553, "step": 8045 }, { "epoch": 9.956709956709958, "grad_norm": 0.33203125, "learning_rate": 8.398436437317969e-09, "loss": 0.759, "step": 8050 }, { "epoch": 9.962894248608535, "grad_norm": 0.3359375, "learning_rate": 5.832272470795985e-09, "loss": 0.7562, "step": 8055 }, { "epoch": 9.969078540507113, "grad_norm": 0.3359375, "learning_rate": 3.732667443390181e-09, "loss": 0.7497, "step": 8060 }, { "epoch": 9.97526283240569, "grad_norm": 0.353515625, "learning_rate": 2.099631151586401e-09, "loss": 0.7637, "step": 8065 }, { "epoch": 9.981447124304268, "grad_norm": 0.33984375, "learning_rate": 9.33171214889672e-10, "loss": 0.7618, "step": 8070 }, { "epoch": 9.987631416202845, "grad_norm": 0.337890625, "learning_rate": 2.3329307584640804e-10, "loss": 0.7574, "step": 8075 }, { "epoch": 9.993815708101423, "grad_norm": 0.337890625, "learning_rate": 0.0, "loss": 0.7647, "step": 8080 }, { "epoch": 9.993815708101423, "eval_loss": 2.699876308441162, "eval_runtime": 0.5372, "eval_samples_per_second": 18.614, "eval_steps_per_second": 1.861, "step": 8080 }, { "epoch": 9.993815708101423, "step": 8080, "total_flos": 4.816075145514844e+18, "train_loss": 0.8524611343841741, "train_runtime": 49320.3929, "train_samples_per_second": 7.867, "train_steps_per_second": 0.164 } ], "logging_steps": 5, "max_steps": 8080, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.816075145514844e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }