{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 13550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007380073800738007, "grad_norm": 100.91170501708984, "learning_rate": 7.380073800738008e-07, "loss": 15.7032, "step": 10 }, { "epoch": 0.0014760147601476014, "grad_norm": 53.92641830444336, "learning_rate": 1.4760147601476015e-06, "loss": 12.3986, "step": 20 }, { "epoch": 0.002214022140221402, "grad_norm": 22.292367935180664, "learning_rate": 2.2140221402214023e-06, "loss": 10.8644, "step": 30 }, { "epoch": 0.002952029520295203, "grad_norm": 9.494361877441406, "learning_rate": 2.952029520295203e-06, "loss": 10.0464, "step": 40 }, { "epoch": 0.0036900369003690036, "grad_norm": 5.657174110412598, "learning_rate": 3.690036900369004e-06, "loss": 9.8267, "step": 50 }, { "epoch": 0.004428044280442804, "grad_norm": 11.480326652526855, "learning_rate": 4.428044280442805e-06, "loss": 9.6732, "step": 60 }, { "epoch": 0.0051660516605166054, "grad_norm": 32.78021240234375, "learning_rate": 5.166051660516605e-06, "loss": 9.6283, "step": 70 }, { "epoch": 0.005904059040590406, "grad_norm": 26.172266006469727, "learning_rate": 5.904059040590406e-06, "loss": 9.4751, "step": 80 }, { "epoch": 0.006642066420664207, "grad_norm": 33.70742416381836, "learning_rate": 6.642066420664207e-06, "loss": 9.3959, "step": 90 }, { "epoch": 0.007380073800738007, "grad_norm": 63.11279296875, "learning_rate": 7.380073800738008e-06, "loss": 9.3828, "step": 100 }, { "epoch": 0.008118081180811807, "grad_norm": 15.8975191116333, "learning_rate": 8.118081180811808e-06, "loss": 9.2352, "step": 110 }, { "epoch": 0.008856088560885609, "grad_norm": 12.312295913696289, "learning_rate": 8.85608856088561e-06, "loss": 9.1436, "step": 120 }, { "epoch": 0.00959409594095941, "grad_norm": 10.606693267822266, "learning_rate": 9.59409594095941e-06, "loss": 8.8854, "step": 130 }, { "epoch": 0.010332103321033211, "grad_norm": 16.000524520874023, "learning_rate": 1.033210332103321e-05, "loss": 8.703, "step": 140 }, { "epoch": 0.01107011070110701, "grad_norm": 14.297750473022461, "learning_rate": 1.1070110701107012e-05, "loss": 8.5243, "step": 150 }, { "epoch": 0.011808118081180811, "grad_norm": 11.472665786743164, "learning_rate": 1.1808118081180812e-05, "loss": 8.232, "step": 160 }, { "epoch": 0.012546125461254613, "grad_norm": 7.633975028991699, "learning_rate": 1.2546125461254612e-05, "loss": 8.0453, "step": 170 }, { "epoch": 0.013284132841328414, "grad_norm": 7.606258869171143, "learning_rate": 1.3284132841328414e-05, "loss": 7.9445, "step": 180 }, { "epoch": 0.014022140221402213, "grad_norm": 13.680715560913086, "learning_rate": 1.4022140221402214e-05, "loss": 7.9335, "step": 190 }, { "epoch": 0.014760147601476014, "grad_norm": 10.28775405883789, "learning_rate": 1.4760147601476015e-05, "loss": 7.7923, "step": 200 }, { "epoch": 0.015498154981549815, "grad_norm": 7.461697101593018, "learning_rate": 1.5498154981549817e-05, "loss": 7.763, "step": 210 }, { "epoch": 0.016236162361623615, "grad_norm": 4.384743690490723, "learning_rate": 1.6236162361623615e-05, "loss": 7.7702, "step": 220 }, { "epoch": 0.016974169741697416, "grad_norm": 5.806989669799805, "learning_rate": 1.6974169741697417e-05, "loss": 7.76, "step": 230 }, { "epoch": 0.017712177121771217, "grad_norm": 5.75732421875, "learning_rate": 1.771217712177122e-05, "loss": 7.6101, "step": 240 }, { "epoch": 0.01845018450184502, "grad_norm": 3.3278968334198, "learning_rate": 1.845018450184502e-05, "loss": 7.5669, "step": 250 }, { "epoch": 0.01918819188191882, "grad_norm": 5.252697467803955, "learning_rate": 1.918819188191882e-05, "loss": 7.3905, "step": 260 }, { "epoch": 0.01992619926199262, "grad_norm": 3.135658025741577, "learning_rate": 1.992619926199262e-05, "loss": 7.3472, "step": 270 }, { "epoch": 0.020664206642066422, "grad_norm": 5.030785083770752, "learning_rate": 2.066420664206642e-05, "loss": 7.2426, "step": 280 }, { "epoch": 0.021402214022140223, "grad_norm": 4.882932186126709, "learning_rate": 2.140221402214022e-05, "loss": 7.0541, "step": 290 }, { "epoch": 0.02214022140221402, "grad_norm": 2.2638933658599854, "learning_rate": 2.2140221402214025e-05, "loss": 7.0113, "step": 300 }, { "epoch": 0.022878228782287822, "grad_norm": 4.782796859741211, "learning_rate": 2.2878228782287826e-05, "loss": 6.8661, "step": 310 }, { "epoch": 0.023616236162361623, "grad_norm": 1.9799453020095825, "learning_rate": 2.3616236162361624e-05, "loss": 7.0323, "step": 320 }, { "epoch": 0.024354243542435424, "grad_norm": 4.8417558670043945, "learning_rate": 2.4354243542435426e-05, "loss": 6.8865, "step": 330 }, { "epoch": 0.025092250922509225, "grad_norm": 4.531852722167969, "learning_rate": 2.5092250922509224e-05, "loss": 6.7982, "step": 340 }, { "epoch": 0.025830258302583026, "grad_norm": 3.0997111797332764, "learning_rate": 2.5830258302583026e-05, "loss": 6.79, "step": 350 }, { "epoch": 0.026568265682656828, "grad_norm": 3.2981128692626953, "learning_rate": 2.6568265682656828e-05, "loss": 6.7459, "step": 360 }, { "epoch": 0.02730627306273063, "grad_norm": 3.313589572906494, "learning_rate": 2.730627306273063e-05, "loss": 6.637, "step": 370 }, { "epoch": 0.028044280442804426, "grad_norm": 2.1940433979034424, "learning_rate": 2.8044280442804427e-05, "loss": 6.5645, "step": 380 }, { "epoch": 0.028782287822878228, "grad_norm": 3.6912360191345215, "learning_rate": 2.878228782287823e-05, "loss": 6.4398, "step": 390 }, { "epoch": 0.02952029520295203, "grad_norm": 3.37406325340271, "learning_rate": 2.952029520295203e-05, "loss": 6.4774, "step": 400 }, { "epoch": 0.03025830258302583, "grad_norm": 3.22963285446167, "learning_rate": 3.0258302583025832e-05, "loss": 6.3106, "step": 410 }, { "epoch": 0.03099630996309963, "grad_norm": 2.419431686401367, "learning_rate": 3.0996309963099634e-05, "loss": 6.3172, "step": 420 }, { "epoch": 0.03173431734317343, "grad_norm": 2.677661895751953, "learning_rate": 3.173431734317343e-05, "loss": 6.1359, "step": 430 }, { "epoch": 0.03247232472324723, "grad_norm": 2.795398712158203, "learning_rate": 3.247232472324723e-05, "loss": 6.2412, "step": 440 }, { "epoch": 0.033210332103321034, "grad_norm": 2.9979288578033447, "learning_rate": 3.3210332103321035e-05, "loss": 6.2192, "step": 450 }, { "epoch": 0.03394833948339483, "grad_norm": 3.352975845336914, "learning_rate": 3.3948339483394833e-05, "loss": 6.1654, "step": 460 }, { "epoch": 0.03468634686346864, "grad_norm": 2.6526570320129395, "learning_rate": 3.468634686346864e-05, "loss": 6.0669, "step": 470 }, { "epoch": 0.035424354243542434, "grad_norm": 2.950063467025757, "learning_rate": 3.542435424354244e-05, "loss": 6.0332, "step": 480 }, { "epoch": 0.03616236162361624, "grad_norm": 4.221488952636719, "learning_rate": 3.6162361623616235e-05, "loss": 5.9708, "step": 490 }, { "epoch": 0.03690036900369004, "grad_norm": 2.6405985355377197, "learning_rate": 3.690036900369004e-05, "loss": 5.76, "step": 500 }, { "epoch": 0.037638376383763834, "grad_norm": 4.019585132598877, "learning_rate": 3.763837638376384e-05, "loss": 5.9036, "step": 510 }, { "epoch": 0.03837638376383764, "grad_norm": 2.687580108642578, "learning_rate": 3.837638376383764e-05, "loss": 5.808, "step": 520 }, { "epoch": 0.03911439114391144, "grad_norm": 3.339268207550049, "learning_rate": 3.911439114391144e-05, "loss": 5.7391, "step": 530 }, { "epoch": 0.03985239852398524, "grad_norm": 3.0441882610321045, "learning_rate": 3.985239852398524e-05, "loss": 5.7045, "step": 540 }, { "epoch": 0.04059040590405904, "grad_norm": 2.8957650661468506, "learning_rate": 4.0590405904059045e-05, "loss": 5.6956, "step": 550 }, { "epoch": 0.041328413284132844, "grad_norm": 3.6834869384765625, "learning_rate": 4.132841328413284e-05, "loss": 5.6103, "step": 560 }, { "epoch": 0.04206642066420664, "grad_norm": 3.4573564529418945, "learning_rate": 4.206642066420665e-05, "loss": 5.4107, "step": 570 }, { "epoch": 0.042804428044280446, "grad_norm": 3.2341487407684326, "learning_rate": 4.280442804428044e-05, "loss": 5.4208, "step": 580 }, { "epoch": 0.043542435424354244, "grad_norm": 3.6147806644439697, "learning_rate": 4.3542435424354244e-05, "loss": 5.4217, "step": 590 }, { "epoch": 0.04428044280442804, "grad_norm": 3.6139488220214844, "learning_rate": 4.428044280442805e-05, "loss": 5.3994, "step": 600 }, { "epoch": 0.045018450184501846, "grad_norm": 3.277580499649048, "learning_rate": 4.501845018450185e-05, "loss": 5.3605, "step": 610 }, { "epoch": 0.045756457564575644, "grad_norm": 2.5641043186187744, "learning_rate": 4.575645756457565e-05, "loss": 5.1682, "step": 620 }, { "epoch": 0.04649446494464945, "grad_norm": 2.422578811645508, "learning_rate": 4.6494464944649444e-05, "loss": 5.1585, "step": 630 }, { "epoch": 0.047232472324723246, "grad_norm": 4.027858257293701, "learning_rate": 4.723247232472325e-05, "loss": 5.2147, "step": 640 }, { "epoch": 0.04797047970479705, "grad_norm": 2.401747226715088, "learning_rate": 4.797047970479705e-05, "loss": 5.0839, "step": 650 }, { "epoch": 0.04870848708487085, "grad_norm": 2.9220759868621826, "learning_rate": 4.870848708487085e-05, "loss": 5.1216, "step": 660 }, { "epoch": 0.04944649446494465, "grad_norm": 2.4891719818115234, "learning_rate": 4.944649446494466e-05, "loss": 4.9789, "step": 670 }, { "epoch": 0.05018450184501845, "grad_norm": 2.279683828353882, "learning_rate": 5.018450184501845e-05, "loss": 4.9611, "step": 680 }, { "epoch": 0.05092250922509225, "grad_norm": 2.045536518096924, "learning_rate": 5.0922509225092254e-05, "loss": 4.8993, "step": 690 }, { "epoch": 0.05166051660516605, "grad_norm": 1.9132373332977295, "learning_rate": 5.166051660516605e-05, "loss": 4.8083, "step": 700 }, { "epoch": 0.05239852398523985, "grad_norm": 2.304215669631958, "learning_rate": 5.239852398523986e-05, "loss": 4.828, "step": 710 }, { "epoch": 0.053136531365313655, "grad_norm": 2.2891597747802734, "learning_rate": 5.3136531365313655e-05, "loss": 4.7838, "step": 720 }, { "epoch": 0.05387453874538745, "grad_norm": 2.411600351333618, "learning_rate": 5.387453874538746e-05, "loss": 4.6931, "step": 730 }, { "epoch": 0.05461254612546126, "grad_norm": 1.6772541999816895, "learning_rate": 5.461254612546126e-05, "loss": 4.7027, "step": 740 }, { "epoch": 0.055350553505535055, "grad_norm": 1.7979137897491455, "learning_rate": 5.535055350553506e-05, "loss": 4.7452, "step": 750 }, { "epoch": 0.05608856088560885, "grad_norm": 2.3298912048339844, "learning_rate": 5.6088560885608855e-05, "loss": 4.7062, "step": 760 }, { "epoch": 0.05682656826568266, "grad_norm": 1.986875295639038, "learning_rate": 5.682656826568265e-05, "loss": 4.6142, "step": 770 }, { "epoch": 0.057564575645756455, "grad_norm": 1.8501532077789307, "learning_rate": 5.756457564575646e-05, "loss": 4.4524, "step": 780 }, { "epoch": 0.05830258302583026, "grad_norm": 1.5959872007369995, "learning_rate": 5.830258302583026e-05, "loss": 4.5299, "step": 790 }, { "epoch": 0.05904059040590406, "grad_norm": 2.339456796646118, "learning_rate": 5.904059040590406e-05, "loss": 4.4703, "step": 800 }, { "epoch": 0.05977859778597786, "grad_norm": 1.6436880826950073, "learning_rate": 5.9778597785977866e-05, "loss": 4.463, "step": 810 }, { "epoch": 0.06051660516605166, "grad_norm": 1.7336505651474, "learning_rate": 6.0516605166051664e-05, "loss": 4.4363, "step": 820 }, { "epoch": 0.061254612546125464, "grad_norm": 1.691726565361023, "learning_rate": 6.125461254612547e-05, "loss": 4.4099, "step": 830 }, { "epoch": 0.06199261992619926, "grad_norm": 1.5019862651824951, "learning_rate": 6.199261992619927e-05, "loss": 4.4153, "step": 840 }, { "epoch": 0.06273062730627306, "grad_norm": 1.4851793050765991, "learning_rate": 6.273062730627307e-05, "loss": 4.4721, "step": 850 }, { "epoch": 0.06346863468634686, "grad_norm": 1.4793798923492432, "learning_rate": 6.346863468634686e-05, "loss": 4.3287, "step": 860 }, { "epoch": 0.06420664206642067, "grad_norm": 1.5791796445846558, "learning_rate": 6.420664206642066e-05, "loss": 4.3766, "step": 870 }, { "epoch": 0.06494464944649446, "grad_norm": 1.5449219942092896, "learning_rate": 6.494464944649446e-05, "loss": 4.2412, "step": 880 }, { "epoch": 0.06568265682656826, "grad_norm": 1.229464054107666, "learning_rate": 6.568265682656827e-05, "loss": 4.1558, "step": 890 }, { "epoch": 0.06642066420664207, "grad_norm": 1.5863291025161743, "learning_rate": 6.642066420664207e-05, "loss": 4.2074, "step": 900 }, { "epoch": 0.06715867158671587, "grad_norm": 1.319446086883545, "learning_rate": 6.715867158671587e-05, "loss": 4.258, "step": 910 }, { "epoch": 0.06789667896678966, "grad_norm": 1.3132025003433228, "learning_rate": 6.789667896678967e-05, "loss": 4.1444, "step": 920 }, { "epoch": 0.06863468634686347, "grad_norm": 1.5694645643234253, "learning_rate": 6.863468634686348e-05, "loss": 4.1201, "step": 930 }, { "epoch": 0.06937269372693727, "grad_norm": 1.4163988828659058, "learning_rate": 6.937269372693728e-05, "loss": 4.1113, "step": 940 }, { "epoch": 0.07011070110701106, "grad_norm": 1.487798810005188, "learning_rate": 7.011070110701108e-05, "loss": 4.0805, "step": 950 }, { "epoch": 0.07084870848708487, "grad_norm": 1.2213908433914185, "learning_rate": 7.084870848708487e-05, "loss": 4.0324, "step": 960 }, { "epoch": 0.07158671586715867, "grad_norm": 1.332588791847229, "learning_rate": 7.158671586715867e-05, "loss": 4.0455, "step": 970 }, { "epoch": 0.07232472324723248, "grad_norm": 1.212963342666626, "learning_rate": 7.232472324723247e-05, "loss": 4.0044, "step": 980 }, { "epoch": 0.07306273062730627, "grad_norm": 1.0928430557250977, "learning_rate": 7.306273062730628e-05, "loss": 4.0215, "step": 990 }, { "epoch": 0.07380073800738007, "grad_norm": 1.1430400609970093, "learning_rate": 7.380073800738008e-05, "loss": 4.0744, "step": 1000 }, { "epoch": 0.07453874538745388, "grad_norm": 0.9975944757461548, "learning_rate": 7.453874538745388e-05, "loss": 3.9955, "step": 1010 }, { "epoch": 0.07527675276752767, "grad_norm": 1.1288777589797974, "learning_rate": 7.527675276752768e-05, "loss": 3.9916, "step": 1020 }, { "epoch": 0.07601476014760147, "grad_norm": 1.0064688920974731, "learning_rate": 7.601476014760149e-05, "loss": 3.9025, "step": 1030 }, { "epoch": 0.07675276752767528, "grad_norm": 1.329229474067688, "learning_rate": 7.675276752767529e-05, "loss": 3.9822, "step": 1040 }, { "epoch": 0.07749077490774908, "grad_norm": 1.022760033607483, "learning_rate": 7.749077490774908e-05, "loss": 3.8762, "step": 1050 }, { "epoch": 0.07822878228782287, "grad_norm": 1.0934398174285889, "learning_rate": 7.822878228782288e-05, "loss": 3.7911, "step": 1060 }, { "epoch": 0.07896678966789668, "grad_norm": 1.008171796798706, "learning_rate": 7.896678966789668e-05, "loss": 3.8972, "step": 1070 }, { "epoch": 0.07970479704797048, "grad_norm": 1.1563254594802856, "learning_rate": 7.970479704797048e-05, "loss": 3.7901, "step": 1080 }, { "epoch": 0.08044280442804429, "grad_norm": 1.06783926486969, "learning_rate": 8.044280442804428e-05, "loss": 3.9768, "step": 1090 }, { "epoch": 0.08118081180811808, "grad_norm": 1.0809143781661987, "learning_rate": 8.118081180811809e-05, "loss": 3.7534, "step": 1100 }, { "epoch": 0.08191881918819188, "grad_norm": 1.040157675743103, "learning_rate": 8.191881918819189e-05, "loss": 3.7609, "step": 1110 }, { "epoch": 0.08265682656826569, "grad_norm": 1.0198458433151245, "learning_rate": 8.265682656826569e-05, "loss": 3.8031, "step": 1120 }, { "epoch": 0.08339483394833948, "grad_norm": 1.3017419576644897, "learning_rate": 8.339483394833948e-05, "loss": 3.8526, "step": 1130 }, { "epoch": 0.08413284132841328, "grad_norm": 0.9285693168640137, "learning_rate": 8.41328413284133e-05, "loss": 3.7551, "step": 1140 }, { "epoch": 0.08487084870848709, "grad_norm": 0.9603882431983948, "learning_rate": 8.48708487084871e-05, "loss": 3.834, "step": 1150 }, { "epoch": 0.08560885608856089, "grad_norm": 0.9195291996002197, "learning_rate": 8.560885608856088e-05, "loss": 3.7804, "step": 1160 }, { "epoch": 0.08634686346863468, "grad_norm": 1.0526838302612305, "learning_rate": 8.634686346863469e-05, "loss": 3.8757, "step": 1170 }, { "epoch": 0.08708487084870849, "grad_norm": 0.8891322612762451, "learning_rate": 8.708487084870849e-05, "loss": 3.7513, "step": 1180 }, { "epoch": 0.08782287822878229, "grad_norm": 0.9467900395393372, "learning_rate": 8.782287822878229e-05, "loss": 3.7555, "step": 1190 }, { "epoch": 0.08856088560885608, "grad_norm": 1.0294831991195679, "learning_rate": 8.85608856088561e-05, "loss": 3.834, "step": 1200 }, { "epoch": 0.08929889298892989, "grad_norm": 1.0832924842834473, "learning_rate": 8.92988929889299e-05, "loss": 3.6299, "step": 1210 }, { "epoch": 0.09003690036900369, "grad_norm": 0.9595062732696533, "learning_rate": 9.00369003690037e-05, "loss": 3.7695, "step": 1220 }, { "epoch": 0.0907749077490775, "grad_norm": 0.8714928030967712, "learning_rate": 9.077490774907749e-05, "loss": 3.7346, "step": 1230 }, { "epoch": 0.09151291512915129, "grad_norm": 0.9189225435256958, "learning_rate": 9.15129151291513e-05, "loss": 3.7646, "step": 1240 }, { "epoch": 0.09225092250922509, "grad_norm": 1.0212230682373047, "learning_rate": 9.22509225092251e-05, "loss": 3.6518, "step": 1250 }, { "epoch": 0.0929889298892989, "grad_norm": 0.8631012439727783, "learning_rate": 9.298892988929889e-05, "loss": 3.6702, "step": 1260 }, { "epoch": 0.09372693726937269, "grad_norm": 0.76339191198349, "learning_rate": 9.37269372693727e-05, "loss": 3.6757, "step": 1270 }, { "epoch": 0.09446494464944649, "grad_norm": 0.8459323048591614, "learning_rate": 9.44649446494465e-05, "loss": 3.8173, "step": 1280 }, { "epoch": 0.0952029520295203, "grad_norm": 0.7884580492973328, "learning_rate": 9.52029520295203e-05, "loss": 3.6719, "step": 1290 }, { "epoch": 0.0959409594095941, "grad_norm": 0.9069279432296753, "learning_rate": 9.59409594095941e-05, "loss": 3.6447, "step": 1300 }, { "epoch": 0.09667896678966789, "grad_norm": 0.8386545181274414, "learning_rate": 9.66789667896679e-05, "loss": 3.6681, "step": 1310 }, { "epoch": 0.0974169741697417, "grad_norm": 0.8082497119903564, "learning_rate": 9.74169741697417e-05, "loss": 3.6526, "step": 1320 }, { "epoch": 0.0981549815498155, "grad_norm": 0.7619675993919373, "learning_rate": 9.81549815498155e-05, "loss": 3.6717, "step": 1330 }, { "epoch": 0.0988929889298893, "grad_norm": 0.803425133228302, "learning_rate": 9.889298892988931e-05, "loss": 3.5892, "step": 1340 }, { "epoch": 0.0996309963099631, "grad_norm": 0.8372170925140381, "learning_rate": 9.963099630996311e-05, "loss": 3.6615, "step": 1350 }, { "epoch": 0.1003690036900369, "grad_norm": 0.8343318700790405, "learning_rate": 9.999995852216369e-05, "loss": 3.5785, "step": 1360 }, { "epoch": 0.1011070110701107, "grad_norm": 0.8367707133293152, "learning_rate": 9.999962669988607e-05, "loss": 3.625, "step": 1370 }, { "epoch": 0.1018450184501845, "grad_norm": 0.8662716150283813, "learning_rate": 9.999896305753297e-05, "loss": 3.6656, "step": 1380 }, { "epoch": 0.1025830258302583, "grad_norm": 0.747052788734436, "learning_rate": 9.999796759950864e-05, "loss": 3.5761, "step": 1390 }, { "epoch": 0.1033210332103321, "grad_norm": 0.7763943672180176, "learning_rate": 9.999664033241933e-05, "loss": 3.5234, "step": 1400 }, { "epoch": 0.10405904059040591, "grad_norm": 0.7435528039932251, "learning_rate": 9.99949812650734e-05, "loss": 3.5132, "step": 1410 }, { "epoch": 0.1047970479704797, "grad_norm": 0.8303211331367493, "learning_rate": 9.999299040848121e-05, "loss": 3.5173, "step": 1420 }, { "epoch": 0.1055350553505535, "grad_norm": 0.8359752297401428, "learning_rate": 9.999066777585495e-05, "loss": 3.5605, "step": 1430 }, { "epoch": 0.10627306273062731, "grad_norm": 0.909545361995697, "learning_rate": 9.998801338260865e-05, "loss": 3.5839, "step": 1440 }, { "epoch": 0.1070110701107011, "grad_norm": 0.845916748046875, "learning_rate": 9.99850272463581e-05, "loss": 3.5685, "step": 1450 }, { "epoch": 0.1077490774907749, "grad_norm": 0.834235429763794, "learning_rate": 9.99817093869206e-05, "loss": 3.5476, "step": 1460 }, { "epoch": 0.10848708487084871, "grad_norm": 0.7273171544075012, "learning_rate": 9.997805982631499e-05, "loss": 3.4777, "step": 1470 }, { "epoch": 0.10922509225092251, "grad_norm": 0.839796245098114, "learning_rate": 9.99740785887614e-05, "loss": 3.5084, "step": 1480 }, { "epoch": 0.1099630996309963, "grad_norm": 0.7638348340988159, "learning_rate": 9.99697657006811e-05, "loss": 3.5741, "step": 1490 }, { "epoch": 0.11070110701107011, "grad_norm": 0.7195069193840027, "learning_rate": 9.996512119069636e-05, "loss": 3.5083, "step": 1500 }, { "epoch": 0.11143911439114391, "grad_norm": 0.7351711392402649, "learning_rate": 9.996014508963028e-05, "loss": 3.365, "step": 1510 }, { "epoch": 0.1121771217712177, "grad_norm": 0.7192705869674683, "learning_rate": 9.995483743050648e-05, "loss": 3.5233, "step": 1520 }, { "epoch": 0.11291512915129151, "grad_norm": 0.7362285256385803, "learning_rate": 9.994919824854898e-05, "loss": 3.5548, "step": 1530 }, { "epoch": 0.11365313653136531, "grad_norm": 0.6908057928085327, "learning_rate": 9.994322758118196e-05, "loss": 3.4293, "step": 1540 }, { "epoch": 0.11439114391143912, "grad_norm": 0.7892534136772156, "learning_rate": 9.993692546802941e-05, "loss": 3.4583, "step": 1550 }, { "epoch": 0.11512915129151291, "grad_norm": 0.7085639834403992, "learning_rate": 9.993029195091505e-05, "loss": 3.4349, "step": 1560 }, { "epoch": 0.11586715867158671, "grad_norm": 0.7825974225997925, "learning_rate": 9.992332707386188e-05, "loss": 3.4496, "step": 1570 }, { "epoch": 0.11660516605166052, "grad_norm": 0.7284643054008484, "learning_rate": 9.991603088309194e-05, "loss": 3.517, "step": 1580 }, { "epoch": 0.11734317343173432, "grad_norm": 0.7682483792304993, "learning_rate": 9.990840342702606e-05, "loss": 3.4505, "step": 1590 }, { "epoch": 0.11808118081180811, "grad_norm": 0.8391796350479126, "learning_rate": 9.990044475628347e-05, "loss": 3.5077, "step": 1600 }, { "epoch": 0.11881918819188192, "grad_norm": 0.7043576836585999, "learning_rate": 9.989215492368151e-05, "loss": 3.4272, "step": 1610 }, { "epoch": 0.11955719557195572, "grad_norm": 0.72553551197052, "learning_rate": 9.988353398423527e-05, "loss": 3.3559, "step": 1620 }, { "epoch": 0.12029520295202951, "grad_norm": 0.7156850099563599, "learning_rate": 9.987458199515713e-05, "loss": 3.4108, "step": 1630 }, { "epoch": 0.12103321033210332, "grad_norm": 0.6410751342773438, "learning_rate": 9.98652990158566e-05, "loss": 3.4688, "step": 1640 }, { "epoch": 0.12177121771217712, "grad_norm": 0.8124927282333374, "learning_rate": 9.985568510793967e-05, "loss": 3.4611, "step": 1650 }, { "epoch": 0.12250922509225093, "grad_norm": 0.7403334379196167, "learning_rate": 9.984574033520857e-05, "loss": 3.4669, "step": 1660 }, { "epoch": 0.12324723247232472, "grad_norm": 0.662948727607727, "learning_rate": 9.983546476366132e-05, "loss": 3.4798, "step": 1670 }, { "epoch": 0.12398523985239852, "grad_norm": 0.6987183690071106, "learning_rate": 9.982485846149125e-05, "loss": 3.3932, "step": 1680 }, { "epoch": 0.12472324723247233, "grad_norm": 0.650486171245575, "learning_rate": 9.981392149908652e-05, "loss": 3.3856, "step": 1690 }, { "epoch": 0.12546125461254612, "grad_norm": 0.6416191458702087, "learning_rate": 9.98026539490298e-05, "loss": 3.455, "step": 1700 }, { "epoch": 0.12619926199261994, "grad_norm": 0.6319407820701599, "learning_rate": 9.979105588609762e-05, "loss": 3.4001, "step": 1710 }, { "epoch": 0.12693726937269373, "grad_norm": 0.6667493581771851, "learning_rate": 9.977912738725994e-05, "loss": 3.4277, "step": 1720 }, { "epoch": 0.12767527675276752, "grad_norm": 0.6686265468597412, "learning_rate": 9.976686853167967e-05, "loss": 3.4075, "step": 1730 }, { "epoch": 0.12841328413284134, "grad_norm": 0.731555700302124, "learning_rate": 9.975427940071211e-05, "loss": 3.4226, "step": 1740 }, { "epoch": 0.12915129151291513, "grad_norm": 0.6553905606269836, "learning_rate": 9.97413600779044e-05, "loss": 3.4306, "step": 1750 }, { "epoch": 0.12988929889298892, "grad_norm": 0.7509811520576477, "learning_rate": 9.9728110648995e-05, "loss": 3.3937, "step": 1760 }, { "epoch": 0.13062730627306274, "grad_norm": 0.7052728533744812, "learning_rate": 9.971453120191309e-05, "loss": 3.3822, "step": 1770 }, { "epoch": 0.13136531365313653, "grad_norm": 0.6742541790008545, "learning_rate": 9.970062182677801e-05, "loss": 3.3824, "step": 1780 }, { "epoch": 0.13210332103321032, "grad_norm": 0.6257262825965881, "learning_rate": 9.968638261589866e-05, "loss": 3.4047, "step": 1790 }, { "epoch": 0.13284132841328414, "grad_norm": 0.6546107530593872, "learning_rate": 9.967181366377285e-05, "loss": 3.3903, "step": 1800 }, { "epoch": 0.13357933579335793, "grad_norm": 0.8019782304763794, "learning_rate": 9.965691506708672e-05, "loss": 3.3911, "step": 1810 }, { "epoch": 0.13431734317343175, "grad_norm": 0.6207643151283264, "learning_rate": 9.964168692471408e-05, "loss": 3.3861, "step": 1820 }, { "epoch": 0.13505535055350554, "grad_norm": 0.6750718355178833, "learning_rate": 9.962612933771576e-05, "loss": 3.4424, "step": 1830 }, { "epoch": 0.13579335793357933, "grad_norm": 0.9330940246582031, "learning_rate": 9.961024240933892e-05, "loss": 3.3459, "step": 1840 }, { "epoch": 0.13653136531365315, "grad_norm": 0.7058202028274536, "learning_rate": 9.959402624501636e-05, "loss": 3.3327, "step": 1850 }, { "epoch": 0.13726937269372694, "grad_norm": 0.779712438583374, "learning_rate": 9.957748095236589e-05, "loss": 3.4398, "step": 1860 }, { "epoch": 0.13800738007380073, "grad_norm": 0.663960337638855, "learning_rate": 9.956060664118951e-05, "loss": 3.3513, "step": 1870 }, { "epoch": 0.13874538745387455, "grad_norm": 0.756618082523346, "learning_rate": 9.954340342347279e-05, "loss": 3.304, "step": 1880 }, { "epoch": 0.13948339483394834, "grad_norm": 0.7523687481880188, "learning_rate": 9.952587141338403e-05, "loss": 3.3155, "step": 1890 }, { "epoch": 0.14022140221402213, "grad_norm": 0.6524930596351624, "learning_rate": 9.950801072727356e-05, "loss": 3.3803, "step": 1900 }, { "epoch": 0.14095940959409595, "grad_norm": 0.7161090970039368, "learning_rate": 9.948982148367292e-05, "loss": 3.4219, "step": 1910 }, { "epoch": 0.14169741697416974, "grad_norm": 0.7181054949760437, "learning_rate": 9.947130380329418e-05, "loss": 3.301, "step": 1920 }, { "epoch": 0.14243542435424356, "grad_norm": 0.6185216903686523, "learning_rate": 9.945245780902899e-05, "loss": 3.3666, "step": 1930 }, { "epoch": 0.14317343173431735, "grad_norm": 0.6279731392860413, "learning_rate": 9.943328362594788e-05, "loss": 3.2862, "step": 1940 }, { "epoch": 0.14391143911439114, "grad_norm": 0.6401661038398743, "learning_rate": 9.941378138129938e-05, "loss": 3.3112, "step": 1950 }, { "epoch": 0.14464944649446496, "grad_norm": 0.6105781197547913, "learning_rate": 9.939395120450916e-05, "loss": 3.3539, "step": 1960 }, { "epoch": 0.14538745387453875, "grad_norm": 0.6660001873970032, "learning_rate": 9.937379322717924e-05, "loss": 3.3722, "step": 1970 }, { "epoch": 0.14612546125461254, "grad_norm": 0.6415931582450867, "learning_rate": 9.935330758308705e-05, "loss": 3.3329, "step": 1980 }, { "epoch": 0.14686346863468636, "grad_norm": 0.6147580742835999, "learning_rate": 9.933249440818455e-05, "loss": 3.2807, "step": 1990 }, { "epoch": 0.14760147601476015, "grad_norm": 0.694519579410553, "learning_rate": 9.931135384059736e-05, "loss": 3.2662, "step": 2000 }, { "epoch": 0.14833948339483394, "grad_norm": 0.6452217102050781, "learning_rate": 9.928988602062384e-05, "loss": 3.2942, "step": 2010 }, { "epoch": 0.14907749077490776, "grad_norm": 0.6983804106712341, "learning_rate": 9.926809109073412e-05, "loss": 3.2639, "step": 2020 }, { "epoch": 0.14981549815498155, "grad_norm": 0.6302483677864075, "learning_rate": 9.924596919556917e-05, "loss": 3.3648, "step": 2030 }, { "epoch": 0.15055350553505534, "grad_norm": 0.6506009697914124, "learning_rate": 9.922352048193986e-05, "loss": 3.3417, "step": 2040 }, { "epoch": 0.15129151291512916, "grad_norm": 0.6232055425643921, "learning_rate": 9.920074509882602e-05, "loss": 3.3304, "step": 2050 }, { "epoch": 0.15202952029520295, "grad_norm": 0.6454508900642395, "learning_rate": 9.917764319737533e-05, "loss": 3.2585, "step": 2060 }, { "epoch": 0.15276752767527677, "grad_norm": 0.6281662583351135, "learning_rate": 9.915421493090243e-05, "loss": 3.2753, "step": 2070 }, { "epoch": 0.15350553505535056, "grad_norm": 0.7222394943237305, "learning_rate": 9.913046045488786e-05, "loss": 3.2683, "step": 2080 }, { "epoch": 0.15424354243542435, "grad_norm": 0.6333222389221191, "learning_rate": 9.910637992697707e-05, "loss": 3.2676, "step": 2090 }, { "epoch": 0.15498154981549817, "grad_norm": 0.6758008003234863, "learning_rate": 9.908197350697926e-05, "loss": 3.2941, "step": 2100 }, { "epoch": 0.15571955719557196, "grad_norm": 0.5930529832839966, "learning_rate": 9.905724135686648e-05, "loss": 3.3365, "step": 2110 }, { "epoch": 0.15645756457564575, "grad_norm": 0.7024756669998169, "learning_rate": 9.903218364077243e-05, "loss": 3.2594, "step": 2120 }, { "epoch": 0.15719557195571957, "grad_norm": 0.6018502712249756, "learning_rate": 9.900680052499138e-05, "loss": 3.3316, "step": 2130 }, { "epoch": 0.15793357933579336, "grad_norm": 0.6856579184532166, "learning_rate": 9.898109217797717e-05, "loss": 3.3196, "step": 2140 }, { "epoch": 0.15867158671586715, "grad_norm": 0.6864190101623535, "learning_rate": 9.895505877034198e-05, "loss": 3.3116, "step": 2150 }, { "epoch": 0.15940959409594097, "grad_norm": 0.57015061378479, "learning_rate": 9.892870047485526e-05, "loss": 3.3119, "step": 2160 }, { "epoch": 0.16014760147601476, "grad_norm": 0.5812332630157471, "learning_rate": 9.89020174664425e-05, "loss": 3.2727, "step": 2170 }, { "epoch": 0.16088560885608857, "grad_norm": 0.6356363296508789, "learning_rate": 9.887500992218421e-05, "loss": 3.3661, "step": 2180 }, { "epoch": 0.16162361623616237, "grad_norm": 0.672024130821228, "learning_rate": 9.884767802131465e-05, "loss": 3.3215, "step": 2190 }, { "epoch": 0.16236162361623616, "grad_norm": 0.6531562805175781, "learning_rate": 9.882002194522064e-05, "loss": 3.2374, "step": 2200 }, { "epoch": 0.16309963099630997, "grad_norm": 0.6039624214172363, "learning_rate": 9.879204187744036e-05, "loss": 3.2342, "step": 2210 }, { "epoch": 0.16383763837638377, "grad_norm": 0.5702035427093506, "learning_rate": 9.876373800366215e-05, "loss": 3.3181, "step": 2220 }, { "epoch": 0.16457564575645756, "grad_norm": 0.6860033273696899, "learning_rate": 9.87351105117233e-05, "loss": 3.3758, "step": 2230 }, { "epoch": 0.16531365313653137, "grad_norm": 0.6462620496749878, "learning_rate": 9.870615959160875e-05, "loss": 3.3542, "step": 2240 }, { "epoch": 0.16605166051660517, "grad_norm": 0.6575970649719238, "learning_rate": 9.867688543544988e-05, "loss": 3.2135, "step": 2250 }, { "epoch": 0.16678966789667896, "grad_norm": 0.6185761094093323, "learning_rate": 9.86472882375232e-05, "loss": 3.294, "step": 2260 }, { "epoch": 0.16752767527675277, "grad_norm": 0.6141475439071655, "learning_rate": 9.861736819424902e-05, "loss": 3.1992, "step": 2270 }, { "epoch": 0.16826568265682657, "grad_norm": 0.6172120571136475, "learning_rate": 9.85871255041903e-05, "loss": 3.2167, "step": 2280 }, { "epoch": 0.16900369003690036, "grad_norm": 0.5904815196990967, "learning_rate": 9.855656036805114e-05, "loss": 3.2945, "step": 2290 }, { "epoch": 0.16974169741697417, "grad_norm": 0.6383630633354187, "learning_rate": 9.852567298867557e-05, "loss": 3.2865, "step": 2300 }, { "epoch": 0.17047970479704797, "grad_norm": 0.60262531042099, "learning_rate": 9.84944635710462e-05, "loss": 3.2188, "step": 2310 }, { "epoch": 0.17121771217712178, "grad_norm": 0.5909958481788635, "learning_rate": 9.846293232228274e-05, "loss": 3.2896, "step": 2320 }, { "epoch": 0.17195571955719557, "grad_norm": 0.5554500818252563, "learning_rate": 9.843107945164086e-05, "loss": 3.1705, "step": 2330 }, { "epoch": 0.17269372693726937, "grad_norm": 0.620606005191803, "learning_rate": 9.83989051705105e-05, "loss": 3.2288, "step": 2340 }, { "epoch": 0.17343173431734318, "grad_norm": 0.6841108202934265, "learning_rate": 9.836640969241475e-05, "loss": 3.2441, "step": 2350 }, { "epoch": 0.17416974169741697, "grad_norm": 0.6839698553085327, "learning_rate": 9.833359323300826e-05, "loss": 3.2246, "step": 2360 }, { "epoch": 0.17490774907749077, "grad_norm": 0.7128744721412659, "learning_rate": 9.830045601007584e-05, "loss": 3.2008, "step": 2370 }, { "epoch": 0.17564575645756458, "grad_norm": 0.65251624584198, "learning_rate": 9.826699824353106e-05, "loss": 3.3275, "step": 2380 }, { "epoch": 0.17638376383763837, "grad_norm": 0.5380867123603821, "learning_rate": 9.823322015541474e-05, "loss": 3.2064, "step": 2390 }, { "epoch": 0.17712177121771217, "grad_norm": 0.5963719487190247, "learning_rate": 9.819912196989351e-05, "loss": 3.1643, "step": 2400 }, { "epoch": 0.17785977859778598, "grad_norm": 0.8703069090843201, "learning_rate": 9.816470391325832e-05, "loss": 3.1848, "step": 2410 }, { "epoch": 0.17859778597785977, "grad_norm": 0.608935534954071, "learning_rate": 9.81299662139229e-05, "loss": 3.2719, "step": 2420 }, { "epoch": 0.1793357933579336, "grad_norm": 0.6425730586051941, "learning_rate": 9.809490910242229e-05, "loss": 3.2619, "step": 2430 }, { "epoch": 0.18007380073800738, "grad_norm": 0.5790001749992371, "learning_rate": 9.805953281141131e-05, "loss": 3.243, "step": 2440 }, { "epoch": 0.18081180811808117, "grad_norm": 0.6436141133308411, "learning_rate": 9.802383757566301e-05, "loss": 3.2284, "step": 2450 }, { "epoch": 0.181549815498155, "grad_norm": 0.5458927154541016, "learning_rate": 9.798782363206702e-05, "loss": 3.2043, "step": 2460 }, { "epoch": 0.18228782287822878, "grad_norm": 0.6296219229698181, "learning_rate": 9.795149121962815e-05, "loss": 3.2683, "step": 2470 }, { "epoch": 0.18302583025830257, "grad_norm": 0.6964813470840454, "learning_rate": 9.791484057946465e-05, "loss": 3.1977, "step": 2480 }, { "epoch": 0.1837638376383764, "grad_norm": 0.5911018252372742, "learning_rate": 9.787787195480672e-05, "loss": 3.2263, "step": 2490 }, { "epoch": 0.18450184501845018, "grad_norm": 0.5431626439094543, "learning_rate": 9.784058559099483e-05, "loss": 3.1628, "step": 2500 }, { "epoch": 0.18523985239852397, "grad_norm": 0.6068975329399109, "learning_rate": 9.78029817354781e-05, "loss": 3.1828, "step": 2510 }, { "epoch": 0.1859778597785978, "grad_norm": 0.580287516117096, "learning_rate": 9.776506063781269e-05, "loss": 3.2248, "step": 2520 }, { "epoch": 0.18671586715867158, "grad_norm": 0.6136944890022278, "learning_rate": 9.772682254966008e-05, "loss": 3.2495, "step": 2530 }, { "epoch": 0.18745387453874537, "grad_norm": 0.6076098680496216, "learning_rate": 9.76882677247855e-05, "loss": 3.1979, "step": 2540 }, { "epoch": 0.1881918819188192, "grad_norm": 0.5682818293571472, "learning_rate": 9.764939641905615e-05, "loss": 3.1714, "step": 2550 }, { "epoch": 0.18892988929889298, "grad_norm": 0.5991480350494385, "learning_rate": 9.761020889043954e-05, "loss": 3.154, "step": 2560 }, { "epoch": 0.1896678966789668, "grad_norm": 0.6232896447181702, "learning_rate": 9.75707053990018e-05, "loss": 3.2036, "step": 2570 }, { "epoch": 0.1904059040590406, "grad_norm": 0.5560643672943115, "learning_rate": 9.75308862069059e-05, "loss": 3.2392, "step": 2580 }, { "epoch": 0.19114391143911438, "grad_norm": 0.5718569755554199, "learning_rate": 9.749075157840996e-05, "loss": 3.2528, "step": 2590 }, { "epoch": 0.1918819188191882, "grad_norm": 0.5662999749183655, "learning_rate": 9.74503017798655e-05, "loss": 3.2256, "step": 2600 }, { "epoch": 0.192619926199262, "grad_norm": 0.6026265621185303, "learning_rate": 9.74095370797156e-05, "loss": 3.2183, "step": 2610 }, { "epoch": 0.19335793357933578, "grad_norm": 0.6032066941261292, "learning_rate": 9.736845774849321e-05, "loss": 3.2418, "step": 2620 }, { "epoch": 0.1940959409594096, "grad_norm": 0.5830618143081665, "learning_rate": 9.732706405881931e-05, "loss": 3.191, "step": 2630 }, { "epoch": 0.1948339483394834, "grad_norm": 0.5695509314537048, "learning_rate": 9.728535628540109e-05, "loss": 3.1968, "step": 2640 }, { "epoch": 0.19557195571955718, "grad_norm": 0.5905478000640869, "learning_rate": 9.724333470503013e-05, "loss": 3.2596, "step": 2650 }, { "epoch": 0.196309963099631, "grad_norm": 0.5251249670982361, "learning_rate": 9.720099959658062e-05, "loss": 3.1729, "step": 2660 }, { "epoch": 0.1970479704797048, "grad_norm": 0.6502349972724915, "learning_rate": 9.715835124100742e-05, "loss": 3.2604, "step": 2670 }, { "epoch": 0.1977859778597786, "grad_norm": 0.6250560283660889, "learning_rate": 9.711538992134426e-05, "loss": 3.2194, "step": 2680 }, { "epoch": 0.1985239852398524, "grad_norm": 0.5793785452842712, "learning_rate": 9.707211592270183e-05, "loss": 3.1994, "step": 2690 }, { "epoch": 0.1992619926199262, "grad_norm": 0.6495150327682495, "learning_rate": 9.70285295322659e-05, "loss": 3.1919, "step": 2700 }, { "epoch": 0.2, "grad_norm": 0.5875915288925171, "learning_rate": 9.698463103929542e-05, "loss": 3.2464, "step": 2710 }, { "epoch": 0.2007380073800738, "grad_norm": 0.5518725514411926, "learning_rate": 9.69404207351206e-05, "loss": 3.2042, "step": 2720 }, { "epoch": 0.2014760147601476, "grad_norm": 0.5390283465385437, "learning_rate": 9.689589891314094e-05, "loss": 3.2012, "step": 2730 }, { "epoch": 0.2022140221402214, "grad_norm": 0.5596645474433899, "learning_rate": 9.685106586882336e-05, "loss": 3.2053, "step": 2740 }, { "epoch": 0.2029520295202952, "grad_norm": 0.5377479195594788, "learning_rate": 9.680592189970015e-05, "loss": 3.177, "step": 2750 }, { "epoch": 0.203690036900369, "grad_norm": 0.5858853459358215, "learning_rate": 9.676046730536704e-05, "loss": 3.2039, "step": 2760 }, { "epoch": 0.2044280442804428, "grad_norm": 0.5771840810775757, "learning_rate": 9.671470238748124e-05, "loss": 3.1654, "step": 2770 }, { "epoch": 0.2051660516605166, "grad_norm": 0.5626157522201538, "learning_rate": 9.666862744975938e-05, "loss": 3.1978, "step": 2780 }, { "epoch": 0.2059040590405904, "grad_norm": 0.5536968111991882, "learning_rate": 9.662224279797552e-05, "loss": 3.2152, "step": 2790 }, { "epoch": 0.2066420664206642, "grad_norm": 0.5982388854026794, "learning_rate": 9.657554873995913e-05, "loss": 3.1699, "step": 2800 }, { "epoch": 0.207380073800738, "grad_norm": 0.5761833190917969, "learning_rate": 9.652854558559308e-05, "loss": 3.1766, "step": 2810 }, { "epoch": 0.20811808118081182, "grad_norm": 0.5907506346702576, "learning_rate": 9.648123364681145e-05, "loss": 3.0935, "step": 2820 }, { "epoch": 0.2088560885608856, "grad_norm": 0.5584788918495178, "learning_rate": 9.643361323759763e-05, "loss": 3.1111, "step": 2830 }, { "epoch": 0.2095940959409594, "grad_norm": 0.5568063855171204, "learning_rate": 9.638568467398215e-05, "loss": 3.1739, "step": 2840 }, { "epoch": 0.21033210332103322, "grad_norm": 0.5453604459762573, "learning_rate": 9.633744827404055e-05, "loss": 3.2064, "step": 2850 }, { "epoch": 0.211070110701107, "grad_norm": 0.6171849966049194, "learning_rate": 9.628890435789135e-05, "loss": 3.2281, "step": 2860 }, { "epoch": 0.2118081180811808, "grad_norm": 0.5285280346870422, "learning_rate": 9.624005324769388e-05, "loss": 3.113, "step": 2870 }, { "epoch": 0.21254612546125462, "grad_norm": 0.5632630586624146, "learning_rate": 9.619089526764614e-05, "loss": 3.1592, "step": 2880 }, { "epoch": 0.2132841328413284, "grad_norm": 0.6024160385131836, "learning_rate": 9.614143074398264e-05, "loss": 3.1904, "step": 2890 }, { "epoch": 0.2140221402214022, "grad_norm": 0.5437342524528503, "learning_rate": 9.609166000497229e-05, "loss": 3.1156, "step": 2900 }, { "epoch": 0.21476014760147602, "grad_norm": 0.5884766578674316, "learning_rate": 9.604158338091615e-05, "loss": 3.1888, "step": 2910 }, { "epoch": 0.2154981549815498, "grad_norm": 0.547242283821106, "learning_rate": 9.599120120414531e-05, "loss": 3.1079, "step": 2920 }, { "epoch": 0.21623616236162363, "grad_norm": 0.5443885326385498, "learning_rate": 9.594051380901859e-05, "loss": 3.1147, "step": 2930 }, { "epoch": 0.21697416974169742, "grad_norm": 0.5350677371025085, "learning_rate": 9.588952153192041e-05, "loss": 3.1061, "step": 2940 }, { "epoch": 0.2177121771217712, "grad_norm": 0.5434796214103699, "learning_rate": 9.583822471125854e-05, "loss": 3.1172, "step": 2950 }, { "epoch": 0.21845018450184503, "grad_norm": 0.5185326933860779, "learning_rate": 9.578662368746182e-05, "loss": 3.2186, "step": 2960 }, { "epoch": 0.21918819188191882, "grad_norm": 0.5394032001495361, "learning_rate": 9.57347188029779e-05, "loss": 3.1628, "step": 2970 }, { "epoch": 0.2199261992619926, "grad_norm": 0.5857832431793213, "learning_rate": 9.568251040227101e-05, "loss": 3.1291, "step": 2980 }, { "epoch": 0.22066420664206643, "grad_norm": 0.6189760565757751, "learning_rate": 9.562999883181967e-05, "loss": 3.1305, "step": 2990 }, { "epoch": 0.22140221402214022, "grad_norm": 0.5518510937690735, "learning_rate": 9.557718444011431e-05, "loss": 3.2148, "step": 3000 }, { "epoch": 0.222140221402214, "grad_norm": 0.5947515964508057, "learning_rate": 9.552406757765509e-05, "loss": 3.1322, "step": 3010 }, { "epoch": 0.22287822878228783, "grad_norm": 0.5554746985435486, "learning_rate": 9.547064859694943e-05, "loss": 3.1822, "step": 3020 }, { "epoch": 0.22361623616236162, "grad_norm": 0.5308244824409485, "learning_rate": 9.541692785250981e-05, "loss": 3.1371, "step": 3030 }, { "epoch": 0.2243542435424354, "grad_norm": 0.5285702347755432, "learning_rate": 9.536290570085131e-05, "loss": 3.1329, "step": 3040 }, { "epoch": 0.22509225092250923, "grad_norm": 0.5468854904174805, "learning_rate": 9.530858250048932e-05, "loss": 3.2538, "step": 3050 }, { "epoch": 0.22583025830258302, "grad_norm": 0.5449059009552002, "learning_rate": 9.525395861193707e-05, "loss": 3.2139, "step": 3060 }, { "epoch": 0.22656826568265684, "grad_norm": 0.5692685842514038, "learning_rate": 9.519903439770332e-05, "loss": 3.1138, "step": 3070 }, { "epoch": 0.22730627306273063, "grad_norm": 0.5263866782188416, "learning_rate": 9.514381022228997e-05, "loss": 3.0872, "step": 3080 }, { "epoch": 0.22804428044280442, "grad_norm": 0.5696788430213928, "learning_rate": 9.50882864521895e-05, "loss": 3.167, "step": 3090 }, { "epoch": 0.22878228782287824, "grad_norm": 0.5760169625282288, "learning_rate": 9.503246345588274e-05, "loss": 3.15, "step": 3100 }, { "epoch": 0.22952029520295203, "grad_norm": 0.5390339493751526, "learning_rate": 9.497634160383626e-05, "loss": 3.1367, "step": 3110 }, { "epoch": 0.23025830258302582, "grad_norm": 0.5490269660949707, "learning_rate": 9.491992126849997e-05, "loss": 3.1779, "step": 3120 }, { "epoch": 0.23099630996309964, "grad_norm": 0.5177121758460999, "learning_rate": 9.486320282430468e-05, "loss": 3.0789, "step": 3130 }, { "epoch": 0.23173431734317343, "grad_norm": 0.5448027849197388, "learning_rate": 9.480618664765955e-05, "loss": 3.1866, "step": 3140 }, { "epoch": 0.23247232472324722, "grad_norm": 0.5371176600456238, "learning_rate": 9.474887311694968e-05, "loss": 3.2089, "step": 3150 }, { "epoch": 0.23321033210332104, "grad_norm": 0.6013469099998474, "learning_rate": 9.469126261253348e-05, "loss": 3.1159, "step": 3160 }, { "epoch": 0.23394833948339483, "grad_norm": 0.5597007274627686, "learning_rate": 9.463335551674025e-05, "loss": 3.124, "step": 3170 }, { "epoch": 0.23468634686346865, "grad_norm": 0.5460641384124756, "learning_rate": 9.45751522138676e-05, "loss": 3.103, "step": 3180 }, { "epoch": 0.23542435424354244, "grad_norm": 0.5389031767845154, "learning_rate": 9.45166530901789e-05, "loss": 3.1502, "step": 3190 }, { "epoch": 0.23616236162361623, "grad_norm": 0.5293789505958557, "learning_rate": 9.445785853390073e-05, "loss": 3.0856, "step": 3200 }, { "epoch": 0.23690036900369005, "grad_norm": 0.677259087562561, "learning_rate": 9.439876893522028e-05, "loss": 3.1143, "step": 3210 }, { "epoch": 0.23763837638376384, "grad_norm": 0.5259451866149902, "learning_rate": 9.433938468628277e-05, "loss": 3.1628, "step": 3220 }, { "epoch": 0.23837638376383763, "grad_norm": 0.5321341156959534, "learning_rate": 9.427970618118888e-05, "loss": 3.1164, "step": 3230 }, { "epoch": 0.23911439114391145, "grad_norm": 0.5752614140510559, "learning_rate": 9.421973381599208e-05, "loss": 3.0361, "step": 3240 }, { "epoch": 0.23985239852398524, "grad_norm": 0.5552977323532104, "learning_rate": 9.415946798869602e-05, "loss": 3.1452, "step": 3250 }, { "epoch": 0.24059040590405903, "grad_norm": 0.5862517952919006, "learning_rate": 9.409890909925193e-05, "loss": 3.1493, "step": 3260 }, { "epoch": 0.24132841328413285, "grad_norm": 0.5374996066093445, "learning_rate": 9.40380575495559e-05, "loss": 3.1315, "step": 3270 }, { "epoch": 0.24206642066420664, "grad_norm": 0.5315213203430176, "learning_rate": 9.39769137434463e-05, "loss": 3.1218, "step": 3280 }, { "epoch": 0.24280442804428043, "grad_norm": 0.5306174159049988, "learning_rate": 9.391547808670096e-05, "loss": 3.0916, "step": 3290 }, { "epoch": 0.24354243542435425, "grad_norm": 0.5105913281440735, "learning_rate": 9.385375098703465e-05, "loss": 3.0469, "step": 3300 }, { "epoch": 0.24428044280442804, "grad_norm": 0.5171898603439331, "learning_rate": 9.379173285409621e-05, "loss": 3.068, "step": 3310 }, { "epoch": 0.24501845018450186, "grad_norm": 0.5028154253959656, "learning_rate": 9.372942409946596e-05, "loss": 3.1542, "step": 3320 }, { "epoch": 0.24575645756457565, "grad_norm": 0.5281797647476196, "learning_rate": 9.366682513665293e-05, "loss": 3.1484, "step": 3330 }, { "epoch": 0.24649446494464944, "grad_norm": 0.5240592956542969, "learning_rate": 9.360393638109201e-05, "loss": 3.103, "step": 3340 }, { "epoch": 0.24723247232472326, "grad_norm": 0.5516790747642517, "learning_rate": 9.354075825014139e-05, "loss": 3.0701, "step": 3350 }, { "epoch": 0.24797047970479705, "grad_norm": 0.6081251502037048, "learning_rate": 9.347729116307964e-05, "loss": 3.1434, "step": 3360 }, { "epoch": 0.24870848708487084, "grad_norm": 0.5216418504714966, "learning_rate": 9.341353554110297e-05, "loss": 3.1567, "step": 3370 }, { "epoch": 0.24944649446494466, "grad_norm": 0.5264909863471985, "learning_rate": 9.334949180732245e-05, "loss": 3.162, "step": 3380 }, { "epoch": 0.25018450184501845, "grad_norm": 0.4942391812801361, "learning_rate": 9.328516038676119e-05, "loss": 3.1532, "step": 3390 }, { "epoch": 0.25092250922509224, "grad_norm": 0.5401615500450134, "learning_rate": 9.322054170635149e-05, "loss": 3.1, "step": 3400 }, { "epoch": 0.25166051660516603, "grad_norm": 0.5021462440490723, "learning_rate": 9.315563619493209e-05, "loss": 3.0438, "step": 3410 }, { "epoch": 0.2523985239852399, "grad_norm": 0.5627569556236267, "learning_rate": 9.309044428324522e-05, "loss": 3.2005, "step": 3420 }, { "epoch": 0.25313653136531367, "grad_norm": 0.514385461807251, "learning_rate": 9.302496640393382e-05, "loss": 3.1035, "step": 3430 }, { "epoch": 0.25387453874538746, "grad_norm": 0.5261507630348206, "learning_rate": 9.295920299153863e-05, "loss": 3.1706, "step": 3440 }, { "epoch": 0.25461254612546125, "grad_norm": 0.5069513916969299, "learning_rate": 9.289315448249531e-05, "loss": 3.1218, "step": 3450 }, { "epoch": 0.25535055350553504, "grad_norm": 0.49072757363319397, "learning_rate": 9.282682131513157e-05, "loss": 3.1231, "step": 3460 }, { "epoch": 0.25608856088560883, "grad_norm": 0.6358250379562378, "learning_rate": 9.276020392966422e-05, "loss": 3.1082, "step": 3470 }, { "epoch": 0.2568265682656827, "grad_norm": 0.5456467270851135, "learning_rate": 9.26933027681963e-05, "loss": 3.1454, "step": 3480 }, { "epoch": 0.25756457564575647, "grad_norm": 0.5754953026771545, "learning_rate": 9.262611827471406e-05, "loss": 3.1334, "step": 3490 }, { "epoch": 0.25830258302583026, "grad_norm": 0.5355437397956848, "learning_rate": 9.25586508950841e-05, "loss": 3.0149, "step": 3500 }, { "epoch": 0.25904059040590405, "grad_norm": 0.5386449694633484, "learning_rate": 9.249090107705044e-05, "loss": 3.1859, "step": 3510 }, { "epoch": 0.25977859778597784, "grad_norm": 0.5665399432182312, "learning_rate": 9.242286927023136e-05, "loss": 3.171, "step": 3520 }, { "epoch": 0.2605166051660517, "grad_norm": 0.5453583002090454, "learning_rate": 9.235455592611665e-05, "loss": 3.1198, "step": 3530 }, { "epoch": 0.2612546125461255, "grad_norm": 0.5409013032913208, "learning_rate": 9.22859614980645e-05, "loss": 3.0841, "step": 3540 }, { "epoch": 0.26199261992619927, "grad_norm": 0.5243815779685974, "learning_rate": 9.221708644129843e-05, "loss": 3.13, "step": 3550 }, { "epoch": 0.26273062730627306, "grad_norm": 0.562589168548584, "learning_rate": 9.214793121290442e-05, "loss": 3.0718, "step": 3560 }, { "epoch": 0.26346863468634685, "grad_norm": 0.5075133442878723, "learning_rate": 9.207849627182772e-05, "loss": 3.1159, "step": 3570 }, { "epoch": 0.26420664206642064, "grad_norm": 0.5348154902458191, "learning_rate": 9.200878207886993e-05, "loss": 3.1932, "step": 3580 }, { "epoch": 0.2649446494464945, "grad_norm": 0.5550357103347778, "learning_rate": 9.19387890966859e-05, "loss": 3.0973, "step": 3590 }, { "epoch": 0.2656826568265683, "grad_norm": 0.534482479095459, "learning_rate": 9.186851778978062e-05, "loss": 3.1466, "step": 3600 }, { "epoch": 0.26642066420664207, "grad_norm": 0.521537184715271, "learning_rate": 9.179796862450618e-05, "loss": 3.0424, "step": 3610 }, { "epoch": 0.26715867158671586, "grad_norm": 0.5350748896598816, "learning_rate": 9.172714206905866e-05, "loss": 3.0505, "step": 3620 }, { "epoch": 0.26789667896678965, "grad_norm": 0.5348935127258301, "learning_rate": 9.165603859347502e-05, "loss": 3.1561, "step": 3630 }, { "epoch": 0.2686346863468635, "grad_norm": 0.5182725191116333, "learning_rate": 9.158465866963002e-05, "loss": 3.0778, "step": 3640 }, { "epoch": 0.2693726937269373, "grad_norm": 0.5188565850257874, "learning_rate": 9.151300277123301e-05, "loss": 3.0517, "step": 3650 }, { "epoch": 0.2701107011070111, "grad_norm": 0.5163888931274414, "learning_rate": 9.144107137382484e-05, "loss": 2.979, "step": 3660 }, { "epoch": 0.27084870848708487, "grad_norm": 0.5174587965011597, "learning_rate": 9.136886495477475e-05, "loss": 3.0661, "step": 3670 }, { "epoch": 0.27158671586715866, "grad_norm": 0.5590752363204956, "learning_rate": 9.129638399327706e-05, "loss": 3.0624, "step": 3680 }, { "epoch": 0.27232472324723245, "grad_norm": 0.48960742354393005, "learning_rate": 9.122362897034817e-05, "loss": 3.0344, "step": 3690 }, { "epoch": 0.2730627306273063, "grad_norm": 0.5071660876274109, "learning_rate": 9.115060036882318e-05, "loss": 3.0374, "step": 3700 }, { "epoch": 0.2738007380073801, "grad_norm": 0.5058993697166443, "learning_rate": 9.107729867335288e-05, "loss": 3.0823, "step": 3710 }, { "epoch": 0.2745387453874539, "grad_norm": 0.5252380967140198, "learning_rate": 9.100372437040034e-05, "loss": 3.0558, "step": 3720 }, { "epoch": 0.27527675276752767, "grad_norm": 0.49785932898521423, "learning_rate": 9.092987794823786e-05, "loss": 3.0836, "step": 3730 }, { "epoch": 0.27601476014760146, "grad_norm": 0.5140420794487, "learning_rate": 9.085575989694357e-05, "loss": 3.1079, "step": 3740 }, { "epoch": 0.2767527675276753, "grad_norm": 0.5329453945159912, "learning_rate": 9.078137070839832e-05, "loss": 3.0775, "step": 3750 }, { "epoch": 0.2774907749077491, "grad_norm": 0.4971647560596466, "learning_rate": 9.070671087628229e-05, "loss": 3.0756, "step": 3760 }, { "epoch": 0.2782287822878229, "grad_norm": 0.5552874803543091, "learning_rate": 9.063178089607183e-05, "loss": 3.0615, "step": 3770 }, { "epoch": 0.2789667896678967, "grad_norm": 0.525969922542572, "learning_rate": 9.055658126503605e-05, "loss": 3.0594, "step": 3780 }, { "epoch": 0.27970479704797047, "grad_norm": 0.5235247611999512, "learning_rate": 9.048111248223368e-05, "loss": 3.097, "step": 3790 }, { "epoch": 0.28044280442804426, "grad_norm": 0.5573784112930298, "learning_rate": 9.040537504850954e-05, "loss": 3.0303, "step": 3800 }, { "epoch": 0.2811808118081181, "grad_norm": 0.5464443564414978, "learning_rate": 9.032936946649144e-05, "loss": 3.063, "step": 3810 }, { "epoch": 0.2819188191881919, "grad_norm": 0.5378391146659851, "learning_rate": 9.02530962405867e-05, "loss": 3.0853, "step": 3820 }, { "epoch": 0.2826568265682657, "grad_norm": 0.5274621844291687, "learning_rate": 9.017655587697885e-05, "loss": 3.1374, "step": 3830 }, { "epoch": 0.2833948339483395, "grad_norm": 0.5044965744018555, "learning_rate": 9.009974888362424e-05, "loss": 3.064, "step": 3840 }, { "epoch": 0.28413284132841327, "grad_norm": 0.5318046808242798, "learning_rate": 9.002267577024876e-05, "loss": 3.0662, "step": 3850 }, { "epoch": 0.2848708487084871, "grad_norm": 0.5438222289085388, "learning_rate": 8.994533704834435e-05, "loss": 3.0999, "step": 3860 }, { "epoch": 0.2856088560885609, "grad_norm": 0.5226894021034241, "learning_rate": 8.986773323116563e-05, "loss": 3.0496, "step": 3870 }, { "epoch": 0.2863468634686347, "grad_norm": 1.9248789548873901, "learning_rate": 8.978986483372655e-05, "loss": 3.0549, "step": 3880 }, { "epoch": 0.2870848708487085, "grad_norm": 0.49465620517730713, "learning_rate": 8.971173237279692e-05, "loss": 3.085, "step": 3890 }, { "epoch": 0.2878228782287823, "grad_norm": 0.5317748785018921, "learning_rate": 8.963333636689898e-05, "loss": 3.0659, "step": 3900 }, { "epoch": 0.28856088560885607, "grad_norm": 0.5400087833404541, "learning_rate": 8.9554677336304e-05, "loss": 3.0963, "step": 3910 }, { "epoch": 0.2892988929889299, "grad_norm": 0.5060845613479614, "learning_rate": 8.947575580302878e-05, "loss": 3.0503, "step": 3920 }, { "epoch": 0.2900369003690037, "grad_norm": 0.5168414115905762, "learning_rate": 8.939657229083222e-05, "loss": 3.1322, "step": 3930 }, { "epoch": 0.2907749077490775, "grad_norm": 0.5268558263778687, "learning_rate": 8.931712732521183e-05, "loss": 3.0947, "step": 3940 }, { "epoch": 0.2915129151291513, "grad_norm": 0.5113683938980103, "learning_rate": 8.92374214334002e-05, "loss": 3.0379, "step": 3950 }, { "epoch": 0.2922509225092251, "grad_norm": 0.5602664947509766, "learning_rate": 8.915745514436161e-05, "loss": 3.0636, "step": 3960 }, { "epoch": 0.29298892988929887, "grad_norm": 0.507926344871521, "learning_rate": 8.907722898878844e-05, "loss": 3.0737, "step": 3970 }, { "epoch": 0.2937269372693727, "grad_norm": 0.5805441737174988, "learning_rate": 8.899674349909759e-05, "loss": 3.0743, "step": 3980 }, { "epoch": 0.2944649446494465, "grad_norm": 0.5141892433166504, "learning_rate": 8.891599920942713e-05, "loss": 3.0711, "step": 3990 }, { "epoch": 0.2952029520295203, "grad_norm": 0.5769287347793579, "learning_rate": 8.883499665563253e-05, "loss": 3.0302, "step": 4000 }, { "epoch": 0.2959409594095941, "grad_norm": 0.5248669981956482, "learning_rate": 8.875373637528335e-05, "loss": 3.0871, "step": 4010 }, { "epoch": 0.2966789667896679, "grad_norm": 0.5001204609870911, "learning_rate": 8.867221890765938e-05, "loss": 3.0342, "step": 4020 }, { "epoch": 0.2974169741697417, "grad_norm": 0.5176003575325012, "learning_rate": 8.859044479374736e-05, "loss": 3.1404, "step": 4030 }, { "epoch": 0.2981549815498155, "grad_norm": 0.5125160217285156, "learning_rate": 8.850841457623719e-05, "loss": 3.0399, "step": 4040 }, { "epoch": 0.2988929889298893, "grad_norm": 0.49271440505981445, "learning_rate": 8.842612879951837e-05, "loss": 3.0082, "step": 4050 }, { "epoch": 0.2996309963099631, "grad_norm": 0.5456764698028564, "learning_rate": 8.834358800967645e-05, "loss": 3.0537, "step": 4060 }, { "epoch": 0.3003690036900369, "grad_norm": 0.5039022564888, "learning_rate": 8.826079275448933e-05, "loss": 3.0508, "step": 4070 }, { "epoch": 0.3011070110701107, "grad_norm": 0.48597994446754456, "learning_rate": 8.817774358342367e-05, "loss": 3.0806, "step": 4080 }, { "epoch": 0.3018450184501845, "grad_norm": 0.5243167877197266, "learning_rate": 8.809444104763122e-05, "loss": 3.1176, "step": 4090 }, { "epoch": 0.3025830258302583, "grad_norm": 0.5244473218917847, "learning_rate": 8.801088569994522e-05, "loss": 3.0985, "step": 4100 }, { "epoch": 0.3033210332103321, "grad_norm": 0.4856514632701874, "learning_rate": 8.792707809487661e-05, "loss": 3.0546, "step": 4110 }, { "epoch": 0.3040590405904059, "grad_norm": 0.48701879382133484, "learning_rate": 8.784301878861047e-05, "loss": 3.083, "step": 4120 }, { "epoch": 0.3047970479704797, "grad_norm": 0.5364317297935486, "learning_rate": 8.775870833900226e-05, "loss": 3.0672, "step": 4130 }, { "epoch": 0.30553505535055353, "grad_norm": 0.5016632676124573, "learning_rate": 8.767414730557418e-05, "loss": 2.9692, "step": 4140 }, { "epoch": 0.3062730627306273, "grad_norm": 0.5020787715911865, "learning_rate": 8.758933624951135e-05, "loss": 3.0618, "step": 4150 }, { "epoch": 0.3070110701107011, "grad_norm": 0.5041311383247375, "learning_rate": 8.750427573365824e-05, "loss": 3.0193, "step": 4160 }, { "epoch": 0.3077490774907749, "grad_norm": 0.5102233290672302, "learning_rate": 8.741896632251476e-05, "loss": 3.0837, "step": 4170 }, { "epoch": 0.3084870848708487, "grad_norm": 0.5173757672309875, "learning_rate": 8.733340858223268e-05, "loss": 2.9969, "step": 4180 }, { "epoch": 0.3092250922509225, "grad_norm": 0.47782695293426514, "learning_rate": 8.724760308061172e-05, "loss": 2.9934, "step": 4190 }, { "epoch": 0.30996309963099633, "grad_norm": 0.4984055161476135, "learning_rate": 8.71615503870959e-05, "loss": 3.0055, "step": 4200 }, { "epoch": 0.3107011070110701, "grad_norm": 0.535744845867157, "learning_rate": 8.707525107276971e-05, "loss": 3.1124, "step": 4210 }, { "epoch": 0.3114391143911439, "grad_norm": 0.5163019895553589, "learning_rate": 8.698870571035435e-05, "loss": 3.0904, "step": 4220 }, { "epoch": 0.3121771217712177, "grad_norm": 0.5297439694404602, "learning_rate": 8.690191487420385e-05, "loss": 3.039, "step": 4230 }, { "epoch": 0.3129151291512915, "grad_norm": 0.5315809845924377, "learning_rate": 8.681487914030137e-05, "loss": 3.1418, "step": 4240 }, { "epoch": 0.31365313653136534, "grad_norm": 0.5038068890571594, "learning_rate": 8.672759908625528e-05, "loss": 3.105, "step": 4250 }, { "epoch": 0.31439114391143913, "grad_norm": 0.5104600787162781, "learning_rate": 8.664007529129539e-05, "loss": 3.0253, "step": 4260 }, { "epoch": 0.3151291512915129, "grad_norm": 0.5337395668029785, "learning_rate": 8.655230833626908e-05, "loss": 3.0637, "step": 4270 }, { "epoch": 0.3158671586715867, "grad_norm": 0.5203779935836792, "learning_rate": 8.646429880363746e-05, "loss": 3.0862, "step": 4280 }, { "epoch": 0.3166051660516605, "grad_norm": 0.510831356048584, "learning_rate": 8.637604727747149e-05, "loss": 2.9944, "step": 4290 }, { "epoch": 0.3173431734317343, "grad_norm": 0.5363606214523315, "learning_rate": 8.62875543434481e-05, "loss": 3.1227, "step": 4300 }, { "epoch": 0.31808118081180814, "grad_norm": 0.5156981945037842, "learning_rate": 8.61988205888463e-05, "loss": 3.046, "step": 4310 }, { "epoch": 0.31881918819188193, "grad_norm": 0.530002772808075, "learning_rate": 8.610984660254333e-05, "loss": 3.037, "step": 4320 }, { "epoch": 0.3195571955719557, "grad_norm": 0.5514121651649475, "learning_rate": 8.602063297501068e-05, "loss": 3.0828, "step": 4330 }, { "epoch": 0.3202952029520295, "grad_norm": 0.49961575865745544, "learning_rate": 8.593118029831025e-05, "loss": 3.0404, "step": 4340 }, { "epoch": 0.3210332103321033, "grad_norm": 0.4883437752723694, "learning_rate": 8.584148916609032e-05, "loss": 3.0681, "step": 4350 }, { "epoch": 0.32177121771217715, "grad_norm": 0.5226607918739319, "learning_rate": 8.575156017358171e-05, "loss": 3.0631, "step": 4360 }, { "epoch": 0.32250922509225094, "grad_norm": 0.5821093320846558, "learning_rate": 8.566139391759378e-05, "loss": 3.0793, "step": 4370 }, { "epoch": 0.32324723247232473, "grad_norm": 0.5188676118850708, "learning_rate": 8.557099099651047e-05, "loss": 3.086, "step": 4380 }, { "epoch": 0.3239852398523985, "grad_norm": 0.5117591023445129, "learning_rate": 8.548035201028636e-05, "loss": 3.1174, "step": 4390 }, { "epoch": 0.3247232472324723, "grad_norm": 0.48335784673690796, "learning_rate": 8.538947756044261e-05, "loss": 2.9864, "step": 4400 }, { "epoch": 0.3254612546125461, "grad_norm": 0.5281744599342346, "learning_rate": 8.52983682500631e-05, "loss": 3.0942, "step": 4410 }, { "epoch": 0.32619926199261995, "grad_norm": 0.4935998022556305, "learning_rate": 8.520702468379028e-05, "loss": 3.0716, "step": 4420 }, { "epoch": 0.32693726937269374, "grad_norm": 0.4817652404308319, "learning_rate": 8.511544746782125e-05, "loss": 3.0314, "step": 4430 }, { "epoch": 0.32767527675276753, "grad_norm": 0.49610570073127747, "learning_rate": 8.502363720990374e-05, "loss": 2.9699, "step": 4440 }, { "epoch": 0.3284132841328413, "grad_norm": 0.5101500749588013, "learning_rate": 8.493159451933203e-05, "loss": 2.9248, "step": 4450 }, { "epoch": 0.3291512915129151, "grad_norm": 0.48433801531791687, "learning_rate": 8.483932000694295e-05, "loss": 3.0812, "step": 4460 }, { "epoch": 0.3298892988929889, "grad_norm": 0.4775218665599823, "learning_rate": 8.474681428511177e-05, "loss": 2.986, "step": 4470 }, { "epoch": 0.33062730627306275, "grad_norm": 0.49710339307785034, "learning_rate": 8.465407796774816e-05, "loss": 3.0331, "step": 4480 }, { "epoch": 0.33136531365313654, "grad_norm": 0.5008261799812317, "learning_rate": 8.456111167029219e-05, "loss": 3.0763, "step": 4490 }, { "epoch": 0.33210332103321033, "grad_norm": 0.5350390672683716, "learning_rate": 8.446791600971012e-05, "loss": 3.0238, "step": 4500 }, { "epoch": 0.3328413284132841, "grad_norm": 0.5100720524787903, "learning_rate": 8.43744916044904e-05, "loss": 3.1137, "step": 4510 }, { "epoch": 0.3335793357933579, "grad_norm": 0.5103323459625244, "learning_rate": 8.428083907463951e-05, "loss": 3.0862, "step": 4520 }, { "epoch": 0.33431734317343176, "grad_norm": 0.563750147819519, "learning_rate": 8.418695904167788e-05, "loss": 3.0551, "step": 4530 }, { "epoch": 0.33505535055350555, "grad_norm": 0.4909681975841522, "learning_rate": 8.40928521286358e-05, "loss": 2.9769, "step": 4540 }, { "epoch": 0.33579335793357934, "grad_norm": 0.5330002903938293, "learning_rate": 8.399851896004913e-05, "loss": 3.046, "step": 4550 }, { "epoch": 0.33653136531365313, "grad_norm": 0.49845483899116516, "learning_rate": 8.390396016195537e-05, "loss": 3.0318, "step": 4560 }, { "epoch": 0.3372693726937269, "grad_norm": 0.4647519290447235, "learning_rate": 8.380917636188934e-05, "loss": 3.0097, "step": 4570 }, { "epoch": 0.3380073800738007, "grad_norm": 0.4947097599506378, "learning_rate": 8.371416818887908e-05, "loss": 3.0244, "step": 4580 }, { "epoch": 0.33874538745387456, "grad_norm": 0.514033854007721, "learning_rate": 8.361893627344168e-05, "loss": 3.0259, "step": 4590 }, { "epoch": 0.33948339483394835, "grad_norm": 0.5403528213500977, "learning_rate": 8.35234812475791e-05, "loss": 3.0071, "step": 4600 }, { "epoch": 0.34022140221402214, "grad_norm": 0.495109498500824, "learning_rate": 8.342780374477396e-05, "loss": 3.058, "step": 4610 }, { "epoch": 0.34095940959409593, "grad_norm": 0.48301902413368225, "learning_rate": 8.33319043999853e-05, "loss": 3.0686, "step": 4620 }, { "epoch": 0.3416974169741697, "grad_norm": 0.4977583885192871, "learning_rate": 8.323578384964444e-05, "loss": 2.9218, "step": 4630 }, { "epoch": 0.34243542435424357, "grad_norm": 0.4929274022579193, "learning_rate": 8.313944273165069e-05, "loss": 3.0489, "step": 4640 }, { "epoch": 0.34317343173431736, "grad_norm": 0.5092618465423584, "learning_rate": 8.304288168536718e-05, "loss": 2.9915, "step": 4650 }, { "epoch": 0.34391143911439115, "grad_norm": 0.48645535111427307, "learning_rate": 8.294610135161658e-05, "loss": 2.9596, "step": 4660 }, { "epoch": 0.34464944649446494, "grad_norm": 0.5053686499595642, "learning_rate": 8.284910237267682e-05, "loss": 3.0022, "step": 4670 }, { "epoch": 0.34538745387453873, "grad_norm": 0.5074572563171387, "learning_rate": 8.275188539227686e-05, "loss": 3.0701, "step": 4680 }, { "epoch": 0.3461254612546125, "grad_norm": 0.5153145790100098, "learning_rate": 8.265445105559247e-05, "loss": 2.9951, "step": 4690 }, { "epoch": 0.34686346863468637, "grad_norm": 0.5247951745986938, "learning_rate": 8.255680000924184e-05, "loss": 3.0631, "step": 4700 }, { "epoch": 0.34760147601476016, "grad_norm": 0.4750431180000305, "learning_rate": 8.245893290128136e-05, "loss": 3.0917, "step": 4710 }, { "epoch": 0.34833948339483395, "grad_norm": 0.4787590503692627, "learning_rate": 8.236085038120129e-05, "loss": 3.0494, "step": 4720 }, { "epoch": 0.34907749077490774, "grad_norm": 0.49496400356292725, "learning_rate": 8.22625530999215e-05, "loss": 3.0276, "step": 4730 }, { "epoch": 0.34981549815498153, "grad_norm": 0.517461359500885, "learning_rate": 8.216404170978707e-05, "loss": 2.9682, "step": 4740 }, { "epoch": 0.3505535055350554, "grad_norm": 0.4839133024215698, "learning_rate": 8.206531686456403e-05, "loss": 3.0396, "step": 4750 }, { "epoch": 0.35129151291512917, "grad_norm": 0.5224480628967285, "learning_rate": 8.196637921943496e-05, "loss": 3.048, "step": 4760 }, { "epoch": 0.35202952029520296, "grad_norm": 0.5209102034568787, "learning_rate": 8.186722943099472e-05, "loss": 3.0128, "step": 4770 }, { "epoch": 0.35276752767527675, "grad_norm": 0.480421781539917, "learning_rate": 8.176786815724601e-05, "loss": 3.0139, "step": 4780 }, { "epoch": 0.35350553505535054, "grad_norm": 0.4676721692085266, "learning_rate": 8.166829605759507e-05, "loss": 2.8988, "step": 4790 }, { "epoch": 0.35424354243542433, "grad_norm": 0.5178680419921875, "learning_rate": 8.156851379284729e-05, "loss": 3.0074, "step": 4800 }, { "epoch": 0.3549815498154982, "grad_norm": 0.5426033735275269, "learning_rate": 8.146852202520277e-05, "loss": 2.9998, "step": 4810 }, { "epoch": 0.35571955719557197, "grad_norm": 0.4766799807548523, "learning_rate": 8.136832141825196e-05, "loss": 3.0129, "step": 4820 }, { "epoch": 0.35645756457564576, "grad_norm": 0.49461451172828674, "learning_rate": 8.12679126369713e-05, "loss": 3.0726, "step": 4830 }, { "epoch": 0.35719557195571955, "grad_norm": 0.4843361973762512, "learning_rate": 8.116729634771876e-05, "loss": 2.9953, "step": 4840 }, { "epoch": 0.35793357933579334, "grad_norm": 0.5127764344215393, "learning_rate": 8.106647321822943e-05, "loss": 3.0525, "step": 4850 }, { "epoch": 0.3586715867158672, "grad_norm": 0.4938580393791199, "learning_rate": 8.096544391761103e-05, "loss": 2.975, "step": 4860 }, { "epoch": 0.359409594095941, "grad_norm": 0.4944118559360504, "learning_rate": 8.08642091163396e-05, "loss": 3.0102, "step": 4870 }, { "epoch": 0.36014760147601477, "grad_norm": 0.4949988126754761, "learning_rate": 8.076276948625494e-05, "loss": 2.9756, "step": 4880 }, { "epoch": 0.36088560885608856, "grad_norm": 0.5549206733703613, "learning_rate": 8.066112570055621e-05, "loss": 3.0896, "step": 4890 }, { "epoch": 0.36162361623616235, "grad_norm": 0.4933255910873413, "learning_rate": 8.055927843379738e-05, "loss": 3.036, "step": 4900 }, { "epoch": 0.36236162361623614, "grad_norm": 0.5120234489440918, "learning_rate": 8.04572283618829e-05, "loss": 3.0661, "step": 4910 }, { "epoch": 0.36309963099631, "grad_norm": 0.47579410672187805, "learning_rate": 8.035497616206302e-05, "loss": 2.9517, "step": 4920 }, { "epoch": 0.3638376383763838, "grad_norm": 0.47006312012672424, "learning_rate": 8.025252251292949e-05, "loss": 2.9931, "step": 4930 }, { "epoch": 0.36457564575645757, "grad_norm": 0.498418927192688, "learning_rate": 8.014986809441094e-05, "loss": 2.9749, "step": 4940 }, { "epoch": 0.36531365313653136, "grad_norm": 0.4772182106971741, "learning_rate": 8.00470135877684e-05, "loss": 2.9708, "step": 4950 }, { "epoch": 0.36605166051660515, "grad_norm": 0.47467556595802307, "learning_rate": 7.994395967559076e-05, "loss": 2.9898, "step": 4960 }, { "epoch": 0.36678966789667894, "grad_norm": 0.509661078453064, "learning_rate": 7.984070704179026e-05, "loss": 3.0238, "step": 4970 }, { "epoch": 0.3675276752767528, "grad_norm": 0.47225892543792725, "learning_rate": 7.973725637159794e-05, "loss": 3.0066, "step": 4980 }, { "epoch": 0.3682656826568266, "grad_norm": 0.5211546421051025, "learning_rate": 7.963360835155915e-05, "loss": 3.0896, "step": 4990 }, { "epoch": 0.36900369003690037, "grad_norm": 0.4817075729370117, "learning_rate": 7.952976366952888e-05, "loss": 3.0348, "step": 5000 }, { "epoch": 0.36974169741697416, "grad_norm": 0.4747537672519684, "learning_rate": 7.942572301466727e-05, "loss": 3.0146, "step": 5010 }, { "epoch": 0.37047970479704795, "grad_norm": 0.5026445984840393, "learning_rate": 7.932148707743503e-05, "loss": 2.9681, "step": 5020 }, { "epoch": 0.3712177121771218, "grad_norm": 0.47187340259552, "learning_rate": 7.921705654958886e-05, "loss": 3.0161, "step": 5030 }, { "epoch": 0.3719557195571956, "grad_norm": 0.5039234161376953, "learning_rate": 7.911243212417687e-05, "loss": 3.0002, "step": 5040 }, { "epoch": 0.3726937269372694, "grad_norm": 0.481448233127594, "learning_rate": 7.900761449553394e-05, "loss": 2.9907, "step": 5050 }, { "epoch": 0.37343173431734317, "grad_norm": 0.4844491481781006, "learning_rate": 7.890260435927708e-05, "loss": 3.0501, "step": 5060 }, { "epoch": 0.37416974169741696, "grad_norm": 0.502325177192688, "learning_rate": 7.879740241230098e-05, "loss": 2.9843, "step": 5070 }, { "epoch": 0.37490774907749075, "grad_norm": 0.49289822578430176, "learning_rate": 7.869200935277317e-05, "loss": 2.9808, "step": 5080 }, { "epoch": 0.3756457564575646, "grad_norm": 0.4960924983024597, "learning_rate": 7.858642588012957e-05, "loss": 3.0367, "step": 5090 }, { "epoch": 0.3763837638376384, "grad_norm": 0.4961390495300293, "learning_rate": 7.848065269506968e-05, "loss": 3.0371, "step": 5100 }, { "epoch": 0.3771217712177122, "grad_norm": 0.5095449090003967, "learning_rate": 7.837469049955211e-05, "loss": 2.9584, "step": 5110 }, { "epoch": 0.37785977859778597, "grad_norm": 0.5364798307418823, "learning_rate": 7.826853999678979e-05, "loss": 3.0194, "step": 5120 }, { "epoch": 0.37859778597785976, "grad_norm": 0.47735193371772766, "learning_rate": 7.816220189124526e-05, "loss": 2.9603, "step": 5130 }, { "epoch": 0.3793357933579336, "grad_norm": 0.47760894894599915, "learning_rate": 7.805567688862626e-05, "loss": 3.0335, "step": 5140 }, { "epoch": 0.3800738007380074, "grad_norm": 0.4874935448169708, "learning_rate": 7.794896569588066e-05, "loss": 3.0274, "step": 5150 }, { "epoch": 0.3808118081180812, "grad_norm": 0.48565617203712463, "learning_rate": 7.784206902119213e-05, "loss": 3.0081, "step": 5160 }, { "epoch": 0.381549815498155, "grad_norm": 0.513862133026123, "learning_rate": 7.773498757397522e-05, "loss": 2.9605, "step": 5170 }, { "epoch": 0.38228782287822877, "grad_norm": 0.4750123918056488, "learning_rate": 7.762772206487066e-05, "loss": 3.0109, "step": 5180 }, { "epoch": 0.38302583025830256, "grad_norm": 0.4761565327644348, "learning_rate": 7.75202732057408e-05, "loss": 3.0137, "step": 5190 }, { "epoch": 0.3837638376383764, "grad_norm": 0.5001286864280701, "learning_rate": 7.741264170966472e-05, "loss": 3.0493, "step": 5200 }, { "epoch": 0.3845018450184502, "grad_norm": 0.48891499638557434, "learning_rate": 7.730482829093358e-05, "loss": 3.0333, "step": 5210 }, { "epoch": 0.385239852398524, "grad_norm": 0.4714498221874237, "learning_rate": 7.719683366504586e-05, "loss": 2.9868, "step": 5220 }, { "epoch": 0.3859778597785978, "grad_norm": 0.4761471748352051, "learning_rate": 7.708865854870258e-05, "loss": 3.0351, "step": 5230 }, { "epoch": 0.38671586715867157, "grad_norm": 0.47278621792793274, "learning_rate": 7.698030365980265e-05, "loss": 3.0056, "step": 5240 }, { "epoch": 0.3874538745387454, "grad_norm": 0.502041220664978, "learning_rate": 7.687176971743796e-05, "loss": 3.013, "step": 5250 }, { "epoch": 0.3881918819188192, "grad_norm": 0.4808847904205322, "learning_rate": 7.676305744188871e-05, "loss": 3.0363, "step": 5260 }, { "epoch": 0.388929889298893, "grad_norm": 0.4782809615135193, "learning_rate": 7.665416755461859e-05, "loss": 2.9693, "step": 5270 }, { "epoch": 0.3896678966789668, "grad_norm": 0.4984862804412842, "learning_rate": 7.654510077827003e-05, "loss": 2.9882, "step": 5280 }, { "epoch": 0.3904059040590406, "grad_norm": 0.48033297061920166, "learning_rate": 7.643585783665931e-05, "loss": 2.9822, "step": 5290 }, { "epoch": 0.39114391143911437, "grad_norm": 0.5328406691551208, "learning_rate": 7.632643945477193e-05, "loss": 2.9835, "step": 5300 }, { "epoch": 0.3918819188191882, "grad_norm": 0.4741387963294983, "learning_rate": 7.621684635875756e-05, "loss": 3.0095, "step": 5310 }, { "epoch": 0.392619926199262, "grad_norm": 0.8941669464111328, "learning_rate": 7.610707927592549e-05, "loss": 2.9642, "step": 5320 }, { "epoch": 0.3933579335793358, "grad_norm": 0.501148521900177, "learning_rate": 7.59971389347395e-05, "loss": 2.9973, "step": 5330 }, { "epoch": 0.3940959409594096, "grad_norm": 0.4852311611175537, "learning_rate": 7.588702606481337e-05, "loss": 3.019, "step": 5340 }, { "epoch": 0.3948339483394834, "grad_norm": 0.44878798723220825, "learning_rate": 7.577674139690572e-05, "loss": 2.9582, "step": 5350 }, { "epoch": 0.3955719557195572, "grad_norm": 0.4837028384208679, "learning_rate": 7.566628566291536e-05, "loss": 2.9865, "step": 5360 }, { "epoch": 0.396309963099631, "grad_norm": 0.5781135559082031, "learning_rate": 7.555565959587638e-05, "loss": 2.9709, "step": 5370 }, { "epoch": 0.3970479704797048, "grad_norm": 0.4646313786506653, "learning_rate": 7.544486392995324e-05, "loss": 3.0123, "step": 5380 }, { "epoch": 0.3977859778597786, "grad_norm": 0.45897990465164185, "learning_rate": 7.533389940043598e-05, "loss": 2.9744, "step": 5390 }, { "epoch": 0.3985239852398524, "grad_norm": 0.47609013319015503, "learning_rate": 7.522276674373525e-05, "loss": 2.9654, "step": 5400 }, { "epoch": 0.3992619926199262, "grad_norm": 0.48847806453704834, "learning_rate": 7.51114666973775e-05, "loss": 3.0279, "step": 5410 }, { "epoch": 0.4, "grad_norm": 0.5017388463020325, "learning_rate": 7.500000000000001e-05, "loss": 2.9632, "step": 5420 }, { "epoch": 0.4007380073800738, "grad_norm": 0.49840694665908813, "learning_rate": 7.488836739134608e-05, "loss": 3.0054, "step": 5430 }, { "epoch": 0.4014760147601476, "grad_norm": 0.48498594760894775, "learning_rate": 7.477656961226007e-05, "loss": 2.9744, "step": 5440 }, { "epoch": 0.4022140221402214, "grad_norm": 0.49641212821006775, "learning_rate": 7.466460740468245e-05, "loss": 3.0054, "step": 5450 }, { "epoch": 0.4029520295202952, "grad_norm": 0.47951868176460266, "learning_rate": 7.455248151164493e-05, "loss": 2.9506, "step": 5460 }, { "epoch": 0.40369003690036903, "grad_norm": 0.5073153972625732, "learning_rate": 7.444019267726553e-05, "loss": 2.9172, "step": 5470 }, { "epoch": 0.4044280442804428, "grad_norm": 0.48473188281059265, "learning_rate": 7.432774164674359e-05, "loss": 2.9388, "step": 5480 }, { "epoch": 0.4051660516605166, "grad_norm": 0.4775610566139221, "learning_rate": 7.421512916635485e-05, "loss": 3.0088, "step": 5490 }, { "epoch": 0.4059040590405904, "grad_norm": 0.5261042714118958, "learning_rate": 7.410235598344657e-05, "loss": 2.9721, "step": 5500 }, { "epoch": 0.4066420664206642, "grad_norm": 0.45107316970825195, "learning_rate": 7.398942284643241e-05, "loss": 2.9521, "step": 5510 }, { "epoch": 0.407380073800738, "grad_norm": 0.46772444248199463, "learning_rate": 7.387633050478766e-05, "loss": 2.9259, "step": 5520 }, { "epoch": 0.40811808118081183, "grad_norm": 0.4604153633117676, "learning_rate": 7.376307970904408e-05, "loss": 3.082, "step": 5530 }, { "epoch": 0.4088560885608856, "grad_norm": 0.47096291184425354, "learning_rate": 7.364967121078502e-05, "loss": 2.9186, "step": 5540 }, { "epoch": 0.4095940959409594, "grad_norm": 0.4761073589324951, "learning_rate": 7.353610576264045e-05, "loss": 3.028, "step": 5550 }, { "epoch": 0.4103321033210332, "grad_norm": 0.5043940544128418, "learning_rate": 7.34223841182819e-05, "loss": 2.9259, "step": 5560 }, { "epoch": 0.411070110701107, "grad_norm": 0.48511525988578796, "learning_rate": 7.33085070324175e-05, "loss": 2.9453, "step": 5570 }, { "epoch": 0.4118081180811808, "grad_norm": 0.4717444181442261, "learning_rate": 7.319447526078696e-05, "loss": 3.0091, "step": 5580 }, { "epoch": 0.41254612546125463, "grad_norm": 0.44939619302749634, "learning_rate": 7.308028956015653e-05, "loss": 2.9809, "step": 5590 }, { "epoch": 0.4132841328413284, "grad_norm": 0.46631982922554016, "learning_rate": 7.296595068831406e-05, "loss": 2.9969, "step": 5600 }, { "epoch": 0.4140221402214022, "grad_norm": 0.4884931743144989, "learning_rate": 7.285145940406386e-05, "loss": 2.9521, "step": 5610 }, { "epoch": 0.414760147601476, "grad_norm": 0.4892655611038208, "learning_rate": 7.273681646722173e-05, "loss": 2.9666, "step": 5620 }, { "epoch": 0.4154981549815498, "grad_norm": 0.4869326651096344, "learning_rate": 7.262202263860988e-05, "loss": 2.9618, "step": 5630 }, { "epoch": 0.41623616236162364, "grad_norm": 0.48076122999191284, "learning_rate": 7.2507078680052e-05, "loss": 2.9113, "step": 5640 }, { "epoch": 0.41697416974169743, "grad_norm": 0.46369293332099915, "learning_rate": 7.239198535436801e-05, "loss": 2.9309, "step": 5650 }, { "epoch": 0.4177121771217712, "grad_norm": 0.49062806367874146, "learning_rate": 7.227674342536913e-05, "loss": 3.0057, "step": 5660 }, { "epoch": 0.418450184501845, "grad_norm": 0.4727836847305298, "learning_rate": 7.216135365785279e-05, "loss": 3.0034, "step": 5670 }, { "epoch": 0.4191881918819188, "grad_norm": 0.5185651779174805, "learning_rate": 7.20458168175975e-05, "loss": 2.9296, "step": 5680 }, { "epoch": 0.4199261992619926, "grad_norm": 0.4758572280406952, "learning_rate": 7.193013367135792e-05, "loss": 2.9805, "step": 5690 }, { "epoch": 0.42066420664206644, "grad_norm": 0.507834255695343, "learning_rate": 7.181430498685954e-05, "loss": 2.9829, "step": 5700 }, { "epoch": 0.42140221402214023, "grad_norm": 0.48527729511260986, "learning_rate": 7.169833153279375e-05, "loss": 2.9951, "step": 5710 }, { "epoch": 0.422140221402214, "grad_norm": 0.5018925070762634, "learning_rate": 7.158221407881272e-05, "loss": 3.0251, "step": 5720 }, { "epoch": 0.4228782287822878, "grad_norm": 0.5182327032089233, "learning_rate": 7.146595339552422e-05, "loss": 2.9954, "step": 5730 }, { "epoch": 0.4236162361623616, "grad_norm": 0.5015000104904175, "learning_rate": 7.134955025448663e-05, "loss": 2.9285, "step": 5740 }, { "epoch": 0.42435424354243545, "grad_norm": 0.47007137537002563, "learning_rate": 7.123300542820366e-05, "loss": 2.923, "step": 5750 }, { "epoch": 0.42509225092250924, "grad_norm": 0.4987011253833771, "learning_rate": 7.111631969011938e-05, "loss": 2.9555, "step": 5760 }, { "epoch": 0.42583025830258303, "grad_norm": 0.4811478853225708, "learning_rate": 7.099949381461296e-05, "loss": 2.9797, "step": 5770 }, { "epoch": 0.4265682656826568, "grad_norm": 0.4753568470478058, "learning_rate": 7.08825285769936e-05, "loss": 2.9137, "step": 5780 }, { "epoch": 0.4273062730627306, "grad_norm": 0.46175628900527954, "learning_rate": 7.076542475349537e-05, "loss": 2.9291, "step": 5790 }, { "epoch": 0.4280442804428044, "grad_norm": 0.5033062696456909, "learning_rate": 7.06481831212721e-05, "loss": 2.9927, "step": 5800 }, { "epoch": 0.42878228782287825, "grad_norm": 0.4942483603954315, "learning_rate": 7.05308044583921e-05, "loss": 2.8999, "step": 5810 }, { "epoch": 0.42952029520295204, "grad_norm": 0.46212270855903625, "learning_rate": 7.041328954383316e-05, "loss": 2.9618, "step": 5820 }, { "epoch": 0.43025830258302583, "grad_norm": 0.4895878732204437, "learning_rate": 7.029563915747722e-05, "loss": 3.0415, "step": 5830 }, { "epoch": 0.4309963099630996, "grad_norm": 0.48732495307922363, "learning_rate": 7.017785408010533e-05, "loss": 2.9275, "step": 5840 }, { "epoch": 0.4317343173431734, "grad_norm": 0.49087876081466675, "learning_rate": 7.005993509339241e-05, "loss": 2.981, "step": 5850 }, { "epoch": 0.43247232472324726, "grad_norm": 0.5266060829162598, "learning_rate": 6.9941882979902e-05, "loss": 2.8859, "step": 5860 }, { "epoch": 0.43321033210332105, "grad_norm": 0.45862722396850586, "learning_rate": 6.982369852308124e-05, "loss": 2.9225, "step": 5870 }, { "epoch": 0.43394833948339484, "grad_norm": 0.5097654461860657, "learning_rate": 6.97053825072554e-05, "loss": 2.9179, "step": 5880 }, { "epoch": 0.43468634686346863, "grad_norm": 0.5156700611114502, "learning_rate": 6.958693571762301e-05, "loss": 3.0092, "step": 5890 }, { "epoch": 0.4354243542435424, "grad_norm": 0.4698309898376465, "learning_rate": 6.946835894025037e-05, "loss": 2.8776, "step": 5900 }, { "epoch": 0.4361623616236162, "grad_norm": 0.4787076711654663, "learning_rate": 6.934965296206645e-05, "loss": 2.9759, "step": 5910 }, { "epoch": 0.43690036900369006, "grad_norm": 0.4753543734550476, "learning_rate": 6.923081857085766e-05, "loss": 3.0012, "step": 5920 }, { "epoch": 0.43763837638376385, "grad_norm": 0.4781608283519745, "learning_rate": 6.911185655526263e-05, "loss": 2.9636, "step": 5930 }, { "epoch": 0.43837638376383764, "grad_norm": 0.46679866313934326, "learning_rate": 6.899276770476695e-05, "loss": 2.9666, "step": 5940 }, { "epoch": 0.43911439114391143, "grad_norm": 0.4817095100879669, "learning_rate": 6.887355280969796e-05, "loss": 2.9268, "step": 5950 }, { "epoch": 0.4398523985239852, "grad_norm": 0.46391561627388, "learning_rate": 6.875421266121946e-05, "loss": 2.9796, "step": 5960 }, { "epoch": 0.44059040590405907, "grad_norm": 0.4704035222530365, "learning_rate": 6.86347480513265e-05, "loss": 2.93, "step": 5970 }, { "epoch": 0.44132841328413286, "grad_norm": 0.5005739331245422, "learning_rate": 6.851515977284013e-05, "loss": 2.9329, "step": 5980 }, { "epoch": 0.44206642066420665, "grad_norm": 0.5069407224655151, "learning_rate": 6.839544861940214e-05, "loss": 3.0269, "step": 5990 }, { "epoch": 0.44280442804428044, "grad_norm": 0.4672479033470154, "learning_rate": 6.827561538546967e-05, "loss": 2.9522, "step": 6000 }, { "epoch": 0.44354243542435423, "grad_norm": 0.4877452850341797, "learning_rate": 6.815566086631016e-05, "loss": 2.9381, "step": 6010 }, { "epoch": 0.444280442804428, "grad_norm": 0.4852764308452606, "learning_rate": 6.80355858579959e-05, "loss": 2.9431, "step": 6020 }, { "epoch": 0.44501845018450187, "grad_norm": 0.4775632321834564, "learning_rate": 6.791539115739879e-05, "loss": 2.9923, "step": 6030 }, { "epoch": 0.44575645756457566, "grad_norm": 0.48804882168769836, "learning_rate": 6.779507756218509e-05, "loss": 3.0321, "step": 6040 }, { "epoch": 0.44649446494464945, "grad_norm": 0.4770827293395996, "learning_rate": 6.76746458708101e-05, "loss": 3.0004, "step": 6050 }, { "epoch": 0.44723247232472324, "grad_norm": 0.47312870621681213, "learning_rate": 6.75540968825128e-05, "loss": 2.9975, "step": 6060 }, { "epoch": 0.44797047970479703, "grad_norm": 0.48013314604759216, "learning_rate": 6.74334313973107e-05, "loss": 2.9666, "step": 6070 }, { "epoch": 0.4487084870848708, "grad_norm": 0.4521431624889374, "learning_rate": 6.731265021599436e-05, "loss": 2.8592, "step": 6080 }, { "epoch": 0.44944649446494467, "grad_norm": 0.4653100073337555, "learning_rate": 6.719175414012219e-05, "loss": 2.9367, "step": 6090 }, { "epoch": 0.45018450184501846, "grad_norm": 0.5198903679847717, "learning_rate": 6.707074397201508e-05, "loss": 3.014, "step": 6100 }, { "epoch": 0.45092250922509225, "grad_norm": 0.4655381441116333, "learning_rate": 6.694962051475107e-05, "loss": 2.9422, "step": 6110 }, { "epoch": 0.45166051660516604, "grad_norm": 0.4614551067352295, "learning_rate": 6.682838457216009e-05, "loss": 2.9474, "step": 6120 }, { "epoch": 0.45239852398523983, "grad_norm": 0.4937768876552582, "learning_rate": 6.67070369488185e-05, "loss": 2.8953, "step": 6130 }, { "epoch": 0.4531365313653137, "grad_norm": 0.4759802222251892, "learning_rate": 6.65855784500439e-05, "loss": 2.9553, "step": 6140 }, { "epoch": 0.45387453874538747, "grad_norm": 0.519924521446228, "learning_rate": 6.646400988188964e-05, "loss": 2.8839, "step": 6150 }, { "epoch": 0.45461254612546126, "grad_norm": 0.46175694465637207, "learning_rate": 6.63423320511396e-05, "loss": 2.9878, "step": 6160 }, { "epoch": 0.45535055350553505, "grad_norm": 0.48847445845603943, "learning_rate": 6.622054576530274e-05, "loss": 2.9601, "step": 6170 }, { "epoch": 0.45608856088560884, "grad_norm": 0.46752119064331055, "learning_rate": 6.609865183260778e-05, "loss": 2.9375, "step": 6180 }, { "epoch": 0.45682656826568263, "grad_norm": 0.48789575695991516, "learning_rate": 6.597665106199783e-05, "loss": 2.9675, "step": 6190 }, { "epoch": 0.4575645756457565, "grad_norm": 0.46002650260925293, "learning_rate": 6.585454426312506e-05, "loss": 2.9194, "step": 6200 }, { "epoch": 0.45830258302583027, "grad_norm": 0.4882054924964905, "learning_rate": 6.573233224634524e-05, "loss": 2.931, "step": 6210 }, { "epoch": 0.45904059040590406, "grad_norm": 0.4962427318096161, "learning_rate": 6.561001582271245e-05, "loss": 2.9639, "step": 6220 }, { "epoch": 0.45977859778597785, "grad_norm": 0.47860512137413025, "learning_rate": 6.548759580397363e-05, "loss": 2.9726, "step": 6230 }, { "epoch": 0.46051660516605164, "grad_norm": 0.4823954701423645, "learning_rate": 6.536507300256327e-05, "loss": 2.9363, "step": 6240 }, { "epoch": 0.4612546125461255, "grad_norm": 0.46530622243881226, "learning_rate": 6.524244823159794e-05, "loss": 2.9696, "step": 6250 }, { "epoch": 0.4619926199261993, "grad_norm": 0.4861395061016083, "learning_rate": 6.511972230487091e-05, "loss": 2.9816, "step": 6260 }, { "epoch": 0.46273062730627307, "grad_norm": 0.47099757194519043, "learning_rate": 6.499689603684682e-05, "loss": 2.8812, "step": 6270 }, { "epoch": 0.46346863468634686, "grad_norm": 0.47105422616004944, "learning_rate": 6.487397024265616e-05, "loss": 2.8715, "step": 6280 }, { "epoch": 0.46420664206642065, "grad_norm": 0.4647127091884613, "learning_rate": 6.475094573808993e-05, "loss": 2.972, "step": 6290 }, { "epoch": 0.46494464944649444, "grad_norm": 0.4713263213634491, "learning_rate": 6.462782333959429e-05, "loss": 2.9297, "step": 6300 }, { "epoch": 0.4656826568265683, "grad_norm": 0.4704754650592804, "learning_rate": 6.450460386426495e-05, "loss": 2.9489, "step": 6310 }, { "epoch": 0.4664206642066421, "grad_norm": 0.49764499068260193, "learning_rate": 6.438128812984199e-05, "loss": 2.8814, "step": 6320 }, { "epoch": 0.46715867158671587, "grad_norm": 0.46612176299095154, "learning_rate": 6.425787695470419e-05, "loss": 2.9663, "step": 6330 }, { "epoch": 0.46789667896678966, "grad_norm": 0.46676209568977356, "learning_rate": 6.41343711578638e-05, "loss": 2.9843, "step": 6340 }, { "epoch": 0.46863468634686345, "grad_norm": 0.45879995822906494, "learning_rate": 6.401077155896099e-05, "loss": 2.8991, "step": 6350 }, { "epoch": 0.4693726937269373, "grad_norm": 0.4595896303653717, "learning_rate": 6.388707897825846e-05, "loss": 2.9603, "step": 6360 }, { "epoch": 0.4701107011070111, "grad_norm": 0.47197359800338745, "learning_rate": 6.376329423663596e-05, "loss": 3.0058, "step": 6370 }, { "epoch": 0.4708487084870849, "grad_norm": 0.4487576186656952, "learning_rate": 6.363941815558484e-05, "loss": 2.9126, "step": 6380 }, { "epoch": 0.47158671586715867, "grad_norm": 0.45560458302497864, "learning_rate": 6.35154515572027e-05, "loss": 2.9979, "step": 6390 }, { "epoch": 0.47232472324723246, "grad_norm": 0.4601997435092926, "learning_rate": 6.339139526418778e-05, "loss": 2.8166, "step": 6400 }, { "epoch": 0.47306273062730625, "grad_norm": 0.48877766728401184, "learning_rate": 6.32672500998336e-05, "loss": 2.8798, "step": 6410 }, { "epoch": 0.4738007380073801, "grad_norm": 0.4835923910140991, "learning_rate": 6.314301688802347e-05, "loss": 2.9273, "step": 6420 }, { "epoch": 0.4745387453874539, "grad_norm": 0.465264230966568, "learning_rate": 6.301869645322498e-05, "loss": 2.9399, "step": 6430 }, { "epoch": 0.4752767527675277, "grad_norm": 0.49252355098724365, "learning_rate": 6.289428962048467e-05, "loss": 2.9608, "step": 6440 }, { "epoch": 0.47601476014760147, "grad_norm": 0.48788875341415405, "learning_rate": 6.276979721542239e-05, "loss": 2.9896, "step": 6450 }, { "epoch": 0.47675276752767526, "grad_norm": 0.4745902121067047, "learning_rate": 6.264522006422586e-05, "loss": 2.9076, "step": 6460 }, { "epoch": 0.4774907749077491, "grad_norm": 0.47580885887145996, "learning_rate": 6.252055899364525e-05, "loss": 2.899, "step": 6470 }, { "epoch": 0.4782287822878229, "grad_norm": 0.47672221064567566, "learning_rate": 6.239581483098766e-05, "loss": 2.9338, "step": 6480 }, { "epoch": 0.4789667896678967, "grad_norm": 0.46901679039001465, "learning_rate": 6.227098840411166e-05, "loss": 2.9081, "step": 6490 }, { "epoch": 0.4797047970479705, "grad_norm": 0.45821747183799744, "learning_rate": 6.214608054142167e-05, "loss": 2.9717, "step": 6500 }, { "epoch": 0.48044280442804427, "grad_norm": 0.457815945148468, "learning_rate": 6.202109207186263e-05, "loss": 2.9594, "step": 6510 }, { "epoch": 0.48118081180811806, "grad_norm": 0.45802658796310425, "learning_rate": 6.189602382491439e-05, "loss": 2.958, "step": 6520 }, { "epoch": 0.4819188191881919, "grad_norm": 0.47702470421791077, "learning_rate": 6.177087663058626e-05, "loss": 2.9481, "step": 6530 }, { "epoch": 0.4826568265682657, "grad_norm": 0.4765585660934448, "learning_rate": 6.164565131941147e-05, "loss": 2.9139, "step": 6540 }, { "epoch": 0.4833948339483395, "grad_norm": 0.49875739216804504, "learning_rate": 6.152034872244166e-05, "loss": 2.9726, "step": 6550 }, { "epoch": 0.4841328413284133, "grad_norm": 0.46083393692970276, "learning_rate": 6.13949696712414e-05, "loss": 2.9462, "step": 6560 }, { "epoch": 0.48487084870848707, "grad_norm": 0.4647446274757385, "learning_rate": 6.126951499788261e-05, "loss": 2.9349, "step": 6570 }, { "epoch": 0.48560885608856086, "grad_norm": 0.4930126667022705, "learning_rate": 6.114398553493908e-05, "loss": 2.9763, "step": 6580 }, { "epoch": 0.4863468634686347, "grad_norm": 0.4873722791671753, "learning_rate": 6.1018382115480985e-05, "loss": 2.9322, "step": 6590 }, { "epoch": 0.4870848708487085, "grad_norm": 0.4486652910709381, "learning_rate": 6.089270557306923e-05, "loss": 2.8796, "step": 6600 }, { "epoch": 0.4878228782287823, "grad_norm": 0.482166588306427, "learning_rate": 6.076695674175007e-05, "loss": 2.9542, "step": 6610 }, { "epoch": 0.4885608856088561, "grad_norm": 0.4913167953491211, "learning_rate": 6.0641136456049454e-05, "loss": 3.0476, "step": 6620 }, { "epoch": 0.48929889298892987, "grad_norm": 0.4978322982788086, "learning_rate": 6.051524555096754e-05, "loss": 2.8936, "step": 6630 }, { "epoch": 0.4900369003690037, "grad_norm": 0.4421325922012329, "learning_rate": 6.038928486197316e-05, "loss": 2.9131, "step": 6640 }, { "epoch": 0.4907749077490775, "grad_norm": 0.4662306308746338, "learning_rate": 6.02632552249983e-05, "loss": 2.8394, "step": 6650 }, { "epoch": 0.4915129151291513, "grad_norm": 0.5267830491065979, "learning_rate": 6.0137157476432424e-05, "loss": 2.8703, "step": 6660 }, { "epoch": 0.4922509225092251, "grad_norm": 0.509088397026062, "learning_rate": 6.001099245311711e-05, "loss": 2.9691, "step": 6670 }, { "epoch": 0.4929889298892989, "grad_norm": 0.46723711490631104, "learning_rate": 5.988476099234033e-05, "loss": 2.9496, "step": 6680 }, { "epoch": 0.49372693726937267, "grad_norm": 0.4566686153411865, "learning_rate": 5.975846393183101e-05, "loss": 2.8571, "step": 6690 }, { "epoch": 0.4944649446494465, "grad_norm": 0.4769027829170227, "learning_rate": 5.963210210975343e-05, "loss": 2.898, "step": 6700 }, { "epoch": 0.4952029520295203, "grad_norm": 0.4787648320198059, "learning_rate": 5.95056763647016e-05, "loss": 2.9649, "step": 6710 }, { "epoch": 0.4959409594095941, "grad_norm": 0.45179930329322815, "learning_rate": 5.9379187535693804e-05, "loss": 2.9201, "step": 6720 }, { "epoch": 0.4966789667896679, "grad_norm": 0.4381027817726135, "learning_rate": 5.925263646216697e-05, "loss": 2.9402, "step": 6730 }, { "epoch": 0.4974169741697417, "grad_norm": 0.49445804953575134, "learning_rate": 5.912602398397111e-05, "loss": 2.9305, "step": 6740 }, { "epoch": 0.4981549815498155, "grad_norm": 0.4826495349407196, "learning_rate": 5.8999350941363726e-05, "loss": 2.9346, "step": 6750 }, { "epoch": 0.4988929889298893, "grad_norm": 0.4974125921726227, "learning_rate": 5.887261817500427e-05, "loss": 2.9743, "step": 6760 }, { "epoch": 0.4996309963099631, "grad_norm": 0.47447288036346436, "learning_rate": 5.874582652594854e-05, "loss": 2.9399, "step": 6770 }, { "epoch": 0.5003690036900369, "grad_norm": 0.48605871200561523, "learning_rate": 5.861897683564312e-05, "loss": 2.9667, "step": 6780 }, { "epoch": 0.5011070110701107, "grad_norm": 0.4562762379646301, "learning_rate": 5.849206994591976e-05, "loss": 2.9355, "step": 6790 }, { "epoch": 0.5018450184501845, "grad_norm": 0.4724028706550598, "learning_rate": 5.8365106698989834e-05, "loss": 2.8938, "step": 6800 }, { "epoch": 0.5025830258302583, "grad_norm": 0.4404136538505554, "learning_rate": 5.82380879374387e-05, "loss": 2.8332, "step": 6810 }, { "epoch": 0.5033210332103321, "grad_norm": 0.4685560464859009, "learning_rate": 5.8111014504220165e-05, "loss": 2.9792, "step": 6820 }, { "epoch": 0.5040590405904058, "grad_norm": 0.47112590074539185, "learning_rate": 5.7983887242650846e-05, "loss": 2.9933, "step": 6830 }, { "epoch": 0.5047970479704798, "grad_norm": 0.46272197365760803, "learning_rate": 5.78567069964046e-05, "loss": 2.9916, "step": 6840 }, { "epoch": 0.5055350553505535, "grad_norm": 0.47110989689826965, "learning_rate": 5.772947460950688e-05, "loss": 2.8869, "step": 6850 }, { "epoch": 0.5062730627306273, "grad_norm": 0.47916916012763977, "learning_rate": 5.760219092632924e-05, "loss": 2.9576, "step": 6860 }, { "epoch": 0.5070110701107011, "grad_norm": 0.47247427701950073, "learning_rate": 5.7474856791583576e-05, "loss": 2.9433, "step": 6870 }, { "epoch": 0.5077490774907749, "grad_norm": 0.4856591820716858, "learning_rate": 5.7347473050316636e-05, "loss": 2.983, "step": 6880 }, { "epoch": 0.5084870848708487, "grad_norm": 0.4498710036277771, "learning_rate": 5.722004054790442e-05, "loss": 2.95, "step": 6890 }, { "epoch": 0.5092250922509225, "grad_norm": 0.4407157003879547, "learning_rate": 5.7092560130046466e-05, "loss": 2.9004, "step": 6900 }, { "epoch": 0.5099630996309963, "grad_norm": 0.4676019847393036, "learning_rate": 5.696503264276035e-05, "loss": 2.8584, "step": 6910 }, { "epoch": 0.5107011070110701, "grad_norm": 0.44521570205688477, "learning_rate": 5.683745893237597e-05, "loss": 2.9214, "step": 6920 }, { "epoch": 0.5114391143911439, "grad_norm": 0.4693831503391266, "learning_rate": 5.670983984553003e-05, "loss": 2.9721, "step": 6930 }, { "epoch": 0.5121771217712177, "grad_norm": 0.43683314323425293, "learning_rate": 5.6582176229160355e-05, "loss": 2.8837, "step": 6940 }, { "epoch": 0.5129151291512916, "grad_norm": 0.4462457299232483, "learning_rate": 5.645446893050029e-05, "loss": 2.8014, "step": 6950 }, { "epoch": 0.5136531365313654, "grad_norm": 0.46673473715782166, "learning_rate": 5.632671879707307e-05, "loss": 2.8542, "step": 6960 }, { "epoch": 0.5143911439114391, "grad_norm": 0.5018209218978882, "learning_rate": 5.619892667668618e-05, "loss": 2.9344, "step": 6970 }, { "epoch": 0.5151291512915129, "grad_norm": 0.4942212700843811, "learning_rate": 5.607109341742579e-05, "loss": 2.9002, "step": 6980 }, { "epoch": 0.5158671586715867, "grad_norm": 0.4789501428604126, "learning_rate": 5.5943219867651086e-05, "loss": 2.8955, "step": 6990 }, { "epoch": 0.5166051660516605, "grad_norm": 0.44573846459388733, "learning_rate": 5.58153068759886e-05, "loss": 2.9184, "step": 7000 }, { "epoch": 0.5173431734317343, "grad_norm": 0.4906388819217682, "learning_rate": 5.568735529132665e-05, "loss": 2.9369, "step": 7010 }, { "epoch": 0.5180811808118081, "grad_norm": 0.44844797253608704, "learning_rate": 5.555936596280966e-05, "loss": 2.9435, "step": 7020 }, { "epoch": 0.5188191881918819, "grad_norm": 0.46517252922058105, "learning_rate": 5.5431339739832545e-05, "loss": 2.9933, "step": 7030 }, { "epoch": 0.5195571955719557, "grad_norm": 0.4549432396888733, "learning_rate": 5.530327747203506e-05, "loss": 2.8739, "step": 7040 }, { "epoch": 0.5202952029520295, "grad_norm": 0.47701096534729004, "learning_rate": 5.51751800092962e-05, "loss": 2.9088, "step": 7050 }, { "epoch": 0.5210332103321034, "grad_norm": 0.489654541015625, "learning_rate": 5.50470482017285e-05, "loss": 2.9574, "step": 7060 }, { "epoch": 0.5217712177121772, "grad_norm": 0.4661862850189209, "learning_rate": 5.491888289967241e-05, "loss": 2.9482, "step": 7070 }, { "epoch": 0.522509225092251, "grad_norm": 0.446463406085968, "learning_rate": 5.4790684953690706e-05, "loss": 2.9176, "step": 7080 }, { "epoch": 0.5232472324723247, "grad_norm": 0.4751204550266266, "learning_rate": 5.466245521456278e-05, "loss": 2.924, "step": 7090 }, { "epoch": 0.5239852398523985, "grad_norm": 0.5041395425796509, "learning_rate": 5.4534194533279e-05, "loss": 2.8624, "step": 7100 }, { "epoch": 0.5247232472324723, "grad_norm": 0.4631516635417938, "learning_rate": 5.4405903761035124e-05, "loss": 2.9072, "step": 7110 }, { "epoch": 0.5254612546125461, "grad_norm": 0.45753976702690125, "learning_rate": 5.427758374922658e-05, "loss": 2.9332, "step": 7120 }, { "epoch": 0.5261992619926199, "grad_norm": 0.4684479236602783, "learning_rate": 5.414923534944283e-05, "loss": 2.9017, "step": 7130 }, { "epoch": 0.5269372693726937, "grad_norm": 0.46777448058128357, "learning_rate": 5.4020859413461756e-05, "loss": 2.9231, "step": 7140 }, { "epoch": 0.5276752767527675, "grad_norm": 0.47089943289756775, "learning_rate": 5.389245679324398e-05, "loss": 2.9215, "step": 7150 }, { "epoch": 0.5284132841328413, "grad_norm": 0.44447311758995056, "learning_rate": 5.376402834092721e-05, "loss": 2.9281, "step": 7160 }, { "epoch": 0.5291512915129152, "grad_norm": 0.47463953495025635, "learning_rate": 5.363557490882057e-05, "loss": 2.947, "step": 7170 }, { "epoch": 0.529889298892989, "grad_norm": 0.47504737973213196, "learning_rate": 5.350709734939897e-05, "loss": 3.0103, "step": 7180 }, { "epoch": 0.5306273062730628, "grad_norm": 0.472151517868042, "learning_rate": 5.337859651529746e-05, "loss": 2.966, "step": 7190 }, { "epoch": 0.5313653136531366, "grad_norm": 0.44552987813949585, "learning_rate": 5.325007325930554e-05, "loss": 2.8962, "step": 7200 }, { "epoch": 0.5321033210332103, "grad_norm": 0.487582266330719, "learning_rate": 5.3121528434361524e-05, "loss": 2.9548, "step": 7210 }, { "epoch": 0.5328413284132841, "grad_norm": 0.47288230061531067, "learning_rate": 5.299296289354681e-05, "loss": 2.8969, "step": 7220 }, { "epoch": 0.5335793357933579, "grad_norm": 0.4963250756263733, "learning_rate": 5.2864377490080306e-05, "loss": 2.9785, "step": 7230 }, { "epoch": 0.5343173431734317, "grad_norm": 0.4519381821155548, "learning_rate": 5.2735773077312814e-05, "loss": 2.9112, "step": 7240 }, { "epoch": 0.5350553505535055, "grad_norm": 0.47766226530075073, "learning_rate": 5.2607150508721195e-05, "loss": 2.8749, "step": 7250 }, { "epoch": 0.5357933579335793, "grad_norm": 0.4712168872356415, "learning_rate": 5.24785106379028e-05, "loss": 2.9148, "step": 7260 }, { "epoch": 0.5365313653136531, "grad_norm": 0.44543230533599854, "learning_rate": 5.234985431856988e-05, "loss": 2.9281, "step": 7270 }, { "epoch": 0.537269372693727, "grad_norm": 0.46235865354537964, "learning_rate": 5.2221182404543754e-05, "loss": 2.9294, "step": 7280 }, { "epoch": 0.5380073800738008, "grad_norm": 0.4579477608203888, "learning_rate": 5.2092495749749346e-05, "loss": 2.9286, "step": 7290 }, { "epoch": 0.5387453874538746, "grad_norm": 0.4533149302005768, "learning_rate": 5.196379520820929e-05, "loss": 2.9063, "step": 7300 }, { "epoch": 0.5394833948339484, "grad_norm": 0.48128604888916016, "learning_rate": 5.183508163403845e-05, "loss": 2.8985, "step": 7310 }, { "epoch": 0.5402214022140222, "grad_norm": 0.46598076820373535, "learning_rate": 5.170635588143816e-05, "loss": 2.9074, "step": 7320 }, { "epoch": 0.5409594095940959, "grad_norm": 0.4706079363822937, "learning_rate": 5.157761880469058e-05, "loss": 2.9216, "step": 7330 }, { "epoch": 0.5416974169741697, "grad_norm": 0.45854324102401733, "learning_rate": 5.144887125815301e-05, "loss": 2.9771, "step": 7340 }, { "epoch": 0.5424354243542435, "grad_norm": 0.4575222134590149, "learning_rate": 5.132011409625224e-05, "loss": 2.878, "step": 7350 }, { "epoch": 0.5431734317343173, "grad_norm": 0.45603683590888977, "learning_rate": 5.1191348173478884e-05, "loss": 2.9328, "step": 7360 }, { "epoch": 0.5439114391143911, "grad_norm": 0.47662872076034546, "learning_rate": 5.1062574344381686e-05, "loss": 2.9483, "step": 7370 }, { "epoch": 0.5446494464944649, "grad_norm": 0.4564341604709625, "learning_rate": 5.093379346356185e-05, "loss": 2.8084, "step": 7380 }, { "epoch": 0.5453874538745388, "grad_norm": 0.4610985219478607, "learning_rate": 5.080500638566741e-05, "loss": 2.9255, "step": 7390 }, { "epoch": 0.5461254612546126, "grad_norm": 0.46059536933898926, "learning_rate": 5.0676213965387475e-05, "loss": 2.851, "step": 7400 }, { "epoch": 0.5468634686346864, "grad_norm": 0.482048362493515, "learning_rate": 5.0547417057446665e-05, "loss": 2.9626, "step": 7410 }, { "epoch": 0.5476014760147602, "grad_norm": 0.4469466209411621, "learning_rate": 5.0418616516599346e-05, "loss": 2.8261, "step": 7420 }, { "epoch": 0.548339483394834, "grad_norm": 0.4489482343196869, "learning_rate": 5.028981319762399e-05, "loss": 2.9388, "step": 7430 }, { "epoch": 0.5490774907749078, "grad_norm": 0.4895458221435547, "learning_rate": 5.016100795531754e-05, "loss": 2.9598, "step": 7440 }, { "epoch": 0.5498154981549815, "grad_norm": 0.45136043429374695, "learning_rate": 5.003220164448967e-05, "loss": 2.8466, "step": 7450 }, { "epoch": 0.5505535055350553, "grad_norm": 0.4319990873336792, "learning_rate": 4.990339511995718e-05, "loss": 2.8589, "step": 7460 }, { "epoch": 0.5512915129151291, "grad_norm": 0.4822845458984375, "learning_rate": 4.977458923653823e-05, "loss": 2.8766, "step": 7470 }, { "epoch": 0.5520295202952029, "grad_norm": 0.4683190882205963, "learning_rate": 4.9645784849046786e-05, "loss": 2.9471, "step": 7480 }, { "epoch": 0.5527675276752767, "grad_norm": 0.4755018353462219, "learning_rate": 4.9516982812286854e-05, "loss": 2.9336, "step": 7490 }, { "epoch": 0.5535055350553506, "grad_norm": 0.4847009778022766, "learning_rate": 4.938818398104685e-05, "loss": 2.8928, "step": 7500 }, { "epoch": 0.5542435424354244, "grad_norm": 0.49205484986305237, "learning_rate": 4.92593892100939e-05, "loss": 2.9413, "step": 7510 }, { "epoch": 0.5549815498154982, "grad_norm": 0.4603287875652313, "learning_rate": 4.913059935416822e-05, "loss": 2.8814, "step": 7520 }, { "epoch": 0.555719557195572, "grad_norm": 0.4724648594856262, "learning_rate": 4.900181526797737e-05, "loss": 2.9493, "step": 7530 }, { "epoch": 0.5564575645756458, "grad_norm": 0.6270569562911987, "learning_rate": 4.887303780619066e-05, "loss": 2.9201, "step": 7540 }, { "epoch": 0.5571955719557196, "grad_norm": 0.4619079828262329, "learning_rate": 4.874426782343338e-05, "loss": 2.915, "step": 7550 }, { "epoch": 0.5579335793357934, "grad_norm": 0.45699045062065125, "learning_rate": 4.861550617428122e-05, "loss": 2.914, "step": 7560 }, { "epoch": 0.5586715867158671, "grad_norm": 0.46511203050613403, "learning_rate": 4.8486753713254586e-05, "loss": 2.8837, "step": 7570 }, { "epoch": 0.5594095940959409, "grad_norm": 0.4465058147907257, "learning_rate": 4.835801129481287e-05, "loss": 2.9087, "step": 7580 }, { "epoch": 0.5601476014760147, "grad_norm": 0.4666641652584076, "learning_rate": 4.8229279773348845e-05, "loss": 2.9486, "step": 7590 }, { "epoch": 0.5608856088560885, "grad_norm": 0.4582604765892029, "learning_rate": 4.810056000318293e-05, "loss": 2.9275, "step": 7600 }, { "epoch": 0.5616236162361624, "grad_norm": 0.4589548408985138, "learning_rate": 4.7971852838557565e-05, "loss": 2.8683, "step": 7610 }, { "epoch": 0.5623616236162362, "grad_norm": 0.4380606412887573, "learning_rate": 4.78431591336316e-05, "loss": 2.8368, "step": 7620 }, { "epoch": 0.56309963099631, "grad_norm": 0.44517070055007935, "learning_rate": 4.771447974247449e-05, "loss": 2.8804, "step": 7630 }, { "epoch": 0.5638376383763838, "grad_norm": 0.46472036838531494, "learning_rate": 4.7585815519060694e-05, "loss": 2.8983, "step": 7640 }, { "epoch": 0.5645756457564576, "grad_norm": 0.47114098072052, "learning_rate": 4.7457167317264064e-05, "loss": 2.9284, "step": 7650 }, { "epoch": 0.5653136531365314, "grad_norm": 0.4522678256034851, "learning_rate": 4.732853599085207e-05, "loss": 2.8971, "step": 7660 }, { "epoch": 0.5660516605166052, "grad_norm": 0.46045982837677, "learning_rate": 4.719992239348024e-05, "loss": 2.844, "step": 7670 }, { "epoch": 0.566789667896679, "grad_norm": 0.4543171525001526, "learning_rate": 4.7071327378686386e-05, "loss": 2.9121, "step": 7680 }, { "epoch": 0.5675276752767527, "grad_norm": 0.48567166924476624, "learning_rate": 4.6942751799885054e-05, "loss": 2.9274, "step": 7690 }, { "epoch": 0.5682656826568265, "grad_norm": 0.4700704514980316, "learning_rate": 4.681419651036177e-05, "loss": 2.9872, "step": 7700 }, { "epoch": 0.5690036900369003, "grad_norm": 0.44953039288520813, "learning_rate": 4.6685662363267415e-05, "loss": 2.873, "step": 7710 }, { "epoch": 0.5697416974169742, "grad_norm": 0.46205776929855347, "learning_rate": 4.655715021161258e-05, "loss": 2.8282, "step": 7720 }, { "epoch": 0.570479704797048, "grad_norm": 0.4394710063934326, "learning_rate": 4.6428660908261864e-05, "loss": 2.8753, "step": 7730 }, { "epoch": 0.5712177121771218, "grad_norm": 0.43995216488838196, "learning_rate": 4.6300195305928243e-05, "loss": 2.7643, "step": 7740 }, { "epoch": 0.5719557195571956, "grad_norm": 0.4612707495689392, "learning_rate": 4.617175425716741e-05, "loss": 2.8683, "step": 7750 }, { "epoch": 0.5726937269372694, "grad_norm": 0.4660702347755432, "learning_rate": 4.604333861437207e-05, "loss": 2.9493, "step": 7760 }, { "epoch": 0.5734317343173432, "grad_norm": 0.47154900431632996, "learning_rate": 4.591494922976637e-05, "loss": 2.9493, "step": 7770 }, { "epoch": 0.574169741697417, "grad_norm": 0.4602459967136383, "learning_rate": 4.578658695540018e-05, "loss": 2.9144, "step": 7780 }, { "epoch": 0.5749077490774908, "grad_norm": 0.4484480917453766, "learning_rate": 4.5658252643143435e-05, "loss": 2.9145, "step": 7790 }, { "epoch": 0.5756457564575646, "grad_norm": 0.469936341047287, "learning_rate": 4.552994714468055e-05, "loss": 2.8947, "step": 7800 }, { "epoch": 0.5763837638376383, "grad_norm": 0.48601603507995605, "learning_rate": 4.5401671311504616e-05, "loss": 2.9164, "step": 7810 }, { "epoch": 0.5771217712177121, "grad_norm": 0.46561533212661743, "learning_rate": 4.5273425994912e-05, "loss": 2.8656, "step": 7820 }, { "epoch": 0.5778597785977859, "grad_norm": 0.48168033361434937, "learning_rate": 4.5145212045996446e-05, "loss": 2.8667, "step": 7830 }, { "epoch": 0.5785977859778598, "grad_norm": 0.45122450590133667, "learning_rate": 4.5017030315643536e-05, "loss": 2.9668, "step": 7840 }, { "epoch": 0.5793357933579336, "grad_norm": 0.4591752290725708, "learning_rate": 4.4888881654525057e-05, "loss": 2.8924, "step": 7850 }, { "epoch": 0.5800738007380074, "grad_norm": 0.4341951906681061, "learning_rate": 4.4760766913093325e-05, "loss": 2.8232, "step": 7860 }, { "epoch": 0.5808118081180812, "grad_norm": 0.46191418170928955, "learning_rate": 4.463268694157556e-05, "loss": 2.9198, "step": 7870 }, { "epoch": 0.581549815498155, "grad_norm": 0.43734246492385864, "learning_rate": 4.450464258996822e-05, "loss": 2.8755, "step": 7880 }, { "epoch": 0.5822878228782288, "grad_norm": 0.4456181228160858, "learning_rate": 4.437663470803137e-05, "loss": 2.8545, "step": 7890 }, { "epoch": 0.5830258302583026, "grad_norm": 0.46855318546295166, "learning_rate": 4.4248664145283054e-05, "loss": 2.8658, "step": 7900 }, { "epoch": 0.5837638376383764, "grad_norm": 0.4666096568107605, "learning_rate": 4.4120731750993645e-05, "loss": 2.9317, "step": 7910 }, { "epoch": 0.5845018450184502, "grad_norm": 0.46038341522216797, "learning_rate": 4.3992838374180234e-05, "loss": 2.9288, "step": 7920 }, { "epoch": 0.5852398523985239, "grad_norm": 0.47123417258262634, "learning_rate": 4.386498486360094e-05, "loss": 2.9348, "step": 7930 }, { "epoch": 0.5859778597785977, "grad_norm": 0.43836262822151184, "learning_rate": 4.373717206774935e-05, "loss": 2.8594, "step": 7940 }, { "epoch": 0.5867158671586716, "grad_norm": 0.46412384510040283, "learning_rate": 4.360940083484881e-05, "loss": 2.9131, "step": 7950 }, { "epoch": 0.5874538745387454, "grad_norm": 0.43723878264427185, "learning_rate": 4.3481672012846865e-05, "loss": 2.9116, "step": 7960 }, { "epoch": 0.5881918819188192, "grad_norm": 0.46796315908432007, "learning_rate": 4.335398644940957e-05, "loss": 2.9236, "step": 7970 }, { "epoch": 0.588929889298893, "grad_norm": 0.4761864244937897, "learning_rate": 4.322634499191594e-05, "loss": 2.8988, "step": 7980 }, { "epoch": 0.5896678966789668, "grad_norm": 0.4379028081893921, "learning_rate": 4.309874848745225e-05, "loss": 2.851, "step": 7990 }, { "epoch": 0.5904059040590406, "grad_norm": 0.4515070617198944, "learning_rate": 4.297119778280645e-05, "loss": 2.8823, "step": 8000 }, { "epoch": 0.5911439114391144, "grad_norm": 0.456480473279953, "learning_rate": 4.2843693724462555e-05, "loss": 2.9163, "step": 8010 }, { "epoch": 0.5918819188191882, "grad_norm": 0.4556421935558319, "learning_rate": 4.271623715859501e-05, "loss": 2.8997, "step": 8020 }, { "epoch": 0.592619926199262, "grad_norm": 0.4618515372276306, "learning_rate": 4.2588828931063086e-05, "loss": 2.9223, "step": 8030 }, { "epoch": 0.5933579335793358, "grad_norm": 0.4617830812931061, "learning_rate": 4.246146988740525e-05, "loss": 2.8476, "step": 8040 }, { "epoch": 0.5940959409594095, "grad_norm": 0.43721622228622437, "learning_rate": 4.233416087283354e-05, "loss": 2.9253, "step": 8050 }, { "epoch": 0.5948339483394834, "grad_norm": 0.43407517671585083, "learning_rate": 4.2206902732228015e-05, "loss": 2.9307, "step": 8060 }, { "epoch": 0.5955719557195572, "grad_norm": 0.4590218663215637, "learning_rate": 4.207969631013109e-05, "loss": 2.9194, "step": 8070 }, { "epoch": 0.596309963099631, "grad_norm": 0.45232662558555603, "learning_rate": 4.195254245074196e-05, "loss": 2.814, "step": 8080 }, { "epoch": 0.5970479704797048, "grad_norm": 0.47659075260162354, "learning_rate": 4.1825441997911016e-05, "loss": 2.8991, "step": 8090 }, { "epoch": 0.5977859778597786, "grad_norm": 0.4390777349472046, "learning_rate": 4.169839579513415e-05, "loss": 2.8377, "step": 8100 }, { "epoch": 0.5985239852398524, "grad_norm": 0.44624418020248413, "learning_rate": 4.1571404685547265e-05, "loss": 2.9126, "step": 8110 }, { "epoch": 0.5992619926199262, "grad_norm": 0.4411090314388275, "learning_rate": 4.14444695119207e-05, "loss": 2.8661, "step": 8120 }, { "epoch": 0.6, "grad_norm": 0.45906946063041687, "learning_rate": 4.131759111665349e-05, "loss": 2.8862, "step": 8130 }, { "epoch": 0.6007380073800738, "grad_norm": 0.450738787651062, "learning_rate": 4.1190770341767884e-05, "loss": 2.8788, "step": 8140 }, { "epoch": 0.6014760147601476, "grad_norm": 0.4635327458381653, "learning_rate": 4.1064008028903766e-05, "loss": 2.8856, "step": 8150 }, { "epoch": 0.6022140221402214, "grad_norm": 0.46390798687934875, "learning_rate": 4.093730501931301e-05, "loss": 2.8435, "step": 8160 }, { "epoch": 0.6029520295202953, "grad_norm": 0.46583694219589233, "learning_rate": 4.0810662153853955e-05, "loss": 2.9068, "step": 8170 }, { "epoch": 0.603690036900369, "grad_norm": 0.441485732793808, "learning_rate": 4.068408027298576e-05, "loss": 2.9141, "step": 8180 }, { "epoch": 0.6044280442804428, "grad_norm": 0.43635720014572144, "learning_rate": 4.0557560216762884e-05, "loss": 2.8165, "step": 8190 }, { "epoch": 0.6051660516605166, "grad_norm": 0.45056867599487305, "learning_rate": 4.0431102824829495e-05, "loss": 2.8923, "step": 8200 }, { "epoch": 0.6059040590405904, "grad_norm": 0.47618359327316284, "learning_rate": 4.030470893641387e-05, "loss": 2.8337, "step": 8210 }, { "epoch": 0.6066420664206642, "grad_norm": 0.46678489446640015, "learning_rate": 4.0178379390322896e-05, "loss": 2.9041, "step": 8220 }, { "epoch": 0.607380073800738, "grad_norm": 0.45858731865882874, "learning_rate": 4.0052115024936396e-05, "loss": 2.8919, "step": 8230 }, { "epoch": 0.6081180811808118, "grad_norm": 0.46500325202941895, "learning_rate": 3.9925916678201656e-05, "loss": 2.7873, "step": 8240 }, { "epoch": 0.6088560885608856, "grad_norm": 0.4576093256473541, "learning_rate": 3.9799785187627844e-05, "loss": 2.9581, "step": 8250 }, { "epoch": 0.6095940959409594, "grad_norm": 0.4603584408760071, "learning_rate": 3.96737213902804e-05, "loss": 2.932, "step": 8260 }, { "epoch": 0.6103321033210332, "grad_norm": 0.4474504888057709, "learning_rate": 3.954772612277556e-05, "loss": 2.8907, "step": 8270 }, { "epoch": 0.6110701107011071, "grad_norm": 0.4676888585090637, "learning_rate": 3.942180022127475e-05, "loss": 2.9279, "step": 8280 }, { "epoch": 0.6118081180811809, "grad_norm": 0.4762161374092102, "learning_rate": 3.929594452147903e-05, "loss": 2.8668, "step": 8290 }, { "epoch": 0.6125461254612546, "grad_norm": 0.45031213760375977, "learning_rate": 3.917015985862364e-05, "loss": 3.0203, "step": 8300 }, { "epoch": 0.6132841328413284, "grad_norm": 0.4627397656440735, "learning_rate": 3.904444706747227e-05, "loss": 2.8669, "step": 8310 }, { "epoch": 0.6140221402214022, "grad_norm": 0.4964381456375122, "learning_rate": 3.891880698231176e-05, "loss": 2.8888, "step": 8320 }, { "epoch": 0.614760147601476, "grad_norm": 0.4690164029598236, "learning_rate": 3.879324043694639e-05, "loss": 2.8772, "step": 8330 }, { "epoch": 0.6154981549815498, "grad_norm": 0.46316999197006226, "learning_rate": 3.8667748264692355e-05, "loss": 2.9203, "step": 8340 }, { "epoch": 0.6162361623616236, "grad_norm": 0.46457648277282715, "learning_rate": 3.854233129837233e-05, "loss": 2.8959, "step": 8350 }, { "epoch": 0.6169741697416974, "grad_norm": 0.46210619807243347, "learning_rate": 3.841699037030989e-05, "loss": 2.9754, "step": 8360 }, { "epoch": 0.6177121771217712, "grad_norm": 0.4708150029182434, "learning_rate": 3.829172631232395e-05, "loss": 2.8779, "step": 8370 }, { "epoch": 0.618450184501845, "grad_norm": 0.4539421498775482, "learning_rate": 3.8166539955723315e-05, "loss": 2.7857, "step": 8380 }, { "epoch": 0.6191881918819189, "grad_norm": 0.4383450150489807, "learning_rate": 3.80414321313011e-05, "loss": 2.9466, "step": 8390 }, { "epoch": 0.6199261992619927, "grad_norm": 0.47667232155799866, "learning_rate": 3.791640366932926e-05, "loss": 2.8896, "step": 8400 }, { "epoch": 0.6206642066420665, "grad_norm": 0.47078999876976013, "learning_rate": 3.7791455399553054e-05, "loss": 2.8787, "step": 8410 }, { "epoch": 0.6214022140221402, "grad_norm": 0.4621264934539795, "learning_rate": 3.7666588151185586e-05, "loss": 2.9516, "step": 8420 }, { "epoch": 0.622140221402214, "grad_norm": 0.4561121165752411, "learning_rate": 3.754180275290222e-05, "loss": 2.8712, "step": 8430 }, { "epoch": 0.6228782287822878, "grad_norm": 0.4745158851146698, "learning_rate": 3.741710003283515e-05, "loss": 2.9942, "step": 8440 }, { "epoch": 0.6236162361623616, "grad_norm": 0.4506776034832001, "learning_rate": 3.729248081856788e-05, "loss": 2.8662, "step": 8450 }, { "epoch": 0.6243542435424354, "grad_norm": 0.4925256073474884, "learning_rate": 3.716794593712973e-05, "loss": 2.9148, "step": 8460 }, { "epoch": 0.6250922509225092, "grad_norm": 0.4477274715900421, "learning_rate": 3.704349621499032e-05, "loss": 2.8946, "step": 8470 }, { "epoch": 0.625830258302583, "grad_norm": 0.45974335074424744, "learning_rate": 3.691913247805415e-05, "loss": 2.8444, "step": 8480 }, { "epoch": 0.6265682656826568, "grad_norm": 0.4468931555747986, "learning_rate": 3.6794855551655095e-05, "loss": 2.8183, "step": 8490 }, { "epoch": 0.6273062730627307, "grad_norm": 0.45352327823638916, "learning_rate": 3.6670666260550866e-05, "loss": 2.8385, "step": 8500 }, { "epoch": 0.6280442804428045, "grad_norm": 0.48543328046798706, "learning_rate": 3.654656542891762e-05, "loss": 2.8982, "step": 8510 }, { "epoch": 0.6287822878228783, "grad_norm": 0.47315549850463867, "learning_rate": 3.642255388034448e-05, "loss": 2.8477, "step": 8520 }, { "epoch": 0.629520295202952, "grad_norm": 0.4466278851032257, "learning_rate": 3.629863243782799e-05, "loss": 2.9499, "step": 8530 }, { "epoch": 0.6302583025830258, "grad_norm": 0.4634998142719269, "learning_rate": 3.617480192376676e-05, "loss": 2.9209, "step": 8540 }, { "epoch": 0.6309963099630996, "grad_norm": 0.4444449841976166, "learning_rate": 3.6051063159955914e-05, "loss": 2.8547, "step": 8550 }, { "epoch": 0.6317343173431734, "grad_norm": 0.4805346727371216, "learning_rate": 3.592741696758171e-05, "loss": 2.9504, "step": 8560 }, { "epoch": 0.6324723247232472, "grad_norm": 0.4576335549354553, "learning_rate": 3.580386416721605e-05, "loss": 2.8166, "step": 8570 }, { "epoch": 0.633210332103321, "grad_norm": 0.48051634430885315, "learning_rate": 3.568040557881106e-05, "loss": 2.8457, "step": 8580 }, { "epoch": 0.6339483394833948, "grad_norm": 0.45053961873054504, "learning_rate": 3.55570420216936e-05, "loss": 2.8554, "step": 8590 }, { "epoch": 0.6346863468634686, "grad_norm": 0.4763762652873993, "learning_rate": 3.543377431455991e-05, "loss": 2.9245, "step": 8600 }, { "epoch": 0.6354243542435425, "grad_norm": 0.466516375541687, "learning_rate": 3.531060327547003e-05, "loss": 2.8784, "step": 8610 }, { "epoch": 0.6361623616236163, "grad_norm": 0.4508006274700165, "learning_rate": 3.51875297218426e-05, "loss": 2.8572, "step": 8620 }, { "epoch": 0.6369003690036901, "grad_norm": 0.43419796228408813, "learning_rate": 3.506455447044923e-05, "loss": 2.9553, "step": 8630 }, { "epoch": 0.6376383763837639, "grad_norm": 0.4657207131385803, "learning_rate": 3.494167833740912e-05, "loss": 2.9388, "step": 8640 }, { "epoch": 0.6383763837638377, "grad_norm": 0.47769656777381897, "learning_rate": 3.481890213818374e-05, "loss": 2.889, "step": 8650 }, { "epoch": 0.6391143911439114, "grad_norm": 0.452332466840744, "learning_rate": 3.469622668757132e-05, "loss": 2.8618, "step": 8660 }, { "epoch": 0.6398523985239852, "grad_norm": 0.44228044152259827, "learning_rate": 3.457365279970147e-05, "loss": 2.858, "step": 8670 }, { "epoch": 0.640590405904059, "grad_norm": 0.45381829142570496, "learning_rate": 3.4451181288029835e-05, "loss": 2.9324, "step": 8680 }, { "epoch": 0.6413284132841328, "grad_norm": 0.45243462920188904, "learning_rate": 3.4328812965332566e-05, "loss": 2.8569, "step": 8690 }, { "epoch": 0.6420664206642066, "grad_norm": 0.44624003767967224, "learning_rate": 3.420654864370107e-05, "loss": 2.8305, "step": 8700 }, { "epoch": 0.6428044280442804, "grad_norm": 0.45331937074661255, "learning_rate": 3.408438913453652e-05, "loss": 2.9233, "step": 8710 }, { "epoch": 0.6435424354243543, "grad_norm": 0.46031826734542847, "learning_rate": 3.396233524854453e-05, "loss": 2.8136, "step": 8720 }, { "epoch": 0.6442804428044281, "grad_norm": 0.4405251443386078, "learning_rate": 3.384038779572975e-05, "loss": 2.8196, "step": 8730 }, { "epoch": 0.6450184501845019, "grad_norm": 0.433918297290802, "learning_rate": 3.371854758539047e-05, "loss": 2.828, "step": 8740 }, { "epoch": 0.6457564575645757, "grad_norm": 0.437752366065979, "learning_rate": 3.3596815426113285e-05, "loss": 2.9084, "step": 8750 }, { "epoch": 0.6464944649446495, "grad_norm": 0.4461667537689209, "learning_rate": 3.3475192125767715e-05, "loss": 2.9163, "step": 8760 }, { "epoch": 0.6472324723247233, "grad_norm": 0.44983482360839844, "learning_rate": 3.335367849150084e-05, "loss": 2.8624, "step": 8770 }, { "epoch": 0.647970479704797, "grad_norm": 0.444402813911438, "learning_rate": 3.323227532973193e-05, "loss": 2.8645, "step": 8780 }, { "epoch": 0.6487084870848708, "grad_norm": 0.47475096583366394, "learning_rate": 3.311098344614715e-05, "loss": 2.8599, "step": 8790 }, { "epoch": 0.6494464944649446, "grad_norm": 0.42691770195961, "learning_rate": 3.298980364569413e-05, "loss": 2.9367, "step": 8800 }, { "epoch": 0.6501845018450184, "grad_norm": 0.43761834502220154, "learning_rate": 3.2868736732576696e-05, "loss": 2.8071, "step": 8810 }, { "epoch": 0.6509225092250922, "grad_norm": 0.4337967336177826, "learning_rate": 3.274778351024949e-05, "loss": 2.7961, "step": 8820 }, { "epoch": 0.6516605166051661, "grad_norm": 0.4518975615501404, "learning_rate": 3.262694478141265e-05, "loss": 2.8445, "step": 8830 }, { "epoch": 0.6523985239852399, "grad_norm": 0.44520917534828186, "learning_rate": 3.250622134800651e-05, "loss": 2.8298, "step": 8840 }, { "epoch": 0.6531365313653137, "grad_norm": 0.47246819734573364, "learning_rate": 3.238561401120619e-05, "loss": 2.8721, "step": 8850 }, { "epoch": 0.6538745387453875, "grad_norm": 0.46341249346733093, "learning_rate": 3.226512357141639e-05, "loss": 2.8465, "step": 8860 }, { "epoch": 0.6546125461254613, "grad_norm": 0.4418579339981079, "learning_rate": 3.214475082826602e-05, "loss": 2.7495, "step": 8870 }, { "epoch": 0.6553505535055351, "grad_norm": 0.4572698771953583, "learning_rate": 3.2024496580602895e-05, "loss": 2.8405, "step": 8880 }, { "epoch": 0.6560885608856089, "grad_norm": 0.4518590569496155, "learning_rate": 3.1904361626488464e-05, "loss": 2.8698, "step": 8890 }, { "epoch": 0.6568265682656826, "grad_norm": 0.49694785475730896, "learning_rate": 3.178434676319243e-05, "loss": 2.9178, "step": 8900 }, { "epoch": 0.6575645756457564, "grad_norm": 0.44036176800727844, "learning_rate": 3.166445278718758e-05, "loss": 2.9042, "step": 8910 }, { "epoch": 0.6583025830258302, "grad_norm": 0.4740366041660309, "learning_rate": 3.154468049414444e-05, "loss": 2.791, "step": 8920 }, { "epoch": 0.659040590405904, "grad_norm": 0.44894149899482727, "learning_rate": 3.1425030678925944e-05, "loss": 2.8882, "step": 8930 }, { "epoch": 0.6597785977859778, "grad_norm": 0.45504188537597656, "learning_rate": 3.1305504135582244e-05, "loss": 2.82, "step": 8940 }, { "epoch": 0.6605166051660517, "grad_norm": 0.45306116342544556, "learning_rate": 3.118610165734539e-05, "loss": 2.8076, "step": 8950 }, { "epoch": 0.6612546125461255, "grad_norm": 0.4355803430080414, "learning_rate": 3.106682403662409e-05, "loss": 2.8458, "step": 8960 }, { "epoch": 0.6619926199261993, "grad_norm": 0.45864707231521606, "learning_rate": 3.094767206499844e-05, "loss": 2.7888, "step": 8970 }, { "epoch": 0.6627306273062731, "grad_norm": 0.4467925727367401, "learning_rate": 3.082864653321466e-05, "loss": 2.8862, "step": 8980 }, { "epoch": 0.6634686346863469, "grad_norm": 0.4361802935600281, "learning_rate": 3.0709748231179855e-05, "loss": 2.8405, "step": 8990 }, { "epoch": 0.6642066420664207, "grad_norm": 0.4502997398376465, "learning_rate": 3.059097794795681e-05, "loss": 2.8651, "step": 9000 }, { "epoch": 0.6649446494464945, "grad_norm": 0.446232408285141, "learning_rate": 3.0472336471758678e-05, "loss": 2.9009, "step": 9010 }, { "epoch": 0.6656826568265682, "grad_norm": 0.4600978493690491, "learning_rate": 3.0353824589943834e-05, "loss": 2.8842, "step": 9020 }, { "epoch": 0.666420664206642, "grad_norm": 0.45147082209587097, "learning_rate": 3.0235443089010562e-05, "loss": 2.842, "step": 9030 }, { "epoch": 0.6671586715867158, "grad_norm": 0.470324844121933, "learning_rate": 3.0117192754591893e-05, "loss": 2.9098, "step": 9040 }, { "epoch": 0.6678966789667896, "grad_norm": 0.4519864320755005, "learning_rate": 2.999907437145042e-05, "loss": 2.917, "step": 9050 }, { "epoch": 0.6686346863468635, "grad_norm": 0.44655749201774597, "learning_rate": 2.9881088723472966e-05, "loss": 2.9205, "step": 9060 }, { "epoch": 0.6693726937269373, "grad_norm": 0.45969992876052856, "learning_rate": 2.9763236593665533e-05, "loss": 2.8726, "step": 9070 }, { "epoch": 0.6701107011070111, "grad_norm": 0.45693284273147583, "learning_rate": 2.9645518764148007e-05, "loss": 2.8753, "step": 9080 }, { "epoch": 0.6708487084870849, "grad_norm": 0.442354291677475, "learning_rate": 2.9527936016149006e-05, "loss": 2.8377, "step": 9090 }, { "epoch": 0.6715867158671587, "grad_norm": 0.4796278476715088, "learning_rate": 2.9410489130000684e-05, "loss": 2.8303, "step": 9100 }, { "epoch": 0.6723247232472325, "grad_norm": 0.4597807824611664, "learning_rate": 2.9293178885133525e-05, "loss": 2.8325, "step": 9110 }, { "epoch": 0.6730627306273063, "grad_norm": 0.47112002968788147, "learning_rate": 2.917600606007127e-05, "loss": 2.8479, "step": 9120 }, { "epoch": 0.67380073800738, "grad_norm": 0.4425598978996277, "learning_rate": 2.905897143242562e-05, "loss": 2.8416, "step": 9130 }, { "epoch": 0.6745387453874538, "grad_norm": 0.4444707930088043, "learning_rate": 2.8942075778891153e-05, "loss": 2.9409, "step": 9140 }, { "epoch": 0.6752767527675276, "grad_norm": 0.4575837254524231, "learning_rate": 2.882531987524017e-05, "loss": 2.8615, "step": 9150 }, { "epoch": 0.6760147601476014, "grad_norm": 0.4663306176662445, "learning_rate": 2.8708704496317474e-05, "loss": 2.8184, "step": 9160 }, { "epoch": 0.6767527675276753, "grad_norm": 0.441550076007843, "learning_rate": 2.8592230416035335e-05, "loss": 2.8981, "step": 9170 }, { "epoch": 0.6774907749077491, "grad_norm": 0.47013741731643677, "learning_rate": 2.8475898407368296e-05, "loss": 2.9034, "step": 9180 }, { "epoch": 0.6782287822878229, "grad_norm": 0.47934868931770325, "learning_rate": 2.8359709242348032e-05, "loss": 2.9483, "step": 9190 }, { "epoch": 0.6789667896678967, "grad_norm": 0.44904670119285583, "learning_rate": 2.824366369205825e-05, "loss": 2.9038, "step": 9200 }, { "epoch": 0.6797047970479705, "grad_norm": 0.4706343710422516, "learning_rate": 2.8127762526629553e-05, "loss": 2.8976, "step": 9210 }, { "epoch": 0.6804428044280443, "grad_norm": 0.4544294774532318, "learning_rate": 2.801200651523438e-05, "loss": 2.8875, "step": 9220 }, { "epoch": 0.6811808118081181, "grad_norm": 0.4476546347141266, "learning_rate": 2.7896396426081844e-05, "loss": 2.8378, "step": 9230 }, { "epoch": 0.6819188191881919, "grad_norm": 0.4503355920314789, "learning_rate": 2.7780933026412602e-05, "loss": 2.8917, "step": 9240 }, { "epoch": 0.6826568265682657, "grad_norm": 0.4393197298049927, "learning_rate": 2.766561708249387e-05, "loss": 2.7785, "step": 9250 }, { "epoch": 0.6833948339483394, "grad_norm": 0.45384228229522705, "learning_rate": 2.7550449359614272e-05, "loss": 2.8712, "step": 9260 }, { "epoch": 0.6841328413284132, "grad_norm": 0.462931752204895, "learning_rate": 2.743543062207876e-05, "loss": 2.9299, "step": 9270 }, { "epoch": 0.6848708487084871, "grad_norm": 0.4446216821670532, "learning_rate": 2.7320561633203566e-05, "loss": 2.93, "step": 9280 }, { "epoch": 0.6856088560885609, "grad_norm": 0.4498085677623749, "learning_rate": 2.7205843155311094e-05, "loss": 2.8614, "step": 9290 }, { "epoch": 0.6863468634686347, "grad_norm": 0.44905975461006165, "learning_rate": 2.7091275949724926e-05, "loss": 2.8681, "step": 9300 }, { "epoch": 0.6870848708487085, "grad_norm": 0.4424300491809845, "learning_rate": 2.6976860776764713e-05, "loss": 2.8048, "step": 9310 }, { "epoch": 0.6878228782287823, "grad_norm": 0.46064937114715576, "learning_rate": 2.6862598395741136e-05, "loss": 2.8376, "step": 9320 }, { "epoch": 0.6885608856088561, "grad_norm": 0.45401063561439514, "learning_rate": 2.6748489564950908e-05, "loss": 2.8168, "step": 9330 }, { "epoch": 0.6892988929889299, "grad_norm": 0.4572742283344269, "learning_rate": 2.6634535041671693e-05, "loss": 2.8182, "step": 9340 }, { "epoch": 0.6900369003690037, "grad_norm": 0.4515658915042877, "learning_rate": 2.652073558215711e-05, "loss": 2.8569, "step": 9350 }, { "epoch": 0.6907749077490775, "grad_norm": 0.44633907079696655, "learning_rate": 2.64070919416317e-05, "loss": 2.8684, "step": 9360 }, { "epoch": 0.6915129151291513, "grad_norm": 0.4616515636444092, "learning_rate": 2.6293604874285927e-05, "loss": 2.8791, "step": 9370 }, { "epoch": 0.692250922509225, "grad_norm": 0.4603336751461029, "learning_rate": 2.618027513327116e-05, "loss": 2.8685, "step": 9380 }, { "epoch": 0.6929889298892989, "grad_norm": 0.4635460376739502, "learning_rate": 2.6067103470694672e-05, "loss": 2.8819, "step": 9390 }, { "epoch": 0.6937269372693727, "grad_norm": 0.446821004152298, "learning_rate": 2.5954090637614658e-05, "loss": 2.8775, "step": 9400 }, { "epoch": 0.6944649446494465, "grad_norm": 0.45208224654197693, "learning_rate": 2.5841237384035265e-05, "loss": 2.9185, "step": 9410 }, { "epoch": 0.6952029520295203, "grad_norm": 0.43966442346572876, "learning_rate": 2.5728544458901593e-05, "loss": 2.844, "step": 9420 }, { "epoch": 0.6959409594095941, "grad_norm": 0.4660171866416931, "learning_rate": 2.5616012610094704e-05, "loss": 2.8533, "step": 9430 }, { "epoch": 0.6966789667896679, "grad_norm": 0.4844834804534912, "learning_rate": 2.5503642584426712e-05, "loss": 2.9139, "step": 9440 }, { "epoch": 0.6974169741697417, "grad_norm": 0.4675824046134949, "learning_rate": 2.5391435127635805e-05, "loss": 2.857, "step": 9450 }, { "epoch": 0.6981549815498155, "grad_norm": 0.4488329291343689, "learning_rate": 2.5279390984381264e-05, "loss": 2.8484, "step": 9460 }, { "epoch": 0.6988929889298893, "grad_norm": 0.4558933675289154, "learning_rate": 2.5167510898238566e-05, "loss": 2.8784, "step": 9470 }, { "epoch": 0.6996309963099631, "grad_norm": 0.45454517006874084, "learning_rate": 2.5055795611694433e-05, "loss": 2.8075, "step": 9480 }, { "epoch": 0.7003690036900369, "grad_norm": 0.4401450455188751, "learning_rate": 2.4944245866141886e-05, "loss": 2.8661, "step": 9490 }, { "epoch": 0.7011070110701108, "grad_norm": 0.42718032002449036, "learning_rate": 2.4832862401875378e-05, "loss": 2.8306, "step": 9500 }, { "epoch": 0.7018450184501845, "grad_norm": 0.4444067180156708, "learning_rate": 2.472164595808576e-05, "loss": 2.887, "step": 9510 }, { "epoch": 0.7025830258302583, "grad_norm": 0.4388265311717987, "learning_rate": 2.461059727285558e-05, "loss": 2.9248, "step": 9520 }, { "epoch": 0.7033210332103321, "grad_norm": 0.4537127614021301, "learning_rate": 2.449971708315397e-05, "loss": 2.866, "step": 9530 }, { "epoch": 0.7040590405904059, "grad_norm": 0.4571674168109894, "learning_rate": 2.4389006124831893e-05, "loss": 2.8524, "step": 9540 }, { "epoch": 0.7047970479704797, "grad_norm": 0.475065678358078, "learning_rate": 2.4278465132617207e-05, "loss": 2.9086, "step": 9550 }, { "epoch": 0.7055350553505535, "grad_norm": 0.4491478204727173, "learning_rate": 2.4168094840109785e-05, "loss": 2.8496, "step": 9560 }, { "epoch": 0.7062730627306273, "grad_norm": 0.4396122694015503, "learning_rate": 2.4057895979776683e-05, "loss": 2.8542, "step": 9570 }, { "epoch": 0.7070110701107011, "grad_norm": 0.45730844140052795, "learning_rate": 2.394786928294726e-05, "loss": 2.8448, "step": 9580 }, { "epoch": 0.7077490774907749, "grad_norm": 11.993217468261719, "learning_rate": 2.3838015479808263e-05, "loss": 2.8686, "step": 9590 }, { "epoch": 0.7084870848708487, "grad_norm": 0.4676622450351715, "learning_rate": 2.3728335299399106e-05, "loss": 2.8195, "step": 9600 }, { "epoch": 0.7092250922509226, "grad_norm": 0.4665907621383667, "learning_rate": 2.3618829469606912e-05, "loss": 2.8851, "step": 9610 }, { "epoch": 0.7099630996309964, "grad_norm": 0.4478704631328583, "learning_rate": 2.3509498717161804e-05, "loss": 2.8631, "step": 9620 }, { "epoch": 0.7107011070110701, "grad_norm": 0.4518534541130066, "learning_rate": 2.3400343767631944e-05, "loss": 2.8542, "step": 9630 }, { "epoch": 0.7114391143911439, "grad_norm": 0.45083850622177124, "learning_rate": 2.329136534541882e-05, "loss": 2.8447, "step": 9640 }, { "epoch": 0.7121771217712177, "grad_norm": 0.44704335927963257, "learning_rate": 2.3182564173752396e-05, "loss": 2.8001, "step": 9650 }, { "epoch": 0.7129151291512915, "grad_norm": 0.459086149930954, "learning_rate": 2.3073940974686337e-05, "loss": 2.8562, "step": 9660 }, { "epoch": 0.7136531365313653, "grad_norm": 0.4504683017730713, "learning_rate": 2.296549646909315e-05, "loss": 2.8153, "step": 9670 }, { "epoch": 0.7143911439114391, "grad_norm": 0.4484894275665283, "learning_rate": 2.2857231376659516e-05, "loss": 2.8652, "step": 9680 }, { "epoch": 0.7151291512915129, "grad_norm": 0.44552645087242126, "learning_rate": 2.274914641588141e-05, "loss": 2.8544, "step": 9690 }, { "epoch": 0.7158671586715867, "grad_norm": 0.44962164759635925, "learning_rate": 2.2641242304059394e-05, "loss": 2.809, "step": 9700 }, { "epoch": 0.7166051660516605, "grad_norm": 0.4649772047996521, "learning_rate": 2.2533519757293803e-05, "loss": 2.9047, "step": 9710 }, { "epoch": 0.7173431734317344, "grad_norm": 0.44465893507003784, "learning_rate": 2.242597949048008e-05, "loss": 2.9289, "step": 9720 }, { "epoch": 0.7180811808118082, "grad_norm": 0.4587944746017456, "learning_rate": 2.2318622217303935e-05, "loss": 2.9381, "step": 9730 }, { "epoch": 0.718819188191882, "grad_norm": 0.45747023820877075, "learning_rate": 2.221144865023666e-05, "loss": 2.8596, "step": 9740 }, { "epoch": 0.7195571955719557, "grad_norm": 0.4500565528869629, "learning_rate": 2.2104459500530362e-05, "loss": 2.8122, "step": 9750 }, { "epoch": 0.7202952029520295, "grad_norm": 0.44870901107788086, "learning_rate": 2.1997655478213313e-05, "loss": 2.8318, "step": 9760 }, { "epoch": 0.7210332103321033, "grad_norm": 0.46823742985725403, "learning_rate": 2.1891037292085175e-05, "loss": 2.7682, "step": 9770 }, { "epoch": 0.7217712177121771, "grad_norm": 0.4822959899902344, "learning_rate": 2.1784605649712324e-05, "loss": 2.8845, "step": 9780 }, { "epoch": 0.7225092250922509, "grad_norm": 0.4569961726665497, "learning_rate": 2.167836125742315e-05, "loss": 2.8073, "step": 9790 }, { "epoch": 0.7232472324723247, "grad_norm": 0.5003052949905396, "learning_rate": 2.1572304820303363e-05, "loss": 2.966, "step": 9800 }, { "epoch": 0.7239852398523985, "grad_norm": 0.4504786431789398, "learning_rate": 2.1466437042191297e-05, "loss": 2.8226, "step": 9810 }, { "epoch": 0.7247232472324723, "grad_norm": 0.4485565423965454, "learning_rate": 2.1360758625673327e-05, "loss": 2.8301, "step": 9820 }, { "epoch": 0.7254612546125462, "grad_norm": 0.46124571561813354, "learning_rate": 2.1255270272079042e-05, "loss": 2.8485, "step": 9830 }, { "epoch": 0.72619926199262, "grad_norm": 0.4612502455711365, "learning_rate": 2.1149972681476765e-05, "loss": 2.8276, "step": 9840 }, { "epoch": 0.7269372693726938, "grad_norm": 0.45740193128585815, "learning_rate": 2.104486655266879e-05, "loss": 2.8669, "step": 9850 }, { "epoch": 0.7276752767527676, "grad_norm": 0.47378960251808167, "learning_rate": 2.0939952583186807e-05, "loss": 2.8149, "step": 9860 }, { "epoch": 0.7284132841328413, "grad_norm": 0.45929577946662903, "learning_rate": 2.0835231469287232e-05, "loss": 2.8346, "step": 9870 }, { "epoch": 0.7291512915129151, "grad_norm": 0.45453017950057983, "learning_rate": 2.0730703905946612e-05, "loss": 2.8851, "step": 9880 }, { "epoch": 0.7298892988929889, "grad_norm": 0.4465833604335785, "learning_rate": 2.0626370586857007e-05, "loss": 2.8381, "step": 9890 }, { "epoch": 0.7306273062730627, "grad_norm": 0.46699321269989014, "learning_rate": 2.052223220442139e-05, "loss": 2.8394, "step": 9900 }, { "epoch": 0.7313653136531365, "grad_norm": 0.4374259412288666, "learning_rate": 2.0418289449749027e-05, "loss": 2.8501, "step": 9910 }, { "epoch": 0.7321033210332103, "grad_norm": 0.4604252576828003, "learning_rate": 2.0314543012650933e-05, "loss": 2.8711, "step": 9920 }, { "epoch": 0.7328413284132841, "grad_norm": 0.45612022280693054, "learning_rate": 2.0210993581635256e-05, "loss": 2.844, "step": 9930 }, { "epoch": 0.7335793357933579, "grad_norm": 0.43427881598472595, "learning_rate": 2.0107641843902726e-05, "loss": 2.8084, "step": 9940 }, { "epoch": 0.7343173431734318, "grad_norm": 0.4502193331718445, "learning_rate": 2.0004488485342088e-05, "loss": 2.909, "step": 9950 }, { "epoch": 0.7350553505535056, "grad_norm": 0.44448336958885193, "learning_rate": 1.9901534190525566e-05, "loss": 2.8662, "step": 9960 }, { "epoch": 0.7357933579335794, "grad_norm": 0.4308652877807617, "learning_rate": 1.9798779642704297e-05, "loss": 2.7882, "step": 9970 }, { "epoch": 0.7365313653136532, "grad_norm": 0.4563472867012024, "learning_rate": 1.96962255238038e-05, "loss": 2.8956, "step": 9980 }, { "epoch": 0.7372693726937269, "grad_norm": 0.4397279620170593, "learning_rate": 1.9593872514419476e-05, "loss": 2.7707, "step": 9990 }, { "epoch": 0.7380073800738007, "grad_norm": 0.47456085681915283, "learning_rate": 1.9491721293812076e-05, "loss": 2.9205, "step": 10000 }, { "epoch": 0.7387453874538745, "grad_norm": 0.43729913234710693, "learning_rate": 1.9389772539903122e-05, "loss": 2.8423, "step": 10010 }, { "epoch": 0.7394833948339483, "grad_norm": 0.4417737126350403, "learning_rate": 1.9288026929270587e-05, "loss": 2.832, "step": 10020 }, { "epoch": 0.7402214022140221, "grad_norm": 0.44813665747642517, "learning_rate": 1.9186485137144218e-05, "loss": 2.8494, "step": 10030 }, { "epoch": 0.7409594095940959, "grad_norm": 0.45640864968299866, "learning_rate": 1.908514783740114e-05, "loss": 2.8784, "step": 10040 }, { "epoch": 0.7416974169741697, "grad_norm": 0.4336318373680115, "learning_rate": 1.8984015702561393e-05, "loss": 2.8372, "step": 10050 }, { "epoch": 0.7424354243542436, "grad_norm": 0.4504336714744568, "learning_rate": 1.8883089403783434e-05, "loss": 2.7967, "step": 10060 }, { "epoch": 0.7431734317343174, "grad_norm": 0.46149566769599915, "learning_rate": 1.8782369610859708e-05, "loss": 2.8191, "step": 10070 }, { "epoch": 0.7439114391143912, "grad_norm": 0.4522392451763153, "learning_rate": 1.868185699221221e-05, "loss": 2.8794, "step": 10080 }, { "epoch": 0.744649446494465, "grad_norm": 0.4411635994911194, "learning_rate": 1.8581552214887977e-05, "loss": 2.8404, "step": 10090 }, { "epoch": 0.7453874538745388, "grad_norm": 0.46107056736946106, "learning_rate": 1.848145594455477e-05, "loss": 2.846, "step": 10100 }, { "epoch": 0.7461254612546125, "grad_norm": 0.45308247208595276, "learning_rate": 1.8381568845496578e-05, "loss": 2.807, "step": 10110 }, { "epoch": 0.7468634686346863, "grad_norm": 0.44377437233924866, "learning_rate": 1.828189158060927e-05, "loss": 2.9005, "step": 10120 }, { "epoch": 0.7476014760147601, "grad_norm": 0.45314696431159973, "learning_rate": 1.8182424811396133e-05, "loss": 2.8626, "step": 10130 }, { "epoch": 0.7483394833948339, "grad_norm": 0.4458778202533722, "learning_rate": 1.80831691979635e-05, "loss": 2.7985, "step": 10140 }, { "epoch": 0.7490774907749077, "grad_norm": 0.464269757270813, "learning_rate": 1.7984125399016392e-05, "loss": 2.9386, "step": 10150 }, { "epoch": 0.7498154981549815, "grad_norm": 0.4448395371437073, "learning_rate": 1.7885294071854157e-05, "loss": 2.833, "step": 10160 }, { "epoch": 0.7505535055350554, "grad_norm": 0.4455287754535675, "learning_rate": 1.7786675872366028e-05, "loss": 2.8184, "step": 10170 }, { "epoch": 0.7512915129151292, "grad_norm": 0.4467598497867584, "learning_rate": 1.7688271455026867e-05, "loss": 2.8357, "step": 10180 }, { "epoch": 0.752029520295203, "grad_norm": 0.4642166197299957, "learning_rate": 1.7590081472892776e-05, "loss": 2.9219, "step": 10190 }, { "epoch": 0.7527675276752768, "grad_norm": 0.44453924894332886, "learning_rate": 1.7492106577596772e-05, "loss": 2.8822, "step": 10200 }, { "epoch": 0.7535055350553506, "grad_norm": 0.4599774479866028, "learning_rate": 1.7394347419344432e-05, "loss": 2.8336, "step": 10210 }, { "epoch": 0.7542435424354244, "grad_norm": 0.4477832317352295, "learning_rate": 1.7296804646909654e-05, "loss": 2.785, "step": 10220 }, { "epoch": 0.7549815498154981, "grad_norm": 0.45672887563705444, "learning_rate": 1.7199478907630267e-05, "loss": 2.8166, "step": 10230 }, { "epoch": 0.7557195571955719, "grad_norm": 0.4571615159511566, "learning_rate": 1.710237084740378e-05, "loss": 2.9199, "step": 10240 }, { "epoch": 0.7564575645756457, "grad_norm": 0.4618014991283417, "learning_rate": 1.7005481110683062e-05, "loss": 2.907, "step": 10250 }, { "epoch": 0.7571955719557195, "grad_norm": 0.44089266657829285, "learning_rate": 1.690881034047212e-05, "loss": 2.854, "step": 10260 }, { "epoch": 0.7579335793357933, "grad_norm": 0.4468059837818146, "learning_rate": 1.6812359178321784e-05, "loss": 2.8511, "step": 10270 }, { "epoch": 0.7586715867158672, "grad_norm": 0.4517216682434082, "learning_rate": 1.6716128264325475e-05, "loss": 2.8117, "step": 10280 }, { "epoch": 0.759409594095941, "grad_norm": 0.4576111137866974, "learning_rate": 1.662011823711495e-05, "loss": 2.838, "step": 10290 }, { "epoch": 0.7601476014760148, "grad_norm": 0.4355645179748535, "learning_rate": 1.6524329733856047e-05, "loss": 2.8054, "step": 10300 }, { "epoch": 0.7608856088560886, "grad_norm": 0.4544225037097931, "learning_rate": 1.642876339024446e-05, "loss": 2.8703, "step": 10310 }, { "epoch": 0.7616236162361624, "grad_norm": 0.4510670006275177, "learning_rate": 1.633341984050162e-05, "loss": 2.8265, "step": 10320 }, { "epoch": 0.7623616236162362, "grad_norm": 0.444296658039093, "learning_rate": 1.6238299717370252e-05, "loss": 2.9467, "step": 10330 }, { "epoch": 0.76309963099631, "grad_norm": 0.44352987408638, "learning_rate": 1.614340365211044e-05, "loss": 2.8385, "step": 10340 }, { "epoch": 0.7638376383763837, "grad_norm": 0.4408433139324188, "learning_rate": 1.6048732274495255e-05, "loss": 2.7828, "step": 10350 }, { "epoch": 0.7645756457564575, "grad_norm": 0.4516165554523468, "learning_rate": 1.595428621280668e-05, "loss": 2.8448, "step": 10360 }, { "epoch": 0.7653136531365313, "grad_norm": 0.4665060341358185, "learning_rate": 1.5860066093831367e-05, "loss": 2.8067, "step": 10370 }, { "epoch": 0.7660516605166051, "grad_norm": 0.44463926553726196, "learning_rate": 1.5766072542856526e-05, "loss": 2.8421, "step": 10380 }, { "epoch": 0.766789667896679, "grad_norm": 0.426488995552063, "learning_rate": 1.5672306183665764e-05, "loss": 2.8121, "step": 10390 }, { "epoch": 0.7675276752767528, "grad_norm": 0.44521549344062805, "learning_rate": 1.557876763853493e-05, "loss": 2.7992, "step": 10400 }, { "epoch": 0.7682656826568266, "grad_norm": 0.45438680052757263, "learning_rate": 1.5485457528228003e-05, "loss": 2.8034, "step": 10410 }, { "epoch": 0.7690036900369004, "grad_norm": 0.4456971287727356, "learning_rate": 1.5392376471992965e-05, "loss": 2.8191, "step": 10420 }, { "epoch": 0.7697416974169742, "grad_norm": 0.4459834694862366, "learning_rate": 1.529952508755768e-05, "loss": 2.8668, "step": 10430 }, { "epoch": 0.770479704797048, "grad_norm": 0.4495449960231781, "learning_rate": 1.5206903991125832e-05, "loss": 2.8433, "step": 10440 }, { "epoch": 0.7712177121771218, "grad_norm": 0.4536389708518982, "learning_rate": 1.511451379737278e-05, "loss": 2.8522, "step": 10450 }, { "epoch": 0.7719557195571956, "grad_norm": 0.44112926721572876, "learning_rate": 1.502235511944154e-05, "loss": 2.872, "step": 10460 }, { "epoch": 0.7726937269372693, "grad_norm": 0.43305230140686035, "learning_rate": 1.4930428568938648e-05, "loss": 2.901, "step": 10470 }, { "epoch": 0.7734317343173431, "grad_norm": 0.4792589247226715, "learning_rate": 1.4838734755930167e-05, "loss": 2.7635, "step": 10480 }, { "epoch": 0.7741697416974169, "grad_norm": 0.4358636438846588, "learning_rate": 1.4747274288937596e-05, "loss": 2.8276, "step": 10490 }, { "epoch": 0.7749077490774908, "grad_norm": 0.44949012994766235, "learning_rate": 1.4656047774933874e-05, "loss": 2.8624, "step": 10500 }, { "epoch": 0.7756457564575646, "grad_norm": 0.4440300762653351, "learning_rate": 1.4565055819339235e-05, "loss": 2.8239, "step": 10510 }, { "epoch": 0.7763837638376384, "grad_norm": 0.4554462730884552, "learning_rate": 1.447429902601739e-05, "loss": 2.7734, "step": 10520 }, { "epoch": 0.7771217712177122, "grad_norm": 0.4523858428001404, "learning_rate": 1.4383777997271347e-05, "loss": 2.8976, "step": 10530 }, { "epoch": 0.777859778597786, "grad_norm": 0.46444228291511536, "learning_rate": 1.429349333383948e-05, "loss": 2.8756, "step": 10540 }, { "epoch": 0.7785977859778598, "grad_norm": 0.4419015347957611, "learning_rate": 1.4203445634891538e-05, "loss": 2.8626, "step": 10550 }, { "epoch": 0.7793357933579336, "grad_norm": 0.44527843594551086, "learning_rate": 1.4113635498024664e-05, "loss": 2.8063, "step": 10560 }, { "epoch": 0.7800738007380074, "grad_norm": 0.4554080665111542, "learning_rate": 1.4024063519259439e-05, "loss": 2.7555, "step": 10570 }, { "epoch": 0.7808118081180812, "grad_norm": 0.4289720952510834, "learning_rate": 1.3934730293035936e-05, "loss": 2.8304, "step": 10580 }, { "epoch": 0.7815498154981549, "grad_norm": 0.4606097936630249, "learning_rate": 1.38456364122097e-05, "loss": 2.8415, "step": 10590 }, { "epoch": 0.7822878228782287, "grad_norm": 0.4606861174106598, "learning_rate": 1.3756782468047936e-05, "loss": 2.889, "step": 10600 }, { "epoch": 0.7830258302583026, "grad_norm": 0.425731897354126, "learning_rate": 1.3668169050225472e-05, "loss": 2.8573, "step": 10610 }, { "epoch": 0.7837638376383764, "grad_norm": 0.4634413421154022, "learning_rate": 1.357979674682095e-05, "loss": 2.8677, "step": 10620 }, { "epoch": 0.7845018450184502, "grad_norm": 0.45793548226356506, "learning_rate": 1.349166614431282e-05, "loss": 2.9207, "step": 10630 }, { "epoch": 0.785239852398524, "grad_norm": 0.4642331898212433, "learning_rate": 1.3403777827575514e-05, "loss": 2.887, "step": 10640 }, { "epoch": 0.7859778597785978, "grad_norm": 0.4591294825077057, "learning_rate": 1.3316132379875551e-05, "loss": 2.8502, "step": 10650 }, { "epoch": 0.7867158671586716, "grad_norm": 0.4461764395236969, "learning_rate": 1.322873038286766e-05, "loss": 2.8357, "step": 10660 }, { "epoch": 0.7874538745387454, "grad_norm": 0.4518667757511139, "learning_rate": 1.3141572416590891e-05, "loss": 2.9274, "step": 10670 }, { "epoch": 0.7881918819188192, "grad_norm": 0.435041606426239, "learning_rate": 1.3054659059464835e-05, "loss": 2.7578, "step": 10680 }, { "epoch": 0.788929889298893, "grad_norm": 0.45000597834587097, "learning_rate": 1.2967990888285737e-05, "loss": 2.8792, "step": 10690 }, { "epoch": 0.7896678966789668, "grad_norm": 0.4507540464401245, "learning_rate": 1.2881568478222672e-05, "loss": 2.9286, "step": 10700 }, { "epoch": 0.7904059040590405, "grad_norm": 0.44547247886657715, "learning_rate": 1.2795392402813715e-05, "loss": 2.7792, "step": 10710 }, { "epoch": 0.7911439114391144, "grad_norm": 0.4526568353176117, "learning_rate": 1.2709463233962204e-05, "loss": 2.8923, "step": 10720 }, { "epoch": 0.7918819188191882, "grad_norm": 0.4650912284851074, "learning_rate": 1.262378154193285e-05, "loss": 2.7767, "step": 10730 }, { "epoch": 0.792619926199262, "grad_norm": 0.4619973301887512, "learning_rate": 1.2538347895348013e-05, "loss": 2.7074, "step": 10740 }, { "epoch": 0.7933579335793358, "grad_norm": 0.4545031487941742, "learning_rate": 1.2453162861183909e-05, "loss": 2.832, "step": 10750 }, { "epoch": 0.7940959409594096, "grad_norm": 0.45016980171203613, "learning_rate": 1.236822700476683e-05, "loss": 2.8709, "step": 10760 }, { "epoch": 0.7948339483394834, "grad_norm": 0.41397857666015625, "learning_rate": 1.2283540889769445e-05, "loss": 2.7864, "step": 10770 }, { "epoch": 0.7955719557195572, "grad_norm": 0.47167348861694336, "learning_rate": 1.2199105078207001e-05, "loss": 2.7768, "step": 10780 }, { "epoch": 0.796309963099631, "grad_norm": 0.46366357803344727, "learning_rate": 1.2114920130433644e-05, "loss": 2.8994, "step": 10790 }, { "epoch": 0.7970479704797048, "grad_norm": 0.4539276957511902, "learning_rate": 1.2030986605138644e-05, "loss": 2.8526, "step": 10800 }, { "epoch": 0.7977859778597786, "grad_norm": 0.430576354265213, "learning_rate": 1.1947305059342729e-05, "loss": 2.7993, "step": 10810 }, { "epoch": 0.7985239852398524, "grad_norm": 0.4400356113910675, "learning_rate": 1.1863876048394407e-05, "loss": 2.9068, "step": 10820 }, { "epoch": 0.7992619926199263, "grad_norm": 0.44879478216171265, "learning_rate": 1.1780700125966233e-05, "loss": 2.8591, "step": 10830 }, { "epoch": 0.8, "grad_norm": 0.44169095158576965, "learning_rate": 1.1697777844051105e-05, "loss": 2.793, "step": 10840 }, { "epoch": 0.8007380073800738, "grad_norm": 0.45461106300354004, "learning_rate": 1.1615109752958713e-05, "loss": 2.9182, "step": 10850 }, { "epoch": 0.8014760147601476, "grad_norm": 0.4425186812877655, "learning_rate": 1.1532696401311787e-05, "loss": 2.8754, "step": 10860 }, { "epoch": 0.8022140221402214, "grad_norm": 0.4334977865219116, "learning_rate": 1.1450538336042516e-05, "loss": 2.8037, "step": 10870 }, { "epoch": 0.8029520295202952, "grad_norm": 0.43513453006744385, "learning_rate": 1.1368636102388868e-05, "loss": 2.8548, "step": 10880 }, { "epoch": 0.803690036900369, "grad_norm": 0.4428231716156006, "learning_rate": 1.1286990243891011e-05, "loss": 2.8673, "step": 10890 }, { "epoch": 0.8044280442804428, "grad_norm": 0.4509079158306122, "learning_rate": 1.1205601302387692e-05, "loss": 2.9012, "step": 10900 }, { "epoch": 0.8051660516605166, "grad_norm": 0.44838449358940125, "learning_rate": 1.1124469818012635e-05, "loss": 2.8056, "step": 10910 }, { "epoch": 0.8059040590405904, "grad_norm": 0.4536844491958618, "learning_rate": 1.1043596329190964e-05, "loss": 2.883, "step": 10920 }, { "epoch": 0.8066420664206642, "grad_norm": 0.44634494185447693, "learning_rate": 1.0962981372635628e-05, "loss": 2.8049, "step": 10930 }, { "epoch": 0.8073800738007381, "grad_norm": 0.4615216553211212, "learning_rate": 1.0882625483343845e-05, "loss": 2.9058, "step": 10940 }, { "epoch": 0.8081180811808119, "grad_norm": 0.4436852037906647, "learning_rate": 1.0802529194593547e-05, "loss": 2.8492, "step": 10950 }, { "epoch": 0.8088560885608856, "grad_norm": 0.4358108341693878, "learning_rate": 1.0722693037939818e-05, "loss": 2.8513, "step": 10960 }, { "epoch": 0.8095940959409594, "grad_norm": 0.45849135518074036, "learning_rate": 1.0643117543211422e-05, "loss": 2.8141, "step": 10970 }, { "epoch": 0.8103321033210332, "grad_norm": 0.4694216251373291, "learning_rate": 1.0563803238507219e-05, "loss": 2.8304, "step": 10980 }, { "epoch": 0.811070110701107, "grad_norm": 0.4531688094139099, "learning_rate": 1.0484750650192726e-05, "loss": 2.9128, "step": 10990 }, { "epoch": 0.8118081180811808, "grad_norm": 0.4585440754890442, "learning_rate": 1.0405960302896562e-05, "loss": 2.8299, "step": 11000 }, { "epoch": 0.8125461254612546, "grad_norm": 0.4274667799472809, "learning_rate": 1.0327432719507019e-05, "loss": 2.7979, "step": 11010 }, { "epoch": 0.8132841328413284, "grad_norm": 0.43614691495895386, "learning_rate": 1.0249168421168558e-05, "loss": 2.8119, "step": 11020 }, { "epoch": 0.8140221402214022, "grad_norm": 0.45556968450546265, "learning_rate": 1.0171167927278368e-05, "loss": 2.9038, "step": 11030 }, { "epoch": 0.814760147601476, "grad_norm": 0.44112008810043335, "learning_rate": 1.0093431755482908e-05, "loss": 2.9019, "step": 11040 }, { "epoch": 0.8154981549815498, "grad_norm": 0.444204181432724, "learning_rate": 1.001596042167447e-05, "loss": 2.7909, "step": 11050 }, { "epoch": 0.8162361623616237, "grad_norm": 0.427478551864624, "learning_rate": 9.93875443998778e-06, "loss": 2.8195, "step": 11060 }, { "epoch": 0.8169741697416975, "grad_norm": 0.4325047433376312, "learning_rate": 9.861814322796553e-06, "loss": 2.8227, "step": 11070 }, { "epoch": 0.8177121771217712, "grad_norm": 0.4463500380516052, "learning_rate": 9.785140580710107e-06, "loss": 2.8502, "step": 11080 }, { "epoch": 0.818450184501845, "grad_norm": 0.44314101338386536, "learning_rate": 9.708733722569996e-06, "loss": 2.8617, "step": 11090 }, { "epoch": 0.8191881918819188, "grad_norm": 0.43770846724510193, "learning_rate": 9.632594255446565e-06, "loss": 2.815, "step": 11100 }, { "epoch": 0.8199261992619926, "grad_norm": 0.48664426803588867, "learning_rate": 9.556722684635667e-06, "loss": 2.8386, "step": 11110 }, { "epoch": 0.8206642066420664, "grad_norm": 0.42718470096588135, "learning_rate": 9.48111951365529e-06, "loss": 2.7743, "step": 11120 }, { "epoch": 0.8214022140221402, "grad_norm": 0.4534224569797516, "learning_rate": 9.405785244242165e-06, "loss": 2.885, "step": 11130 }, { "epoch": 0.822140221402214, "grad_norm": 0.4469706118106842, "learning_rate": 9.330720376348483e-06, "loss": 2.7431, "step": 11140 }, { "epoch": 0.8228782287822878, "grad_norm": 0.4499460756778717, "learning_rate": 9.25592540813857e-06, "loss": 2.8604, "step": 11150 }, { "epoch": 0.8236162361623616, "grad_norm": 0.4386638104915619, "learning_rate": 9.18140083598557e-06, "loss": 2.797, "step": 11160 }, { "epoch": 0.8243542435424355, "grad_norm": 0.4377821683883667, "learning_rate": 9.10714715446817e-06, "loss": 2.8071, "step": 11170 }, { "epoch": 0.8250922509225093, "grad_norm": 0.4503236413002014, "learning_rate": 9.03316485636727e-06, "loss": 2.8215, "step": 11180 }, { "epoch": 0.825830258302583, "grad_norm": 0.4537326693534851, "learning_rate": 8.959454432662778e-06, "loss": 2.7938, "step": 11190 }, { "epoch": 0.8265682656826568, "grad_norm": 0.4477526843547821, "learning_rate": 8.88601637253032e-06, "loss": 2.7778, "step": 11200 }, { "epoch": 0.8273062730627306, "grad_norm": 0.45014604926109314, "learning_rate": 8.812851163337975e-06, "loss": 2.792, "step": 11210 }, { "epoch": 0.8280442804428044, "grad_norm": 0.44553130865097046, "learning_rate": 8.739959290643097e-06, "loss": 2.8268, "step": 11220 }, { "epoch": 0.8287822878228782, "grad_norm": 0.45030757784843445, "learning_rate": 8.667341238189009e-06, "loss": 2.8332, "step": 11230 }, { "epoch": 0.829520295202952, "grad_norm": 0.44522371888160706, "learning_rate": 8.594997487901879e-06, "loss": 2.8526, "step": 11240 }, { "epoch": 0.8302583025830258, "grad_norm": 0.46951159834861755, "learning_rate": 8.522928519887463e-06, "loss": 2.8052, "step": 11250 }, { "epoch": 0.8309963099630996, "grad_norm": 0.45531222224235535, "learning_rate": 8.451134812427925e-06, "loss": 2.8108, "step": 11260 }, { "epoch": 0.8317343173431734, "grad_norm": 0.4519606828689575, "learning_rate": 8.379616841978699e-06, "loss": 2.8302, "step": 11270 }, { "epoch": 0.8324723247232473, "grad_norm": 0.45735597610473633, "learning_rate": 8.308375083165298e-06, "loss": 2.9323, "step": 11280 }, { "epoch": 0.8332103321033211, "grad_norm": 0.4518982172012329, "learning_rate": 8.237410008780161e-06, "loss": 2.796, "step": 11290 }, { "epoch": 0.8339483394833949, "grad_norm": 0.4294179379940033, "learning_rate": 8.166722089779539e-06, "loss": 2.8383, "step": 11300 }, { "epoch": 0.8346863468634687, "grad_norm": 0.43325817584991455, "learning_rate": 8.096311795280331e-06, "loss": 2.7896, "step": 11310 }, { "epoch": 0.8354243542435424, "grad_norm": 0.4492734670639038, "learning_rate": 8.026179592557037e-06, "loss": 2.8272, "step": 11320 }, { "epoch": 0.8361623616236162, "grad_norm": 0.4338243007659912, "learning_rate": 7.956325947038584e-06, "loss": 2.8173, "step": 11330 }, { "epoch": 0.83690036900369, "grad_norm": 0.4449402987957001, "learning_rate": 7.886751322305247e-06, "loss": 2.8244, "step": 11340 }, { "epoch": 0.8376383763837638, "grad_norm": 0.44180235266685486, "learning_rate": 7.817456180085636e-06, "loss": 2.8902, "step": 11350 }, { "epoch": 0.8383763837638376, "grad_norm": 0.45504215359687805, "learning_rate": 7.748440980253562e-06, "loss": 2.8344, "step": 11360 }, { "epoch": 0.8391143911439114, "grad_norm": 0.4654461443424225, "learning_rate": 7.67970618082503e-06, "loss": 2.8335, "step": 11370 }, { "epoch": 0.8398523985239852, "grad_norm": 0.47360721230506897, "learning_rate": 7.611252237955169e-06, "loss": 2.8943, "step": 11380 }, { "epoch": 0.8405904059040591, "grad_norm": 0.4570152461528778, "learning_rate": 7.543079605935221e-06, "loss": 2.8674, "step": 11390 }, { "epoch": 0.8413284132841329, "grad_norm": 0.41285139322280884, "learning_rate": 7.47518873718952e-06, "loss": 2.8292, "step": 11400 }, { "epoch": 0.8420664206642067, "grad_norm": 0.45135176181793213, "learning_rate": 7.407580082272492e-06, "loss": 2.7573, "step": 11410 }, { "epoch": 0.8428044280442805, "grad_norm": 0.4763992726802826, "learning_rate": 7.340254089865672e-06, "loss": 2.8902, "step": 11420 }, { "epoch": 0.8435424354243543, "grad_norm": 0.480816513299942, "learning_rate": 7.27321120677471e-06, "loss": 2.9058, "step": 11430 }, { "epoch": 0.844280442804428, "grad_norm": 0.4476820230484009, "learning_rate": 7.206451877926418e-06, "loss": 2.8191, "step": 11440 }, { "epoch": 0.8450184501845018, "grad_norm": 0.4477422833442688, "learning_rate": 7.139976546365817e-06, "loss": 2.8023, "step": 11450 }, { "epoch": 0.8457564575645756, "grad_norm": 0.4407312572002411, "learning_rate": 7.0737856532531895e-06, "loss": 2.8368, "step": 11460 }, { "epoch": 0.8464944649446494, "grad_norm": 0.45549750328063965, "learning_rate": 7.007879637861159e-06, "loss": 2.8561, "step": 11470 }, { "epoch": 0.8472324723247232, "grad_norm": 0.4288015067577362, "learning_rate": 6.942258937571772e-06, "loss": 2.7234, "step": 11480 }, { "epoch": 0.847970479704797, "grad_norm": 0.4370770752429962, "learning_rate": 6.87692398787359e-06, "loss": 2.8607, "step": 11490 }, { "epoch": 0.8487084870848709, "grad_norm": 0.44784659147262573, "learning_rate": 6.81187522235881e-06, "loss": 2.78, "step": 11500 }, { "epoch": 0.8494464944649447, "grad_norm": 0.43501320481300354, "learning_rate": 6.747113072720385e-06, "loss": 2.8121, "step": 11510 }, { "epoch": 0.8501845018450185, "grad_norm": 0.4419308006763458, "learning_rate": 6.6826379687491505e-06, "loss": 2.8502, "step": 11520 }, { "epoch": 0.8509225092250923, "grad_norm": 0.4417872130870819, "learning_rate": 6.6184503383309784e-06, "loss": 2.8042, "step": 11530 }, { "epoch": 0.8516605166051661, "grad_norm": 0.4433625638484955, "learning_rate": 6.5545506074439325e-06, "loss": 2.7962, "step": 11540 }, { "epoch": 0.8523985239852399, "grad_norm": 0.44587311148643494, "learning_rate": 6.490939200155449e-06, "loss": 2.841, "step": 11550 }, { "epoch": 0.8531365313653136, "grad_norm": 0.4439995288848877, "learning_rate": 6.427616538619524e-06, "loss": 2.8195, "step": 11560 }, { "epoch": 0.8538745387453874, "grad_norm": 0.4364805519580841, "learning_rate": 6.3645830430739015e-06, "loss": 2.7775, "step": 11570 }, { "epoch": 0.8546125461254612, "grad_norm": 0.4607424736022949, "learning_rate": 6.301839131837284e-06, "loss": 2.907, "step": 11580 }, { "epoch": 0.855350553505535, "grad_norm": 0.45834723114967346, "learning_rate": 6.239385221306587e-06, "loss": 2.8708, "step": 11590 }, { "epoch": 0.8560885608856088, "grad_norm": 0.43934082984924316, "learning_rate": 6.177221725954102e-06, "loss": 2.8159, "step": 11600 }, { "epoch": 0.8568265682656827, "grad_norm": 0.4437257945537567, "learning_rate": 6.1153490583248265e-06, "loss": 2.8734, "step": 11610 }, { "epoch": 0.8575645756457565, "grad_norm": 0.43929627537727356, "learning_rate": 6.053767629033713e-06, "loss": 2.874, "step": 11620 }, { "epoch": 0.8583025830258303, "grad_norm": 0.4439617097377777, "learning_rate": 5.992477846762895e-06, "loss": 2.8252, "step": 11630 }, { "epoch": 0.8590405904059041, "grad_norm": 0.4464716613292694, "learning_rate": 5.931480118259003e-06, "loss": 2.78, "step": 11640 }, { "epoch": 0.8597785977859779, "grad_norm": 0.43279653787612915, "learning_rate": 5.870774848330485e-06, "loss": 2.749, "step": 11650 }, { "epoch": 0.8605166051660517, "grad_norm": 0.4490513503551483, "learning_rate": 5.810362439844896e-06, "loss": 2.841, "step": 11660 }, { "epoch": 0.8612546125461255, "grad_norm": 0.4711556136608124, "learning_rate": 5.750243293726226e-06, "loss": 2.7801, "step": 11670 }, { "epoch": 0.8619926199261992, "grad_norm": 0.4525899887084961, "learning_rate": 5.690417808952242e-06, "loss": 2.8942, "step": 11680 }, { "epoch": 0.862730627306273, "grad_norm": 0.44727823138237, "learning_rate": 5.6308863825518425e-06, "loss": 2.8095, "step": 11690 }, { "epoch": 0.8634686346863468, "grad_norm": 0.43965160846710205, "learning_rate": 5.571649409602436e-06, "loss": 2.8073, "step": 11700 }, { "epoch": 0.8642066420664206, "grad_norm": 0.45212361216545105, "learning_rate": 5.512707283227275e-06, "loss": 2.8849, "step": 11710 }, { "epoch": 0.8649446494464945, "grad_norm": 0.4664202332496643, "learning_rate": 5.454060394592919e-06, "loss": 2.8199, "step": 11720 }, { "epoch": 0.8656826568265683, "grad_norm": 0.4387909471988678, "learning_rate": 5.395709132906568e-06, "loss": 2.8372, "step": 11730 }, { "epoch": 0.8664206642066421, "grad_norm": 0.4543474018573761, "learning_rate": 5.337653885413513e-06, "loss": 2.8331, "step": 11740 }, { "epoch": 0.8671586715867159, "grad_norm": 0.45128577947616577, "learning_rate": 5.279895037394566e-06, "loss": 2.8062, "step": 11750 }, { "epoch": 0.8678966789667897, "grad_norm": 0.4404621422290802, "learning_rate": 5.222432972163482e-06, "loss": 2.9088, "step": 11760 }, { "epoch": 0.8686346863468635, "grad_norm": 0.4398937225341797, "learning_rate": 5.165268071064455e-06, "loss": 2.7826, "step": 11770 }, { "epoch": 0.8693726937269373, "grad_norm": 0.4395955204963684, "learning_rate": 5.108400713469546e-06, "loss": 2.8196, "step": 11780 }, { "epoch": 0.870110701107011, "grad_norm": 0.43461933732032776, "learning_rate": 5.051831276776203e-06, "loss": 2.8663, "step": 11790 }, { "epoch": 0.8708487084870848, "grad_norm": 0.4447794258594513, "learning_rate": 4.995560136404709e-06, "loss": 2.8519, "step": 11800 }, { "epoch": 0.8715867158671586, "grad_norm": 0.4266679286956787, "learning_rate": 4.939587665795736e-06, "loss": 2.8062, "step": 11810 }, { "epoch": 0.8723247232472324, "grad_norm": 0.4411248564720154, "learning_rate": 4.88391423640786e-06, "loss": 2.8758, "step": 11820 }, { "epoch": 0.8730627306273063, "grad_norm": 0.44381076097488403, "learning_rate": 4.828540217715066e-06, "loss": 2.7979, "step": 11830 }, { "epoch": 0.8738007380073801, "grad_norm": 0.44569119811058044, "learning_rate": 4.773465977204311e-06, "loss": 2.8081, "step": 11840 }, { "epoch": 0.8745387453874539, "grad_norm": 0.48127833008766174, "learning_rate": 4.718691880373094e-06, "loss": 2.8617, "step": 11850 }, { "epoch": 0.8752767527675277, "grad_norm": 0.45613643527030945, "learning_rate": 4.664218290727035e-06, "loss": 2.8187, "step": 11860 }, { "epoch": 0.8760147601476015, "grad_norm": 0.440491646528244, "learning_rate": 4.610045569777444e-06, "loss": 2.8023, "step": 11870 }, { "epoch": 0.8767527675276753, "grad_norm": 0.4358707368373871, "learning_rate": 4.5561740770389275e-06, "loss": 2.8102, "step": 11880 }, { "epoch": 0.8774907749077491, "grad_norm": 0.43503841757774353, "learning_rate": 4.502604170027019e-06, "loss": 2.8204, "step": 11890 }, { "epoch": 0.8782287822878229, "grad_norm": 0.4486919343471527, "learning_rate": 4.449336204255777e-06, "loss": 2.8827, "step": 11900 }, { "epoch": 0.8789667896678967, "grad_norm": 0.43869447708129883, "learning_rate": 4.396370533235455e-06, "loss": 2.8374, "step": 11910 }, { "epoch": 0.8797047970479704, "grad_norm": 0.45128440856933594, "learning_rate": 4.343707508470135e-06, "loss": 2.8906, "step": 11920 }, { "epoch": 0.8804428044280442, "grad_norm": 0.46216467022895813, "learning_rate": 4.291347479455405e-06, "loss": 2.8381, "step": 11930 }, { "epoch": 0.8811808118081181, "grad_norm": 0.4366297721862793, "learning_rate": 4.2392907936760265e-06, "loss": 2.8183, "step": 11940 }, { "epoch": 0.8819188191881919, "grad_norm": 0.45038753747940063, "learning_rate": 4.187537796603658e-06, "loss": 2.7906, "step": 11950 }, { "epoch": 0.8826568265682657, "grad_norm": 0.45959797501564026, "learning_rate": 4.136088831694524e-06, "loss": 2.8724, "step": 11960 }, { "epoch": 0.8833948339483395, "grad_norm": 0.4413219392299652, "learning_rate": 4.084944240387168e-06, "loss": 2.8541, "step": 11970 }, { "epoch": 0.8841328413284133, "grad_norm": 0.47469910979270935, "learning_rate": 4.034104362100155e-06, "loss": 2.9288, "step": 11980 }, { "epoch": 0.8848708487084871, "grad_norm": 0.43708014488220215, "learning_rate": 3.983569534229864e-06, "loss": 2.7833, "step": 11990 }, { "epoch": 0.8856088560885609, "grad_norm": 0.44569307565689087, "learning_rate": 3.933340092148202e-06, "loss": 2.8684, "step": 12000 }, { "epoch": 0.8863468634686347, "grad_norm": 0.462568998336792, "learning_rate": 3.883416369200399e-06, "loss": 2.8399, "step": 12010 }, { "epoch": 0.8870848708487085, "grad_norm": 0.4384634494781494, "learning_rate": 3.8337986967028e-06, "loss": 2.837, "step": 12020 }, { "epoch": 0.8878228782287823, "grad_norm": 0.46717679500579834, "learning_rate": 3.7844874039406674e-06, "loss": 2.8523, "step": 12030 }, { "epoch": 0.888560885608856, "grad_norm": 0.4314653277397156, "learning_rate": 3.7354828181659695e-06, "loss": 2.8815, "step": 12040 }, { "epoch": 0.8892988929889298, "grad_norm": 0.43344810605049133, "learning_rate": 3.6867852645952494e-06, "loss": 2.7918, "step": 12050 }, { "epoch": 0.8900369003690037, "grad_norm": 0.46255967020988464, "learning_rate": 3.6383950664074405e-06, "loss": 2.8106, "step": 12060 }, { "epoch": 0.8907749077490775, "grad_norm": 0.44985824823379517, "learning_rate": 3.5903125447417196e-06, "loss": 2.8244, "step": 12070 }, { "epoch": 0.8915129151291513, "grad_norm": 0.441011518239975, "learning_rate": 3.5425380186953904e-06, "loss": 2.8061, "step": 12080 }, { "epoch": 0.8922509225092251, "grad_norm": 0.4453372359275818, "learning_rate": 3.495071805321759e-06, "loss": 2.9384, "step": 12090 }, { "epoch": 0.8929889298892989, "grad_norm": 0.43761390447616577, "learning_rate": 3.447914219628029e-06, "loss": 2.7863, "step": 12100 }, { "epoch": 0.8937269372693727, "grad_norm": 0.4433492124080658, "learning_rate": 3.4010655745731865e-06, "loss": 2.8553, "step": 12110 }, { "epoch": 0.8944649446494465, "grad_norm": 0.43299391865730286, "learning_rate": 3.354526181066003e-06, "loss": 2.7823, "step": 12120 }, { "epoch": 0.8952029520295203, "grad_norm": 0.45678773522377014, "learning_rate": 3.308296347962875e-06, "loss": 2.7281, "step": 12130 }, { "epoch": 0.8959409594095941, "grad_norm": 0.4413972795009613, "learning_rate": 3.2623763820658237e-06, "loss": 2.8478, "step": 12140 }, { "epoch": 0.8966789667896679, "grad_norm": 0.44608476758003235, "learning_rate": 3.2167665881204567e-06, "loss": 2.7823, "step": 12150 }, { "epoch": 0.8974169741697416, "grad_norm": 0.4420614242553711, "learning_rate": 3.171467268813938e-06, "loss": 2.8281, "step": 12160 }, { "epoch": 0.8981549815498155, "grad_norm": 0.4385377764701843, "learning_rate": 3.1264787247729908e-06, "loss": 2.7918, "step": 12170 }, { "epoch": 0.8988929889298893, "grad_norm": 0.44008246064186096, "learning_rate": 3.0818012545618835e-06, "loss": 2.793, "step": 12180 }, { "epoch": 0.8996309963099631, "grad_norm": 0.44634199142456055, "learning_rate": 3.0374351546804514e-06, "loss": 2.7829, "step": 12190 }, { "epoch": 0.9003690036900369, "grad_norm": 0.4375803768634796, "learning_rate": 2.9933807195621445e-06, "loss": 2.8107, "step": 12200 }, { "epoch": 0.9011070110701107, "grad_norm": 0.4388578534126282, "learning_rate": 2.9496382415720723e-06, "loss": 2.8524, "step": 12210 }, { "epoch": 0.9018450184501845, "grad_norm": 0.43253517150878906, "learning_rate": 2.9062080110050515e-06, "loss": 2.8215, "step": 12220 }, { "epoch": 0.9025830258302583, "grad_norm": 0.4246656894683838, "learning_rate": 2.8630903160836773e-06, "loss": 2.835, "step": 12230 }, { "epoch": 0.9033210332103321, "grad_norm": 0.4635641872882843, "learning_rate": 2.820285442956422e-06, "loss": 2.829, "step": 12240 }, { "epoch": 0.9040590405904059, "grad_norm": 0.4323824644088745, "learning_rate": 2.7777936756957333e-06, "loss": 2.7945, "step": 12250 }, { "epoch": 0.9047970479704797, "grad_norm": 0.4489029347896576, "learning_rate": 2.7356152962961567e-06, "loss": 2.8904, "step": 12260 }, { "epoch": 0.9055350553505535, "grad_norm": 0.4545091390609741, "learning_rate": 2.6937505846724165e-06, "loss": 2.8889, "step": 12270 }, { "epoch": 0.9062730627306274, "grad_norm": 0.4438563585281372, "learning_rate": 2.6521998186576357e-06, "loss": 2.836, "step": 12280 }, { "epoch": 0.9070110701107011, "grad_norm": 0.4264052212238312, "learning_rate": 2.610963274001438e-06, "loss": 2.7639, "step": 12290 }, { "epoch": 0.9077490774907749, "grad_norm": 0.4508605897426605, "learning_rate": 2.5700412243681417e-06, "loss": 2.7735, "step": 12300 }, { "epoch": 0.9084870848708487, "grad_norm": 0.4573262929916382, "learning_rate": 2.5294339413349076e-06, "loss": 2.8901, "step": 12310 }, { "epoch": 0.9092250922509225, "grad_norm": 0.4440000057220459, "learning_rate": 2.4891416943900014e-06, "loss": 2.8662, "step": 12320 }, { "epoch": 0.9099630996309963, "grad_norm": 0.4513186812400818, "learning_rate": 2.449164750930938e-06, "loss": 2.8268, "step": 12330 }, { "epoch": 0.9107011070110701, "grad_norm": 0.43622398376464844, "learning_rate": 2.409503376262762e-06, "loss": 2.8246, "step": 12340 }, { "epoch": 0.9114391143911439, "grad_norm": 0.44066351652145386, "learning_rate": 2.3701578335962206e-06, "loss": 2.7924, "step": 12350 }, { "epoch": 0.9121771217712177, "grad_norm": 0.4405202269554138, "learning_rate": 2.3311283840460994e-06, "loss": 2.8639, "step": 12360 }, { "epoch": 0.9129151291512915, "grad_norm": 0.4488193094730377, "learning_rate": 2.292415286629418e-06, "loss": 2.8531, "step": 12370 }, { "epoch": 0.9136531365313653, "grad_norm": 0.4245339632034302, "learning_rate": 2.254018798263763e-06, "loss": 2.8349, "step": 12380 }, { "epoch": 0.9143911439114392, "grad_norm": 0.43623387813568115, "learning_rate": 2.2159391737655466e-06, "loss": 2.8225, "step": 12390 }, { "epoch": 0.915129151291513, "grad_norm": 0.4482229948043823, "learning_rate": 2.1781766658483303e-06, "loss": 2.7716, "step": 12400 }, { "epoch": 0.9158671586715867, "grad_norm": 0.450795441865921, "learning_rate": 2.1407315251211422e-06, "loss": 2.7796, "step": 12410 }, { "epoch": 0.9166051660516605, "grad_norm": 0.45314326882362366, "learning_rate": 2.103604000086856e-06, "loss": 2.8009, "step": 12420 }, { "epoch": 0.9173431734317343, "grad_norm": 0.44693273305892944, "learning_rate": 2.066794337140443e-06, "loss": 2.8486, "step": 12430 }, { "epoch": 0.9180811808118081, "grad_norm": 0.43216079473495483, "learning_rate": 2.0303027805674445e-06, "loss": 2.7234, "step": 12440 }, { "epoch": 0.9188191881918819, "grad_norm": 0.45111674070358276, "learning_rate": 1.994129572542286e-06, "loss": 2.7963, "step": 12450 }, { "epoch": 0.9195571955719557, "grad_norm": 0.46144166588783264, "learning_rate": 1.958274953126693e-06, "loss": 2.8314, "step": 12460 }, { "epoch": 0.9202952029520295, "grad_norm": 0.45646706223487854, "learning_rate": 1.922739160268089e-06, "loss": 2.8796, "step": 12470 }, { "epoch": 0.9210332103321033, "grad_norm": 0.49224853515625, "learning_rate": 1.8875224297980332e-06, "loss": 2.7904, "step": 12480 }, { "epoch": 0.9217712177121771, "grad_norm": 0.44804316759109497, "learning_rate": 1.8526249954306241e-06, "loss": 2.7583, "step": 12490 }, { "epoch": 0.922509225092251, "grad_norm": 0.43229466676712036, "learning_rate": 1.8180470887609769e-06, "loss": 2.8608, "step": 12500 }, { "epoch": 0.9232472324723248, "grad_norm": 0.43958374857902527, "learning_rate": 1.7837889392636864e-06, "loss": 2.8282, "step": 12510 }, { "epoch": 0.9239852398523986, "grad_norm": 0.4417596459388733, "learning_rate": 1.7498507742912784e-06, "loss": 2.8048, "step": 12520 }, { "epoch": 0.9247232472324723, "grad_norm": 0.4306926429271698, "learning_rate": 1.7162328190727217e-06, "loss": 2.8095, "step": 12530 }, { "epoch": 0.9254612546125461, "grad_norm": 0.439455509185791, "learning_rate": 1.682935296711935e-06, "loss": 2.7822, "step": 12540 }, { "epoch": 0.9261992619926199, "grad_norm": 0.4519449472427368, "learning_rate": 1.6499584281862935e-06, "loss": 2.8494, "step": 12550 }, { "epoch": 0.9269372693726937, "grad_norm": 0.4483802318572998, "learning_rate": 1.6173024323451747e-06, "loss": 2.8629, "step": 12560 }, { "epoch": 0.9276752767527675, "grad_norm": 0.4460211396217346, "learning_rate": 1.5849675259084872e-06, "loss": 2.8258, "step": 12570 }, { "epoch": 0.9284132841328413, "grad_norm": 0.43958115577697754, "learning_rate": 1.5529539234652668e-06, "loss": 2.8093, "step": 12580 }, { "epoch": 0.9291512915129151, "grad_norm": 0.46250835061073303, "learning_rate": 1.5212618374722155e-06, "loss": 2.828, "step": 12590 }, { "epoch": 0.9298892988929889, "grad_norm": 0.46097636222839355, "learning_rate": 1.4898914782523143e-06, "loss": 2.8305, "step": 12600 }, { "epoch": 0.9306273062730628, "grad_norm": 0.4385923445224762, "learning_rate": 1.458843053993403e-06, "loss": 2.7875, "step": 12610 }, { "epoch": 0.9313653136531366, "grad_norm": 0.44254031777381897, "learning_rate": 1.4281167707468457e-06, "loss": 2.8113, "step": 12620 }, { "epoch": 0.9321033210332104, "grad_norm": 0.4598987102508545, "learning_rate": 1.3977128324261068e-06, "loss": 2.8511, "step": 12630 }, { "epoch": 0.9328413284132842, "grad_norm": 0.4526178240776062, "learning_rate": 1.3676314408054391e-06, "loss": 2.7979, "step": 12640 }, { "epoch": 0.933579335793358, "grad_norm": 0.45094090700149536, "learning_rate": 1.3378727955185244e-06, "loss": 2.8319, "step": 12650 }, { "epoch": 0.9343173431734317, "grad_norm": 0.45027512311935425, "learning_rate": 1.3084370940571577e-06, "loss": 2.8245, "step": 12660 }, { "epoch": 0.9350553505535055, "grad_norm": 0.4329124391078949, "learning_rate": 1.2793245317699321e-06, "loss": 2.7542, "step": 12670 }, { "epoch": 0.9357933579335793, "grad_norm": 0.4586227536201477, "learning_rate": 1.2505353018609444e-06, "loss": 2.7729, "step": 12680 }, { "epoch": 0.9365313653136531, "grad_norm": 0.4397171437740326, "learning_rate": 1.2220695953885031e-06, "loss": 2.8164, "step": 12690 }, { "epoch": 0.9372693726937269, "grad_norm": 0.4415930211544037, "learning_rate": 1.1939276012638723e-06, "loss": 2.8644, "step": 12700 }, { "epoch": 0.9380073800738007, "grad_norm": 0.43980923295021057, "learning_rate": 1.1661095062500237e-06, "loss": 2.8716, "step": 12710 }, { "epoch": 0.9387453874538746, "grad_norm": 0.46194180846214294, "learning_rate": 1.1386154949603934e-06, "loss": 2.8307, "step": 12720 }, { "epoch": 0.9394833948339484, "grad_norm": 0.4496355652809143, "learning_rate": 1.1114457498576258e-06, "loss": 2.7868, "step": 12730 }, { "epoch": 0.9402214022140222, "grad_norm": 0.4483359456062317, "learning_rate": 1.0846004512524211e-06, "loss": 2.8357, "step": 12740 }, { "epoch": 0.940959409594096, "grad_norm": 0.44404512643814087, "learning_rate": 1.0580797773022733e-06, "loss": 2.8843, "step": 12750 }, { "epoch": 0.9416974169741698, "grad_norm": 0.4440787136554718, "learning_rate": 1.03188390401035e-06, "loss": 2.8038, "step": 12760 }, { "epoch": 0.9424354243542435, "grad_norm": 0.4445192813873291, "learning_rate": 1.006013005224271e-06, "loss": 2.813, "step": 12770 }, { "epoch": 0.9431734317343173, "grad_norm": 0.4234587550163269, "learning_rate": 9.80467252634998e-07, "loss": 2.8414, "step": 12780 }, { "epoch": 0.9439114391143911, "grad_norm": 0.4393916726112366, "learning_rate": 9.552468157756622e-07, "loss": 2.7851, "step": 12790 }, { "epoch": 0.9446494464944649, "grad_norm": 0.4591200053691864, "learning_rate": 9.303518620204677e-07, "loss": 2.8378, "step": 12800 }, { "epoch": 0.9453874538745387, "grad_norm": 0.43322470784187317, "learning_rate": 9.057825565835399e-07, "loss": 2.7366, "step": 12810 }, { "epoch": 0.9461254612546125, "grad_norm": 0.4324533939361572, "learning_rate": 8.815390625178887e-07, "loss": 2.7483, "step": 12820 }, { "epoch": 0.9468634686346864, "grad_norm": 0.4632011950016022, "learning_rate": 8.576215407142651e-07, "loss": 2.7874, "step": 12830 }, { "epoch": 0.9476014760147602, "grad_norm": 0.4332893490791321, "learning_rate": 8.340301499001446e-07, "loss": 2.8252, "step": 12840 }, { "epoch": 0.948339483394834, "grad_norm": 0.436294287443161, "learning_rate": 8.107650466386285e-07, "loss": 2.8445, "step": 12850 }, { "epoch": 0.9490774907749078, "grad_norm": 0.43967026472091675, "learning_rate": 7.878263853274281e-07, "loss": 2.8411, "step": 12860 }, { "epoch": 0.9498154981549816, "grad_norm": 0.45120909810066223, "learning_rate": 7.652143181978655e-07, "loss": 2.8118, "step": 12870 }, { "epoch": 0.9505535055350554, "grad_norm": 0.4368390738964081, "learning_rate": 7.429289953138019e-07, "loss": 2.8086, "step": 12880 }, { "epoch": 0.9512915129151291, "grad_norm": 0.4452465772628784, "learning_rate": 7.209705645706944e-07, "loss": 2.8468, "step": 12890 }, { "epoch": 0.9520295202952029, "grad_norm": 0.4445231258869171, "learning_rate": 6.993391716946019e-07, "loss": 2.8114, "step": 12900 }, { "epoch": 0.9527675276752767, "grad_norm": 0.43402281403541565, "learning_rate": 6.780349602411918e-07, "loss": 2.8352, "step": 12910 }, { "epoch": 0.9535055350553505, "grad_norm": 0.45803192257881165, "learning_rate": 6.570580715948404e-07, "loss": 2.8013, "step": 12920 }, { "epoch": 0.9542435424354243, "grad_norm": 0.45193520188331604, "learning_rate": 6.364086449676232e-07, "loss": 2.8368, "step": 12930 }, { "epoch": 0.9549815498154982, "grad_norm": 0.44040247797966003, "learning_rate": 6.160868173984591e-07, "loss": 2.8559, "step": 12940 }, { "epoch": 0.955719557195572, "grad_norm": 0.4719098210334778, "learning_rate": 5.960927237521563e-07, "loss": 2.85, "step": 12950 }, { "epoch": 0.9564575645756458, "grad_norm": 0.4502539336681366, "learning_rate": 5.764264967185462e-07, "loss": 2.9074, "step": 12960 }, { "epoch": 0.9571955719557196, "grad_norm": 0.4299696683883667, "learning_rate": 5.570882668115784e-07, "loss": 2.7595, "step": 12970 }, { "epoch": 0.9579335793357934, "grad_norm": 0.44181373715400696, "learning_rate": 5.380781623684661e-07, "loss": 2.8024, "step": 12980 }, { "epoch": 0.9586715867158672, "grad_norm": 0.437763512134552, "learning_rate": 5.193963095488419e-07, "loss": 2.8231, "step": 12990 }, { "epoch": 0.959409594095941, "grad_norm": 0.4234910011291504, "learning_rate": 5.010428323339033e-07, "loss": 2.8898, "step": 13000 }, { "epoch": 0.9601476014760147, "grad_norm": 0.45260801911354065, "learning_rate": 4.830178525256079e-07, "loss": 2.8558, "step": 13010 }, { "epoch": 0.9608856088560885, "grad_norm": 0.4440422058105469, "learning_rate": 4.653214897458513e-07, "loss": 2.8007, "step": 13020 }, { "epoch": 0.9616236162361623, "grad_norm": 0.4362104833126068, "learning_rate": 4.4795386143567374e-07, "loss": 2.8271, "step": 13030 }, { "epoch": 0.9623616236162361, "grad_norm": 0.44079768657684326, "learning_rate": 4.309150828544939e-07, "loss": 2.8371, "step": 13040 }, { "epoch": 0.9630996309963099, "grad_norm": 0.46145325899124146, "learning_rate": 4.1420526707933727e-07, "loss": 2.8808, "step": 13050 }, { "epoch": 0.9638376383763838, "grad_norm": 0.4297032058238983, "learning_rate": 3.978245250040702e-07, "loss": 2.8506, "step": 13060 }, { "epoch": 0.9645756457564576, "grad_norm": 0.4474189579486847, "learning_rate": 3.817729653386892e-07, "loss": 2.8261, "step": 13070 }, { "epoch": 0.9653136531365314, "grad_norm": 0.43458986282348633, "learning_rate": 3.660506946085829e-07, "loss": 2.8319, "step": 13080 }, { "epoch": 0.9660516605166052, "grad_norm": 0.4418502151966095, "learning_rate": 3.506578171538377e-07, "loss": 2.8326, "step": 13090 }, { "epoch": 0.966789667896679, "grad_norm": 0.4373183846473694, "learning_rate": 3.355944351285278e-07, "loss": 2.7896, "step": 13100 }, { "epoch": 0.9675276752767528, "grad_norm": 0.4467260241508484, "learning_rate": 3.2086064850004314e-07, "loss": 2.8499, "step": 13110 }, { "epoch": 0.9682656826568266, "grad_norm": 0.45079532265663147, "learning_rate": 3.064565550484455e-07, "loss": 2.8005, "step": 13120 }, { "epoch": 0.9690036900369003, "grad_norm": 0.4311223328113556, "learning_rate": 2.9238225036579693e-07, "loss": 2.8419, "step": 13130 }, { "epoch": 0.9697416974169741, "grad_norm": 0.4524695575237274, "learning_rate": 2.7863782785552685e-07, "loss": 2.8581, "step": 13140 }, { "epoch": 0.9704797047970479, "grad_norm": 0.4483130872249603, "learning_rate": 2.65223378731827e-07, "loss": 2.8275, "step": 13150 }, { "epoch": 0.9712177121771217, "grad_norm": 0.4370816946029663, "learning_rate": 2.521389920190298e-07, "loss": 2.8673, "step": 13160 }, { "epoch": 0.9719557195571956, "grad_norm": 0.444195032119751, "learning_rate": 2.3938475455103083e-07, "loss": 2.9407, "step": 13170 }, { "epoch": 0.9726937269372694, "grad_norm": 0.44004592299461365, "learning_rate": 2.269607509707006e-07, "loss": 2.8481, "step": 13180 }, { "epoch": 0.9734317343173432, "grad_norm": 0.44630327820777893, "learning_rate": 2.1486706372932375e-07, "loss": 2.7954, "step": 13190 }, { "epoch": 0.974169741697417, "grad_norm": 0.42796429991722107, "learning_rate": 2.031037730860774e-07, "loss": 2.8533, "step": 13200 }, { "epoch": 0.9749077490774908, "grad_norm": 0.4611528217792511, "learning_rate": 1.916709571074482e-07, "loss": 2.8151, "step": 13210 }, { "epoch": 0.9756457564575646, "grad_norm": 0.451028972864151, "learning_rate": 1.8056869166677703e-07, "loss": 2.8355, "step": 13220 }, { "epoch": 0.9763837638376384, "grad_norm": 0.4451844096183777, "learning_rate": 1.6979705044369297e-07, "loss": 2.8121, "step": 13230 }, { "epoch": 0.9771217712177122, "grad_norm": 0.4613220989704132, "learning_rate": 1.5935610492366915e-07, "loss": 2.9067, "step": 13240 }, { "epoch": 0.977859778597786, "grad_norm": 0.44495347142219543, "learning_rate": 1.4924592439753416e-07, "loss": 2.7666, "step": 13250 }, { "epoch": 0.9785977859778597, "grad_norm": 0.4585348963737488, "learning_rate": 1.394665759610003e-07, "loss": 2.7254, "step": 13260 }, { "epoch": 0.9793357933579335, "grad_norm": 0.43729352951049805, "learning_rate": 1.3001812451423068e-07, "loss": 2.778, "step": 13270 }, { "epoch": 0.9800738007380074, "grad_norm": 0.450089693069458, "learning_rate": 1.209006327614226e-07, "loss": 2.809, "step": 13280 }, { "epoch": 0.9808118081180812, "grad_norm": 0.43959712982177734, "learning_rate": 1.1211416121035823e-07, "loss": 2.8325, "step": 13290 }, { "epoch": 0.981549815498155, "grad_norm": 0.4504597783088684, "learning_rate": 1.036587681720269e-07, "loss": 2.7841, "step": 13300 }, { "epoch": 0.9822878228782288, "grad_norm": 0.44741228222846985, "learning_rate": 9.55345097602256e-08, "loss": 2.8358, "step": 13310 }, { "epoch": 0.9830258302583026, "grad_norm": 0.4463639557361603, "learning_rate": 8.774143989119798e-08, "loss": 2.8313, "step": 13320 }, { "epoch": 0.9837638376383764, "grad_norm": 0.4775594472885132, "learning_rate": 8.027961028328479e-08, "loss": 2.8781, "step": 13330 }, { "epoch": 0.9845018450184502, "grad_norm": 0.4243060350418091, "learning_rate": 7.314907045653519e-08, "loss": 2.7926, "step": 13340 }, { "epoch": 0.985239852398524, "grad_norm": 0.43475958704948425, "learning_rate": 6.634986773244034e-08, "loss": 2.7885, "step": 13350 }, { "epoch": 0.9859778597785978, "grad_norm": 0.4415262043476105, "learning_rate": 5.988204723356705e-08, "loss": 2.7721, "step": 13360 }, { "epoch": 0.9867158671586715, "grad_norm": 0.438672810792923, "learning_rate": 5.374565188329683e-08, "loss": 2.8138, "step": 13370 }, { "epoch": 0.9874538745387453, "grad_norm": 0.46068814396858215, "learning_rate": 4.794072240550951e-08, "loss": 2.7988, "step": 13380 }, { "epoch": 0.9881918819188192, "grad_norm": 0.44185954332351685, "learning_rate": 4.246729732434451e-08, "loss": 2.7823, "step": 13390 }, { "epoch": 0.988929889298893, "grad_norm": 0.4282056391239166, "learning_rate": 3.7325412963912235e-08, "loss": 2.872, "step": 13400 }, { "epoch": 0.9896678966789668, "grad_norm": 0.46537652611732483, "learning_rate": 3.251510344807751e-08, "loss": 2.9374, "step": 13410 }, { "epoch": 0.9904059040590406, "grad_norm": 0.4430101215839386, "learning_rate": 2.8036400700232058e-08, "loss": 2.7839, "step": 13420 }, { "epoch": 0.9911439114391144, "grad_norm": 0.45416316390037537, "learning_rate": 2.3889334443055744e-08, "loss": 2.8689, "step": 13430 }, { "epoch": 0.9918819188191882, "grad_norm": 0.4388124346733093, "learning_rate": 2.007393219836118e-08, "loss": 2.9239, "step": 13440 }, { "epoch": 0.992619926199262, "grad_norm": 0.43018996715545654, "learning_rate": 1.6590219286871655e-08, "loss": 2.8412, "step": 13450 }, { "epoch": 0.9933579335793358, "grad_norm": 0.42218539118766785, "learning_rate": 1.3438218828076832e-08, "loss": 2.7462, "step": 13460 }, { "epoch": 0.9940959409594096, "grad_norm": 0.4494752883911133, "learning_rate": 1.0617951740077292e-08, "loss": 2.8598, "step": 13470 }, { "epoch": 0.9948339483394834, "grad_norm": 0.41235294938087463, "learning_rate": 8.12943673943467e-09, "loss": 2.8083, "step": 13480 }, { "epoch": 0.9955719557195571, "grad_norm": 0.4434475004673004, "learning_rate": 5.9726903410661786e-09, "loss": 2.929, "step": 13490 }, { "epoch": 0.996309963099631, "grad_norm": 0.43739476799964905, "learning_rate": 4.147726858100276e-09, "loss": 2.844, "step": 13500 }, { "epoch": 0.9970479704797048, "grad_norm": 0.46633192896842957, "learning_rate": 2.6545584018211613e-09, "loss": 2.8096, "step": 13510 }, { "epoch": 0.9977859778597786, "grad_norm": 0.4500004053115845, "learning_rate": 1.4931948815744e-09, "loss": 2.8317, "step": 13520 }, { "epoch": 0.9985239852398524, "grad_norm": 0.45538780093193054, "learning_rate": 6.636440046892123e-10, "loss": 2.8792, "step": 13530 }, { "epoch": 0.9992619926199262, "grad_norm": 0.4632636308670044, "learning_rate": 1.6591127643961202e-10, "loss": 2.8205, "step": 13540 }, { "epoch": 1.0, "grad_norm": 0.4356023073196411, "learning_rate": 0.0, "loss": 2.8161, "step": 13550 }, { "epoch": 1.0, "step": 13550, "total_flos": 5.404563590201999e+18, "train_loss": 3.236852196415412, "train_runtime": 292848.6684, "train_samples_per_second": 0.74, "train_steps_per_second": 0.046 } ], "logging_steps": 10, "max_steps": 13550, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.404563590201999e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }