diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,9527 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 13550, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007380073800738007, + "grad_norm": 100.91170501708984, + "learning_rate": 7.380073800738008e-07, + "loss": 15.7032, + "step": 10 + }, + { + "epoch": 0.0014760147601476014, + "grad_norm": 53.92641830444336, + "learning_rate": 1.4760147601476015e-06, + "loss": 12.3986, + "step": 20 + }, + { + "epoch": 0.002214022140221402, + "grad_norm": 22.292367935180664, + "learning_rate": 2.2140221402214023e-06, + "loss": 10.8644, + "step": 30 + }, + { + "epoch": 0.002952029520295203, + "grad_norm": 9.494361877441406, + "learning_rate": 2.952029520295203e-06, + "loss": 10.0464, + "step": 40 + }, + { + "epoch": 0.0036900369003690036, + "grad_norm": 5.657174110412598, + "learning_rate": 3.690036900369004e-06, + "loss": 9.8267, + "step": 50 + }, + { + "epoch": 0.004428044280442804, + "grad_norm": 11.480326652526855, + "learning_rate": 4.428044280442805e-06, + "loss": 9.6732, + "step": 60 + }, + { + "epoch": 0.0051660516605166054, + "grad_norm": 32.78021240234375, + "learning_rate": 5.166051660516605e-06, + "loss": 9.6283, + "step": 70 + }, + { + "epoch": 0.005904059040590406, + "grad_norm": 26.172266006469727, + "learning_rate": 5.904059040590406e-06, + "loss": 9.4751, + "step": 80 + }, + { + "epoch": 0.006642066420664207, + "grad_norm": 33.70742416381836, + "learning_rate": 6.642066420664207e-06, + "loss": 9.3959, + "step": 90 + }, + { + "epoch": 0.007380073800738007, + "grad_norm": 63.11279296875, + "learning_rate": 7.380073800738008e-06, + "loss": 9.3828, + "step": 100 + }, + { + "epoch": 0.008118081180811807, + "grad_norm": 15.8975191116333, + "learning_rate": 8.118081180811808e-06, + "loss": 9.2352, + "step": 110 + }, + { + "epoch": 0.008856088560885609, + "grad_norm": 12.312295913696289, + "learning_rate": 8.85608856088561e-06, + "loss": 9.1436, + "step": 120 + }, + { + "epoch": 0.00959409594095941, + "grad_norm": 10.606693267822266, + "learning_rate": 9.59409594095941e-06, + "loss": 8.8854, + "step": 130 + }, + { + "epoch": 0.010332103321033211, + "grad_norm": 16.000524520874023, + "learning_rate": 1.033210332103321e-05, + "loss": 8.703, + "step": 140 + }, + { + "epoch": 0.01107011070110701, + "grad_norm": 14.297750473022461, + "learning_rate": 1.1070110701107012e-05, + "loss": 8.5243, + "step": 150 + }, + { + "epoch": 0.011808118081180811, + "grad_norm": 11.472665786743164, + "learning_rate": 1.1808118081180812e-05, + "loss": 8.232, + "step": 160 + }, + { + "epoch": 0.012546125461254613, + "grad_norm": 7.633975028991699, + "learning_rate": 1.2546125461254612e-05, + "loss": 8.0453, + "step": 170 + }, + { + "epoch": 0.013284132841328414, + "grad_norm": 7.606258869171143, + "learning_rate": 1.3284132841328414e-05, + "loss": 7.9445, + "step": 180 + }, + { + "epoch": 0.014022140221402213, + "grad_norm": 13.680715560913086, + "learning_rate": 1.4022140221402214e-05, + "loss": 7.9335, + "step": 190 + }, + { + "epoch": 0.014760147601476014, + "grad_norm": 10.28775405883789, + "learning_rate": 1.4760147601476015e-05, + "loss": 7.7923, + "step": 200 + }, + { + "epoch": 0.015498154981549815, + "grad_norm": 7.461697101593018, + "learning_rate": 1.5498154981549817e-05, + "loss": 7.763, + "step": 210 + }, + { + "epoch": 
0.016236162361623615, + "grad_norm": 4.384743690490723, + "learning_rate": 1.6236162361623615e-05, + "loss": 7.7702, + "step": 220 + }, + { + "epoch": 0.016974169741697416, + "grad_norm": 5.806989669799805, + "learning_rate": 1.6974169741697417e-05, + "loss": 7.76, + "step": 230 + }, + { + "epoch": 0.017712177121771217, + "grad_norm": 5.75732421875, + "learning_rate": 1.771217712177122e-05, + "loss": 7.6101, + "step": 240 + }, + { + "epoch": 0.01845018450184502, + "grad_norm": 3.3278968334198, + "learning_rate": 1.845018450184502e-05, + "loss": 7.5669, + "step": 250 + }, + { + "epoch": 0.01918819188191882, + "grad_norm": 5.252697467803955, + "learning_rate": 1.918819188191882e-05, + "loss": 7.3905, + "step": 260 + }, + { + "epoch": 0.01992619926199262, + "grad_norm": 3.135658025741577, + "learning_rate": 1.992619926199262e-05, + "loss": 7.3472, + "step": 270 + }, + { + "epoch": 0.020664206642066422, + "grad_norm": 5.030785083770752, + "learning_rate": 2.066420664206642e-05, + "loss": 7.2426, + "step": 280 + }, + { + "epoch": 0.021402214022140223, + "grad_norm": 4.882932186126709, + "learning_rate": 2.140221402214022e-05, + "loss": 7.0541, + "step": 290 + }, + { + "epoch": 0.02214022140221402, + "grad_norm": 2.2638933658599854, + "learning_rate": 2.2140221402214025e-05, + "loss": 7.0113, + "step": 300 + }, + { + "epoch": 0.022878228782287822, + "grad_norm": 4.782796859741211, + "learning_rate": 2.2878228782287826e-05, + "loss": 6.8661, + "step": 310 + }, + { + "epoch": 0.023616236162361623, + "grad_norm": 1.9799453020095825, + "learning_rate": 2.3616236162361624e-05, + "loss": 7.0323, + "step": 320 + }, + { + "epoch": 0.024354243542435424, + "grad_norm": 4.8417558670043945, + "learning_rate": 2.4354243542435426e-05, + "loss": 6.8865, + "step": 330 + }, + { + "epoch": 0.025092250922509225, + "grad_norm": 4.531852722167969, + "learning_rate": 2.5092250922509224e-05, + "loss": 6.7982, + "step": 340 + }, + { + "epoch": 0.025830258302583026, + "grad_norm": 3.0997111797332764, + "learning_rate": 2.5830258302583026e-05, + "loss": 6.79, + "step": 350 + }, + { + "epoch": 0.026568265682656828, + "grad_norm": 3.2981128692626953, + "learning_rate": 2.6568265682656828e-05, + "loss": 6.7459, + "step": 360 + }, + { + "epoch": 0.02730627306273063, + "grad_norm": 3.313589572906494, + "learning_rate": 2.730627306273063e-05, + "loss": 6.637, + "step": 370 + }, + { + "epoch": 0.028044280442804426, + "grad_norm": 2.1940433979034424, + "learning_rate": 2.8044280442804427e-05, + "loss": 6.5645, + "step": 380 + }, + { + "epoch": 0.028782287822878228, + "grad_norm": 3.6912360191345215, + "learning_rate": 2.878228782287823e-05, + "loss": 6.4398, + "step": 390 + }, + { + "epoch": 0.02952029520295203, + "grad_norm": 3.37406325340271, + "learning_rate": 2.952029520295203e-05, + "loss": 6.4774, + "step": 400 + }, + { + "epoch": 0.03025830258302583, + "grad_norm": 3.22963285446167, + "learning_rate": 3.0258302583025832e-05, + "loss": 6.3106, + "step": 410 + }, + { + "epoch": 0.03099630996309963, + "grad_norm": 2.419431686401367, + "learning_rate": 3.0996309963099634e-05, + "loss": 6.3172, + "step": 420 + }, + { + "epoch": 0.03173431734317343, + "grad_norm": 2.677661895751953, + "learning_rate": 3.173431734317343e-05, + "loss": 6.1359, + "step": 430 + }, + { + "epoch": 0.03247232472324723, + "grad_norm": 2.795398712158203, + "learning_rate": 3.247232472324723e-05, + "loss": 6.2412, + "step": 440 + }, + { + "epoch": 0.033210332103321034, + "grad_norm": 2.9979288578033447, + "learning_rate": 3.3210332103321035e-05, + 
"loss": 6.2192, + "step": 450 + }, + { + "epoch": 0.03394833948339483, + "grad_norm": 3.352975845336914, + "learning_rate": 3.3948339483394833e-05, + "loss": 6.1654, + "step": 460 + }, + { + "epoch": 0.03468634686346864, + "grad_norm": 2.6526570320129395, + "learning_rate": 3.468634686346864e-05, + "loss": 6.0669, + "step": 470 + }, + { + "epoch": 0.035424354243542434, + "grad_norm": 2.950063467025757, + "learning_rate": 3.542435424354244e-05, + "loss": 6.0332, + "step": 480 + }, + { + "epoch": 0.03616236162361624, + "grad_norm": 4.221488952636719, + "learning_rate": 3.6162361623616235e-05, + "loss": 5.9708, + "step": 490 + }, + { + "epoch": 0.03690036900369004, + "grad_norm": 2.6405985355377197, + "learning_rate": 3.690036900369004e-05, + "loss": 5.76, + "step": 500 + }, + { + "epoch": 0.037638376383763834, + "grad_norm": 4.019585132598877, + "learning_rate": 3.763837638376384e-05, + "loss": 5.9036, + "step": 510 + }, + { + "epoch": 0.03837638376383764, + "grad_norm": 2.687580108642578, + "learning_rate": 3.837638376383764e-05, + "loss": 5.808, + "step": 520 + }, + { + "epoch": 0.03911439114391144, + "grad_norm": 3.339268207550049, + "learning_rate": 3.911439114391144e-05, + "loss": 5.7391, + "step": 530 + }, + { + "epoch": 0.03985239852398524, + "grad_norm": 3.0441882610321045, + "learning_rate": 3.985239852398524e-05, + "loss": 5.7045, + "step": 540 + }, + { + "epoch": 0.04059040590405904, + "grad_norm": 2.8957650661468506, + "learning_rate": 4.0590405904059045e-05, + "loss": 5.6956, + "step": 550 + }, + { + "epoch": 0.041328413284132844, + "grad_norm": 3.6834869384765625, + "learning_rate": 4.132841328413284e-05, + "loss": 5.6103, + "step": 560 + }, + { + "epoch": 0.04206642066420664, + "grad_norm": 3.4573564529418945, + "learning_rate": 4.206642066420665e-05, + "loss": 5.4107, + "step": 570 + }, + { + "epoch": 0.042804428044280446, + "grad_norm": 3.2341487407684326, + "learning_rate": 4.280442804428044e-05, + "loss": 5.4208, + "step": 580 + }, + { + "epoch": 0.043542435424354244, + "grad_norm": 3.6147806644439697, + "learning_rate": 4.3542435424354244e-05, + "loss": 5.4217, + "step": 590 + }, + { + "epoch": 0.04428044280442804, + "grad_norm": 3.6139488220214844, + "learning_rate": 4.428044280442805e-05, + "loss": 5.3994, + "step": 600 + }, + { + "epoch": 0.045018450184501846, + "grad_norm": 3.277580499649048, + "learning_rate": 4.501845018450185e-05, + "loss": 5.3605, + "step": 610 + }, + { + "epoch": 0.045756457564575644, + "grad_norm": 2.5641043186187744, + "learning_rate": 4.575645756457565e-05, + "loss": 5.1682, + "step": 620 + }, + { + "epoch": 0.04649446494464945, + "grad_norm": 2.422578811645508, + "learning_rate": 4.6494464944649444e-05, + "loss": 5.1585, + "step": 630 + }, + { + "epoch": 0.047232472324723246, + "grad_norm": 4.027858257293701, + "learning_rate": 4.723247232472325e-05, + "loss": 5.2147, + "step": 640 + }, + { + "epoch": 0.04797047970479705, + "grad_norm": 2.401747226715088, + "learning_rate": 4.797047970479705e-05, + "loss": 5.0839, + "step": 650 + }, + { + "epoch": 0.04870848708487085, + "grad_norm": 2.9220759868621826, + "learning_rate": 4.870848708487085e-05, + "loss": 5.1216, + "step": 660 + }, + { + "epoch": 0.04944649446494465, + "grad_norm": 2.4891719818115234, + "learning_rate": 4.944649446494466e-05, + "loss": 4.9789, + "step": 670 + }, + { + "epoch": 0.05018450184501845, + "grad_norm": 2.279683828353882, + "learning_rate": 5.018450184501845e-05, + "loss": 4.9611, + "step": 680 + }, + { + "epoch": 0.05092250922509225, + "grad_norm": 2.045536518096924, 
+ "learning_rate": 5.0922509225092254e-05, + "loss": 4.8993, + "step": 690 + }, + { + "epoch": 0.05166051660516605, + "grad_norm": 1.9132373332977295, + "learning_rate": 5.166051660516605e-05, + "loss": 4.8083, + "step": 700 + }, + { + "epoch": 0.05239852398523985, + "grad_norm": 2.304215669631958, + "learning_rate": 5.239852398523986e-05, + "loss": 4.828, + "step": 710 + }, + { + "epoch": 0.053136531365313655, + "grad_norm": 2.2891597747802734, + "learning_rate": 5.3136531365313655e-05, + "loss": 4.7838, + "step": 720 + }, + { + "epoch": 0.05387453874538745, + "grad_norm": 2.411600351333618, + "learning_rate": 5.387453874538746e-05, + "loss": 4.6931, + "step": 730 + }, + { + "epoch": 0.05461254612546126, + "grad_norm": 1.6772541999816895, + "learning_rate": 5.461254612546126e-05, + "loss": 4.7027, + "step": 740 + }, + { + "epoch": 0.055350553505535055, + "grad_norm": 1.7979137897491455, + "learning_rate": 5.535055350553506e-05, + "loss": 4.7452, + "step": 750 + }, + { + "epoch": 0.05608856088560885, + "grad_norm": 2.3298912048339844, + "learning_rate": 5.6088560885608855e-05, + "loss": 4.7062, + "step": 760 + }, + { + "epoch": 0.05682656826568266, + "grad_norm": 1.986875295639038, + "learning_rate": 5.682656826568265e-05, + "loss": 4.6142, + "step": 770 + }, + { + "epoch": 0.057564575645756455, + "grad_norm": 1.8501532077789307, + "learning_rate": 5.756457564575646e-05, + "loss": 4.4524, + "step": 780 + }, + { + "epoch": 0.05830258302583026, + "grad_norm": 1.5959872007369995, + "learning_rate": 5.830258302583026e-05, + "loss": 4.5299, + "step": 790 + }, + { + "epoch": 0.05904059040590406, + "grad_norm": 2.339456796646118, + "learning_rate": 5.904059040590406e-05, + "loss": 4.4703, + "step": 800 + }, + { + "epoch": 0.05977859778597786, + "grad_norm": 1.6436880826950073, + "learning_rate": 5.9778597785977866e-05, + "loss": 4.463, + "step": 810 + }, + { + "epoch": 0.06051660516605166, + "grad_norm": 1.7336505651474, + "learning_rate": 6.0516605166051664e-05, + "loss": 4.4363, + "step": 820 + }, + { + "epoch": 0.061254612546125464, + "grad_norm": 1.691726565361023, + "learning_rate": 6.125461254612547e-05, + "loss": 4.4099, + "step": 830 + }, + { + "epoch": 0.06199261992619926, + "grad_norm": 1.5019862651824951, + "learning_rate": 6.199261992619927e-05, + "loss": 4.4153, + "step": 840 + }, + { + "epoch": 0.06273062730627306, + "grad_norm": 1.4851793050765991, + "learning_rate": 6.273062730627307e-05, + "loss": 4.4721, + "step": 850 + }, + { + "epoch": 0.06346863468634686, + "grad_norm": 1.4793798923492432, + "learning_rate": 6.346863468634686e-05, + "loss": 4.3287, + "step": 860 + }, + { + "epoch": 0.06420664206642067, + "grad_norm": 1.5791796445846558, + "learning_rate": 6.420664206642066e-05, + "loss": 4.3766, + "step": 870 + }, + { + "epoch": 0.06494464944649446, + "grad_norm": 1.5449219942092896, + "learning_rate": 6.494464944649446e-05, + "loss": 4.2412, + "step": 880 + }, + { + "epoch": 0.06568265682656826, + "grad_norm": 1.229464054107666, + "learning_rate": 6.568265682656827e-05, + "loss": 4.1558, + "step": 890 + }, + { + "epoch": 0.06642066420664207, + "grad_norm": 1.5863291025161743, + "learning_rate": 6.642066420664207e-05, + "loss": 4.2074, + "step": 900 + }, + { + "epoch": 0.06715867158671587, + "grad_norm": 1.319446086883545, + "learning_rate": 6.715867158671587e-05, + "loss": 4.258, + "step": 910 + }, + { + "epoch": 0.06789667896678966, + "grad_norm": 1.3132025003433228, + "learning_rate": 6.789667896678967e-05, + "loss": 4.1444, + "step": 920 + }, + { + "epoch": 
0.06863468634686347, + "grad_norm": 1.5694645643234253, + "learning_rate": 6.863468634686348e-05, + "loss": 4.1201, + "step": 930 + }, + { + "epoch": 0.06937269372693727, + "grad_norm": 1.4163988828659058, + "learning_rate": 6.937269372693728e-05, + "loss": 4.1113, + "step": 940 + }, + { + "epoch": 0.07011070110701106, + "grad_norm": 1.487798810005188, + "learning_rate": 7.011070110701108e-05, + "loss": 4.0805, + "step": 950 + }, + { + "epoch": 0.07084870848708487, + "grad_norm": 1.2213908433914185, + "learning_rate": 7.084870848708487e-05, + "loss": 4.0324, + "step": 960 + }, + { + "epoch": 0.07158671586715867, + "grad_norm": 1.332588791847229, + "learning_rate": 7.158671586715867e-05, + "loss": 4.0455, + "step": 970 + }, + { + "epoch": 0.07232472324723248, + "grad_norm": 1.212963342666626, + "learning_rate": 7.232472324723247e-05, + "loss": 4.0044, + "step": 980 + }, + { + "epoch": 0.07306273062730627, + "grad_norm": 1.0928430557250977, + "learning_rate": 7.306273062730628e-05, + "loss": 4.0215, + "step": 990 + }, + { + "epoch": 0.07380073800738007, + "grad_norm": 1.1430400609970093, + "learning_rate": 7.380073800738008e-05, + "loss": 4.0744, + "step": 1000 + }, + { + "epoch": 0.07453874538745388, + "grad_norm": 0.9975944757461548, + "learning_rate": 7.453874538745388e-05, + "loss": 3.9955, + "step": 1010 + }, + { + "epoch": 0.07527675276752767, + "grad_norm": 1.1288777589797974, + "learning_rate": 7.527675276752768e-05, + "loss": 3.9916, + "step": 1020 + }, + { + "epoch": 0.07601476014760147, + "grad_norm": 1.0064688920974731, + "learning_rate": 7.601476014760149e-05, + "loss": 3.9025, + "step": 1030 + }, + { + "epoch": 0.07675276752767528, + "grad_norm": 1.329229474067688, + "learning_rate": 7.675276752767529e-05, + "loss": 3.9822, + "step": 1040 + }, + { + "epoch": 0.07749077490774908, + "grad_norm": 1.022760033607483, + "learning_rate": 7.749077490774908e-05, + "loss": 3.8762, + "step": 1050 + }, + { + "epoch": 0.07822878228782287, + "grad_norm": 1.0934398174285889, + "learning_rate": 7.822878228782288e-05, + "loss": 3.7911, + "step": 1060 + }, + { + "epoch": 0.07896678966789668, + "grad_norm": 1.008171796798706, + "learning_rate": 7.896678966789668e-05, + "loss": 3.8972, + "step": 1070 + }, + { + "epoch": 0.07970479704797048, + "grad_norm": 1.1563254594802856, + "learning_rate": 7.970479704797048e-05, + "loss": 3.7901, + "step": 1080 + }, + { + "epoch": 0.08044280442804429, + "grad_norm": 1.06783926486969, + "learning_rate": 8.044280442804428e-05, + "loss": 3.9768, + "step": 1090 + }, + { + "epoch": 0.08118081180811808, + "grad_norm": 1.0809143781661987, + "learning_rate": 8.118081180811809e-05, + "loss": 3.7534, + "step": 1100 + }, + { + "epoch": 0.08191881918819188, + "grad_norm": 1.040157675743103, + "learning_rate": 8.191881918819189e-05, + "loss": 3.7609, + "step": 1110 + }, + { + "epoch": 0.08265682656826569, + "grad_norm": 1.0198458433151245, + "learning_rate": 8.265682656826569e-05, + "loss": 3.8031, + "step": 1120 + }, + { + "epoch": 0.08339483394833948, + "grad_norm": 1.3017419576644897, + "learning_rate": 8.339483394833948e-05, + "loss": 3.8526, + "step": 1130 + }, + { + "epoch": 0.08413284132841328, + "grad_norm": 0.9285693168640137, + "learning_rate": 8.41328413284133e-05, + "loss": 3.7551, + "step": 1140 + }, + { + "epoch": 0.08487084870848709, + "grad_norm": 0.9603882431983948, + "learning_rate": 8.48708487084871e-05, + "loss": 3.834, + "step": 1150 + }, + { + "epoch": 0.08560885608856089, + "grad_norm": 0.9195291996002197, + "learning_rate": 8.560885608856088e-05, + 
"loss": 3.7804, + "step": 1160 + }, + { + "epoch": 0.08634686346863468, + "grad_norm": 1.0526838302612305, + "learning_rate": 8.634686346863469e-05, + "loss": 3.8757, + "step": 1170 + }, + { + "epoch": 0.08708487084870849, + "grad_norm": 0.8891322612762451, + "learning_rate": 8.708487084870849e-05, + "loss": 3.7513, + "step": 1180 + }, + { + "epoch": 0.08782287822878229, + "grad_norm": 0.9467900395393372, + "learning_rate": 8.782287822878229e-05, + "loss": 3.7555, + "step": 1190 + }, + { + "epoch": 0.08856088560885608, + "grad_norm": 1.0294831991195679, + "learning_rate": 8.85608856088561e-05, + "loss": 3.834, + "step": 1200 + }, + { + "epoch": 0.08929889298892989, + "grad_norm": 1.0832924842834473, + "learning_rate": 8.92988929889299e-05, + "loss": 3.6299, + "step": 1210 + }, + { + "epoch": 0.09003690036900369, + "grad_norm": 0.9595062732696533, + "learning_rate": 9.00369003690037e-05, + "loss": 3.7695, + "step": 1220 + }, + { + "epoch": 0.0907749077490775, + "grad_norm": 0.8714928030967712, + "learning_rate": 9.077490774907749e-05, + "loss": 3.7346, + "step": 1230 + }, + { + "epoch": 0.09151291512915129, + "grad_norm": 0.9189225435256958, + "learning_rate": 9.15129151291513e-05, + "loss": 3.7646, + "step": 1240 + }, + { + "epoch": 0.09225092250922509, + "grad_norm": 1.0212230682373047, + "learning_rate": 9.22509225092251e-05, + "loss": 3.6518, + "step": 1250 + }, + { + "epoch": 0.0929889298892989, + "grad_norm": 0.8631012439727783, + "learning_rate": 9.298892988929889e-05, + "loss": 3.6702, + "step": 1260 + }, + { + "epoch": 0.09372693726937269, + "grad_norm": 0.76339191198349, + "learning_rate": 9.37269372693727e-05, + "loss": 3.6757, + "step": 1270 + }, + { + "epoch": 0.09446494464944649, + "grad_norm": 0.8459323048591614, + "learning_rate": 9.44649446494465e-05, + "loss": 3.8173, + "step": 1280 + }, + { + "epoch": 0.0952029520295203, + "grad_norm": 0.7884580492973328, + "learning_rate": 9.52029520295203e-05, + "loss": 3.6719, + "step": 1290 + }, + { + "epoch": 0.0959409594095941, + "grad_norm": 0.9069279432296753, + "learning_rate": 9.59409594095941e-05, + "loss": 3.6447, + "step": 1300 + }, + { + "epoch": 0.09667896678966789, + "grad_norm": 0.8386545181274414, + "learning_rate": 9.66789667896679e-05, + "loss": 3.6681, + "step": 1310 + }, + { + "epoch": 0.0974169741697417, + "grad_norm": 0.8082497119903564, + "learning_rate": 9.74169741697417e-05, + "loss": 3.6526, + "step": 1320 + }, + { + "epoch": 0.0981549815498155, + "grad_norm": 0.7619675993919373, + "learning_rate": 9.81549815498155e-05, + "loss": 3.6717, + "step": 1330 + }, + { + "epoch": 0.0988929889298893, + "grad_norm": 0.803425133228302, + "learning_rate": 9.889298892988931e-05, + "loss": 3.5892, + "step": 1340 + }, + { + "epoch": 0.0996309963099631, + "grad_norm": 0.8372170925140381, + "learning_rate": 9.963099630996311e-05, + "loss": 3.6615, + "step": 1350 + }, + { + "epoch": 0.1003690036900369, + "grad_norm": 0.8343318700790405, + "learning_rate": 9.999995852216369e-05, + "loss": 3.5785, + "step": 1360 + }, + { + "epoch": 0.1011070110701107, + "grad_norm": 0.8367707133293152, + "learning_rate": 9.999962669988607e-05, + "loss": 3.625, + "step": 1370 + }, + { + "epoch": 0.1018450184501845, + "grad_norm": 0.8662716150283813, + "learning_rate": 9.999896305753297e-05, + "loss": 3.6656, + "step": 1380 + }, + { + "epoch": 0.1025830258302583, + "grad_norm": 0.747052788734436, + "learning_rate": 9.999796759950864e-05, + "loss": 3.5761, + "step": 1390 + }, + { + "epoch": 0.1033210332103321, + "grad_norm": 0.7763943672180176, + 
"learning_rate": 9.999664033241933e-05, + "loss": 3.5234, + "step": 1400 + }, + { + "epoch": 0.10405904059040591, + "grad_norm": 0.7435528039932251, + "learning_rate": 9.99949812650734e-05, + "loss": 3.5132, + "step": 1410 + }, + { + "epoch": 0.1047970479704797, + "grad_norm": 0.8303211331367493, + "learning_rate": 9.999299040848121e-05, + "loss": 3.5173, + "step": 1420 + }, + { + "epoch": 0.1055350553505535, + "grad_norm": 0.8359752297401428, + "learning_rate": 9.999066777585495e-05, + "loss": 3.5605, + "step": 1430 + }, + { + "epoch": 0.10627306273062731, + "grad_norm": 0.909545361995697, + "learning_rate": 9.998801338260865e-05, + "loss": 3.5839, + "step": 1440 + }, + { + "epoch": 0.1070110701107011, + "grad_norm": 0.845916748046875, + "learning_rate": 9.99850272463581e-05, + "loss": 3.5685, + "step": 1450 + }, + { + "epoch": 0.1077490774907749, + "grad_norm": 0.834235429763794, + "learning_rate": 9.99817093869206e-05, + "loss": 3.5476, + "step": 1460 + }, + { + "epoch": 0.10848708487084871, + "grad_norm": 0.7273171544075012, + "learning_rate": 9.997805982631499e-05, + "loss": 3.4777, + "step": 1470 + }, + { + "epoch": 0.10922509225092251, + "grad_norm": 0.839796245098114, + "learning_rate": 9.99740785887614e-05, + "loss": 3.5084, + "step": 1480 + }, + { + "epoch": 0.1099630996309963, + "grad_norm": 0.7638348340988159, + "learning_rate": 9.99697657006811e-05, + "loss": 3.5741, + "step": 1490 + }, + { + "epoch": 0.11070110701107011, + "grad_norm": 0.7195069193840027, + "learning_rate": 9.996512119069636e-05, + "loss": 3.5083, + "step": 1500 + }, + { + "epoch": 0.11143911439114391, + "grad_norm": 0.7351711392402649, + "learning_rate": 9.996014508963028e-05, + "loss": 3.365, + "step": 1510 + }, + { + "epoch": 0.1121771217712177, + "grad_norm": 0.7192705869674683, + "learning_rate": 9.995483743050648e-05, + "loss": 3.5233, + "step": 1520 + }, + { + "epoch": 0.11291512915129151, + "grad_norm": 0.7362285256385803, + "learning_rate": 9.994919824854898e-05, + "loss": 3.5548, + "step": 1530 + }, + { + "epoch": 0.11365313653136531, + "grad_norm": 0.6908057928085327, + "learning_rate": 9.994322758118196e-05, + "loss": 3.4293, + "step": 1540 + }, + { + "epoch": 0.11439114391143912, + "grad_norm": 0.7892534136772156, + "learning_rate": 9.993692546802941e-05, + "loss": 3.4583, + "step": 1550 + }, + { + "epoch": 0.11512915129151291, + "grad_norm": 0.7085639834403992, + "learning_rate": 9.993029195091505e-05, + "loss": 3.4349, + "step": 1560 + }, + { + "epoch": 0.11586715867158671, + "grad_norm": 0.7825974225997925, + "learning_rate": 9.992332707386188e-05, + "loss": 3.4496, + "step": 1570 + }, + { + "epoch": 0.11660516605166052, + "grad_norm": 0.7284643054008484, + "learning_rate": 9.991603088309194e-05, + "loss": 3.517, + "step": 1580 + }, + { + "epoch": 0.11734317343173432, + "grad_norm": 0.7682483792304993, + "learning_rate": 9.990840342702606e-05, + "loss": 3.4505, + "step": 1590 + }, + { + "epoch": 0.11808118081180811, + "grad_norm": 0.8391796350479126, + "learning_rate": 9.990044475628347e-05, + "loss": 3.5077, + "step": 1600 + }, + { + "epoch": 0.11881918819188192, + "grad_norm": 0.7043576836585999, + "learning_rate": 9.989215492368151e-05, + "loss": 3.4272, + "step": 1610 + }, + { + "epoch": 0.11955719557195572, + "grad_norm": 0.72553551197052, + "learning_rate": 9.988353398423527e-05, + "loss": 3.3559, + "step": 1620 + }, + { + "epoch": 0.12029520295202951, + "grad_norm": 0.7156850099563599, + "learning_rate": 9.987458199515713e-05, + "loss": 3.4108, + "step": 1630 + }, + { + "epoch": 
0.12103321033210332, + "grad_norm": 0.6410751342773438, + "learning_rate": 9.98652990158566e-05, + "loss": 3.4688, + "step": 1640 + }, + { + "epoch": 0.12177121771217712, + "grad_norm": 0.8124927282333374, + "learning_rate": 9.985568510793967e-05, + "loss": 3.4611, + "step": 1650 + }, + { + "epoch": 0.12250922509225093, + "grad_norm": 0.7403334379196167, + "learning_rate": 9.984574033520857e-05, + "loss": 3.4669, + "step": 1660 + }, + { + "epoch": 0.12324723247232472, + "grad_norm": 0.662948727607727, + "learning_rate": 9.983546476366132e-05, + "loss": 3.4798, + "step": 1670 + }, + { + "epoch": 0.12398523985239852, + "grad_norm": 0.6987183690071106, + "learning_rate": 9.982485846149125e-05, + "loss": 3.3932, + "step": 1680 + }, + { + "epoch": 0.12472324723247233, + "grad_norm": 0.650486171245575, + "learning_rate": 9.981392149908652e-05, + "loss": 3.3856, + "step": 1690 + }, + { + "epoch": 0.12546125461254612, + "grad_norm": 0.6416191458702087, + "learning_rate": 9.98026539490298e-05, + "loss": 3.455, + "step": 1700 + }, + { + "epoch": 0.12619926199261994, + "grad_norm": 0.6319407820701599, + "learning_rate": 9.979105588609762e-05, + "loss": 3.4001, + "step": 1710 + }, + { + "epoch": 0.12693726937269373, + "grad_norm": 0.6667493581771851, + "learning_rate": 9.977912738725994e-05, + "loss": 3.4277, + "step": 1720 + }, + { + "epoch": 0.12767527675276752, + "grad_norm": 0.6686265468597412, + "learning_rate": 9.976686853167967e-05, + "loss": 3.4075, + "step": 1730 + }, + { + "epoch": 0.12841328413284134, + "grad_norm": 0.731555700302124, + "learning_rate": 9.975427940071211e-05, + "loss": 3.4226, + "step": 1740 + }, + { + "epoch": 0.12915129151291513, + "grad_norm": 0.6553905606269836, + "learning_rate": 9.97413600779044e-05, + "loss": 3.4306, + "step": 1750 + }, + { + "epoch": 0.12988929889298892, + "grad_norm": 0.7509811520576477, + "learning_rate": 9.9728110648995e-05, + "loss": 3.3937, + "step": 1760 + }, + { + "epoch": 0.13062730627306274, + "grad_norm": 0.7052728533744812, + "learning_rate": 9.971453120191309e-05, + "loss": 3.3822, + "step": 1770 + }, + { + "epoch": 0.13136531365313653, + "grad_norm": 0.6742541790008545, + "learning_rate": 9.970062182677801e-05, + "loss": 3.3824, + "step": 1780 + }, + { + "epoch": 0.13210332103321032, + "grad_norm": 0.6257262825965881, + "learning_rate": 9.968638261589866e-05, + "loss": 3.4047, + "step": 1790 + }, + { + "epoch": 0.13284132841328414, + "grad_norm": 0.6546107530593872, + "learning_rate": 9.967181366377285e-05, + "loss": 3.3903, + "step": 1800 + }, + { + "epoch": 0.13357933579335793, + "grad_norm": 0.8019782304763794, + "learning_rate": 9.965691506708672e-05, + "loss": 3.3911, + "step": 1810 + }, + { + "epoch": 0.13431734317343175, + "grad_norm": 0.6207643151283264, + "learning_rate": 9.964168692471408e-05, + "loss": 3.3861, + "step": 1820 + }, + { + "epoch": 0.13505535055350554, + "grad_norm": 0.6750718355178833, + "learning_rate": 9.962612933771576e-05, + "loss": 3.4424, + "step": 1830 + }, + { + "epoch": 0.13579335793357933, + "grad_norm": 0.9330940246582031, + "learning_rate": 9.961024240933892e-05, + "loss": 3.3459, + "step": 1840 + }, + { + "epoch": 0.13653136531365315, + "grad_norm": 0.7058202028274536, + "learning_rate": 9.959402624501636e-05, + "loss": 3.3327, + "step": 1850 + }, + { + "epoch": 0.13726937269372694, + "grad_norm": 0.779712438583374, + "learning_rate": 9.957748095236589e-05, + "loss": 3.4398, + "step": 1860 + }, + { + "epoch": 0.13800738007380073, + "grad_norm": 0.663960337638855, + "learning_rate": 
9.956060664118951e-05, + "loss": 3.3513, + "step": 1870 + }, + { + "epoch": 0.13874538745387455, + "grad_norm": 0.756618082523346, + "learning_rate": 9.954340342347279e-05, + "loss": 3.304, + "step": 1880 + }, + { + "epoch": 0.13948339483394834, + "grad_norm": 0.7523687481880188, + "learning_rate": 9.952587141338403e-05, + "loss": 3.3155, + "step": 1890 + }, + { + "epoch": 0.14022140221402213, + "grad_norm": 0.6524930596351624, + "learning_rate": 9.950801072727356e-05, + "loss": 3.3803, + "step": 1900 + }, + { + "epoch": 0.14095940959409595, + "grad_norm": 0.7161090970039368, + "learning_rate": 9.948982148367292e-05, + "loss": 3.4219, + "step": 1910 + }, + { + "epoch": 0.14169741697416974, + "grad_norm": 0.7181054949760437, + "learning_rate": 9.947130380329418e-05, + "loss": 3.301, + "step": 1920 + }, + { + "epoch": 0.14243542435424356, + "grad_norm": 0.6185216903686523, + "learning_rate": 9.945245780902899e-05, + "loss": 3.3666, + "step": 1930 + }, + { + "epoch": 0.14317343173431735, + "grad_norm": 0.6279731392860413, + "learning_rate": 9.943328362594788e-05, + "loss": 3.2862, + "step": 1940 + }, + { + "epoch": 0.14391143911439114, + "grad_norm": 0.6401661038398743, + "learning_rate": 9.941378138129938e-05, + "loss": 3.3112, + "step": 1950 + }, + { + "epoch": 0.14464944649446496, + "grad_norm": 0.6105781197547913, + "learning_rate": 9.939395120450916e-05, + "loss": 3.3539, + "step": 1960 + }, + { + "epoch": 0.14538745387453875, + "grad_norm": 0.6660001873970032, + "learning_rate": 9.937379322717924e-05, + "loss": 3.3722, + "step": 1970 + }, + { + "epoch": 0.14612546125461254, + "grad_norm": 0.6415931582450867, + "learning_rate": 9.935330758308705e-05, + "loss": 3.3329, + "step": 1980 + }, + { + "epoch": 0.14686346863468636, + "grad_norm": 0.6147580742835999, + "learning_rate": 9.933249440818455e-05, + "loss": 3.2807, + "step": 1990 + }, + { + "epoch": 0.14760147601476015, + "grad_norm": 0.694519579410553, + "learning_rate": 9.931135384059736e-05, + "loss": 3.2662, + "step": 2000 + }, + { + "epoch": 0.14833948339483394, + "grad_norm": 0.6452217102050781, + "learning_rate": 9.928988602062384e-05, + "loss": 3.2942, + "step": 2010 + }, + { + "epoch": 0.14907749077490776, + "grad_norm": 0.6983804106712341, + "learning_rate": 9.926809109073412e-05, + "loss": 3.2639, + "step": 2020 + }, + { + "epoch": 0.14981549815498155, + "grad_norm": 0.6302483677864075, + "learning_rate": 9.924596919556917e-05, + "loss": 3.3648, + "step": 2030 + }, + { + "epoch": 0.15055350553505534, + "grad_norm": 0.6506009697914124, + "learning_rate": 9.922352048193986e-05, + "loss": 3.3417, + "step": 2040 + }, + { + "epoch": 0.15129151291512916, + "grad_norm": 0.6232055425643921, + "learning_rate": 9.920074509882602e-05, + "loss": 3.3304, + "step": 2050 + }, + { + "epoch": 0.15202952029520295, + "grad_norm": 0.6454508900642395, + "learning_rate": 9.917764319737533e-05, + "loss": 3.2585, + "step": 2060 + }, + { + "epoch": 0.15276752767527677, + "grad_norm": 0.6281662583351135, + "learning_rate": 9.915421493090243e-05, + "loss": 3.2753, + "step": 2070 + }, + { + "epoch": 0.15350553505535056, + "grad_norm": 0.7222394943237305, + "learning_rate": 9.913046045488786e-05, + "loss": 3.2683, + "step": 2080 + }, + { + "epoch": 0.15424354243542435, + "grad_norm": 0.6333222389221191, + "learning_rate": 9.910637992697707e-05, + "loss": 3.2676, + "step": 2090 + }, + { + "epoch": 0.15498154981549817, + "grad_norm": 0.6758008003234863, + "learning_rate": 9.908197350697926e-05, + "loss": 3.2941, + "step": 2100 + }, + { + "epoch": 
0.15571955719557196, + "grad_norm": 0.5930529832839966, + "learning_rate": 9.905724135686648e-05, + "loss": 3.3365, + "step": 2110 + }, + { + "epoch": 0.15645756457564575, + "grad_norm": 0.7024756669998169, + "learning_rate": 9.903218364077243e-05, + "loss": 3.2594, + "step": 2120 + }, + { + "epoch": 0.15719557195571957, + "grad_norm": 0.6018502712249756, + "learning_rate": 9.900680052499138e-05, + "loss": 3.3316, + "step": 2130 + }, + { + "epoch": 0.15793357933579336, + "grad_norm": 0.6856579184532166, + "learning_rate": 9.898109217797717e-05, + "loss": 3.3196, + "step": 2140 + }, + { + "epoch": 0.15867158671586715, + "grad_norm": 0.6864190101623535, + "learning_rate": 9.895505877034198e-05, + "loss": 3.3116, + "step": 2150 + }, + { + "epoch": 0.15940959409594097, + "grad_norm": 0.57015061378479, + "learning_rate": 9.892870047485526e-05, + "loss": 3.3119, + "step": 2160 + }, + { + "epoch": 0.16014760147601476, + "grad_norm": 0.5812332630157471, + "learning_rate": 9.89020174664425e-05, + "loss": 3.2727, + "step": 2170 + }, + { + "epoch": 0.16088560885608857, + "grad_norm": 0.6356363296508789, + "learning_rate": 9.887500992218421e-05, + "loss": 3.3661, + "step": 2180 + }, + { + "epoch": 0.16162361623616237, + "grad_norm": 0.672024130821228, + "learning_rate": 9.884767802131465e-05, + "loss": 3.3215, + "step": 2190 + }, + { + "epoch": 0.16236162361623616, + "grad_norm": 0.6531562805175781, + "learning_rate": 9.882002194522064e-05, + "loss": 3.2374, + "step": 2200 + }, + { + "epoch": 0.16309963099630997, + "grad_norm": 0.6039624214172363, + "learning_rate": 9.879204187744036e-05, + "loss": 3.2342, + "step": 2210 + }, + { + "epoch": 0.16383763837638377, + "grad_norm": 0.5702035427093506, + "learning_rate": 9.876373800366215e-05, + "loss": 3.3181, + "step": 2220 + }, + { + "epoch": 0.16457564575645756, + "grad_norm": 0.6860033273696899, + "learning_rate": 9.87351105117233e-05, + "loss": 3.3758, + "step": 2230 + }, + { + "epoch": 0.16531365313653137, + "grad_norm": 0.6462620496749878, + "learning_rate": 9.870615959160875e-05, + "loss": 3.3542, + "step": 2240 + }, + { + "epoch": 0.16605166051660517, + "grad_norm": 0.6575970649719238, + "learning_rate": 9.867688543544988e-05, + "loss": 3.2135, + "step": 2250 + }, + { + "epoch": 0.16678966789667896, + "grad_norm": 0.6185761094093323, + "learning_rate": 9.86472882375232e-05, + "loss": 3.294, + "step": 2260 + }, + { + "epoch": 0.16752767527675277, + "grad_norm": 0.6141475439071655, + "learning_rate": 9.861736819424902e-05, + "loss": 3.1992, + "step": 2270 + }, + { + "epoch": 0.16826568265682657, + "grad_norm": 0.6172120571136475, + "learning_rate": 9.85871255041903e-05, + "loss": 3.2167, + "step": 2280 + }, + { + "epoch": 0.16900369003690036, + "grad_norm": 0.5904815196990967, + "learning_rate": 9.855656036805114e-05, + "loss": 3.2945, + "step": 2290 + }, + { + "epoch": 0.16974169741697417, + "grad_norm": 0.6383630633354187, + "learning_rate": 9.852567298867557e-05, + "loss": 3.2865, + "step": 2300 + }, + { + "epoch": 0.17047970479704797, + "grad_norm": 0.60262531042099, + "learning_rate": 9.84944635710462e-05, + "loss": 3.2188, + "step": 2310 + }, + { + "epoch": 0.17121771217712178, + "grad_norm": 0.5909958481788635, + "learning_rate": 9.846293232228274e-05, + "loss": 3.2896, + "step": 2320 + }, + { + "epoch": 0.17195571955719557, + "grad_norm": 0.5554500818252563, + "learning_rate": 9.843107945164086e-05, + "loss": 3.1705, + "step": 2330 + }, + { + "epoch": 0.17269372693726937, + "grad_norm": 0.620606005191803, + "learning_rate": 
9.83989051705105e-05, + "loss": 3.2288, + "step": 2340 + }, + { + "epoch": 0.17343173431734318, + "grad_norm": 0.6841108202934265, + "learning_rate": 9.836640969241475e-05, + "loss": 3.2441, + "step": 2350 + }, + { + "epoch": 0.17416974169741697, + "grad_norm": 0.6839698553085327, + "learning_rate": 9.833359323300826e-05, + "loss": 3.2246, + "step": 2360 + }, + { + "epoch": 0.17490774907749077, + "grad_norm": 0.7128744721412659, + "learning_rate": 9.830045601007584e-05, + "loss": 3.2008, + "step": 2370 + }, + { + "epoch": 0.17564575645756458, + "grad_norm": 0.65251624584198, + "learning_rate": 9.826699824353106e-05, + "loss": 3.3275, + "step": 2380 + }, + { + "epoch": 0.17638376383763837, + "grad_norm": 0.5380867123603821, + "learning_rate": 9.823322015541474e-05, + "loss": 3.2064, + "step": 2390 + }, + { + "epoch": 0.17712177121771217, + "grad_norm": 0.5963719487190247, + "learning_rate": 9.819912196989351e-05, + "loss": 3.1643, + "step": 2400 + }, + { + "epoch": 0.17785977859778598, + "grad_norm": 0.8703069090843201, + "learning_rate": 9.816470391325832e-05, + "loss": 3.1848, + "step": 2410 + }, + { + "epoch": 0.17859778597785977, + "grad_norm": 0.608935534954071, + "learning_rate": 9.81299662139229e-05, + "loss": 3.2719, + "step": 2420 + }, + { + "epoch": 0.1793357933579336, + "grad_norm": 0.6425730586051941, + "learning_rate": 9.809490910242229e-05, + "loss": 3.2619, + "step": 2430 + }, + { + "epoch": 0.18007380073800738, + "grad_norm": 0.5790001749992371, + "learning_rate": 9.805953281141131e-05, + "loss": 3.243, + "step": 2440 + }, + { + "epoch": 0.18081180811808117, + "grad_norm": 0.6436141133308411, + "learning_rate": 9.802383757566301e-05, + "loss": 3.2284, + "step": 2450 + }, + { + "epoch": 0.181549815498155, + "grad_norm": 0.5458927154541016, + "learning_rate": 9.798782363206702e-05, + "loss": 3.2043, + "step": 2460 + }, + { + "epoch": 0.18228782287822878, + "grad_norm": 0.6296219229698181, + "learning_rate": 9.795149121962815e-05, + "loss": 3.2683, + "step": 2470 + }, + { + "epoch": 0.18302583025830257, + "grad_norm": 0.6964813470840454, + "learning_rate": 9.791484057946465e-05, + "loss": 3.1977, + "step": 2480 + }, + { + "epoch": 0.1837638376383764, + "grad_norm": 0.5911018252372742, + "learning_rate": 9.787787195480672e-05, + "loss": 3.2263, + "step": 2490 + }, + { + "epoch": 0.18450184501845018, + "grad_norm": 0.5431626439094543, + "learning_rate": 9.784058559099483e-05, + "loss": 3.1628, + "step": 2500 + }, + { + "epoch": 0.18523985239852397, + "grad_norm": 0.6068975329399109, + "learning_rate": 9.78029817354781e-05, + "loss": 3.1828, + "step": 2510 + }, + { + "epoch": 0.1859778597785978, + "grad_norm": 0.580287516117096, + "learning_rate": 9.776506063781269e-05, + "loss": 3.2248, + "step": 2520 + }, + { + "epoch": 0.18671586715867158, + "grad_norm": 0.6136944890022278, + "learning_rate": 9.772682254966008e-05, + "loss": 3.2495, + "step": 2530 + }, + { + "epoch": 0.18745387453874537, + "grad_norm": 0.6076098680496216, + "learning_rate": 9.76882677247855e-05, + "loss": 3.1979, + "step": 2540 + }, + { + "epoch": 0.1881918819188192, + "grad_norm": 0.5682818293571472, + "learning_rate": 9.764939641905615e-05, + "loss": 3.1714, + "step": 2550 + }, + { + "epoch": 0.18892988929889298, + "grad_norm": 0.5991480350494385, + "learning_rate": 9.761020889043954e-05, + "loss": 3.154, + "step": 2560 + }, + { + "epoch": 0.1896678966789668, + "grad_norm": 0.6232896447181702, + "learning_rate": 9.75707053990018e-05, + "loss": 3.2036, + "step": 2570 + }, + { + "epoch": 0.1904059040590406, + 
"grad_norm": 0.5560643672943115, + "learning_rate": 9.75308862069059e-05, + "loss": 3.2392, + "step": 2580 + }, + { + "epoch": 0.19114391143911438, + "grad_norm": 0.5718569755554199, + "learning_rate": 9.749075157840996e-05, + "loss": 3.2528, + "step": 2590 + }, + { + "epoch": 0.1918819188191882, + "grad_norm": 0.5662999749183655, + "learning_rate": 9.74503017798655e-05, + "loss": 3.2256, + "step": 2600 + }, + { + "epoch": 0.192619926199262, + "grad_norm": 0.6026265621185303, + "learning_rate": 9.74095370797156e-05, + "loss": 3.2183, + "step": 2610 + }, + { + "epoch": 0.19335793357933578, + "grad_norm": 0.6032066941261292, + "learning_rate": 9.736845774849321e-05, + "loss": 3.2418, + "step": 2620 + }, + { + "epoch": 0.1940959409594096, + "grad_norm": 0.5830618143081665, + "learning_rate": 9.732706405881931e-05, + "loss": 3.191, + "step": 2630 + }, + { + "epoch": 0.1948339483394834, + "grad_norm": 0.5695509314537048, + "learning_rate": 9.728535628540109e-05, + "loss": 3.1968, + "step": 2640 + }, + { + "epoch": 0.19557195571955718, + "grad_norm": 0.5905478000640869, + "learning_rate": 9.724333470503013e-05, + "loss": 3.2596, + "step": 2650 + }, + { + "epoch": 0.196309963099631, + "grad_norm": 0.5251249670982361, + "learning_rate": 9.720099959658062e-05, + "loss": 3.1729, + "step": 2660 + }, + { + "epoch": 0.1970479704797048, + "grad_norm": 0.6502349972724915, + "learning_rate": 9.715835124100742e-05, + "loss": 3.2604, + "step": 2670 + }, + { + "epoch": 0.1977859778597786, + "grad_norm": 0.6250560283660889, + "learning_rate": 9.711538992134426e-05, + "loss": 3.2194, + "step": 2680 + }, + { + "epoch": 0.1985239852398524, + "grad_norm": 0.5793785452842712, + "learning_rate": 9.707211592270183e-05, + "loss": 3.1994, + "step": 2690 + }, + { + "epoch": 0.1992619926199262, + "grad_norm": 0.6495150327682495, + "learning_rate": 9.70285295322659e-05, + "loss": 3.1919, + "step": 2700 + }, + { + "epoch": 0.2, + "grad_norm": 0.5875915288925171, + "learning_rate": 9.698463103929542e-05, + "loss": 3.2464, + "step": 2710 + }, + { + "epoch": 0.2007380073800738, + "grad_norm": 0.5518725514411926, + "learning_rate": 9.69404207351206e-05, + "loss": 3.2042, + "step": 2720 + }, + { + "epoch": 0.2014760147601476, + "grad_norm": 0.5390283465385437, + "learning_rate": 9.689589891314094e-05, + "loss": 3.2012, + "step": 2730 + }, + { + "epoch": 0.2022140221402214, + "grad_norm": 0.5596645474433899, + "learning_rate": 9.685106586882336e-05, + "loss": 3.2053, + "step": 2740 + }, + { + "epoch": 0.2029520295202952, + "grad_norm": 0.5377479195594788, + "learning_rate": 9.680592189970015e-05, + "loss": 3.177, + "step": 2750 + }, + { + "epoch": 0.203690036900369, + "grad_norm": 0.5858853459358215, + "learning_rate": 9.676046730536704e-05, + "loss": 3.2039, + "step": 2760 + }, + { + "epoch": 0.2044280442804428, + "grad_norm": 0.5771840810775757, + "learning_rate": 9.671470238748124e-05, + "loss": 3.1654, + "step": 2770 + }, + { + "epoch": 0.2051660516605166, + "grad_norm": 0.5626157522201538, + "learning_rate": 9.666862744975938e-05, + "loss": 3.1978, + "step": 2780 + }, + { + "epoch": 0.2059040590405904, + "grad_norm": 0.5536968111991882, + "learning_rate": 9.662224279797552e-05, + "loss": 3.2152, + "step": 2790 + }, + { + "epoch": 0.2066420664206642, + "grad_norm": 0.5982388854026794, + "learning_rate": 9.657554873995913e-05, + "loss": 3.1699, + "step": 2800 + }, + { + "epoch": 0.207380073800738, + "grad_norm": 0.5761833190917969, + "learning_rate": 9.652854558559308e-05, + "loss": 3.1766, + "step": 2810 + }, + { + "epoch": 
0.20811808118081182, + "grad_norm": 0.5907506346702576, + "learning_rate": 9.648123364681145e-05, + "loss": 3.0935, + "step": 2820 + }, + { + "epoch": 0.2088560885608856, + "grad_norm": 0.5584788918495178, + "learning_rate": 9.643361323759763e-05, + "loss": 3.1111, + "step": 2830 + }, + { + "epoch": 0.2095940959409594, + "grad_norm": 0.5568063855171204, + "learning_rate": 9.638568467398215e-05, + "loss": 3.1739, + "step": 2840 + }, + { + "epoch": 0.21033210332103322, + "grad_norm": 0.5453604459762573, + "learning_rate": 9.633744827404055e-05, + "loss": 3.2064, + "step": 2850 + }, + { + "epoch": 0.211070110701107, + "grad_norm": 0.6171849966049194, + "learning_rate": 9.628890435789135e-05, + "loss": 3.2281, + "step": 2860 + }, + { + "epoch": 0.2118081180811808, + "grad_norm": 0.5285280346870422, + "learning_rate": 9.624005324769388e-05, + "loss": 3.113, + "step": 2870 + }, + { + "epoch": 0.21254612546125462, + "grad_norm": 0.5632630586624146, + "learning_rate": 9.619089526764614e-05, + "loss": 3.1592, + "step": 2880 + }, + { + "epoch": 0.2132841328413284, + "grad_norm": 0.6024160385131836, + "learning_rate": 9.614143074398264e-05, + "loss": 3.1904, + "step": 2890 + }, + { + "epoch": 0.2140221402214022, + "grad_norm": 0.5437342524528503, + "learning_rate": 9.609166000497229e-05, + "loss": 3.1156, + "step": 2900 + }, + { + "epoch": 0.21476014760147602, + "grad_norm": 0.5884766578674316, + "learning_rate": 9.604158338091615e-05, + "loss": 3.1888, + "step": 2910 + }, + { + "epoch": 0.2154981549815498, + "grad_norm": 0.547242283821106, + "learning_rate": 9.599120120414531e-05, + "loss": 3.1079, + "step": 2920 + }, + { + "epoch": 0.21623616236162363, + "grad_norm": 0.5443885326385498, + "learning_rate": 9.594051380901859e-05, + "loss": 3.1147, + "step": 2930 + }, + { + "epoch": 0.21697416974169742, + "grad_norm": 0.5350677371025085, + "learning_rate": 9.588952153192041e-05, + "loss": 3.1061, + "step": 2940 + }, + { + "epoch": 0.2177121771217712, + "grad_norm": 0.5434796214103699, + "learning_rate": 9.583822471125854e-05, + "loss": 3.1172, + "step": 2950 + }, + { + "epoch": 0.21845018450184503, + "grad_norm": 0.5185326933860779, + "learning_rate": 9.578662368746182e-05, + "loss": 3.2186, + "step": 2960 + }, + { + "epoch": 0.21918819188191882, + "grad_norm": 0.5394032001495361, + "learning_rate": 9.57347188029779e-05, + "loss": 3.1628, + "step": 2970 + }, + { + "epoch": 0.2199261992619926, + "grad_norm": 0.5857832431793213, + "learning_rate": 9.568251040227101e-05, + "loss": 3.1291, + "step": 2980 + }, + { + "epoch": 0.22066420664206643, + "grad_norm": 0.6189760565757751, + "learning_rate": 9.562999883181967e-05, + "loss": 3.1305, + "step": 2990 + }, + { + "epoch": 0.22140221402214022, + "grad_norm": 0.5518510937690735, + "learning_rate": 9.557718444011431e-05, + "loss": 3.2148, + "step": 3000 + }, + { + "epoch": 0.222140221402214, + "grad_norm": 0.5947515964508057, + "learning_rate": 9.552406757765509e-05, + "loss": 3.1322, + "step": 3010 + }, + { + "epoch": 0.22287822878228783, + "grad_norm": 0.5554746985435486, + "learning_rate": 9.547064859694943e-05, + "loss": 3.1822, + "step": 3020 + }, + { + "epoch": 0.22361623616236162, + "grad_norm": 0.5308244824409485, + "learning_rate": 9.541692785250981e-05, + "loss": 3.1371, + "step": 3030 + }, + { + "epoch": 0.2243542435424354, + "grad_norm": 0.5285702347755432, + "learning_rate": 9.536290570085131e-05, + "loss": 3.1329, + "step": 3040 + }, + { + "epoch": 0.22509225092250923, + "grad_norm": 0.5468854904174805, + "learning_rate": 9.530858250048932e-05, 
+ "loss": 3.2538, + "step": 3050 + }, + { + "epoch": 0.22583025830258302, + "grad_norm": 0.5449059009552002, + "learning_rate": 9.525395861193707e-05, + "loss": 3.2139, + "step": 3060 + }, + { + "epoch": 0.22656826568265684, + "grad_norm": 0.5692685842514038, + "learning_rate": 9.519903439770332e-05, + "loss": 3.1138, + "step": 3070 + }, + { + "epoch": 0.22730627306273063, + "grad_norm": 0.5263866782188416, + "learning_rate": 9.514381022228997e-05, + "loss": 3.0872, + "step": 3080 + }, + { + "epoch": 0.22804428044280442, + "grad_norm": 0.5696788430213928, + "learning_rate": 9.50882864521895e-05, + "loss": 3.167, + "step": 3090 + }, + { + "epoch": 0.22878228782287824, + "grad_norm": 0.5760169625282288, + "learning_rate": 9.503246345588274e-05, + "loss": 3.15, + "step": 3100 + }, + { + "epoch": 0.22952029520295203, + "grad_norm": 0.5390339493751526, + "learning_rate": 9.497634160383626e-05, + "loss": 3.1367, + "step": 3110 + }, + { + "epoch": 0.23025830258302582, + "grad_norm": 0.5490269660949707, + "learning_rate": 9.491992126849997e-05, + "loss": 3.1779, + "step": 3120 + }, + { + "epoch": 0.23099630996309964, + "grad_norm": 0.5177121758460999, + "learning_rate": 9.486320282430468e-05, + "loss": 3.0789, + "step": 3130 + }, + { + "epoch": 0.23173431734317343, + "grad_norm": 0.5448027849197388, + "learning_rate": 9.480618664765955e-05, + "loss": 3.1866, + "step": 3140 + }, + { + "epoch": 0.23247232472324722, + "grad_norm": 0.5371176600456238, + "learning_rate": 9.474887311694968e-05, + "loss": 3.2089, + "step": 3150 + }, + { + "epoch": 0.23321033210332104, + "grad_norm": 0.6013469099998474, + "learning_rate": 9.469126261253348e-05, + "loss": 3.1159, + "step": 3160 + }, + { + "epoch": 0.23394833948339483, + "grad_norm": 0.5597007274627686, + "learning_rate": 9.463335551674025e-05, + "loss": 3.124, + "step": 3170 + }, + { + "epoch": 0.23468634686346865, + "grad_norm": 0.5460641384124756, + "learning_rate": 9.45751522138676e-05, + "loss": 3.103, + "step": 3180 + }, + { + "epoch": 0.23542435424354244, + "grad_norm": 0.5389031767845154, + "learning_rate": 9.45166530901789e-05, + "loss": 3.1502, + "step": 3190 + }, + { + "epoch": 0.23616236162361623, + "grad_norm": 0.5293789505958557, + "learning_rate": 9.445785853390073e-05, + "loss": 3.0856, + "step": 3200 + }, + { + "epoch": 0.23690036900369005, + "grad_norm": 0.677259087562561, + "learning_rate": 9.439876893522028e-05, + "loss": 3.1143, + "step": 3210 + }, + { + "epoch": 0.23763837638376384, + "grad_norm": 0.5259451866149902, + "learning_rate": 9.433938468628277e-05, + "loss": 3.1628, + "step": 3220 + }, + { + "epoch": 0.23837638376383763, + "grad_norm": 0.5321341156959534, + "learning_rate": 9.427970618118888e-05, + "loss": 3.1164, + "step": 3230 + }, + { + "epoch": 0.23911439114391145, + "grad_norm": 0.5752614140510559, + "learning_rate": 9.421973381599208e-05, + "loss": 3.0361, + "step": 3240 + }, + { + "epoch": 0.23985239852398524, + "grad_norm": 0.5552977323532104, + "learning_rate": 9.415946798869602e-05, + "loss": 3.1452, + "step": 3250 + }, + { + "epoch": 0.24059040590405903, + "grad_norm": 0.5862517952919006, + "learning_rate": 9.409890909925193e-05, + "loss": 3.1493, + "step": 3260 + }, + { + "epoch": 0.24132841328413285, + "grad_norm": 0.5374996066093445, + "learning_rate": 9.40380575495559e-05, + "loss": 3.1315, + "step": 3270 + }, + { + "epoch": 0.24206642066420664, + "grad_norm": 0.5315213203430176, + "learning_rate": 9.39769137434463e-05, + "loss": 3.1218, + "step": 3280 + }, + { + "epoch": 0.24280442804428043, + "grad_norm": 
0.5306174159049988, + "learning_rate": 9.391547808670096e-05, + "loss": 3.0916, + "step": 3290 + }, + { + "epoch": 0.24354243542435425, + "grad_norm": 0.5105913281440735, + "learning_rate": 9.385375098703465e-05, + "loss": 3.0469, + "step": 3300 + }, + { + "epoch": 0.24428044280442804, + "grad_norm": 0.5171898603439331, + "learning_rate": 9.379173285409621e-05, + "loss": 3.068, + "step": 3310 + }, + { + "epoch": 0.24501845018450186, + "grad_norm": 0.5028154253959656, + "learning_rate": 9.372942409946596e-05, + "loss": 3.1542, + "step": 3320 + }, + { + "epoch": 0.24575645756457565, + "grad_norm": 0.5281797647476196, + "learning_rate": 9.366682513665293e-05, + "loss": 3.1484, + "step": 3330 + }, + { + "epoch": 0.24649446494464944, + "grad_norm": 0.5240592956542969, + "learning_rate": 9.360393638109201e-05, + "loss": 3.103, + "step": 3340 + }, + { + "epoch": 0.24723247232472326, + "grad_norm": 0.5516790747642517, + "learning_rate": 9.354075825014139e-05, + "loss": 3.0701, + "step": 3350 + }, + { + "epoch": 0.24797047970479705, + "grad_norm": 0.6081251502037048, + "learning_rate": 9.347729116307964e-05, + "loss": 3.1434, + "step": 3360 + }, + { + "epoch": 0.24870848708487084, + "grad_norm": 0.5216418504714966, + "learning_rate": 9.341353554110297e-05, + "loss": 3.1567, + "step": 3370 + }, + { + "epoch": 0.24944649446494466, + "grad_norm": 0.5264909863471985, + "learning_rate": 9.334949180732245e-05, + "loss": 3.162, + "step": 3380 + }, + { + "epoch": 0.25018450184501845, + "grad_norm": 0.4942391812801361, + "learning_rate": 9.328516038676119e-05, + "loss": 3.1532, + "step": 3390 + }, + { + "epoch": 0.25092250922509224, + "grad_norm": 0.5401615500450134, + "learning_rate": 9.322054170635149e-05, + "loss": 3.1, + "step": 3400 + }, + { + "epoch": 0.25166051660516603, + "grad_norm": 0.5021462440490723, + "learning_rate": 9.315563619493209e-05, + "loss": 3.0438, + "step": 3410 + }, + { + "epoch": 0.2523985239852399, + "grad_norm": 0.5627569556236267, + "learning_rate": 9.309044428324522e-05, + "loss": 3.2005, + "step": 3420 + }, + { + "epoch": 0.25313653136531367, + "grad_norm": 0.514385461807251, + "learning_rate": 9.302496640393382e-05, + "loss": 3.1035, + "step": 3430 + }, + { + "epoch": 0.25387453874538746, + "grad_norm": 0.5261507630348206, + "learning_rate": 9.295920299153863e-05, + "loss": 3.1706, + "step": 3440 + }, + { + "epoch": 0.25461254612546125, + "grad_norm": 0.5069513916969299, + "learning_rate": 9.289315448249531e-05, + "loss": 3.1218, + "step": 3450 + }, + { + "epoch": 0.25535055350553504, + "grad_norm": 0.49072757363319397, + "learning_rate": 9.282682131513157e-05, + "loss": 3.1231, + "step": 3460 + }, + { + "epoch": 0.25608856088560883, + "grad_norm": 0.6358250379562378, + "learning_rate": 9.276020392966422e-05, + "loss": 3.1082, + "step": 3470 + }, + { + "epoch": 0.2568265682656827, + "grad_norm": 0.5456467270851135, + "learning_rate": 9.26933027681963e-05, + "loss": 3.1454, + "step": 3480 + }, + { + "epoch": 0.25756457564575647, + "grad_norm": 0.5754953026771545, + "learning_rate": 9.262611827471406e-05, + "loss": 3.1334, + "step": 3490 + }, + { + "epoch": 0.25830258302583026, + "grad_norm": 0.5355437397956848, + "learning_rate": 9.25586508950841e-05, + "loss": 3.0149, + "step": 3500 + }, + { + "epoch": 0.25904059040590405, + "grad_norm": 0.5386449694633484, + "learning_rate": 9.249090107705044e-05, + "loss": 3.1859, + "step": 3510 + }, + { + "epoch": 0.25977859778597784, + "grad_norm": 0.5665399432182312, + "learning_rate": 9.242286927023136e-05, + "loss": 3.171, + "step": 
3520 + }, + { + "epoch": 0.2605166051660517, + "grad_norm": 0.5453583002090454, + "learning_rate": 9.235455592611665e-05, + "loss": 3.1198, + "step": 3530 + }, + { + "epoch": 0.2612546125461255, + "grad_norm": 0.5409013032913208, + "learning_rate": 9.22859614980645e-05, + "loss": 3.0841, + "step": 3540 + }, + { + "epoch": 0.26199261992619927, + "grad_norm": 0.5243815779685974, + "learning_rate": 9.221708644129843e-05, + "loss": 3.13, + "step": 3550 + }, + { + "epoch": 0.26273062730627306, + "grad_norm": 0.562589168548584, + "learning_rate": 9.214793121290442e-05, + "loss": 3.0718, + "step": 3560 + }, + { + "epoch": 0.26346863468634685, + "grad_norm": 0.5075133442878723, + "learning_rate": 9.207849627182772e-05, + "loss": 3.1159, + "step": 3570 + }, + { + "epoch": 0.26420664206642064, + "grad_norm": 0.5348154902458191, + "learning_rate": 9.200878207886993e-05, + "loss": 3.1932, + "step": 3580 + }, + { + "epoch": 0.2649446494464945, + "grad_norm": 0.5550357103347778, + "learning_rate": 9.19387890966859e-05, + "loss": 3.0973, + "step": 3590 + }, + { + "epoch": 0.2656826568265683, + "grad_norm": 0.534482479095459, + "learning_rate": 9.186851778978062e-05, + "loss": 3.1466, + "step": 3600 + }, + { + "epoch": 0.26642066420664207, + "grad_norm": 0.521537184715271, + "learning_rate": 9.179796862450618e-05, + "loss": 3.0424, + "step": 3610 + }, + { + "epoch": 0.26715867158671586, + "grad_norm": 0.5350748896598816, + "learning_rate": 9.172714206905866e-05, + "loss": 3.0505, + "step": 3620 + }, + { + "epoch": 0.26789667896678965, + "grad_norm": 0.5348935127258301, + "learning_rate": 9.165603859347502e-05, + "loss": 3.1561, + "step": 3630 + }, + { + "epoch": 0.2686346863468635, + "grad_norm": 0.5182725191116333, + "learning_rate": 9.158465866963002e-05, + "loss": 3.0778, + "step": 3640 + }, + { + "epoch": 0.2693726937269373, + "grad_norm": 0.5188565850257874, + "learning_rate": 9.151300277123301e-05, + "loss": 3.0517, + "step": 3650 + }, + { + "epoch": 0.2701107011070111, + "grad_norm": 0.5163888931274414, + "learning_rate": 9.144107137382484e-05, + "loss": 2.979, + "step": 3660 + }, + { + "epoch": 0.27084870848708487, + "grad_norm": 0.5174587965011597, + "learning_rate": 9.136886495477475e-05, + "loss": 3.0661, + "step": 3670 + }, + { + "epoch": 0.27158671586715866, + "grad_norm": 0.5590752363204956, + "learning_rate": 9.129638399327706e-05, + "loss": 3.0624, + "step": 3680 + }, + { + "epoch": 0.27232472324723245, + "grad_norm": 0.48960742354393005, + "learning_rate": 9.122362897034817e-05, + "loss": 3.0344, + "step": 3690 + }, + { + "epoch": 0.2730627306273063, + "grad_norm": 0.5071660876274109, + "learning_rate": 9.115060036882318e-05, + "loss": 3.0374, + "step": 3700 + }, + { + "epoch": 0.2738007380073801, + "grad_norm": 0.5058993697166443, + "learning_rate": 9.107729867335288e-05, + "loss": 3.0823, + "step": 3710 + }, + { + "epoch": 0.2745387453874539, + "grad_norm": 0.5252380967140198, + "learning_rate": 9.100372437040034e-05, + "loss": 3.0558, + "step": 3720 + }, + { + "epoch": 0.27527675276752767, + "grad_norm": 0.49785932898521423, + "learning_rate": 9.092987794823786e-05, + "loss": 3.0836, + "step": 3730 + }, + { + "epoch": 0.27601476014760146, + "grad_norm": 0.5140420794487, + "learning_rate": 9.085575989694357e-05, + "loss": 3.1079, + "step": 3740 + }, + { + "epoch": 0.2767527675276753, + "grad_norm": 0.5329453945159912, + "learning_rate": 9.078137070839832e-05, + "loss": 3.0775, + "step": 3750 + }, + { + "epoch": 0.2774907749077491, + "grad_norm": 0.4971647560596466, + "learning_rate": 
9.070671087628229e-05, + "loss": 3.0756, + "step": 3760 + }, + { + "epoch": 0.2782287822878229, + "grad_norm": 0.5552874803543091, + "learning_rate": 9.063178089607183e-05, + "loss": 3.0615, + "step": 3770 + }, + { + "epoch": 0.2789667896678967, + "grad_norm": 0.525969922542572, + "learning_rate": 9.055658126503605e-05, + "loss": 3.0594, + "step": 3780 + }, + { + "epoch": 0.27970479704797047, + "grad_norm": 0.5235247611999512, + "learning_rate": 9.048111248223368e-05, + "loss": 3.097, + "step": 3790 + }, + { + "epoch": 0.28044280442804426, + "grad_norm": 0.5573784112930298, + "learning_rate": 9.040537504850954e-05, + "loss": 3.0303, + "step": 3800 + }, + { + "epoch": 0.2811808118081181, + "grad_norm": 0.5464443564414978, + "learning_rate": 9.032936946649144e-05, + "loss": 3.063, + "step": 3810 + }, + { + "epoch": 0.2819188191881919, + "grad_norm": 0.5378391146659851, + "learning_rate": 9.02530962405867e-05, + "loss": 3.0853, + "step": 3820 + }, + { + "epoch": 0.2826568265682657, + "grad_norm": 0.5274621844291687, + "learning_rate": 9.017655587697885e-05, + "loss": 3.1374, + "step": 3830 + }, + { + "epoch": 0.2833948339483395, + "grad_norm": 0.5044965744018555, + "learning_rate": 9.009974888362424e-05, + "loss": 3.064, + "step": 3840 + }, + { + "epoch": 0.28413284132841327, + "grad_norm": 0.5318046808242798, + "learning_rate": 9.002267577024876e-05, + "loss": 3.0662, + "step": 3850 + }, + { + "epoch": 0.2848708487084871, + "grad_norm": 0.5438222289085388, + "learning_rate": 8.994533704834435e-05, + "loss": 3.0999, + "step": 3860 + }, + { + "epoch": 0.2856088560885609, + "grad_norm": 0.5226894021034241, + "learning_rate": 8.986773323116563e-05, + "loss": 3.0496, + "step": 3870 + }, + { + "epoch": 0.2863468634686347, + "grad_norm": 1.9248789548873901, + "learning_rate": 8.978986483372655e-05, + "loss": 3.0549, + "step": 3880 + }, + { + "epoch": 0.2870848708487085, + "grad_norm": 0.49465620517730713, + "learning_rate": 8.971173237279692e-05, + "loss": 3.085, + "step": 3890 + }, + { + "epoch": 0.2878228782287823, + "grad_norm": 0.5317748785018921, + "learning_rate": 8.963333636689898e-05, + "loss": 3.0659, + "step": 3900 + }, + { + "epoch": 0.28856088560885607, + "grad_norm": 0.5400087833404541, + "learning_rate": 8.9554677336304e-05, + "loss": 3.0963, + "step": 3910 + }, + { + "epoch": 0.2892988929889299, + "grad_norm": 0.5060845613479614, + "learning_rate": 8.947575580302878e-05, + "loss": 3.0503, + "step": 3920 + }, + { + "epoch": 0.2900369003690037, + "grad_norm": 0.5168414115905762, + "learning_rate": 8.939657229083222e-05, + "loss": 3.1322, + "step": 3930 + }, + { + "epoch": 0.2907749077490775, + "grad_norm": 0.5268558263778687, + "learning_rate": 8.931712732521183e-05, + "loss": 3.0947, + "step": 3940 + }, + { + "epoch": 0.2915129151291513, + "grad_norm": 0.5113683938980103, + "learning_rate": 8.92374214334002e-05, + "loss": 3.0379, + "step": 3950 + }, + { + "epoch": 0.2922509225092251, + "grad_norm": 0.5602664947509766, + "learning_rate": 8.915745514436161e-05, + "loss": 3.0636, + "step": 3960 + }, + { + "epoch": 0.29298892988929887, + "grad_norm": 0.507926344871521, + "learning_rate": 8.907722898878844e-05, + "loss": 3.0737, + "step": 3970 + }, + { + "epoch": 0.2937269372693727, + "grad_norm": 0.5805441737174988, + "learning_rate": 8.899674349909759e-05, + "loss": 3.0743, + "step": 3980 + }, + { + "epoch": 0.2944649446494465, + "grad_norm": 0.5141892433166504, + "learning_rate": 8.891599920942713e-05, + "loss": 3.0711, + "step": 3990 + }, + { + "epoch": 0.2952029520295203, + 
"grad_norm": 0.5769287347793579, + "learning_rate": 8.883499665563253e-05, + "loss": 3.0302, + "step": 4000 + }, + { + "epoch": 0.2959409594095941, + "grad_norm": 0.5248669981956482, + "learning_rate": 8.875373637528335e-05, + "loss": 3.0871, + "step": 4010 + }, + { + "epoch": 0.2966789667896679, + "grad_norm": 0.5001204609870911, + "learning_rate": 8.867221890765938e-05, + "loss": 3.0342, + "step": 4020 + }, + { + "epoch": 0.2974169741697417, + "grad_norm": 0.5176003575325012, + "learning_rate": 8.859044479374736e-05, + "loss": 3.1404, + "step": 4030 + }, + { + "epoch": 0.2981549815498155, + "grad_norm": 0.5125160217285156, + "learning_rate": 8.850841457623719e-05, + "loss": 3.0399, + "step": 4040 + }, + { + "epoch": 0.2988929889298893, + "grad_norm": 0.49271440505981445, + "learning_rate": 8.842612879951837e-05, + "loss": 3.0082, + "step": 4050 + }, + { + "epoch": 0.2996309963099631, + "grad_norm": 0.5456764698028564, + "learning_rate": 8.834358800967645e-05, + "loss": 3.0537, + "step": 4060 + }, + { + "epoch": 0.3003690036900369, + "grad_norm": 0.5039022564888, + "learning_rate": 8.826079275448933e-05, + "loss": 3.0508, + "step": 4070 + }, + { + "epoch": 0.3011070110701107, + "grad_norm": 0.48597994446754456, + "learning_rate": 8.817774358342367e-05, + "loss": 3.0806, + "step": 4080 + }, + { + "epoch": 0.3018450184501845, + "grad_norm": 0.5243167877197266, + "learning_rate": 8.809444104763122e-05, + "loss": 3.1176, + "step": 4090 + }, + { + "epoch": 0.3025830258302583, + "grad_norm": 0.5244473218917847, + "learning_rate": 8.801088569994522e-05, + "loss": 3.0985, + "step": 4100 + }, + { + "epoch": 0.3033210332103321, + "grad_norm": 0.4856514632701874, + "learning_rate": 8.792707809487661e-05, + "loss": 3.0546, + "step": 4110 + }, + { + "epoch": 0.3040590405904059, + "grad_norm": 0.48701879382133484, + "learning_rate": 8.784301878861047e-05, + "loss": 3.083, + "step": 4120 + }, + { + "epoch": 0.3047970479704797, + "grad_norm": 0.5364317297935486, + "learning_rate": 8.775870833900226e-05, + "loss": 3.0672, + "step": 4130 + }, + { + "epoch": 0.30553505535055353, + "grad_norm": 0.5016632676124573, + "learning_rate": 8.767414730557418e-05, + "loss": 2.9692, + "step": 4140 + }, + { + "epoch": 0.3062730627306273, + "grad_norm": 0.5020787715911865, + "learning_rate": 8.758933624951135e-05, + "loss": 3.0618, + "step": 4150 + }, + { + "epoch": 0.3070110701107011, + "grad_norm": 0.5041311383247375, + "learning_rate": 8.750427573365824e-05, + "loss": 3.0193, + "step": 4160 + }, + { + "epoch": 0.3077490774907749, + "grad_norm": 0.5102233290672302, + "learning_rate": 8.741896632251476e-05, + "loss": 3.0837, + "step": 4170 + }, + { + "epoch": 0.3084870848708487, + "grad_norm": 0.5173757672309875, + "learning_rate": 8.733340858223268e-05, + "loss": 2.9969, + "step": 4180 + }, + { + "epoch": 0.3092250922509225, + "grad_norm": 0.47782695293426514, + "learning_rate": 8.724760308061172e-05, + "loss": 2.9934, + "step": 4190 + }, + { + "epoch": 0.30996309963099633, + "grad_norm": 0.4984055161476135, + "learning_rate": 8.71615503870959e-05, + "loss": 3.0055, + "step": 4200 + }, + { + "epoch": 0.3107011070110701, + "grad_norm": 0.535744845867157, + "learning_rate": 8.707525107276971e-05, + "loss": 3.1124, + "step": 4210 + }, + { + "epoch": 0.3114391143911439, + "grad_norm": 0.5163019895553589, + "learning_rate": 8.698870571035435e-05, + "loss": 3.0904, + "step": 4220 + }, + { + "epoch": 0.3121771217712177, + "grad_norm": 0.5297439694404602, + "learning_rate": 8.690191487420385e-05, + "loss": 3.039, + "step": 
4230 + }, + { + "epoch": 0.3129151291512915, + "grad_norm": 0.5315809845924377, + "learning_rate": 8.681487914030137e-05, + "loss": 3.1418, + "step": 4240 + }, + { + "epoch": 0.31365313653136534, + "grad_norm": 0.5038068890571594, + "learning_rate": 8.672759908625528e-05, + "loss": 3.105, + "step": 4250 + }, + { + "epoch": 0.31439114391143913, + "grad_norm": 0.5104600787162781, + "learning_rate": 8.664007529129539e-05, + "loss": 3.0253, + "step": 4260 + }, + { + "epoch": 0.3151291512915129, + "grad_norm": 0.5337395668029785, + "learning_rate": 8.655230833626908e-05, + "loss": 3.0637, + "step": 4270 + }, + { + "epoch": 0.3158671586715867, + "grad_norm": 0.5203779935836792, + "learning_rate": 8.646429880363746e-05, + "loss": 3.0862, + "step": 4280 + }, + { + "epoch": 0.3166051660516605, + "grad_norm": 0.510831356048584, + "learning_rate": 8.637604727747149e-05, + "loss": 2.9944, + "step": 4290 + }, + { + "epoch": 0.3173431734317343, + "grad_norm": 0.5363606214523315, + "learning_rate": 8.62875543434481e-05, + "loss": 3.1227, + "step": 4300 + }, + { + "epoch": 0.31808118081180814, + "grad_norm": 0.5156981945037842, + "learning_rate": 8.61988205888463e-05, + "loss": 3.046, + "step": 4310 + }, + { + "epoch": 0.31881918819188193, + "grad_norm": 0.530002772808075, + "learning_rate": 8.610984660254333e-05, + "loss": 3.037, + "step": 4320 + }, + { + "epoch": 0.3195571955719557, + "grad_norm": 0.5514121651649475, + "learning_rate": 8.602063297501068e-05, + "loss": 3.0828, + "step": 4330 + }, + { + "epoch": 0.3202952029520295, + "grad_norm": 0.49961575865745544, + "learning_rate": 8.593118029831025e-05, + "loss": 3.0404, + "step": 4340 + }, + { + "epoch": 0.3210332103321033, + "grad_norm": 0.4883437752723694, + "learning_rate": 8.584148916609032e-05, + "loss": 3.0681, + "step": 4350 + }, + { + "epoch": 0.32177121771217715, + "grad_norm": 0.5226607918739319, + "learning_rate": 8.575156017358171e-05, + "loss": 3.0631, + "step": 4360 + }, + { + "epoch": 0.32250922509225094, + "grad_norm": 0.5821093320846558, + "learning_rate": 8.566139391759378e-05, + "loss": 3.0793, + "step": 4370 + }, + { + "epoch": 0.32324723247232473, + "grad_norm": 0.5188676118850708, + "learning_rate": 8.557099099651047e-05, + "loss": 3.086, + "step": 4380 + }, + { + "epoch": 0.3239852398523985, + "grad_norm": 0.5117591023445129, + "learning_rate": 8.548035201028636e-05, + "loss": 3.1174, + "step": 4390 + }, + { + "epoch": 0.3247232472324723, + "grad_norm": 0.48335784673690796, + "learning_rate": 8.538947756044261e-05, + "loss": 2.9864, + "step": 4400 + }, + { + "epoch": 0.3254612546125461, + "grad_norm": 0.5281744599342346, + "learning_rate": 8.52983682500631e-05, + "loss": 3.0942, + "step": 4410 + }, + { + "epoch": 0.32619926199261995, + "grad_norm": 0.4935998022556305, + "learning_rate": 8.520702468379028e-05, + "loss": 3.0716, + "step": 4420 + }, + { + "epoch": 0.32693726937269374, + "grad_norm": 0.4817652404308319, + "learning_rate": 8.511544746782125e-05, + "loss": 3.0314, + "step": 4430 + }, + { + "epoch": 0.32767527675276753, + "grad_norm": 0.49610570073127747, + "learning_rate": 8.502363720990374e-05, + "loss": 2.9699, + "step": 4440 + }, + { + "epoch": 0.3284132841328413, + "grad_norm": 0.5101500749588013, + "learning_rate": 8.493159451933203e-05, + "loss": 2.9248, + "step": 4450 + }, + { + "epoch": 0.3291512915129151, + "grad_norm": 0.48433801531791687, + "learning_rate": 8.483932000694295e-05, + "loss": 3.0812, + "step": 4460 + }, + { + "epoch": 0.3298892988929889, + "grad_norm": 0.4775218665599823, + "learning_rate": 
8.474681428511177e-05, + "loss": 2.986, + "step": 4470 + }, + { + "epoch": 0.33062730627306275, + "grad_norm": 0.49710339307785034, + "learning_rate": 8.465407796774816e-05, + "loss": 3.0331, + "step": 4480 + }, + { + "epoch": 0.33136531365313654, + "grad_norm": 0.5008261799812317, + "learning_rate": 8.456111167029219e-05, + "loss": 3.0763, + "step": 4490 + }, + { + "epoch": 0.33210332103321033, + "grad_norm": 0.5350390672683716, + "learning_rate": 8.446791600971012e-05, + "loss": 3.0238, + "step": 4500 + }, + { + "epoch": 0.3328413284132841, + "grad_norm": 0.5100720524787903, + "learning_rate": 8.43744916044904e-05, + "loss": 3.1137, + "step": 4510 + }, + { + "epoch": 0.3335793357933579, + "grad_norm": 0.5103323459625244, + "learning_rate": 8.428083907463951e-05, + "loss": 3.0862, + "step": 4520 + }, + { + "epoch": 0.33431734317343176, + "grad_norm": 0.563750147819519, + "learning_rate": 8.418695904167788e-05, + "loss": 3.0551, + "step": 4530 + }, + { + "epoch": 0.33505535055350555, + "grad_norm": 0.4909681975841522, + "learning_rate": 8.40928521286358e-05, + "loss": 2.9769, + "step": 4540 + }, + { + "epoch": 0.33579335793357934, + "grad_norm": 0.5330002903938293, + "learning_rate": 8.399851896004913e-05, + "loss": 3.046, + "step": 4550 + }, + { + "epoch": 0.33653136531365313, + "grad_norm": 0.49845483899116516, + "learning_rate": 8.390396016195537e-05, + "loss": 3.0318, + "step": 4560 + }, + { + "epoch": 0.3372693726937269, + "grad_norm": 0.4647519290447235, + "learning_rate": 8.380917636188934e-05, + "loss": 3.0097, + "step": 4570 + }, + { + "epoch": 0.3380073800738007, + "grad_norm": 0.4947097599506378, + "learning_rate": 8.371416818887908e-05, + "loss": 3.0244, + "step": 4580 + }, + { + "epoch": 0.33874538745387456, + "grad_norm": 0.514033854007721, + "learning_rate": 8.361893627344168e-05, + "loss": 3.0259, + "step": 4590 + }, + { + "epoch": 0.33948339483394835, + "grad_norm": 0.5403528213500977, + "learning_rate": 8.35234812475791e-05, + "loss": 3.0071, + "step": 4600 + }, + { + "epoch": 0.34022140221402214, + "grad_norm": 0.495109498500824, + "learning_rate": 8.342780374477396e-05, + "loss": 3.058, + "step": 4610 + }, + { + "epoch": 0.34095940959409593, + "grad_norm": 0.48301902413368225, + "learning_rate": 8.33319043999853e-05, + "loss": 3.0686, + "step": 4620 + }, + { + "epoch": 0.3416974169741697, + "grad_norm": 0.4977583885192871, + "learning_rate": 8.323578384964444e-05, + "loss": 2.9218, + "step": 4630 + }, + { + "epoch": 0.34243542435424357, + "grad_norm": 0.4929274022579193, + "learning_rate": 8.313944273165069e-05, + "loss": 3.0489, + "step": 4640 + }, + { + "epoch": 0.34317343173431736, + "grad_norm": 0.5092618465423584, + "learning_rate": 8.304288168536718e-05, + "loss": 2.9915, + "step": 4650 + }, + { + "epoch": 0.34391143911439115, + "grad_norm": 0.48645535111427307, + "learning_rate": 8.294610135161658e-05, + "loss": 2.9596, + "step": 4660 + }, + { + "epoch": 0.34464944649446494, + "grad_norm": 0.5053686499595642, + "learning_rate": 8.284910237267682e-05, + "loss": 3.0022, + "step": 4670 + }, + { + "epoch": 0.34538745387453873, + "grad_norm": 0.5074572563171387, + "learning_rate": 8.275188539227686e-05, + "loss": 3.0701, + "step": 4680 + }, + { + "epoch": 0.3461254612546125, + "grad_norm": 0.5153145790100098, + "learning_rate": 8.265445105559247e-05, + "loss": 2.9951, + "step": 4690 + }, + { + "epoch": 0.34686346863468637, + "grad_norm": 0.5247951745986938, + "learning_rate": 8.255680000924184e-05, + "loss": 3.0631, + "step": 4700 + }, + { + "epoch": 
0.34760147601476016, + "grad_norm": 0.4750431180000305, + "learning_rate": 8.245893290128136e-05, + "loss": 3.0917, + "step": 4710 + }, + { + "epoch": 0.34833948339483395, + "grad_norm": 0.4787590503692627, + "learning_rate": 8.236085038120129e-05, + "loss": 3.0494, + "step": 4720 + }, + { + "epoch": 0.34907749077490774, + "grad_norm": 0.49496400356292725, + "learning_rate": 8.22625530999215e-05, + "loss": 3.0276, + "step": 4730 + }, + { + "epoch": 0.34981549815498153, + "grad_norm": 0.517461359500885, + "learning_rate": 8.216404170978707e-05, + "loss": 2.9682, + "step": 4740 + }, + { + "epoch": 0.3505535055350554, + "grad_norm": 0.4839133024215698, + "learning_rate": 8.206531686456403e-05, + "loss": 3.0396, + "step": 4750 + }, + { + "epoch": 0.35129151291512917, + "grad_norm": 0.5224480628967285, + "learning_rate": 8.196637921943496e-05, + "loss": 3.048, + "step": 4760 + }, + { + "epoch": 0.35202952029520296, + "grad_norm": 0.5209102034568787, + "learning_rate": 8.186722943099472e-05, + "loss": 3.0128, + "step": 4770 + }, + { + "epoch": 0.35276752767527675, + "grad_norm": 0.480421781539917, + "learning_rate": 8.176786815724601e-05, + "loss": 3.0139, + "step": 4780 + }, + { + "epoch": 0.35350553505535054, + "grad_norm": 0.4676721692085266, + "learning_rate": 8.166829605759507e-05, + "loss": 2.8988, + "step": 4790 + }, + { + "epoch": 0.35424354243542433, + "grad_norm": 0.5178680419921875, + "learning_rate": 8.156851379284729e-05, + "loss": 3.0074, + "step": 4800 + }, + { + "epoch": 0.3549815498154982, + "grad_norm": 0.5426033735275269, + "learning_rate": 8.146852202520277e-05, + "loss": 2.9998, + "step": 4810 + }, + { + "epoch": 0.35571955719557197, + "grad_norm": 0.4766799807548523, + "learning_rate": 8.136832141825196e-05, + "loss": 3.0129, + "step": 4820 + }, + { + "epoch": 0.35645756457564576, + "grad_norm": 0.49461451172828674, + "learning_rate": 8.12679126369713e-05, + "loss": 3.0726, + "step": 4830 + }, + { + "epoch": 0.35719557195571955, + "grad_norm": 0.4843361973762512, + "learning_rate": 8.116729634771876e-05, + "loss": 2.9953, + "step": 4840 + }, + { + "epoch": 0.35793357933579334, + "grad_norm": 0.5127764344215393, + "learning_rate": 8.106647321822943e-05, + "loss": 3.0525, + "step": 4850 + }, + { + "epoch": 0.3586715867158672, + "grad_norm": 0.4938580393791199, + "learning_rate": 8.096544391761103e-05, + "loss": 2.975, + "step": 4860 + }, + { + "epoch": 0.359409594095941, + "grad_norm": 0.4944118559360504, + "learning_rate": 8.08642091163396e-05, + "loss": 3.0102, + "step": 4870 + }, + { + "epoch": 0.36014760147601477, + "grad_norm": 0.4949988126754761, + "learning_rate": 8.076276948625494e-05, + "loss": 2.9756, + "step": 4880 + }, + { + "epoch": 0.36088560885608856, + "grad_norm": 0.5549206733703613, + "learning_rate": 8.066112570055621e-05, + "loss": 3.0896, + "step": 4890 + }, + { + "epoch": 0.36162361623616235, + "grad_norm": 0.4933255910873413, + "learning_rate": 8.055927843379738e-05, + "loss": 3.036, + "step": 4900 + }, + { + "epoch": 0.36236162361623614, + "grad_norm": 0.5120234489440918, + "learning_rate": 8.04572283618829e-05, + "loss": 3.0661, + "step": 4910 + }, + { + "epoch": 0.36309963099631, + "grad_norm": 0.47579410672187805, + "learning_rate": 8.035497616206302e-05, + "loss": 2.9517, + "step": 4920 + }, + { + "epoch": 0.3638376383763838, + "grad_norm": 0.47006312012672424, + "learning_rate": 8.025252251292949e-05, + "loss": 2.9931, + "step": 4930 + }, + { + "epoch": 0.36457564575645757, + "grad_norm": 0.498418927192688, + "learning_rate": 
8.014986809441094e-05, + "loss": 2.9749, + "step": 4940 + }, + { + "epoch": 0.36531365313653136, + "grad_norm": 0.4772182106971741, + "learning_rate": 8.00470135877684e-05, + "loss": 2.9708, + "step": 4950 + }, + { + "epoch": 0.36605166051660515, + "grad_norm": 0.47467556595802307, + "learning_rate": 7.994395967559076e-05, + "loss": 2.9898, + "step": 4960 + }, + { + "epoch": 0.36678966789667894, + "grad_norm": 0.509661078453064, + "learning_rate": 7.984070704179026e-05, + "loss": 3.0238, + "step": 4970 + }, + { + "epoch": 0.3675276752767528, + "grad_norm": 0.47225892543792725, + "learning_rate": 7.973725637159794e-05, + "loss": 3.0066, + "step": 4980 + }, + { + "epoch": 0.3682656826568266, + "grad_norm": 0.5211546421051025, + "learning_rate": 7.963360835155915e-05, + "loss": 3.0896, + "step": 4990 + }, + { + "epoch": 0.36900369003690037, + "grad_norm": 0.4817075729370117, + "learning_rate": 7.952976366952888e-05, + "loss": 3.0348, + "step": 5000 + }, + { + "epoch": 0.36974169741697416, + "grad_norm": 0.4747537672519684, + "learning_rate": 7.942572301466727e-05, + "loss": 3.0146, + "step": 5010 + }, + { + "epoch": 0.37047970479704795, + "grad_norm": 0.5026445984840393, + "learning_rate": 7.932148707743503e-05, + "loss": 2.9681, + "step": 5020 + }, + { + "epoch": 0.3712177121771218, + "grad_norm": 0.47187340259552, + "learning_rate": 7.921705654958886e-05, + "loss": 3.0161, + "step": 5030 + }, + { + "epoch": 0.3719557195571956, + "grad_norm": 0.5039234161376953, + "learning_rate": 7.911243212417687e-05, + "loss": 3.0002, + "step": 5040 + }, + { + "epoch": 0.3726937269372694, + "grad_norm": 0.481448233127594, + "learning_rate": 7.900761449553394e-05, + "loss": 2.9907, + "step": 5050 + }, + { + "epoch": 0.37343173431734317, + "grad_norm": 0.4844491481781006, + "learning_rate": 7.890260435927708e-05, + "loss": 3.0501, + "step": 5060 + }, + { + "epoch": 0.37416974169741696, + "grad_norm": 0.502325177192688, + "learning_rate": 7.879740241230098e-05, + "loss": 2.9843, + "step": 5070 + }, + { + "epoch": 0.37490774907749075, + "grad_norm": 0.49289822578430176, + "learning_rate": 7.869200935277317e-05, + "loss": 2.9808, + "step": 5080 + }, + { + "epoch": 0.3756457564575646, + "grad_norm": 0.4960924983024597, + "learning_rate": 7.858642588012957e-05, + "loss": 3.0367, + "step": 5090 + }, + { + "epoch": 0.3763837638376384, + "grad_norm": 0.4961390495300293, + "learning_rate": 7.848065269506968e-05, + "loss": 3.0371, + "step": 5100 + }, + { + "epoch": 0.3771217712177122, + "grad_norm": 0.5095449090003967, + "learning_rate": 7.837469049955211e-05, + "loss": 2.9584, + "step": 5110 + }, + { + "epoch": 0.37785977859778597, + "grad_norm": 0.5364798307418823, + "learning_rate": 7.826853999678979e-05, + "loss": 3.0194, + "step": 5120 + }, + { + "epoch": 0.37859778597785976, + "grad_norm": 0.47735193371772766, + "learning_rate": 7.816220189124526e-05, + "loss": 2.9603, + "step": 5130 + }, + { + "epoch": 0.3793357933579336, + "grad_norm": 0.47760894894599915, + "learning_rate": 7.805567688862626e-05, + "loss": 3.0335, + "step": 5140 + }, + { + "epoch": 0.3800738007380074, + "grad_norm": 0.4874935448169708, + "learning_rate": 7.794896569588066e-05, + "loss": 3.0274, + "step": 5150 + }, + { + "epoch": 0.3808118081180812, + "grad_norm": 0.48565617203712463, + "learning_rate": 7.784206902119213e-05, + "loss": 3.0081, + "step": 5160 + }, + { + "epoch": 0.381549815498155, + "grad_norm": 0.513862133026123, + "learning_rate": 7.773498757397522e-05, + "loss": 2.9605, + "step": 5170 + }, + { + "epoch": 
0.38228782287822877, + "grad_norm": 0.4750123918056488, + "learning_rate": 7.762772206487066e-05, + "loss": 3.0109, + "step": 5180 + }, + { + "epoch": 0.38302583025830256, + "grad_norm": 0.4761565327644348, + "learning_rate": 7.75202732057408e-05, + "loss": 3.0137, + "step": 5190 + }, + { + "epoch": 0.3837638376383764, + "grad_norm": 0.5001286864280701, + "learning_rate": 7.741264170966472e-05, + "loss": 3.0493, + "step": 5200 + }, + { + "epoch": 0.3845018450184502, + "grad_norm": 0.48891499638557434, + "learning_rate": 7.730482829093358e-05, + "loss": 3.0333, + "step": 5210 + }, + { + "epoch": 0.385239852398524, + "grad_norm": 0.4714498221874237, + "learning_rate": 7.719683366504586e-05, + "loss": 2.9868, + "step": 5220 + }, + { + "epoch": 0.3859778597785978, + "grad_norm": 0.4761471748352051, + "learning_rate": 7.708865854870258e-05, + "loss": 3.0351, + "step": 5230 + }, + { + "epoch": 0.38671586715867157, + "grad_norm": 0.47278621792793274, + "learning_rate": 7.698030365980265e-05, + "loss": 3.0056, + "step": 5240 + }, + { + "epoch": 0.3874538745387454, + "grad_norm": 0.502041220664978, + "learning_rate": 7.687176971743796e-05, + "loss": 3.013, + "step": 5250 + }, + { + "epoch": 0.3881918819188192, + "grad_norm": 0.4808847904205322, + "learning_rate": 7.676305744188871e-05, + "loss": 3.0363, + "step": 5260 + }, + { + "epoch": 0.388929889298893, + "grad_norm": 0.4782809615135193, + "learning_rate": 7.665416755461859e-05, + "loss": 2.9693, + "step": 5270 + }, + { + "epoch": 0.3896678966789668, + "grad_norm": 0.4984862804412842, + "learning_rate": 7.654510077827003e-05, + "loss": 2.9882, + "step": 5280 + }, + { + "epoch": 0.3904059040590406, + "grad_norm": 0.48033297061920166, + "learning_rate": 7.643585783665931e-05, + "loss": 2.9822, + "step": 5290 + }, + { + "epoch": 0.39114391143911437, + "grad_norm": 0.5328406691551208, + "learning_rate": 7.632643945477193e-05, + "loss": 2.9835, + "step": 5300 + }, + { + "epoch": 0.3918819188191882, + "grad_norm": 0.4741387963294983, + "learning_rate": 7.621684635875756e-05, + "loss": 3.0095, + "step": 5310 + }, + { + "epoch": 0.392619926199262, + "grad_norm": 0.8941669464111328, + "learning_rate": 7.610707927592549e-05, + "loss": 2.9642, + "step": 5320 + }, + { + "epoch": 0.3933579335793358, + "grad_norm": 0.501148521900177, + "learning_rate": 7.59971389347395e-05, + "loss": 2.9973, + "step": 5330 + }, + { + "epoch": 0.3940959409594096, + "grad_norm": 0.4852311611175537, + "learning_rate": 7.588702606481337e-05, + "loss": 3.019, + "step": 5340 + }, + { + "epoch": 0.3948339483394834, + "grad_norm": 0.44878798723220825, + "learning_rate": 7.577674139690572e-05, + "loss": 2.9582, + "step": 5350 + }, + { + "epoch": 0.3955719557195572, + "grad_norm": 0.4837028384208679, + "learning_rate": 7.566628566291536e-05, + "loss": 2.9865, + "step": 5360 + }, + { + "epoch": 0.396309963099631, + "grad_norm": 0.5781135559082031, + "learning_rate": 7.555565959587638e-05, + "loss": 2.9709, + "step": 5370 + }, + { + "epoch": 0.3970479704797048, + "grad_norm": 0.4646313786506653, + "learning_rate": 7.544486392995324e-05, + "loss": 3.0123, + "step": 5380 + }, + { + "epoch": 0.3977859778597786, + "grad_norm": 0.45897990465164185, + "learning_rate": 7.533389940043598e-05, + "loss": 2.9744, + "step": 5390 + }, + { + "epoch": 0.3985239852398524, + "grad_norm": 0.47609013319015503, + "learning_rate": 7.522276674373525e-05, + "loss": 2.9654, + "step": 5400 + }, + { + "epoch": 0.3992619926199262, + "grad_norm": 0.48847806453704834, + "learning_rate": 7.51114666973775e-05, + 
"loss": 3.0279, + "step": 5410 + }, + { + "epoch": 0.4, + "grad_norm": 0.5017388463020325, + "learning_rate": 7.500000000000001e-05, + "loss": 2.9632, + "step": 5420 + }, + { + "epoch": 0.4007380073800738, + "grad_norm": 0.49840694665908813, + "learning_rate": 7.488836739134608e-05, + "loss": 3.0054, + "step": 5430 + }, + { + "epoch": 0.4014760147601476, + "grad_norm": 0.48498594760894775, + "learning_rate": 7.477656961226007e-05, + "loss": 2.9744, + "step": 5440 + }, + { + "epoch": 0.4022140221402214, + "grad_norm": 0.49641212821006775, + "learning_rate": 7.466460740468245e-05, + "loss": 3.0054, + "step": 5450 + }, + { + "epoch": 0.4029520295202952, + "grad_norm": 0.47951868176460266, + "learning_rate": 7.455248151164493e-05, + "loss": 2.9506, + "step": 5460 + }, + { + "epoch": 0.40369003690036903, + "grad_norm": 0.5073153972625732, + "learning_rate": 7.444019267726553e-05, + "loss": 2.9172, + "step": 5470 + }, + { + "epoch": 0.4044280442804428, + "grad_norm": 0.48473188281059265, + "learning_rate": 7.432774164674359e-05, + "loss": 2.9388, + "step": 5480 + }, + { + "epoch": 0.4051660516605166, + "grad_norm": 0.4775610566139221, + "learning_rate": 7.421512916635485e-05, + "loss": 3.0088, + "step": 5490 + }, + { + "epoch": 0.4059040590405904, + "grad_norm": 0.5261042714118958, + "learning_rate": 7.410235598344657e-05, + "loss": 2.9721, + "step": 5500 + }, + { + "epoch": 0.4066420664206642, + "grad_norm": 0.45107316970825195, + "learning_rate": 7.398942284643241e-05, + "loss": 2.9521, + "step": 5510 + }, + { + "epoch": 0.407380073800738, + "grad_norm": 0.46772444248199463, + "learning_rate": 7.387633050478766e-05, + "loss": 2.9259, + "step": 5520 + }, + { + "epoch": 0.40811808118081183, + "grad_norm": 0.4604153633117676, + "learning_rate": 7.376307970904408e-05, + "loss": 3.082, + "step": 5530 + }, + { + "epoch": 0.4088560885608856, + "grad_norm": 0.47096291184425354, + "learning_rate": 7.364967121078502e-05, + "loss": 2.9186, + "step": 5540 + }, + { + "epoch": 0.4095940959409594, + "grad_norm": 0.4761073589324951, + "learning_rate": 7.353610576264045e-05, + "loss": 3.028, + "step": 5550 + }, + { + "epoch": 0.4103321033210332, + "grad_norm": 0.5043940544128418, + "learning_rate": 7.34223841182819e-05, + "loss": 2.9259, + "step": 5560 + }, + { + "epoch": 0.411070110701107, + "grad_norm": 0.48511525988578796, + "learning_rate": 7.33085070324175e-05, + "loss": 2.9453, + "step": 5570 + }, + { + "epoch": 0.4118081180811808, + "grad_norm": 0.4717444181442261, + "learning_rate": 7.319447526078696e-05, + "loss": 3.0091, + "step": 5580 + }, + { + "epoch": 0.41254612546125463, + "grad_norm": 0.44939619302749634, + "learning_rate": 7.308028956015653e-05, + "loss": 2.9809, + "step": 5590 + }, + { + "epoch": 0.4132841328413284, + "grad_norm": 0.46631982922554016, + "learning_rate": 7.296595068831406e-05, + "loss": 2.9969, + "step": 5600 + }, + { + "epoch": 0.4140221402214022, + "grad_norm": 0.4884931743144989, + "learning_rate": 7.285145940406386e-05, + "loss": 2.9521, + "step": 5610 + }, + { + "epoch": 0.414760147601476, + "grad_norm": 0.4892655611038208, + "learning_rate": 7.273681646722173e-05, + "loss": 2.9666, + "step": 5620 + }, + { + "epoch": 0.4154981549815498, + "grad_norm": 0.4869326651096344, + "learning_rate": 7.262202263860988e-05, + "loss": 2.9618, + "step": 5630 + }, + { + "epoch": 0.41623616236162364, + "grad_norm": 0.48076122999191284, + "learning_rate": 7.2507078680052e-05, + "loss": 2.9113, + "step": 5640 + }, + { + "epoch": 0.41697416974169743, + "grad_norm": 0.46369293332099915, + 
"learning_rate": 7.239198535436801e-05, + "loss": 2.9309, + "step": 5650 + }, + { + "epoch": 0.4177121771217712, + "grad_norm": 0.49062806367874146, + "learning_rate": 7.227674342536913e-05, + "loss": 3.0057, + "step": 5660 + }, + { + "epoch": 0.418450184501845, + "grad_norm": 0.4727836847305298, + "learning_rate": 7.216135365785279e-05, + "loss": 3.0034, + "step": 5670 + }, + { + "epoch": 0.4191881918819188, + "grad_norm": 0.5185651779174805, + "learning_rate": 7.20458168175975e-05, + "loss": 2.9296, + "step": 5680 + }, + { + "epoch": 0.4199261992619926, + "grad_norm": 0.4758572280406952, + "learning_rate": 7.193013367135792e-05, + "loss": 2.9805, + "step": 5690 + }, + { + "epoch": 0.42066420664206644, + "grad_norm": 0.507834255695343, + "learning_rate": 7.181430498685954e-05, + "loss": 2.9829, + "step": 5700 + }, + { + "epoch": 0.42140221402214023, + "grad_norm": 0.48527729511260986, + "learning_rate": 7.169833153279375e-05, + "loss": 2.9951, + "step": 5710 + }, + { + "epoch": 0.422140221402214, + "grad_norm": 0.5018925070762634, + "learning_rate": 7.158221407881272e-05, + "loss": 3.0251, + "step": 5720 + }, + { + "epoch": 0.4228782287822878, + "grad_norm": 0.5182327032089233, + "learning_rate": 7.146595339552422e-05, + "loss": 2.9954, + "step": 5730 + }, + { + "epoch": 0.4236162361623616, + "grad_norm": 0.5015000104904175, + "learning_rate": 7.134955025448663e-05, + "loss": 2.9285, + "step": 5740 + }, + { + "epoch": 0.42435424354243545, + "grad_norm": 0.47007137537002563, + "learning_rate": 7.123300542820366e-05, + "loss": 2.923, + "step": 5750 + }, + { + "epoch": 0.42509225092250924, + "grad_norm": 0.4987011253833771, + "learning_rate": 7.111631969011938e-05, + "loss": 2.9555, + "step": 5760 + }, + { + "epoch": 0.42583025830258303, + "grad_norm": 0.4811478853225708, + "learning_rate": 7.099949381461296e-05, + "loss": 2.9797, + "step": 5770 + }, + { + "epoch": 0.4265682656826568, + "grad_norm": 0.4753568470478058, + "learning_rate": 7.08825285769936e-05, + "loss": 2.9137, + "step": 5780 + }, + { + "epoch": 0.4273062730627306, + "grad_norm": 0.46175628900527954, + "learning_rate": 7.076542475349537e-05, + "loss": 2.9291, + "step": 5790 + }, + { + "epoch": 0.4280442804428044, + "grad_norm": 0.5033062696456909, + "learning_rate": 7.06481831212721e-05, + "loss": 2.9927, + "step": 5800 + }, + { + "epoch": 0.42878228782287825, + "grad_norm": 0.4942483603954315, + "learning_rate": 7.05308044583921e-05, + "loss": 2.8999, + "step": 5810 + }, + { + "epoch": 0.42952029520295204, + "grad_norm": 0.46212270855903625, + "learning_rate": 7.041328954383316e-05, + "loss": 2.9618, + "step": 5820 + }, + { + "epoch": 0.43025830258302583, + "grad_norm": 0.4895878732204437, + "learning_rate": 7.029563915747722e-05, + "loss": 3.0415, + "step": 5830 + }, + { + "epoch": 0.4309963099630996, + "grad_norm": 0.48732495307922363, + "learning_rate": 7.017785408010533e-05, + "loss": 2.9275, + "step": 5840 + }, + { + "epoch": 0.4317343173431734, + "grad_norm": 0.49087876081466675, + "learning_rate": 7.005993509339241e-05, + "loss": 2.981, + "step": 5850 + }, + { + "epoch": 0.43247232472324726, + "grad_norm": 0.5266060829162598, + "learning_rate": 6.9941882979902e-05, + "loss": 2.8859, + "step": 5860 + }, + { + "epoch": 0.43321033210332105, + "grad_norm": 0.45862722396850586, + "learning_rate": 6.982369852308124e-05, + "loss": 2.9225, + "step": 5870 + }, + { + "epoch": 0.43394833948339484, + "grad_norm": 0.5097654461860657, + "learning_rate": 6.97053825072554e-05, + "loss": 2.9179, + "step": 5880 + }, + { + "epoch": 
0.43468634686346863, + "grad_norm": 0.5156700611114502, + "learning_rate": 6.958693571762301e-05, + "loss": 3.0092, + "step": 5890 + }, + { + "epoch": 0.4354243542435424, + "grad_norm": 0.4698309898376465, + "learning_rate": 6.946835894025037e-05, + "loss": 2.8776, + "step": 5900 + }, + { + "epoch": 0.4361623616236162, + "grad_norm": 0.4787076711654663, + "learning_rate": 6.934965296206645e-05, + "loss": 2.9759, + "step": 5910 + }, + { + "epoch": 0.43690036900369006, + "grad_norm": 0.4753543734550476, + "learning_rate": 6.923081857085766e-05, + "loss": 3.0012, + "step": 5920 + }, + { + "epoch": 0.43763837638376385, + "grad_norm": 0.4781608283519745, + "learning_rate": 6.911185655526263e-05, + "loss": 2.9636, + "step": 5930 + }, + { + "epoch": 0.43837638376383764, + "grad_norm": 0.46679866313934326, + "learning_rate": 6.899276770476695e-05, + "loss": 2.9666, + "step": 5940 + }, + { + "epoch": 0.43911439114391143, + "grad_norm": 0.4817095100879669, + "learning_rate": 6.887355280969796e-05, + "loss": 2.9268, + "step": 5950 + }, + { + "epoch": 0.4398523985239852, + "grad_norm": 0.46391561627388, + "learning_rate": 6.875421266121946e-05, + "loss": 2.9796, + "step": 5960 + }, + { + "epoch": 0.44059040590405907, + "grad_norm": 0.4704035222530365, + "learning_rate": 6.86347480513265e-05, + "loss": 2.93, + "step": 5970 + }, + { + "epoch": 0.44132841328413286, + "grad_norm": 0.5005739331245422, + "learning_rate": 6.851515977284013e-05, + "loss": 2.9329, + "step": 5980 + }, + { + "epoch": 0.44206642066420665, + "grad_norm": 0.5069407224655151, + "learning_rate": 6.839544861940214e-05, + "loss": 3.0269, + "step": 5990 + }, + { + "epoch": 0.44280442804428044, + "grad_norm": 0.4672479033470154, + "learning_rate": 6.827561538546967e-05, + "loss": 2.9522, + "step": 6000 + }, + { + "epoch": 0.44354243542435423, + "grad_norm": 0.4877452850341797, + "learning_rate": 6.815566086631016e-05, + "loss": 2.9381, + "step": 6010 + }, + { + "epoch": 0.444280442804428, + "grad_norm": 0.4852764308452606, + "learning_rate": 6.80355858579959e-05, + "loss": 2.9431, + "step": 6020 + }, + { + "epoch": 0.44501845018450187, + "grad_norm": 0.4775632321834564, + "learning_rate": 6.791539115739879e-05, + "loss": 2.9923, + "step": 6030 + }, + { + "epoch": 0.44575645756457566, + "grad_norm": 0.48804882168769836, + "learning_rate": 6.779507756218509e-05, + "loss": 3.0321, + "step": 6040 + }, + { + "epoch": 0.44649446494464945, + "grad_norm": 0.4770827293395996, + "learning_rate": 6.76746458708101e-05, + "loss": 3.0004, + "step": 6050 + }, + { + "epoch": 0.44723247232472324, + "grad_norm": 0.47312870621681213, + "learning_rate": 6.75540968825128e-05, + "loss": 2.9975, + "step": 6060 + }, + { + "epoch": 0.44797047970479703, + "grad_norm": 0.48013314604759216, + "learning_rate": 6.74334313973107e-05, + "loss": 2.9666, + "step": 6070 + }, + { + "epoch": 0.4487084870848708, + "grad_norm": 0.4521431624889374, + "learning_rate": 6.731265021599436e-05, + "loss": 2.8592, + "step": 6080 + }, + { + "epoch": 0.44944649446494467, + "grad_norm": 0.4653100073337555, + "learning_rate": 6.719175414012219e-05, + "loss": 2.9367, + "step": 6090 + }, + { + "epoch": 0.45018450184501846, + "grad_norm": 0.5198903679847717, + "learning_rate": 6.707074397201508e-05, + "loss": 3.014, + "step": 6100 + }, + { + "epoch": 0.45092250922509225, + "grad_norm": 0.4655381441116333, + "learning_rate": 6.694962051475107e-05, + "loss": 2.9422, + "step": 6110 + }, + { + "epoch": 0.45166051660516604, + "grad_norm": 0.4614551067352295, + "learning_rate": 
6.682838457216009e-05, + "loss": 2.9474, + "step": 6120 + }, + { + "epoch": 0.45239852398523983, + "grad_norm": 0.4937768876552582, + "learning_rate": 6.67070369488185e-05, + "loss": 2.8953, + "step": 6130 + }, + { + "epoch": 0.4531365313653137, + "grad_norm": 0.4759802222251892, + "learning_rate": 6.65855784500439e-05, + "loss": 2.9553, + "step": 6140 + }, + { + "epoch": 0.45387453874538747, + "grad_norm": 0.519924521446228, + "learning_rate": 6.646400988188964e-05, + "loss": 2.8839, + "step": 6150 + }, + { + "epoch": 0.45461254612546126, + "grad_norm": 0.46175694465637207, + "learning_rate": 6.63423320511396e-05, + "loss": 2.9878, + "step": 6160 + }, + { + "epoch": 0.45535055350553505, + "grad_norm": 0.48847445845603943, + "learning_rate": 6.622054576530274e-05, + "loss": 2.9601, + "step": 6170 + }, + { + "epoch": 0.45608856088560884, + "grad_norm": 0.46752119064331055, + "learning_rate": 6.609865183260778e-05, + "loss": 2.9375, + "step": 6180 + }, + { + "epoch": 0.45682656826568263, + "grad_norm": 0.48789575695991516, + "learning_rate": 6.597665106199783e-05, + "loss": 2.9675, + "step": 6190 + }, + { + "epoch": 0.4575645756457565, + "grad_norm": 0.46002650260925293, + "learning_rate": 6.585454426312506e-05, + "loss": 2.9194, + "step": 6200 + }, + { + "epoch": 0.45830258302583027, + "grad_norm": 0.4882054924964905, + "learning_rate": 6.573233224634524e-05, + "loss": 2.931, + "step": 6210 + }, + { + "epoch": 0.45904059040590406, + "grad_norm": 0.4962427318096161, + "learning_rate": 6.561001582271245e-05, + "loss": 2.9639, + "step": 6220 + }, + { + "epoch": 0.45977859778597785, + "grad_norm": 0.47860512137413025, + "learning_rate": 6.548759580397363e-05, + "loss": 2.9726, + "step": 6230 + }, + { + "epoch": 0.46051660516605164, + "grad_norm": 0.4823954701423645, + "learning_rate": 6.536507300256327e-05, + "loss": 2.9363, + "step": 6240 + }, + { + "epoch": 0.4612546125461255, + "grad_norm": 0.46530622243881226, + "learning_rate": 6.524244823159794e-05, + "loss": 2.9696, + "step": 6250 + }, + { + "epoch": 0.4619926199261993, + "grad_norm": 0.4861395061016083, + "learning_rate": 6.511972230487091e-05, + "loss": 2.9816, + "step": 6260 + }, + { + "epoch": 0.46273062730627307, + "grad_norm": 0.47099757194519043, + "learning_rate": 6.499689603684682e-05, + "loss": 2.8812, + "step": 6270 + }, + { + "epoch": 0.46346863468634686, + "grad_norm": 0.47105422616004944, + "learning_rate": 6.487397024265616e-05, + "loss": 2.8715, + "step": 6280 + }, + { + "epoch": 0.46420664206642065, + "grad_norm": 0.4647127091884613, + "learning_rate": 6.475094573808993e-05, + "loss": 2.972, + "step": 6290 + }, + { + "epoch": 0.46494464944649444, + "grad_norm": 0.4713263213634491, + "learning_rate": 6.462782333959429e-05, + "loss": 2.9297, + "step": 6300 + }, + { + "epoch": 0.4656826568265683, + "grad_norm": 0.4704754650592804, + "learning_rate": 6.450460386426495e-05, + "loss": 2.9489, + "step": 6310 + }, + { + "epoch": 0.4664206642066421, + "grad_norm": 0.49764499068260193, + "learning_rate": 6.438128812984199e-05, + "loss": 2.8814, + "step": 6320 + }, + { + "epoch": 0.46715867158671587, + "grad_norm": 0.46612176299095154, + "learning_rate": 6.425787695470419e-05, + "loss": 2.9663, + "step": 6330 + }, + { + "epoch": 0.46789667896678966, + "grad_norm": 0.46676209568977356, + "learning_rate": 6.41343711578638e-05, + "loss": 2.9843, + "step": 6340 + }, + { + "epoch": 0.46863468634686345, + "grad_norm": 0.45879995822906494, + "learning_rate": 6.401077155896099e-05, + "loss": 2.8991, + "step": 6350 + }, + { + "epoch": 
0.4693726937269373, + "grad_norm": 0.4595896303653717, + "learning_rate": 6.388707897825846e-05, + "loss": 2.9603, + "step": 6360 + }, + { + "epoch": 0.4701107011070111, + "grad_norm": 0.47197359800338745, + "learning_rate": 6.376329423663596e-05, + "loss": 3.0058, + "step": 6370 + }, + { + "epoch": 0.4708487084870849, + "grad_norm": 0.4487576186656952, + "learning_rate": 6.363941815558484e-05, + "loss": 2.9126, + "step": 6380 + }, + { + "epoch": 0.47158671586715867, + "grad_norm": 0.45560458302497864, + "learning_rate": 6.35154515572027e-05, + "loss": 2.9979, + "step": 6390 + }, + { + "epoch": 0.47232472324723246, + "grad_norm": 0.4601997435092926, + "learning_rate": 6.339139526418778e-05, + "loss": 2.8166, + "step": 6400 + }, + { + "epoch": 0.47306273062730625, + "grad_norm": 0.48877766728401184, + "learning_rate": 6.32672500998336e-05, + "loss": 2.8798, + "step": 6410 + }, + { + "epoch": 0.4738007380073801, + "grad_norm": 0.4835923910140991, + "learning_rate": 6.314301688802347e-05, + "loss": 2.9273, + "step": 6420 + }, + { + "epoch": 0.4745387453874539, + "grad_norm": 0.465264230966568, + "learning_rate": 6.301869645322498e-05, + "loss": 2.9399, + "step": 6430 + }, + { + "epoch": 0.4752767527675277, + "grad_norm": 0.49252355098724365, + "learning_rate": 6.289428962048467e-05, + "loss": 2.9608, + "step": 6440 + }, + { + "epoch": 0.47601476014760147, + "grad_norm": 0.48788875341415405, + "learning_rate": 6.276979721542239e-05, + "loss": 2.9896, + "step": 6450 + }, + { + "epoch": 0.47675276752767526, + "grad_norm": 0.4745902121067047, + "learning_rate": 6.264522006422586e-05, + "loss": 2.9076, + "step": 6460 + }, + { + "epoch": 0.4774907749077491, + "grad_norm": 0.47580885887145996, + "learning_rate": 6.252055899364525e-05, + "loss": 2.899, + "step": 6470 + }, + { + "epoch": 0.4782287822878229, + "grad_norm": 0.47672221064567566, + "learning_rate": 6.239581483098766e-05, + "loss": 2.9338, + "step": 6480 + }, + { + "epoch": 0.4789667896678967, + "grad_norm": 0.46901679039001465, + "learning_rate": 6.227098840411166e-05, + "loss": 2.9081, + "step": 6490 + }, + { + "epoch": 0.4797047970479705, + "grad_norm": 0.45821747183799744, + "learning_rate": 6.214608054142167e-05, + "loss": 2.9717, + "step": 6500 + }, + { + "epoch": 0.48044280442804427, + "grad_norm": 0.457815945148468, + "learning_rate": 6.202109207186263e-05, + "loss": 2.9594, + "step": 6510 + }, + { + "epoch": 0.48118081180811806, + "grad_norm": 0.45802658796310425, + "learning_rate": 6.189602382491439e-05, + "loss": 2.958, + "step": 6520 + }, + { + "epoch": 0.4819188191881919, + "grad_norm": 0.47702470421791077, + "learning_rate": 6.177087663058626e-05, + "loss": 2.9481, + "step": 6530 + }, + { + "epoch": 0.4826568265682657, + "grad_norm": 0.4765585660934448, + "learning_rate": 6.164565131941147e-05, + "loss": 2.9139, + "step": 6540 + }, + { + "epoch": 0.4833948339483395, + "grad_norm": 0.49875739216804504, + "learning_rate": 6.152034872244166e-05, + "loss": 2.9726, + "step": 6550 + }, + { + "epoch": 0.4841328413284133, + "grad_norm": 0.46083393692970276, + "learning_rate": 6.13949696712414e-05, + "loss": 2.9462, + "step": 6560 + }, + { + "epoch": 0.48487084870848707, + "grad_norm": 0.4647446274757385, + "learning_rate": 6.126951499788261e-05, + "loss": 2.9349, + "step": 6570 + }, + { + "epoch": 0.48560885608856086, + "grad_norm": 0.4930126667022705, + "learning_rate": 6.114398553493908e-05, + "loss": 2.9763, + "step": 6580 + }, + { + "epoch": 0.4863468634686347, + "grad_norm": 0.4873722791671753, + "learning_rate": 
6.1018382115480985e-05, + "loss": 2.9322, + "step": 6590 + }, + { + "epoch": 0.4870848708487085, + "grad_norm": 0.4486652910709381, + "learning_rate": 6.089270557306923e-05, + "loss": 2.8796, + "step": 6600 + }, + { + "epoch": 0.4878228782287823, + "grad_norm": 0.482166588306427, + "learning_rate": 6.076695674175007e-05, + "loss": 2.9542, + "step": 6610 + }, + { + "epoch": 0.4885608856088561, + "grad_norm": 0.4913167953491211, + "learning_rate": 6.0641136456049454e-05, + "loss": 3.0476, + "step": 6620 + }, + { + "epoch": 0.48929889298892987, + "grad_norm": 0.4978322982788086, + "learning_rate": 6.051524555096754e-05, + "loss": 2.8936, + "step": 6630 + }, + { + "epoch": 0.4900369003690037, + "grad_norm": 0.4421325922012329, + "learning_rate": 6.038928486197316e-05, + "loss": 2.9131, + "step": 6640 + }, + { + "epoch": 0.4907749077490775, + "grad_norm": 0.4662306308746338, + "learning_rate": 6.02632552249983e-05, + "loss": 2.8394, + "step": 6650 + }, + { + "epoch": 0.4915129151291513, + "grad_norm": 0.5267830491065979, + "learning_rate": 6.0137157476432424e-05, + "loss": 2.8703, + "step": 6660 + }, + { + "epoch": 0.4922509225092251, + "grad_norm": 0.509088397026062, + "learning_rate": 6.001099245311711e-05, + "loss": 2.9691, + "step": 6670 + }, + { + "epoch": 0.4929889298892989, + "grad_norm": 0.46723711490631104, + "learning_rate": 5.988476099234033e-05, + "loss": 2.9496, + "step": 6680 + }, + { + "epoch": 0.49372693726937267, + "grad_norm": 0.4566686153411865, + "learning_rate": 5.975846393183101e-05, + "loss": 2.8571, + "step": 6690 + }, + { + "epoch": 0.4944649446494465, + "grad_norm": 0.4769027829170227, + "learning_rate": 5.963210210975343e-05, + "loss": 2.898, + "step": 6700 + }, + { + "epoch": 0.4952029520295203, + "grad_norm": 0.4787648320198059, + "learning_rate": 5.95056763647016e-05, + "loss": 2.9649, + "step": 6710 + }, + { + "epoch": 0.4959409594095941, + "grad_norm": 0.45179930329322815, + "learning_rate": 5.9379187535693804e-05, + "loss": 2.9201, + "step": 6720 + }, + { + "epoch": 0.4966789667896679, + "grad_norm": 0.4381027817726135, + "learning_rate": 5.925263646216697e-05, + "loss": 2.9402, + "step": 6730 + }, + { + "epoch": 0.4974169741697417, + "grad_norm": 0.49445804953575134, + "learning_rate": 5.912602398397111e-05, + "loss": 2.9305, + "step": 6740 + }, + { + "epoch": 0.4981549815498155, + "grad_norm": 0.4826495349407196, + "learning_rate": 5.8999350941363726e-05, + "loss": 2.9346, + "step": 6750 + }, + { + "epoch": 0.4988929889298893, + "grad_norm": 0.4974125921726227, + "learning_rate": 5.887261817500427e-05, + "loss": 2.9743, + "step": 6760 + }, + { + "epoch": 0.4996309963099631, + "grad_norm": 0.47447288036346436, + "learning_rate": 5.874582652594854e-05, + "loss": 2.9399, + "step": 6770 + }, + { + "epoch": 0.5003690036900369, + "grad_norm": 0.48605871200561523, + "learning_rate": 5.861897683564312e-05, + "loss": 2.9667, + "step": 6780 + }, + { + "epoch": 0.5011070110701107, + "grad_norm": 0.4562762379646301, + "learning_rate": 5.849206994591976e-05, + "loss": 2.9355, + "step": 6790 + }, + { + "epoch": 0.5018450184501845, + "grad_norm": 0.4724028706550598, + "learning_rate": 5.8365106698989834e-05, + "loss": 2.8938, + "step": 6800 + }, + { + "epoch": 0.5025830258302583, + "grad_norm": 0.4404136538505554, + "learning_rate": 5.82380879374387e-05, + "loss": 2.8332, + "step": 6810 + }, + { + "epoch": 0.5033210332103321, + "grad_norm": 0.4685560464859009, + "learning_rate": 5.8111014504220165e-05, + "loss": 2.9792, + "step": 6820 + }, + { + "epoch": 0.5040590405904058, 
+ "grad_norm": 0.47112590074539185, + "learning_rate": 5.7983887242650846e-05, + "loss": 2.9933, + "step": 6830 + }, + { + "epoch": 0.5047970479704798, + "grad_norm": 0.46272197365760803, + "learning_rate": 5.78567069964046e-05, + "loss": 2.9916, + "step": 6840 + }, + { + "epoch": 0.5055350553505535, + "grad_norm": 0.47110989689826965, + "learning_rate": 5.772947460950688e-05, + "loss": 2.8869, + "step": 6850 + }, + { + "epoch": 0.5062730627306273, + "grad_norm": 0.47916916012763977, + "learning_rate": 5.760219092632924e-05, + "loss": 2.9576, + "step": 6860 + }, + { + "epoch": 0.5070110701107011, + "grad_norm": 0.47247427701950073, + "learning_rate": 5.7474856791583576e-05, + "loss": 2.9433, + "step": 6870 + }, + { + "epoch": 0.5077490774907749, + "grad_norm": 0.4856591820716858, + "learning_rate": 5.7347473050316636e-05, + "loss": 2.983, + "step": 6880 + }, + { + "epoch": 0.5084870848708487, + "grad_norm": 0.4498710036277771, + "learning_rate": 5.722004054790442e-05, + "loss": 2.95, + "step": 6890 + }, + { + "epoch": 0.5092250922509225, + "grad_norm": 0.4407157003879547, + "learning_rate": 5.7092560130046466e-05, + "loss": 2.9004, + "step": 6900 + }, + { + "epoch": 0.5099630996309963, + "grad_norm": 0.4676019847393036, + "learning_rate": 5.696503264276035e-05, + "loss": 2.8584, + "step": 6910 + }, + { + "epoch": 0.5107011070110701, + "grad_norm": 0.44521570205688477, + "learning_rate": 5.683745893237597e-05, + "loss": 2.9214, + "step": 6920 + }, + { + "epoch": 0.5114391143911439, + "grad_norm": 0.4693831503391266, + "learning_rate": 5.670983984553003e-05, + "loss": 2.9721, + "step": 6930 + }, + { + "epoch": 0.5121771217712177, + "grad_norm": 0.43683314323425293, + "learning_rate": 5.6582176229160355e-05, + "loss": 2.8837, + "step": 6940 + }, + { + "epoch": 0.5129151291512916, + "grad_norm": 0.4462457299232483, + "learning_rate": 5.645446893050029e-05, + "loss": 2.8014, + "step": 6950 + }, + { + "epoch": 0.5136531365313654, + "grad_norm": 0.46673473715782166, + "learning_rate": 5.632671879707307e-05, + "loss": 2.8542, + "step": 6960 + }, + { + "epoch": 0.5143911439114391, + "grad_norm": 0.5018209218978882, + "learning_rate": 5.619892667668618e-05, + "loss": 2.9344, + "step": 6970 + }, + { + "epoch": 0.5151291512915129, + "grad_norm": 0.4942212700843811, + "learning_rate": 5.607109341742579e-05, + "loss": 2.9002, + "step": 6980 + }, + { + "epoch": 0.5158671586715867, + "grad_norm": 0.4789501428604126, + "learning_rate": 5.5943219867651086e-05, + "loss": 2.8955, + "step": 6990 + }, + { + "epoch": 0.5166051660516605, + "grad_norm": 0.44573846459388733, + "learning_rate": 5.58153068759886e-05, + "loss": 2.9184, + "step": 7000 + }, + { + "epoch": 0.5173431734317343, + "grad_norm": 0.4906388819217682, + "learning_rate": 5.568735529132665e-05, + "loss": 2.9369, + "step": 7010 + }, + { + "epoch": 0.5180811808118081, + "grad_norm": 0.44844797253608704, + "learning_rate": 5.555936596280966e-05, + "loss": 2.9435, + "step": 7020 + }, + { + "epoch": 0.5188191881918819, + "grad_norm": 0.46517252922058105, + "learning_rate": 5.5431339739832545e-05, + "loss": 2.9933, + "step": 7030 + }, + { + "epoch": 0.5195571955719557, + "grad_norm": 0.4549432396888733, + "learning_rate": 5.530327747203506e-05, + "loss": 2.8739, + "step": 7040 + }, + { + "epoch": 0.5202952029520295, + "grad_norm": 0.47701096534729004, + "learning_rate": 5.51751800092962e-05, + "loss": 2.9088, + "step": 7050 + }, + { + "epoch": 0.5210332103321034, + "grad_norm": 0.489654541015625, + "learning_rate": 5.50470482017285e-05, + "loss": 
2.9574, + "step": 7060 + }, + { + "epoch": 0.5217712177121772, + "grad_norm": 0.4661862850189209, + "learning_rate": 5.491888289967241e-05, + "loss": 2.9482, + "step": 7070 + }, + { + "epoch": 0.522509225092251, + "grad_norm": 0.446463406085968, + "learning_rate": 5.4790684953690706e-05, + "loss": 2.9176, + "step": 7080 + }, + { + "epoch": 0.5232472324723247, + "grad_norm": 0.4751204550266266, + "learning_rate": 5.466245521456278e-05, + "loss": 2.924, + "step": 7090 + }, + { + "epoch": 0.5239852398523985, + "grad_norm": 0.5041395425796509, + "learning_rate": 5.4534194533279e-05, + "loss": 2.8624, + "step": 7100 + }, + { + "epoch": 0.5247232472324723, + "grad_norm": 0.4631516635417938, + "learning_rate": 5.4405903761035124e-05, + "loss": 2.9072, + "step": 7110 + }, + { + "epoch": 0.5254612546125461, + "grad_norm": 0.45753976702690125, + "learning_rate": 5.427758374922658e-05, + "loss": 2.9332, + "step": 7120 + }, + { + "epoch": 0.5261992619926199, + "grad_norm": 0.4684479236602783, + "learning_rate": 5.414923534944283e-05, + "loss": 2.9017, + "step": 7130 + }, + { + "epoch": 0.5269372693726937, + "grad_norm": 0.46777448058128357, + "learning_rate": 5.4020859413461756e-05, + "loss": 2.9231, + "step": 7140 + }, + { + "epoch": 0.5276752767527675, + "grad_norm": 0.47089943289756775, + "learning_rate": 5.389245679324398e-05, + "loss": 2.9215, + "step": 7150 + }, + { + "epoch": 0.5284132841328413, + "grad_norm": 0.44447311758995056, + "learning_rate": 5.376402834092721e-05, + "loss": 2.9281, + "step": 7160 + }, + { + "epoch": 0.5291512915129152, + "grad_norm": 0.47463953495025635, + "learning_rate": 5.363557490882057e-05, + "loss": 2.947, + "step": 7170 + }, + { + "epoch": 0.529889298892989, + "grad_norm": 0.47504737973213196, + "learning_rate": 5.350709734939897e-05, + "loss": 3.0103, + "step": 7180 + }, + { + "epoch": 0.5306273062730628, + "grad_norm": 0.472151517868042, + "learning_rate": 5.337859651529746e-05, + "loss": 2.966, + "step": 7190 + }, + { + "epoch": 0.5313653136531366, + "grad_norm": 0.44552987813949585, + "learning_rate": 5.325007325930554e-05, + "loss": 2.8962, + "step": 7200 + }, + { + "epoch": 0.5321033210332103, + "grad_norm": 0.487582266330719, + "learning_rate": 5.3121528434361524e-05, + "loss": 2.9548, + "step": 7210 + }, + { + "epoch": 0.5328413284132841, + "grad_norm": 0.47288230061531067, + "learning_rate": 5.299296289354681e-05, + "loss": 2.8969, + "step": 7220 + }, + { + "epoch": 0.5335793357933579, + "grad_norm": 0.4963250756263733, + "learning_rate": 5.2864377490080306e-05, + "loss": 2.9785, + "step": 7230 + }, + { + "epoch": 0.5343173431734317, + "grad_norm": 0.4519381821155548, + "learning_rate": 5.2735773077312814e-05, + "loss": 2.9112, + "step": 7240 + }, + { + "epoch": 0.5350553505535055, + "grad_norm": 0.47766226530075073, + "learning_rate": 5.2607150508721195e-05, + "loss": 2.8749, + "step": 7250 + }, + { + "epoch": 0.5357933579335793, + "grad_norm": 0.4712168872356415, + "learning_rate": 5.24785106379028e-05, + "loss": 2.9148, + "step": 7260 + }, + { + "epoch": 0.5365313653136531, + "grad_norm": 0.44543230533599854, + "learning_rate": 5.234985431856988e-05, + "loss": 2.9281, + "step": 7270 + }, + { + "epoch": 0.537269372693727, + "grad_norm": 0.46235865354537964, + "learning_rate": 5.2221182404543754e-05, + "loss": 2.9294, + "step": 7280 + }, + { + "epoch": 0.5380073800738008, + "grad_norm": 0.4579477608203888, + "learning_rate": 5.2092495749749346e-05, + "loss": 2.9286, + "step": 7290 + }, + { + "epoch": 0.5387453874538746, + "grad_norm": 
0.4533149302005768, + "learning_rate": 5.196379520820929e-05, + "loss": 2.9063, + "step": 7300 + }, + { + "epoch": 0.5394833948339484, + "grad_norm": 0.48128604888916016, + "learning_rate": 5.183508163403845e-05, + "loss": 2.8985, + "step": 7310 + }, + { + "epoch": 0.5402214022140222, + "grad_norm": 0.46598076820373535, + "learning_rate": 5.170635588143816e-05, + "loss": 2.9074, + "step": 7320 + }, + { + "epoch": 0.5409594095940959, + "grad_norm": 0.4706079363822937, + "learning_rate": 5.157761880469058e-05, + "loss": 2.9216, + "step": 7330 + }, + { + "epoch": 0.5416974169741697, + "grad_norm": 0.45854324102401733, + "learning_rate": 5.144887125815301e-05, + "loss": 2.9771, + "step": 7340 + }, + { + "epoch": 0.5424354243542435, + "grad_norm": 0.4575222134590149, + "learning_rate": 5.132011409625224e-05, + "loss": 2.878, + "step": 7350 + }, + { + "epoch": 0.5431734317343173, + "grad_norm": 0.45603683590888977, + "learning_rate": 5.1191348173478884e-05, + "loss": 2.9328, + "step": 7360 + }, + { + "epoch": 0.5439114391143911, + "grad_norm": 0.47662872076034546, + "learning_rate": 5.1062574344381686e-05, + "loss": 2.9483, + "step": 7370 + }, + { + "epoch": 0.5446494464944649, + "grad_norm": 0.4564341604709625, + "learning_rate": 5.093379346356185e-05, + "loss": 2.8084, + "step": 7380 + }, + { + "epoch": 0.5453874538745388, + "grad_norm": 0.4610985219478607, + "learning_rate": 5.080500638566741e-05, + "loss": 2.9255, + "step": 7390 + }, + { + "epoch": 0.5461254612546126, + "grad_norm": 0.46059536933898926, + "learning_rate": 5.0676213965387475e-05, + "loss": 2.851, + "step": 7400 + }, + { + "epoch": 0.5468634686346864, + "grad_norm": 0.482048362493515, + "learning_rate": 5.0547417057446665e-05, + "loss": 2.9626, + "step": 7410 + }, + { + "epoch": 0.5476014760147602, + "grad_norm": 0.4469466209411621, + "learning_rate": 5.0418616516599346e-05, + "loss": 2.8261, + "step": 7420 + }, + { + "epoch": 0.548339483394834, + "grad_norm": 0.4489482343196869, + "learning_rate": 5.028981319762399e-05, + "loss": 2.9388, + "step": 7430 + }, + { + "epoch": 0.5490774907749078, + "grad_norm": 0.4895458221435547, + "learning_rate": 5.016100795531754e-05, + "loss": 2.9598, + "step": 7440 + }, + { + "epoch": 0.5498154981549815, + "grad_norm": 0.45136043429374695, + "learning_rate": 5.003220164448967e-05, + "loss": 2.8466, + "step": 7450 + }, + { + "epoch": 0.5505535055350553, + "grad_norm": 0.4319990873336792, + "learning_rate": 4.990339511995718e-05, + "loss": 2.8589, + "step": 7460 + }, + { + "epoch": 0.5512915129151291, + "grad_norm": 0.4822845458984375, + "learning_rate": 4.977458923653823e-05, + "loss": 2.8766, + "step": 7470 + }, + { + "epoch": 0.5520295202952029, + "grad_norm": 0.4683190882205963, + "learning_rate": 4.9645784849046786e-05, + "loss": 2.9471, + "step": 7480 + }, + { + "epoch": 0.5527675276752767, + "grad_norm": 0.4755018353462219, + "learning_rate": 4.9516982812286854e-05, + "loss": 2.9336, + "step": 7490 + }, + { + "epoch": 0.5535055350553506, + "grad_norm": 0.4847009778022766, + "learning_rate": 4.938818398104685e-05, + "loss": 2.8928, + "step": 7500 + }, + { + "epoch": 0.5542435424354244, + "grad_norm": 0.49205484986305237, + "learning_rate": 4.92593892100939e-05, + "loss": 2.9413, + "step": 7510 + }, + { + "epoch": 0.5549815498154982, + "grad_norm": 0.4603287875652313, + "learning_rate": 4.913059935416822e-05, + "loss": 2.8814, + "step": 7520 + }, + { + "epoch": 0.555719557195572, + "grad_norm": 0.4724648594856262, + "learning_rate": 4.900181526797737e-05, + "loss": 2.9493, + "step": 7530 
+ }, + { + "epoch": 0.5564575645756458, + "grad_norm": 0.6270569562911987, + "learning_rate": 4.887303780619066e-05, + "loss": 2.9201, + "step": 7540 + }, + { + "epoch": 0.5571955719557196, + "grad_norm": 0.4619079828262329, + "learning_rate": 4.874426782343338e-05, + "loss": 2.915, + "step": 7550 + }, + { + "epoch": 0.5579335793357934, + "grad_norm": 0.45699045062065125, + "learning_rate": 4.861550617428122e-05, + "loss": 2.914, + "step": 7560 + }, + { + "epoch": 0.5586715867158671, + "grad_norm": 0.46511203050613403, + "learning_rate": 4.8486753713254586e-05, + "loss": 2.8837, + "step": 7570 + }, + { + "epoch": 0.5594095940959409, + "grad_norm": 0.4465058147907257, + "learning_rate": 4.835801129481287e-05, + "loss": 2.9087, + "step": 7580 + }, + { + "epoch": 0.5601476014760147, + "grad_norm": 0.4666641652584076, + "learning_rate": 4.8229279773348845e-05, + "loss": 2.9486, + "step": 7590 + }, + { + "epoch": 0.5608856088560885, + "grad_norm": 0.4582604765892029, + "learning_rate": 4.810056000318293e-05, + "loss": 2.9275, + "step": 7600 + }, + { + "epoch": 0.5616236162361624, + "grad_norm": 0.4589548408985138, + "learning_rate": 4.7971852838557565e-05, + "loss": 2.8683, + "step": 7610 + }, + { + "epoch": 0.5623616236162362, + "grad_norm": 0.4380606412887573, + "learning_rate": 4.78431591336316e-05, + "loss": 2.8368, + "step": 7620 + }, + { + "epoch": 0.56309963099631, + "grad_norm": 0.44517070055007935, + "learning_rate": 4.771447974247449e-05, + "loss": 2.8804, + "step": 7630 + }, + { + "epoch": 0.5638376383763838, + "grad_norm": 0.46472036838531494, + "learning_rate": 4.7585815519060694e-05, + "loss": 2.8983, + "step": 7640 + }, + { + "epoch": 0.5645756457564576, + "grad_norm": 0.47114098072052, + "learning_rate": 4.7457167317264064e-05, + "loss": 2.9284, + "step": 7650 + }, + { + "epoch": 0.5653136531365314, + "grad_norm": 0.4522678256034851, + "learning_rate": 4.732853599085207e-05, + "loss": 2.8971, + "step": 7660 + }, + { + "epoch": 0.5660516605166052, + "grad_norm": 0.46045982837677, + "learning_rate": 4.719992239348024e-05, + "loss": 2.844, + "step": 7670 + }, + { + "epoch": 0.566789667896679, + "grad_norm": 0.4543171525001526, + "learning_rate": 4.7071327378686386e-05, + "loss": 2.9121, + "step": 7680 + }, + { + "epoch": 0.5675276752767527, + "grad_norm": 0.48567166924476624, + "learning_rate": 4.6942751799885054e-05, + "loss": 2.9274, + "step": 7690 + }, + { + "epoch": 0.5682656826568265, + "grad_norm": 0.4700704514980316, + "learning_rate": 4.681419651036177e-05, + "loss": 2.9872, + "step": 7700 + }, + { + "epoch": 0.5690036900369003, + "grad_norm": 0.44953039288520813, + "learning_rate": 4.6685662363267415e-05, + "loss": 2.873, + "step": 7710 + }, + { + "epoch": 0.5697416974169742, + "grad_norm": 0.46205776929855347, + "learning_rate": 4.655715021161258e-05, + "loss": 2.8282, + "step": 7720 + }, + { + "epoch": 0.570479704797048, + "grad_norm": 0.4394710063934326, + "learning_rate": 4.6428660908261864e-05, + "loss": 2.8753, + "step": 7730 + }, + { + "epoch": 0.5712177121771218, + "grad_norm": 0.43995216488838196, + "learning_rate": 4.6300195305928243e-05, + "loss": 2.7643, + "step": 7740 + }, + { + "epoch": 0.5719557195571956, + "grad_norm": 0.4612707495689392, + "learning_rate": 4.617175425716741e-05, + "loss": 2.8683, + "step": 7750 + }, + { + "epoch": 0.5726937269372694, + "grad_norm": 0.4660702347755432, + "learning_rate": 4.604333861437207e-05, + "loss": 2.9493, + "step": 7760 + }, + { + "epoch": 0.5734317343173432, + "grad_norm": 0.47154900431632996, + "learning_rate": 
4.591494922976637e-05, + "loss": 2.9493, + "step": 7770 + }, + { + "epoch": 0.574169741697417, + "grad_norm": 0.4602459967136383, + "learning_rate": 4.578658695540018e-05, + "loss": 2.9144, + "step": 7780 + }, + { + "epoch": 0.5749077490774908, + "grad_norm": 0.4484480917453766, + "learning_rate": 4.5658252643143435e-05, + "loss": 2.9145, + "step": 7790 + }, + { + "epoch": 0.5756457564575646, + "grad_norm": 0.469936341047287, + "learning_rate": 4.552994714468055e-05, + "loss": 2.8947, + "step": 7800 + }, + { + "epoch": 0.5763837638376383, + "grad_norm": 0.48601603507995605, + "learning_rate": 4.5401671311504616e-05, + "loss": 2.9164, + "step": 7810 + }, + { + "epoch": 0.5771217712177121, + "grad_norm": 0.46561533212661743, + "learning_rate": 4.5273425994912e-05, + "loss": 2.8656, + "step": 7820 + }, + { + "epoch": 0.5778597785977859, + "grad_norm": 0.48168033361434937, + "learning_rate": 4.5145212045996446e-05, + "loss": 2.8667, + "step": 7830 + }, + { + "epoch": 0.5785977859778598, + "grad_norm": 0.45122450590133667, + "learning_rate": 4.5017030315643536e-05, + "loss": 2.9668, + "step": 7840 + }, + { + "epoch": 0.5793357933579336, + "grad_norm": 0.4591752290725708, + "learning_rate": 4.4888881654525057e-05, + "loss": 2.8924, + "step": 7850 + }, + { + "epoch": 0.5800738007380074, + "grad_norm": 0.4341951906681061, + "learning_rate": 4.4760766913093325e-05, + "loss": 2.8232, + "step": 7860 + }, + { + "epoch": 0.5808118081180812, + "grad_norm": 0.46191418170928955, + "learning_rate": 4.463268694157556e-05, + "loss": 2.9198, + "step": 7870 + }, + { + "epoch": 0.581549815498155, + "grad_norm": 0.43734246492385864, + "learning_rate": 4.450464258996822e-05, + "loss": 2.8755, + "step": 7880 + }, + { + "epoch": 0.5822878228782288, + "grad_norm": 0.4456181228160858, + "learning_rate": 4.437663470803137e-05, + "loss": 2.8545, + "step": 7890 + }, + { + "epoch": 0.5830258302583026, + "grad_norm": 0.46855318546295166, + "learning_rate": 4.4248664145283054e-05, + "loss": 2.8658, + "step": 7900 + }, + { + "epoch": 0.5837638376383764, + "grad_norm": 0.4666096568107605, + "learning_rate": 4.4120731750993645e-05, + "loss": 2.9317, + "step": 7910 + }, + { + "epoch": 0.5845018450184502, + "grad_norm": 0.46038341522216797, + "learning_rate": 4.3992838374180234e-05, + "loss": 2.9288, + "step": 7920 + }, + { + "epoch": 0.5852398523985239, + "grad_norm": 0.47123417258262634, + "learning_rate": 4.386498486360094e-05, + "loss": 2.9348, + "step": 7930 + }, + { + "epoch": 0.5859778597785977, + "grad_norm": 0.43836262822151184, + "learning_rate": 4.373717206774935e-05, + "loss": 2.8594, + "step": 7940 + }, + { + "epoch": 0.5867158671586716, + "grad_norm": 0.46412384510040283, + "learning_rate": 4.360940083484881e-05, + "loss": 2.9131, + "step": 7950 + }, + { + "epoch": 0.5874538745387454, + "grad_norm": 0.43723878264427185, + "learning_rate": 4.3481672012846865e-05, + "loss": 2.9116, + "step": 7960 + }, + { + "epoch": 0.5881918819188192, + "grad_norm": 0.46796315908432007, + "learning_rate": 4.335398644940957e-05, + "loss": 2.9236, + "step": 7970 + }, + { + "epoch": 0.588929889298893, + "grad_norm": 0.4761864244937897, + "learning_rate": 4.322634499191594e-05, + "loss": 2.8988, + "step": 7980 + }, + { + "epoch": 0.5896678966789668, + "grad_norm": 0.4379028081893921, + "learning_rate": 4.309874848745225e-05, + "loss": 2.851, + "step": 7990 + }, + { + "epoch": 0.5904059040590406, + "grad_norm": 0.4515070617198944, + "learning_rate": 4.297119778280645e-05, + "loss": 2.8823, + "step": 8000 + }, + { + "epoch": 
0.5911439114391144, + "grad_norm": 0.456480473279953, + "learning_rate": 4.2843693724462555e-05, + "loss": 2.9163, + "step": 8010 + }, + { + "epoch": 0.5918819188191882, + "grad_norm": 0.4556421935558319, + "learning_rate": 4.271623715859501e-05, + "loss": 2.8997, + "step": 8020 + }, + { + "epoch": 0.592619926199262, + "grad_norm": 0.4618515372276306, + "learning_rate": 4.2588828931063086e-05, + "loss": 2.9223, + "step": 8030 + }, + { + "epoch": 0.5933579335793358, + "grad_norm": 0.4617830812931061, + "learning_rate": 4.246146988740525e-05, + "loss": 2.8476, + "step": 8040 + }, + { + "epoch": 0.5940959409594095, + "grad_norm": 0.43721622228622437, + "learning_rate": 4.233416087283354e-05, + "loss": 2.9253, + "step": 8050 + }, + { + "epoch": 0.5948339483394834, + "grad_norm": 0.43407517671585083, + "learning_rate": 4.2206902732228015e-05, + "loss": 2.9307, + "step": 8060 + }, + { + "epoch": 0.5955719557195572, + "grad_norm": 0.4590218663215637, + "learning_rate": 4.207969631013109e-05, + "loss": 2.9194, + "step": 8070 + }, + { + "epoch": 0.596309963099631, + "grad_norm": 0.45232662558555603, + "learning_rate": 4.195254245074196e-05, + "loss": 2.814, + "step": 8080 + }, + { + "epoch": 0.5970479704797048, + "grad_norm": 0.47659075260162354, + "learning_rate": 4.1825441997911016e-05, + "loss": 2.8991, + "step": 8090 + }, + { + "epoch": 0.5977859778597786, + "grad_norm": 0.4390777349472046, + "learning_rate": 4.169839579513415e-05, + "loss": 2.8377, + "step": 8100 + }, + { + "epoch": 0.5985239852398524, + "grad_norm": 0.44624418020248413, + "learning_rate": 4.1571404685547265e-05, + "loss": 2.9126, + "step": 8110 + }, + { + "epoch": 0.5992619926199262, + "grad_norm": 0.4411090314388275, + "learning_rate": 4.14444695119207e-05, + "loss": 2.8661, + "step": 8120 + }, + { + "epoch": 0.6, + "grad_norm": 0.45906946063041687, + "learning_rate": 4.131759111665349e-05, + "loss": 2.8862, + "step": 8130 + }, + { + "epoch": 0.6007380073800738, + "grad_norm": 0.450738787651062, + "learning_rate": 4.1190770341767884e-05, + "loss": 2.8788, + "step": 8140 + }, + { + "epoch": 0.6014760147601476, + "grad_norm": 0.4635327458381653, + "learning_rate": 4.1064008028903766e-05, + "loss": 2.8856, + "step": 8150 + }, + { + "epoch": 0.6022140221402214, + "grad_norm": 0.46390798687934875, + "learning_rate": 4.093730501931301e-05, + "loss": 2.8435, + "step": 8160 + }, + { + "epoch": 0.6029520295202953, + "grad_norm": 0.46583694219589233, + "learning_rate": 4.0810662153853955e-05, + "loss": 2.9068, + "step": 8170 + }, + { + "epoch": 0.603690036900369, + "grad_norm": 0.441485732793808, + "learning_rate": 4.068408027298576e-05, + "loss": 2.9141, + "step": 8180 + }, + { + "epoch": 0.6044280442804428, + "grad_norm": 0.43635720014572144, + "learning_rate": 4.0557560216762884e-05, + "loss": 2.8165, + "step": 8190 + }, + { + "epoch": 0.6051660516605166, + "grad_norm": 0.45056867599487305, + "learning_rate": 4.0431102824829495e-05, + "loss": 2.8923, + "step": 8200 + }, + { + "epoch": 0.6059040590405904, + "grad_norm": 0.47618359327316284, + "learning_rate": 4.030470893641387e-05, + "loss": 2.8337, + "step": 8210 + }, + { + "epoch": 0.6066420664206642, + "grad_norm": 0.46678489446640015, + "learning_rate": 4.0178379390322896e-05, + "loss": 2.9041, + "step": 8220 + }, + { + "epoch": 0.607380073800738, + "grad_norm": 0.45858731865882874, + "learning_rate": 4.0052115024936396e-05, + "loss": 2.8919, + "step": 8230 + }, + { + "epoch": 0.6081180811808118, + "grad_norm": 0.46500325202941895, + "learning_rate": 3.9925916678201656e-05, + 
"loss": 2.7873, + "step": 8240 + }, + { + "epoch": 0.6088560885608856, + "grad_norm": 0.4576093256473541, + "learning_rate": 3.9799785187627844e-05, + "loss": 2.9581, + "step": 8250 + }, + { + "epoch": 0.6095940959409594, + "grad_norm": 0.4603584408760071, + "learning_rate": 3.96737213902804e-05, + "loss": 2.932, + "step": 8260 + }, + { + "epoch": 0.6103321033210332, + "grad_norm": 0.4474504888057709, + "learning_rate": 3.954772612277556e-05, + "loss": 2.8907, + "step": 8270 + }, + { + "epoch": 0.6110701107011071, + "grad_norm": 0.4676888585090637, + "learning_rate": 3.942180022127475e-05, + "loss": 2.9279, + "step": 8280 + }, + { + "epoch": 0.6118081180811809, + "grad_norm": 0.4762161374092102, + "learning_rate": 3.929594452147903e-05, + "loss": 2.8668, + "step": 8290 + }, + { + "epoch": 0.6125461254612546, + "grad_norm": 0.45031213760375977, + "learning_rate": 3.917015985862364e-05, + "loss": 3.0203, + "step": 8300 + }, + { + "epoch": 0.6132841328413284, + "grad_norm": 0.4627397656440735, + "learning_rate": 3.904444706747227e-05, + "loss": 2.8669, + "step": 8310 + }, + { + "epoch": 0.6140221402214022, + "grad_norm": 0.4964381456375122, + "learning_rate": 3.891880698231176e-05, + "loss": 2.8888, + "step": 8320 + }, + { + "epoch": 0.614760147601476, + "grad_norm": 0.4690164029598236, + "learning_rate": 3.879324043694639e-05, + "loss": 2.8772, + "step": 8330 + }, + { + "epoch": 0.6154981549815498, + "grad_norm": 0.46316999197006226, + "learning_rate": 3.8667748264692355e-05, + "loss": 2.9203, + "step": 8340 + }, + { + "epoch": 0.6162361623616236, + "grad_norm": 0.46457648277282715, + "learning_rate": 3.854233129837233e-05, + "loss": 2.8959, + "step": 8350 + }, + { + "epoch": 0.6169741697416974, + "grad_norm": 0.46210619807243347, + "learning_rate": 3.841699037030989e-05, + "loss": 2.9754, + "step": 8360 + }, + { + "epoch": 0.6177121771217712, + "grad_norm": 0.4708150029182434, + "learning_rate": 3.829172631232395e-05, + "loss": 2.8779, + "step": 8370 + }, + { + "epoch": 0.618450184501845, + "grad_norm": 0.4539421498775482, + "learning_rate": 3.8166539955723315e-05, + "loss": 2.7857, + "step": 8380 + }, + { + "epoch": 0.6191881918819189, + "grad_norm": 0.4383450150489807, + "learning_rate": 3.80414321313011e-05, + "loss": 2.9466, + "step": 8390 + }, + { + "epoch": 0.6199261992619927, + "grad_norm": 0.47667232155799866, + "learning_rate": 3.791640366932926e-05, + "loss": 2.8896, + "step": 8400 + }, + { + "epoch": 0.6206642066420665, + "grad_norm": 0.47078999876976013, + "learning_rate": 3.7791455399553054e-05, + "loss": 2.8787, + "step": 8410 + }, + { + "epoch": 0.6214022140221402, + "grad_norm": 0.4621264934539795, + "learning_rate": 3.7666588151185586e-05, + "loss": 2.9516, + "step": 8420 + }, + { + "epoch": 0.622140221402214, + "grad_norm": 0.4561121165752411, + "learning_rate": 3.754180275290222e-05, + "loss": 2.8712, + "step": 8430 + }, + { + "epoch": 0.6228782287822878, + "grad_norm": 0.4745158851146698, + "learning_rate": 3.741710003283515e-05, + "loss": 2.9942, + "step": 8440 + }, + { + "epoch": 0.6236162361623616, + "grad_norm": 0.4506776034832001, + "learning_rate": 3.729248081856788e-05, + "loss": 2.8662, + "step": 8450 + }, + { + "epoch": 0.6243542435424354, + "grad_norm": 0.4925256073474884, + "learning_rate": 3.716794593712973e-05, + "loss": 2.9148, + "step": 8460 + }, + { + "epoch": 0.6250922509225092, + "grad_norm": 0.4477274715900421, + "learning_rate": 3.704349621499032e-05, + "loss": 2.8946, + "step": 8470 + }, + { + "epoch": 0.625830258302583, + "grad_norm": 
0.45974335074424744, + "learning_rate": 3.691913247805415e-05, + "loss": 2.8444, + "step": 8480 + }, + { + "epoch": 0.6265682656826568, + "grad_norm": 0.4468931555747986, + "learning_rate": 3.6794855551655095e-05, + "loss": 2.8183, + "step": 8490 + }, + { + "epoch": 0.6273062730627307, + "grad_norm": 0.45352327823638916, + "learning_rate": 3.6670666260550866e-05, + "loss": 2.8385, + "step": 8500 + }, + { + "epoch": 0.6280442804428045, + "grad_norm": 0.48543328046798706, + "learning_rate": 3.654656542891762e-05, + "loss": 2.8982, + "step": 8510 + }, + { + "epoch": 0.6287822878228783, + "grad_norm": 0.47315549850463867, + "learning_rate": 3.642255388034448e-05, + "loss": 2.8477, + "step": 8520 + }, + { + "epoch": 0.629520295202952, + "grad_norm": 0.4466278851032257, + "learning_rate": 3.629863243782799e-05, + "loss": 2.9499, + "step": 8530 + }, + { + "epoch": 0.6302583025830258, + "grad_norm": 0.4634998142719269, + "learning_rate": 3.617480192376676e-05, + "loss": 2.9209, + "step": 8540 + }, + { + "epoch": 0.6309963099630996, + "grad_norm": 0.4444449841976166, + "learning_rate": 3.6051063159955914e-05, + "loss": 2.8547, + "step": 8550 + }, + { + "epoch": 0.6317343173431734, + "grad_norm": 0.4805346727371216, + "learning_rate": 3.592741696758171e-05, + "loss": 2.9504, + "step": 8560 + }, + { + "epoch": 0.6324723247232472, + "grad_norm": 0.4576335549354553, + "learning_rate": 3.580386416721605e-05, + "loss": 2.8166, + "step": 8570 + }, + { + "epoch": 0.633210332103321, + "grad_norm": 0.48051634430885315, + "learning_rate": 3.568040557881106e-05, + "loss": 2.8457, + "step": 8580 + }, + { + "epoch": 0.6339483394833948, + "grad_norm": 0.45053961873054504, + "learning_rate": 3.55570420216936e-05, + "loss": 2.8554, + "step": 8590 + }, + { + "epoch": 0.6346863468634686, + "grad_norm": 0.4763762652873993, + "learning_rate": 3.543377431455991e-05, + "loss": 2.9245, + "step": 8600 + }, + { + "epoch": 0.6354243542435425, + "grad_norm": 0.466516375541687, + "learning_rate": 3.531060327547003e-05, + "loss": 2.8784, + "step": 8610 + }, + { + "epoch": 0.6361623616236163, + "grad_norm": 0.4508006274700165, + "learning_rate": 3.51875297218426e-05, + "loss": 2.8572, + "step": 8620 + }, + { + "epoch": 0.6369003690036901, + "grad_norm": 0.43419796228408813, + "learning_rate": 3.506455447044923e-05, + "loss": 2.9553, + "step": 8630 + }, + { + "epoch": 0.6376383763837639, + "grad_norm": 0.4657207131385803, + "learning_rate": 3.494167833740912e-05, + "loss": 2.9388, + "step": 8640 + }, + { + "epoch": 0.6383763837638377, + "grad_norm": 0.47769656777381897, + "learning_rate": 3.481890213818374e-05, + "loss": 2.889, + "step": 8650 + }, + { + "epoch": 0.6391143911439114, + "grad_norm": 0.452332466840744, + "learning_rate": 3.469622668757132e-05, + "loss": 2.8618, + "step": 8660 + }, + { + "epoch": 0.6398523985239852, + "grad_norm": 0.44228044152259827, + "learning_rate": 3.457365279970147e-05, + "loss": 2.858, + "step": 8670 + }, + { + "epoch": 0.640590405904059, + "grad_norm": 0.45381829142570496, + "learning_rate": 3.4451181288029835e-05, + "loss": 2.9324, + "step": 8680 + }, + { + "epoch": 0.6413284132841328, + "grad_norm": 0.45243462920188904, + "learning_rate": 3.4328812965332566e-05, + "loss": 2.8569, + "step": 8690 + }, + { + "epoch": 0.6420664206642066, + "grad_norm": 0.44624003767967224, + "learning_rate": 3.420654864370107e-05, + "loss": 2.8305, + "step": 8700 + }, + { + "epoch": 0.6428044280442804, + "grad_norm": 0.45331937074661255, + "learning_rate": 3.408438913453652e-05, + "loss": 2.9233, + "step": 8710 
+ }, + { + "epoch": 0.6435424354243543, + "grad_norm": 0.46031826734542847, + "learning_rate": 3.396233524854453e-05, + "loss": 2.8136, + "step": 8720 + }, + { + "epoch": 0.6442804428044281, + "grad_norm": 0.4405251443386078, + "learning_rate": 3.384038779572975e-05, + "loss": 2.8196, + "step": 8730 + }, + { + "epoch": 0.6450184501845019, + "grad_norm": 0.433918297290802, + "learning_rate": 3.371854758539047e-05, + "loss": 2.828, + "step": 8740 + }, + { + "epoch": 0.6457564575645757, + "grad_norm": 0.437752366065979, + "learning_rate": 3.3596815426113285e-05, + "loss": 2.9084, + "step": 8750 + }, + { + "epoch": 0.6464944649446495, + "grad_norm": 0.4461667537689209, + "learning_rate": 3.3475192125767715e-05, + "loss": 2.9163, + "step": 8760 + }, + { + "epoch": 0.6472324723247233, + "grad_norm": 0.44983482360839844, + "learning_rate": 3.335367849150084e-05, + "loss": 2.8624, + "step": 8770 + }, + { + "epoch": 0.647970479704797, + "grad_norm": 0.444402813911438, + "learning_rate": 3.323227532973193e-05, + "loss": 2.8645, + "step": 8780 + }, + { + "epoch": 0.6487084870848708, + "grad_norm": 0.47475096583366394, + "learning_rate": 3.311098344614715e-05, + "loss": 2.8599, + "step": 8790 + }, + { + "epoch": 0.6494464944649446, + "grad_norm": 0.42691770195961, + "learning_rate": 3.298980364569413e-05, + "loss": 2.9367, + "step": 8800 + }, + { + "epoch": 0.6501845018450184, + "grad_norm": 0.43761834502220154, + "learning_rate": 3.2868736732576696e-05, + "loss": 2.8071, + "step": 8810 + }, + { + "epoch": 0.6509225092250922, + "grad_norm": 0.4337967336177826, + "learning_rate": 3.274778351024949e-05, + "loss": 2.7961, + "step": 8820 + }, + { + "epoch": 0.6516605166051661, + "grad_norm": 0.4518975615501404, + "learning_rate": 3.262694478141265e-05, + "loss": 2.8445, + "step": 8830 + }, + { + "epoch": 0.6523985239852399, + "grad_norm": 0.44520917534828186, + "learning_rate": 3.250622134800651e-05, + "loss": 2.8298, + "step": 8840 + }, + { + "epoch": 0.6531365313653137, + "grad_norm": 0.47246819734573364, + "learning_rate": 3.238561401120619e-05, + "loss": 2.8721, + "step": 8850 + }, + { + "epoch": 0.6538745387453875, + "grad_norm": 0.46341249346733093, + "learning_rate": 3.226512357141639e-05, + "loss": 2.8465, + "step": 8860 + }, + { + "epoch": 0.6546125461254613, + "grad_norm": 0.4418579339981079, + "learning_rate": 3.214475082826602e-05, + "loss": 2.7495, + "step": 8870 + }, + { + "epoch": 0.6553505535055351, + "grad_norm": 0.4572698771953583, + "learning_rate": 3.2024496580602895e-05, + "loss": 2.8405, + "step": 8880 + }, + { + "epoch": 0.6560885608856089, + "grad_norm": 0.4518590569496155, + "learning_rate": 3.1904361626488464e-05, + "loss": 2.8698, + "step": 8890 + }, + { + "epoch": 0.6568265682656826, + "grad_norm": 0.49694785475730896, + "learning_rate": 3.178434676319243e-05, + "loss": 2.9178, + "step": 8900 + }, + { + "epoch": 0.6575645756457564, + "grad_norm": 0.44036176800727844, + "learning_rate": 3.166445278718758e-05, + "loss": 2.9042, + "step": 8910 + }, + { + "epoch": 0.6583025830258302, + "grad_norm": 0.4740366041660309, + "learning_rate": 3.154468049414444e-05, + "loss": 2.791, + "step": 8920 + }, + { + "epoch": 0.659040590405904, + "grad_norm": 0.44894149899482727, + "learning_rate": 3.1425030678925944e-05, + "loss": 2.8882, + "step": 8930 + }, + { + "epoch": 0.6597785977859778, + "grad_norm": 0.45504188537597656, + "learning_rate": 3.1305504135582244e-05, + "loss": 2.82, + "step": 8940 + }, + { + "epoch": 0.6605166051660517, + "grad_norm": 0.45306116342544556, + "learning_rate": 
3.118610165734539e-05, + "loss": 2.8076, + "step": 8950 + }, + { + "epoch": 0.6612546125461255, + "grad_norm": 0.4355803430080414, + "learning_rate": 3.106682403662409e-05, + "loss": 2.8458, + "step": 8960 + }, + { + "epoch": 0.6619926199261993, + "grad_norm": 0.45864707231521606, + "learning_rate": 3.094767206499844e-05, + "loss": 2.7888, + "step": 8970 + }, + { + "epoch": 0.6627306273062731, + "grad_norm": 0.4467925727367401, + "learning_rate": 3.082864653321466e-05, + "loss": 2.8862, + "step": 8980 + }, + { + "epoch": 0.6634686346863469, + "grad_norm": 0.4361802935600281, + "learning_rate": 3.0709748231179855e-05, + "loss": 2.8405, + "step": 8990 + }, + { + "epoch": 0.6642066420664207, + "grad_norm": 0.4502997398376465, + "learning_rate": 3.059097794795681e-05, + "loss": 2.8651, + "step": 9000 + }, + { + "epoch": 0.6649446494464945, + "grad_norm": 0.446232408285141, + "learning_rate": 3.0472336471758678e-05, + "loss": 2.9009, + "step": 9010 + }, + { + "epoch": 0.6656826568265682, + "grad_norm": 0.4600978493690491, + "learning_rate": 3.0353824589943834e-05, + "loss": 2.8842, + "step": 9020 + }, + { + "epoch": 0.666420664206642, + "grad_norm": 0.45147082209587097, + "learning_rate": 3.0235443089010562e-05, + "loss": 2.842, + "step": 9030 + }, + { + "epoch": 0.6671586715867158, + "grad_norm": 0.470324844121933, + "learning_rate": 3.0117192754591893e-05, + "loss": 2.9098, + "step": 9040 + }, + { + "epoch": 0.6678966789667896, + "grad_norm": 0.4519864320755005, + "learning_rate": 2.999907437145042e-05, + "loss": 2.917, + "step": 9050 + }, + { + "epoch": 0.6686346863468635, + "grad_norm": 0.44655749201774597, + "learning_rate": 2.9881088723472966e-05, + "loss": 2.9205, + "step": 9060 + }, + { + "epoch": 0.6693726937269373, + "grad_norm": 0.45969992876052856, + "learning_rate": 2.9763236593665533e-05, + "loss": 2.8726, + "step": 9070 + }, + { + "epoch": 0.6701107011070111, + "grad_norm": 0.45693284273147583, + "learning_rate": 2.9645518764148007e-05, + "loss": 2.8753, + "step": 9080 + }, + { + "epoch": 0.6708487084870849, + "grad_norm": 0.442354291677475, + "learning_rate": 2.9527936016149006e-05, + "loss": 2.8377, + "step": 9090 + }, + { + "epoch": 0.6715867158671587, + "grad_norm": 0.4796278476715088, + "learning_rate": 2.9410489130000684e-05, + "loss": 2.8303, + "step": 9100 + }, + { + "epoch": 0.6723247232472325, + "grad_norm": 0.4597807824611664, + "learning_rate": 2.9293178885133525e-05, + "loss": 2.8325, + "step": 9110 + }, + { + "epoch": 0.6730627306273063, + "grad_norm": 0.47112002968788147, + "learning_rate": 2.917600606007127e-05, + "loss": 2.8479, + "step": 9120 + }, + { + "epoch": 0.67380073800738, + "grad_norm": 0.4425598978996277, + "learning_rate": 2.905897143242562e-05, + "loss": 2.8416, + "step": 9130 + }, + { + "epoch": 0.6745387453874538, + "grad_norm": 0.4444707930088043, + "learning_rate": 2.8942075778891153e-05, + "loss": 2.9409, + "step": 9140 + }, + { + "epoch": 0.6752767527675276, + "grad_norm": 0.4575837254524231, + "learning_rate": 2.882531987524017e-05, + "loss": 2.8615, + "step": 9150 + }, + { + "epoch": 0.6760147601476014, + "grad_norm": 0.4663306176662445, + "learning_rate": 2.8708704496317474e-05, + "loss": 2.8184, + "step": 9160 + }, + { + "epoch": 0.6767527675276753, + "grad_norm": 0.441550076007843, + "learning_rate": 2.8592230416035335e-05, + "loss": 2.8981, + "step": 9170 + }, + { + "epoch": 0.6774907749077491, + "grad_norm": 0.47013741731643677, + "learning_rate": 2.8475898407368296e-05, + "loss": 2.9034, + "step": 9180 + }, + { + "epoch": 
0.6782287822878229, + "grad_norm": 0.47934868931770325, + "learning_rate": 2.8359709242348032e-05, + "loss": 2.9483, + "step": 9190 + }, + { + "epoch": 0.6789667896678967, + "grad_norm": 0.44904670119285583, + "learning_rate": 2.824366369205825e-05, + "loss": 2.9038, + "step": 9200 + }, + { + "epoch": 0.6797047970479705, + "grad_norm": 0.4706343710422516, + "learning_rate": 2.8127762526629553e-05, + "loss": 2.8976, + "step": 9210 + }, + { + "epoch": 0.6804428044280443, + "grad_norm": 0.4544294774532318, + "learning_rate": 2.801200651523438e-05, + "loss": 2.8875, + "step": 9220 + }, + { + "epoch": 0.6811808118081181, + "grad_norm": 0.4476546347141266, + "learning_rate": 2.7896396426081844e-05, + "loss": 2.8378, + "step": 9230 + }, + { + "epoch": 0.6819188191881919, + "grad_norm": 0.4503355920314789, + "learning_rate": 2.7780933026412602e-05, + "loss": 2.8917, + "step": 9240 + }, + { + "epoch": 0.6826568265682657, + "grad_norm": 0.4393197298049927, + "learning_rate": 2.766561708249387e-05, + "loss": 2.7785, + "step": 9250 + }, + { + "epoch": 0.6833948339483394, + "grad_norm": 0.45384228229522705, + "learning_rate": 2.7550449359614272e-05, + "loss": 2.8712, + "step": 9260 + }, + { + "epoch": 0.6841328413284132, + "grad_norm": 0.462931752204895, + "learning_rate": 2.743543062207876e-05, + "loss": 2.9299, + "step": 9270 + }, + { + "epoch": 0.6848708487084871, + "grad_norm": 0.4446216821670532, + "learning_rate": 2.7320561633203566e-05, + "loss": 2.93, + "step": 9280 + }, + { + "epoch": 0.6856088560885609, + "grad_norm": 0.4498085677623749, + "learning_rate": 2.7205843155311094e-05, + "loss": 2.8614, + "step": 9290 + }, + { + "epoch": 0.6863468634686347, + "grad_norm": 0.44905975461006165, + "learning_rate": 2.7091275949724926e-05, + "loss": 2.8681, + "step": 9300 + }, + { + "epoch": 0.6870848708487085, + "grad_norm": 0.4424300491809845, + "learning_rate": 2.6976860776764713e-05, + "loss": 2.8048, + "step": 9310 + }, + { + "epoch": 0.6878228782287823, + "grad_norm": 0.46064937114715576, + "learning_rate": 2.6862598395741136e-05, + "loss": 2.8376, + "step": 9320 + }, + { + "epoch": 0.6885608856088561, + "grad_norm": 0.45401063561439514, + "learning_rate": 2.6748489564950908e-05, + "loss": 2.8168, + "step": 9330 + }, + { + "epoch": 0.6892988929889299, + "grad_norm": 0.4572742283344269, + "learning_rate": 2.6634535041671693e-05, + "loss": 2.8182, + "step": 9340 + }, + { + "epoch": 0.6900369003690037, + "grad_norm": 0.4515658915042877, + "learning_rate": 2.652073558215711e-05, + "loss": 2.8569, + "step": 9350 + }, + { + "epoch": 0.6907749077490775, + "grad_norm": 0.44633907079696655, + "learning_rate": 2.64070919416317e-05, + "loss": 2.8684, + "step": 9360 + }, + { + "epoch": 0.6915129151291513, + "grad_norm": 0.4616515636444092, + "learning_rate": 2.6293604874285927e-05, + "loss": 2.8791, + "step": 9370 + }, + { + "epoch": 0.692250922509225, + "grad_norm": 0.4603336751461029, + "learning_rate": 2.618027513327116e-05, + "loss": 2.8685, + "step": 9380 + }, + { + "epoch": 0.6929889298892989, + "grad_norm": 0.4635460376739502, + "learning_rate": 2.6067103470694672e-05, + "loss": 2.8819, + "step": 9390 + }, + { + "epoch": 0.6937269372693727, + "grad_norm": 0.446821004152298, + "learning_rate": 2.5954090637614658e-05, + "loss": 2.8775, + "step": 9400 + }, + { + "epoch": 0.6944649446494465, + "grad_norm": 0.45208224654197693, + "learning_rate": 2.5841237384035265e-05, + "loss": 2.9185, + "step": 9410 + }, + { + "epoch": 0.6952029520295203, + "grad_norm": 0.43966442346572876, + "learning_rate": 
2.5728544458901593e-05, + "loss": 2.844, + "step": 9420 + }, + { + "epoch": 0.6959409594095941, + "grad_norm": 0.4660171866416931, + "learning_rate": 2.5616012610094704e-05, + "loss": 2.8533, + "step": 9430 + }, + { + "epoch": 0.6966789667896679, + "grad_norm": 0.4844834804534912, + "learning_rate": 2.5503642584426712e-05, + "loss": 2.9139, + "step": 9440 + }, + { + "epoch": 0.6974169741697417, + "grad_norm": 0.4675824046134949, + "learning_rate": 2.5391435127635805e-05, + "loss": 2.857, + "step": 9450 + }, + { + "epoch": 0.6981549815498155, + "grad_norm": 0.4488329291343689, + "learning_rate": 2.5279390984381264e-05, + "loss": 2.8484, + "step": 9460 + }, + { + "epoch": 0.6988929889298893, + "grad_norm": 0.4558933675289154, + "learning_rate": 2.5167510898238566e-05, + "loss": 2.8784, + "step": 9470 + }, + { + "epoch": 0.6996309963099631, + "grad_norm": 0.45454517006874084, + "learning_rate": 2.5055795611694433e-05, + "loss": 2.8075, + "step": 9480 + }, + { + "epoch": 0.7003690036900369, + "grad_norm": 0.4401450455188751, + "learning_rate": 2.4944245866141886e-05, + "loss": 2.8661, + "step": 9490 + }, + { + "epoch": 0.7011070110701108, + "grad_norm": 0.42718032002449036, + "learning_rate": 2.4832862401875378e-05, + "loss": 2.8306, + "step": 9500 + }, + { + "epoch": 0.7018450184501845, + "grad_norm": 0.4444067180156708, + "learning_rate": 2.472164595808576e-05, + "loss": 2.887, + "step": 9510 + }, + { + "epoch": 0.7025830258302583, + "grad_norm": 0.4388265311717987, + "learning_rate": 2.461059727285558e-05, + "loss": 2.9248, + "step": 9520 + }, + { + "epoch": 0.7033210332103321, + "grad_norm": 0.4537127614021301, + "learning_rate": 2.449971708315397e-05, + "loss": 2.866, + "step": 9530 + }, + { + "epoch": 0.7040590405904059, + "grad_norm": 0.4571674168109894, + "learning_rate": 2.4389006124831893e-05, + "loss": 2.8524, + "step": 9540 + }, + { + "epoch": 0.7047970479704797, + "grad_norm": 0.475065678358078, + "learning_rate": 2.4278465132617207e-05, + "loss": 2.9086, + "step": 9550 + }, + { + "epoch": 0.7055350553505535, + "grad_norm": 0.4491478204727173, + "learning_rate": 2.4168094840109785e-05, + "loss": 2.8496, + "step": 9560 + }, + { + "epoch": 0.7062730627306273, + "grad_norm": 0.4396122694015503, + "learning_rate": 2.4057895979776683e-05, + "loss": 2.8542, + "step": 9570 + }, + { + "epoch": 0.7070110701107011, + "grad_norm": 0.45730844140052795, + "learning_rate": 2.394786928294726e-05, + "loss": 2.8448, + "step": 9580 + }, + { + "epoch": 0.7077490774907749, + "grad_norm": 11.993217468261719, + "learning_rate": 2.3838015479808263e-05, + "loss": 2.8686, + "step": 9590 + }, + { + "epoch": 0.7084870848708487, + "grad_norm": 0.4676622450351715, + "learning_rate": 2.3728335299399106e-05, + "loss": 2.8195, + "step": 9600 + }, + { + "epoch": 0.7092250922509226, + "grad_norm": 0.4665907621383667, + "learning_rate": 2.3618829469606912e-05, + "loss": 2.8851, + "step": 9610 + }, + { + "epoch": 0.7099630996309964, + "grad_norm": 0.4478704631328583, + "learning_rate": 2.3509498717161804e-05, + "loss": 2.8631, + "step": 9620 + }, + { + "epoch": 0.7107011070110701, + "grad_norm": 0.4518534541130066, + "learning_rate": 2.3400343767631944e-05, + "loss": 2.8542, + "step": 9630 + }, + { + "epoch": 0.7114391143911439, + "grad_norm": 0.45083850622177124, + "learning_rate": 2.329136534541882e-05, + "loss": 2.8447, + "step": 9640 + }, + { + "epoch": 0.7121771217712177, + "grad_norm": 0.44704335927963257, + "learning_rate": 2.3182564173752396e-05, + "loss": 2.8001, + "step": 9650 + }, + { + "epoch": 
0.7129151291512915, + "grad_norm": 0.459086149930954, + "learning_rate": 2.3073940974686337e-05, + "loss": 2.8562, + "step": 9660 + }, + { + "epoch": 0.7136531365313653, + "grad_norm": 0.4504683017730713, + "learning_rate": 2.296549646909315e-05, + "loss": 2.8153, + "step": 9670 + }, + { + "epoch": 0.7143911439114391, + "grad_norm": 0.4484894275665283, + "learning_rate": 2.2857231376659516e-05, + "loss": 2.8652, + "step": 9680 + }, + { + "epoch": 0.7151291512915129, + "grad_norm": 0.44552645087242126, + "learning_rate": 2.274914641588141e-05, + "loss": 2.8544, + "step": 9690 + }, + { + "epoch": 0.7158671586715867, + "grad_norm": 0.44962164759635925, + "learning_rate": 2.2641242304059394e-05, + "loss": 2.809, + "step": 9700 + }, + { + "epoch": 0.7166051660516605, + "grad_norm": 0.4649772047996521, + "learning_rate": 2.2533519757293803e-05, + "loss": 2.9047, + "step": 9710 + }, + { + "epoch": 0.7173431734317344, + "grad_norm": 0.44465893507003784, + "learning_rate": 2.242597949048008e-05, + "loss": 2.9289, + "step": 9720 + }, + { + "epoch": 0.7180811808118082, + "grad_norm": 0.4587944746017456, + "learning_rate": 2.2318622217303935e-05, + "loss": 2.9381, + "step": 9730 + }, + { + "epoch": 0.718819188191882, + "grad_norm": 0.45747023820877075, + "learning_rate": 2.221144865023666e-05, + "loss": 2.8596, + "step": 9740 + }, + { + "epoch": 0.7195571955719557, + "grad_norm": 0.4500565528869629, + "learning_rate": 2.2104459500530362e-05, + "loss": 2.8122, + "step": 9750 + }, + { + "epoch": 0.7202952029520295, + "grad_norm": 0.44870901107788086, + "learning_rate": 2.1997655478213313e-05, + "loss": 2.8318, + "step": 9760 + }, + { + "epoch": 0.7210332103321033, + "grad_norm": 0.46823742985725403, + "learning_rate": 2.1891037292085175e-05, + "loss": 2.7682, + "step": 9770 + }, + { + "epoch": 0.7217712177121771, + "grad_norm": 0.4822959899902344, + "learning_rate": 2.1784605649712324e-05, + "loss": 2.8845, + "step": 9780 + }, + { + "epoch": 0.7225092250922509, + "grad_norm": 0.4569961726665497, + "learning_rate": 2.167836125742315e-05, + "loss": 2.8073, + "step": 9790 + }, + { + "epoch": 0.7232472324723247, + "grad_norm": 0.5003052949905396, + "learning_rate": 2.1572304820303363e-05, + "loss": 2.966, + "step": 9800 + }, + { + "epoch": 0.7239852398523985, + "grad_norm": 0.4504786431789398, + "learning_rate": 2.1466437042191297e-05, + "loss": 2.8226, + "step": 9810 + }, + { + "epoch": 0.7247232472324723, + "grad_norm": 0.4485565423965454, + "learning_rate": 2.1360758625673327e-05, + "loss": 2.8301, + "step": 9820 + }, + { + "epoch": 0.7254612546125462, + "grad_norm": 0.46124571561813354, + "learning_rate": 2.1255270272079042e-05, + "loss": 2.8485, + "step": 9830 + }, + { + "epoch": 0.72619926199262, + "grad_norm": 0.4612502455711365, + "learning_rate": 2.1149972681476765e-05, + "loss": 2.8276, + "step": 9840 + }, + { + "epoch": 0.7269372693726938, + "grad_norm": 0.45740193128585815, + "learning_rate": 2.104486655266879e-05, + "loss": 2.8669, + "step": 9850 + }, + { + "epoch": 0.7276752767527676, + "grad_norm": 0.47378960251808167, + "learning_rate": 2.0939952583186807e-05, + "loss": 2.8149, + "step": 9860 + }, + { + "epoch": 0.7284132841328413, + "grad_norm": 0.45929577946662903, + "learning_rate": 2.0835231469287232e-05, + "loss": 2.8346, + "step": 9870 + }, + { + "epoch": 0.7291512915129151, + "grad_norm": 0.45453017950057983, + "learning_rate": 2.0730703905946612e-05, + "loss": 2.8851, + "step": 9880 + }, + { + "epoch": 0.7298892988929889, + "grad_norm": 0.4465833604335785, + "learning_rate": 
2.0626370586857007e-05, + "loss": 2.8381, + "step": 9890 + }, + { + "epoch": 0.7306273062730627, + "grad_norm": 0.46699321269989014, + "learning_rate": 2.052223220442139e-05, + "loss": 2.8394, + "step": 9900 + }, + { + "epoch": 0.7313653136531365, + "grad_norm": 0.4374259412288666, + "learning_rate": 2.0418289449749027e-05, + "loss": 2.8501, + "step": 9910 + }, + { + "epoch": 0.7321033210332103, + "grad_norm": 0.4604252576828003, + "learning_rate": 2.0314543012650933e-05, + "loss": 2.8711, + "step": 9920 + }, + { + "epoch": 0.7328413284132841, + "grad_norm": 0.45612022280693054, + "learning_rate": 2.0210993581635256e-05, + "loss": 2.844, + "step": 9930 + }, + { + "epoch": 0.7335793357933579, + "grad_norm": 0.43427881598472595, + "learning_rate": 2.0107641843902726e-05, + "loss": 2.8084, + "step": 9940 + }, + { + "epoch": 0.7343173431734318, + "grad_norm": 0.4502193331718445, + "learning_rate": 2.0004488485342088e-05, + "loss": 2.909, + "step": 9950 + }, + { + "epoch": 0.7350553505535056, + "grad_norm": 0.44448336958885193, + "learning_rate": 1.9901534190525566e-05, + "loss": 2.8662, + "step": 9960 + }, + { + "epoch": 0.7357933579335794, + "grad_norm": 0.4308652877807617, + "learning_rate": 1.9798779642704297e-05, + "loss": 2.7882, + "step": 9970 + }, + { + "epoch": 0.7365313653136532, + "grad_norm": 0.4563472867012024, + "learning_rate": 1.96962255238038e-05, + "loss": 2.8956, + "step": 9980 + }, + { + "epoch": 0.7372693726937269, + "grad_norm": 0.4397279620170593, + "learning_rate": 1.9593872514419476e-05, + "loss": 2.7707, + "step": 9990 + }, + { + "epoch": 0.7380073800738007, + "grad_norm": 0.47456085681915283, + "learning_rate": 1.9491721293812076e-05, + "loss": 2.9205, + "step": 10000 + }, + { + "epoch": 0.7387453874538745, + "grad_norm": 0.43729913234710693, + "learning_rate": 1.9389772539903122e-05, + "loss": 2.8423, + "step": 10010 + }, + { + "epoch": 0.7394833948339483, + "grad_norm": 0.4417737126350403, + "learning_rate": 1.9288026929270587e-05, + "loss": 2.832, + "step": 10020 + }, + { + "epoch": 0.7402214022140221, + "grad_norm": 0.44813665747642517, + "learning_rate": 1.9186485137144218e-05, + "loss": 2.8494, + "step": 10030 + }, + { + "epoch": 0.7409594095940959, + "grad_norm": 0.45640864968299866, + "learning_rate": 1.908514783740114e-05, + "loss": 2.8784, + "step": 10040 + }, + { + "epoch": 0.7416974169741697, + "grad_norm": 0.4336318373680115, + "learning_rate": 1.8984015702561393e-05, + "loss": 2.8372, + "step": 10050 + }, + { + "epoch": 0.7424354243542436, + "grad_norm": 0.4504336714744568, + "learning_rate": 1.8883089403783434e-05, + "loss": 2.7967, + "step": 10060 + }, + { + "epoch": 0.7431734317343174, + "grad_norm": 0.46149566769599915, + "learning_rate": 1.8782369610859708e-05, + "loss": 2.8191, + "step": 10070 + }, + { + "epoch": 0.7439114391143912, + "grad_norm": 0.4522392451763153, + "learning_rate": 1.868185699221221e-05, + "loss": 2.8794, + "step": 10080 + }, + { + "epoch": 0.744649446494465, + "grad_norm": 0.4411635994911194, + "learning_rate": 1.8581552214887977e-05, + "loss": 2.8404, + "step": 10090 + }, + { + "epoch": 0.7453874538745388, + "grad_norm": 0.46107056736946106, + "learning_rate": 1.848145594455477e-05, + "loss": 2.846, + "step": 10100 + }, + { + "epoch": 0.7461254612546125, + "grad_norm": 0.45308247208595276, + "learning_rate": 1.8381568845496578e-05, + "loss": 2.807, + "step": 10110 + }, + { + "epoch": 0.7468634686346863, + "grad_norm": 0.44377437233924866, + "learning_rate": 1.828189158060927e-05, + "loss": 2.9005, + "step": 10120 + }, + { + 
"epoch": 0.7476014760147601, + "grad_norm": 0.45314696431159973, + "learning_rate": 1.8182424811396133e-05, + "loss": 2.8626, + "step": 10130 + }, + { + "epoch": 0.7483394833948339, + "grad_norm": 0.4458778202533722, + "learning_rate": 1.80831691979635e-05, + "loss": 2.7985, + "step": 10140 + }, + { + "epoch": 0.7490774907749077, + "grad_norm": 0.464269757270813, + "learning_rate": 1.7984125399016392e-05, + "loss": 2.9386, + "step": 10150 + }, + { + "epoch": 0.7498154981549815, + "grad_norm": 0.4448395371437073, + "learning_rate": 1.7885294071854157e-05, + "loss": 2.833, + "step": 10160 + }, + { + "epoch": 0.7505535055350554, + "grad_norm": 0.4455287754535675, + "learning_rate": 1.7786675872366028e-05, + "loss": 2.8184, + "step": 10170 + }, + { + "epoch": 0.7512915129151292, + "grad_norm": 0.4467598497867584, + "learning_rate": 1.7688271455026867e-05, + "loss": 2.8357, + "step": 10180 + }, + { + "epoch": 0.752029520295203, + "grad_norm": 0.4642166197299957, + "learning_rate": 1.7590081472892776e-05, + "loss": 2.9219, + "step": 10190 + }, + { + "epoch": 0.7527675276752768, + "grad_norm": 0.44453924894332886, + "learning_rate": 1.7492106577596772e-05, + "loss": 2.8822, + "step": 10200 + }, + { + "epoch": 0.7535055350553506, + "grad_norm": 0.4599774479866028, + "learning_rate": 1.7394347419344432e-05, + "loss": 2.8336, + "step": 10210 + }, + { + "epoch": 0.7542435424354244, + "grad_norm": 0.4477832317352295, + "learning_rate": 1.7296804646909654e-05, + "loss": 2.785, + "step": 10220 + }, + { + "epoch": 0.7549815498154981, + "grad_norm": 0.45672887563705444, + "learning_rate": 1.7199478907630267e-05, + "loss": 2.8166, + "step": 10230 + }, + { + "epoch": 0.7557195571955719, + "grad_norm": 0.4571615159511566, + "learning_rate": 1.710237084740378e-05, + "loss": 2.9199, + "step": 10240 + }, + { + "epoch": 0.7564575645756457, + "grad_norm": 0.4618014991283417, + "learning_rate": 1.7005481110683062e-05, + "loss": 2.907, + "step": 10250 + }, + { + "epoch": 0.7571955719557195, + "grad_norm": 0.44089266657829285, + "learning_rate": 1.690881034047212e-05, + "loss": 2.854, + "step": 10260 + }, + { + "epoch": 0.7579335793357933, + "grad_norm": 0.4468059837818146, + "learning_rate": 1.6812359178321784e-05, + "loss": 2.8511, + "step": 10270 + }, + { + "epoch": 0.7586715867158672, + "grad_norm": 0.4517216682434082, + "learning_rate": 1.6716128264325475e-05, + "loss": 2.8117, + "step": 10280 + }, + { + "epoch": 0.759409594095941, + "grad_norm": 0.4576111137866974, + "learning_rate": 1.662011823711495e-05, + "loss": 2.838, + "step": 10290 + }, + { + "epoch": 0.7601476014760148, + "grad_norm": 0.4355645179748535, + "learning_rate": 1.6524329733856047e-05, + "loss": 2.8054, + "step": 10300 + }, + { + "epoch": 0.7608856088560886, + "grad_norm": 0.4544225037097931, + "learning_rate": 1.642876339024446e-05, + "loss": 2.8703, + "step": 10310 + }, + { + "epoch": 0.7616236162361624, + "grad_norm": 0.4510670006275177, + "learning_rate": 1.633341984050162e-05, + "loss": 2.8265, + "step": 10320 + }, + { + "epoch": 0.7623616236162362, + "grad_norm": 0.444296658039093, + "learning_rate": 1.6238299717370252e-05, + "loss": 2.9467, + "step": 10330 + }, + { + "epoch": 0.76309963099631, + "grad_norm": 0.44352987408638, + "learning_rate": 1.614340365211044e-05, + "loss": 2.8385, + "step": 10340 + }, + { + "epoch": 0.7638376383763837, + "grad_norm": 0.4408433139324188, + "learning_rate": 1.6048732274495255e-05, + "loss": 2.7828, + "step": 10350 + }, + { + "epoch": 0.7645756457564575, + "grad_norm": 0.4516165554523468, + 
"learning_rate": 1.595428621280668e-05, + "loss": 2.8448, + "step": 10360 + }, + { + "epoch": 0.7653136531365313, + "grad_norm": 0.4665060341358185, + "learning_rate": 1.5860066093831367e-05, + "loss": 2.8067, + "step": 10370 + }, + { + "epoch": 0.7660516605166051, + "grad_norm": 0.44463926553726196, + "learning_rate": 1.5766072542856526e-05, + "loss": 2.8421, + "step": 10380 + }, + { + "epoch": 0.766789667896679, + "grad_norm": 0.426488995552063, + "learning_rate": 1.5672306183665764e-05, + "loss": 2.8121, + "step": 10390 + }, + { + "epoch": 0.7675276752767528, + "grad_norm": 0.44521549344062805, + "learning_rate": 1.557876763853493e-05, + "loss": 2.7992, + "step": 10400 + }, + { + "epoch": 0.7682656826568266, + "grad_norm": 0.45438680052757263, + "learning_rate": 1.5485457528228003e-05, + "loss": 2.8034, + "step": 10410 + }, + { + "epoch": 0.7690036900369004, + "grad_norm": 0.4456971287727356, + "learning_rate": 1.5392376471992965e-05, + "loss": 2.8191, + "step": 10420 + }, + { + "epoch": 0.7697416974169742, + "grad_norm": 0.4459834694862366, + "learning_rate": 1.529952508755768e-05, + "loss": 2.8668, + "step": 10430 + }, + { + "epoch": 0.770479704797048, + "grad_norm": 0.4495449960231781, + "learning_rate": 1.5206903991125832e-05, + "loss": 2.8433, + "step": 10440 + }, + { + "epoch": 0.7712177121771218, + "grad_norm": 0.4536389708518982, + "learning_rate": 1.511451379737278e-05, + "loss": 2.8522, + "step": 10450 + }, + { + "epoch": 0.7719557195571956, + "grad_norm": 0.44112926721572876, + "learning_rate": 1.502235511944154e-05, + "loss": 2.872, + "step": 10460 + }, + { + "epoch": 0.7726937269372693, + "grad_norm": 0.43305230140686035, + "learning_rate": 1.4930428568938648e-05, + "loss": 2.901, + "step": 10470 + }, + { + "epoch": 0.7734317343173431, + "grad_norm": 0.4792589247226715, + "learning_rate": 1.4838734755930167e-05, + "loss": 2.7635, + "step": 10480 + }, + { + "epoch": 0.7741697416974169, + "grad_norm": 0.4358636438846588, + "learning_rate": 1.4747274288937596e-05, + "loss": 2.8276, + "step": 10490 + }, + { + "epoch": 0.7749077490774908, + "grad_norm": 0.44949012994766235, + "learning_rate": 1.4656047774933874e-05, + "loss": 2.8624, + "step": 10500 + }, + { + "epoch": 0.7756457564575646, + "grad_norm": 0.4440300762653351, + "learning_rate": 1.4565055819339235e-05, + "loss": 2.8239, + "step": 10510 + }, + { + "epoch": 0.7763837638376384, + "grad_norm": 0.4554462730884552, + "learning_rate": 1.447429902601739e-05, + "loss": 2.7734, + "step": 10520 + }, + { + "epoch": 0.7771217712177122, + "grad_norm": 0.4523858428001404, + "learning_rate": 1.4383777997271347e-05, + "loss": 2.8976, + "step": 10530 + }, + { + "epoch": 0.777859778597786, + "grad_norm": 0.46444228291511536, + "learning_rate": 1.429349333383948e-05, + "loss": 2.8756, + "step": 10540 + }, + { + "epoch": 0.7785977859778598, + "grad_norm": 0.4419015347957611, + "learning_rate": 1.4203445634891538e-05, + "loss": 2.8626, + "step": 10550 + }, + { + "epoch": 0.7793357933579336, + "grad_norm": 0.44527843594551086, + "learning_rate": 1.4113635498024664e-05, + "loss": 2.8063, + "step": 10560 + }, + { + "epoch": 0.7800738007380074, + "grad_norm": 0.4554080665111542, + "learning_rate": 1.4024063519259439e-05, + "loss": 2.7555, + "step": 10570 + }, + { + "epoch": 0.7808118081180812, + "grad_norm": 0.4289720952510834, + "learning_rate": 1.3934730293035936e-05, + "loss": 2.8304, + "step": 10580 + }, + { + "epoch": 0.7815498154981549, + "grad_norm": 0.4606097936630249, + "learning_rate": 1.38456364122097e-05, + "loss": 2.8415, + 
"step": 10590 + }, + { + "epoch": 0.7822878228782287, + "grad_norm": 0.4606861174106598, + "learning_rate": 1.3756782468047936e-05, + "loss": 2.889, + "step": 10600 + }, + { + "epoch": 0.7830258302583026, + "grad_norm": 0.425731897354126, + "learning_rate": 1.3668169050225472e-05, + "loss": 2.8573, + "step": 10610 + }, + { + "epoch": 0.7837638376383764, + "grad_norm": 0.4634413421154022, + "learning_rate": 1.357979674682095e-05, + "loss": 2.8677, + "step": 10620 + }, + { + "epoch": 0.7845018450184502, + "grad_norm": 0.45793548226356506, + "learning_rate": 1.349166614431282e-05, + "loss": 2.9207, + "step": 10630 + }, + { + "epoch": 0.785239852398524, + "grad_norm": 0.4642331898212433, + "learning_rate": 1.3403777827575514e-05, + "loss": 2.887, + "step": 10640 + }, + { + "epoch": 0.7859778597785978, + "grad_norm": 0.4591294825077057, + "learning_rate": 1.3316132379875551e-05, + "loss": 2.8502, + "step": 10650 + }, + { + "epoch": 0.7867158671586716, + "grad_norm": 0.4461764395236969, + "learning_rate": 1.322873038286766e-05, + "loss": 2.8357, + "step": 10660 + }, + { + "epoch": 0.7874538745387454, + "grad_norm": 0.4518667757511139, + "learning_rate": 1.3141572416590891e-05, + "loss": 2.9274, + "step": 10670 + }, + { + "epoch": 0.7881918819188192, + "grad_norm": 0.435041606426239, + "learning_rate": 1.3054659059464835e-05, + "loss": 2.7578, + "step": 10680 + }, + { + "epoch": 0.788929889298893, + "grad_norm": 0.45000597834587097, + "learning_rate": 1.2967990888285737e-05, + "loss": 2.8792, + "step": 10690 + }, + { + "epoch": 0.7896678966789668, + "grad_norm": 0.4507540464401245, + "learning_rate": 1.2881568478222672e-05, + "loss": 2.9286, + "step": 10700 + }, + { + "epoch": 0.7904059040590405, + "grad_norm": 0.44547247886657715, + "learning_rate": 1.2795392402813715e-05, + "loss": 2.7792, + "step": 10710 + }, + { + "epoch": 0.7911439114391144, + "grad_norm": 0.4526568353176117, + "learning_rate": 1.2709463233962204e-05, + "loss": 2.8923, + "step": 10720 + }, + { + "epoch": 0.7918819188191882, + "grad_norm": 0.4650912284851074, + "learning_rate": 1.262378154193285e-05, + "loss": 2.7767, + "step": 10730 + }, + { + "epoch": 0.792619926199262, + "grad_norm": 0.4619973301887512, + "learning_rate": 1.2538347895348013e-05, + "loss": 2.7074, + "step": 10740 + }, + { + "epoch": 0.7933579335793358, + "grad_norm": 0.4545031487941742, + "learning_rate": 1.2453162861183909e-05, + "loss": 2.832, + "step": 10750 + }, + { + "epoch": 0.7940959409594096, + "grad_norm": 0.45016980171203613, + "learning_rate": 1.236822700476683e-05, + "loss": 2.8709, + "step": 10760 + }, + { + "epoch": 0.7948339483394834, + "grad_norm": 0.41397857666015625, + "learning_rate": 1.2283540889769445e-05, + "loss": 2.7864, + "step": 10770 + }, + { + "epoch": 0.7955719557195572, + "grad_norm": 0.47167348861694336, + "learning_rate": 1.2199105078207001e-05, + "loss": 2.7768, + "step": 10780 + }, + { + "epoch": 0.796309963099631, + "grad_norm": 0.46366357803344727, + "learning_rate": 1.2114920130433644e-05, + "loss": 2.8994, + "step": 10790 + }, + { + "epoch": 0.7970479704797048, + "grad_norm": 0.4539276957511902, + "learning_rate": 1.2030986605138644e-05, + "loss": 2.8526, + "step": 10800 + }, + { + "epoch": 0.7977859778597786, + "grad_norm": 0.430576354265213, + "learning_rate": 1.1947305059342729e-05, + "loss": 2.7993, + "step": 10810 + }, + { + "epoch": 0.7985239852398524, + "grad_norm": 0.4400356113910675, + "learning_rate": 1.1863876048394407e-05, + "loss": 2.9068, + "step": 10820 + }, + { + "epoch": 0.7992619926199263, + 
"grad_norm": 0.44879478216171265, + "learning_rate": 1.1780700125966233e-05, + "loss": 2.8591, + "step": 10830 + }, + { + "epoch": 0.8, + "grad_norm": 0.44169095158576965, + "learning_rate": 1.1697777844051105e-05, + "loss": 2.793, + "step": 10840 + }, + { + "epoch": 0.8007380073800738, + "grad_norm": 0.45461106300354004, + "learning_rate": 1.1615109752958713e-05, + "loss": 2.9182, + "step": 10850 + }, + { + "epoch": 0.8014760147601476, + "grad_norm": 0.4425186812877655, + "learning_rate": 1.1532696401311787e-05, + "loss": 2.8754, + "step": 10860 + }, + { + "epoch": 0.8022140221402214, + "grad_norm": 0.4334977865219116, + "learning_rate": 1.1450538336042516e-05, + "loss": 2.8037, + "step": 10870 + }, + { + "epoch": 0.8029520295202952, + "grad_norm": 0.43513453006744385, + "learning_rate": 1.1368636102388868e-05, + "loss": 2.8548, + "step": 10880 + }, + { + "epoch": 0.803690036900369, + "grad_norm": 0.4428231716156006, + "learning_rate": 1.1286990243891011e-05, + "loss": 2.8673, + "step": 10890 + }, + { + "epoch": 0.8044280442804428, + "grad_norm": 0.4509079158306122, + "learning_rate": 1.1205601302387692e-05, + "loss": 2.9012, + "step": 10900 + }, + { + "epoch": 0.8051660516605166, + "grad_norm": 0.44838449358940125, + "learning_rate": 1.1124469818012635e-05, + "loss": 2.8056, + "step": 10910 + }, + { + "epoch": 0.8059040590405904, + "grad_norm": 0.4536844491958618, + "learning_rate": 1.1043596329190964e-05, + "loss": 2.883, + "step": 10920 + }, + { + "epoch": 0.8066420664206642, + "grad_norm": 0.44634494185447693, + "learning_rate": 1.0962981372635628e-05, + "loss": 2.8049, + "step": 10930 + }, + { + "epoch": 0.8073800738007381, + "grad_norm": 0.4615216553211212, + "learning_rate": 1.0882625483343845e-05, + "loss": 2.9058, + "step": 10940 + }, + { + "epoch": 0.8081180811808119, + "grad_norm": 0.4436852037906647, + "learning_rate": 1.0802529194593547e-05, + "loss": 2.8492, + "step": 10950 + }, + { + "epoch": 0.8088560885608856, + "grad_norm": 0.4358108341693878, + "learning_rate": 1.0722693037939818e-05, + "loss": 2.8513, + "step": 10960 + }, + { + "epoch": 0.8095940959409594, + "grad_norm": 0.45849135518074036, + "learning_rate": 1.0643117543211422e-05, + "loss": 2.8141, + "step": 10970 + }, + { + "epoch": 0.8103321033210332, + "grad_norm": 0.4694216251373291, + "learning_rate": 1.0563803238507219e-05, + "loss": 2.8304, + "step": 10980 + }, + { + "epoch": 0.811070110701107, + "grad_norm": 0.4531688094139099, + "learning_rate": 1.0484750650192726e-05, + "loss": 2.9128, + "step": 10990 + }, + { + "epoch": 0.8118081180811808, + "grad_norm": 0.4585440754890442, + "learning_rate": 1.0405960302896562e-05, + "loss": 2.8299, + "step": 11000 + }, + { + "epoch": 0.8125461254612546, + "grad_norm": 0.4274667799472809, + "learning_rate": 1.0327432719507019e-05, + "loss": 2.7979, + "step": 11010 + }, + { + "epoch": 0.8132841328413284, + "grad_norm": 0.43614691495895386, + "learning_rate": 1.0249168421168558e-05, + "loss": 2.8119, + "step": 11020 + }, + { + "epoch": 0.8140221402214022, + "grad_norm": 0.45556968450546265, + "learning_rate": 1.0171167927278368e-05, + "loss": 2.9038, + "step": 11030 + }, + { + "epoch": 0.814760147601476, + "grad_norm": 0.44112008810043335, + "learning_rate": 1.0093431755482908e-05, + "loss": 2.9019, + "step": 11040 + }, + { + "epoch": 0.8154981549815498, + "grad_norm": 0.444204181432724, + "learning_rate": 1.001596042167447e-05, + "loss": 2.7909, + "step": 11050 + }, + { + "epoch": 0.8162361623616237, + "grad_norm": 0.427478551864624, + "learning_rate": 
9.93875443998778e-06, + "loss": 2.8195, + "step": 11060 + }, + { + "epoch": 0.8169741697416975, + "grad_norm": 0.4325047433376312, + "learning_rate": 9.861814322796553e-06, + "loss": 2.8227, + "step": 11070 + }, + { + "epoch": 0.8177121771217712, + "grad_norm": 0.4463500380516052, + "learning_rate": 9.785140580710107e-06, + "loss": 2.8502, + "step": 11080 + }, + { + "epoch": 0.818450184501845, + "grad_norm": 0.44314101338386536, + "learning_rate": 9.708733722569996e-06, + "loss": 2.8617, + "step": 11090 + }, + { + "epoch": 0.8191881918819188, + "grad_norm": 0.43770846724510193, + "learning_rate": 9.632594255446565e-06, + "loss": 2.815, + "step": 11100 + }, + { + "epoch": 0.8199261992619926, + "grad_norm": 0.48664426803588867, + "learning_rate": 9.556722684635667e-06, + "loss": 2.8386, + "step": 11110 + }, + { + "epoch": 0.8206642066420664, + "grad_norm": 0.42718470096588135, + "learning_rate": 9.48111951365529e-06, + "loss": 2.7743, + "step": 11120 + }, + { + "epoch": 0.8214022140221402, + "grad_norm": 0.4534224569797516, + "learning_rate": 9.405785244242165e-06, + "loss": 2.885, + "step": 11130 + }, + { + "epoch": 0.822140221402214, + "grad_norm": 0.4469706118106842, + "learning_rate": 9.330720376348483e-06, + "loss": 2.7431, + "step": 11140 + }, + { + "epoch": 0.8228782287822878, + "grad_norm": 0.4499460756778717, + "learning_rate": 9.25592540813857e-06, + "loss": 2.8604, + "step": 11150 + }, + { + "epoch": 0.8236162361623616, + "grad_norm": 0.4386638104915619, + "learning_rate": 9.18140083598557e-06, + "loss": 2.797, + "step": 11160 + }, + { + "epoch": 0.8243542435424355, + "grad_norm": 0.4377821683883667, + "learning_rate": 9.10714715446817e-06, + "loss": 2.8071, + "step": 11170 + }, + { + "epoch": 0.8250922509225093, + "grad_norm": 0.4503236413002014, + "learning_rate": 9.03316485636727e-06, + "loss": 2.8215, + "step": 11180 + }, + { + "epoch": 0.825830258302583, + "grad_norm": 0.4537326693534851, + "learning_rate": 8.959454432662778e-06, + "loss": 2.7938, + "step": 11190 + }, + { + "epoch": 0.8265682656826568, + "grad_norm": 0.4477526843547821, + "learning_rate": 8.88601637253032e-06, + "loss": 2.7778, + "step": 11200 + }, + { + "epoch": 0.8273062730627306, + "grad_norm": 0.45014604926109314, + "learning_rate": 8.812851163337975e-06, + "loss": 2.792, + "step": 11210 + }, + { + "epoch": 0.8280442804428044, + "grad_norm": 0.44553130865097046, + "learning_rate": 8.739959290643097e-06, + "loss": 2.8268, + "step": 11220 + }, + { + "epoch": 0.8287822878228782, + "grad_norm": 0.45030757784843445, + "learning_rate": 8.667341238189009e-06, + "loss": 2.8332, + "step": 11230 + }, + { + "epoch": 0.829520295202952, + "grad_norm": 0.44522371888160706, + "learning_rate": 8.594997487901879e-06, + "loss": 2.8526, + "step": 11240 + }, + { + "epoch": 0.8302583025830258, + "grad_norm": 0.46951159834861755, + "learning_rate": 8.522928519887463e-06, + "loss": 2.8052, + "step": 11250 + }, + { + "epoch": 0.8309963099630996, + "grad_norm": 0.45531222224235535, + "learning_rate": 8.451134812427925e-06, + "loss": 2.8108, + "step": 11260 + }, + { + "epoch": 0.8317343173431734, + "grad_norm": 0.4519606828689575, + "learning_rate": 8.379616841978699e-06, + "loss": 2.8302, + "step": 11270 + }, + { + "epoch": 0.8324723247232473, + "grad_norm": 0.45735597610473633, + "learning_rate": 8.308375083165298e-06, + "loss": 2.9323, + "step": 11280 + }, + { + "epoch": 0.8332103321033211, + "grad_norm": 0.4518982172012329, + "learning_rate": 8.237410008780161e-06, + "loss": 2.796, + "step": 11290 + }, + { + "epoch": 
0.8339483394833949, + "grad_norm": 0.4294179379940033, + "learning_rate": 8.166722089779539e-06, + "loss": 2.8383, + "step": 11300 + }, + { + "epoch": 0.8346863468634687, + "grad_norm": 0.43325817584991455, + "learning_rate": 8.096311795280331e-06, + "loss": 2.7896, + "step": 11310 + }, + { + "epoch": 0.8354243542435424, + "grad_norm": 0.4492734670639038, + "learning_rate": 8.026179592557037e-06, + "loss": 2.8272, + "step": 11320 + }, + { + "epoch": 0.8361623616236162, + "grad_norm": 0.4338243007659912, + "learning_rate": 7.956325947038584e-06, + "loss": 2.8173, + "step": 11330 + }, + { + "epoch": 0.83690036900369, + "grad_norm": 0.4449402987957001, + "learning_rate": 7.886751322305247e-06, + "loss": 2.8244, + "step": 11340 + }, + { + "epoch": 0.8376383763837638, + "grad_norm": 0.44180235266685486, + "learning_rate": 7.817456180085636e-06, + "loss": 2.8902, + "step": 11350 + }, + { + "epoch": 0.8383763837638376, + "grad_norm": 0.45504215359687805, + "learning_rate": 7.748440980253562e-06, + "loss": 2.8344, + "step": 11360 + }, + { + "epoch": 0.8391143911439114, + "grad_norm": 0.4654461443424225, + "learning_rate": 7.67970618082503e-06, + "loss": 2.8335, + "step": 11370 + }, + { + "epoch": 0.8398523985239852, + "grad_norm": 0.47360721230506897, + "learning_rate": 7.611252237955169e-06, + "loss": 2.8943, + "step": 11380 + }, + { + "epoch": 0.8405904059040591, + "grad_norm": 0.4570152461528778, + "learning_rate": 7.543079605935221e-06, + "loss": 2.8674, + "step": 11390 + }, + { + "epoch": 0.8413284132841329, + "grad_norm": 0.41285139322280884, + "learning_rate": 7.47518873718952e-06, + "loss": 2.8292, + "step": 11400 + }, + { + "epoch": 0.8420664206642067, + "grad_norm": 0.45135176181793213, + "learning_rate": 7.407580082272492e-06, + "loss": 2.7573, + "step": 11410 + }, + { + "epoch": 0.8428044280442805, + "grad_norm": 0.4763992726802826, + "learning_rate": 7.340254089865672e-06, + "loss": 2.8902, + "step": 11420 + }, + { + "epoch": 0.8435424354243543, + "grad_norm": 0.480816513299942, + "learning_rate": 7.27321120677471e-06, + "loss": 2.9058, + "step": 11430 + }, + { + "epoch": 0.844280442804428, + "grad_norm": 0.4476820230484009, + "learning_rate": 7.206451877926418e-06, + "loss": 2.8191, + "step": 11440 + }, + { + "epoch": 0.8450184501845018, + "grad_norm": 0.4477422833442688, + "learning_rate": 7.139976546365817e-06, + "loss": 2.8023, + "step": 11450 + }, + { + "epoch": 0.8457564575645756, + "grad_norm": 0.4407312572002411, + "learning_rate": 7.0737856532531895e-06, + "loss": 2.8368, + "step": 11460 + }, + { + "epoch": 0.8464944649446494, + "grad_norm": 0.45549750328063965, + "learning_rate": 7.007879637861159e-06, + "loss": 2.8561, + "step": 11470 + }, + { + "epoch": 0.8472324723247232, + "grad_norm": 0.4288015067577362, + "learning_rate": 6.942258937571772e-06, + "loss": 2.7234, + "step": 11480 + }, + { + "epoch": 0.847970479704797, + "grad_norm": 0.4370770752429962, + "learning_rate": 6.87692398787359e-06, + "loss": 2.8607, + "step": 11490 + }, + { + "epoch": 0.8487084870848709, + "grad_norm": 0.44784659147262573, + "learning_rate": 6.81187522235881e-06, + "loss": 2.78, + "step": 11500 + }, + { + "epoch": 0.8494464944649447, + "grad_norm": 0.43501320481300354, + "learning_rate": 6.747113072720385e-06, + "loss": 2.8121, + "step": 11510 + }, + { + "epoch": 0.8501845018450185, + "grad_norm": 0.4419308006763458, + "learning_rate": 6.6826379687491505e-06, + "loss": 2.8502, + "step": 11520 + }, + { + "epoch": 0.8509225092250923, + "grad_norm": 0.4417872130870819, + "learning_rate": 
6.6184503383309784e-06, + "loss": 2.8042, + "step": 11530 + }, + { + "epoch": 0.8516605166051661, + "grad_norm": 0.4433625638484955, + "learning_rate": 6.5545506074439325e-06, + "loss": 2.7962, + "step": 11540 + }, + { + "epoch": 0.8523985239852399, + "grad_norm": 0.44587311148643494, + "learning_rate": 6.490939200155449e-06, + "loss": 2.841, + "step": 11550 + }, + { + "epoch": 0.8531365313653136, + "grad_norm": 0.4439995288848877, + "learning_rate": 6.427616538619524e-06, + "loss": 2.8195, + "step": 11560 + }, + { + "epoch": 0.8538745387453874, + "grad_norm": 0.4364805519580841, + "learning_rate": 6.3645830430739015e-06, + "loss": 2.7775, + "step": 11570 + }, + { + "epoch": 0.8546125461254612, + "grad_norm": 0.4607424736022949, + "learning_rate": 6.301839131837284e-06, + "loss": 2.907, + "step": 11580 + }, + { + "epoch": 0.855350553505535, + "grad_norm": 0.45834723114967346, + "learning_rate": 6.239385221306587e-06, + "loss": 2.8708, + "step": 11590 + }, + { + "epoch": 0.8560885608856088, + "grad_norm": 0.43934082984924316, + "learning_rate": 6.177221725954102e-06, + "loss": 2.8159, + "step": 11600 + }, + { + "epoch": 0.8568265682656827, + "grad_norm": 0.4437257945537567, + "learning_rate": 6.1153490583248265e-06, + "loss": 2.8734, + "step": 11610 + }, + { + "epoch": 0.8575645756457565, + "grad_norm": 0.43929627537727356, + "learning_rate": 6.053767629033713e-06, + "loss": 2.874, + "step": 11620 + }, + { + "epoch": 0.8583025830258303, + "grad_norm": 0.4439617097377777, + "learning_rate": 5.992477846762895e-06, + "loss": 2.8252, + "step": 11630 + }, + { + "epoch": 0.8590405904059041, + "grad_norm": 0.4464716613292694, + "learning_rate": 5.931480118259003e-06, + "loss": 2.78, + "step": 11640 + }, + { + "epoch": 0.8597785977859779, + "grad_norm": 0.43279653787612915, + "learning_rate": 5.870774848330485e-06, + "loss": 2.749, + "step": 11650 + }, + { + "epoch": 0.8605166051660517, + "grad_norm": 0.4490513503551483, + "learning_rate": 5.810362439844896e-06, + "loss": 2.841, + "step": 11660 + }, + { + "epoch": 0.8612546125461255, + "grad_norm": 0.4711556136608124, + "learning_rate": 5.750243293726226e-06, + "loss": 2.7801, + "step": 11670 + }, + { + "epoch": 0.8619926199261992, + "grad_norm": 0.4525899887084961, + "learning_rate": 5.690417808952242e-06, + "loss": 2.8942, + "step": 11680 + }, + { + "epoch": 0.862730627306273, + "grad_norm": 0.44727823138237, + "learning_rate": 5.6308863825518425e-06, + "loss": 2.8095, + "step": 11690 + }, + { + "epoch": 0.8634686346863468, + "grad_norm": 0.43965160846710205, + "learning_rate": 5.571649409602436e-06, + "loss": 2.8073, + "step": 11700 + }, + { + "epoch": 0.8642066420664206, + "grad_norm": 0.45212361216545105, + "learning_rate": 5.512707283227275e-06, + "loss": 2.8849, + "step": 11710 + }, + { + "epoch": 0.8649446494464945, + "grad_norm": 0.4664202332496643, + "learning_rate": 5.454060394592919e-06, + "loss": 2.8199, + "step": 11720 + }, + { + "epoch": 0.8656826568265683, + "grad_norm": 0.4387909471988678, + "learning_rate": 5.395709132906568e-06, + "loss": 2.8372, + "step": 11730 + }, + { + "epoch": 0.8664206642066421, + "grad_norm": 0.4543474018573761, + "learning_rate": 5.337653885413513e-06, + "loss": 2.8331, + "step": 11740 + }, + { + "epoch": 0.8671586715867159, + "grad_norm": 0.45128577947616577, + "learning_rate": 5.279895037394566e-06, + "loss": 2.8062, + "step": 11750 + }, + { + "epoch": 0.8678966789667897, + "grad_norm": 0.4404621422290802, + "learning_rate": 5.222432972163482e-06, + "loss": 2.9088, + "step": 11760 + }, + { + "epoch": 
0.8686346863468635, + "grad_norm": 0.4398937225341797, + "learning_rate": 5.165268071064455e-06, + "loss": 2.7826, + "step": 11770 + }, + { + "epoch": 0.8693726937269373, + "grad_norm": 0.4395955204963684, + "learning_rate": 5.108400713469546e-06, + "loss": 2.8196, + "step": 11780 + }, + { + "epoch": 0.870110701107011, + "grad_norm": 0.43461933732032776, + "learning_rate": 5.051831276776203e-06, + "loss": 2.8663, + "step": 11790 + }, + { + "epoch": 0.8708487084870848, + "grad_norm": 0.4447794258594513, + "learning_rate": 4.995560136404709e-06, + "loss": 2.8519, + "step": 11800 + }, + { + "epoch": 0.8715867158671586, + "grad_norm": 0.4266679286956787, + "learning_rate": 4.939587665795736e-06, + "loss": 2.8062, + "step": 11810 + }, + { + "epoch": 0.8723247232472324, + "grad_norm": 0.4411248564720154, + "learning_rate": 4.88391423640786e-06, + "loss": 2.8758, + "step": 11820 + }, + { + "epoch": 0.8730627306273063, + "grad_norm": 0.44381076097488403, + "learning_rate": 4.828540217715066e-06, + "loss": 2.7979, + "step": 11830 + }, + { + "epoch": 0.8738007380073801, + "grad_norm": 0.44569119811058044, + "learning_rate": 4.773465977204311e-06, + "loss": 2.8081, + "step": 11840 + }, + { + "epoch": 0.8745387453874539, + "grad_norm": 0.48127833008766174, + "learning_rate": 4.718691880373094e-06, + "loss": 2.8617, + "step": 11850 + }, + { + "epoch": 0.8752767527675277, + "grad_norm": 0.45613643527030945, + "learning_rate": 4.664218290727035e-06, + "loss": 2.8187, + "step": 11860 + }, + { + "epoch": 0.8760147601476015, + "grad_norm": 0.440491646528244, + "learning_rate": 4.610045569777444e-06, + "loss": 2.8023, + "step": 11870 + }, + { + "epoch": 0.8767527675276753, + "grad_norm": 0.4358707368373871, + "learning_rate": 4.5561740770389275e-06, + "loss": 2.8102, + "step": 11880 + }, + { + "epoch": 0.8774907749077491, + "grad_norm": 0.43503841757774353, + "learning_rate": 4.502604170027019e-06, + "loss": 2.8204, + "step": 11890 + }, + { + "epoch": 0.8782287822878229, + "grad_norm": 0.4486919343471527, + "learning_rate": 4.449336204255777e-06, + "loss": 2.8827, + "step": 11900 + }, + { + "epoch": 0.8789667896678967, + "grad_norm": 0.43869447708129883, + "learning_rate": 4.396370533235455e-06, + "loss": 2.8374, + "step": 11910 + }, + { + "epoch": 0.8797047970479704, + "grad_norm": 0.45128440856933594, + "learning_rate": 4.343707508470135e-06, + "loss": 2.8906, + "step": 11920 + }, + { + "epoch": 0.8804428044280442, + "grad_norm": 0.46216467022895813, + "learning_rate": 4.291347479455405e-06, + "loss": 2.8381, + "step": 11930 + }, + { + "epoch": 0.8811808118081181, + "grad_norm": 0.4366297721862793, + "learning_rate": 4.2392907936760265e-06, + "loss": 2.8183, + "step": 11940 + }, + { + "epoch": 0.8819188191881919, + "grad_norm": 0.45038753747940063, + "learning_rate": 4.187537796603658e-06, + "loss": 2.7906, + "step": 11950 + }, + { + "epoch": 0.8826568265682657, + "grad_norm": 0.45959797501564026, + "learning_rate": 4.136088831694524e-06, + "loss": 2.8724, + "step": 11960 + }, + { + "epoch": 0.8833948339483395, + "grad_norm": 0.4413219392299652, + "learning_rate": 4.084944240387168e-06, + "loss": 2.8541, + "step": 11970 + }, + { + "epoch": 0.8841328413284133, + "grad_norm": 0.47469910979270935, + "learning_rate": 4.034104362100155e-06, + "loss": 2.9288, + "step": 11980 + }, + { + "epoch": 0.8848708487084871, + "grad_norm": 0.43708014488220215, + "learning_rate": 3.983569534229864e-06, + "loss": 2.7833, + "step": 11990 + }, + { + "epoch": 0.8856088560885609, + "grad_norm": 0.44569307565689087, + 
"learning_rate": 3.933340092148202e-06, + "loss": 2.8684, + "step": 12000 + }, + { + "epoch": 0.8863468634686347, + "grad_norm": 0.462568998336792, + "learning_rate": 3.883416369200399e-06, + "loss": 2.8399, + "step": 12010 + }, + { + "epoch": 0.8870848708487085, + "grad_norm": 0.4384634494781494, + "learning_rate": 3.8337986967028e-06, + "loss": 2.837, + "step": 12020 + }, + { + "epoch": 0.8878228782287823, + "grad_norm": 0.46717679500579834, + "learning_rate": 3.7844874039406674e-06, + "loss": 2.8523, + "step": 12030 + }, + { + "epoch": 0.888560885608856, + "grad_norm": 0.4314653277397156, + "learning_rate": 3.7354828181659695e-06, + "loss": 2.8815, + "step": 12040 + }, + { + "epoch": 0.8892988929889298, + "grad_norm": 0.43344810605049133, + "learning_rate": 3.6867852645952494e-06, + "loss": 2.7918, + "step": 12050 + }, + { + "epoch": 0.8900369003690037, + "grad_norm": 0.46255967020988464, + "learning_rate": 3.6383950664074405e-06, + "loss": 2.8106, + "step": 12060 + }, + { + "epoch": 0.8907749077490775, + "grad_norm": 0.44985824823379517, + "learning_rate": 3.5903125447417196e-06, + "loss": 2.8244, + "step": 12070 + }, + { + "epoch": 0.8915129151291513, + "grad_norm": 0.441011518239975, + "learning_rate": 3.5425380186953904e-06, + "loss": 2.8061, + "step": 12080 + }, + { + "epoch": 0.8922509225092251, + "grad_norm": 0.4453372359275818, + "learning_rate": 3.495071805321759e-06, + "loss": 2.9384, + "step": 12090 + }, + { + "epoch": 0.8929889298892989, + "grad_norm": 0.43761390447616577, + "learning_rate": 3.447914219628029e-06, + "loss": 2.7863, + "step": 12100 + }, + { + "epoch": 0.8937269372693727, + "grad_norm": 0.4433492124080658, + "learning_rate": 3.4010655745731865e-06, + "loss": 2.8553, + "step": 12110 + }, + { + "epoch": 0.8944649446494465, + "grad_norm": 0.43299391865730286, + "learning_rate": 3.354526181066003e-06, + "loss": 2.7823, + "step": 12120 + }, + { + "epoch": 0.8952029520295203, + "grad_norm": 0.45678773522377014, + "learning_rate": 3.308296347962875e-06, + "loss": 2.7281, + "step": 12130 + }, + { + "epoch": 0.8959409594095941, + "grad_norm": 0.4413972795009613, + "learning_rate": 3.2623763820658237e-06, + "loss": 2.8478, + "step": 12140 + }, + { + "epoch": 0.8966789667896679, + "grad_norm": 0.44608476758003235, + "learning_rate": 3.2167665881204567e-06, + "loss": 2.7823, + "step": 12150 + }, + { + "epoch": 0.8974169741697416, + "grad_norm": 0.4420614242553711, + "learning_rate": 3.171467268813938e-06, + "loss": 2.8281, + "step": 12160 + }, + { + "epoch": 0.8981549815498155, + "grad_norm": 0.4385377764701843, + "learning_rate": 3.1264787247729908e-06, + "loss": 2.7918, + "step": 12170 + }, + { + "epoch": 0.8988929889298893, + "grad_norm": 0.44008246064186096, + "learning_rate": 3.0818012545618835e-06, + "loss": 2.793, + "step": 12180 + }, + { + "epoch": 0.8996309963099631, + "grad_norm": 0.44634199142456055, + "learning_rate": 3.0374351546804514e-06, + "loss": 2.7829, + "step": 12190 + }, + { + "epoch": 0.9003690036900369, + "grad_norm": 0.4375803768634796, + "learning_rate": 2.9933807195621445e-06, + "loss": 2.8107, + "step": 12200 + }, + { + "epoch": 0.9011070110701107, + "grad_norm": 0.4388578534126282, + "learning_rate": 2.9496382415720723e-06, + "loss": 2.8524, + "step": 12210 + }, + { + "epoch": 0.9018450184501845, + "grad_norm": 0.43253517150878906, + "learning_rate": 2.9062080110050515e-06, + "loss": 2.8215, + "step": 12220 + }, + { + "epoch": 0.9025830258302583, + "grad_norm": 0.4246656894683838, + "learning_rate": 2.8630903160836773e-06, + "loss": 2.835, + 
"step": 12230 + }, + { + "epoch": 0.9033210332103321, + "grad_norm": 0.4635641872882843, + "learning_rate": 2.820285442956422e-06, + "loss": 2.829, + "step": 12240 + }, + { + "epoch": 0.9040590405904059, + "grad_norm": 0.4323824644088745, + "learning_rate": 2.7777936756957333e-06, + "loss": 2.7945, + "step": 12250 + }, + { + "epoch": 0.9047970479704797, + "grad_norm": 0.4489029347896576, + "learning_rate": 2.7356152962961567e-06, + "loss": 2.8904, + "step": 12260 + }, + { + "epoch": 0.9055350553505535, + "grad_norm": 0.4545091390609741, + "learning_rate": 2.6937505846724165e-06, + "loss": 2.8889, + "step": 12270 + }, + { + "epoch": 0.9062730627306274, + "grad_norm": 0.4438563585281372, + "learning_rate": 2.6521998186576357e-06, + "loss": 2.836, + "step": 12280 + }, + { + "epoch": 0.9070110701107011, + "grad_norm": 0.4264052212238312, + "learning_rate": 2.610963274001438e-06, + "loss": 2.7639, + "step": 12290 + }, + { + "epoch": 0.9077490774907749, + "grad_norm": 0.4508605897426605, + "learning_rate": 2.5700412243681417e-06, + "loss": 2.7735, + "step": 12300 + }, + { + "epoch": 0.9084870848708487, + "grad_norm": 0.4573262929916382, + "learning_rate": 2.5294339413349076e-06, + "loss": 2.8901, + "step": 12310 + }, + { + "epoch": 0.9092250922509225, + "grad_norm": 0.4440000057220459, + "learning_rate": 2.4891416943900014e-06, + "loss": 2.8662, + "step": 12320 + }, + { + "epoch": 0.9099630996309963, + "grad_norm": 0.4513186812400818, + "learning_rate": 2.449164750930938e-06, + "loss": 2.8268, + "step": 12330 + }, + { + "epoch": 0.9107011070110701, + "grad_norm": 0.43622398376464844, + "learning_rate": 2.409503376262762e-06, + "loss": 2.8246, + "step": 12340 + }, + { + "epoch": 0.9114391143911439, + "grad_norm": 0.44066351652145386, + "learning_rate": 2.3701578335962206e-06, + "loss": 2.7924, + "step": 12350 + }, + { + "epoch": 0.9121771217712177, + "grad_norm": 0.4405202269554138, + "learning_rate": 2.3311283840460994e-06, + "loss": 2.8639, + "step": 12360 + }, + { + "epoch": 0.9129151291512915, + "grad_norm": 0.4488193094730377, + "learning_rate": 2.292415286629418e-06, + "loss": 2.8531, + "step": 12370 + }, + { + "epoch": 0.9136531365313653, + "grad_norm": 0.4245339632034302, + "learning_rate": 2.254018798263763e-06, + "loss": 2.8349, + "step": 12380 + }, + { + "epoch": 0.9143911439114392, + "grad_norm": 0.43623387813568115, + "learning_rate": 2.2159391737655466e-06, + "loss": 2.8225, + "step": 12390 + }, + { + "epoch": 0.915129151291513, + "grad_norm": 0.4482229948043823, + "learning_rate": 2.1781766658483303e-06, + "loss": 2.7716, + "step": 12400 + }, + { + "epoch": 0.9158671586715867, + "grad_norm": 0.450795441865921, + "learning_rate": 2.1407315251211422e-06, + "loss": 2.7796, + "step": 12410 + }, + { + "epoch": 0.9166051660516605, + "grad_norm": 0.45314326882362366, + "learning_rate": 2.103604000086856e-06, + "loss": 2.8009, + "step": 12420 + }, + { + "epoch": 0.9173431734317343, + "grad_norm": 0.44693273305892944, + "learning_rate": 2.066794337140443e-06, + "loss": 2.8486, + "step": 12430 + }, + { + "epoch": 0.9180811808118081, + "grad_norm": 0.43216079473495483, + "learning_rate": 2.0303027805674445e-06, + "loss": 2.7234, + "step": 12440 + }, + { + "epoch": 0.9188191881918819, + "grad_norm": 0.45111674070358276, + "learning_rate": 1.994129572542286e-06, + "loss": 2.7963, + "step": 12450 + }, + { + "epoch": 0.9195571955719557, + "grad_norm": 0.46144166588783264, + "learning_rate": 1.958274953126693e-06, + "loss": 2.8314, + "step": 12460 + }, + { + "epoch": 0.9202952029520295, + 
"grad_norm": 0.45646706223487854, + "learning_rate": 1.922739160268089e-06, + "loss": 2.8796, + "step": 12470 + }, + { + "epoch": 0.9210332103321033, + "grad_norm": 0.49224853515625, + "learning_rate": 1.8875224297980332e-06, + "loss": 2.7904, + "step": 12480 + }, + { + "epoch": 0.9217712177121771, + "grad_norm": 0.44804316759109497, + "learning_rate": 1.8526249954306241e-06, + "loss": 2.7583, + "step": 12490 + }, + { + "epoch": 0.922509225092251, + "grad_norm": 0.43229466676712036, + "learning_rate": 1.8180470887609769e-06, + "loss": 2.8608, + "step": 12500 + }, + { + "epoch": 0.9232472324723248, + "grad_norm": 0.43958374857902527, + "learning_rate": 1.7837889392636864e-06, + "loss": 2.8282, + "step": 12510 + }, + { + "epoch": 0.9239852398523986, + "grad_norm": 0.4417596459388733, + "learning_rate": 1.7498507742912784e-06, + "loss": 2.8048, + "step": 12520 + }, + { + "epoch": 0.9247232472324723, + "grad_norm": 0.4306926429271698, + "learning_rate": 1.7162328190727217e-06, + "loss": 2.8095, + "step": 12530 + }, + { + "epoch": 0.9254612546125461, + "grad_norm": 0.439455509185791, + "learning_rate": 1.682935296711935e-06, + "loss": 2.7822, + "step": 12540 + }, + { + "epoch": 0.9261992619926199, + "grad_norm": 0.4519449472427368, + "learning_rate": 1.6499584281862935e-06, + "loss": 2.8494, + "step": 12550 + }, + { + "epoch": 0.9269372693726937, + "grad_norm": 0.4483802318572998, + "learning_rate": 1.6173024323451747e-06, + "loss": 2.8629, + "step": 12560 + }, + { + "epoch": 0.9276752767527675, + "grad_norm": 0.4460211396217346, + "learning_rate": 1.5849675259084872e-06, + "loss": 2.8258, + "step": 12570 + }, + { + "epoch": 0.9284132841328413, + "grad_norm": 0.43958115577697754, + "learning_rate": 1.5529539234652668e-06, + "loss": 2.8093, + "step": 12580 + }, + { + "epoch": 0.9291512915129151, + "grad_norm": 0.46250835061073303, + "learning_rate": 1.5212618374722155e-06, + "loss": 2.828, + "step": 12590 + }, + { + "epoch": 0.9298892988929889, + "grad_norm": 0.46097636222839355, + "learning_rate": 1.4898914782523143e-06, + "loss": 2.8305, + "step": 12600 + }, + { + "epoch": 0.9306273062730628, + "grad_norm": 0.4385923445224762, + "learning_rate": 1.458843053993403e-06, + "loss": 2.7875, + "step": 12610 + }, + { + "epoch": 0.9313653136531366, + "grad_norm": 0.44254031777381897, + "learning_rate": 1.4281167707468457e-06, + "loss": 2.8113, + "step": 12620 + }, + { + "epoch": 0.9321033210332104, + "grad_norm": 0.4598987102508545, + "learning_rate": 1.3977128324261068e-06, + "loss": 2.8511, + "step": 12630 + }, + { + "epoch": 0.9328413284132842, + "grad_norm": 0.4526178240776062, + "learning_rate": 1.3676314408054391e-06, + "loss": 2.7979, + "step": 12640 + }, + { + "epoch": 0.933579335793358, + "grad_norm": 0.45094090700149536, + "learning_rate": 1.3378727955185244e-06, + "loss": 2.8319, + "step": 12650 + }, + { + "epoch": 0.9343173431734317, + "grad_norm": 0.45027512311935425, + "learning_rate": 1.3084370940571577e-06, + "loss": 2.8245, + "step": 12660 + }, + { + "epoch": 0.9350553505535055, + "grad_norm": 0.4329124391078949, + "learning_rate": 1.2793245317699321e-06, + "loss": 2.7542, + "step": 12670 + }, + { + "epoch": 0.9357933579335793, + "grad_norm": 0.4586227536201477, + "learning_rate": 1.2505353018609444e-06, + "loss": 2.7729, + "step": 12680 + }, + { + "epoch": 0.9365313653136531, + "grad_norm": 0.4397171437740326, + "learning_rate": 1.2220695953885031e-06, + "loss": 2.8164, + "step": 12690 + }, + { + "epoch": 0.9372693726937269, + "grad_norm": 0.4415930211544037, + "learning_rate": 
1.1939276012638723e-06, + "loss": 2.8644, + "step": 12700 + }, + { + "epoch": 0.9380073800738007, + "grad_norm": 0.43980923295021057, + "learning_rate": 1.1661095062500237e-06, + "loss": 2.8716, + "step": 12710 + }, + { + "epoch": 0.9387453874538746, + "grad_norm": 0.46194180846214294, + "learning_rate": 1.1386154949603934e-06, + "loss": 2.8307, + "step": 12720 + }, + { + "epoch": 0.9394833948339484, + "grad_norm": 0.4496355652809143, + "learning_rate": 1.1114457498576258e-06, + "loss": 2.7868, + "step": 12730 + }, + { + "epoch": 0.9402214022140222, + "grad_norm": 0.4483359456062317, + "learning_rate": 1.0846004512524211e-06, + "loss": 2.8357, + "step": 12740 + }, + { + "epoch": 0.940959409594096, + "grad_norm": 0.44404512643814087, + "learning_rate": 1.0580797773022733e-06, + "loss": 2.8843, + "step": 12750 + }, + { + "epoch": 0.9416974169741698, + "grad_norm": 0.4440787136554718, + "learning_rate": 1.03188390401035e-06, + "loss": 2.8038, + "step": 12760 + }, + { + "epoch": 0.9424354243542435, + "grad_norm": 0.4445192813873291, + "learning_rate": 1.006013005224271e-06, + "loss": 2.813, + "step": 12770 + }, + { + "epoch": 0.9431734317343173, + "grad_norm": 0.4234587550163269, + "learning_rate": 9.80467252634998e-07, + "loss": 2.8414, + "step": 12780 + }, + { + "epoch": 0.9439114391143911, + "grad_norm": 0.4393916726112366, + "learning_rate": 9.552468157756622e-07, + "loss": 2.7851, + "step": 12790 + }, + { + "epoch": 0.9446494464944649, + "grad_norm": 0.4591200053691864, + "learning_rate": 9.303518620204677e-07, + "loss": 2.8378, + "step": 12800 + }, + { + "epoch": 0.9453874538745387, + "grad_norm": 0.43322470784187317, + "learning_rate": 9.057825565835399e-07, + "loss": 2.7366, + "step": 12810 + }, + { + "epoch": 0.9461254612546125, + "grad_norm": 0.4324533939361572, + "learning_rate": 8.815390625178887e-07, + "loss": 2.7483, + "step": 12820 + }, + { + "epoch": 0.9468634686346864, + "grad_norm": 0.4632011950016022, + "learning_rate": 8.576215407142651e-07, + "loss": 2.7874, + "step": 12830 + }, + { + "epoch": 0.9476014760147602, + "grad_norm": 0.4332893490791321, + "learning_rate": 8.340301499001446e-07, + "loss": 2.8252, + "step": 12840 + }, + { + "epoch": 0.948339483394834, + "grad_norm": 0.436294287443161, + "learning_rate": 8.107650466386285e-07, + "loss": 2.8445, + "step": 12850 + }, + { + "epoch": 0.9490774907749078, + "grad_norm": 0.43967026472091675, + "learning_rate": 7.878263853274281e-07, + "loss": 2.8411, + "step": 12860 + }, + { + "epoch": 0.9498154981549816, + "grad_norm": 0.45120909810066223, + "learning_rate": 7.652143181978655e-07, + "loss": 2.8118, + "step": 12870 + }, + { + "epoch": 0.9505535055350554, + "grad_norm": 0.4368390738964081, + "learning_rate": 7.429289953138019e-07, + "loss": 2.8086, + "step": 12880 + }, + { + "epoch": 0.9512915129151291, + "grad_norm": 0.4452465772628784, + "learning_rate": 7.209705645706944e-07, + "loss": 2.8468, + "step": 12890 + }, + { + "epoch": 0.9520295202952029, + "grad_norm": 0.4445231258869171, + "learning_rate": 6.993391716946019e-07, + "loss": 2.8114, + "step": 12900 + }, + { + "epoch": 0.9527675276752767, + "grad_norm": 0.43402281403541565, + "learning_rate": 6.780349602411918e-07, + "loss": 2.8352, + "step": 12910 + }, + { + "epoch": 0.9535055350553505, + "grad_norm": 0.45803192257881165, + "learning_rate": 6.570580715948404e-07, + "loss": 2.8013, + "step": 12920 + }, + { + "epoch": 0.9542435424354243, + "grad_norm": 0.45193520188331604, + "learning_rate": 6.364086449676232e-07, + "loss": 2.8368, + "step": 12930 + }, + { + 
"epoch": 0.9549815498154982, + "grad_norm": 0.44040247797966003, + "learning_rate": 6.160868173984591e-07, + "loss": 2.8559, + "step": 12940 + }, + { + "epoch": 0.955719557195572, + "grad_norm": 0.4719098210334778, + "learning_rate": 5.960927237521563e-07, + "loss": 2.85, + "step": 12950 + }, + { + "epoch": 0.9564575645756458, + "grad_norm": 0.4502539336681366, + "learning_rate": 5.764264967185462e-07, + "loss": 2.9074, + "step": 12960 + }, + { + "epoch": 0.9571955719557196, + "grad_norm": 0.4299696683883667, + "learning_rate": 5.570882668115784e-07, + "loss": 2.7595, + "step": 12970 + }, + { + "epoch": 0.9579335793357934, + "grad_norm": 0.44181373715400696, + "learning_rate": 5.380781623684661e-07, + "loss": 2.8024, + "step": 12980 + }, + { + "epoch": 0.9586715867158672, + "grad_norm": 0.437763512134552, + "learning_rate": 5.193963095488419e-07, + "loss": 2.8231, + "step": 12990 + }, + { + "epoch": 0.959409594095941, + "grad_norm": 0.4234910011291504, + "learning_rate": 5.010428323339033e-07, + "loss": 2.8898, + "step": 13000 + }, + { + "epoch": 0.9601476014760147, + "grad_norm": 0.45260801911354065, + "learning_rate": 4.830178525256079e-07, + "loss": 2.8558, + "step": 13010 + }, + { + "epoch": 0.9608856088560885, + "grad_norm": 0.4440422058105469, + "learning_rate": 4.653214897458513e-07, + "loss": 2.8007, + "step": 13020 + }, + { + "epoch": 0.9616236162361623, + "grad_norm": 0.4362104833126068, + "learning_rate": 4.4795386143567374e-07, + "loss": 2.8271, + "step": 13030 + }, + { + "epoch": 0.9623616236162361, + "grad_norm": 0.44079768657684326, + "learning_rate": 4.309150828544939e-07, + "loss": 2.8371, + "step": 13040 + }, + { + "epoch": 0.9630996309963099, + "grad_norm": 0.46145325899124146, + "learning_rate": 4.1420526707933727e-07, + "loss": 2.8808, + "step": 13050 + }, + { + "epoch": 0.9638376383763838, + "grad_norm": 0.4297032058238983, + "learning_rate": 3.978245250040702e-07, + "loss": 2.8506, + "step": 13060 + }, + { + "epoch": 0.9645756457564576, + "grad_norm": 0.4474189579486847, + "learning_rate": 3.817729653386892e-07, + "loss": 2.8261, + "step": 13070 + }, + { + "epoch": 0.9653136531365314, + "grad_norm": 0.43458986282348633, + "learning_rate": 3.660506946085829e-07, + "loss": 2.8319, + "step": 13080 + }, + { + "epoch": 0.9660516605166052, + "grad_norm": 0.4418502151966095, + "learning_rate": 3.506578171538377e-07, + "loss": 2.8326, + "step": 13090 + }, + { + "epoch": 0.966789667896679, + "grad_norm": 0.4373183846473694, + "learning_rate": 3.355944351285278e-07, + "loss": 2.7896, + "step": 13100 + }, + { + "epoch": 0.9675276752767528, + "grad_norm": 0.4467260241508484, + "learning_rate": 3.2086064850004314e-07, + "loss": 2.8499, + "step": 13110 + }, + { + "epoch": 0.9682656826568266, + "grad_norm": 0.45079532265663147, + "learning_rate": 3.064565550484455e-07, + "loss": 2.8005, + "step": 13120 + }, + { + "epoch": 0.9690036900369003, + "grad_norm": 0.4311223328113556, + "learning_rate": 2.9238225036579693e-07, + "loss": 2.8419, + "step": 13130 + }, + { + "epoch": 0.9697416974169741, + "grad_norm": 0.4524695575237274, + "learning_rate": 2.7863782785552685e-07, + "loss": 2.8581, + "step": 13140 + }, + { + "epoch": 0.9704797047970479, + "grad_norm": 0.4483130872249603, + "learning_rate": 2.65223378731827e-07, + "loss": 2.8275, + "step": 13150 + }, + { + "epoch": 0.9712177121771217, + "grad_norm": 0.4370816946029663, + "learning_rate": 2.521389920190298e-07, + "loss": 2.8673, + "step": 13160 + }, + { + "epoch": 0.9719557195571956, + "grad_norm": 0.444195032119751, + 
"learning_rate": 2.3938475455103083e-07, + "loss": 2.9407, + "step": 13170 + }, + { + "epoch": 0.9726937269372694, + "grad_norm": 0.44004592299461365, + "learning_rate": 2.269607509707006e-07, + "loss": 2.8481, + "step": 13180 + }, + { + "epoch": 0.9734317343173432, + "grad_norm": 0.44630327820777893, + "learning_rate": 2.1486706372932375e-07, + "loss": 2.7954, + "step": 13190 + }, + { + "epoch": 0.974169741697417, + "grad_norm": 0.42796429991722107, + "learning_rate": 2.031037730860774e-07, + "loss": 2.8533, + "step": 13200 + }, + { + "epoch": 0.9749077490774908, + "grad_norm": 0.4611528217792511, + "learning_rate": 1.916709571074482e-07, + "loss": 2.8151, + "step": 13210 + }, + { + "epoch": 0.9756457564575646, + "grad_norm": 0.451028972864151, + "learning_rate": 1.8056869166677703e-07, + "loss": 2.8355, + "step": 13220 + }, + { + "epoch": 0.9763837638376384, + "grad_norm": 0.4451844096183777, + "learning_rate": 1.6979705044369297e-07, + "loss": 2.8121, + "step": 13230 + }, + { + "epoch": 0.9771217712177122, + "grad_norm": 0.4613220989704132, + "learning_rate": 1.5935610492366915e-07, + "loss": 2.9067, + "step": 13240 + }, + { + "epoch": 0.977859778597786, + "grad_norm": 0.44495347142219543, + "learning_rate": 1.4924592439753416e-07, + "loss": 2.7666, + "step": 13250 + }, + { + "epoch": 0.9785977859778597, + "grad_norm": 0.4585348963737488, + "learning_rate": 1.394665759610003e-07, + "loss": 2.7254, + "step": 13260 + }, + { + "epoch": 0.9793357933579335, + "grad_norm": 0.43729352951049805, + "learning_rate": 1.3001812451423068e-07, + "loss": 2.778, + "step": 13270 + }, + { + "epoch": 0.9800738007380074, + "grad_norm": 0.450089693069458, + "learning_rate": 1.209006327614226e-07, + "loss": 2.809, + "step": 13280 + }, + { + "epoch": 0.9808118081180812, + "grad_norm": 0.43959712982177734, + "learning_rate": 1.1211416121035823e-07, + "loss": 2.8325, + "step": 13290 + }, + { + "epoch": 0.981549815498155, + "grad_norm": 0.4504597783088684, + "learning_rate": 1.036587681720269e-07, + "loss": 2.7841, + "step": 13300 + }, + { + "epoch": 0.9822878228782288, + "grad_norm": 0.44741228222846985, + "learning_rate": 9.55345097602256e-08, + "loss": 2.8358, + "step": 13310 + }, + { + "epoch": 0.9830258302583026, + "grad_norm": 0.4463639557361603, + "learning_rate": 8.774143989119798e-08, + "loss": 2.8313, + "step": 13320 + }, + { + "epoch": 0.9837638376383764, + "grad_norm": 0.4775594472885132, + "learning_rate": 8.027961028328479e-08, + "loss": 2.8781, + "step": 13330 + }, + { + "epoch": 0.9845018450184502, + "grad_norm": 0.4243060350418091, + "learning_rate": 7.314907045653519e-08, + "loss": 2.7926, + "step": 13340 + }, + { + "epoch": 0.985239852398524, + "grad_norm": 0.43475958704948425, + "learning_rate": 6.634986773244034e-08, + "loss": 2.7885, + "step": 13350 + }, + { + "epoch": 0.9859778597785978, + "grad_norm": 0.4415262043476105, + "learning_rate": 5.988204723356705e-08, + "loss": 2.7721, + "step": 13360 + }, + { + "epoch": 0.9867158671586715, + "grad_norm": 0.438672810792923, + "learning_rate": 5.374565188329683e-08, + "loss": 2.8138, + "step": 13370 + }, + { + "epoch": 0.9874538745387453, + "grad_norm": 0.46068814396858215, + "learning_rate": 4.794072240550951e-08, + "loss": 2.7988, + "step": 13380 + }, + { + "epoch": 0.9881918819188192, + "grad_norm": 0.44185954332351685, + "learning_rate": 4.246729732434451e-08, + "loss": 2.7823, + "step": 13390 + }, + { + "epoch": 0.988929889298893, + "grad_norm": 0.4282056391239166, + "learning_rate": 3.7325412963912235e-08, + "loss": 2.872, + "step": 13400 
+ }, + { + "epoch": 0.9896678966789668, + "grad_norm": 0.46537652611732483, + "learning_rate": 3.251510344807751e-08, + "loss": 2.9374, + "step": 13410 + }, + { + "epoch": 0.9904059040590406, + "grad_norm": 0.4430101215839386, + "learning_rate": 2.8036400700232058e-08, + "loss": 2.7839, + "step": 13420 + }, + { + "epoch": 0.9911439114391144, + "grad_norm": 0.45416316390037537, + "learning_rate": 2.3889334443055744e-08, + "loss": 2.8689, + "step": 13430 + }, + { + "epoch": 0.9918819188191882, + "grad_norm": 0.4388124346733093, + "learning_rate": 2.007393219836118e-08, + "loss": 2.9239, + "step": 13440 + }, + { + "epoch": 0.992619926199262, + "grad_norm": 0.43018996715545654, + "learning_rate": 1.6590219286871655e-08, + "loss": 2.8412, + "step": 13450 + }, + { + "epoch": 0.9933579335793358, + "grad_norm": 0.42218539118766785, + "learning_rate": 1.3438218828076832e-08, + "loss": 2.7462, + "step": 13460 + }, + { + "epoch": 0.9940959409594096, + "grad_norm": 0.4494752883911133, + "learning_rate": 1.0617951740077292e-08, + "loss": 2.8598, + "step": 13470 + }, + { + "epoch": 0.9948339483394834, + "grad_norm": 0.41235294938087463, + "learning_rate": 8.12943673943467e-09, + "loss": 2.8083, + "step": 13480 + }, + { + "epoch": 0.9955719557195571, + "grad_norm": 0.4434475004673004, + "learning_rate": 5.9726903410661786e-09, + "loss": 2.929, + "step": 13490 + }, + { + "epoch": 0.996309963099631, + "grad_norm": 0.43739476799964905, + "learning_rate": 4.147726858100276e-09, + "loss": 2.844, + "step": 13500 + }, + { + "epoch": 0.9970479704797048, + "grad_norm": 0.46633192896842957, + "learning_rate": 2.6545584018211613e-09, + "loss": 2.8096, + "step": 13510 + }, + { + "epoch": 0.9977859778597786, + "grad_norm": 0.4500004053115845, + "learning_rate": 1.4931948815744e-09, + "loss": 2.8317, + "step": 13520 + }, + { + "epoch": 0.9985239852398524, + "grad_norm": 0.45538780093193054, + "learning_rate": 6.636440046892123e-10, + "loss": 2.8792, + "step": 13530 + }, + { + "epoch": 0.9992619926199262, + "grad_norm": 0.4632636308670044, + "learning_rate": 1.6591127643961202e-10, + "loss": 2.8205, + "step": 13540 + }, + { + "epoch": 1.0, + "grad_norm": 0.4356023073196411, + "learning_rate": 0.0, + "loss": 2.8161, + "step": 13550 + }, + { + "epoch": 1.0, + "step": 13550, + "total_flos": 5.404563590201999e+18, + "train_loss": 3.236852196415412, + "train_runtime": 292848.6684, + "train_samples_per_second": 0.74, + "train_steps_per_second": 0.046 + } + ], + "logging_steps": 10, + "max_steps": 13550, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 4000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.404563590201999e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}
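
A minimal sketch of how a log like the `log_history` above can be read back for analysis. This is not part of the checkpoint itself; the file name `trainer_state.json`, the output path `loss_curve.png`, and the availability of matplotlib are assumptions for illustration only.

```python
import json

# Load the trainer state written by the Hugging Face Trainer checkpoint.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step logging entries; the final entry carries the
# aggregate train_* summary and has no "loss"/"learning_rate" pair.
logs = [e for e in state["log_history"] if "loss" in e and "learning_rate" in e]

steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]
print(f"{len(logs)} logged steps, final loss {losses[-1]:.4f}, min loss {min(losses):.4f}")

# Optional: plot the loss curve if matplotlib is installed.
try:
    import matplotlib.pyplot as plt
    plt.plot(steps, losses)
    plt.xlabel("step")
    plt.ylabel("training loss")
    plt.savefig("loss_curve.png")
except ImportError:
    pass
```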