{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 9600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 9.6875, "learning_rate": 2.0833333333333333e-07, "loss": 1.5494, "step": 10 }, { "epoch": 0.01, "grad_norm": 2.375, "learning_rate": 4.1666666666666667e-07, "loss": 1.6442, "step": 20 }, { "epoch": 0.01, "grad_norm": 1.78125, "learning_rate": 6.25e-07, "loss": 1.5817, "step": 30 }, { "epoch": 0.01, "grad_norm": 1.2734375, "learning_rate": 8.333333333333333e-07, "loss": 1.6262, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.9921875, "learning_rate": 1.0416666666666667e-06, "loss": 1.6365, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.77734375, "learning_rate": 1.25e-06, "loss": 1.5901, "step": 60 }, { "epoch": 0.02, "grad_norm": 0.62109375, "learning_rate": 1.4583333333333335e-06, "loss": 1.5757, "step": 70 }, { "epoch": 0.03, "grad_norm": 0.5234375, "learning_rate": 1.6666666666666667e-06, "loss": 1.5956, "step": 80 }, { "epoch": 0.03, "grad_norm": 6.75, "learning_rate": 1.8750000000000003e-06, "loss": 1.561, "step": 90 }, { "epoch": 0.03, "grad_norm": 4.59375, "learning_rate": 2.0833333333333334e-06, "loss": 1.5564, "step": 100 }, { "epoch": 0.03, "grad_norm": 0.546875, "learning_rate": 2.2916666666666666e-06, "loss": 1.6172, "step": 110 }, { "epoch": 0.04, "grad_norm": 1.1640625, "learning_rate": 2.5e-06, "loss": 1.6872, "step": 120 }, { "epoch": 0.04, "grad_norm": 1.4765625, "learning_rate": 2.7083333333333334e-06, "loss": 1.577, "step": 130 }, { "epoch": 0.04, "grad_norm": 2.15625, "learning_rate": 2.916666666666667e-06, "loss": 1.56, "step": 140 }, { "epoch": 0.05, "grad_norm": 0.462890625, "learning_rate": 3.125e-06, "loss": 1.552, "step": 150 }, { "epoch": 0.05, "grad_norm": 0.416015625, "learning_rate": 3.3333333333333333e-06, "loss": 1.5752, "step": 160 }, { "epoch": 0.05, "grad_norm": 0.41796875, "learning_rate": 3.5416666666666673e-06, "loss": 1.6739, "step": 170 }, { "epoch": 0.06, "grad_norm": 13.25, "learning_rate": 3.7500000000000005e-06, "loss": 1.5538, "step": 180 }, { "epoch": 0.06, "grad_norm": 1.8984375, "learning_rate": 3.958333333333333e-06, "loss": 1.6864, "step": 190 }, { "epoch": 0.06, "grad_norm": 11.1875, "learning_rate": 4.166666666666667e-06, "loss": 1.5576, "step": 200 }, { "epoch": 0.07, "grad_norm": 1.546875, "learning_rate": 4.3750000000000005e-06, "loss": 1.5328, "step": 210 }, { "epoch": 0.07, "grad_norm": 0.357421875, "learning_rate": 4.583333333333333e-06, "loss": 1.4953, "step": 220 }, { "epoch": 0.07, "grad_norm": 2.578125, "learning_rate": 4.791666666666668e-06, "loss": 1.5663, "step": 230 }, { "epoch": 0.07, "grad_norm": 1.2265625, "learning_rate": 5e-06, "loss": 1.5063, "step": 240 }, { "epoch": 0.08, "grad_norm": 0.439453125, "learning_rate": 5.208333333333334e-06, "loss": 1.4404, "step": 250 }, { "epoch": 0.08, "grad_norm": 0.443359375, "learning_rate": 5.416666666666667e-06, "loss": 1.5304, "step": 260 }, { "epoch": 0.08, "grad_norm": 0.44921875, "learning_rate": 5.625e-06, "loss": 1.5536, "step": 270 }, { "epoch": 0.09, "grad_norm": 1.4140625, "learning_rate": 5.833333333333334e-06, "loss": 1.5385, "step": 280 }, { "epoch": 0.09, "grad_norm": 0.4609375, "learning_rate": 6.041666666666667e-06, "loss": 1.513, "step": 290 }, { "epoch": 0.09, "grad_norm": 0.50390625, "learning_rate": 6.25e-06, "loss": 1.4549, "step": 300 }, { "epoch": 0.1, "grad_norm": 0.6328125, "learning_rate": 6.458333333333334e-06, "loss": 1.4931, "step": 310 }, { "epoch": 0.1, "grad_norm": 0.48046875, "learning_rate": 6.666666666666667e-06, "loss": 1.4201, "step": 320 }, { "epoch": 0.1, "grad_norm": 0.392578125, "learning_rate": 6.875e-06, "loss": 1.5471, "step": 330 }, { "epoch": 0.11, "grad_norm": 0.95703125, "learning_rate": 7.083333333333335e-06, "loss": 1.4287, "step": 340 }, { "epoch": 0.11, "grad_norm": 0.388671875, "learning_rate": 7.291666666666667e-06, "loss": 1.4475, "step": 350 }, { "epoch": 0.11, "grad_norm": 0.33984375, "learning_rate": 7.500000000000001e-06, "loss": 1.477, "step": 360 }, { "epoch": 0.12, "grad_norm": 1.296875, "learning_rate": 7.708333333333334e-06, "loss": 1.4533, "step": 370 }, { "epoch": 0.12, "grad_norm": 0.400390625, "learning_rate": 7.916666666666667e-06, "loss": 1.3663, "step": 380 }, { "epoch": 0.12, "grad_norm": 0.375, "learning_rate": 8.125000000000001e-06, "loss": 1.3953, "step": 390 }, { "epoch": 0.12, "grad_norm": 1.4921875, "learning_rate": 8.333333333333334e-06, "loss": 1.471, "step": 400 }, { "epoch": 0.13, "grad_norm": 0.392578125, "learning_rate": 8.541666666666666e-06, "loss": 1.4374, "step": 410 }, { "epoch": 0.13, "grad_norm": 2.046875, "learning_rate": 8.750000000000001e-06, "loss": 1.4224, "step": 420 }, { "epoch": 0.13, "grad_norm": 0.5546875, "learning_rate": 8.958333333333334e-06, "loss": 1.3261, "step": 430 }, { "epoch": 0.14, "grad_norm": 0.2578125, "learning_rate": 9.166666666666666e-06, "loss": 1.4033, "step": 440 }, { "epoch": 0.14, "grad_norm": 1.546875, "learning_rate": 9.375000000000001e-06, "loss": 1.4237, "step": 450 }, { "epoch": 0.14, "grad_norm": 0.435546875, "learning_rate": 9.583333333333335e-06, "loss": 1.4083, "step": 460 }, { "epoch": 0.15, "grad_norm": 0.2255859375, "learning_rate": 9.791666666666666e-06, "loss": 1.3245, "step": 470 }, { "epoch": 0.15, "grad_norm": 0.333984375, "learning_rate": 1e-05, "loss": 1.3771, "step": 480 }, { "epoch": 0.15, "grad_norm": 0.5546875, "learning_rate": 1.0208333333333334e-05, "loss": 1.3789, "step": 490 }, { "epoch": 0.16, "grad_norm": 0.7578125, "learning_rate": 1.0416666666666668e-05, "loss": 1.4275, "step": 500 }, { "epoch": 0.16, "grad_norm": 0.3359375, "learning_rate": 1.0625e-05, "loss": 1.3739, "step": 510 }, { "epoch": 0.16, "grad_norm": 1.0625, "learning_rate": 1.0833333333333334e-05, "loss": 1.3758, "step": 520 }, { "epoch": 0.17, "grad_norm": 2.140625, "learning_rate": 1.1041666666666668e-05, "loss": 1.412, "step": 530 }, { "epoch": 0.17, "grad_norm": 0.279296875, "learning_rate": 1.125e-05, "loss": 1.4025, "step": 540 }, { "epoch": 0.17, "grad_norm": 2.3125, "learning_rate": 1.1458333333333333e-05, "loss": 1.4138, "step": 550 }, { "epoch": 0.17, "grad_norm": 0.259765625, "learning_rate": 1.1666666666666668e-05, "loss": 1.2598, "step": 560 }, { "epoch": 0.18, "grad_norm": 2.625, "learning_rate": 1.1875e-05, "loss": 1.3543, "step": 570 }, { "epoch": 0.18, "grad_norm": 1.2734375, "learning_rate": 1.2083333333333333e-05, "loss": 1.273, "step": 580 }, { "epoch": 0.18, "grad_norm": 2.328125, "learning_rate": 1.2291666666666668e-05, "loss": 1.2361, "step": 590 }, { "epoch": 0.19, "grad_norm": 0.384765625, "learning_rate": 1.25e-05, "loss": 1.2996, "step": 600 }, { "epoch": 0.19, "grad_norm": 1.2734375, "learning_rate": 1.2708333333333333e-05, "loss": 1.235, "step": 610 }, { "epoch": 0.19, "grad_norm": 1.2109375, "learning_rate": 1.2916666666666668e-05, "loss": 1.3838, "step": 620 }, { "epoch": 0.2, "grad_norm": 0.9765625, "learning_rate": 1.3125e-05, "loss": 1.2292, "step": 630 }, { "epoch": 0.2, "grad_norm": 0.8125, "learning_rate": 1.3333333333333333e-05, "loss": 1.2698, "step": 640 }, { "epoch": 0.2, "grad_norm": 0.28515625, "learning_rate": 1.3541666666666668e-05, "loss": 1.3358, "step": 650 }, { "epoch": 0.21, "grad_norm": 0.216796875, "learning_rate": 1.375e-05, "loss": 1.237, "step": 660 }, { "epoch": 0.21, "grad_norm": 0.73046875, "learning_rate": 1.3958333333333333e-05, "loss": 1.2976, "step": 670 }, { "epoch": 0.21, "grad_norm": 0.466796875, "learning_rate": 1.416666666666667e-05, "loss": 1.2324, "step": 680 }, { "epoch": 0.22, "grad_norm": 0.302734375, "learning_rate": 1.4375e-05, "loss": 1.2436, "step": 690 }, { "epoch": 0.22, "grad_norm": 1.125, "learning_rate": 1.4583333333333333e-05, "loss": 1.2607, "step": 700 }, { "epoch": 0.22, "grad_norm": 0.25, "learning_rate": 1.479166666666667e-05, "loss": 1.2724, "step": 710 }, { "epoch": 0.23, "grad_norm": 0.439453125, "learning_rate": 1.5000000000000002e-05, "loss": 1.2314, "step": 720 }, { "epoch": 0.23, "grad_norm": 1.375, "learning_rate": 1.5208333333333333e-05, "loss": 1.2632, "step": 730 }, { "epoch": 0.23, "grad_norm": 0.7890625, "learning_rate": 1.5416666666666668e-05, "loss": 1.3291, "step": 740 }, { "epoch": 0.23, "grad_norm": 0.89453125, "learning_rate": 1.5625e-05, "loss": 1.2816, "step": 750 }, { "epoch": 0.24, "grad_norm": 0.2197265625, "learning_rate": 1.5833333333333333e-05, "loss": 1.2163, "step": 760 }, { "epoch": 0.24, "grad_norm": 0.216796875, "learning_rate": 1.604166666666667e-05, "loss": 1.2693, "step": 770 }, { "epoch": 0.24, "grad_norm": 0.41015625, "learning_rate": 1.6250000000000002e-05, "loss": 1.2114, "step": 780 }, { "epoch": 0.25, "grad_norm": 0.77734375, "learning_rate": 1.6458333333333335e-05, "loss": 1.1689, "step": 790 }, { "epoch": 0.25, "grad_norm": 0.271484375, "learning_rate": 1.6666666666666667e-05, "loss": 1.2469, "step": 800 }, { "epoch": 0.25, "grad_norm": 0.88671875, "learning_rate": 1.6875e-05, "loss": 1.2159, "step": 810 }, { "epoch": 0.26, "grad_norm": 0.359375, "learning_rate": 1.7083333333333333e-05, "loss": 1.2198, "step": 820 }, { "epoch": 0.26, "grad_norm": 0.2099609375, "learning_rate": 1.729166666666667e-05, "loss": 1.2687, "step": 830 }, { "epoch": 0.26, "grad_norm": 1.109375, "learning_rate": 1.7500000000000002e-05, "loss": 1.2719, "step": 840 }, { "epoch": 0.27, "grad_norm": 0.2216796875, "learning_rate": 1.7708333333333335e-05, "loss": 1.2074, "step": 850 }, { "epoch": 0.27, "grad_norm": 0.2578125, "learning_rate": 1.7916666666666667e-05, "loss": 1.1637, "step": 860 }, { "epoch": 0.27, "grad_norm": 0.3515625, "learning_rate": 1.8125e-05, "loss": 1.1775, "step": 870 }, { "epoch": 0.28, "grad_norm": 6.09375, "learning_rate": 1.8333333333333333e-05, "loss": 1.1949, "step": 880 }, { "epoch": 0.28, "grad_norm": 0.294921875, "learning_rate": 1.854166666666667e-05, "loss": 1.1488, "step": 890 }, { "epoch": 0.28, "grad_norm": 0.328125, "learning_rate": 1.8750000000000002e-05, "loss": 1.2244, "step": 900 }, { "epoch": 0.28, "grad_norm": 0.361328125, "learning_rate": 1.8958333333333334e-05, "loss": 1.1557, "step": 910 }, { "epoch": 0.29, "grad_norm": 0.1943359375, "learning_rate": 1.916666666666667e-05, "loss": 1.1331, "step": 920 }, { "epoch": 0.29, "grad_norm": 0.1630859375, "learning_rate": 1.9375e-05, "loss": 1.235, "step": 930 }, { "epoch": 0.29, "grad_norm": 0.28515625, "learning_rate": 1.9583333333333333e-05, "loss": 1.1916, "step": 940 }, { "epoch": 0.3, "grad_norm": 0.244140625, "learning_rate": 1.979166666666667e-05, "loss": 1.1931, "step": 950 }, { "epoch": 0.3, "grad_norm": 0.455078125, "learning_rate": 2e-05, "loss": 1.1947, "step": 960 }, { "epoch": 0.3, "grad_norm": 0.27734375, "learning_rate": 1.9999933893861945e-05, "loss": 1.123, "step": 970 }, { "epoch": 0.31, "grad_norm": 0.609375, "learning_rate": 1.9999735576321776e-05, "loss": 1.2481, "step": 980 }, { "epoch": 0.31, "grad_norm": 1.203125, "learning_rate": 1.99994050500015e-05, "loss": 1.2567, "step": 990 }, { "epoch": 0.31, "grad_norm": 0.4453125, "learning_rate": 1.9998942319271076e-05, "loss": 1.1169, "step": 1000 }, { "epoch": 0.32, "grad_norm": 0.1767578125, "learning_rate": 1.9998347390248377e-05, "loss": 1.1375, "step": 1010 }, { "epoch": 0.32, "grad_norm": 0.357421875, "learning_rate": 1.999762027079909e-05, "loss": 1.2387, "step": 1020 }, { "epoch": 0.32, "grad_norm": 0.2109375, "learning_rate": 1.9996760970536635e-05, "loss": 1.239, "step": 1030 }, { "epoch": 0.33, "grad_norm": 0.193359375, "learning_rate": 1.9995769500822007e-05, "loss": 1.1548, "step": 1040 }, { "epoch": 0.33, "grad_norm": 0.2060546875, "learning_rate": 1.9994645874763657e-05, "loss": 1.1675, "step": 1050 }, { "epoch": 0.33, "grad_norm": 0.15234375, "learning_rate": 1.9993390107217302e-05, "loss": 1.1675, "step": 1060 }, { "epoch": 0.33, "grad_norm": 0.71875, "learning_rate": 1.999200221478573e-05, "loss": 1.1909, "step": 1070 }, { "epoch": 0.34, "grad_norm": 0.32421875, "learning_rate": 1.999048221581858e-05, "loss": 1.1468, "step": 1080 }, { "epoch": 0.34, "grad_norm": 0.271484375, "learning_rate": 1.9988830130412106e-05, "loss": 1.2334, "step": 1090 }, { "epoch": 0.34, "grad_norm": 0.62109375, "learning_rate": 1.9987045980408907e-05, "loss": 1.2107, "step": 1100 }, { "epoch": 0.35, "grad_norm": 0.24609375, "learning_rate": 1.9985129789397633e-05, "loss": 1.1329, "step": 1110 }, { "epoch": 0.35, "grad_norm": 1.3125, "learning_rate": 1.9983081582712684e-05, "loss": 1.2072, "step": 1120 }, { "epoch": 0.35, "grad_norm": 0.25390625, "learning_rate": 1.9980901387433866e-05, "loss": 1.1971, "step": 1130 }, { "epoch": 0.36, "grad_norm": 0.66796875, "learning_rate": 1.9978589232386036e-05, "loss": 1.2553, "step": 1140 }, { "epoch": 0.36, "grad_norm": 0.423828125, "learning_rate": 1.9976145148138727e-05, "loss": 1.1773, "step": 1150 }, { "epoch": 0.36, "grad_norm": 0.15234375, "learning_rate": 1.9973569167005725e-05, "loss": 1.2301, "step": 1160 }, { "epoch": 0.37, "grad_norm": 0.271484375, "learning_rate": 1.9970861323044667e-05, "loss": 1.1788, "step": 1170 }, { "epoch": 0.37, "grad_norm": 0.1875, "learning_rate": 1.9968021652056578e-05, "loss": 1.2372, "step": 1180 }, { "epoch": 0.37, "grad_norm": 0.2275390625, "learning_rate": 1.9965050191585388e-05, "loss": 1.1555, "step": 1190 }, { "epoch": 0.38, "grad_norm": 0.166015625, "learning_rate": 1.9961946980917457e-05, "loss": 1.1395, "step": 1200 }, { "epoch": 0.38, "grad_norm": 0.28515625, "learning_rate": 1.995871206108104e-05, "loss": 1.2342, "step": 1210 }, { "epoch": 0.38, "grad_norm": 0.31640625, "learning_rate": 1.995534547484574e-05, "loss": 1.1636, "step": 1220 }, { "epoch": 0.38, "grad_norm": 0.26171875, "learning_rate": 1.995184726672197e-05, "loss": 1.0984, "step": 1230 }, { "epoch": 0.39, "grad_norm": 0.65234375, "learning_rate": 1.9948217482960334e-05, "loss": 1.186, "step": 1240 }, { "epoch": 0.39, "grad_norm": 0.6328125, "learning_rate": 1.9944456171551024e-05, "loss": 1.2151, "step": 1250 }, { "epoch": 0.39, "grad_norm": 0.25390625, "learning_rate": 1.9940563382223196e-05, "loss": 1.1904, "step": 1260 }, { "epoch": 0.4, "grad_norm": 0.462890625, "learning_rate": 1.993653916644431e-05, "loss": 1.1485, "step": 1270 }, { "epoch": 0.4, "grad_norm": 0.173828125, "learning_rate": 1.9932383577419432e-05, "loss": 1.1406, "step": 1280 }, { "epoch": 0.4, "grad_norm": 1.40625, "learning_rate": 1.9928096670090552e-05, "loss": 1.1583, "step": 1290 }, { "epoch": 0.41, "grad_norm": 0.2255859375, "learning_rate": 1.992367850113585e-05, "loss": 1.1775, "step": 1300 }, { "epoch": 0.41, "grad_norm": 0.203125, "learning_rate": 1.9919129128968938e-05, "loss": 1.173, "step": 1310 }, { "epoch": 0.41, "grad_norm": 0.1845703125, "learning_rate": 1.9914448613738107e-05, "loss": 1.1648, "step": 1320 }, { "epoch": 0.42, "grad_norm": 0.58203125, "learning_rate": 1.9909637017325508e-05, "loss": 1.0632, "step": 1330 }, { "epoch": 0.42, "grad_norm": 0.2470703125, "learning_rate": 1.9904694403346362e-05, "loss": 1.1756, "step": 1340 }, { "epoch": 0.42, "grad_norm": 0.16015625, "learning_rate": 1.989962083714808e-05, "loss": 1.154, "step": 1350 }, { "epoch": 0.42, "grad_norm": 0.291015625, "learning_rate": 1.9894416385809444e-05, "loss": 1.1659, "step": 1360 }, { "epoch": 0.43, "grad_norm": 0.1884765625, "learning_rate": 1.9889081118139694e-05, "loss": 1.1852, "step": 1370 }, { "epoch": 0.43, "grad_norm": 0.2373046875, "learning_rate": 1.988361510467761e-05, "loss": 1.1686, "step": 1380 }, { "epoch": 0.43, "grad_norm": 0.267578125, "learning_rate": 1.9878018417690602e-05, "loss": 1.2461, "step": 1390 }, { "epoch": 0.44, "grad_norm": 3.09375, "learning_rate": 1.9872291131173743e-05, "loss": 1.1913, "step": 1400 }, { "epoch": 0.44, "grad_norm": 0.271484375, "learning_rate": 1.9866433320848793e-05, "loss": 1.2264, "step": 1410 }, { "epoch": 0.44, "grad_norm": 0.953125, "learning_rate": 1.9860445064163193e-05, "loss": 1.151, "step": 1420 }, { "epoch": 0.45, "grad_norm": 0.1875, "learning_rate": 1.985432644028905e-05, "loss": 1.1306, "step": 1430 }, { "epoch": 0.45, "grad_norm": 1.265625, "learning_rate": 1.9848077530122083e-05, "loss": 1.19, "step": 1440 }, { "epoch": 0.45, "grad_norm": 0.255859375, "learning_rate": 1.9841698416280554e-05, "loss": 1.0554, "step": 1450 }, { "epoch": 0.46, "grad_norm": 0.291015625, "learning_rate": 1.983518918310418e-05, "loss": 1.1399, "step": 1460 }, { "epoch": 0.46, "grad_norm": 1.6015625, "learning_rate": 1.9828549916653013e-05, "loss": 1.1098, "step": 1470 }, { "epoch": 0.46, "grad_norm": 0.193359375, "learning_rate": 1.982178070470631e-05, "loss": 1.1319, "step": 1480 }, { "epoch": 0.47, "grad_norm": 0.52734375, "learning_rate": 1.9814881636761355e-05, "loss": 1.2344, "step": 1490 }, { "epoch": 0.47, "grad_norm": 0.578125, "learning_rate": 1.9807852804032306e-05, "loss": 1.1336, "step": 1500 }, { "epoch": 0.47, "grad_norm": 0.16796875, "learning_rate": 1.980069429944895e-05, "loss": 1.1052, "step": 1510 }, { "epoch": 0.47, "grad_norm": 0.19921875, "learning_rate": 1.9793406217655516e-05, "loss": 1.1314, "step": 1520 }, { "epoch": 0.48, "grad_norm": 0.169921875, "learning_rate": 1.9785988655009386e-05, "loss": 1.1783, "step": 1530 }, { "epoch": 0.48, "grad_norm": 0.490234375, "learning_rate": 1.977844170957984e-05, "loss": 1.1526, "step": 1540 }, { "epoch": 0.48, "grad_norm": 0.33984375, "learning_rate": 1.977076548114677e-05, "loss": 1.1069, "step": 1550 }, { "epoch": 0.49, "grad_norm": 0.3828125, "learning_rate": 1.9762960071199334e-05, "loss": 1.2254, "step": 1560 }, { "epoch": 0.49, "grad_norm": 0.640625, "learning_rate": 1.975502558293464e-05, "loss": 1.1648, "step": 1570 }, { "epoch": 0.49, "grad_norm": 0.279296875, "learning_rate": 1.974696212125635e-05, "loss": 1.1854, "step": 1580 }, { "epoch": 0.5, "grad_norm": 0.3046875, "learning_rate": 1.9738769792773338e-05, "loss": 1.2473, "step": 1590 }, { "epoch": 0.5, "grad_norm": 0.326171875, "learning_rate": 1.973044870579824e-05, "loss": 1.1984, "step": 1600 }, { "epoch": 0.5, "grad_norm": 0.224609375, "learning_rate": 1.972199897034604e-05, "loss": 1.1372, "step": 1610 }, { "epoch": 0.51, "grad_norm": 0.5703125, "learning_rate": 1.9713420698132614e-05, "loss": 1.123, "step": 1620 }, { "epoch": 0.51, "grad_norm": 0.427734375, "learning_rate": 1.9704714002573255e-05, "loss": 1.1167, "step": 1630 }, { "epoch": 0.51, "grad_norm": 0.2177734375, "learning_rate": 1.9695878998781162e-05, "loss": 1.1196, "step": 1640 }, { "epoch": 0.52, "grad_norm": 0.2451171875, "learning_rate": 1.9686915803565934e-05, "loss": 1.1106, "step": 1650 }, { "epoch": 0.52, "grad_norm": 0.193359375, "learning_rate": 1.9677824535432012e-05, "loss": 1.1101, "step": 1660 }, { "epoch": 0.52, "grad_norm": 0.1982421875, "learning_rate": 1.9668605314577124e-05, "loss": 1.1876, "step": 1670 }, { "epoch": 0.53, "grad_norm": 0.2080078125, "learning_rate": 1.9659258262890683e-05, "loss": 1.1827, "step": 1680 }, { "epoch": 0.53, "grad_norm": 0.291015625, "learning_rate": 1.9649783503952196e-05, "loss": 1.1399, "step": 1690 }, { "epoch": 0.53, "grad_norm": 0.5546875, "learning_rate": 1.9640181163029597e-05, "loss": 1.13, "step": 1700 }, { "epoch": 0.53, "grad_norm": 0.2294921875, "learning_rate": 1.963045136707763e-05, "loss": 1.1582, "step": 1710 }, { "epoch": 0.54, "grad_norm": 0.2294921875, "learning_rate": 1.9620594244736133e-05, "loss": 1.2255, "step": 1720 }, { "epoch": 0.54, "grad_norm": 2.8125, "learning_rate": 1.9610609926328373e-05, "loss": 1.1538, "step": 1730 }, { "epoch": 0.54, "grad_norm": 0.2197265625, "learning_rate": 1.960049854385929e-05, "loss": 1.1813, "step": 1740 }, { "epoch": 0.55, "grad_norm": 0.38671875, "learning_rate": 1.9590260231013774e-05, "loss": 1.202, "step": 1750 }, { "epoch": 0.55, "grad_norm": 1.0625, "learning_rate": 1.957989512315489e-05, "loss": 1.1095, "step": 1760 }, { "epoch": 0.55, "grad_norm": 0.318359375, "learning_rate": 1.956940335732209e-05, "loss": 1.1931, "step": 1770 }, { "epoch": 0.56, "grad_norm": 0.287109375, "learning_rate": 1.9558785072229395e-05, "loss": 1.1645, "step": 1780 }, { "epoch": 0.56, "grad_norm": 0.181640625, "learning_rate": 1.9548040408263575e-05, "loss": 1.1852, "step": 1790 }, { "epoch": 0.56, "grad_norm": 0.369140625, "learning_rate": 1.953716950748227e-05, "loss": 1.0633, "step": 1800 }, { "epoch": 0.57, "grad_norm": 0.2041015625, "learning_rate": 1.952617251361214e-05, "loss": 1.1748, "step": 1810 }, { "epoch": 0.57, "grad_norm": 0.2158203125, "learning_rate": 1.951504957204694e-05, "loss": 1.075, "step": 1820 }, { "epoch": 0.57, "grad_norm": 0.16796875, "learning_rate": 1.9503800829845613e-05, "loss": 1.1498, "step": 1830 }, { "epoch": 0.57, "grad_norm": 0.68359375, "learning_rate": 1.949242643573034e-05, "loss": 1.1333, "step": 1840 }, { "epoch": 0.58, "grad_norm": 0.55078125, "learning_rate": 1.9480926540084578e-05, "loss": 1.1932, "step": 1850 }, { "epoch": 0.58, "grad_norm": 0.9453125, "learning_rate": 1.946930129495106e-05, "loss": 1.2105, "step": 1860 }, { "epoch": 0.58, "grad_norm": 0.1669921875, "learning_rate": 1.9457550854029797e-05, "loss": 1.0915, "step": 1870 }, { "epoch": 0.59, "grad_norm": 0.2041015625, "learning_rate": 1.944567537267605e-05, "loss": 1.105, "step": 1880 }, { "epoch": 0.59, "grad_norm": 0.27734375, "learning_rate": 1.9433675007898255e-05, "loss": 1.1275, "step": 1890 }, { "epoch": 0.59, "grad_norm": 0.1884765625, "learning_rate": 1.9421549918355965e-05, "loss": 1.1369, "step": 1900 }, { "epoch": 0.6, "grad_norm": 0.3046875, "learning_rate": 1.9409300264357757e-05, "loss": 1.134, "step": 1910 }, { "epoch": 0.6, "grad_norm": 0.2001953125, "learning_rate": 1.9396926207859085e-05, "loss": 1.0956, "step": 1920 }, { "epoch": 0.6, "grad_norm": 0.2060546875, "learning_rate": 1.9384427912460172e-05, "loss": 1.1498, "step": 1930 }, { "epoch": 0.61, "grad_norm": 0.2216796875, "learning_rate": 1.9371805543403826e-05, "loss": 1.1962, "step": 1940 }, { "epoch": 0.61, "grad_norm": 0.29296875, "learning_rate": 1.935905926757326e-05, "loss": 1.1103, "step": 1950 }, { "epoch": 0.61, "grad_norm": 0.1953125, "learning_rate": 1.9346189253489888e-05, "loss": 1.0858, "step": 1960 }, { "epoch": 0.62, "grad_norm": 0.2734375, "learning_rate": 1.9333195671311093e-05, "loss": 1.1537, "step": 1970 }, { "epoch": 0.62, "grad_norm": 0.2119140625, "learning_rate": 1.932007869282799e-05, "loss": 1.1339, "step": 1980 }, { "epoch": 0.62, "grad_norm": 0.3046875, "learning_rate": 1.9306838491463126e-05, "loss": 1.0732, "step": 1990 }, { "epoch": 0.62, "grad_norm": 0.203125, "learning_rate": 1.9293475242268224e-05, "loss": 1.1001, "step": 2000 }, { "epoch": 0.63, "grad_norm": 0.193359375, "learning_rate": 1.9279989121921846e-05, "loss": 1.1295, "step": 2010 }, { "epoch": 0.63, "grad_norm": 0.61328125, "learning_rate": 1.9266380308727054e-05, "loss": 1.1069, "step": 2020 }, { "epoch": 0.63, "grad_norm": 0.34765625, "learning_rate": 1.9252648982609065e-05, "loss": 1.1003, "step": 2030 }, { "epoch": 0.64, "grad_norm": 0.255859375, "learning_rate": 1.9238795325112867e-05, "loss": 1.098, "step": 2040 }, { "epoch": 0.64, "grad_norm": 0.66796875, "learning_rate": 1.9224819519400825e-05, "loss": 1.1447, "step": 2050 }, { "epoch": 0.64, "grad_norm": 0.91796875, "learning_rate": 1.9210721750250237e-05, "loss": 1.0511, "step": 2060 }, { "epoch": 0.65, "grad_norm": 0.2119140625, "learning_rate": 1.9196502204050925e-05, "loss": 1.1048, "step": 2070 }, { "epoch": 0.65, "grad_norm": 0.30859375, "learning_rate": 1.9182161068802742e-05, "loss": 1.2156, "step": 2080 }, { "epoch": 0.65, "grad_norm": 0.16015625, "learning_rate": 1.9167698534113105e-05, "loss": 1.0455, "step": 2090 }, { "epoch": 0.66, "grad_norm": 0.20703125, "learning_rate": 1.9153114791194475e-05, "loss": 1.2662, "step": 2100 }, { "epoch": 0.66, "grad_norm": 0.212890625, "learning_rate": 1.9138410032861833e-05, "loss": 1.1073, "step": 2110 }, { "epoch": 0.66, "grad_norm": 0.20703125, "learning_rate": 1.9123584453530145e-05, "loss": 1.1808, "step": 2120 }, { "epoch": 0.67, "grad_norm": 0.2041015625, "learning_rate": 1.910863824921176e-05, "loss": 1.0918, "step": 2130 }, { "epoch": 0.67, "grad_norm": 0.2158203125, "learning_rate": 1.9093571617513853e-05, "loss": 1.1115, "step": 2140 }, { "epoch": 0.67, "grad_norm": 0.31640625, "learning_rate": 1.907838475763579e-05, "loss": 1.1847, "step": 2150 }, { "epoch": 0.68, "grad_norm": 0.251953125, "learning_rate": 1.9063077870366504e-05, "loss": 1.1355, "step": 2160 }, { "epoch": 0.68, "grad_norm": 0.18359375, "learning_rate": 1.9047651158081827e-05, "loss": 1.1227, "step": 2170 }, { "epoch": 0.68, "grad_norm": 1.0625, "learning_rate": 1.9032104824741843e-05, "loss": 1.156, "step": 2180 }, { "epoch": 0.68, "grad_norm": 0.205078125, "learning_rate": 1.901643907588816e-05, "loss": 1.1995, "step": 2190 }, { "epoch": 0.69, "grad_norm": 0.2060546875, "learning_rate": 1.900065411864121e-05, "loss": 1.1294, "step": 2200 }, { "epoch": 0.69, "grad_norm": 0.1591796875, "learning_rate": 1.898475016169751e-05, "loss": 1.1487, "step": 2210 }, { "epoch": 0.69, "grad_norm": 0.248046875, "learning_rate": 1.8968727415326885e-05, "loss": 1.1438, "step": 2220 }, { "epoch": 0.7, "grad_norm": 0.2294921875, "learning_rate": 1.895258609136972e-05, "loss": 1.1341, "step": 2230 }, { "epoch": 0.7, "grad_norm": 0.9765625, "learning_rate": 1.8936326403234125e-05, "loss": 1.1479, "step": 2240 }, { "epoch": 0.7, "grad_norm": 0.169921875, "learning_rate": 1.8919948565893144e-05, "loss": 1.2063, "step": 2250 }, { "epoch": 0.71, "grad_norm": 0.1923828125, "learning_rate": 1.8903452795881893e-05, "loss": 1.1608, "step": 2260 }, { "epoch": 0.71, "grad_norm": 0.2099609375, "learning_rate": 1.8886839311294695e-05, "loss": 1.1145, "step": 2270 }, { "epoch": 0.71, "grad_norm": 0.201171875, "learning_rate": 1.887010833178222e-05, "loss": 1.0683, "step": 2280 }, { "epoch": 0.72, "grad_norm": 0.19140625, "learning_rate": 1.885326007854855e-05, "loss": 1.1338, "step": 2290 }, { "epoch": 0.72, "grad_norm": 0.19921875, "learning_rate": 1.883629477434828e-05, "loss": 1.1307, "step": 2300 }, { "epoch": 0.72, "grad_norm": 0.173828125, "learning_rate": 1.881921264348355e-05, "loss": 1.1751, "step": 2310 }, { "epoch": 0.72, "grad_norm": 0.1953125, "learning_rate": 1.880201391180111e-05, "loss": 1.1072, "step": 2320 }, { "epoch": 0.73, "grad_norm": 0.201171875, "learning_rate": 1.8784698806689308e-05, "loss": 1.1771, "step": 2330 }, { "epoch": 0.73, "grad_norm": 0.1552734375, "learning_rate": 1.876726755707508e-05, "loss": 1.1182, "step": 2340 }, { "epoch": 0.73, "grad_norm": 0.38671875, "learning_rate": 1.8749720393420948e-05, "loss": 1.2228, "step": 2350 }, { "epoch": 0.74, "grad_norm": 0.20703125, "learning_rate": 1.8732057547721962e-05, "loss": 1.1807, "step": 2360 }, { "epoch": 0.74, "grad_norm": 0.205078125, "learning_rate": 1.8714279253502616e-05, "loss": 1.0115, "step": 2370 }, { "epoch": 0.74, "grad_norm": 0.328125, "learning_rate": 1.8696385745813793e-05, "loss": 1.0663, "step": 2380 }, { "epoch": 0.75, "grad_norm": 0.251953125, "learning_rate": 1.8678377261229624e-05, "loss": 1.1894, "step": 2390 }, { "epoch": 0.75, "grad_norm": 0.26171875, "learning_rate": 1.866025403784439e-05, "loss": 1.2207, "step": 2400 }, { "epoch": 0.75, "grad_norm": 0.287109375, "learning_rate": 1.8642016315269345e-05, "loss": 1.092, "step": 2410 }, { "epoch": 0.76, "grad_norm": 0.1982421875, "learning_rate": 1.8623664334629576e-05, "loss": 1.1397, "step": 2420 }, { "epoch": 0.76, "grad_norm": 0.291015625, "learning_rate": 1.860519833856079e-05, "loss": 1.1003, "step": 2430 }, { "epoch": 0.76, "grad_norm": 0.267578125, "learning_rate": 1.8586618571206133e-05, "loss": 1.1145, "step": 2440 }, { "epoch": 0.77, "grad_norm": 0.5703125, "learning_rate": 1.856792527821293e-05, "loss": 1.1072, "step": 2450 }, { "epoch": 0.77, "grad_norm": 0.271484375, "learning_rate": 1.854911870672947e-05, "loss": 1.2103, "step": 2460 }, { "epoch": 0.77, "grad_norm": 0.173828125, "learning_rate": 1.8530199105401705e-05, "loss": 1.2184, "step": 2470 }, { "epoch": 0.78, "grad_norm": 0.1669921875, "learning_rate": 1.8511166724369997e-05, "loss": 1.1892, "step": 2480 }, { "epoch": 0.78, "grad_norm": 0.28125, "learning_rate": 1.849202181526579e-05, "loss": 1.1147, "step": 2490 }, { "epoch": 0.78, "grad_norm": 0.177734375, "learning_rate": 1.847276463120828e-05, "loss": 1.1493, "step": 2500 }, { "epoch": 0.78, "grad_norm": 0.1630859375, "learning_rate": 1.8453395426801083e-05, "loss": 1.1848, "step": 2510 }, { "epoch": 0.79, "grad_norm": 0.216796875, "learning_rate": 1.843391445812886e-05, "loss": 1.1074, "step": 2520 }, { "epoch": 0.79, "grad_norm": 0.24609375, "learning_rate": 1.841432198275393e-05, "loss": 1.1532, "step": 2530 }, { "epoch": 0.79, "grad_norm": 0.197265625, "learning_rate": 1.8394618259712866e-05, "loss": 1.1678, "step": 2540 }, { "epoch": 0.8, "grad_norm": 0.2138671875, "learning_rate": 1.837480354951308e-05, "loss": 1.1779, "step": 2550 }, { "epoch": 0.8, "grad_norm": 0.1943359375, "learning_rate": 1.8354878114129368e-05, "loss": 1.2235, "step": 2560 }, { "epoch": 0.8, "grad_norm": 0.18359375, "learning_rate": 1.833484221700044e-05, "loss": 1.128, "step": 2570 }, { "epoch": 0.81, "grad_norm": 0.216796875, "learning_rate": 1.8314696123025456e-05, "loss": 1.0884, "step": 2580 }, { "epoch": 0.81, "grad_norm": 0.1728515625, "learning_rate": 1.8294440098560508e-05, "loss": 1.152, "step": 2590 }, { "epoch": 0.81, "grad_norm": 0.1962890625, "learning_rate": 1.8274074411415104e-05, "loss": 1.1352, "step": 2600 }, { "epoch": 0.82, "grad_norm": 0.419921875, "learning_rate": 1.8253599330848638e-05, "loss": 1.129, "step": 2610 }, { "epoch": 0.82, "grad_norm": 0.271484375, "learning_rate": 1.8233015127566805e-05, "loss": 1.1701, "step": 2620 }, { "epoch": 0.82, "grad_norm": 0.302734375, "learning_rate": 1.8212322073718042e-05, "loss": 1.1856, "step": 2630 }, { "epoch": 0.82, "grad_norm": 0.255859375, "learning_rate": 1.819152044288992e-05, "loss": 1.1635, "step": 2640 }, { "epoch": 0.83, "grad_norm": 1.0625, "learning_rate": 1.817061051010554e-05, "loss": 1.1203, "step": 2650 }, { "epoch": 0.83, "grad_norm": 0.19140625, "learning_rate": 1.8149592551819883e-05, "loss": 1.1662, "step": 2660 }, { "epoch": 0.83, "grad_norm": 0.2353515625, "learning_rate": 1.8128466845916156e-05, "loss": 1.1825, "step": 2670 }, { "epoch": 0.84, "grad_norm": 0.201171875, "learning_rate": 1.8107233671702123e-05, "loss": 1.1868, "step": 2680 }, { "epoch": 0.84, "grad_norm": 0.212890625, "learning_rate": 1.808589330990642e-05, "loss": 1.1071, "step": 2690 }, { "epoch": 0.84, "grad_norm": 0.1943359375, "learning_rate": 1.806444604267483e-05, "loss": 1.1734, "step": 2700 }, { "epoch": 0.85, "grad_norm": 0.193359375, "learning_rate": 1.8042892153566543e-05, "loss": 1.213, "step": 2710 }, { "epoch": 0.85, "grad_norm": 0.61328125, "learning_rate": 1.802123192755044e-05, "loss": 1.1722, "step": 2720 }, { "epoch": 0.85, "grad_norm": 0.220703125, "learning_rate": 1.7999465651001297e-05, "loss": 1.0755, "step": 2730 }, { "epoch": 0.86, "grad_norm": 0.6875, "learning_rate": 1.7977593611696017e-05, "loss": 1.1852, "step": 2740 }, { "epoch": 0.86, "grad_norm": 0.19921875, "learning_rate": 1.79556160988098e-05, "loss": 1.1037, "step": 2750 }, { "epoch": 0.86, "grad_norm": 0.166015625, "learning_rate": 1.7933533402912354e-05, "loss": 1.2323, "step": 2760 }, { "epoch": 0.87, "grad_norm": 0.2099609375, "learning_rate": 1.791134581596402e-05, "loss": 1.1885, "step": 2770 }, { "epoch": 0.87, "grad_norm": 0.29296875, "learning_rate": 1.7889053631311947e-05, "loss": 1.1541, "step": 2780 }, { "epoch": 0.87, "grad_norm": 0.2373046875, "learning_rate": 1.786665714368617e-05, "loss": 1.1488, "step": 2790 }, { "epoch": 0.88, "grad_norm": 0.75, "learning_rate": 1.784415664919576e-05, "loss": 1.1424, "step": 2800 }, { "epoch": 0.88, "grad_norm": 1.03125, "learning_rate": 1.782155244532487e-05, "loss": 1.1235, "step": 2810 }, { "epoch": 0.88, "grad_norm": 0.388671875, "learning_rate": 1.7798844830928818e-05, "loss": 1.3127, "step": 2820 }, { "epoch": 0.88, "grad_norm": 0.4609375, "learning_rate": 1.7776034106230156e-05, "loss": 1.1475, "step": 2830 }, { "epoch": 0.89, "grad_norm": 0.1650390625, "learning_rate": 1.775312057281466e-05, "loss": 1.1558, "step": 2840 }, { "epoch": 0.89, "grad_norm": 0.171875, "learning_rate": 1.773010453362737e-05, "loss": 1.0776, "step": 2850 }, { "epoch": 0.89, "grad_norm": 0.236328125, "learning_rate": 1.770698629296858e-05, "loss": 1.1442, "step": 2860 }, { "epoch": 0.9, "grad_norm": 0.498046875, "learning_rate": 1.7683766156489817e-05, "loss": 1.1917, "step": 2870 }, { "epoch": 0.9, "grad_norm": 0.1826171875, "learning_rate": 1.766044443118978e-05, "loss": 1.2439, "step": 2880 }, { "epoch": 0.9, "grad_norm": 0.2177734375, "learning_rate": 1.763702142541032e-05, "loss": 1.129, "step": 2890 }, { "epoch": 0.91, "grad_norm": 0.2314453125, "learning_rate": 1.761349744883231e-05, "loss": 1.1861, "step": 2900 }, { "epoch": 0.91, "grad_norm": 0.291015625, "learning_rate": 1.758987281247162e-05, "loss": 1.1001, "step": 2910 }, { "epoch": 0.91, "grad_norm": 0.17578125, "learning_rate": 1.756614782867493e-05, "loss": 1.2019, "step": 2920 }, { "epoch": 0.92, "grad_norm": 0.1552734375, "learning_rate": 1.7542322811115657e-05, "loss": 1.1267, "step": 2930 }, { "epoch": 0.92, "grad_norm": 0.158203125, "learning_rate": 1.7518398074789776e-05, "loss": 1.1625, "step": 2940 }, { "epoch": 0.92, "grad_norm": 0.1728515625, "learning_rate": 1.7494373936011674e-05, "loss": 1.1846, "step": 2950 }, { "epoch": 0.93, "grad_norm": 0.26171875, "learning_rate": 1.7470250712409963e-05, "loss": 1.0803, "step": 2960 }, { "epoch": 0.93, "grad_norm": 0.2021484375, "learning_rate": 1.7446028722923266e-05, "loss": 1.1674, "step": 2970 }, { "epoch": 0.93, "grad_norm": 0.240234375, "learning_rate": 1.7421708287796017e-05, "loss": 1.1976, "step": 2980 }, { "epoch": 0.93, "grad_norm": 0.23828125, "learning_rate": 1.7397289728574234e-05, "loss": 1.1307, "step": 2990 }, { "epoch": 0.94, "grad_norm": 0.61328125, "learning_rate": 1.737277336810124e-05, "loss": 1.1122, "step": 3000 }, { "epoch": 0.94, "grad_norm": 0.1845703125, "learning_rate": 1.7348159530513424e-05, "loss": 1.1558, "step": 3010 }, { "epoch": 0.94, "grad_norm": 0.1826171875, "learning_rate": 1.7323448541235922e-05, "loss": 1.1335, "step": 3020 }, { "epoch": 0.95, "grad_norm": 0.2109375, "learning_rate": 1.7298640726978357e-05, "loss": 1.1388, "step": 3030 }, { "epoch": 0.95, "grad_norm": 0.2578125, "learning_rate": 1.7273736415730488e-05, "loss": 1.207, "step": 3040 }, { "epoch": 0.95, "grad_norm": 0.25, "learning_rate": 1.7248735936757882e-05, "loss": 1.113, "step": 3050 }, { "epoch": 0.96, "grad_norm": 0.201171875, "learning_rate": 1.7223639620597556e-05, "loss": 1.1843, "step": 3060 }, { "epoch": 0.96, "grad_norm": 0.189453125, "learning_rate": 1.719844779905363e-05, "loss": 1.1279, "step": 3070 }, { "epoch": 0.96, "grad_norm": 0.18359375, "learning_rate": 1.7173160805192895e-05, "loss": 1.1639, "step": 3080 }, { "epoch": 0.97, "grad_norm": 0.2041015625, "learning_rate": 1.7147778973340466e-05, "loss": 1.1434, "step": 3090 }, { "epoch": 0.97, "grad_norm": 0.2158203125, "learning_rate": 1.7122302639075313e-05, "loss": 1.1951, "step": 3100 }, { "epoch": 0.97, "grad_norm": 2.046875, "learning_rate": 1.7096732139225853e-05, "loss": 1.1432, "step": 3110 }, { "epoch": 0.97, "grad_norm": 0.2060546875, "learning_rate": 1.7071067811865477e-05, "loss": 1.1327, "step": 3120 }, { "epoch": 0.98, "grad_norm": 0.314453125, "learning_rate": 1.7045309996308107e-05, "loss": 1.1622, "step": 3130 }, { "epoch": 0.98, "grad_norm": 0.1904296875, "learning_rate": 1.7019459033103684e-05, "loss": 1.1804, "step": 3140 }, { "epoch": 0.98, "grad_norm": 0.1708984375, "learning_rate": 1.699351526403367e-05, "loss": 1.0703, "step": 3150 }, { "epoch": 0.99, "grad_norm": 0.30859375, "learning_rate": 1.6967479032106552e-05, "loss": 1.1724, "step": 3160 }, { "epoch": 0.99, "grad_norm": 0.1953125, "learning_rate": 1.694135068155327e-05, "loss": 1.1547, "step": 3170 }, { "epoch": 0.99, "grad_norm": 0.2314453125, "learning_rate": 1.6915130557822698e-05, "loss": 1.1062, "step": 3180 }, { "epoch": 1.0, "grad_norm": 0.392578125, "learning_rate": 1.6888819007577054e-05, "loss": 1.1308, "step": 3190 }, { "epoch": 1.0, "grad_norm": 0.546875, "learning_rate": 1.686241637868734e-05, "loss": 1.0832, "step": 3200 }, { "epoch": 1.0, "grad_norm": 0.3359375, "learning_rate": 1.6835923020228714e-05, "loss": 1.1029, "step": 3210 }, { "epoch": 1.01, "grad_norm": 0.412109375, "learning_rate": 1.6809339282475905e-05, "loss": 1.1039, "step": 3220 }, { "epoch": 1.01, "grad_norm": 0.2021484375, "learning_rate": 1.678266551689856e-05, "loss": 1.2061, "step": 3230 }, { "epoch": 1.01, "grad_norm": 0.2451171875, "learning_rate": 1.6755902076156606e-05, "loss": 1.2829, "step": 3240 }, { "epoch": 1.02, "grad_norm": 0.171875, "learning_rate": 1.6729049314095578e-05, "loss": 1.1065, "step": 3250 }, { "epoch": 1.02, "grad_norm": 0.2578125, "learning_rate": 1.670210758574196e-05, "loss": 1.1086, "step": 3260 }, { "epoch": 1.02, "grad_norm": 0.333984375, "learning_rate": 1.6675077247298475e-05, "loss": 1.0745, "step": 3270 }, { "epoch": 1.02, "grad_norm": 0.72265625, "learning_rate": 1.6647958656139377e-05, "loss": 1.1755, "step": 3280 }, { "epoch": 1.03, "grad_norm": 0.2431640625, "learning_rate": 1.662075217080574e-05, "loss": 1.2102, "step": 3290 }, { "epoch": 1.03, "grad_norm": 0.2197265625, "learning_rate": 1.659345815100069e-05, "loss": 1.1135, "step": 3300 }, { "epoch": 1.03, "grad_norm": 0.37890625, "learning_rate": 1.656607695758468e-05, "loss": 1.1546, "step": 3310 }, { "epoch": 1.04, "grad_norm": 0.287109375, "learning_rate": 1.6538608952570698e-05, "loss": 1.0797, "step": 3320 }, { "epoch": 1.04, "grad_norm": 2.125, "learning_rate": 1.6511054499119493e-05, "loss": 1.142, "step": 3330 }, { "epoch": 1.04, "grad_norm": 0.203125, "learning_rate": 1.6483413961534764e-05, "loss": 1.1214, "step": 3340 }, { "epoch": 1.05, "grad_norm": 0.21875, "learning_rate": 1.6455687705258348e-05, "loss": 1.1406, "step": 3350 }, { "epoch": 1.05, "grad_norm": 0.7734375, "learning_rate": 1.6427876096865394e-05, "loss": 1.1028, "step": 3360 }, { "epoch": 1.05, "grad_norm": 0.1728515625, "learning_rate": 1.6399979504059506e-05, "loss": 1.1224, "step": 3370 }, { "epoch": 1.06, "grad_norm": 0.609375, "learning_rate": 1.6371998295667885e-05, "loss": 1.1554, "step": 3380 }, { "epoch": 1.06, "grad_norm": 0.2275390625, "learning_rate": 1.6343932841636455e-05, "loss": 1.1098, "step": 3390 }, { "epoch": 1.06, "grad_norm": 0.1572265625, "learning_rate": 1.6315783513024977e-05, "loss": 1.1195, "step": 3400 }, { "epoch": 1.07, "grad_norm": 0.173828125, "learning_rate": 1.6287550682002126e-05, "loss": 1.069, "step": 3410 }, { "epoch": 1.07, "grad_norm": 0.306640625, "learning_rate": 1.6259234721840595e-05, "loss": 1.241, "step": 3420 }, { "epoch": 1.07, "grad_norm": 0.2275390625, "learning_rate": 1.6230836006912127e-05, "loss": 1.0996, "step": 3430 }, { "epoch": 1.07, "grad_norm": 0.1689453125, "learning_rate": 1.6202354912682602e-05, "loss": 1.2133, "step": 3440 }, { "epoch": 1.08, "grad_norm": 0.169921875, "learning_rate": 1.6173791815707053e-05, "loss": 1.0838, "step": 3450 }, { "epoch": 1.08, "grad_norm": 0.298828125, "learning_rate": 1.6145147093624677e-05, "loss": 1.1078, "step": 3460 }, { "epoch": 1.08, "grad_norm": 0.2001953125, "learning_rate": 1.6116421125153876e-05, "loss": 1.172, "step": 3470 }, { "epoch": 1.09, "grad_norm": 0.625, "learning_rate": 1.608761429008721e-05, "loss": 1.0926, "step": 3480 }, { "epoch": 1.09, "grad_norm": 0.330078125, "learning_rate": 1.60587269692864e-05, "loss": 1.0849, "step": 3490 }, { "epoch": 1.09, "grad_norm": 0.21484375, "learning_rate": 1.6029759544677298e-05, "loss": 1.2657, "step": 3500 }, { "epoch": 1.1, "grad_norm": 0.2275390625, "learning_rate": 1.6000712399244813e-05, "loss": 1.1805, "step": 3510 }, { "epoch": 1.1, "grad_norm": 0.26953125, "learning_rate": 1.5971585917027864e-05, "loss": 1.0808, "step": 3520 }, { "epoch": 1.1, "grad_norm": 0.173828125, "learning_rate": 1.5942380483114305e-05, "loss": 1.1493, "step": 3530 }, { "epoch": 1.11, "grad_norm": 0.1669921875, "learning_rate": 1.5913096483635827e-05, "loss": 1.1078, "step": 3540 }, { "epoch": 1.11, "grad_norm": 0.20703125, "learning_rate": 1.5883734305762846e-05, "loss": 1.0484, "step": 3550 }, { "epoch": 1.11, "grad_norm": 0.1796875, "learning_rate": 1.5854294337699407e-05, "loss": 1.1525, "step": 3560 }, { "epoch": 1.12, "grad_norm": 0.189453125, "learning_rate": 1.5824776968678024e-05, "loss": 1.0877, "step": 3570 }, { "epoch": 1.12, "grad_norm": 0.1845703125, "learning_rate": 1.5795182588954553e-05, "loss": 1.1422, "step": 3580 }, { "epoch": 1.12, "grad_norm": 0.2060546875, "learning_rate": 1.576551158980302e-05, "loss": 1.2228, "step": 3590 }, { "epoch": 1.12, "grad_norm": 0.201171875, "learning_rate": 1.573576436351046e-05, "loss": 1.0375, "step": 3600 }, { "epoch": 1.13, "grad_norm": 1.59375, "learning_rate": 1.570594130337173e-05, "loss": 1.1789, "step": 3610 }, { "epoch": 1.13, "grad_norm": 0.333984375, "learning_rate": 1.567604280368429e-05, "loss": 1.1084, "step": 3620 }, { "epoch": 1.13, "grad_norm": 0.1572265625, "learning_rate": 1.5646069259743007e-05, "loss": 1.1219, "step": 3630 }, { "epoch": 1.14, "grad_norm": 0.19921875, "learning_rate": 1.561602106783493e-05, "loss": 1.2003, "step": 3640 }, { "epoch": 1.14, "grad_norm": 0.2412109375, "learning_rate": 1.5585898625234047e-05, "loss": 1.191, "step": 3650 }, { "epoch": 1.14, "grad_norm": 0.1748046875, "learning_rate": 1.5555702330196024e-05, "loss": 1.2076, "step": 3660 }, { "epoch": 1.15, "grad_norm": 0.267578125, "learning_rate": 1.552543258195295e-05, "loss": 1.0907, "step": 3670 }, { "epoch": 1.15, "grad_norm": 0.1875, "learning_rate": 1.5495089780708062e-05, "loss": 1.1739, "step": 3680 }, { "epoch": 1.15, "grad_norm": 1.2109375, "learning_rate": 1.5464674327630437e-05, "loss": 1.086, "step": 3690 }, { "epoch": 1.16, "grad_norm": 0.181640625, "learning_rate": 1.54341866248497e-05, "loss": 1.182, "step": 3700 }, { "epoch": 1.16, "grad_norm": 0.291015625, "learning_rate": 1.5403627075450717e-05, "loss": 1.2353, "step": 3710 }, { "epoch": 1.16, "grad_norm": 0.2197265625, "learning_rate": 1.5372996083468242e-05, "loss": 1.0746, "step": 3720 }, { "epoch": 1.17, "grad_norm": 0.216796875, "learning_rate": 1.534229405388159e-05, "loss": 1.1302, "step": 3730 }, { "epoch": 1.17, "grad_norm": 0.275390625, "learning_rate": 1.5311521392609283e-05, "loss": 1.1688, "step": 3740 }, { "epoch": 1.17, "grad_norm": 0.2060546875, "learning_rate": 1.528067850650368e-05, "loss": 1.1074, "step": 3750 }, { "epoch": 1.18, "grad_norm": 0.1953125, "learning_rate": 1.5249765803345602e-05, "loss": 1.1006, "step": 3760 }, { "epoch": 1.18, "grad_norm": 0.232421875, "learning_rate": 1.5218783691838935e-05, "loss": 1.1077, "step": 3770 }, { "epoch": 1.18, "grad_norm": 0.189453125, "learning_rate": 1.5187732581605217e-05, "loss": 1.0822, "step": 3780 }, { "epoch": 1.18, "grad_norm": 0.1953125, "learning_rate": 1.515661288317825e-05, "loss": 1.1424, "step": 3790 }, { "epoch": 1.19, "grad_norm": 0.1806640625, "learning_rate": 1.5125425007998653e-05, "loss": 1.1602, "step": 3800 }, { "epoch": 1.19, "grad_norm": 0.251953125, "learning_rate": 1.509416936840842e-05, "loss": 1.0829, "step": 3810 }, { "epoch": 1.19, "grad_norm": 0.46484375, "learning_rate": 1.5062846377645476e-05, "loss": 1.1062, "step": 3820 }, { "epoch": 1.2, "grad_norm": 0.66796875, "learning_rate": 1.5031456449838207e-05, "loss": 1.1321, "step": 3830 }, { "epoch": 1.2, "grad_norm": 0.21484375, "learning_rate": 1.5000000000000002e-05, "loss": 1.0985, "step": 3840 }, { "epoch": 1.2, "grad_norm": 0.185546875, "learning_rate": 1.4968477444023739e-05, "loss": 1.183, "step": 3850 }, { "epoch": 1.21, "grad_norm": 0.1796875, "learning_rate": 1.4936889198676303e-05, "loss": 1.1727, "step": 3860 }, { "epoch": 1.21, "grad_norm": 0.1708984375, "learning_rate": 1.4905235681593079e-05, "loss": 1.0394, "step": 3870 }, { "epoch": 1.21, "grad_norm": 0.328125, "learning_rate": 1.4873517311272425e-05, "loss": 1.1763, "step": 3880 }, { "epoch": 1.22, "grad_norm": 0.21875, "learning_rate": 1.484173450707013e-05, "loss": 1.143, "step": 3890 }, { "epoch": 1.22, "grad_norm": 0.1689453125, "learning_rate": 1.4809887689193878e-05, "loss": 1.1462, "step": 3900 }, { "epoch": 1.22, "grad_norm": 0.279296875, "learning_rate": 1.4777977278697704e-05, "loss": 1.2094, "step": 3910 }, { "epoch": 1.23, "grad_norm": 0.251953125, "learning_rate": 1.4746003697476406e-05, "loss": 1.1694, "step": 3920 }, { "epoch": 1.23, "grad_norm": 0.1962890625, "learning_rate": 1.4713967368259981e-05, "loss": 1.1119, "step": 3930 }, { "epoch": 1.23, "grad_norm": 0.17578125, "learning_rate": 1.4681868714608021e-05, "loss": 1.1699, "step": 3940 }, { "epoch": 1.23, "grad_norm": 0.271484375, "learning_rate": 1.4649708160904142e-05, "loss": 1.1966, "step": 3950 }, { "epoch": 1.24, "grad_norm": 0.158203125, "learning_rate": 1.4617486132350343e-05, "loss": 1.1101, "step": 3960 }, { "epoch": 1.24, "grad_norm": 0.203125, "learning_rate": 1.458520305496139e-05, "loss": 1.0989, "step": 3970 }, { "epoch": 1.24, "grad_norm": 0.1552734375, "learning_rate": 1.4552859355559205e-05, "loss": 1.1079, "step": 3980 }, { "epoch": 1.25, "grad_norm": 0.55859375, "learning_rate": 1.45204554617672e-05, "loss": 1.1722, "step": 3990 }, { "epoch": 1.25, "grad_norm": 0.1796875, "learning_rate": 1.4487991802004625e-05, "loss": 1.1155, "step": 4000 }, { "epoch": 1.25, "grad_norm": 0.177734375, "learning_rate": 1.4455468805480918e-05, "loss": 1.2127, "step": 4010 }, { "epoch": 1.26, "grad_norm": 0.23828125, "learning_rate": 1.4422886902190014e-05, "loss": 1.0877, "step": 4020 }, { "epoch": 1.26, "grad_norm": 0.2109375, "learning_rate": 1.4390246522904681e-05, "loss": 1.1662, "step": 4030 }, { "epoch": 1.26, "grad_norm": 0.328125, "learning_rate": 1.4357548099170794e-05, "loss": 1.0952, "step": 4040 }, { "epoch": 1.27, "grad_norm": 0.375, "learning_rate": 1.4324792063301662e-05, "loss": 1.18, "step": 4050 }, { "epoch": 1.27, "grad_norm": 0.392578125, "learning_rate": 1.4291978848372292e-05, "loss": 1.1226, "step": 4060 }, { "epoch": 1.27, "grad_norm": 0.2021484375, "learning_rate": 1.425910888821366e-05, "loss": 1.1017, "step": 4070 }, { "epoch": 1.27, "grad_norm": 0.443359375, "learning_rate": 1.4226182617406996e-05, "loss": 1.1678, "step": 4080 }, { "epoch": 1.28, "grad_norm": 0.1923828125, "learning_rate": 1.4193200471278019e-05, "loss": 1.16, "step": 4090 }, { "epoch": 1.28, "grad_norm": 0.63671875, "learning_rate": 1.4160162885891193e-05, "loss": 1.1208, "step": 4100 }, { "epoch": 1.28, "grad_norm": 0.1708984375, "learning_rate": 1.4127070298043949e-05, "loss": 1.1794, "step": 4110 }, { "epoch": 1.29, "grad_norm": 0.287109375, "learning_rate": 1.4093923145260926e-05, "loss": 1.0882, "step": 4120 }, { "epoch": 1.29, "grad_norm": 0.1806640625, "learning_rate": 1.4060721865788178e-05, "loss": 1.1112, "step": 4130 }, { "epoch": 1.29, "grad_norm": 0.24609375, "learning_rate": 1.4027466898587375e-05, "loss": 1.1513, "step": 4140 }, { "epoch": 1.3, "grad_norm": 0.1748046875, "learning_rate": 1.3994158683330006e-05, "loss": 1.1307, "step": 4150 }, { "epoch": 1.3, "grad_norm": 0.1748046875, "learning_rate": 1.396079766039157e-05, "loss": 1.1097, "step": 4160 }, { "epoch": 1.3, "grad_norm": 0.189453125, "learning_rate": 1.3927384270845744e-05, "loss": 1.1136, "step": 4170 }, { "epoch": 1.31, "grad_norm": 0.267578125, "learning_rate": 1.3893918956458554e-05, "loss": 1.1208, "step": 4180 }, { "epoch": 1.31, "grad_norm": 0.224609375, "learning_rate": 1.3860402159682535e-05, "loss": 1.1909, "step": 4190 }, { "epoch": 1.31, "grad_norm": 0.1904296875, "learning_rate": 1.3826834323650899e-05, "loss": 1.1901, "step": 4200 }, { "epoch": 1.32, "grad_norm": 0.22265625, "learning_rate": 1.3793215892171636e-05, "loss": 1.227, "step": 4210 }, { "epoch": 1.32, "grad_norm": 0.15234375, "learning_rate": 1.3759547309721681e-05, "loss": 1.141, "step": 4220 }, { "epoch": 1.32, "grad_norm": 0.1943359375, "learning_rate": 1.372582902144103e-05, "loss": 1.2137, "step": 4230 }, { "epoch": 1.32, "grad_norm": 0.169921875, "learning_rate": 1.3692061473126845e-05, "loss": 1.0471, "step": 4240 }, { "epoch": 1.33, "grad_norm": 0.1728515625, "learning_rate": 1.3658245111227571e-05, "loss": 1.0859, "step": 4250 }, { "epoch": 1.33, "grad_norm": 0.28515625, "learning_rate": 1.3624380382837017e-05, "loss": 1.135, "step": 4260 }, { "epoch": 1.33, "grad_norm": 0.375, "learning_rate": 1.3590467735688475e-05, "loss": 1.2119, "step": 4270 }, { "epoch": 1.34, "grad_norm": 0.2080078125, "learning_rate": 1.3556507618148769e-05, "loss": 1.0579, "step": 4280 }, { "epoch": 1.34, "grad_norm": 0.1630859375, "learning_rate": 1.3522500479212337e-05, "loss": 1.1343, "step": 4290 }, { "epoch": 1.34, "grad_norm": 0.185546875, "learning_rate": 1.3488446768495309e-05, "loss": 1.1671, "step": 4300 }, { "epoch": 1.35, "grad_norm": 0.2158203125, "learning_rate": 1.3454346936229547e-05, "loss": 1.1931, "step": 4310 }, { "epoch": 1.35, "grad_norm": 0.2265625, "learning_rate": 1.342020143325669e-05, "loss": 1.131, "step": 4320 }, { "epoch": 1.35, "grad_norm": 0.1611328125, "learning_rate": 1.3386010711022206e-05, "loss": 1.1752, "step": 4330 }, { "epoch": 1.36, "grad_norm": 0.197265625, "learning_rate": 1.3351775221569416e-05, "loss": 1.1737, "step": 4340 }, { "epoch": 1.36, "grad_norm": 0.2119140625, "learning_rate": 1.3317495417533523e-05, "loss": 1.0803, "step": 4350 }, { "epoch": 1.36, "grad_norm": 0.56640625, "learning_rate": 1.3283171752135614e-05, "loss": 1.1451, "step": 4360 }, { "epoch": 1.37, "grad_norm": 0.49609375, "learning_rate": 1.3248804679176679e-05, "loss": 1.0968, "step": 4370 }, { "epoch": 1.37, "grad_norm": 0.345703125, "learning_rate": 1.3214394653031616e-05, "loss": 1.0748, "step": 4380 }, { "epoch": 1.37, "grad_norm": 0.205078125, "learning_rate": 1.3179942128643216e-05, "loss": 1.1289, "step": 4390 }, { "epoch": 1.38, "grad_norm": 0.16015625, "learning_rate": 1.3145447561516138e-05, "loss": 1.1095, "step": 4400 }, { "epoch": 1.38, "grad_norm": 0.5703125, "learning_rate": 1.3110911407710909e-05, "loss": 1.136, "step": 4410 }, { "epoch": 1.38, "grad_norm": 0.267578125, "learning_rate": 1.3076334123837884e-05, "loss": 1.1481, "step": 4420 }, { "epoch": 1.38, "grad_norm": 0.263671875, "learning_rate": 1.3041716167051197e-05, "loss": 1.1945, "step": 4430 }, { "epoch": 1.39, "grad_norm": 0.1552734375, "learning_rate": 1.300705799504273e-05, "loss": 1.1104, "step": 4440 }, { "epoch": 1.39, "grad_norm": 0.21484375, "learning_rate": 1.2972360066036078e-05, "loss": 1.1484, "step": 4450 }, { "epoch": 1.39, "grad_norm": 0.2099609375, "learning_rate": 1.2937622838780444e-05, "loss": 1.0896, "step": 4460 }, { "epoch": 1.4, "grad_norm": 0.453125, "learning_rate": 1.2902846772544625e-05, "loss": 1.0531, "step": 4470 }, { "epoch": 1.4, "grad_norm": 0.1591796875, "learning_rate": 1.2868032327110904e-05, "loss": 1.1475, "step": 4480 }, { "epoch": 1.4, "grad_norm": 0.193359375, "learning_rate": 1.2833179962768988e-05, "loss": 1.1864, "step": 4490 }, { "epoch": 1.41, "grad_norm": 0.2451171875, "learning_rate": 1.2798290140309924e-05, "loss": 1.1529, "step": 4500 }, { "epoch": 1.41, "grad_norm": 1.4765625, "learning_rate": 1.2763363321019986e-05, "loss": 1.1534, "step": 4510 }, { "epoch": 1.41, "grad_norm": 0.150390625, "learning_rate": 1.2728399966674612e-05, "loss": 1.089, "step": 4520 }, { "epoch": 1.42, "grad_norm": 0.1708984375, "learning_rate": 1.2693400539532263e-05, "loss": 1.128, "step": 4530 }, { "epoch": 1.42, "grad_norm": 0.1826171875, "learning_rate": 1.2658365502328329e-05, "loss": 1.0726, "step": 4540 }, { "epoch": 1.42, "grad_norm": 0.27734375, "learning_rate": 1.2623295318269018e-05, "loss": 1.2424, "step": 4550 }, { "epoch": 1.43, "grad_norm": 0.2373046875, "learning_rate": 1.2588190451025209e-05, "loss": 1.2114, "step": 4560 }, { "epoch": 1.43, "grad_norm": 0.2431640625, "learning_rate": 1.2553051364726347e-05, "loss": 1.0968, "step": 4570 }, { "epoch": 1.43, "grad_norm": 2.921875, "learning_rate": 1.2517878523954287e-05, "loss": 1.1075, "step": 4580 }, { "epoch": 1.43, "grad_norm": 0.4140625, "learning_rate": 1.2482672393737164e-05, "loss": 1.1654, "step": 4590 }, { "epoch": 1.44, "grad_norm": 0.15625, "learning_rate": 1.2447433439543239e-05, "loss": 1.0986, "step": 4600 }, { "epoch": 1.44, "grad_norm": 0.40234375, "learning_rate": 1.2412162127274748e-05, "loss": 1.1458, "step": 4610 }, { "epoch": 1.44, "grad_norm": 0.31640625, "learning_rate": 1.2376858923261732e-05, "loss": 1.1728, "step": 4620 }, { "epoch": 1.45, "grad_norm": 0.1845703125, "learning_rate": 1.2341524294255893e-05, "loss": 1.1034, "step": 4630 }, { "epoch": 1.45, "grad_norm": 0.185546875, "learning_rate": 1.2306158707424402e-05, "loss": 1.1559, "step": 4640 }, { "epoch": 1.45, "grad_norm": 0.1865234375, "learning_rate": 1.2270762630343734e-05, "loss": 1.1225, "step": 4650 }, { "epoch": 1.46, "grad_norm": 0.1865234375, "learning_rate": 1.2235336530993475e-05, "loss": 1.1869, "step": 4660 }, { "epoch": 1.46, "grad_norm": 0.2099609375, "learning_rate": 1.2199880877750157e-05, "loss": 1.2204, "step": 4670 }, { "epoch": 1.46, "grad_norm": 0.193359375, "learning_rate": 1.2164396139381029e-05, "loss": 1.0915, "step": 4680 }, { "epoch": 1.47, "grad_norm": 0.234375, "learning_rate": 1.2128882785037905e-05, "loss": 1.1554, "step": 4690 }, { "epoch": 1.47, "grad_norm": 1.1796875, "learning_rate": 1.2093341284250922e-05, "loss": 1.0801, "step": 4700 }, { "epoch": 1.47, "grad_norm": 0.6484375, "learning_rate": 1.205777210692235e-05, "loss": 1.1295, "step": 4710 }, { "epoch": 1.48, "grad_norm": 0.259765625, "learning_rate": 1.2022175723320382e-05, "loss": 1.0752, "step": 4720 }, { "epoch": 1.48, "grad_norm": 0.240234375, "learning_rate": 1.19865526040729e-05, "loss": 1.0692, "step": 4730 }, { "epoch": 1.48, "grad_norm": 1.3046875, "learning_rate": 1.1950903220161286e-05, "loss": 1.0992, "step": 4740 }, { "epoch": 1.48, "grad_norm": 0.296875, "learning_rate": 1.1915228042914144e-05, "loss": 1.1283, "step": 4750 }, { "epoch": 1.49, "grad_norm": 0.185546875, "learning_rate": 1.187952754400112e-05, "loss": 1.1302, "step": 4760 }, { "epoch": 1.49, "grad_norm": 0.1552734375, "learning_rate": 1.1843802195426634e-05, "loss": 1.1006, "step": 4770 }, { "epoch": 1.49, "grad_norm": 0.50390625, "learning_rate": 1.1808052469523654e-05, "loss": 1.088, "step": 4780 }, { "epoch": 1.5, "grad_norm": 0.240234375, "learning_rate": 1.1772278838947442e-05, "loss": 1.155, "step": 4790 }, { "epoch": 1.5, "grad_norm": 0.625, "learning_rate": 1.1736481776669307e-05, "loss": 1.1309, "step": 4800 }, { "epoch": 1.5, "grad_norm": 0.1787109375, "learning_rate": 1.1700661755970357e-05, "loss": 1.1405, "step": 4810 }, { "epoch": 1.51, "grad_norm": 0.25, "learning_rate": 1.1664819250435246e-05, "loss": 1.1056, "step": 4820 }, { "epoch": 1.51, "grad_norm": 0.216796875, "learning_rate": 1.162895473394589e-05, "loss": 1.1191, "step": 4830 }, { "epoch": 1.51, "grad_norm": 0.3125, "learning_rate": 1.1593068680675227e-05, "loss": 1.1283, "step": 4840 }, { "epoch": 1.52, "grad_norm": 0.2080078125, "learning_rate": 1.155716156508094e-05, "loss": 1.1046, "step": 4850 }, { "epoch": 1.52, "grad_norm": 0.234375, "learning_rate": 1.1521233861899168e-05, "loss": 1.1485, "step": 4860 }, { "epoch": 1.52, "grad_norm": 0.2216796875, "learning_rate": 1.1485286046138259e-05, "loss": 1.0905, "step": 4870 }, { "epoch": 1.52, "grad_norm": 0.33203125, "learning_rate": 1.1449318593072468e-05, "loss": 1.1524, "step": 4880 }, { "epoch": 1.53, "grad_norm": 1.953125, "learning_rate": 1.1413331978235677e-05, "loss": 1.206, "step": 4890 }, { "epoch": 1.53, "grad_norm": 0.3046875, "learning_rate": 1.1377326677415108e-05, "loss": 1.1059, "step": 4900 }, { "epoch": 1.53, "grad_norm": 0.49609375, "learning_rate": 1.1341303166645043e-05, "loss": 1.151, "step": 4910 }, { "epoch": 1.54, "grad_norm": 0.2080078125, "learning_rate": 1.130526192220052e-05, "loss": 1.131, "step": 4920 }, { "epoch": 1.54, "grad_norm": 0.2001953125, "learning_rate": 1.1269203420591024e-05, "loss": 1.279, "step": 4930 }, { "epoch": 1.54, "grad_norm": 0.1533203125, "learning_rate": 1.1233128138554222e-05, "loss": 1.1143, "step": 4940 }, { "epoch": 1.55, "grad_norm": 0.1923828125, "learning_rate": 1.1197036553049626e-05, "loss": 1.1559, "step": 4950 }, { "epoch": 1.55, "grad_norm": 0.48046875, "learning_rate": 1.1160929141252303e-05, "loss": 1.1276, "step": 4960 }, { "epoch": 1.55, "grad_norm": 0.162109375, "learning_rate": 1.1124806380546564e-05, "loss": 1.1622, "step": 4970 }, { "epoch": 1.56, "grad_norm": 0.83984375, "learning_rate": 1.1088668748519646e-05, "loss": 1.1724, "step": 4980 }, { "epoch": 1.56, "grad_norm": 0.341796875, "learning_rate": 1.1052516722955412e-05, "loss": 1.156, "step": 4990 }, { "epoch": 1.56, "grad_norm": 0.404296875, "learning_rate": 1.101635078182802e-05, "loss": 1.1908, "step": 5000 }, { "epoch": 1.57, "grad_norm": 0.15625, "learning_rate": 1.098017140329561e-05, "loss": 1.1212, "step": 5010 }, { "epoch": 1.57, "grad_norm": 0.2431640625, "learning_rate": 1.0943979065693974e-05, "loss": 1.1628, "step": 5020 }, { "epoch": 1.57, "grad_norm": 0.1953125, "learning_rate": 1.0907774247530252e-05, "loss": 1.1384, "step": 5030 }, { "epoch": 1.57, "grad_norm": 0.25, "learning_rate": 1.0871557427476585e-05, "loss": 1.1181, "step": 5040 }, { "epoch": 1.58, "grad_norm": 0.19921875, "learning_rate": 1.0835329084363787e-05, "loss": 1.1326, "step": 5050 }, { "epoch": 1.58, "grad_norm": 0.478515625, "learning_rate": 1.0799089697175041e-05, "loss": 1.1383, "step": 5060 }, { "epoch": 1.58, "grad_norm": 0.427734375, "learning_rate": 1.0762839745039526e-05, "loss": 1.2123, "step": 5070 }, { "epoch": 1.59, "grad_norm": 0.291015625, "learning_rate": 1.0726579707226108e-05, "loss": 1.0768, "step": 5080 }, { "epoch": 1.59, "grad_norm": 0.384765625, "learning_rate": 1.0690310063137003e-05, "loss": 1.1651, "step": 5090 }, { "epoch": 1.59, "grad_norm": 0.1845703125, "learning_rate": 1.0654031292301432e-05, "loss": 1.0896, "step": 5100 }, { "epoch": 1.6, "grad_norm": 0.1943359375, "learning_rate": 1.0617743874369282e-05, "loss": 1.1552, "step": 5110 }, { "epoch": 1.6, "grad_norm": 0.169921875, "learning_rate": 1.0581448289104759e-05, "loss": 1.1278, "step": 5120 }, { "epoch": 1.6, "grad_norm": 0.205078125, "learning_rate": 1.0545145016380065e-05, "loss": 1.0809, "step": 5130 }, { "epoch": 1.61, "grad_norm": 0.1767578125, "learning_rate": 1.0508834536169028e-05, "loss": 1.1482, "step": 5140 }, { "epoch": 1.61, "grad_norm": 0.72265625, "learning_rate": 1.047251732854077e-05, "loss": 1.1308, "step": 5150 }, { "epoch": 1.61, "grad_norm": 0.296875, "learning_rate": 1.0436193873653362e-05, "loss": 1.156, "step": 5160 }, { "epoch": 1.62, "grad_norm": 0.2421875, "learning_rate": 1.0399864651747467e-05, "loss": 1.0906, "step": 5170 }, { "epoch": 1.62, "grad_norm": 0.21484375, "learning_rate": 1.036353014314e-05, "loss": 1.0996, "step": 5180 }, { "epoch": 1.62, "grad_norm": 0.15234375, "learning_rate": 1.0327190828217763e-05, "loss": 1.1608, "step": 5190 }, { "epoch": 1.62, "grad_norm": 0.232421875, "learning_rate": 1.0290847187431115e-05, "loss": 1.1294, "step": 5200 }, { "epoch": 1.63, "grad_norm": 0.326171875, "learning_rate": 1.0254499701287604e-05, "loss": 1.157, "step": 5210 }, { "epoch": 1.63, "grad_norm": 0.1787109375, "learning_rate": 1.0218148850345613e-05, "loss": 1.0994, "step": 5220 }, { "epoch": 1.63, "grad_norm": 8.0, "learning_rate": 1.0181795115208017e-05, "loss": 1.1308, "step": 5230 }, { "epoch": 1.64, "grad_norm": 0.443359375, "learning_rate": 1.014543897651583e-05, "loss": 1.1917, "step": 5240 }, { "epoch": 1.64, "grad_norm": 0.48046875, "learning_rate": 1.0109080914941825e-05, "loss": 1.1694, "step": 5250 }, { "epoch": 1.64, "grad_norm": 0.2734375, "learning_rate": 1.007272141118422e-05, "loss": 1.2024, "step": 5260 }, { "epoch": 1.65, "grad_norm": 0.328125, "learning_rate": 1.003636094596028e-05, "loss": 1.0688, "step": 5270 }, { "epoch": 1.65, "grad_norm": 0.1572265625, "learning_rate": 1e-05, "loss": 1.1478, "step": 5280 }, { "epoch": 1.65, "grad_norm": 0.330078125, "learning_rate": 9.963639054039722e-06, "loss": 1.1068, "step": 5290 }, { "epoch": 1.66, "grad_norm": 0.65234375, "learning_rate": 9.927278588815786e-06, "loss": 1.1371, "step": 5300 }, { "epoch": 1.66, "grad_norm": 0.2109375, "learning_rate": 9.890919085058179e-06, "loss": 1.2268, "step": 5310 }, { "epoch": 1.66, "grad_norm": 0.2294921875, "learning_rate": 9.854561023484174e-06, "loss": 1.2387, "step": 5320 }, { "epoch": 1.67, "grad_norm": 0.2001953125, "learning_rate": 9.818204884791983e-06, "loss": 1.2898, "step": 5330 }, { "epoch": 1.67, "grad_norm": 0.38671875, "learning_rate": 9.78185114965439e-06, "loss": 1.1426, "step": 5340 }, { "epoch": 1.67, "grad_norm": 0.1845703125, "learning_rate": 9.7455002987124e-06, "loss": 1.1677, "step": 5350 }, { "epoch": 1.68, "grad_norm": 0.16015625, "learning_rate": 9.709152812568886e-06, "loss": 1.091, "step": 5360 }, { "epoch": 1.68, "grad_norm": 0.1689453125, "learning_rate": 9.67280917178224e-06, "loss": 1.1368, "step": 5370 }, { "epoch": 1.68, "grad_norm": 0.2373046875, "learning_rate": 9.636469856860005e-06, "loss": 1.1298, "step": 5380 }, { "epoch": 1.68, "grad_norm": 0.25390625, "learning_rate": 9.600135348252535e-06, "loss": 1.1522, "step": 5390 }, { "epoch": 1.69, "grad_norm": 0.177734375, "learning_rate": 9.563806126346643e-06, "loss": 1.1137, "step": 5400 }, { "epoch": 1.69, "grad_norm": 0.193359375, "learning_rate": 9.527482671459233e-06, "loss": 1.178, "step": 5410 }, { "epoch": 1.69, "grad_norm": 0.255859375, "learning_rate": 9.491165463830975e-06, "loss": 1.1467, "step": 5420 }, { "epoch": 1.7, "grad_norm": 0.240234375, "learning_rate": 9.454854983619936e-06, "loss": 1.1423, "step": 5430 }, { "epoch": 1.7, "grad_norm": 0.25, "learning_rate": 9.418551710895243e-06, "loss": 1.2355, "step": 5440 }, { "epoch": 1.7, "grad_norm": 0.1630859375, "learning_rate": 9.382256125630722e-06, "loss": 1.1157, "step": 5450 }, { "epoch": 1.71, "grad_norm": 0.298828125, "learning_rate": 9.34596870769857e-06, "loss": 1.1808, "step": 5460 }, { "epoch": 1.71, "grad_norm": 0.53125, "learning_rate": 9.309689936863002e-06, "loss": 1.1627, "step": 5470 }, { "epoch": 1.71, "grad_norm": 0.2734375, "learning_rate": 9.273420292773895e-06, "loss": 1.2524, "step": 5480 }, { "epoch": 1.72, "grad_norm": 0.16796875, "learning_rate": 9.237160254960477e-06, "loss": 1.247, "step": 5490 }, { "epoch": 1.72, "grad_norm": 0.21484375, "learning_rate": 9.200910302824964e-06, "loss": 1.1416, "step": 5500 }, { "epoch": 1.72, "grad_norm": 0.66796875, "learning_rate": 9.164670915636214e-06, "loss": 1.1514, "step": 5510 }, { "epoch": 1.73, "grad_norm": 0.1494140625, "learning_rate": 9.128442572523418e-06, "loss": 1.0619, "step": 5520 }, { "epoch": 1.73, "grad_norm": 0.19140625, "learning_rate": 9.09222575246975e-06, "loss": 1.1393, "step": 5530 }, { "epoch": 1.73, "grad_norm": 0.3984375, "learning_rate": 9.05602093430603e-06, "loss": 1.0532, "step": 5540 }, { "epoch": 1.73, "grad_norm": 0.1591796875, "learning_rate": 9.019828596704394e-06, "loss": 1.1563, "step": 5550 }, { "epoch": 1.74, "grad_norm": 0.431640625, "learning_rate": 8.983649218171981e-06, "loss": 1.1124, "step": 5560 }, { "epoch": 1.74, "grad_norm": 0.431640625, "learning_rate": 8.947483277044593e-06, "loss": 1.1028, "step": 5570 }, { "epoch": 1.74, "grad_norm": 0.37109375, "learning_rate": 8.911331251480357e-06, "loss": 1.2788, "step": 5580 }, { "epoch": 1.75, "grad_norm": 0.7734375, "learning_rate": 8.875193619453438e-06, "loss": 1.1199, "step": 5590 }, { "epoch": 1.75, "grad_norm": 0.185546875, "learning_rate": 8.839070858747697e-06, "loss": 1.1208, "step": 5600 }, { "epoch": 1.75, "grad_norm": 0.375, "learning_rate": 8.802963446950378e-06, "loss": 1.1418, "step": 5610 }, { "epoch": 1.76, "grad_norm": 0.470703125, "learning_rate": 8.76687186144578e-06, "loss": 1.0694, "step": 5620 }, { "epoch": 1.76, "grad_norm": 0.255859375, "learning_rate": 8.730796579408976e-06, "loss": 1.1452, "step": 5630 }, { "epoch": 1.76, "grad_norm": 0.439453125, "learning_rate": 8.694738077799487e-06, "loss": 1.1319, "step": 5640 }, { "epoch": 1.77, "grad_norm": 0.2470703125, "learning_rate": 8.658696833354959e-06, "loss": 1.1615, "step": 5650 }, { "epoch": 1.77, "grad_norm": 0.2392578125, "learning_rate": 8.622673322584894e-06, "loss": 1.1194, "step": 5660 }, { "epoch": 1.77, "grad_norm": 0.28125, "learning_rate": 8.586668021764328e-06, "loss": 1.1372, "step": 5670 }, { "epoch": 1.77, "grad_norm": 0.185546875, "learning_rate": 8.550681406927534e-06, "loss": 1.1236, "step": 5680 }, { "epoch": 1.78, "grad_norm": 0.62109375, "learning_rate": 8.514713953861743e-06, "loss": 1.1411, "step": 5690 }, { "epoch": 1.78, "grad_norm": 0.1728515625, "learning_rate": 8.478766138100834e-06, "loss": 1.1524, "step": 5700 }, { "epoch": 1.78, "grad_norm": 0.27734375, "learning_rate": 8.442838434919066e-06, "loss": 1.0989, "step": 5710 }, { "epoch": 1.79, "grad_norm": 0.2099609375, "learning_rate": 8.406931319324776e-06, "loss": 1.105, "step": 5720 }, { "epoch": 1.79, "grad_norm": 0.1982421875, "learning_rate": 8.371045266054114e-06, "loss": 1.1416, "step": 5730 }, { "epoch": 1.79, "grad_norm": 0.484375, "learning_rate": 8.335180749564759e-06, "loss": 0.9857, "step": 5740 }, { "epoch": 1.8, "grad_norm": 0.21875, "learning_rate": 8.299338244029646e-06, "loss": 1.1576, "step": 5750 }, { "epoch": 1.8, "grad_norm": 0.91015625, "learning_rate": 8.263518223330698e-06, "loss": 1.2153, "step": 5760 }, { "epoch": 1.8, "grad_norm": 0.169921875, "learning_rate": 8.227721161052564e-06, "loss": 1.1014, "step": 5770 }, { "epoch": 1.81, "grad_norm": 0.2421875, "learning_rate": 8.191947530476349e-06, "loss": 1.1624, "step": 5780 }, { "epoch": 1.81, "grad_norm": 0.1884765625, "learning_rate": 8.156197804573368e-06, "loss": 1.0864, "step": 5790 }, { "epoch": 1.81, "grad_norm": 0.2197265625, "learning_rate": 8.120472455998882e-06, "loss": 1.219, "step": 5800 }, { "epoch": 1.82, "grad_norm": 0.2265625, "learning_rate": 8.08477195708586e-06, "loss": 1.135, "step": 5810 }, { "epoch": 1.82, "grad_norm": 0.154296875, "learning_rate": 8.04909677983872e-06, "loss": 1.113, "step": 5820 }, { "epoch": 1.82, "grad_norm": 0.4375, "learning_rate": 8.0134473959271e-06, "loss": 1.1021, "step": 5830 }, { "epoch": 1.82, "grad_norm": 0.328125, "learning_rate": 7.977824276679623e-06, "loss": 1.1617, "step": 5840 }, { "epoch": 1.83, "grad_norm": 0.1591796875, "learning_rate": 7.942227893077652e-06, "loss": 1.1427, "step": 5850 }, { "epoch": 1.83, "grad_norm": 0.2001953125, "learning_rate": 7.90665871574908e-06, "loss": 1.1607, "step": 5860 }, { "epoch": 1.83, "grad_norm": 0.220703125, "learning_rate": 7.871117214962096e-06, "loss": 1.1157, "step": 5870 }, { "epoch": 1.84, "grad_norm": 0.2255859375, "learning_rate": 7.835603860618973e-06, "loss": 1.1335, "step": 5880 }, { "epoch": 1.84, "grad_norm": 0.298828125, "learning_rate": 7.800119122249847e-06, "loss": 1.0962, "step": 5890 }, { "epoch": 1.84, "grad_norm": 0.208984375, "learning_rate": 7.764663469006526e-06, "loss": 1.1688, "step": 5900 }, { "epoch": 1.85, "grad_norm": 0.3984375, "learning_rate": 7.72923736965627e-06, "loss": 1.2101, "step": 5910 }, { "epoch": 1.85, "grad_norm": 0.232421875, "learning_rate": 7.6938412925756e-06, "loss": 1.1859, "step": 5920 }, { "epoch": 1.85, "grad_norm": 0.2412109375, "learning_rate": 7.658475705744109e-06, "loss": 1.2033, "step": 5930 }, { "epoch": 1.86, "grad_norm": 0.490234375, "learning_rate": 7.623141076738271e-06, "loss": 1.1897, "step": 5940 }, { "epoch": 1.86, "grad_norm": 0.32421875, "learning_rate": 7.5878378727252565e-06, "loss": 1.1947, "step": 5950 }, { "epoch": 1.86, "grad_norm": 0.2275390625, "learning_rate": 7.552566560456762e-06, "loss": 1.1671, "step": 5960 }, { "epoch": 1.87, "grad_norm": 0.361328125, "learning_rate": 7.5173276062628364e-06, "loss": 1.1535, "step": 5970 }, { "epoch": 1.87, "grad_norm": 0.443359375, "learning_rate": 7.482121476045716e-06, "loss": 1.1131, "step": 5980 }, { "epoch": 1.87, "grad_norm": 0.255859375, "learning_rate": 7.446948635273655e-06, "loss": 1.1528, "step": 5990 }, { "epoch": 1.88, "grad_norm": 0.21484375, "learning_rate": 7.411809548974792e-06, "loss": 1.162, "step": 6000 }, { "epoch": 1.88, "grad_norm": 0.193359375, "learning_rate": 7.376704681730988e-06, "loss": 1.074, "step": 6010 }, { "epoch": 1.88, "grad_norm": 0.2275390625, "learning_rate": 7.341634497671673e-06, "loss": 1.1559, "step": 6020 }, { "epoch": 1.88, "grad_norm": 0.326171875, "learning_rate": 7.306599460467741e-06, "loss": 1.1815, "step": 6030 }, { "epoch": 1.89, "grad_norm": 0.169921875, "learning_rate": 7.271600033325393e-06, "loss": 1.1228, "step": 6040 }, { "epoch": 1.89, "grad_norm": 0.88671875, "learning_rate": 7.236636678980018e-06, "loss": 1.1813, "step": 6050 }, { "epoch": 1.89, "grad_norm": 0.83984375, "learning_rate": 7.201709859690081e-06, "loss": 1.1252, "step": 6060 }, { "epoch": 1.9, "grad_norm": 0.17578125, "learning_rate": 7.1668200372310124e-06, "loss": 1.0943, "step": 6070 }, { "epoch": 1.9, "grad_norm": 0.1728515625, "learning_rate": 7.131967672889101e-06, "loss": 1.119, "step": 6080 }, { "epoch": 1.9, "grad_norm": 0.27734375, "learning_rate": 7.097153227455379e-06, "loss": 1.1186, "step": 6090 }, { "epoch": 1.91, "grad_norm": 0.53125, "learning_rate": 7.062377161219556e-06, "loss": 1.1206, "step": 6100 }, { "epoch": 1.91, "grad_norm": 0.2265625, "learning_rate": 7.027639933963928e-06, "loss": 1.2089, "step": 6110 }, { "epoch": 1.91, "grad_norm": 0.400390625, "learning_rate": 6.992942004957271e-06, "loss": 1.1223, "step": 6120 }, { "epoch": 1.92, "grad_norm": 0.154296875, "learning_rate": 6.958283832948807e-06, "loss": 1.123, "step": 6130 }, { "epoch": 1.92, "grad_norm": 0.1572265625, "learning_rate": 6.923665876162118e-06, "loss": 1.2252, "step": 6140 }, { "epoch": 1.92, "grad_norm": 0.28515625, "learning_rate": 6.889088592289092e-06, "loss": 1.1673, "step": 6150 }, { "epoch": 1.93, "grad_norm": 0.162109375, "learning_rate": 6.854552438483866e-06, "loss": 1.1382, "step": 6160 }, { "epoch": 1.93, "grad_norm": 0.50390625, "learning_rate": 6.820057871356786e-06, "loss": 1.2062, "step": 6170 }, { "epoch": 1.93, "grad_norm": 0.2197265625, "learning_rate": 6.785605346968387e-06, "loss": 1.1328, "step": 6180 }, { "epoch": 1.93, "grad_norm": 0.228515625, "learning_rate": 6.751195320823325e-06, "loss": 1.0434, "step": 6190 }, { "epoch": 1.94, "grad_norm": 0.294921875, "learning_rate": 6.716828247864391e-06, "loss": 1.1365, "step": 6200 }, { "epoch": 1.94, "grad_norm": 0.2158203125, "learning_rate": 6.682504582466482e-06, "loss": 1.1761, "step": 6210 }, { "epoch": 1.94, "grad_norm": 0.1923828125, "learning_rate": 6.648224778430585e-06, "loss": 1.2372, "step": 6220 }, { "epoch": 1.95, "grad_norm": 0.171875, "learning_rate": 6.613989288977798e-06, "loss": 1.1305, "step": 6230 }, { "epoch": 1.95, "grad_norm": 0.23828125, "learning_rate": 6.579798566743314e-06, "loss": 1.0702, "step": 6240 }, { "epoch": 1.95, "grad_norm": 0.35546875, "learning_rate": 6.545653063770458e-06, "loss": 1.0872, "step": 6250 }, { "epoch": 1.96, "grad_norm": 0.216796875, "learning_rate": 6.5115532315046935e-06, "loss": 1.178, "step": 6260 }, { "epoch": 1.96, "grad_norm": 0.21875, "learning_rate": 6.4774995207876654e-06, "loss": 1.131, "step": 6270 }, { "epoch": 1.96, "grad_norm": 0.1552734375, "learning_rate": 6.443492381851237e-06, "loss": 1.1188, "step": 6280 }, { "epoch": 1.97, "grad_norm": 0.21484375, "learning_rate": 6.409532264311529e-06, "loss": 1.1051, "step": 6290 }, { "epoch": 1.97, "grad_norm": 0.2392578125, "learning_rate": 6.375619617162985e-06, "loss": 1.137, "step": 6300 }, { "epoch": 1.97, "grad_norm": 0.359375, "learning_rate": 6.3417548887724354e-06, "loss": 1.1259, "step": 6310 }, { "epoch": 1.98, "grad_norm": 0.38671875, "learning_rate": 6.3079385268731575e-06, "loss": 1.1637, "step": 6320 }, { "epoch": 1.98, "grad_norm": 0.205078125, "learning_rate": 6.274170978558971e-06, "loss": 1.1434, "step": 6330 }, { "epoch": 1.98, "grad_norm": 0.2197265625, "learning_rate": 6.2404526902783205e-06, "loss": 1.1927, "step": 6340 }, { "epoch": 1.98, "grad_norm": 0.2890625, "learning_rate": 6.206784107828367e-06, "loss": 1.1617, "step": 6350 }, { "epoch": 1.99, "grad_norm": 0.69921875, "learning_rate": 6.173165676349103e-06, "loss": 1.06, "step": 6360 }, { "epoch": 1.99, "grad_norm": 0.169921875, "learning_rate": 6.139597840317464e-06, "loss": 1.1713, "step": 6370 }, { "epoch": 1.99, "grad_norm": 0.1787109375, "learning_rate": 6.106081043541452e-06, "loss": 1.1969, "step": 6380 }, { "epoch": 2.0, "grad_norm": 0.22265625, "learning_rate": 6.072615729154261e-06, "loss": 1.0577, "step": 6390 }, { "epoch": 2.0, "grad_norm": 0.28515625, "learning_rate": 6.039202339608432e-06, "loss": 1.2017, "step": 6400 }, { "epoch": 2.0, "grad_norm": 0.197265625, "learning_rate": 6.005841316669996e-06, "loss": 1.2031, "step": 6410 }, { "epoch": 2.01, "grad_norm": 0.234375, "learning_rate": 5.97253310141263e-06, "loss": 1.2074, "step": 6420 }, { "epoch": 2.01, "grad_norm": 0.1904296875, "learning_rate": 5.939278134211824e-06, "loss": 1.2082, "step": 6430 }, { "epoch": 2.01, "grad_norm": 0.1953125, "learning_rate": 5.9060768547390746e-06, "loss": 1.1496, "step": 6440 }, { "epoch": 2.02, "grad_norm": 1.453125, "learning_rate": 5.872929701956054e-06, "loss": 1.2719, "step": 6450 }, { "epoch": 2.02, "grad_norm": 0.244140625, "learning_rate": 5.839837114108811e-06, "loss": 1.13, "step": 6460 }, { "epoch": 2.02, "grad_norm": 0.2021484375, "learning_rate": 5.806799528721985e-06, "loss": 1.0278, "step": 6470 }, { "epoch": 2.02, "grad_norm": 0.20703125, "learning_rate": 5.773817382593008e-06, "loss": 1.1581, "step": 6480 }, { "epoch": 2.03, "grad_norm": 0.232421875, "learning_rate": 5.740891111786342e-06, "loss": 1.1239, "step": 6490 }, { "epoch": 2.03, "grad_norm": 0.1572265625, "learning_rate": 5.708021151627712e-06, "loss": 1.093, "step": 6500 }, { "epoch": 2.03, "grad_norm": 0.177734375, "learning_rate": 5.675207936698337e-06, "loss": 1.1945, "step": 6510 }, { "epoch": 2.04, "grad_norm": 0.37109375, "learning_rate": 5.642451900829209e-06, "loss": 1.1466, "step": 6520 }, { "epoch": 2.04, "grad_norm": 0.1494140625, "learning_rate": 5.609753477095324e-06, "loss": 1.12, "step": 6530 }, { "epoch": 2.04, "grad_norm": 0.27734375, "learning_rate": 5.5771130978099896e-06, "loss": 1.2319, "step": 6540 }, { "epoch": 2.05, "grad_norm": 0.1884765625, "learning_rate": 5.5445311945190875e-06, "loss": 1.1012, "step": 6550 }, { "epoch": 2.05, "grad_norm": 0.140625, "learning_rate": 5.512008197995379e-06, "loss": 1.1189, "step": 6560 }, { "epoch": 2.05, "grad_norm": 0.1728515625, "learning_rate": 5.479544538232804e-06, "loss": 1.1231, "step": 6570 }, { "epoch": 2.06, "grad_norm": 0.423828125, "learning_rate": 5.447140644440798e-06, "loss": 1.1196, "step": 6580 }, { "epoch": 2.06, "grad_norm": 0.216796875, "learning_rate": 5.414796945038614e-06, "loss": 1.0255, "step": 6590 }, { "epoch": 2.06, "grad_norm": 0.173828125, "learning_rate": 5.382513867649663e-06, "loss": 1.1331, "step": 6600 }, { "epoch": 2.07, "grad_norm": 0.2255859375, "learning_rate": 5.35029183909586e-06, "loss": 1.1612, "step": 6610 }, { "epoch": 2.07, "grad_norm": 0.177734375, "learning_rate": 5.318131285391981e-06, "loss": 1.1348, "step": 6620 }, { "epoch": 2.07, "grad_norm": 1.1015625, "learning_rate": 5.286032631740023e-06, "loss": 1.1347, "step": 6630 }, { "epoch": 2.08, "grad_norm": 0.2080078125, "learning_rate": 5.253996302523596e-06, "loss": 1.1574, "step": 6640 }, { "epoch": 2.08, "grad_norm": 0.263671875, "learning_rate": 5.2220227213023e-06, "loss": 1.1343, "step": 6650 }, { "epoch": 2.08, "grad_norm": 0.1416015625, "learning_rate": 5.190112310806126e-06, "loss": 1.1487, "step": 6660 }, { "epoch": 2.08, "grad_norm": 0.2333984375, "learning_rate": 5.1582654929298745e-06, "loss": 1.0826, "step": 6670 }, { "epoch": 2.09, "grad_norm": 0.310546875, "learning_rate": 5.1264826887275774e-06, "loss": 1.0898, "step": 6680 }, { "epoch": 2.09, "grad_norm": 0.1767578125, "learning_rate": 5.094764318406921e-06, "loss": 1.1969, "step": 6690 }, { "epoch": 2.09, "grad_norm": 0.77734375, "learning_rate": 5.063110801323697e-06, "loss": 1.1896, "step": 6700 }, { "epoch": 2.1, "grad_norm": 0.189453125, "learning_rate": 5.031522555976263e-06, "loss": 1.1934, "step": 6710 }, { "epoch": 2.1, "grad_norm": 0.2353515625, "learning_rate": 5.000000000000003e-06, "loss": 1.1681, "step": 6720 }, { "epoch": 2.1, "grad_norm": 0.2197265625, "learning_rate": 4.968543550161795e-06, "loss": 1.1093, "step": 6730 }, { "epoch": 2.11, "grad_norm": 0.2060546875, "learning_rate": 4.9371536223545295e-06, "loss": 1.1083, "step": 6740 }, { "epoch": 2.11, "grad_norm": 0.1630859375, "learning_rate": 4.9058306315915826e-06, "loss": 1.1548, "step": 6750 }, { "epoch": 2.11, "grad_norm": 0.251953125, "learning_rate": 4.874574992001348e-06, "loss": 1.0813, "step": 6760 }, { "epoch": 2.12, "grad_norm": 0.1787109375, "learning_rate": 4.843387116821749e-06, "loss": 1.1811, "step": 6770 }, { "epoch": 2.12, "grad_norm": 0.271484375, "learning_rate": 4.812267418394784e-06, "loss": 1.1001, "step": 6780 }, { "epoch": 2.12, "grad_norm": 0.1572265625, "learning_rate": 4.781216308161072e-06, "loss": 1.1128, "step": 6790 }, { "epoch": 2.12, "grad_norm": 0.1904296875, "learning_rate": 4.7502341966544e-06, "loss": 1.1298, "step": 6800 }, { "epoch": 2.13, "grad_norm": 0.1953125, "learning_rate": 4.7193214934963204e-06, "loss": 1.1128, "step": 6810 }, { "epoch": 2.13, "grad_norm": 0.1640625, "learning_rate": 4.688478607390723e-06, "loss": 1.1457, "step": 6820 }, { "epoch": 2.13, "grad_norm": 0.158203125, "learning_rate": 4.657705946118414e-06, "loss": 1.1214, "step": 6830 }, { "epoch": 2.14, "grad_norm": 0.65625, "learning_rate": 4.627003916531761e-06, "loss": 1.1325, "step": 6840 }, { "epoch": 2.14, "grad_norm": 0.22265625, "learning_rate": 4.5963729245492875e-06, "loss": 1.1377, "step": 6850 }, { "epoch": 2.14, "grad_norm": 0.1689453125, "learning_rate": 4.565813375150302e-06, "loss": 1.1611, "step": 6860 }, { "epoch": 2.15, "grad_norm": 0.1552734375, "learning_rate": 4.535325672369567e-06, "loss": 1.1096, "step": 6870 }, { "epoch": 2.15, "grad_norm": 0.5703125, "learning_rate": 4.504910219291941e-06, "loss": 1.0815, "step": 6880 }, { "epoch": 2.15, "grad_norm": 0.169921875, "learning_rate": 4.474567418047053e-06, "loss": 1.0877, "step": 6890 }, { "epoch": 2.16, "grad_norm": 0.890625, "learning_rate": 4.444297669803981e-06, "loss": 1.1295, "step": 6900 }, { "epoch": 2.16, "grad_norm": 0.2021484375, "learning_rate": 4.414101374765953e-06, "loss": 1.1272, "step": 6910 }, { "epoch": 2.16, "grad_norm": 0.232421875, "learning_rate": 4.3839789321650724e-06, "loss": 1.1389, "step": 6920 }, { "epoch": 2.17, "grad_norm": 0.1767578125, "learning_rate": 4.353930740256997e-06, "loss": 1.0919, "step": 6930 }, { "epoch": 2.17, "grad_norm": 0.26171875, "learning_rate": 4.323957196315714e-06, "loss": 1.1784, "step": 6940 }, { "epoch": 2.17, "grad_norm": 0.240234375, "learning_rate": 4.294058696628272e-06, "loss": 1.1252, "step": 6950 }, { "epoch": 2.17, "grad_norm": 0.296875, "learning_rate": 4.264235636489542e-06, "loss": 1.156, "step": 6960 }, { "epoch": 2.18, "grad_norm": 0.2333984375, "learning_rate": 4.234488410196985e-06, "loss": 1.1763, "step": 6970 }, { "epoch": 2.18, "grad_norm": 0.1640625, "learning_rate": 4.20481741104545e-06, "loss": 1.084, "step": 6980 }, { "epoch": 2.18, "grad_norm": 0.1904296875, "learning_rate": 4.17522303132198e-06, "loss": 1.1256, "step": 6990 }, { "epoch": 2.19, "grad_norm": 0.291015625, "learning_rate": 4.1457056623005954e-06, "loss": 1.1413, "step": 7000 }, { "epoch": 2.19, "grad_norm": 0.1689453125, "learning_rate": 4.116265694237155e-06, "loss": 1.1526, "step": 7010 }, { "epoch": 2.19, "grad_norm": 0.32421875, "learning_rate": 4.086903516364179e-06, "loss": 1.1419, "step": 7020 }, { "epoch": 2.2, "grad_norm": 0.83203125, "learning_rate": 4.057619516885699e-06, "loss": 1.1273, "step": 7030 }, { "epoch": 2.2, "grad_norm": 0.271484375, "learning_rate": 4.028414082972141e-06, "loss": 1.1381, "step": 7040 }, { "epoch": 2.2, "grad_norm": 0.185546875, "learning_rate": 3.999287600755192e-06, "loss": 1.1496, "step": 7050 }, { "epoch": 2.21, "grad_norm": 0.2392578125, "learning_rate": 3.970240455322705e-06, "loss": 1.1078, "step": 7060 }, { "epoch": 2.21, "grad_norm": 0.1845703125, "learning_rate": 3.9412730307136e-06, "loss": 1.114, "step": 7070 }, { "epoch": 2.21, "grad_norm": 0.1650390625, "learning_rate": 3.912385709912794e-06, "loss": 1.0728, "step": 7080 }, { "epoch": 2.22, "grad_norm": 0.1943359375, "learning_rate": 3.88357887484613e-06, "loss": 1.1587, "step": 7090 }, { "epoch": 2.22, "grad_norm": 0.25390625, "learning_rate": 3.854852906375326e-06, "loss": 1.1683, "step": 7100 }, { "epoch": 2.22, "grad_norm": 0.19921875, "learning_rate": 3.826208184292952e-06, "loss": 1.1083, "step": 7110 }, { "epoch": 2.23, "grad_norm": 0.177734375, "learning_rate": 3.797645087317401e-06, "loss": 1.2234, "step": 7120 }, { "epoch": 2.23, "grad_norm": 0.1982421875, "learning_rate": 3.7691639930878767e-06, "loss": 1.1747, "step": 7130 }, { "epoch": 2.23, "grad_norm": 0.3359375, "learning_rate": 3.7407652781594094e-06, "loss": 1.1763, "step": 7140 }, { "epoch": 2.23, "grad_norm": 0.169921875, "learning_rate": 3.7124493179978737e-06, "loss": 1.1894, "step": 7150 }, { "epoch": 2.24, "grad_norm": 0.189453125, "learning_rate": 3.6842164869750265e-06, "loss": 1.1198, "step": 7160 }, { "epoch": 2.24, "grad_norm": 0.2236328125, "learning_rate": 3.6560671583635467e-06, "loss": 1.1708, "step": 7170 }, { "epoch": 2.24, "grad_norm": 0.177734375, "learning_rate": 3.628001704332118e-06, "loss": 1.1266, "step": 7180 }, { "epoch": 2.25, "grad_norm": 0.275390625, "learning_rate": 3.600020495940496e-06, "loss": 1.1959, "step": 7190 }, { "epoch": 2.25, "grad_norm": 0.1708984375, "learning_rate": 3.5721239031346067e-06, "loss": 1.0973, "step": 7200 }, { "epoch": 2.25, "grad_norm": 0.2734375, "learning_rate": 3.544312294741652e-06, "loss": 1.0695, "step": 7210 }, { "epoch": 2.26, "grad_norm": 0.2138671875, "learning_rate": 3.5165860384652374e-06, "loss": 1.1086, "step": 7220 }, { "epoch": 2.26, "grad_norm": 0.201171875, "learning_rate": 3.4889455008805107e-06, "loss": 1.1062, "step": 7230 }, { "epoch": 2.26, "grad_norm": 0.2578125, "learning_rate": 3.4613910474293045e-06, "loss": 1.2236, "step": 7240 }, { "epoch": 2.27, "grad_norm": 0.251953125, "learning_rate": 3.4339230424153225e-06, "loss": 1.1529, "step": 7250 }, { "epoch": 2.27, "grad_norm": 0.1787109375, "learning_rate": 3.4065418489993118e-06, "loss": 1.1224, "step": 7260 }, { "epoch": 2.27, "grad_norm": 0.5546875, "learning_rate": 3.3792478291942623e-06, "loss": 1.168, "step": 7270 }, { "epoch": 2.27, "grad_norm": 0.2109375, "learning_rate": 3.3520413438606215e-06, "loss": 1.1042, "step": 7280 }, { "epoch": 2.28, "grad_norm": 0.400390625, "learning_rate": 3.324922752701528e-06, "loss": 1.177, "step": 7290 }, { "epoch": 2.28, "grad_norm": 0.369140625, "learning_rate": 3.2978924142580427e-06, "loss": 1.1132, "step": 7300 }, { "epoch": 2.28, "grad_norm": 0.333984375, "learning_rate": 3.2709506859044248e-06, "loss": 1.1695, "step": 7310 }, { "epoch": 2.29, "grad_norm": 0.1552734375, "learning_rate": 3.2440979238433977e-06, "loss": 1.1658, "step": 7320 }, { "epoch": 2.29, "grad_norm": 0.6875, "learning_rate": 3.217334483101441e-06, "loss": 1.1292, "step": 7330 }, { "epoch": 2.29, "grad_norm": 0.2890625, "learning_rate": 3.1906607175240943e-06, "loss": 1.2035, "step": 7340 }, { "epoch": 2.3, "grad_norm": 0.3046875, "learning_rate": 3.1640769797712865e-06, "loss": 1.2309, "step": 7350 }, { "epoch": 2.3, "grad_norm": 0.19921875, "learning_rate": 3.1375836213126653e-06, "loss": 1.0319, "step": 7360 }, { "epoch": 2.3, "grad_norm": 0.1689453125, "learning_rate": 3.11118099242295e-06, "loss": 1.0977, "step": 7370 }, { "epoch": 2.31, "grad_norm": 0.181640625, "learning_rate": 3.0848694421773075e-06, "loss": 1.1817, "step": 7380 }, { "epoch": 2.31, "grad_norm": 0.3203125, "learning_rate": 3.058649318446736e-06, "loss": 1.1536, "step": 7390 }, { "epoch": 2.31, "grad_norm": 0.259765625, "learning_rate": 3.032520967893453e-06, "loss": 1.0876, "step": 7400 }, { "epoch": 2.32, "grad_norm": 0.2099609375, "learning_rate": 3.0064847359663284e-06, "loss": 1.1101, "step": 7410 }, { "epoch": 2.32, "grad_norm": 0.18359375, "learning_rate": 2.980540966896317e-06, "loss": 1.1642, "step": 7420 }, { "epoch": 2.32, "grad_norm": 0.1591796875, "learning_rate": 2.9546900036918956e-06, "loss": 1.0749, "step": 7430 }, { "epoch": 2.33, "grad_norm": 0.4765625, "learning_rate": 2.9289321881345257e-06, "loss": 1.1889, "step": 7440 }, { "epoch": 2.33, "grad_norm": 0.224609375, "learning_rate": 2.9032678607741526e-06, "loss": 1.1401, "step": 7450 }, { "epoch": 2.33, "grad_norm": 0.21875, "learning_rate": 2.877697360924693e-06, "loss": 1.1381, "step": 7460 }, { "epoch": 2.33, "grad_norm": 0.197265625, "learning_rate": 2.8522210266595386e-06, "loss": 1.1324, "step": 7470 }, { "epoch": 2.34, "grad_norm": 0.337890625, "learning_rate": 2.826839194807105e-06, "loss": 1.1212, "step": 7480 }, { "epoch": 2.34, "grad_norm": 0.1943359375, "learning_rate": 2.8015522009463736e-06, "loss": 1.0972, "step": 7490 }, { "epoch": 2.34, "grad_norm": 0.255859375, "learning_rate": 2.776360379402445e-06, "loss": 1.103, "step": 7500 }, { "epoch": 2.35, "grad_norm": 0.1572265625, "learning_rate": 2.751264063242122e-06, "loss": 1.1275, "step": 7510 }, { "epoch": 2.35, "grad_norm": 0.26171875, "learning_rate": 2.726263584269513e-06, "loss": 1.056, "step": 7520 }, { "epoch": 2.35, "grad_norm": 0.1650390625, "learning_rate": 2.7013592730216464e-06, "loss": 1.0944, "step": 7530 }, { "epoch": 2.36, "grad_norm": 0.19140625, "learning_rate": 2.6765514587640815e-06, "loss": 1.1272, "step": 7540 }, { "epoch": 2.36, "grad_norm": 0.1953125, "learning_rate": 2.651840469486582e-06, "loss": 1.1535, "step": 7550 }, { "epoch": 2.36, "grad_norm": 0.6953125, "learning_rate": 2.6272266318987606e-06, "loss": 1.2144, "step": 7560 }, { "epoch": 2.37, "grad_norm": 0.2109375, "learning_rate": 2.602710271425767e-06, "loss": 1.2021, "step": 7570 }, { "epoch": 2.37, "grad_norm": 1.1328125, "learning_rate": 2.578291712203983e-06, "loss": 1.2018, "step": 7580 }, { "epoch": 2.37, "grad_norm": 0.2734375, "learning_rate": 2.5539712770767377e-06, "loss": 1.198, "step": 7590 }, { "epoch": 2.38, "grad_norm": 0.63671875, "learning_rate": 2.529749287590042e-06, "loss": 1.2152, "step": 7600 }, { "epoch": 2.38, "grad_norm": 0.1796875, "learning_rate": 2.5056260639883278e-06, "loss": 1.1113, "step": 7610 }, { "epoch": 2.38, "grad_norm": 0.189453125, "learning_rate": 2.4816019252102274e-06, "loss": 1.1076, "step": 7620 }, { "epoch": 2.38, "grad_norm": 0.2294921875, "learning_rate": 2.4576771888843478e-06, "loss": 1.0783, "step": 7630 }, { "epoch": 2.39, "grad_norm": 0.177734375, "learning_rate": 2.4338521713250717e-06, "loss": 1.145, "step": 7640 }, { "epoch": 2.39, "grad_norm": 0.5234375, "learning_rate": 2.4101271875283818e-06, "loss": 1.1538, "step": 7650 }, { "epoch": 2.39, "grad_norm": 0.1748046875, "learning_rate": 2.3865025511676896e-06, "loss": 1.063, "step": 7660 }, { "epoch": 2.4, "grad_norm": 0.392578125, "learning_rate": 2.362978574589686e-06, "loss": 1.1177, "step": 7670 }, { "epoch": 2.4, "grad_norm": 0.2236328125, "learning_rate": 2.339555568810221e-06, "loss": 1.1144, "step": 7680 }, { "epoch": 2.4, "grad_norm": 0.2109375, "learning_rate": 2.316233843510186e-06, "loss": 1.0996, "step": 7690 }, { "epoch": 2.41, "grad_norm": 0.193359375, "learning_rate": 2.2930137070314196e-06, "loss": 1.1714, "step": 7700 }, { "epoch": 2.41, "grad_norm": 0.234375, "learning_rate": 2.26989546637263e-06, "loss": 1.1246, "step": 7710 }, { "epoch": 2.41, "grad_norm": 0.1826171875, "learning_rate": 2.246879427185341e-06, "loss": 1.1508, "step": 7720 }, { "epoch": 2.42, "grad_norm": 0.1826171875, "learning_rate": 2.223965893769847e-06, "loss": 1.1693, "step": 7730 }, { "epoch": 2.42, "grad_norm": 0.375, "learning_rate": 2.201155169071184e-06, "loss": 1.0889, "step": 7740 }, { "epoch": 2.42, "grad_norm": 0.25390625, "learning_rate": 2.178447554675136e-06, "loss": 1.149, "step": 7750 }, { "epoch": 2.42, "grad_norm": 0.1904296875, "learning_rate": 2.155843350804243e-06, "loss": 1.1733, "step": 7760 }, { "epoch": 2.43, "grad_norm": 0.3359375, "learning_rate": 2.1333428563138304e-06, "loss": 1.2165, "step": 7770 }, { "epoch": 2.43, "grad_norm": 0.7421875, "learning_rate": 2.110946368688055e-06, "loss": 1.0973, "step": 7780 }, { "epoch": 2.43, "grad_norm": 0.1845703125, "learning_rate": 2.0886541840359776e-06, "loss": 1.1503, "step": 7790 }, { "epoch": 2.44, "grad_norm": 0.185546875, "learning_rate": 2.0664665970876496e-06, "loss": 1.1956, "step": 7800 }, { "epoch": 2.44, "grad_norm": 0.341796875, "learning_rate": 2.0443839011902023e-06, "loss": 1.156, "step": 7810 }, { "epoch": 2.44, "grad_norm": 0.21484375, "learning_rate": 2.0224063883039868e-06, "loss": 1.2088, "step": 7820 }, { "epoch": 2.45, "grad_norm": 0.2734375, "learning_rate": 2.0005343489987038e-06, "loss": 1.1721, "step": 7830 }, { "epoch": 2.45, "grad_norm": 0.2060546875, "learning_rate": 1.9787680724495617e-06, "loss": 1.1056, "step": 7840 }, { "epoch": 2.45, "grad_norm": 0.24609375, "learning_rate": 1.957107846433459e-06, "loss": 1.1742, "step": 7850 }, { "epoch": 2.46, "grad_norm": 0.173828125, "learning_rate": 1.9355539573251737e-06, "loss": 1.0646, "step": 7860 }, { "epoch": 2.46, "grad_norm": 0.271484375, "learning_rate": 1.914106690093581e-06, "loss": 1.186, "step": 7870 }, { "epoch": 2.46, "grad_norm": 1.3671875, "learning_rate": 1.8927663282978781e-06, "loss": 1.1526, "step": 7880 }, { "epoch": 2.47, "grad_norm": 0.18359375, "learning_rate": 1.8715331540838488e-06, "loss": 1.1172, "step": 7890 }, { "epoch": 2.47, "grad_norm": 0.62109375, "learning_rate": 1.8504074481801237e-06, "loss": 1.1363, "step": 7900 }, { "epoch": 2.47, "grad_norm": 0.19921875, "learning_rate": 1.829389489894462e-06, "loss": 1.1328, "step": 7910 }, { "epoch": 2.48, "grad_norm": 0.60546875, "learning_rate": 1.808479557110081e-06, "loss": 1.1105, "step": 7920 }, { "epoch": 2.48, "grad_norm": 0.2109375, "learning_rate": 1.7876779262819633e-06, "loss": 1.0745, "step": 7930 }, { "epoch": 2.48, "grad_norm": 0.5078125, "learning_rate": 1.7669848724331984e-06, "loss": 1.1751, "step": 7940 }, { "epoch": 2.48, "grad_norm": 0.22265625, "learning_rate": 1.7464006691513624e-06, "loss": 1.0984, "step": 7950 }, { "epoch": 2.49, "grad_norm": 0.2158203125, "learning_rate": 1.7259255885848946e-06, "loss": 1.1735, "step": 7960 }, { "epoch": 2.49, "grad_norm": 0.2109375, "learning_rate": 1.7055599014394974e-06, "loss": 1.1037, "step": 7970 }, { "epoch": 2.49, "grad_norm": 0.6015625, "learning_rate": 1.6853038769745466e-06, "loss": 1.1227, "step": 7980 }, { "epoch": 2.5, "grad_norm": 0.1884765625, "learning_rate": 1.6651577829995625e-06, "loss": 1.1774, "step": 7990 }, { "epoch": 2.5, "grad_norm": 0.2138671875, "learning_rate": 1.6451218858706374e-06, "loss": 1.1882, "step": 8000 }, { "epoch": 2.5, "grad_norm": 0.37890625, "learning_rate": 1.6251964504869221e-06, "loss": 1.1326, "step": 8010 }, { "epoch": 2.51, "grad_norm": 0.5859375, "learning_rate": 1.6053817402871363e-06, "loss": 1.1951, "step": 8020 }, { "epoch": 2.51, "grad_norm": 0.1962890625, "learning_rate": 1.585678017246075e-06, "loss": 1.1242, "step": 8030 }, { "epoch": 2.51, "grad_norm": 0.2001953125, "learning_rate": 1.566085541871145e-06, "loss": 1.1974, "step": 8040 }, { "epoch": 2.52, "grad_norm": 0.1943359375, "learning_rate": 1.5466045731989199e-06, "loss": 1.1416, "step": 8050 }, { "epoch": 2.52, "grad_norm": 3.25, "learning_rate": 1.5272353687917197e-06, "loss": 1.1547, "step": 8060 }, { "epoch": 2.52, "grad_norm": 0.2060546875, "learning_rate": 1.5079781847342122e-06, "loss": 1.1367, "step": 8070 }, { "epoch": 2.52, "grad_norm": 0.2275390625, "learning_rate": 1.4888332756300027e-06, "loss": 1.1798, "step": 8080 }, { "epoch": 2.53, "grad_norm": 0.181640625, "learning_rate": 1.4698008945982966e-06, "loss": 1.1475, "step": 8090 }, { "epoch": 2.53, "grad_norm": 0.2109375, "learning_rate": 1.4508812932705364e-06, "loss": 1.1843, "step": 8100 }, { "epoch": 2.53, "grad_norm": 0.248046875, "learning_rate": 1.4320747217870722e-06, "loss": 1.1455, "step": 8110 }, { "epoch": 2.54, "grad_norm": 0.1708984375, "learning_rate": 1.4133814287938707e-06, "loss": 1.1856, "step": 8120 }, { "epoch": 2.54, "grad_norm": 0.2431640625, "learning_rate": 1.3948016614392113e-06, "loss": 1.1715, "step": 8130 }, { "epoch": 2.54, "grad_norm": 0.1845703125, "learning_rate": 1.3763356653704274e-06, "loss": 1.1171, "step": 8140 }, { "epoch": 2.55, "grad_norm": 0.279296875, "learning_rate": 1.357983684730657e-06, "loss": 1.1482, "step": 8150 }, { "epoch": 2.55, "grad_norm": 0.30078125, "learning_rate": 1.339745962155613e-06, "loss": 1.1939, "step": 8160 }, { "epoch": 2.55, "grad_norm": 0.2021484375, "learning_rate": 1.3216227387703795e-06, "loss": 1.1733, "step": 8170 }, { "epoch": 2.56, "grad_norm": 0.197265625, "learning_rate": 1.3036142541862119e-06, "loss": 1.0869, "step": 8180 }, { "epoch": 2.56, "grad_norm": 0.2099609375, "learning_rate": 1.2857207464973876e-06, "loss": 1.0876, "step": 8190 }, { "epoch": 2.56, "grad_norm": 0.259765625, "learning_rate": 1.2679424522780426e-06, "loss": 1.1027, "step": 8200 }, { "epoch": 2.57, "grad_norm": 0.3515625, "learning_rate": 1.2502796065790534e-06, "loss": 1.2209, "step": 8210 }, { "epoch": 2.57, "grad_norm": 0.2314453125, "learning_rate": 1.2327324429249232e-06, "loss": 1.0929, "step": 8220 }, { "epoch": 2.57, "grad_norm": 0.1884765625, "learning_rate": 1.215301193310695e-06, "loss": 1.0845, "step": 8230 }, { "epoch": 2.58, "grad_norm": 1.546875, "learning_rate": 1.1979860881988903e-06, "loss": 1.0954, "step": 8240 }, { "epoch": 2.58, "grad_norm": 0.26171875, "learning_rate": 1.1807873565164507e-06, "loss": 1.1575, "step": 8250 }, { "epoch": 2.58, "grad_norm": 0.3203125, "learning_rate": 1.1637052256517245e-06, "loss": 1.0876, "step": 8260 }, { "epoch": 2.58, "grad_norm": 0.2265625, "learning_rate": 1.146739921451453e-06, "loss": 1.1826, "step": 8270 }, { "epoch": 2.59, "grad_norm": 0.201171875, "learning_rate": 1.129891668217783e-06, "loss": 1.1179, "step": 8280 }, { "epoch": 2.59, "grad_norm": 0.1640625, "learning_rate": 1.1131606887053058e-06, "loss": 1.1667, "step": 8290 }, { "epoch": 2.59, "grad_norm": 0.2041015625, "learning_rate": 1.0965472041181102e-06, "loss": 1.0908, "step": 8300 }, { "epoch": 2.6, "grad_norm": 0.474609375, "learning_rate": 1.0800514341068592e-06, "loss": 1.1571, "step": 8310 }, { "epoch": 2.6, "grad_norm": 0.1787109375, "learning_rate": 1.0636735967658785e-06, "loss": 1.1791, "step": 8320 }, { "epoch": 2.6, "grad_norm": 0.2392578125, "learning_rate": 1.0474139086302848e-06, "loss": 1.2092, "step": 8330 }, { "epoch": 2.61, "grad_norm": 0.265625, "learning_rate": 1.0312725846731174e-06, "loss": 1.125, "step": 8340 }, { "epoch": 2.61, "grad_norm": 0.1904296875, "learning_rate": 1.0152498383024922e-06, "loss": 1.1918, "step": 8350 }, { "epoch": 2.61, "grad_norm": 0.208984375, "learning_rate": 9.993458813587885e-07, "loss": 1.1757, "step": 8360 }, { "epoch": 2.62, "grad_norm": 0.1767578125, "learning_rate": 9.835609241118404e-07, "loss": 1.1526, "step": 8370 }, { "epoch": 2.62, "grad_norm": 0.396484375, "learning_rate": 9.678951752581577e-07, "loss": 1.0743, "step": 8380 }, { "epoch": 2.62, "grad_norm": 0.173828125, "learning_rate": 9.523488419181737e-07, "loss": 1.1601, "step": 8390 }, { "epoch": 2.62, "grad_norm": 0.1591796875, "learning_rate": 9.369221296335007e-07, "loss": 1.1483, "step": 8400 }, { "epoch": 2.63, "grad_norm": 0.1787109375, "learning_rate": 9.216152423642122e-07, "loss": 1.1179, "step": 8410 }, { "epoch": 2.63, "grad_norm": 0.1904296875, "learning_rate": 9.064283824861486e-07, "loss": 1.1955, "step": 8420 }, { "epoch": 2.63, "grad_norm": 0.1845703125, "learning_rate": 8.91361750788241e-07, "loss": 1.1328, "step": 8430 }, { "epoch": 2.64, "grad_norm": 0.15625, "learning_rate": 8.764155464698598e-07, "loss": 1.1189, "step": 8440 }, { "epoch": 2.64, "grad_norm": 0.15625, "learning_rate": 8.615899671381689e-07, "loss": 1.0908, "step": 8450 }, { "epoch": 2.64, "grad_norm": 1.3359375, "learning_rate": 8.468852088055291e-07, "loss": 1.2761, "step": 8460 }, { "epoch": 2.65, "grad_norm": 0.203125, "learning_rate": 8.323014658869e-07, "loss": 1.2368, "step": 8470 }, { "epoch": 2.65, "grad_norm": 0.271484375, "learning_rate": 8.178389311972612e-07, "loss": 1.1072, "step": 8480 }, { "epoch": 2.65, "grad_norm": 0.1923828125, "learning_rate": 8.034977959490775e-07, "loss": 1.1443, "step": 8490 }, { "epoch": 2.66, "grad_norm": 0.390625, "learning_rate": 7.892782497497642e-07, "loss": 1.1778, "step": 8500 }, { "epoch": 2.66, "grad_norm": 0.232421875, "learning_rate": 7.751804805991792e-07, "loss": 1.133, "step": 8510 }, { "epoch": 2.66, "grad_norm": 0.228515625, "learning_rate": 7.612046748871327e-07, "loss": 1.0993, "step": 8520 }, { "epoch": 2.67, "grad_norm": 0.2333984375, "learning_rate": 7.47351017390936e-07, "loss": 1.0747, "step": 8530 }, { "epoch": 2.67, "grad_norm": 0.2431640625, "learning_rate": 7.336196912729488e-07, "loss": 1.1714, "step": 8540 }, { "epoch": 2.67, "grad_norm": 0.2216796875, "learning_rate": 7.200108780781556e-07, "loss": 1.1422, "step": 8550 }, { "epoch": 2.67, "grad_norm": 0.1845703125, "learning_rate": 7.065247577317747e-07, "loss": 1.0799, "step": 8560 }, { "epoch": 2.68, "grad_norm": 0.640625, "learning_rate": 6.931615085368748e-07, "loss": 1.135, "step": 8570 }, { "epoch": 2.68, "grad_norm": 0.193359375, "learning_rate": 6.799213071720156e-07, "loss": 1.121, "step": 8580 }, { "epoch": 2.68, "grad_norm": 0.2890625, "learning_rate": 6.668043286889092e-07, "loss": 1.1067, "step": 8590 }, { "epoch": 2.69, "grad_norm": 0.2333984375, "learning_rate": 6.538107465101162e-07, "loss": 1.1267, "step": 8600 }, { "epoch": 2.69, "grad_norm": 0.1669921875, "learning_rate": 6.409407324267448e-07, "loss": 1.0681, "step": 8610 }, { "epoch": 2.69, "grad_norm": 0.181640625, "learning_rate": 6.281944565961772e-07, "loss": 1.1079, "step": 8620 }, { "epoch": 2.7, "grad_norm": 0.1591796875, "learning_rate": 6.155720875398297e-07, "loss": 1.1112, "step": 8630 }, { "epoch": 2.7, "grad_norm": 0.1982421875, "learning_rate": 6.030737921409169e-07, "loss": 1.1731, "step": 8640 }, { "epoch": 2.7, "grad_norm": 0.81640625, "learning_rate": 5.906997356422461e-07, "loss": 1.1339, "step": 8650 }, { "epoch": 2.71, "grad_norm": 0.486328125, "learning_rate": 5.784500816440353e-07, "loss": 1.1263, "step": 8660 }, { "epoch": 2.71, "grad_norm": 0.267578125, "learning_rate": 5.663249921017477e-07, "loss": 1.1395, "step": 8670 }, { "epoch": 2.71, "grad_norm": 0.228515625, "learning_rate": 5.543246273239533e-07, "loss": 1.1303, "step": 8680 }, { "epoch": 2.72, "grad_norm": 0.154296875, "learning_rate": 5.424491459702053e-07, "loss": 1.1496, "step": 8690 }, { "epoch": 2.72, "grad_norm": 0.294921875, "learning_rate": 5.306987050489442e-07, "loss": 1.1948, "step": 8700 }, { "epoch": 2.72, "grad_norm": 0.171875, "learning_rate": 5.190734599154257e-07, "loss": 1.1804, "step": 8710 }, { "epoch": 2.73, "grad_norm": 0.2060546875, "learning_rate": 5.075735642696611e-07, "loss": 1.1113, "step": 8720 }, { "epoch": 2.73, "grad_norm": 0.1640625, "learning_rate": 4.961991701543889e-07, "loss": 1.102, "step": 8730 }, { "epoch": 2.73, "grad_norm": 0.2001953125, "learning_rate": 4.849504279530637e-07, "loss": 1.085, "step": 8740 }, { "epoch": 2.73, "grad_norm": 0.255859375, "learning_rate": 4.7382748638786336e-07, "loss": 1.2166, "step": 8750 }, { "epoch": 2.74, "grad_norm": 0.271484375, "learning_rate": 4.628304925177318e-07, "loss": 1.1139, "step": 8760 }, { "epoch": 2.74, "grad_norm": 0.1982421875, "learning_rate": 4.519595917364272e-07, "loss": 1.1451, "step": 8770 }, { "epoch": 2.74, "grad_norm": 0.353515625, "learning_rate": 4.412149277706046e-07, "loss": 1.0985, "step": 8780 }, { "epoch": 2.75, "grad_norm": 0.1669921875, "learning_rate": 4.305966426779118e-07, "loss": 1.1081, "step": 8790 }, { "epoch": 2.75, "grad_norm": 0.1787109375, "learning_rate": 4.2010487684511105e-07, "loss": 1.0521, "step": 8800 }, { "epoch": 2.75, "grad_norm": 0.25, "learning_rate": 4.0973976898622923e-07, "loss": 1.1641, "step": 8810 }, { "epoch": 2.76, "grad_norm": 0.28515625, "learning_rate": 3.99501456140714e-07, "loss": 1.162, "step": 8820 }, { "epoch": 2.76, "grad_norm": 0.380859375, "learning_rate": 3.893900736716305e-07, "loss": 1.1032, "step": 8830 }, { "epoch": 2.76, "grad_norm": 0.1796875, "learning_rate": 3.794057552638686e-07, "loss": 1.1375, "step": 8840 }, { "epoch": 2.77, "grad_norm": 0.3125, "learning_rate": 3.6954863292237297e-07, "loss": 1.148, "step": 8850 }, { "epoch": 2.77, "grad_norm": 0.1767578125, "learning_rate": 3.5981883697040363e-07, "loss": 1.1263, "step": 8860 }, { "epoch": 2.77, "grad_norm": 0.2080078125, "learning_rate": 3.5021649604780714e-07, "loss": 1.0826, "step": 8870 }, { "epoch": 2.77, "grad_norm": 0.2490234375, "learning_rate": 3.4074173710931804e-07, "loss": 1.1399, "step": 8880 }, { "epoch": 2.78, "grad_norm": 0.193359375, "learning_rate": 3.3139468542288e-07, "loss": 1.1834, "step": 8890 }, { "epoch": 2.78, "grad_norm": 0.34375, "learning_rate": 3.2217546456799086e-07, "loss": 1.0919, "step": 8900 }, { "epoch": 2.78, "grad_norm": 0.478515625, "learning_rate": 3.1308419643406915e-07, "loss": 1.1491, "step": 8910 }, { "epoch": 2.79, "grad_norm": 0.23828125, "learning_rate": 3.0412100121884005e-07, "loss": 1.1358, "step": 8920 }, { "epoch": 2.79, "grad_norm": 0.1572265625, "learning_rate": 2.9528599742674635e-07, "loss": 1.1614, "step": 8930 }, { "epoch": 2.79, "grad_norm": 0.30078125, "learning_rate": 2.865793018673857e-07, "loss": 1.1033, "step": 8940 }, { "epoch": 2.8, "grad_norm": 0.2333984375, "learning_rate": 2.780010296539615e-07, "loss": 1.1511, "step": 8950 }, { "epoch": 2.8, "grad_norm": 0.287109375, "learning_rate": 2.6955129420176193e-07, "loss": 1.1184, "step": 8960 }, { "epoch": 2.8, "grad_norm": 0.271484375, "learning_rate": 2.612302072266637e-07, "loss": 1.1031, "step": 8970 }, { "epoch": 2.81, "grad_norm": 0.1787109375, "learning_rate": 2.530378787436527e-07, "loss": 1.1486, "step": 8980 }, { "epoch": 2.81, "grad_norm": 0.2578125, "learning_rate": 2.449744170653645e-07, "loss": 1.1128, "step": 8990 }, { "epoch": 2.81, "grad_norm": 0.2890625, "learning_rate": 2.370399288006664e-07, "loss": 1.1861, "step": 9000 }, { "epoch": 2.82, "grad_norm": 0.65234375, "learning_rate": 2.292345188532308e-07, "loss": 1.1236, "step": 9010 }, { "epoch": 2.82, "grad_norm": 0.2490234375, "learning_rate": 2.2155829042015963e-07, "loss": 1.1608, "step": 9020 }, { "epoch": 2.82, "grad_norm": 0.150390625, "learning_rate": 2.140113449906167e-07, "loss": 1.0921, "step": 9030 }, { "epoch": 2.83, "grad_norm": 0.248046875, "learning_rate": 2.0659378234448524e-07, "loss": 1.1844, "step": 9040 }, { "epoch": 2.83, "grad_norm": 0.1630859375, "learning_rate": 1.9930570055104903e-07, "loss": 1.0844, "step": 9050 }, { "epoch": 2.83, "grad_norm": 0.2021484375, "learning_rate": 1.921471959676957e-07, "loss": 1.0501, "step": 9060 }, { "epoch": 2.83, "grad_norm": 0.19140625, "learning_rate": 1.851183632386444e-07, "loss": 1.0818, "step": 9070 }, { "epoch": 2.84, "grad_norm": 0.193359375, "learning_rate": 1.7821929529369343e-07, "loss": 1.1264, "step": 9080 }, { "epoch": 2.84, "grad_norm": 0.298828125, "learning_rate": 1.7145008334698898e-07, "loss": 1.1732, "step": 9090 }, { "epoch": 2.84, "grad_norm": 0.259765625, "learning_rate": 1.648108168958229e-07, "loss": 1.1637, "step": 9100 }, { "epoch": 2.85, "grad_norm": 0.2109375, "learning_rate": 1.5830158371944793e-07, "loss": 1.2041, "step": 9110 }, { "epoch": 2.85, "grad_norm": 0.205078125, "learning_rate": 1.519224698779198e-07, "loss": 1.1201, "step": 9120 }, { "epoch": 2.85, "grad_norm": 0.16796875, "learning_rate": 1.4567355971095266e-07, "loss": 1.1199, "step": 9130 }, { "epoch": 2.86, "grad_norm": 0.1787109375, "learning_rate": 1.395549358368087e-07, "loss": 1.1063, "step": 9140 }, { "epoch": 2.86, "grad_norm": 0.310546875, "learning_rate": 1.3356667915121025e-07, "loss": 1.1265, "step": 9150 }, { "epoch": 2.86, "grad_norm": 0.1708984375, "learning_rate": 1.2770886882625954e-07, "loss": 1.1396, "step": 9160 }, { "epoch": 2.87, "grad_norm": 0.205078125, "learning_rate": 1.219815823094006e-07, "loss": 1.0873, "step": 9170 }, { "epoch": 2.87, "grad_norm": 0.296875, "learning_rate": 1.1638489532239339e-07, "loss": 1.1613, "step": 9180 }, { "epoch": 2.87, "grad_norm": 0.365234375, "learning_rate": 1.1091888186030908e-07, "loss": 1.117, "step": 9190 }, { "epoch": 2.88, "grad_norm": 0.26171875, "learning_rate": 1.055836141905553e-07, "loss": 1.1733, "step": 9200 }, { "epoch": 2.88, "grad_norm": 0.380859375, "learning_rate": 1.0037916285192129e-07, "loss": 1.1751, "step": 9210 }, { "epoch": 2.88, "grad_norm": 0.15234375, "learning_rate": 9.530559665364203e-08, "loss": 1.1405, "step": 9220 }, { "epoch": 2.88, "grad_norm": 0.212890625, "learning_rate": 9.036298267449228e-08, "loss": 1.0727, "step": 9230 }, { "epoch": 2.89, "grad_norm": 0.578125, "learning_rate": 8.555138626189619e-08, "loss": 1.052, "step": 9240 }, { "epoch": 2.89, "grad_norm": 0.1728515625, "learning_rate": 8.087087103106461e-08, "loss": 1.1398, "step": 9250 }, { "epoch": 2.89, "grad_norm": 0.1904296875, "learning_rate": 7.632149886415363e-08, "loss": 1.0789, "step": 9260 }, { "epoch": 2.9, "grad_norm": 0.189453125, "learning_rate": 7.19033299094496e-08, "loss": 1.1432, "step": 9270 }, { "epoch": 2.9, "grad_norm": 0.60546875, "learning_rate": 6.761642258056977e-08, "loss": 1.0849, "step": 9280 }, { "epoch": 2.9, "grad_norm": 0.20703125, "learning_rate": 6.346083355569188e-08, "loss": 1.1329, "step": 9290 }, { "epoch": 2.91, "grad_norm": 0.2158203125, "learning_rate": 5.943661777680354e-08, "loss": 1.1733, "step": 9300 }, { "epoch": 2.91, "grad_norm": 0.2041015625, "learning_rate": 5.554382844897843e-08, "loss": 1.1316, "step": 9310 }, { "epoch": 2.91, "grad_norm": 0.240234375, "learning_rate": 5.178251703967019e-08, "loss": 1.238, "step": 9320 }, { "epoch": 2.92, "grad_norm": 0.1767578125, "learning_rate": 4.815273327803183e-08, "loss": 1.2028, "step": 9330 }, { "epoch": 2.92, "grad_norm": 0.2119140625, "learning_rate": 4.4654525154260717e-08, "loss": 1.1178, "step": 9340 }, { "epoch": 2.92, "grad_norm": 0.25, "learning_rate": 4.128793891896465e-08, "loss": 1.1779, "step": 9350 }, { "epoch": 2.92, "grad_norm": 0.1923828125, "learning_rate": 3.805301908254455e-08, "loss": 1.1201, "step": 9360 }, { "epoch": 2.93, "grad_norm": 0.1875, "learning_rate": 3.4949808414612705e-08, "loss": 1.1082, "step": 9370 }, { "epoch": 2.93, "grad_norm": 0.3828125, "learning_rate": 3.197834794342436e-08, "loss": 1.0979, "step": 9380 }, { "epoch": 2.93, "grad_norm": 0.19140625, "learning_rate": 2.9138676955333676e-08, "loss": 1.1627, "step": 9390 }, { "epoch": 2.94, "grad_norm": 0.2353515625, "learning_rate": 2.643083299427751e-08, "loss": 1.1055, "step": 9400 }, { "epoch": 2.94, "grad_norm": 0.2080078125, "learning_rate": 2.3854851861276895e-08, "loss": 1.1528, "step": 9410 }, { "epoch": 2.94, "grad_norm": 0.296875, "learning_rate": 2.1410767613965212e-08, "loss": 1.1785, "step": 9420 }, { "epoch": 2.95, "grad_norm": 0.38671875, "learning_rate": 1.909861256613632e-08, "loss": 1.1647, "step": 9430 }, { "epoch": 2.95, "grad_norm": 2.25, "learning_rate": 1.6918417287318245e-08, "loss": 1.1859, "step": 9440 }, { "epoch": 2.95, "grad_norm": 0.34375, "learning_rate": 1.487021060236904e-08, "loss": 1.147, "step": 9450 }, { "epoch": 2.96, "grad_norm": 0.1630859375, "learning_rate": 1.2954019591095989e-08, "loss": 1.1402, "step": 9460 }, { "epoch": 2.96, "grad_norm": 0.39453125, "learning_rate": 1.1169869587895899e-08, "loss": 1.1166, "step": 9470 }, { "epoch": 2.96, "grad_norm": 0.3203125, "learning_rate": 9.517784181422018e-09, "loss": 1.1793, "step": 9480 }, { "epoch": 2.97, "grad_norm": 0.162109375, "learning_rate": 7.997785214273191e-09, "loss": 1.1659, "step": 9490 }, { "epoch": 2.97, "grad_norm": 0.2392578125, "learning_rate": 6.609892782699634e-09, "loss": 1.0663, "step": 9500 }, { "epoch": 2.97, "grad_norm": 1.2421875, "learning_rate": 5.354125236343155e-09, "loss": 1.1448, "step": 9510 }, { "epoch": 2.98, "grad_norm": 0.166015625, "learning_rate": 4.230499177994007e-09, "loss": 1.1582, "step": 9520 }, { "epoch": 2.98, "grad_norm": 0.49609375, "learning_rate": 3.239029463367738e-09, "loss": 1.1834, "step": 9530 }, { "epoch": 2.98, "grad_norm": 0.248046875, "learning_rate": 2.379729200908676e-09, "loss": 1.1191, "step": 9540 }, { "epoch": 2.98, "grad_norm": 0.333984375, "learning_rate": 1.652609751624512e-09, "loss": 1.1587, "step": 9550 }, { "epoch": 2.99, "grad_norm": 0.2275390625, "learning_rate": 1.0576807289253143e-09, "loss": 1.1762, "step": 9560 }, { "epoch": 2.99, "grad_norm": 0.162109375, "learning_rate": 5.949499985025142e-10, "loss": 1.1972, "step": 9570 }, { "epoch": 2.99, "grad_norm": 0.2490234375, "learning_rate": 2.64423678225656e-10, "loss": 1.1115, "step": 9580 }, { "epoch": 3.0, "grad_norm": 0.1767578125, "learning_rate": 6.610613805690947e-11, "loss": 1.1751, "step": 9590 }, { "epoch": 3.0, "grad_norm": 0.2373046875, "learning_rate": 0.0, "loss": 1.1969, "step": 9600 } ], "logging_steps": 10, "max_steps": 9600, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 2.8158118667338383e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }