{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 7500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 1.6428182125091553, "learning_rate": 0.00029983999999999995, "loss": 4.0499, "step": 10 }, { "epoch": 0.016, "grad_norm": 1.1695531606674194, "learning_rate": 0.00029968, "loss": 2.6823, "step": 20 }, { "epoch": 0.024, "grad_norm": 0.982557475566864, "learning_rate": 0.00029951999999999995, "loss": 2.3495, "step": 30 }, { "epoch": 0.032, "grad_norm": 1.119385004043579, "learning_rate": 0.00029936, "loss": 2.187, "step": 40 }, { "epoch": 0.04, "grad_norm": 1.1943817138671875, "learning_rate": 0.00029919999999999995, "loss": 2.1278, "step": 50 }, { "epoch": 0.048, "grad_norm": 1.0324301719665527, "learning_rate": 0.00029904, "loss": 2.1567, "step": 60 }, { "epoch": 0.056, "grad_norm": 1.0339545011520386, "learning_rate": 0.00029887999999999996, "loss": 2.1682, "step": 70 }, { "epoch": 0.064, "grad_norm": 1.1292812824249268, "learning_rate": 0.00029872, "loss": 2.0833, "step": 80 }, { "epoch": 0.072, "grad_norm": 1.112321376800537, "learning_rate": 0.00029855999999999996, "loss": 2.0453, "step": 90 }, { "epoch": 0.08, "grad_norm": 1.2117633819580078, "learning_rate": 0.0002984, "loss": 2.1188, "step": 100 }, { "epoch": 0.088, "grad_norm": 1.0593370199203491, "learning_rate": 0.00029823999999999996, "loss": 2.1201, "step": 110 }, { "epoch": 0.096, "grad_norm": 1.1461642980575562, "learning_rate": 0.00029808, "loss": 2.035, "step": 120 }, { "epoch": 0.104, "grad_norm": 1.2336146831512451, "learning_rate": 0.00029791999999999997, "loss": 2.0329, "step": 130 }, { "epoch": 0.112, "grad_norm": 1.0999081134796143, "learning_rate": 0.00029776, "loss": 2.029, "step": 140 }, { "epoch": 0.12, "grad_norm": 1.109130620956421, "learning_rate": 0.00029759999999999997, "loss": 2.0032, "step": 150 }, { "epoch": 0.128, "grad_norm": 1.150937557220459, "learning_rate": 0.00029744, "loss": 2.039, "step": 160 }, { "epoch": 0.136, "grad_norm": 1.1265838146209717, "learning_rate": 0.00029727999999999997, "loss": 2.0358, "step": 170 }, { "epoch": 0.144, "grad_norm": 1.1429523229599, "learning_rate": 0.00029711999999999995, "loss": 2.0357, "step": 180 }, { "epoch": 0.152, "grad_norm": 1.0551432371139526, "learning_rate": 0.00029696, "loss": 2.0233, "step": 190 }, { "epoch": 0.16, "grad_norm": 1.1221256256103516, "learning_rate": 0.00029679999999999995, "loss": 2.0512, "step": 200 }, { "epoch": 0.168, "grad_norm": 1.0235646963119507, "learning_rate": 0.00029664, "loss": 2.0874, "step": 210 }, { "epoch": 0.176, "grad_norm": 1.0271421670913696, "learning_rate": 0.00029647999999999995, "loss": 2.007, "step": 220 }, { "epoch": 0.184, "grad_norm": 1.1792947053909302, "learning_rate": 0.00029632, "loss": 1.9954, "step": 230 }, { "epoch": 0.192, "grad_norm": 1.1998450756072998, "learning_rate": 0.00029615999999999996, "loss": 1.9629, "step": 240 }, { "epoch": 0.2, "grad_norm": 1.0941493511199951, "learning_rate": 0.000296, "loss": 1.9895, "step": 250 }, { "epoch": 0.208, "grad_norm": 1.1195231676101685, "learning_rate": 0.00029583999999999996, "loss": 1.9704, "step": 260 }, { "epoch": 0.216, "grad_norm": 1.0294626951217651, "learning_rate": 0.00029568, "loss": 1.9912, "step": 270 }, { "epoch": 0.224, "grad_norm": 1.0843749046325684, "learning_rate": 0.00029551999999999996, "loss": 1.9365, "step": 280 }, { "epoch": 0.232, "grad_norm": 0.8985214233398438, "learning_rate": 0.00029536, "loss": 2.0002, "step": 290 }, { "epoch": 0.24, "grad_norm": 1.0384533405303955, "learning_rate": 0.00029519999999999997, "loss": 1.94, "step": 300 }, { "epoch": 0.248, "grad_norm": 1.1195266246795654, "learning_rate": 0.00029504, "loss": 2.0072, "step": 310 }, { "epoch": 0.256, "grad_norm": 1.0751473903656006, "learning_rate": 0.00029487999999999997, "loss": 1.9446, "step": 320 }, { "epoch": 0.264, "grad_norm": 1.0846151113510132, "learning_rate": 0.00029472, "loss": 1.9619, "step": 330 }, { "epoch": 0.272, "grad_norm": 1.0839966535568237, "learning_rate": 0.00029455999999999997, "loss": 1.9454, "step": 340 }, { "epoch": 0.28, "grad_norm": 1.0731072425842285, "learning_rate": 0.00029439999999999995, "loss": 1.9626, "step": 350 }, { "epoch": 0.288, "grad_norm": 1.0523524284362793, "learning_rate": 0.00029424, "loss": 1.913, "step": 360 }, { "epoch": 0.296, "grad_norm": 1.0012118816375732, "learning_rate": 0.00029407999999999995, "loss": 1.9395, "step": 370 }, { "epoch": 0.304, "grad_norm": 0.9734252691268921, "learning_rate": 0.00029392, "loss": 2.0065, "step": 380 }, { "epoch": 0.312, "grad_norm": 1.127196192741394, "learning_rate": 0.00029375999999999995, "loss": 1.8512, "step": 390 }, { "epoch": 0.32, "grad_norm": 1.2507715225219727, "learning_rate": 0.0002936, "loss": 1.9136, "step": 400 }, { "epoch": 0.328, "grad_norm": 1.0916541814804077, "learning_rate": 0.00029343999999999996, "loss": 1.8957, "step": 410 }, { "epoch": 0.336, "grad_norm": 1.1081781387329102, "learning_rate": 0.00029328, "loss": 1.9262, "step": 420 }, { "epoch": 0.344, "grad_norm": 1.1098934412002563, "learning_rate": 0.00029311999999999996, "loss": 1.9213, "step": 430 }, { "epoch": 0.352, "grad_norm": 1.0184811353683472, "learning_rate": 0.00029296, "loss": 1.9374, "step": 440 }, { "epoch": 0.36, "grad_norm": 1.1124446392059326, "learning_rate": 0.00029279999999999996, "loss": 1.9237, "step": 450 }, { "epoch": 0.368, "grad_norm": 1.1229047775268555, "learning_rate": 0.00029264, "loss": 1.8897, "step": 460 }, { "epoch": 0.376, "grad_norm": 1.0087217092514038, "learning_rate": 0.00029247999999999996, "loss": 1.9317, "step": 470 }, { "epoch": 0.384, "grad_norm": 1.0527478456497192, "learning_rate": 0.00029232, "loss": 1.9571, "step": 480 }, { "epoch": 0.392, "grad_norm": 0.9762263894081116, "learning_rate": 0.00029215999999999997, "loss": 1.911, "step": 490 }, { "epoch": 0.4, "grad_norm": 1.0288947820663452, "learning_rate": 0.000292, "loss": 1.8763, "step": 500 }, { "epoch": 0.408, "grad_norm": 1.0375839471817017, "learning_rate": 0.00029183999999999997, "loss": 1.9924, "step": 510 }, { "epoch": 0.416, "grad_norm": 1.005863904953003, "learning_rate": 0.00029167999999999994, "loss": 1.8497, "step": 520 }, { "epoch": 0.424, "grad_norm": 0.9753358960151672, "learning_rate": 0.00029152, "loss": 1.9155, "step": 530 }, { "epoch": 0.432, "grad_norm": 1.0157995223999023, "learning_rate": 0.00029135999999999995, "loss": 1.9108, "step": 540 }, { "epoch": 0.44, "grad_norm": 1.1655962467193604, "learning_rate": 0.0002912, "loss": 1.8594, "step": 550 }, { "epoch": 0.448, "grad_norm": 1.0194449424743652, "learning_rate": 0.00029103999999999995, "loss": 1.8832, "step": 560 }, { "epoch": 0.456, "grad_norm": 1.0156056880950928, "learning_rate": 0.00029088, "loss": 1.9253, "step": 570 }, { "epoch": 0.464, "grad_norm": 1.031867265701294, "learning_rate": 0.00029071999999999995, "loss": 1.8896, "step": 580 }, { "epoch": 0.472, "grad_norm": 0.9771973490715027, "learning_rate": 0.00029056, "loss": 1.8817, "step": 590 }, { "epoch": 0.48, "grad_norm": 1.0212839841842651, "learning_rate": 0.00029039999999999996, "loss": 1.9077, "step": 600 }, { "epoch": 0.488, "grad_norm": 1.09153413772583, "learning_rate": 0.00029024, "loss": 1.8725, "step": 610 }, { "epoch": 0.496, "grad_norm": 1.043017029762268, "learning_rate": 0.00029007999999999996, "loss": 1.8432, "step": 620 }, { "epoch": 0.504, "grad_norm": 0.9705913066864014, "learning_rate": 0.00028992, "loss": 1.8996, "step": 630 }, { "epoch": 0.512, "grad_norm": 0.9535217881202698, "learning_rate": 0.00028975999999999996, "loss": 1.9339, "step": 640 }, { "epoch": 0.52, "grad_norm": 1.1274858713150024, "learning_rate": 0.0002896, "loss": 1.8275, "step": 650 }, { "epoch": 0.528, "grad_norm": 1.1044244766235352, "learning_rate": 0.00028943999999999997, "loss": 1.894, "step": 660 }, { "epoch": 0.536, "grad_norm": 1.0410267114639282, "learning_rate": 0.00028928, "loss": 1.9064, "step": 670 }, { "epoch": 0.544, "grad_norm": 1.118211269378662, "learning_rate": 0.00028911999999999997, "loss": 1.8881, "step": 680 }, { "epoch": 0.552, "grad_norm": 1.0527877807617188, "learning_rate": 0.00028895999999999994, "loss": 1.8371, "step": 690 }, { "epoch": 0.56, "grad_norm": 1.0014268159866333, "learning_rate": 0.00028879999999999997, "loss": 1.9004, "step": 700 }, { "epoch": 0.568, "grad_norm": 1.0764245986938477, "learning_rate": 0.00028863999999999995, "loss": 1.9347, "step": 710 }, { "epoch": 0.576, "grad_norm": 1.0075087547302246, "learning_rate": 0.00028848, "loss": 1.8226, "step": 720 }, { "epoch": 0.584, "grad_norm": 1.0563082695007324, "learning_rate": 0.00028831999999999995, "loss": 1.8147, "step": 730 }, { "epoch": 0.592, "grad_norm": 1.1010650396347046, "learning_rate": 0.00028816, "loss": 1.9306, "step": 740 }, { "epoch": 0.6, "grad_norm": 0.9899283647537231, "learning_rate": 0.00028799999999999995, "loss": 1.885, "step": 750 }, { "epoch": 0.608, "grad_norm": 1.0245839357376099, "learning_rate": 0.00028784, "loss": 1.8532, "step": 760 }, { "epoch": 0.616, "grad_norm": 1.056541085243225, "learning_rate": 0.00028767999999999996, "loss": 1.8861, "step": 770 }, { "epoch": 0.624, "grad_norm": 0.9766470193862915, "learning_rate": 0.00028752, "loss": 1.8241, "step": 780 }, { "epoch": 0.632, "grad_norm": 1.10284423828125, "learning_rate": 0.00028735999999999996, "loss": 1.7675, "step": 790 }, { "epoch": 0.64, "grad_norm": 1.080234408378601, "learning_rate": 0.0002872, "loss": 1.8204, "step": 800 }, { "epoch": 0.648, "grad_norm": 1.0814071893692017, "learning_rate": 0.00028703999999999996, "loss": 1.8619, "step": 810 }, { "epoch": 0.656, "grad_norm": 0.9824687838554382, "learning_rate": 0.00028688, "loss": 1.9017, "step": 820 }, { "epoch": 0.664, "grad_norm": 1.0177820920944214, "learning_rate": 0.00028671999999999997, "loss": 1.8842, "step": 830 }, { "epoch": 0.672, "grad_norm": 0.9703278541564941, "learning_rate": 0.00028656, "loss": 1.916, "step": 840 }, { "epoch": 0.68, "grad_norm": 1.0800108909606934, "learning_rate": 0.00028639999999999997, "loss": 1.8274, "step": 850 }, { "epoch": 0.688, "grad_norm": 1.0110689401626587, "learning_rate": 0.00028624, "loss": 1.8077, "step": 860 }, { "epoch": 0.696, "grad_norm": 1.091354250907898, "learning_rate": 0.00028607999999999997, "loss": 1.8971, "step": 870 }, { "epoch": 0.704, "grad_norm": 1.0147050619125366, "learning_rate": 0.00028591999999999995, "loss": 1.8365, "step": 880 }, { "epoch": 0.712, "grad_norm": 1.0930813550949097, "learning_rate": 0.00028576, "loss": 1.7962, "step": 890 }, { "epoch": 0.72, "grad_norm": 1.0309563875198364, "learning_rate": 0.00028559999999999995, "loss": 1.808, "step": 900 }, { "epoch": 0.728, "grad_norm": 1.0878843069076538, "learning_rate": 0.00028544, "loss": 1.8481, "step": 910 }, { "epoch": 0.736, "grad_norm": 1.039565086364746, "learning_rate": 0.00028527999999999995, "loss": 1.8475, "step": 920 }, { "epoch": 0.744, "grad_norm": 0.9955683350563049, "learning_rate": 0.00028512, "loss": 1.8577, "step": 930 }, { "epoch": 0.752, "grad_norm": 0.9792163372039795, "learning_rate": 0.00028495999999999996, "loss": 1.8577, "step": 940 }, { "epoch": 0.76, "grad_norm": 1.0933603048324585, "learning_rate": 0.0002848, "loss": 1.8941, "step": 950 }, { "epoch": 0.768, "grad_norm": 1.0719082355499268, "learning_rate": 0.00028463999999999996, "loss": 1.8739, "step": 960 }, { "epoch": 0.776, "grad_norm": 1.039011836051941, "learning_rate": 0.00028448, "loss": 1.8526, "step": 970 }, { "epoch": 0.784, "grad_norm": 1.1158881187438965, "learning_rate": 0.00028431999999999996, "loss": 1.8001, "step": 980 }, { "epoch": 0.792, "grad_norm": 0.9756163954734802, "learning_rate": 0.00028416, "loss": 1.8211, "step": 990 }, { "epoch": 0.8, "grad_norm": 1.0662978887557983, "learning_rate": 0.00028399999999999996, "loss": 1.8549, "step": 1000 }, { "epoch": 0.808, "grad_norm": 1.060304880142212, "learning_rate": 0.00028384, "loss": 1.8516, "step": 1010 }, { "epoch": 0.816, "grad_norm": 1.0433423519134521, "learning_rate": 0.00028367999999999997, "loss": 1.8146, "step": 1020 }, { "epoch": 0.824, "grad_norm": 1.0191080570220947, "learning_rate": 0.00028352, "loss": 1.8172, "step": 1030 }, { "epoch": 0.832, "grad_norm": 1.0157259702682495, "learning_rate": 0.00028335999999999997, "loss": 1.8096, "step": 1040 }, { "epoch": 0.84, "grad_norm": 1.0125967264175415, "learning_rate": 0.00028319999999999994, "loss": 1.7954, "step": 1050 }, { "epoch": 0.848, "grad_norm": 1.0847101211547852, "learning_rate": 0.00028304, "loss": 1.7961, "step": 1060 }, { "epoch": 0.856, "grad_norm": 0.9798891544342041, "learning_rate": 0.00028287999999999995, "loss": 1.8246, "step": 1070 }, { "epoch": 0.864, "grad_norm": 0.9857827425003052, "learning_rate": 0.00028272, "loss": 1.8989, "step": 1080 }, { "epoch": 0.872, "grad_norm": 0.9614414572715759, "learning_rate": 0.00028255999999999995, "loss": 1.9081, "step": 1090 }, { "epoch": 0.88, "grad_norm": 0.9770805835723877, "learning_rate": 0.0002824, "loss": 1.8396, "step": 1100 }, { "epoch": 0.888, "grad_norm": 1.0100719928741455, "learning_rate": 0.00028223999999999995, "loss": 1.8233, "step": 1110 }, { "epoch": 0.896, "grad_norm": 0.9945518970489502, "learning_rate": 0.00028208, "loss": 1.8163, "step": 1120 }, { "epoch": 0.904, "grad_norm": 1.0281423330307007, "learning_rate": 0.00028191999999999996, "loss": 1.8317, "step": 1130 }, { "epoch": 0.912, "grad_norm": 1.0575731992721558, "learning_rate": 0.00028176, "loss": 1.8673, "step": 1140 }, { "epoch": 0.92, "grad_norm": 1.1658177375793457, "learning_rate": 0.00028159999999999996, "loss": 1.827, "step": 1150 }, { "epoch": 0.928, "grad_norm": 1.0432631969451904, "learning_rate": 0.00028144, "loss": 1.8164, "step": 1160 }, { "epoch": 0.936, "grad_norm": 1.0257468223571777, "learning_rate": 0.00028127999999999996, "loss": 1.8025, "step": 1170 }, { "epoch": 0.944, "grad_norm": 1.1194055080413818, "learning_rate": 0.00028112, "loss": 1.8514, "step": 1180 }, { "epoch": 0.952, "grad_norm": 1.0339341163635254, "learning_rate": 0.00028095999999999997, "loss": 1.7735, "step": 1190 }, { "epoch": 0.96, "grad_norm": 0.9593726396560669, "learning_rate": 0.0002808, "loss": 1.8003, "step": 1200 }, { "epoch": 0.968, "grad_norm": 0.9705820083618164, "learning_rate": 0.00028063999999999997, "loss": 1.7788, "step": 1210 }, { "epoch": 0.976, "grad_norm": 1.0924532413482666, "learning_rate": 0.00028047999999999994, "loss": 1.7911, "step": 1220 }, { "epoch": 0.984, "grad_norm": 1.0870336294174194, "learning_rate": 0.00028031999999999997, "loss": 1.777, "step": 1230 }, { "epoch": 0.992, "grad_norm": 1.0212570428848267, "learning_rate": 0.00028015999999999995, "loss": 1.8329, "step": 1240 }, { "epoch": 1.0, "grad_norm": 0.9898034334182739, "learning_rate": 0.00028, "loss": 1.803, "step": 1250 }, { "epoch": 1.008, "grad_norm": 1.1636098623275757, "learning_rate": 0.00027983999999999995, "loss": 1.7512, "step": 1260 }, { "epoch": 1.016, "grad_norm": 1.122517704963684, "learning_rate": 0.00027968, "loss": 1.7579, "step": 1270 }, { "epoch": 1.024, "grad_norm": 1.1521267890930176, "learning_rate": 0.00027951999999999995, "loss": 1.6773, "step": 1280 }, { "epoch": 1.032, "grad_norm": 1.164711833000183, "learning_rate": 0.00027936, "loss": 1.6539, "step": 1290 }, { "epoch": 1.04, "grad_norm": 1.0965043306350708, "learning_rate": 0.00027919999999999996, "loss": 1.7126, "step": 1300 }, { "epoch": 1.048, "grad_norm": 1.2235987186431885, "learning_rate": 0.00027904, "loss": 1.6967, "step": 1310 }, { "epoch": 1.056, "grad_norm": 1.1083018779754639, "learning_rate": 0.00027887999999999996, "loss": 1.7282, "step": 1320 }, { "epoch": 1.064, "grad_norm": 1.1210997104644775, "learning_rate": 0.00027872, "loss": 1.7371, "step": 1330 }, { "epoch": 1.072, "grad_norm": 1.1816761493682861, "learning_rate": 0.00027855999999999996, "loss": 1.7099, "step": 1340 }, { "epoch": 1.08, "grad_norm": 1.1083471775054932, "learning_rate": 0.0002784, "loss": 1.7029, "step": 1350 }, { "epoch": 1.088, "grad_norm": 1.1974619626998901, "learning_rate": 0.00027823999999999997, "loss": 1.676, "step": 1360 }, { "epoch": 1.096, "grad_norm": 1.1856564283370972, "learning_rate": 0.00027808, "loss": 1.7675, "step": 1370 }, { "epoch": 1.104, "grad_norm": 1.1293108463287354, "learning_rate": 0.00027791999999999997, "loss": 1.6936, "step": 1380 }, { "epoch": 1.112, "grad_norm": 1.1792447566986084, "learning_rate": 0.00027775999999999994, "loss": 1.7194, "step": 1390 }, { "epoch": 1.12, "grad_norm": 1.1036359071731567, "learning_rate": 0.00027759999999999997, "loss": 1.6564, "step": 1400 }, { "epoch": 1.1280000000000001, "grad_norm": 1.2215582132339478, "learning_rate": 0.00027743999999999995, "loss": 1.7064, "step": 1410 }, { "epoch": 1.1360000000000001, "grad_norm": 1.1735379695892334, "learning_rate": 0.00027728, "loss": 1.7184, "step": 1420 }, { "epoch": 1.144, "grad_norm": 1.1964507102966309, "learning_rate": 0.00027711999999999995, "loss": 1.7329, "step": 1430 }, { "epoch": 1.152, "grad_norm": 1.138510823249817, "learning_rate": 0.00027696, "loss": 1.7096, "step": 1440 }, { "epoch": 1.16, "grad_norm": 1.1308197975158691, "learning_rate": 0.00027679999999999995, "loss": 1.7457, "step": 1450 }, { "epoch": 1.168, "grad_norm": 1.1567286252975464, "learning_rate": 0.00027664, "loss": 1.6813, "step": 1460 }, { "epoch": 1.176, "grad_norm": 1.1560039520263672, "learning_rate": 0.00027647999999999995, "loss": 1.699, "step": 1470 }, { "epoch": 1.184, "grad_norm": 1.230444073677063, "learning_rate": 0.00027632, "loss": 1.7782, "step": 1480 }, { "epoch": 1.192, "grad_norm": 1.2430510520935059, "learning_rate": 0.00027615999999999996, "loss": 1.716, "step": 1490 }, { "epoch": 1.2, "grad_norm": 1.1405155658721924, "learning_rate": 0.000276, "loss": 1.7072, "step": 1500 }, { "epoch": 1.208, "grad_norm": 1.1308519840240479, "learning_rate": 0.00027583999999999996, "loss": 1.6952, "step": 1510 }, { "epoch": 1.216, "grad_norm": 1.2301914691925049, "learning_rate": 0.00027568, "loss": 1.721, "step": 1520 }, { "epoch": 1.224, "grad_norm": 1.2387229204177856, "learning_rate": 0.00027551999999999996, "loss": 1.6866, "step": 1530 }, { "epoch": 1.232, "grad_norm": 1.070438027381897, "learning_rate": 0.00027536, "loss": 1.6882, "step": 1540 }, { "epoch": 1.24, "grad_norm": 1.1818335056304932, "learning_rate": 0.00027519999999999997, "loss": 1.7479, "step": 1550 }, { "epoch": 1.248, "grad_norm": 1.1129992008209229, "learning_rate": 0.00027503999999999994, "loss": 1.7563, "step": 1560 }, { "epoch": 1.256, "grad_norm": 1.2298282384872437, "learning_rate": 0.00027487999999999997, "loss": 1.684, "step": 1570 }, { "epoch": 1.264, "grad_norm": 1.2662142515182495, "learning_rate": 0.00027471999999999994, "loss": 1.7052, "step": 1580 }, { "epoch": 1.272, "grad_norm": 1.2866853475570679, "learning_rate": 0.00027456, "loss": 1.7403, "step": 1590 }, { "epoch": 1.28, "grad_norm": 1.225150227546692, "learning_rate": 0.00027439999999999995, "loss": 1.7141, "step": 1600 }, { "epoch": 1.288, "grad_norm": 1.2254585027694702, "learning_rate": 0.00027424, "loss": 1.7558, "step": 1610 }, { "epoch": 1.296, "grad_norm": 1.228905439376831, "learning_rate": 0.00027407999999999995, "loss": 1.7081, "step": 1620 }, { "epoch": 1.304, "grad_norm": 1.190305471420288, "learning_rate": 0.00027392, "loss": 1.7173, "step": 1630 }, { "epoch": 1.312, "grad_norm": 1.1080456972122192, "learning_rate": 0.00027375999999999995, "loss": 1.6771, "step": 1640 }, { "epoch": 1.32, "grad_norm": 1.1341513395309448, "learning_rate": 0.0002736, "loss": 1.6865, "step": 1650 }, { "epoch": 1.328, "grad_norm": 1.1582372188568115, "learning_rate": 0.00027343999999999996, "loss": 1.7083, "step": 1660 }, { "epoch": 1.336, "grad_norm": 1.2780879735946655, "learning_rate": 0.00027328, "loss": 1.7348, "step": 1670 }, { "epoch": 1.3439999999999999, "grad_norm": 1.1118934154510498, "learning_rate": 0.00027311999999999996, "loss": 1.763, "step": 1680 }, { "epoch": 1.3519999999999999, "grad_norm": 1.2540453672409058, "learning_rate": 0.00027296, "loss": 1.7273, "step": 1690 }, { "epoch": 1.3599999999999999, "grad_norm": 1.1426582336425781, "learning_rate": 0.00027279999999999996, "loss": 1.7098, "step": 1700 }, { "epoch": 1.3679999999999999, "grad_norm": 1.1964046955108643, "learning_rate": 0.00027264, "loss": 1.709, "step": 1710 }, { "epoch": 1.376, "grad_norm": 1.14896559715271, "learning_rate": 0.00027247999999999997, "loss": 1.7313, "step": 1720 }, { "epoch": 1.384, "grad_norm": 1.202528953552246, "learning_rate": 0.00027231999999999994, "loss": 1.744, "step": 1730 }, { "epoch": 1.392, "grad_norm": 1.1821858882904053, "learning_rate": 0.00027215999999999997, "loss": 1.6926, "step": 1740 }, { "epoch": 1.4, "grad_norm": 1.2063863277435303, "learning_rate": 0.00027199999999999994, "loss": 1.6386, "step": 1750 }, { "epoch": 1.408, "grad_norm": 1.2269303798675537, "learning_rate": 0.00027183999999999997, "loss": 1.7395, "step": 1760 }, { "epoch": 1.416, "grad_norm": 1.1873764991760254, "learning_rate": 0.00027167999999999995, "loss": 1.7408, "step": 1770 }, { "epoch": 1.424, "grad_norm": 1.1701534986495972, "learning_rate": 0.00027152, "loss": 1.6902, "step": 1780 }, { "epoch": 1.432, "grad_norm": 1.2059394121170044, "learning_rate": 0.00027135999999999995, "loss": 1.7189, "step": 1790 }, { "epoch": 1.44, "grad_norm": 1.2177969217300415, "learning_rate": 0.0002712, "loss": 1.6688, "step": 1800 }, { "epoch": 1.448, "grad_norm": 1.1420925855636597, "learning_rate": 0.00027103999999999995, "loss": 1.7027, "step": 1810 }, { "epoch": 1.456, "grad_norm": 1.1630126237869263, "learning_rate": 0.00027088, "loss": 1.7268, "step": 1820 }, { "epoch": 1.464, "grad_norm": 1.1708976030349731, "learning_rate": 0.00027071999999999996, "loss": 1.6667, "step": 1830 }, { "epoch": 1.472, "grad_norm": 1.1763298511505127, "learning_rate": 0.00027056, "loss": 1.7667, "step": 1840 }, { "epoch": 1.48, "grad_norm": 1.1959589719772339, "learning_rate": 0.00027039999999999996, "loss": 1.682, "step": 1850 }, { "epoch": 1.488, "grad_norm": 1.187795639038086, "learning_rate": 0.00027024, "loss": 1.7078, "step": 1860 }, { "epoch": 1.496, "grad_norm": 1.1146178245544434, "learning_rate": 0.00027007999999999996, "loss": 1.6864, "step": 1870 }, { "epoch": 1.504, "grad_norm": 1.1661298274993896, "learning_rate": 0.00026992, "loss": 1.7206, "step": 1880 }, { "epoch": 1.512, "grad_norm": 1.1348265409469604, "learning_rate": 0.00026975999999999997, "loss": 1.6971, "step": 1890 }, { "epoch": 1.52, "grad_norm": 1.2029168605804443, "learning_rate": 0.00026959999999999994, "loss": 1.7435, "step": 1900 }, { "epoch": 1.528, "grad_norm": 1.2038522958755493, "learning_rate": 0.00026943999999999997, "loss": 1.6938, "step": 1910 }, { "epoch": 1.536, "grad_norm": 1.1772645711898804, "learning_rate": 0.00026927999999999994, "loss": 1.6974, "step": 1920 }, { "epoch": 1.544, "grad_norm": 1.2052574157714844, "learning_rate": 0.00026911999999999997, "loss": 1.6798, "step": 1930 }, { "epoch": 1.552, "grad_norm": 1.22791588306427, "learning_rate": 0.00026895999999999995, "loss": 1.7074, "step": 1940 }, { "epoch": 1.56, "grad_norm": 1.0809330940246582, "learning_rate": 0.0002688, "loss": 1.7332, "step": 1950 }, { "epoch": 1.568, "grad_norm": 1.2375030517578125, "learning_rate": 0.00026863999999999995, "loss": 1.7188, "step": 1960 }, { "epoch": 1.576, "grad_norm": 1.1218806505203247, "learning_rate": 0.00026848, "loss": 1.7222, "step": 1970 }, { "epoch": 1.584, "grad_norm": 1.1987130641937256, "learning_rate": 0.00026831999999999995, "loss": 1.6705, "step": 1980 }, { "epoch": 1.592, "grad_norm": 1.1293755769729614, "learning_rate": 0.00026816, "loss": 1.7911, "step": 1990 }, { "epoch": 1.6, "grad_norm": 1.1469671726226807, "learning_rate": 0.00026799999999999995, "loss": 1.7355, "step": 2000 }, { "epoch": 1.608, "grad_norm": 1.2343659400939941, "learning_rate": 0.00026784, "loss": 1.6655, "step": 2010 }, { "epoch": 1.616, "grad_norm": 1.1669197082519531, "learning_rate": 0.00026767999999999996, "loss": 1.7655, "step": 2020 }, { "epoch": 1.624, "grad_norm": 1.1948648691177368, "learning_rate": 0.00026752, "loss": 1.5903, "step": 2030 }, { "epoch": 1.6320000000000001, "grad_norm": 1.210276484489441, "learning_rate": 0.00026735999999999996, "loss": 1.6919, "step": 2040 }, { "epoch": 1.6400000000000001, "grad_norm": 1.1474298238754272, "learning_rate": 0.0002672, "loss": 1.7315, "step": 2050 }, { "epoch": 1.6480000000000001, "grad_norm": 1.1558197736740112, "learning_rate": 0.00026703999999999996, "loss": 1.7004, "step": 2060 }, { "epoch": 1.6560000000000001, "grad_norm": 1.2014431953430176, "learning_rate": 0.00026687999999999994, "loss": 1.7262, "step": 2070 }, { "epoch": 1.6640000000000001, "grad_norm": 1.1946237087249756, "learning_rate": 0.00026671999999999997, "loss": 1.7575, "step": 2080 }, { "epoch": 1.6720000000000002, "grad_norm": 1.096993088722229, "learning_rate": 0.00026655999999999994, "loss": 1.7049, "step": 2090 }, { "epoch": 1.6800000000000002, "grad_norm": 1.136132001876831, "learning_rate": 0.00026639999999999997, "loss": 1.7116, "step": 2100 }, { "epoch": 1.688, "grad_norm": 1.1487154960632324, "learning_rate": 0.00026623999999999994, "loss": 1.6711, "step": 2110 }, { "epoch": 1.696, "grad_norm": 1.2251691818237305, "learning_rate": 0.00026608, "loss": 1.7647, "step": 2120 }, { "epoch": 1.704, "grad_norm": 1.1736303567886353, "learning_rate": 0.00026591999999999995, "loss": 1.7074, "step": 2130 }, { "epoch": 1.712, "grad_norm": 1.1187472343444824, "learning_rate": 0.00026576, "loss": 1.6904, "step": 2140 }, { "epoch": 1.72, "grad_norm": 1.2309964895248413, "learning_rate": 0.00026559999999999995, "loss": 1.6816, "step": 2150 }, { "epoch": 1.728, "grad_norm": 1.182122826576233, "learning_rate": 0.00026544, "loss": 1.7505, "step": 2160 }, { "epoch": 1.736, "grad_norm": 1.1426887512207031, "learning_rate": 0.00026527999999999995, "loss": 1.7415, "step": 2170 }, { "epoch": 1.744, "grad_norm": 1.1243617534637451, "learning_rate": 0.00026512, "loss": 1.7427, "step": 2180 }, { "epoch": 1.752, "grad_norm": 1.1814117431640625, "learning_rate": 0.00026495999999999996, "loss": 1.6717, "step": 2190 }, { "epoch": 1.76, "grad_norm": 1.1558399200439453, "learning_rate": 0.0002648, "loss": 1.7521, "step": 2200 }, { "epoch": 1.768, "grad_norm": 1.1759192943572998, "learning_rate": 0.00026463999999999996, "loss": 1.6922, "step": 2210 }, { "epoch": 1.776, "grad_norm": 1.213027834892273, "learning_rate": 0.00026448, "loss": 1.6918, "step": 2220 }, { "epoch": 1.784, "grad_norm": 1.1476492881774902, "learning_rate": 0.00026431999999999996, "loss": 1.7319, "step": 2230 }, { "epoch": 1.792, "grad_norm": 1.171706199645996, "learning_rate": 0.00026415999999999994, "loss": 1.7126, "step": 2240 }, { "epoch": 1.8, "grad_norm": 1.2222481966018677, "learning_rate": 0.00026399999999999997, "loss": 1.666, "step": 2250 }, { "epoch": 1.808, "grad_norm": 1.1074283123016357, "learning_rate": 0.00026383999999999994, "loss": 1.7136, "step": 2260 }, { "epoch": 1.8159999999999998, "grad_norm": 1.0644099712371826, "learning_rate": 0.00026367999999999997, "loss": 1.7343, "step": 2270 }, { "epoch": 1.8239999999999998, "grad_norm": 1.1833316087722778, "learning_rate": 0.00026351999999999994, "loss": 1.6886, "step": 2280 }, { "epoch": 1.8319999999999999, "grad_norm": 1.2397806644439697, "learning_rate": 0.00026335999999999997, "loss": 1.7136, "step": 2290 }, { "epoch": 1.8399999999999999, "grad_norm": 0.6187876462936401, "learning_rate": 0.00026319999999999995, "loss": 1.5873, "step": 2300 }, { "epoch": 1.8479999999999999, "grad_norm": 1.141440749168396, "learning_rate": 0.00026304, "loss": 1.6937, "step": 2310 }, { "epoch": 1.8559999999999999, "grad_norm": 1.1803792715072632, "learning_rate": 0.00026287999999999995, "loss": 1.8136, "step": 2320 }, { "epoch": 1.8639999999999999, "grad_norm": 1.1979511976242065, "learning_rate": 0.00026272, "loss": 1.7426, "step": 2330 }, { "epoch": 1.8719999999999999, "grad_norm": 1.1708755493164062, "learning_rate": 0.00026255999999999995, "loss": 1.7532, "step": 2340 }, { "epoch": 1.88, "grad_norm": 1.0788543224334717, "learning_rate": 0.0002624, "loss": 1.7435, "step": 2350 }, { "epoch": 1.888, "grad_norm": 1.1670109033584595, "learning_rate": 0.00026223999999999996, "loss": 1.7338, "step": 2360 }, { "epoch": 1.896, "grad_norm": 1.1337978839874268, "learning_rate": 0.00026208, "loss": 1.7508, "step": 2370 }, { "epoch": 1.904, "grad_norm": 1.131404995918274, "learning_rate": 0.00026191999999999996, "loss": 1.7321, "step": 2380 }, { "epoch": 1.912, "grad_norm": 1.1655117273330688, "learning_rate": 0.00026176, "loss": 1.7629, "step": 2390 }, { "epoch": 1.92, "grad_norm": 1.1582902669906616, "learning_rate": 0.00026159999999999996, "loss": 1.7083, "step": 2400 }, { "epoch": 1.928, "grad_norm": 1.181884765625, "learning_rate": 0.00026143999999999994, "loss": 1.6752, "step": 2410 }, { "epoch": 1.936, "grad_norm": 1.1487571001052856, "learning_rate": 0.00026127999999999996, "loss": 1.6785, "step": 2420 }, { "epoch": 1.944, "grad_norm": 1.2264763116836548, "learning_rate": 0.00026111999999999994, "loss": 1.7237, "step": 2430 }, { "epoch": 1.952, "grad_norm": 1.285232424736023, "learning_rate": 0.00026095999999999997, "loss": 1.7148, "step": 2440 }, { "epoch": 1.96, "grad_norm": 1.2243884801864624, "learning_rate": 0.00026079999999999994, "loss": 1.7305, "step": 2450 }, { "epoch": 1.968, "grad_norm": 1.2969841957092285, "learning_rate": 0.00026063999999999997, "loss": 1.7271, "step": 2460 }, { "epoch": 1.976, "grad_norm": 1.1378511190414429, "learning_rate": 0.00026047999999999995, "loss": 1.7425, "step": 2470 }, { "epoch": 1.984, "grad_norm": 1.1501156091690063, "learning_rate": 0.00026032, "loss": 1.712, "step": 2480 }, { "epoch": 1.992, "grad_norm": 1.2089420557022095, "learning_rate": 0.00026015999999999995, "loss": 1.7164, "step": 2490 }, { "epoch": 2.0, "grad_norm": 1.1784271001815796, "learning_rate": 0.00026, "loss": 1.736, "step": 2500 }, { "epoch": 2.008, "grad_norm": 1.1402006149291992, "learning_rate": 0.00025983999999999995, "loss": 1.5834, "step": 2510 }, { "epoch": 2.016, "grad_norm": 1.1883610486984253, "learning_rate": 0.00025968, "loss": 1.5888, "step": 2520 }, { "epoch": 2.024, "grad_norm": 1.3511167764663696, "learning_rate": 0.00025951999999999995, "loss": 1.5551, "step": 2530 }, { "epoch": 2.032, "grad_norm": 1.2824231386184692, "learning_rate": 0.00025936, "loss": 1.5119, "step": 2540 }, { "epoch": 2.04, "grad_norm": 1.3076261281967163, "learning_rate": 0.00025919999999999996, "loss": 1.5044, "step": 2550 }, { "epoch": 2.048, "grad_norm": 1.3731348514556885, "learning_rate": 0.00025904, "loss": 1.5585, "step": 2560 }, { "epoch": 2.056, "grad_norm": 1.3174951076507568, "learning_rate": 0.00025887999999999996, "loss": 1.5591, "step": 2570 }, { "epoch": 2.064, "grad_norm": 1.3365912437438965, "learning_rate": 0.00025872, "loss": 1.5357, "step": 2580 }, { "epoch": 2.072, "grad_norm": 1.3091166019439697, "learning_rate": 0.00025855999999999996, "loss": 1.5232, "step": 2590 }, { "epoch": 2.08, "grad_norm": 1.372147560119629, "learning_rate": 0.00025839999999999994, "loss": 1.5427, "step": 2600 }, { "epoch": 2.088, "grad_norm": 1.412030577659607, "learning_rate": 0.00025823999999999997, "loss": 1.5851, "step": 2610 }, { "epoch": 2.096, "grad_norm": 1.3613168001174927, "learning_rate": 0.00025807999999999994, "loss": 1.5283, "step": 2620 }, { "epoch": 2.104, "grad_norm": 1.3381197452545166, "learning_rate": 0.00025791999999999997, "loss": 1.5812, "step": 2630 }, { "epoch": 2.112, "grad_norm": 1.3254282474517822, "learning_rate": 0.00025775999999999994, "loss": 1.5016, "step": 2640 }, { "epoch": 2.12, "grad_norm": 1.410579800605774, "learning_rate": 0.0002576, "loss": 1.5876, "step": 2650 }, { "epoch": 2.128, "grad_norm": 1.3652007579803467, "learning_rate": 0.00025743999999999995, "loss": 1.6116, "step": 2660 }, { "epoch": 2.136, "grad_norm": 1.4137206077575684, "learning_rate": 0.00025728, "loss": 1.5736, "step": 2670 }, { "epoch": 2.144, "grad_norm": 1.3824632167816162, "learning_rate": 0.00025711999999999995, "loss": 1.5523, "step": 2680 }, { "epoch": 2.152, "grad_norm": 1.4239449501037598, "learning_rate": 0.00025696, "loss": 1.5221, "step": 2690 }, { "epoch": 2.16, "grad_norm": 1.4395557641983032, "learning_rate": 0.00025679999999999995, "loss": 1.5161, "step": 2700 }, { "epoch": 2.168, "grad_norm": 1.5318158864974976, "learning_rate": 0.00025664, "loss": 1.5784, "step": 2710 }, { "epoch": 2.176, "grad_norm": 1.3760408163070679, "learning_rate": 0.00025647999999999996, "loss": 1.567, "step": 2720 }, { "epoch": 2.184, "grad_norm": 1.4429463148117065, "learning_rate": 0.00025632, "loss": 1.5653, "step": 2730 }, { "epoch": 2.192, "grad_norm": 1.424181342124939, "learning_rate": 0.00025615999999999996, "loss": 1.5378, "step": 2740 }, { "epoch": 2.2, "grad_norm": 1.4517723321914673, "learning_rate": 0.000256, "loss": 1.5738, "step": 2750 }, { "epoch": 2.208, "grad_norm": 1.455818772315979, "learning_rate": 0.00025583999999999996, "loss": 1.5117, "step": 2760 }, { "epoch": 2.216, "grad_norm": 1.3915988206863403, "learning_rate": 0.00025567999999999994, "loss": 1.5682, "step": 2770 }, { "epoch": 2.224, "grad_norm": 1.3807563781738281, "learning_rate": 0.00025551999999999997, "loss": 1.5618, "step": 2780 }, { "epoch": 2.232, "grad_norm": 1.3413660526275635, "learning_rate": 0.00025535999999999994, "loss": 1.5724, "step": 2790 }, { "epoch": 2.24, "grad_norm": 1.3377097845077515, "learning_rate": 0.00025519999999999997, "loss": 1.5662, "step": 2800 }, { "epoch": 2.248, "grad_norm": 1.3897491693496704, "learning_rate": 0.00025503999999999994, "loss": 1.5431, "step": 2810 }, { "epoch": 2.2560000000000002, "grad_norm": 1.4780755043029785, "learning_rate": 0.00025487999999999997, "loss": 1.5657, "step": 2820 }, { "epoch": 2.2640000000000002, "grad_norm": 1.3818844556808472, "learning_rate": 0.00025471999999999995, "loss": 1.5399, "step": 2830 }, { "epoch": 2.2720000000000002, "grad_norm": 1.4447002410888672, "learning_rate": 0.00025456, "loss": 1.5729, "step": 2840 }, { "epoch": 2.2800000000000002, "grad_norm": 1.381330132484436, "learning_rate": 0.00025439999999999995, "loss": 1.52, "step": 2850 }, { "epoch": 2.288, "grad_norm": 1.418094277381897, "learning_rate": 0.00025424, "loss": 1.5954, "step": 2860 }, { "epoch": 2.296, "grad_norm": 1.329988718032837, "learning_rate": 0.00025407999999999995, "loss": 1.5852, "step": 2870 }, { "epoch": 2.304, "grad_norm": 1.3431826829910278, "learning_rate": 0.00025392, "loss": 1.5482, "step": 2880 }, { "epoch": 2.312, "grad_norm": 1.4532684087753296, "learning_rate": 0.00025375999999999996, "loss": 1.4967, "step": 2890 }, { "epoch": 2.32, "grad_norm": 1.3491160869598389, "learning_rate": 0.0002536, "loss": 1.5145, "step": 2900 }, { "epoch": 2.328, "grad_norm": 1.3651959896087646, "learning_rate": 0.00025343999999999996, "loss": 1.5726, "step": 2910 }, { "epoch": 2.336, "grad_norm": 1.4137355089187622, "learning_rate": 0.00025328, "loss": 1.5646, "step": 2920 }, { "epoch": 2.344, "grad_norm": 1.4950937032699585, "learning_rate": 0.00025311999999999996, "loss": 1.5653, "step": 2930 }, { "epoch": 2.352, "grad_norm": 1.3360849618911743, "learning_rate": 0.00025295999999999994, "loss": 1.5669, "step": 2940 }, { "epoch": 2.36, "grad_norm": 1.4283881187438965, "learning_rate": 0.00025279999999999996, "loss": 1.5729, "step": 2950 }, { "epoch": 2.368, "grad_norm": 1.40790855884552, "learning_rate": 0.00025263999999999994, "loss": 1.54, "step": 2960 }, { "epoch": 2.376, "grad_norm": 1.5222750902175903, "learning_rate": 0.00025247999999999997, "loss": 1.6079, "step": 2970 }, { "epoch": 2.384, "grad_norm": 1.424391746520996, "learning_rate": 0.00025231999999999994, "loss": 1.5835, "step": 2980 }, { "epoch": 2.392, "grad_norm": 1.4419969320297241, "learning_rate": 0.00025215999999999997, "loss": 1.6546, "step": 2990 }, { "epoch": 2.4, "grad_norm": 1.3506951332092285, "learning_rate": 0.00025199999999999995, "loss": 1.5856, "step": 3000 }, { "epoch": 2.408, "grad_norm": 1.3341702222824097, "learning_rate": 0.00025184, "loss": 1.5933, "step": 3010 }, { "epoch": 2.416, "grad_norm": 1.3723673820495605, "learning_rate": 0.00025167999999999995, "loss": 1.602, "step": 3020 }, { "epoch": 2.424, "grad_norm": 1.4064242839813232, "learning_rate": 0.00025152, "loss": 1.5438, "step": 3030 }, { "epoch": 2.432, "grad_norm": 1.3700838088989258, "learning_rate": 0.00025135999999999995, "loss": 1.6322, "step": 3040 }, { "epoch": 2.44, "grad_norm": 1.4045076370239258, "learning_rate": 0.0002512, "loss": 1.5728, "step": 3050 }, { "epoch": 2.448, "grad_norm": 1.4885849952697754, "learning_rate": 0.00025103999999999995, "loss": 1.6177, "step": 3060 }, { "epoch": 2.456, "grad_norm": 1.4054323434829712, "learning_rate": 0.00025088, "loss": 1.5579, "step": 3070 }, { "epoch": 2.464, "grad_norm": 1.4171288013458252, "learning_rate": 0.00025071999999999996, "loss": 1.6058, "step": 3080 }, { "epoch": 2.472, "grad_norm": 1.3950269222259521, "learning_rate": 0.00025056, "loss": 1.5906, "step": 3090 }, { "epoch": 2.48, "grad_norm": 1.3375904560089111, "learning_rate": 0.00025039999999999996, "loss": 1.5984, "step": 3100 }, { "epoch": 2.488, "grad_norm": 1.3980008363723755, "learning_rate": 0.00025024, "loss": 1.5724, "step": 3110 }, { "epoch": 2.496, "grad_norm": 1.3917794227600098, "learning_rate": 0.00025007999999999996, "loss": 1.6085, "step": 3120 }, { "epoch": 2.504, "grad_norm": 1.3524688482284546, "learning_rate": 0.00024991999999999994, "loss": 1.5734, "step": 3130 }, { "epoch": 2.512, "grad_norm": 1.4597851037979126, "learning_rate": 0.00024975999999999997, "loss": 1.6001, "step": 3140 }, { "epoch": 2.52, "grad_norm": 1.4018633365631104, "learning_rate": 0.00024959999999999994, "loss": 1.5761, "step": 3150 }, { "epoch": 2.528, "grad_norm": 1.414162278175354, "learning_rate": 0.00024943999999999997, "loss": 1.5799, "step": 3160 }, { "epoch": 2.536, "grad_norm": 1.3470393419265747, "learning_rate": 0.00024927999999999994, "loss": 1.5905, "step": 3170 }, { "epoch": 2.544, "grad_norm": 1.350521445274353, "learning_rate": 0.00024912, "loss": 1.605, "step": 3180 }, { "epoch": 2.552, "grad_norm": 1.4013463258743286, "learning_rate": 0.00024895999999999995, "loss": 1.5653, "step": 3190 }, { "epoch": 2.56, "grad_norm": 1.3292449712753296, "learning_rate": 0.0002488, "loss": 1.5761, "step": 3200 }, { "epoch": 2.568, "grad_norm": 1.2734830379486084, "learning_rate": 0.00024863999999999995, "loss": 1.6476, "step": 3210 }, { "epoch": 2.576, "grad_norm": 1.4279605150222778, "learning_rate": 0.00024848, "loss": 1.5671, "step": 3220 }, { "epoch": 2.584, "grad_norm": 1.4233906269073486, "learning_rate": 0.00024831999999999995, "loss": 1.5653, "step": 3230 }, { "epoch": 2.592, "grad_norm": 1.393425703048706, "learning_rate": 0.00024816, "loss": 1.5605, "step": 3240 }, { "epoch": 2.6, "grad_norm": 1.4003586769104004, "learning_rate": 0.00024799999999999996, "loss": 1.5775, "step": 3250 }, { "epoch": 2.608, "grad_norm": 1.3909311294555664, "learning_rate": 0.00024784, "loss": 1.599, "step": 3260 }, { "epoch": 2.616, "grad_norm": 1.3618372678756714, "learning_rate": 0.00024767999999999996, "loss": 1.5981, "step": 3270 }, { "epoch": 2.624, "grad_norm": 1.3769896030426025, "learning_rate": 0.00024752, "loss": 1.6391, "step": 3280 }, { "epoch": 2.632, "grad_norm": 1.2977598905563354, "learning_rate": 0.00024735999999999996, "loss": 1.6043, "step": 3290 }, { "epoch": 2.64, "grad_norm": 1.3508882522583008, "learning_rate": 0.0002472, "loss": 1.5432, "step": 3300 }, { "epoch": 2.648, "grad_norm": 1.411916732788086, "learning_rate": 0.00024703999999999997, "loss": 1.5889, "step": 3310 }, { "epoch": 2.656, "grad_norm": 1.3107311725616455, "learning_rate": 0.00024687999999999994, "loss": 1.5618, "step": 3320 }, { "epoch": 2.664, "grad_norm": 1.383974552154541, "learning_rate": 0.00024671999999999997, "loss": 1.5893, "step": 3330 }, { "epoch": 2.672, "grad_norm": 1.3793646097183228, "learning_rate": 0.00024655999999999994, "loss": 1.5901, "step": 3340 }, { "epoch": 2.68, "grad_norm": 1.405423879623413, "learning_rate": 0.00024639999999999997, "loss": 1.638, "step": 3350 }, { "epoch": 2.6879999999999997, "grad_norm": 1.3815405368804932, "learning_rate": 0.00024623999999999995, "loss": 1.6037, "step": 3360 }, { "epoch": 2.6959999999999997, "grad_norm": 1.297813057899475, "learning_rate": 0.00024608, "loss": 1.5632, "step": 3370 }, { "epoch": 2.7039999999999997, "grad_norm": 1.3591309785842896, "learning_rate": 0.00024591999999999995, "loss": 1.5892, "step": 3380 }, { "epoch": 2.7119999999999997, "grad_norm": 1.4379678964614868, "learning_rate": 0.00024576, "loss": 1.6029, "step": 3390 }, { "epoch": 2.7199999999999998, "grad_norm": 1.4956458806991577, "learning_rate": 0.00024559999999999995, "loss": 1.5974, "step": 3400 }, { "epoch": 2.7279999999999998, "grad_norm": 1.4072085618972778, "learning_rate": 0.00024544, "loss": 1.6431, "step": 3410 }, { "epoch": 2.7359999999999998, "grad_norm": 1.28607177734375, "learning_rate": 0.00024527999999999996, "loss": 1.5868, "step": 3420 }, { "epoch": 2.7439999999999998, "grad_norm": 1.5061297416687012, "learning_rate": 0.00024512, "loss": 1.613, "step": 3430 }, { "epoch": 2.752, "grad_norm": 1.4274139404296875, "learning_rate": 0.00024495999999999996, "loss": 1.5507, "step": 3440 }, { "epoch": 2.76, "grad_norm": 1.4335947036743164, "learning_rate": 0.0002448, "loss": 1.5557, "step": 3450 }, { "epoch": 2.768, "grad_norm": 1.3052548170089722, "learning_rate": 0.00024463999999999996, "loss": 1.5779, "step": 3460 }, { "epoch": 2.776, "grad_norm": 1.2695350646972656, "learning_rate": 0.00024448, "loss": 1.5587, "step": 3470 }, { "epoch": 2.784, "grad_norm": 1.4060364961624146, "learning_rate": 0.00024431999999999996, "loss": 1.6129, "step": 3480 }, { "epoch": 2.792, "grad_norm": 1.4803110361099243, "learning_rate": 0.00024416, "loss": 1.6236, "step": 3490 }, { "epoch": 2.8, "grad_norm": 1.353215217590332, "learning_rate": 0.000244, "loss": 1.6323, "step": 3500 }, { "epoch": 2.808, "grad_norm": 1.3456429243087769, "learning_rate": 0.00024383999999999997, "loss": 1.6635, "step": 3510 }, { "epoch": 2.816, "grad_norm": 1.4098529815673828, "learning_rate": 0.00024368, "loss": 1.5523, "step": 3520 }, { "epoch": 2.824, "grad_norm": 1.5074928998947144, "learning_rate": 0.00024351999999999997, "loss": 1.5736, "step": 3530 }, { "epoch": 2.832, "grad_norm": 1.4895234107971191, "learning_rate": 0.00024336, "loss": 1.6004, "step": 3540 }, { "epoch": 2.84, "grad_norm": 1.370694875717163, "learning_rate": 0.00024319999999999998, "loss": 1.5812, "step": 3550 }, { "epoch": 2.848, "grad_norm": 1.3541662693023682, "learning_rate": 0.00024303999999999998, "loss": 1.6059, "step": 3560 }, { "epoch": 2.856, "grad_norm": 1.3392258882522583, "learning_rate": 0.00024287999999999998, "loss": 1.5564, "step": 3570 }, { "epoch": 2.864, "grad_norm": 1.4230504035949707, "learning_rate": 0.00024271999999999998, "loss": 1.6174, "step": 3580 }, { "epoch": 2.872, "grad_norm": 1.3360211849212646, "learning_rate": 0.00024255999999999998, "loss": 1.5281, "step": 3590 }, { "epoch": 2.88, "grad_norm": 1.4017730951309204, "learning_rate": 0.00024239999999999998, "loss": 1.5762, "step": 3600 }, { "epoch": 2.888, "grad_norm": 1.4613922834396362, "learning_rate": 0.00024223999999999998, "loss": 1.6045, "step": 3610 }, { "epoch": 2.896, "grad_norm": 1.458549976348877, "learning_rate": 0.00024207999999999996, "loss": 1.5711, "step": 3620 }, { "epoch": 2.904, "grad_norm": 1.4020884037017822, "learning_rate": 0.00024192, "loss": 1.5384, "step": 3630 }, { "epoch": 2.912, "grad_norm": 1.306881308555603, "learning_rate": 0.00024175999999999996, "loss": 1.6462, "step": 3640 }, { "epoch": 2.92, "grad_norm": 1.417031168937683, "learning_rate": 0.0002416, "loss": 1.6108, "step": 3650 }, { "epoch": 2.928, "grad_norm": 1.3262524604797363, "learning_rate": 0.00024143999999999997, "loss": 1.6286, "step": 3660 }, { "epoch": 2.936, "grad_norm": 1.4688752889633179, "learning_rate": 0.00024128, "loss": 1.6187, "step": 3670 }, { "epoch": 2.944, "grad_norm": 1.413116216659546, "learning_rate": 0.00024111999999999997, "loss": 1.5804, "step": 3680 }, { "epoch": 2.952, "grad_norm": 1.3572710752487183, "learning_rate": 0.00024096, "loss": 1.5947, "step": 3690 }, { "epoch": 2.96, "grad_norm": 1.4315195083618164, "learning_rate": 0.00024079999999999997, "loss": 1.5673, "step": 3700 }, { "epoch": 2.968, "grad_norm": 1.455693244934082, "learning_rate": 0.00024064, "loss": 1.6311, "step": 3710 }, { "epoch": 2.976, "grad_norm": 1.4192560911178589, "learning_rate": 0.00024047999999999997, "loss": 1.604, "step": 3720 }, { "epoch": 2.984, "grad_norm": 1.4426895380020142, "learning_rate": 0.00024032, "loss": 1.6163, "step": 3730 }, { "epoch": 2.992, "grad_norm": 1.3536198139190674, "learning_rate": 0.00024015999999999998, "loss": 1.6009, "step": 3740 }, { "epoch": 3.0, "grad_norm": 1.4080064296722412, "learning_rate": 0.00023999999999999998, "loss": 1.5628, "step": 3750 }, { "epoch": 3.008, "grad_norm": 1.457725167274475, "learning_rate": 0.00023983999999999998, "loss": 1.3987, "step": 3760 }, { "epoch": 3.016, "grad_norm": 1.5731124877929688, "learning_rate": 0.00023967999999999998, "loss": 1.3801, "step": 3770 }, { "epoch": 3.024, "grad_norm": 1.6412163972854614, "learning_rate": 0.00023951999999999998, "loss": 1.3839, "step": 3780 }, { "epoch": 3.032, "grad_norm": 1.596416711807251, "learning_rate": 0.00023935999999999996, "loss": 1.3565, "step": 3790 }, { "epoch": 3.04, "grad_norm": 1.645661473274231, "learning_rate": 0.0002392, "loss": 1.3738, "step": 3800 }, { "epoch": 3.048, "grad_norm": 1.6392892599105835, "learning_rate": 0.00023903999999999996, "loss": 1.4528, "step": 3810 }, { "epoch": 3.056, "grad_norm": 1.6278529167175293, "learning_rate": 0.00023888, "loss": 1.3921, "step": 3820 }, { "epoch": 3.064, "grad_norm": 1.5314973592758179, "learning_rate": 0.00023871999999999996, "loss": 1.3978, "step": 3830 }, { "epoch": 3.072, "grad_norm": 1.648292899131775, "learning_rate": 0.00023856, "loss": 1.3825, "step": 3840 }, { "epoch": 3.08, "grad_norm": 1.5989805459976196, "learning_rate": 0.00023839999999999997, "loss": 1.4229, "step": 3850 }, { "epoch": 3.088, "grad_norm": 1.581239104270935, "learning_rate": 0.00023824, "loss": 1.4038, "step": 3860 }, { "epoch": 3.096, "grad_norm": 1.6047204732894897, "learning_rate": 0.00023807999999999997, "loss": 1.3652, "step": 3870 }, { "epoch": 3.104, "grad_norm": 1.6853421926498413, "learning_rate": 0.00023792, "loss": 1.4256, "step": 3880 }, { "epoch": 3.112, "grad_norm": 1.531514048576355, "learning_rate": 0.00023775999999999997, "loss": 1.4061, "step": 3890 }, { "epoch": 3.12, "grad_norm": 1.6761980056762695, "learning_rate": 0.0002376, "loss": 1.4205, "step": 3900 }, { "epoch": 3.128, "grad_norm": 1.6190063953399658, "learning_rate": 0.00023743999999999998, "loss": 1.434, "step": 3910 }, { "epoch": 3.136, "grad_norm": 1.6470814943313599, "learning_rate": 0.00023728, "loss": 1.42, "step": 3920 }, { "epoch": 3.144, "grad_norm": 1.5572431087493896, "learning_rate": 0.00023711999999999998, "loss": 1.418, "step": 3930 }, { "epoch": 3.152, "grad_norm": 1.6244536638259888, "learning_rate": 0.00023695999999999998, "loss": 1.4415, "step": 3940 }, { "epoch": 3.16, "grad_norm": 1.7610841989517212, "learning_rate": 0.00023679999999999998, "loss": 1.4249, "step": 3950 }, { "epoch": 3.168, "grad_norm": 1.6935431957244873, "learning_rate": 0.00023663999999999996, "loss": 1.4373, "step": 3960 }, { "epoch": 3.176, "grad_norm": 1.6628581285476685, "learning_rate": 0.00023647999999999999, "loss": 1.4168, "step": 3970 }, { "epoch": 3.184, "grad_norm": 1.6654530763626099, "learning_rate": 0.00023631999999999996, "loss": 1.4028, "step": 3980 }, { "epoch": 3.192, "grad_norm": 1.6582281589508057, "learning_rate": 0.00023616, "loss": 1.4062, "step": 3990 }, { "epoch": 3.2, "grad_norm": 1.7206676006317139, "learning_rate": 0.00023599999999999996, "loss": 1.3972, "step": 4000 }, { "epoch": 3.208, "grad_norm": 1.6492377519607544, "learning_rate": 0.000235856, "loss": 1.4392, "step": 4010 }, { "epoch": 3.216, "grad_norm": 1.8048959970474243, "learning_rate": 0.00023569599999999997, "loss": 1.4523, "step": 4020 }, { "epoch": 3.224, "grad_norm": 1.644860029220581, "learning_rate": 0.000235536, "loss": 1.4168, "step": 4030 }, { "epoch": 3.232, "grad_norm": 1.666577696800232, "learning_rate": 0.00023537599999999998, "loss": 1.5143, "step": 4040 }, { "epoch": 3.24, "grad_norm": 1.6702250242233276, "learning_rate": 0.000235216, "loss": 1.4573, "step": 4050 }, { "epoch": 3.248, "grad_norm": 1.6784839630126953, "learning_rate": 0.00023505599999999998, "loss": 1.4388, "step": 4060 }, { "epoch": 3.2560000000000002, "grad_norm": 1.6922202110290527, "learning_rate": 0.000234896, "loss": 1.4481, "step": 4070 }, { "epoch": 3.2640000000000002, "grad_norm": 1.5728070735931396, "learning_rate": 0.00023473599999999998, "loss": 1.4358, "step": 4080 }, { "epoch": 3.2720000000000002, "grad_norm": 1.6510705947875977, "learning_rate": 0.00023457599999999996, "loss": 1.4615, "step": 4090 }, { "epoch": 3.2800000000000002, "grad_norm": 1.655216097831726, "learning_rate": 0.00023441599999999999, "loss": 1.4087, "step": 4100 }, { "epoch": 3.288, "grad_norm": 1.6214429140090942, "learning_rate": 0.00023425599999999996, "loss": 1.3582, "step": 4110 }, { "epoch": 3.296, "grad_norm": 1.5415211915969849, "learning_rate": 0.000234096, "loss": 1.438, "step": 4120 }, { "epoch": 3.304, "grad_norm": 1.6406790018081665, "learning_rate": 0.00023393599999999996, "loss": 1.4781, "step": 4130 }, { "epoch": 3.312, "grad_norm": 1.7424193620681763, "learning_rate": 0.000233776, "loss": 1.447, "step": 4140 }, { "epoch": 3.32, "grad_norm": 1.5324851274490356, "learning_rate": 0.00023361599999999997, "loss": 1.4415, "step": 4150 }, { "epoch": 3.328, "grad_norm": 1.6945812702178955, "learning_rate": 0.000233456, "loss": 1.4761, "step": 4160 }, { "epoch": 3.336, "grad_norm": 1.7372822761535645, "learning_rate": 0.00023329599999999997, "loss": 1.4504, "step": 4170 }, { "epoch": 3.344, "grad_norm": 1.6869308948516846, "learning_rate": 0.000233136, "loss": 1.4623, "step": 4180 }, { "epoch": 3.352, "grad_norm": 1.7480005025863647, "learning_rate": 0.00023299199999999998, "loss": 1.4184, "step": 4190 }, { "epoch": 3.36, "grad_norm": 1.570494532585144, "learning_rate": 0.00023283199999999996, "loss": 1.3832, "step": 4200 }, { "epoch": 3.368, "grad_norm": 1.8143585920333862, "learning_rate": 0.00023267199999999998, "loss": 1.4252, "step": 4210 }, { "epoch": 3.376, "grad_norm": 1.571781039237976, "learning_rate": 0.00023251199999999996, "loss": 1.4336, "step": 4220 }, { "epoch": 3.384, "grad_norm": 1.6962851285934448, "learning_rate": 0.000232352, "loss": 1.4297, "step": 4230 }, { "epoch": 3.392, "grad_norm": 1.6035798788070679, "learning_rate": 0.00023219199999999996, "loss": 1.451, "step": 4240 }, { "epoch": 3.4, "grad_norm": 1.6665290594100952, "learning_rate": 0.000232032, "loss": 1.4281, "step": 4250 }, { "epoch": 3.408, "grad_norm": 1.8126115798950195, "learning_rate": 0.00023187199999999996, "loss": 1.4711, "step": 4260 }, { "epoch": 3.416, "grad_norm": 1.6531234979629517, "learning_rate": 0.000231712, "loss": 1.4257, "step": 4270 }, { "epoch": 3.424, "grad_norm": 1.7809317111968994, "learning_rate": 0.00023155199999999997, "loss": 1.4205, "step": 4280 }, { "epoch": 3.432, "grad_norm": 1.8113545179367065, "learning_rate": 0.000231392, "loss": 1.4269, "step": 4290 }, { "epoch": 3.44, "grad_norm": 1.6733276844024658, "learning_rate": 0.00023123199999999997, "loss": 1.4178, "step": 4300 }, { "epoch": 3.448, "grad_norm": 1.6381465196609497, "learning_rate": 0.000231072, "loss": 1.4926, "step": 4310 }, { "epoch": 3.456, "grad_norm": 1.6401630640029907, "learning_rate": 0.00023091199999999997, "loss": 1.4817, "step": 4320 }, { "epoch": 3.464, "grad_norm": 1.628445029258728, "learning_rate": 0.000230752, "loss": 1.4935, "step": 4330 }, { "epoch": 3.472, "grad_norm": 1.632957935333252, "learning_rate": 0.00023059199999999998, "loss": 1.4787, "step": 4340 }, { "epoch": 3.48, "grad_norm": 1.7037655115127563, "learning_rate": 0.000230432, "loss": 1.4547, "step": 4350 }, { "epoch": 3.488, "grad_norm": 1.7122141122817993, "learning_rate": 0.00023027199999999998, "loss": 1.4314, "step": 4360 }, { "epoch": 3.496, "grad_norm": 1.620144248008728, "learning_rate": 0.000230112, "loss": 1.4198, "step": 4370 }, { "epoch": 3.504, "grad_norm": 1.604781150817871, "learning_rate": 0.00022995199999999998, "loss": 1.4648, "step": 4380 }, { "epoch": 3.512, "grad_norm": 1.6798850297927856, "learning_rate": 0.00022979199999999996, "loss": 1.4556, "step": 4390 }, { "epoch": 3.52, "grad_norm": 1.7711225748062134, "learning_rate": 0.00022963199999999999, "loss": 1.4741, "step": 4400 }, { "epoch": 3.528, "grad_norm": 1.7546377182006836, "learning_rate": 0.00022947199999999996, "loss": 1.4186, "step": 4410 }, { "epoch": 3.536, "grad_norm": 1.5374271869659424, "learning_rate": 0.000229312, "loss": 1.4726, "step": 4420 }, { "epoch": 3.544, "grad_norm": 1.691049575805664, "learning_rate": 0.00022915199999999996, "loss": 1.4302, "step": 4430 }, { "epoch": 3.552, "grad_norm": 1.8386030197143555, "learning_rate": 0.000228992, "loss": 1.4855, "step": 4440 }, { "epoch": 3.56, "grad_norm": 1.659847617149353, "learning_rate": 0.00022883199999999997, "loss": 1.4818, "step": 4450 }, { "epoch": 3.568, "grad_norm": 1.693474292755127, "learning_rate": 0.000228672, "loss": 1.4226, "step": 4460 }, { "epoch": 3.576, "grad_norm": 1.6777136325836182, "learning_rate": 0.00022851199999999997, "loss": 1.4819, "step": 4470 }, { "epoch": 3.584, "grad_norm": 1.6764864921569824, "learning_rate": 0.000228352, "loss": 1.4802, "step": 4480 }, { "epoch": 3.592, "grad_norm": 1.790226697921753, "learning_rate": 0.00022819199999999997, "loss": 1.5031, "step": 4490 }, { "epoch": 3.6, "grad_norm": 1.5997536182403564, "learning_rate": 0.000228032, "loss": 1.4584, "step": 4500 }, { "epoch": 3.608, "grad_norm": 1.6292929649353027, "learning_rate": 0.00022787199999999998, "loss": 1.4389, "step": 4510 }, { "epoch": 3.616, "grad_norm": 1.6309911012649536, "learning_rate": 0.000227712, "loss": 1.4545, "step": 4520 }, { "epoch": 3.624, "grad_norm": 1.6208481788635254, "learning_rate": 0.00022755199999999998, "loss": 1.4653, "step": 4530 }, { "epoch": 3.632, "grad_norm": 1.75088369846344, "learning_rate": 0.000227392, "loss": 1.4934, "step": 4540 }, { "epoch": 3.64, "grad_norm": 1.8166102170944214, "learning_rate": 0.00022723199999999998, "loss": 1.4889, "step": 4550 }, { "epoch": 3.648, "grad_norm": 1.5575486421585083, "learning_rate": 0.00022707199999999996, "loss": 1.4948, "step": 4560 }, { "epoch": 3.656, "grad_norm": 1.6632091999053955, "learning_rate": 0.00022691199999999999, "loss": 1.4832, "step": 4570 }, { "epoch": 3.664, "grad_norm": 1.583553433418274, "learning_rate": 0.00022675199999999996, "loss": 1.492, "step": 4580 }, { "epoch": 3.672, "grad_norm": 1.668994426727295, "learning_rate": 0.000226592, "loss": 1.4978, "step": 4590 }, { "epoch": 3.68, "grad_norm": 1.515479326248169, "learning_rate": 0.00022643199999999996, "loss": 1.4916, "step": 4600 }, { "epoch": 3.6879999999999997, "grad_norm": 1.652949333190918, "learning_rate": 0.000226272, "loss": 1.4584, "step": 4610 }, { "epoch": 3.6959999999999997, "grad_norm": 1.6760021448135376, "learning_rate": 0.00022611199999999997, "loss": 1.5198, "step": 4620 }, { "epoch": 3.7039999999999997, "grad_norm": 1.5702306032180786, "learning_rate": 0.000225952, "loss": 1.4703, "step": 4630 }, { "epoch": 3.7119999999999997, "grad_norm": 1.4395867586135864, "learning_rate": 0.00022579199999999997, "loss": 1.5072, "step": 4640 }, { "epoch": 3.7199999999999998, "grad_norm": 1.5167125463485718, "learning_rate": 0.000225632, "loss": 1.473, "step": 4650 }, { "epoch": 3.7279999999999998, "grad_norm": 1.6731159687042236, "learning_rate": 0.00022547199999999997, "loss": 1.4886, "step": 4660 }, { "epoch": 3.7359999999999998, "grad_norm": 1.6189254522323608, "learning_rate": 0.000225312, "loss": 1.4699, "step": 4670 }, { "epoch": 3.7439999999999998, "grad_norm": 1.5746572017669678, "learning_rate": 0.00022515199999999997, "loss": 1.5175, "step": 4680 }, { "epoch": 3.752, "grad_norm": 1.7097058296203613, "learning_rate": 0.000224992, "loss": 1.4978, "step": 4690 }, { "epoch": 3.76, "grad_norm": 1.6119595766067505, "learning_rate": 0.00022483199999999998, "loss": 1.4492, "step": 4700 }, { "epoch": 3.768, "grad_norm": 1.736672043800354, "learning_rate": 0.000224672, "loss": 1.4344, "step": 4710 }, { "epoch": 3.776, "grad_norm": 1.7588441371917725, "learning_rate": 0.00022451199999999998, "loss": 1.4563, "step": 4720 }, { "epoch": 3.784, "grad_norm": 1.6483169794082642, "learning_rate": 0.00022435199999999996, "loss": 1.525, "step": 4730 }, { "epoch": 3.792, "grad_norm": 1.5439528226852417, "learning_rate": 0.00022419199999999998, "loss": 1.4263, "step": 4740 }, { "epoch": 3.8, "grad_norm": 1.5422582626342773, "learning_rate": 0.00022403199999999996, "loss": 1.4792, "step": 4750 }, { "epoch": 3.808, "grad_norm": 1.580538272857666, "learning_rate": 0.000223872, "loss": 1.5005, "step": 4760 }, { "epoch": 3.816, "grad_norm": 1.5790603160858154, "learning_rate": 0.00022371199999999996, "loss": 1.4742, "step": 4770 }, { "epoch": 3.824, "grad_norm": 1.597711443901062, "learning_rate": 0.000223552, "loss": 1.4576, "step": 4780 }, { "epoch": 3.832, "grad_norm": 1.7034629583358765, "learning_rate": 0.00022339199999999996, "loss": 1.4753, "step": 4790 }, { "epoch": 3.84, "grad_norm": 1.6988534927368164, "learning_rate": 0.000223232, "loss": 1.4657, "step": 4800 }, { "epoch": 3.848, "grad_norm": 1.5512536764144897, "learning_rate": 0.00022307199999999997, "loss": 1.5351, "step": 4810 }, { "epoch": 3.856, "grad_norm": 1.6742076873779297, "learning_rate": 0.000222912, "loss": 1.5108, "step": 4820 }, { "epoch": 3.864, "grad_norm": 1.6922365427017212, "learning_rate": 0.00022275199999999997, "loss": 1.4979, "step": 4830 }, { "epoch": 3.872, "grad_norm": 1.6407732963562012, "learning_rate": 0.000222592, "loss": 1.535, "step": 4840 }, { "epoch": 3.88, "grad_norm": 1.7077395915985107, "learning_rate": 0.00022243199999999997, "loss": 1.4981, "step": 4850 }, { "epoch": 3.888, "grad_norm": 1.7150638103485107, "learning_rate": 0.000222272, "loss": 1.5042, "step": 4860 }, { "epoch": 3.896, "grad_norm": 1.5963282585144043, "learning_rate": 0.00022211199999999998, "loss": 1.4823, "step": 4870 }, { "epoch": 3.904, "grad_norm": 1.5717283487319946, "learning_rate": 0.000221952, "loss": 1.4598, "step": 4880 }, { "epoch": 3.912, "grad_norm": 1.6642472743988037, "learning_rate": 0.00022179199999999998, "loss": 1.5055, "step": 4890 }, { "epoch": 3.92, "grad_norm": 1.6903674602508545, "learning_rate": 0.00022163199999999995, "loss": 1.4801, "step": 4900 }, { "epoch": 3.928, "grad_norm": 1.5324851274490356, "learning_rate": 0.00022147199999999998, "loss": 1.4673, "step": 4910 }, { "epoch": 3.936, "grad_norm": 1.6329302787780762, "learning_rate": 0.00022131199999999996, "loss": 1.5142, "step": 4920 }, { "epoch": 3.944, "grad_norm": 1.74014413356781, "learning_rate": 0.00022115199999999999, "loss": 1.5035, "step": 4930 }, { "epoch": 3.952, "grad_norm": 1.7019540071487427, "learning_rate": 0.00022099199999999996, "loss": 1.4721, "step": 4940 }, { "epoch": 3.96, "grad_norm": 1.8085479736328125, "learning_rate": 0.000220832, "loss": 1.5223, "step": 4950 }, { "epoch": 3.968, "grad_norm": 1.5533138513565063, "learning_rate": 0.00022067199999999996, "loss": 1.5005, "step": 4960 }, { "epoch": 3.976, "grad_norm": 1.5848819017410278, "learning_rate": 0.000220512, "loss": 1.4882, "step": 4970 }, { "epoch": 3.984, "grad_norm": 1.7058250904083252, "learning_rate": 0.00022035199999999997, "loss": 1.5222, "step": 4980 }, { "epoch": 3.992, "grad_norm": 1.6337260007858276, "learning_rate": 0.000220192, "loss": 1.5153, "step": 4990 }, { "epoch": 4.0, "grad_norm": 1.6282905340194702, "learning_rate": 0.00022003199999999997, "loss": 1.5281, "step": 5000 }, { "epoch": 4.008, "grad_norm": 1.752145528793335, "learning_rate": 0.000219872, "loss": 1.2938, "step": 5010 }, { "epoch": 4.016, "grad_norm": 1.7552149295806885, "learning_rate": 0.00021971199999999997, "loss": 1.2279, "step": 5020 }, { "epoch": 4.024, "grad_norm": 1.7727724313735962, "learning_rate": 0.000219552, "loss": 1.2764, "step": 5030 }, { "epoch": 4.032, "grad_norm": 1.9101881980895996, "learning_rate": 0.00021939199999999998, "loss": 1.2379, "step": 5040 }, { "epoch": 4.04, "grad_norm": 1.8460677862167358, "learning_rate": 0.000219232, "loss": 1.2328, "step": 5050 }, { "epoch": 4.048, "grad_norm": 1.869718313217163, "learning_rate": 0.00021907199999999998, "loss": 1.3119, "step": 5060 }, { "epoch": 4.056, "grad_norm": 1.789228916168213, "learning_rate": 0.00021891199999999995, "loss": 1.3397, "step": 5070 }, { "epoch": 4.064, "grad_norm": 1.9012354612350464, "learning_rate": 0.00021875199999999998, "loss": 1.2709, "step": 5080 }, { "epoch": 4.072, "grad_norm": 1.9376635551452637, "learning_rate": 0.00021859199999999996, "loss": 1.3102, "step": 5090 }, { "epoch": 4.08, "grad_norm": 2.017179489135742, "learning_rate": 0.00021843199999999998, "loss": 1.2612, "step": 5100 }, { "epoch": 4.088, "grad_norm": 1.9067007303237915, "learning_rate": 0.00021827199999999996, "loss": 1.2686, "step": 5110 }, { "epoch": 4.096, "grad_norm": 1.8910828828811646, "learning_rate": 0.000218112, "loss": 1.3517, "step": 5120 }, { "epoch": 4.104, "grad_norm": 2.0437941551208496, "learning_rate": 0.00021795199999999996, "loss": 1.3021, "step": 5130 }, { "epoch": 4.112, "grad_norm": 1.896239995956421, "learning_rate": 0.000217792, "loss": 1.2559, "step": 5140 }, { "epoch": 4.12, "grad_norm": 1.965014100074768, "learning_rate": 0.00021763199999999997, "loss": 1.2925, "step": 5150 }, { "epoch": 4.128, "grad_norm": 1.9773045778274536, "learning_rate": 0.000217472, "loss": 1.2994, "step": 5160 }, { "epoch": 4.136, "grad_norm": 1.7610217332839966, "learning_rate": 0.00021731199999999997, "loss": 1.2904, "step": 5170 }, { "epoch": 4.144, "grad_norm": 2.0166215896606445, "learning_rate": 0.000217152, "loss": 1.3171, "step": 5180 }, { "epoch": 4.152, "grad_norm": 1.8862032890319824, "learning_rate": 0.00021699199999999997, "loss": 1.3267, "step": 5190 }, { "epoch": 4.16, "grad_norm": 1.7716232538223267, "learning_rate": 0.000216832, "loss": 1.3342, "step": 5200 }, { "epoch": 4.168, "grad_norm": 1.8332161903381348, "learning_rate": 0.00021667199999999997, "loss": 1.2916, "step": 5210 }, { "epoch": 4.176, "grad_norm": 1.9238322973251343, "learning_rate": 0.000216512, "loss": 1.3271, "step": 5220 }, { "epoch": 4.184, "grad_norm": 1.7780416011810303, "learning_rate": 0.00021635199999999998, "loss": 1.321, "step": 5230 }, { "epoch": 4.192, "grad_norm": 1.985548973083496, "learning_rate": 0.00021619199999999995, "loss": 1.3342, "step": 5240 }, { "epoch": 4.2, "grad_norm": 1.9339617490768433, "learning_rate": 0.00021603199999999998, "loss": 1.3496, "step": 5250 }, { "epoch": 4.208, "grad_norm": 1.7527296543121338, "learning_rate": 0.00021587199999999996, "loss": 1.2671, "step": 5260 }, { "epoch": 4.216, "grad_norm": 1.8272658586502075, "learning_rate": 0.00021571199999999998, "loss": 1.2471, "step": 5270 }, { "epoch": 4.224, "grad_norm": 1.87795090675354, "learning_rate": 0.00021555199999999996, "loss": 1.2631, "step": 5280 }, { "epoch": 4.232, "grad_norm": 1.9426238536834717, "learning_rate": 0.000215392, "loss": 1.2649, "step": 5290 }, { "epoch": 4.24, "grad_norm": 1.819056510925293, "learning_rate": 0.00021523199999999996, "loss": 1.3185, "step": 5300 }, { "epoch": 4.248, "grad_norm": 1.9573816061019897, "learning_rate": 0.000215072, "loss": 1.3122, "step": 5310 }, { "epoch": 4.256, "grad_norm": 1.919756531715393, "learning_rate": 0.00021491199999999996, "loss": 1.329, "step": 5320 }, { "epoch": 4.264, "grad_norm": 1.9141929149627686, "learning_rate": 0.000214752, "loss": 1.2475, "step": 5330 }, { "epoch": 4.272, "grad_norm": 2.0552964210510254, "learning_rate": 0.00021459199999999997, "loss": 1.3471, "step": 5340 }, { "epoch": 4.28, "grad_norm": 1.90670645236969, "learning_rate": 0.000214432, "loss": 1.3266, "step": 5350 }, { "epoch": 4.288, "grad_norm": 2.149916410446167, "learning_rate": 0.00021427199999999997, "loss": 1.3484, "step": 5360 }, { "epoch": 4.296, "grad_norm": 1.805012822151184, "learning_rate": 0.000214112, "loss": 1.3079, "step": 5370 }, { "epoch": 4.304, "grad_norm": 1.9589773416519165, "learning_rate": 0.00021395199999999997, "loss": 1.36, "step": 5380 }, { "epoch": 4.312, "grad_norm": 2.037567138671875, "learning_rate": 0.000213792, "loss": 1.2982, "step": 5390 }, { "epoch": 4.32, "grad_norm": 1.8047535419464111, "learning_rate": 0.00021363199999999998, "loss": 1.3386, "step": 5400 }, { "epoch": 4.328, "grad_norm": 1.9072496891021729, "learning_rate": 0.00021347199999999998, "loss": 1.2602, "step": 5410 }, { "epoch": 4.336, "grad_norm": 1.9491392374038696, "learning_rate": 0.00021331199999999998, "loss": 1.3797, "step": 5420 }, { "epoch": 4.344, "grad_norm": 2.073835611343384, "learning_rate": 0.00021315199999999995, "loss": 1.3107, "step": 5430 }, { "epoch": 4.352, "grad_norm": 1.936270833015442, "learning_rate": 0.00021299199999999998, "loss": 1.3318, "step": 5440 }, { "epoch": 4.36, "grad_norm": 1.9866790771484375, "learning_rate": 0.00021283199999999996, "loss": 1.273, "step": 5450 }, { "epoch": 4.368, "grad_norm": 2.0993947982788086, "learning_rate": 0.00021267199999999999, "loss": 1.3157, "step": 5460 }, { "epoch": 4.376, "grad_norm": 1.937992811203003, "learning_rate": 0.00021251199999999996, "loss": 1.2858, "step": 5470 }, { "epoch": 4.384, "grad_norm": 1.7649872303009033, "learning_rate": 0.000212352, "loss": 1.3118, "step": 5480 }, { "epoch": 4.392, "grad_norm": 1.8896372318267822, "learning_rate": 0.00021219199999999996, "loss": 1.3051, "step": 5490 }, { "epoch": 4.4, "grad_norm": 1.9377533197402954, "learning_rate": 0.000212032, "loss": 1.3561, "step": 5500 }, { "epoch": 4.408, "grad_norm": 2.079291820526123, "learning_rate": 0.00021187199999999997, "loss": 1.3083, "step": 5510 }, { "epoch": 4.416, "grad_norm": 2.0022287368774414, "learning_rate": 0.000211712, "loss": 1.3287, "step": 5520 }, { "epoch": 4.424, "grad_norm": 1.80183744430542, "learning_rate": 0.00021155199999999997, "loss": 1.3599, "step": 5530 }, { "epoch": 4.432, "grad_norm": 1.9421368837356567, "learning_rate": 0.000211392, "loss": 1.3661, "step": 5540 }, { "epoch": 4.44, "grad_norm": 1.9392564296722412, "learning_rate": 0.00021123199999999997, "loss": 1.3463, "step": 5550 }, { "epoch": 4.448, "grad_norm": 2.102717638015747, "learning_rate": 0.000211072, "loss": 1.3544, "step": 5560 }, { "epoch": 4.456, "grad_norm": 1.9294030666351318, "learning_rate": 0.00021091199999999998, "loss": 1.3765, "step": 5570 }, { "epoch": 4.464, "grad_norm": 1.8542896509170532, "learning_rate": 0.00021075199999999998, "loss": 1.3624, "step": 5580 }, { "epoch": 4.4719999999999995, "grad_norm": 2.159574031829834, "learning_rate": 0.00021059199999999998, "loss": 1.3452, "step": 5590 }, { "epoch": 4.48, "grad_norm": 2.136308193206787, "learning_rate": 0.00021043199999999998, "loss": 1.3514, "step": 5600 }, { "epoch": 4.4879999999999995, "grad_norm": 1.959116816520691, "learning_rate": 0.00021027199999999998, "loss": 1.3332, "step": 5610 }, { "epoch": 4.496, "grad_norm": 1.9541338682174683, "learning_rate": 0.00021011199999999996, "loss": 1.3396, "step": 5620 }, { "epoch": 4.504, "grad_norm": 1.9139293432235718, "learning_rate": 0.00020995199999999998, "loss": 1.3306, "step": 5630 }, { "epoch": 4.5120000000000005, "grad_norm": 2.0729434490203857, "learning_rate": 0.00020979199999999996, "loss": 1.3385, "step": 5640 }, { "epoch": 4.52, "grad_norm": 1.9547297954559326, "learning_rate": 0.000209632, "loss": 1.3378, "step": 5650 }, { "epoch": 4.5280000000000005, "grad_norm": 2.0007593631744385, "learning_rate": 0.00020947199999999996, "loss": 1.3744, "step": 5660 }, { "epoch": 4.536, "grad_norm": 1.841583251953125, "learning_rate": 0.000209312, "loss": 1.3461, "step": 5670 }, { "epoch": 4.5440000000000005, "grad_norm": 1.950011968612671, "learning_rate": 0.00020915199999999997, "loss": 1.3898, "step": 5680 }, { "epoch": 4.552, "grad_norm": 1.9242889881134033, "learning_rate": 0.000208992, "loss": 1.375, "step": 5690 }, { "epoch": 4.5600000000000005, "grad_norm": 2.023679733276367, "learning_rate": 0.00020883199999999997, "loss": 1.3547, "step": 5700 }, { "epoch": 4.568, "grad_norm": 1.96961510181427, "learning_rate": 0.000208672, "loss": 1.33, "step": 5710 }, { "epoch": 4.576, "grad_norm": 1.9337737560272217, "learning_rate": 0.00020851199999999997, "loss": 1.338, "step": 5720 }, { "epoch": 4.584, "grad_norm": 1.9906611442565918, "learning_rate": 0.000208352, "loss": 1.3979, "step": 5730 }, { "epoch": 4.592, "grad_norm": 1.819471001625061, "learning_rate": 0.00020819199999999997, "loss": 1.375, "step": 5740 }, { "epoch": 4.6, "grad_norm": 1.9368617534637451, "learning_rate": 0.00020803199999999998, "loss": 1.3637, "step": 5750 }, { "epoch": 4.608, "grad_norm": 1.9653687477111816, "learning_rate": 0.00020787199999999998, "loss": 1.3554, "step": 5760 }, { "epoch": 4.616, "grad_norm": 1.9763808250427246, "learning_rate": 0.00020771199999999998, "loss": 1.3437, "step": 5770 }, { "epoch": 4.624, "grad_norm": 1.8649840354919434, "learning_rate": 0.00020755199999999998, "loss": 1.3624, "step": 5780 }, { "epoch": 4.632, "grad_norm": 1.828291893005371, "learning_rate": 0.00020739199999999998, "loss": 1.378, "step": 5790 }, { "epoch": 4.64, "grad_norm": 1.8722482919692993, "learning_rate": 0.00020723199999999998, "loss": 1.3548, "step": 5800 }, { "epoch": 4.648, "grad_norm": 2.2012381553649902, "learning_rate": 0.00020707199999999996, "loss": 1.3698, "step": 5810 }, { "epoch": 4.656, "grad_norm": 1.9233702421188354, "learning_rate": 0.000206912, "loss": 1.3798, "step": 5820 }, { "epoch": 4.664, "grad_norm": 1.9627357721328735, "learning_rate": 0.00020675199999999996, "loss": 1.424, "step": 5830 }, { "epoch": 4.672, "grad_norm": 1.8615745306015015, "learning_rate": 0.000206592, "loss": 1.3909, "step": 5840 }, { "epoch": 4.68, "grad_norm": 1.9583446979522705, "learning_rate": 0.00020643199999999996, "loss": 1.3946, "step": 5850 }, { "epoch": 4.688, "grad_norm": 1.9457666873931885, "learning_rate": 0.000206272, "loss": 1.372, "step": 5860 }, { "epoch": 4.696, "grad_norm": 1.8619425296783447, "learning_rate": 0.00020611199999999997, "loss": 1.3787, "step": 5870 }, { "epoch": 4.704, "grad_norm": 1.9508628845214844, "learning_rate": 0.000205952, "loss": 1.371, "step": 5880 }, { "epoch": 4.712, "grad_norm": 1.7727349996566772, "learning_rate": 0.00020579199999999997, "loss": 1.3631, "step": 5890 }, { "epoch": 4.72, "grad_norm": 1.7935019731521606, "learning_rate": 0.000205632, "loss": 1.3514, "step": 5900 }, { "epoch": 4.728, "grad_norm": 1.9109313488006592, "learning_rate": 0.00020547199999999997, "loss": 1.327, "step": 5910 }, { "epoch": 4.736, "grad_norm": 1.9290359020233154, "learning_rate": 0.00020531199999999997, "loss": 1.3574, "step": 5920 }, { "epoch": 4.744, "grad_norm": 2.15079665184021, "learning_rate": 0.00020515199999999998, "loss": 1.3939, "step": 5930 }, { "epoch": 4.752, "grad_norm": 2.0457019805908203, "learning_rate": 0.00020499199999999998, "loss": 1.3558, "step": 5940 }, { "epoch": 4.76, "grad_norm": 1.9548970460891724, "learning_rate": 0.00020483199999999998, "loss": 1.362, "step": 5950 }, { "epoch": 4.768, "grad_norm": 1.791396141052246, "learning_rate": 0.00020467199999999998, "loss": 1.33, "step": 5960 }, { "epoch": 4.776, "grad_norm": 1.8451635837554932, "learning_rate": 0.00020451199999999998, "loss": 1.3507, "step": 5970 }, { "epoch": 4.784, "grad_norm": 1.7999178171157837, "learning_rate": 0.00020435199999999998, "loss": 1.4174, "step": 5980 }, { "epoch": 4.792, "grad_norm": 1.8192893266677856, "learning_rate": 0.00020419199999999999, "loss": 1.3481, "step": 5990 }, { "epoch": 4.8, "grad_norm": 1.9753166437149048, "learning_rate": 0.00020403199999999996, "loss": 1.4223, "step": 6000 }, { "epoch": 4.808, "grad_norm": 1.8800415992736816, "learning_rate": 0.000203872, "loss": 1.3661, "step": 6010 }, { "epoch": 4.816, "grad_norm": 1.8040504455566406, "learning_rate": 0.00020371199999999996, "loss": 1.3519, "step": 6020 }, { "epoch": 4.824, "grad_norm": 1.9058725833892822, "learning_rate": 0.000203552, "loss": 1.3973, "step": 6030 }, { "epoch": 4.832, "grad_norm": 1.7217756509780884, "learning_rate": 0.00020339199999999997, "loss": 1.403, "step": 6040 }, { "epoch": 4.84, "grad_norm": 1.8864495754241943, "learning_rate": 0.000203232, "loss": 1.3468, "step": 6050 }, { "epoch": 4.848, "grad_norm": 2.006610870361328, "learning_rate": 0.00020307199999999997, "loss": 1.3972, "step": 6060 }, { "epoch": 4.856, "grad_norm": 1.9524073600769043, "learning_rate": 0.000202912, "loss": 1.4012, "step": 6070 }, { "epoch": 4.864, "grad_norm": 1.9322147369384766, "learning_rate": 0.00020275199999999997, "loss": 1.3928, "step": 6080 }, { "epoch": 4.872, "grad_norm": 1.929335594177246, "learning_rate": 0.000202592, "loss": 1.3799, "step": 6090 }, { "epoch": 4.88, "grad_norm": 1.8158811330795288, "learning_rate": 0.00020243199999999998, "loss": 1.3967, "step": 6100 }, { "epoch": 4.888, "grad_norm": 1.9702143669128418, "learning_rate": 0.00020227199999999998, "loss": 1.3714, "step": 6110 }, { "epoch": 4.896, "grad_norm": 1.6967090368270874, "learning_rate": 0.00020211199999999998, "loss": 1.3553, "step": 6120 }, { "epoch": 4.904, "grad_norm": 1.7388558387756348, "learning_rate": 0.00020195199999999998, "loss": 1.4053, "step": 6130 }, { "epoch": 4.912, "grad_norm": 1.9453833103179932, "learning_rate": 0.00020179199999999998, "loss": 1.3512, "step": 6140 }, { "epoch": 4.92, "grad_norm": 1.8605188131332397, "learning_rate": 0.00020163199999999998, "loss": 1.3376, "step": 6150 }, { "epoch": 4.928, "grad_norm": 1.9881434440612793, "learning_rate": 0.00020147199999999998, "loss": 1.3877, "step": 6160 }, { "epoch": 4.936, "grad_norm": 1.8932327032089233, "learning_rate": 0.00020131199999999999, "loss": 1.331, "step": 6170 }, { "epoch": 4.944, "grad_norm": 1.8074854612350464, "learning_rate": 0.000201152, "loss": 1.4211, "step": 6180 }, { "epoch": 4.952, "grad_norm": 1.9307423830032349, "learning_rate": 0.00020099199999999996, "loss": 1.3498, "step": 6190 }, { "epoch": 4.96, "grad_norm": 1.949623942375183, "learning_rate": 0.000200832, "loss": 1.3897, "step": 6200 }, { "epoch": 4.968, "grad_norm": 1.7373038530349731, "learning_rate": 0.00020067199999999997, "loss": 1.3696, "step": 6210 }, { "epoch": 4.976, "grad_norm": 1.9628345966339111, "learning_rate": 0.000200512, "loss": 1.3667, "step": 6220 }, { "epoch": 4.984, "grad_norm": 1.9516173601150513, "learning_rate": 0.00020035199999999997, "loss": 1.4143, "step": 6230 }, { "epoch": 4.992, "grad_norm": 1.7527846097946167, "learning_rate": 0.000200192, "loss": 1.3599, "step": 6240 }, { "epoch": 5.0, "grad_norm": 1.9414066076278687, "learning_rate": 0.00020003199999999997, "loss": 1.3942, "step": 6250 }, { "epoch": 5.008, "grad_norm": 1.802935242652893, "learning_rate": 0.000199872, "loss": 1.2225, "step": 6260 }, { "epoch": 5.016, "grad_norm": 2.1949446201324463, "learning_rate": 0.00019971199999999997, "loss": 1.168, "step": 6270 }, { "epoch": 5.024, "grad_norm": 2.1667227745056152, "learning_rate": 0.00019955199999999998, "loss": 1.2283, "step": 6280 }, { "epoch": 5.032, "grad_norm": 2.0180180072784424, "learning_rate": 0.00019939199999999998, "loss": 1.1925, "step": 6290 }, { "epoch": 5.04, "grad_norm": 2.257992744445801, "learning_rate": 0.00019923199999999998, "loss": 1.1695, "step": 6300 }, { "epoch": 5.048, "grad_norm": 2.023444890975952, "learning_rate": 0.00019907199999999998, "loss": 1.1454, "step": 6310 }, { "epoch": 5.056, "grad_norm": 2.1307425498962402, "learning_rate": 0.00019891199999999998, "loss": 1.1645, "step": 6320 }, { "epoch": 5.064, "grad_norm": 2.018718957901001, "learning_rate": 0.00019875199999999998, "loss": 1.1451, "step": 6330 }, { "epoch": 5.072, "grad_norm": 2.158968448638916, "learning_rate": 0.00019859199999999999, "loss": 1.1819, "step": 6340 }, { "epoch": 5.08, "grad_norm": 2.125598907470703, "learning_rate": 0.000198432, "loss": 1.1819, "step": 6350 }, { "epoch": 5.088, "grad_norm": 2.2982337474823, "learning_rate": 0.000198272, "loss": 1.1838, "step": 6360 }, { "epoch": 5.096, "grad_norm": 2.3263471126556396, "learning_rate": 0.000198112, "loss": 1.176, "step": 6370 }, { "epoch": 5.104, "grad_norm": 2.0729761123657227, "learning_rate": 0.00019795199999999996, "loss": 1.1237, "step": 6380 }, { "epoch": 5.112, "grad_norm": 2.302323579788208, "learning_rate": 0.000197792, "loss": 1.1658, "step": 6390 }, { "epoch": 5.12, "grad_norm": 2.1555356979370117, "learning_rate": 0.00019763199999999997, "loss": 1.1943, "step": 6400 }, { "epoch": 5.128, "grad_norm": 2.104564666748047, "learning_rate": 0.000197472, "loss": 1.1776, "step": 6410 }, { "epoch": 5.136, "grad_norm": 2.101271390914917, "learning_rate": 0.00019731199999999997, "loss": 1.2107, "step": 6420 }, { "epoch": 5.144, "grad_norm": 2.1387553215026855, "learning_rate": 0.000197152, "loss": 1.1662, "step": 6430 }, { "epoch": 5.152, "grad_norm": 1.9566245079040527, "learning_rate": 0.00019699199999999997, "loss": 1.1862, "step": 6440 }, { "epoch": 5.16, "grad_norm": 2.1503751277923584, "learning_rate": 0.00019683199999999997, "loss": 1.1739, "step": 6450 }, { "epoch": 5.168, "grad_norm": 2.0225651264190674, "learning_rate": 0.00019668799999999998, "loss": 1.1878, "step": 6460 }, { "epoch": 5.176, "grad_norm": 2.179147481918335, "learning_rate": 0.00019652799999999999, "loss": 1.1973, "step": 6470 }, { "epoch": 5.184, "grad_norm": 2.376354932785034, "learning_rate": 0.000196368, "loss": 1.1756, "step": 6480 }, { "epoch": 5.192, "grad_norm": 2.143554449081421, "learning_rate": 0.000196208, "loss": 1.2341, "step": 6490 }, { "epoch": 5.2, "grad_norm": 2.128620147705078, "learning_rate": 0.000196048, "loss": 1.1808, "step": 6500 }, { "epoch": 5.208, "grad_norm": 2.025129556655884, "learning_rate": 0.00019588799999999996, "loss": 1.1889, "step": 6510 }, { "epoch": 5.216, "grad_norm": 2.1475353240966797, "learning_rate": 0.000195728, "loss": 1.2154, "step": 6520 }, { "epoch": 5.224, "grad_norm": 2.032588005065918, "learning_rate": 0.00019556799999999997, "loss": 1.2046, "step": 6530 }, { "epoch": 5.232, "grad_norm": 2.2672226428985596, "learning_rate": 0.000195408, "loss": 1.1553, "step": 6540 }, { "epoch": 5.24, "grad_norm": 2.2911875247955322, "learning_rate": 0.00019524799999999997, "loss": 1.2179, "step": 6550 }, { "epoch": 5.248, "grad_norm": 2.0162782669067383, "learning_rate": 0.00019508799999999997, "loss": 1.2399, "step": 6560 }, { "epoch": 5.256, "grad_norm": 2.193554639816284, "learning_rate": 0.00019492799999999997, "loss": 1.1745, "step": 6570 }, { "epoch": 5.264, "grad_norm": 2.104660749435425, "learning_rate": 0.00019476799999999998, "loss": 1.2107, "step": 6580 }, { "epoch": 5.272, "grad_norm": 2.141188621520996, "learning_rate": 0.00019460799999999998, "loss": 1.2222, "step": 6590 }, { "epoch": 5.28, "grad_norm": 2.184913158416748, "learning_rate": 0.00019444799999999998, "loss": 1.2103, "step": 6600 }, { "epoch": 5.288, "grad_norm": 2.3275797367095947, "learning_rate": 0.00019428799999999998, "loss": 1.2263, "step": 6610 }, { "epoch": 5.296, "grad_norm": 2.2514960765838623, "learning_rate": 0.00019412799999999998, "loss": 1.2263, "step": 6620 }, { "epoch": 5.304, "grad_norm": 2.335054874420166, "learning_rate": 0.00019396799999999998, "loss": 1.2105, "step": 6630 }, { "epoch": 5.312, "grad_norm": 2.0840258598327637, "learning_rate": 0.00019380799999999998, "loss": 1.2322, "step": 6640 }, { "epoch": 5.32, "grad_norm": 2.2909815311431885, "learning_rate": 0.00019364799999999999, "loss": 1.2027, "step": 6650 }, { "epoch": 5.328, "grad_norm": 2.076932668685913, "learning_rate": 0.000193488, "loss": 1.2039, "step": 6660 }, { "epoch": 5.336, "grad_norm": 2.017833948135376, "learning_rate": 0.000193328, "loss": 1.1752, "step": 6670 }, { "epoch": 5.344, "grad_norm": 2.242431879043579, "learning_rate": 0.000193168, "loss": 1.2351, "step": 6680 }, { "epoch": 5.352, "grad_norm": 2.0976057052612305, "learning_rate": 0.000193008, "loss": 1.2151, "step": 6690 }, { "epoch": 5.36, "grad_norm": 2.2112200260162354, "learning_rate": 0.00019284799999999997, "loss": 1.2196, "step": 6700 }, { "epoch": 5.368, "grad_norm": 2.1883575916290283, "learning_rate": 0.000192688, "loss": 1.2368, "step": 6710 }, { "epoch": 5.376, "grad_norm": 2.3068554401397705, "learning_rate": 0.00019252799999999997, "loss": 1.2621, "step": 6720 }, { "epoch": 5.384, "grad_norm": 2.039863109588623, "learning_rate": 0.00019236799999999997, "loss": 1.2687, "step": 6730 }, { "epoch": 5.392, "grad_norm": 2.26802396774292, "learning_rate": 0.00019220799999999997, "loss": 1.1788, "step": 6740 }, { "epoch": 5.4, "grad_norm": 2.010828733444214, "learning_rate": 0.00019204799999999997, "loss": 1.2232, "step": 6750 }, { "epoch": 5.408, "grad_norm": 2.1727616786956787, "learning_rate": 0.00019188799999999998, "loss": 1.2625, "step": 6760 }, { "epoch": 5.416, "grad_norm": 2.030134439468384, "learning_rate": 0.00019172799999999998, "loss": 1.2391, "step": 6770 }, { "epoch": 5.424, "grad_norm": 2.2361104488372803, "learning_rate": 0.00019156799999999998, "loss": 1.2329, "step": 6780 }, { "epoch": 5.432, "grad_norm": 2.1066739559173584, "learning_rate": 0.00019140799999999998, "loss": 1.2456, "step": 6790 }, { "epoch": 5.44, "grad_norm": 2.1428840160369873, "learning_rate": 0.00019124799999999998, "loss": 1.1813, "step": 6800 }, { "epoch": 5.448, "grad_norm": 2.3433635234832764, "learning_rate": 0.00019108799999999998, "loss": 1.2672, "step": 6810 }, { "epoch": 5.456, "grad_norm": 2.185671091079712, "learning_rate": 0.00019092799999999999, "loss": 1.2162, "step": 6820 }, { "epoch": 5.464, "grad_norm": 2.205509662628174, "learning_rate": 0.000190768, "loss": 1.2383, "step": 6830 }, { "epoch": 5.4719999999999995, "grad_norm": 2.428114891052246, "learning_rate": 0.000190608, "loss": 1.3059, "step": 6840 }, { "epoch": 5.48, "grad_norm": 2.135251998901367, "learning_rate": 0.000190448, "loss": 1.2053, "step": 6850 }, { "epoch": 5.4879999999999995, "grad_norm": 2.074209213256836, "learning_rate": 0.000190288, "loss": 1.194, "step": 6860 }, { "epoch": 5.496, "grad_norm": 2.0454697608947754, "learning_rate": 0.000190128, "loss": 1.232, "step": 6870 }, { "epoch": 5.504, "grad_norm": 1.9665228128433228, "learning_rate": 0.000189968, "loss": 1.2077, "step": 6880 }, { "epoch": 5.5120000000000005, "grad_norm": 2.0836398601531982, "learning_rate": 0.00018980799999999997, "loss": 1.186, "step": 6890 }, { "epoch": 5.52, "grad_norm": 2.0634419918060303, "learning_rate": 0.000189648, "loss": 1.2539, "step": 6900 }, { "epoch": 5.5280000000000005, "grad_norm": 2.2017769813537598, "learning_rate": 0.00018948799999999997, "loss": 1.2484, "step": 6910 }, { "epoch": 5.536, "grad_norm": 2.193028450012207, "learning_rate": 0.00018932799999999997, "loss": 1.2916, "step": 6920 }, { "epoch": 5.5440000000000005, "grad_norm": 2.163944721221924, "learning_rate": 0.00018916799999999998, "loss": 1.2706, "step": 6930 }, { "epoch": 5.552, "grad_norm": 2.214864730834961, "learning_rate": 0.00018900799999999998, "loss": 1.2626, "step": 6940 }, { "epoch": 5.5600000000000005, "grad_norm": 2.167754888534546, "learning_rate": 0.00018884799999999998, "loss": 1.2654, "step": 6950 }, { "epoch": 5.568, "grad_norm": 2.114359140396118, "learning_rate": 0.00018868799999999998, "loss": 1.2345, "step": 6960 }, { "epoch": 5.576, "grad_norm": 2.2773566246032715, "learning_rate": 0.00018852799999999998, "loss": 1.2244, "step": 6970 }, { "epoch": 5.584, "grad_norm": 2.1949045658111572, "learning_rate": 0.00018836799999999998, "loss": 1.2508, "step": 6980 }, { "epoch": 5.592, "grad_norm": 2.0954575538635254, "learning_rate": 0.00018820799999999998, "loss": 1.2387, "step": 6990 }, { "epoch": 5.6, "grad_norm": 2.1742050647735596, "learning_rate": 0.00018804799999999999, "loss": 1.2407, "step": 7000 }, { "epoch": 5.608, "grad_norm": 2.1627070903778076, "learning_rate": 0.000187888, "loss": 1.1706, "step": 7010 }, { "epoch": 5.616, "grad_norm": 2.1110544204711914, "learning_rate": 0.000187728, "loss": 1.2538, "step": 7020 }, { "epoch": 5.624, "grad_norm": 2.255958318710327, "learning_rate": 0.000187568, "loss": 1.209, "step": 7030 }, { "epoch": 5.632, "grad_norm": 2.2075769901275635, "learning_rate": 0.000187408, "loss": 1.2942, "step": 7040 }, { "epoch": 5.64, "grad_norm": 1.964128851890564, "learning_rate": 0.000187248, "loss": 1.2519, "step": 7050 }, { "epoch": 5.648, "grad_norm": 2.2681636810302734, "learning_rate": 0.000187088, "loss": 1.2993, "step": 7060 }, { "epoch": 5.656, "grad_norm": 2.313188076019287, "learning_rate": 0.000186928, "loss": 1.2438, "step": 7070 }, { "epoch": 5.664, "grad_norm": 2.369359254837036, "learning_rate": 0.00018676799999999997, "loss": 1.2157, "step": 7080 }, { "epoch": 5.672, "grad_norm": 2.2245047092437744, "learning_rate": 0.00018660799999999997, "loss": 1.2857, "step": 7090 }, { "epoch": 5.68, "grad_norm": 2.058401107788086, "learning_rate": 0.00018644799999999997, "loss": 1.2045, "step": 7100 }, { "epoch": 5.688, "grad_norm": 2.2531964778900146, "learning_rate": 0.00018628799999999998, "loss": 1.2493, "step": 7110 }, { "epoch": 5.696, "grad_norm": 2.2315497398376465, "learning_rate": 0.00018612799999999998, "loss": 1.235, "step": 7120 }, { "epoch": 5.704, "grad_norm": 2.018808603286743, "learning_rate": 0.00018596799999999998, "loss": 1.2016, "step": 7130 }, { "epoch": 5.712, "grad_norm": 2.0911753177642822, "learning_rate": 0.00018580799999999998, "loss": 1.2624, "step": 7140 }, { "epoch": 5.72, "grad_norm": 2.147120475769043, "learning_rate": 0.00018564799999999998, "loss": 1.2376, "step": 7150 }, { "epoch": 5.728, "grad_norm": 2.1546943187713623, "learning_rate": 0.00018548799999999998, "loss": 1.2286, "step": 7160 }, { "epoch": 5.736, "grad_norm": 2.0924603939056396, "learning_rate": 0.00018532799999999998, "loss": 1.2964, "step": 7170 }, { "epoch": 5.744, "grad_norm": 2.337070941925049, "learning_rate": 0.00018516799999999999, "loss": 1.2699, "step": 7180 }, { "epoch": 5.752, "grad_norm": 2.4989166259765625, "learning_rate": 0.000185008, "loss": 1.2631, "step": 7190 }, { "epoch": 5.76, "grad_norm": 2.3049070835113525, "learning_rate": 0.000184848, "loss": 1.2879, "step": 7200 }, { "epoch": 5.768, "grad_norm": 2.328397274017334, "learning_rate": 0.000184688, "loss": 1.2624, "step": 7210 }, { "epoch": 5.776, "grad_norm": 2.147589921951294, "learning_rate": 0.000184528, "loss": 1.2825, "step": 7220 }, { "epoch": 5.784, "grad_norm": 2.348174571990967, "learning_rate": 0.000184368, "loss": 1.2751, "step": 7230 }, { "epoch": 5.792, "grad_norm": 2.270873785018921, "learning_rate": 0.000184208, "loss": 1.2714, "step": 7240 }, { "epoch": 5.8, "grad_norm": 2.289658308029175, "learning_rate": 0.000184048, "loss": 1.2412, "step": 7250 }, { "epoch": 5.808, "grad_norm": 2.3569588661193848, "learning_rate": 0.00018388799999999997, "loss": 1.2714, "step": 7260 }, { "epoch": 5.816, "grad_norm": 2.372729539871216, "learning_rate": 0.00018372799999999997, "loss": 1.2824, "step": 7270 }, { "epoch": 5.824, "grad_norm": 2.3369643688201904, "learning_rate": 0.00018356799999999997, "loss": 1.2897, "step": 7280 }, { "epoch": 5.832, "grad_norm": 2.149664878845215, "learning_rate": 0.00018340799999999998, "loss": 1.2411, "step": 7290 }, { "epoch": 5.84, "grad_norm": 2.1661763191223145, "learning_rate": 0.00018324799999999998, "loss": 1.2484, "step": 7300 }, { "epoch": 5.848, "grad_norm": 2.2296934127807617, "learning_rate": 0.00018308799999999998, "loss": 1.2713, "step": 7310 }, { "epoch": 5.856, "grad_norm": 2.0819859504699707, "learning_rate": 0.00018292799999999998, "loss": 1.2636, "step": 7320 }, { "epoch": 5.864, "grad_norm": 2.158386468887329, "learning_rate": 0.00018276799999999998, "loss": 1.2886, "step": 7330 }, { "epoch": 5.872, "grad_norm": 2.1622161865234375, "learning_rate": 0.00018260799999999998, "loss": 1.2663, "step": 7340 }, { "epoch": 5.88, "grad_norm": 2.1625213623046875, "learning_rate": 0.00018244799999999999, "loss": 1.3031, "step": 7350 }, { "epoch": 5.888, "grad_norm": 2.1951282024383545, "learning_rate": 0.000182288, "loss": 1.2569, "step": 7360 }, { "epoch": 5.896, "grad_norm": 2.2481329441070557, "learning_rate": 0.000182128, "loss": 1.2376, "step": 7370 }, { "epoch": 5.904, "grad_norm": 2.11740779876709, "learning_rate": 0.000181968, "loss": 1.2642, "step": 7380 }, { "epoch": 5.912, "grad_norm": 2.3954527378082275, "learning_rate": 0.000181808, "loss": 1.3029, "step": 7390 }, { "epoch": 5.92, "grad_norm": 2.222752571105957, "learning_rate": 0.000181648, "loss": 1.3028, "step": 7400 }, { "epoch": 5.928, "grad_norm": 2.15301513671875, "learning_rate": 0.000181488, "loss": 1.3011, "step": 7410 }, { "epoch": 5.936, "grad_norm": 2.27708101272583, "learning_rate": 0.000181328, "loss": 1.2727, "step": 7420 }, { "epoch": 5.944, "grad_norm": 2.1490461826324463, "learning_rate": 0.00018116799999999997, "loss": 1.2968, "step": 7430 }, { "epoch": 5.952, "grad_norm": 2.247800588607788, "learning_rate": 0.000181008, "loss": 1.3003, "step": 7440 }, { "epoch": 5.96, "grad_norm": 2.2584476470947266, "learning_rate": 0.00018084799999999997, "loss": 1.2659, "step": 7450 }, { "epoch": 5.968, "grad_norm": 2.1247005462646484, "learning_rate": 0.00018068799999999997, "loss": 1.26, "step": 7460 }, { "epoch": 5.976, "grad_norm": 2.2989518642425537, "learning_rate": 0.00018052799999999998, "loss": 1.28, "step": 7470 }, { "epoch": 5.984, "grad_norm": 2.3190391063690186, "learning_rate": 0.00018036799999999998, "loss": 1.2763, "step": 7480 }, { "epoch": 5.992, "grad_norm": 2.170459032058716, "learning_rate": 0.00018020799999999998, "loss": 1.2263, "step": 7490 }, { "epoch": 6.0, "grad_norm": 2.1551315784454346, "learning_rate": 0.00018004799999999998, "loss": 1.3013, "step": 7500 } ], "logging_steps": 10, "max_steps": 18750, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 2500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.2603608444698624e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }