{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6051364365971108, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 7.3019086777975994, "learning_rate": 5e-06, "loss": 0.6939, "step": 10 }, { "epoch": 0.01, "grad_norm": 5.150989333266983, "learning_rate": 1e-05, "loss": 0.7167, "step": 20 }, { "epoch": 0.02, "grad_norm": 3.640558809610037, "learning_rate": 1.5e-05, "loss": 0.5683, "step": 30 }, { "epoch": 0.02, "grad_norm": 7.517999397731128, "learning_rate": 2e-05, "loss": 0.5472, "step": 40 }, { "epoch": 0.03, "grad_norm": 1.9687061679463425, "learning_rate": 2.5e-05, "loss": 0.4439, "step": 50 }, { "epoch": 0.03, "grad_norm": 3.643479206523606, "learning_rate": 3e-05, "loss": 0.2486, "step": 60 }, { "epoch": 0.04, "grad_norm": 2.2754773308695095, "learning_rate": 3.5e-05, "loss": 0.2217, "step": 70 }, { "epoch": 0.04, "grad_norm": 1.7144730049127388, "learning_rate": 4e-05, "loss": 0.169, "step": 80 }, { "epoch": 0.05, "grad_norm": 3.4702829704135114, "learning_rate": 4.5e-05, "loss": 0.1994, "step": 90 }, { "epoch": 0.05, "grad_norm": 1.3985127340985621, "learning_rate": 5e-05, "loss": 0.1612, "step": 100 }, { "epoch": 0.06, "grad_norm": 1.375992184386137, "learning_rate": 4.982758620689655e-05, "loss": 0.1576, "step": 110 }, { "epoch": 0.06, "grad_norm": 1.9528635753013313, "learning_rate": 4.9655172413793107e-05, "loss": 0.1393, "step": 120 }, { "epoch": 0.07, "grad_norm": 4.075169010198401, "learning_rate": 4.9482758620689655e-05, "loss": 0.1969, "step": 130 }, { "epoch": 0.07, "grad_norm": 2.0953991165751207, "learning_rate": 4.931034482758621e-05, "loss": 0.1294, "step": 140 }, { "epoch": 0.08, "grad_norm": 1.942660591044849, "learning_rate": 4.913793103448276e-05, "loss": 0.1306, "step": 150 }, { "epoch": 0.09, "grad_norm": 3.1508904015728345, "learning_rate": 4.896551724137931e-05, "loss": 0.1526, "step": 160 }, { "epoch": 0.09, "grad_norm": 1.9862795165358471, "learning_rate": 4.8793103448275864e-05, "loss": 0.1186, "step": 170 }, { "epoch": 0.1, "grad_norm": 2.633061833817991, "learning_rate": 4.862068965517241e-05, "loss": 0.1457, "step": 180 }, { "epoch": 0.1, "grad_norm": 1.8017052368446178, "learning_rate": 4.844827586206897e-05, "loss": 0.1234, "step": 190 }, { "epoch": 0.11, "grad_norm": 2.1560694100709803, "learning_rate": 4.827586206896552e-05, "loss": 0.1346, "step": 200 }, { "epoch": 0.11, "grad_norm": 1.5737689267430703, "learning_rate": 4.810344827586207e-05, "loss": 0.116, "step": 210 }, { "epoch": 0.12, "grad_norm": 1.957864677854788, "learning_rate": 4.793103448275863e-05, "loss": 0.1692, "step": 220 }, { "epoch": 0.12, "grad_norm": 2.215039223521855, "learning_rate": 4.7758620689655176e-05, "loss": 0.1245, "step": 230 }, { "epoch": 0.13, "grad_norm": 1.370517239734168, "learning_rate": 4.7586206896551725e-05, "loss": 0.1476, "step": 240 }, { "epoch": 0.13, "grad_norm": 1.7341334563022532, "learning_rate": 4.741379310344828e-05, "loss": 0.1236, "step": 250 }, { "epoch": 0.14, "grad_norm": 1.5994298113068974, "learning_rate": 4.724137931034483e-05, "loss": 0.1161, "step": 260 }, { "epoch": 0.14, "grad_norm": 1.5317433190951963, "learning_rate": 4.7068965517241385e-05, "loss": 0.1035, "step": 270 }, { "epoch": 0.15, "grad_norm": 2.191977732539556, "learning_rate": 4.689655172413793e-05, "loss": 0.1427, "step": 280 }, { "epoch": 0.16, "grad_norm": 1.6038667570691656, "learning_rate": 4.672413793103448e-05, "loss": 0.1225, "step": 290 }, { "epoch": 0.16, "grad_norm": 2.577572731831179, "learning_rate": 4.655172413793104e-05, "loss": 0.1399, "step": 300 }, { "epoch": 0.17, "grad_norm": 1.6199241001441385, "learning_rate": 4.6379310344827586e-05, "loss": 0.1242, "step": 310 }, { "epoch": 0.17, "grad_norm": 2.236577821186196, "learning_rate": 4.6206896551724135e-05, "loss": 0.1656, "step": 320 }, { "epoch": 0.18, "grad_norm": 1.7294690605254757, "learning_rate": 4.603448275862069e-05, "loss": 0.1382, "step": 330 }, { "epoch": 0.18, "grad_norm": 2.196527516378511, "learning_rate": 4.586206896551724e-05, "loss": 0.1257, "step": 340 }, { "epoch": 0.19, "grad_norm": 2.1057444340221463, "learning_rate": 4.5689655172413794e-05, "loss": 0.1238, "step": 350 }, { "epoch": 0.19, "grad_norm": 1.5409556870328274, "learning_rate": 4.551724137931035e-05, "loss": 0.1383, "step": 360 }, { "epoch": 0.2, "grad_norm": 1.5204083616874053, "learning_rate": 4.53448275862069e-05, "loss": 0.1068, "step": 370 }, { "epoch": 0.2, "grad_norm": 2.3557725298931746, "learning_rate": 4.5172413793103454e-05, "loss": 0.1071, "step": 380 }, { "epoch": 0.21, "grad_norm": 3.2601538460418644, "learning_rate": 4.5e-05, "loss": 0.125, "step": 390 }, { "epoch": 0.21, "grad_norm": 1.9031725385762286, "learning_rate": 4.482758620689655e-05, "loss": 0.0991, "step": 400 }, { "epoch": 0.22, "grad_norm": 1.3946050262183123, "learning_rate": 4.465517241379311e-05, "loss": 0.1156, "step": 410 }, { "epoch": 0.22, "grad_norm": 1.097644875106397, "learning_rate": 4.4482758620689656e-05, "loss": 0.1366, "step": 420 }, { "epoch": 0.23, "grad_norm": 1.37846299019108, "learning_rate": 4.431034482758621e-05, "loss": 0.126, "step": 430 }, { "epoch": 0.24, "grad_norm": 1.8340152889320331, "learning_rate": 4.413793103448276e-05, "loss": 0.1066, "step": 440 }, { "epoch": 0.24, "grad_norm": 1.8304505611337867, "learning_rate": 4.396551724137931e-05, "loss": 0.0868, "step": 450 }, { "epoch": 0.25, "grad_norm": 1.550196490898523, "learning_rate": 4.3793103448275864e-05, "loss": 0.1286, "step": 460 }, { "epoch": 0.25, "grad_norm": 2.176112247796248, "learning_rate": 4.362068965517241e-05, "loss": 0.1206, "step": 470 }, { "epoch": 0.26, "grad_norm": 1.6589263894091213, "learning_rate": 4.344827586206897e-05, "loss": 0.1008, "step": 480 }, { "epoch": 0.26, "grad_norm": 1.8349611508902046, "learning_rate": 4.327586206896552e-05, "loss": 0.1198, "step": 490 }, { "epoch": 0.27, "grad_norm": 2.1218964920724126, "learning_rate": 4.3103448275862066e-05, "loss": 0.1166, "step": 500 }, { "epoch": 0.27, "eval_loss": 0.6078919172286987, "eval_runtime": 116.8471, "eval_samples_per_second": 11.288, "eval_steps_per_second": 2.824, "step": 500 }, { "epoch": 0.27, "grad_norm": 2.5775141311007856, "learning_rate": 4.293103448275863e-05, "loss": 0.1124, "step": 510 }, { "epoch": 0.28, "grad_norm": 1.6019517017800202, "learning_rate": 4.275862068965518e-05, "loss": 0.1068, "step": 520 }, { "epoch": 0.28, "grad_norm": 2.6901962755310205, "learning_rate": 4.2586206896551725e-05, "loss": 0.1286, "step": 530 }, { "epoch": 0.29, "grad_norm": 3.9517995356721767, "learning_rate": 4.241379310344828e-05, "loss": 0.1149, "step": 540 }, { "epoch": 0.29, "grad_norm": 2.0428896228074076, "learning_rate": 4.224137931034483e-05, "loss": 0.141, "step": 550 }, { "epoch": 0.3, "grad_norm": 2.263258592133553, "learning_rate": 4.2068965517241385e-05, "loss": 0.0949, "step": 560 }, { "epoch": 0.3, "grad_norm": 1.4823165953974604, "learning_rate": 4.1896551724137934e-05, "loss": 0.1365, "step": 570 }, { "epoch": 0.31, "grad_norm": 1.4441377020989878, "learning_rate": 4.172413793103448e-05, "loss": 0.1015, "step": 580 }, { "epoch": 0.32, "grad_norm": 1.4779059254436886, "learning_rate": 4.155172413793104e-05, "loss": 0.0988, "step": 590 }, { "epoch": 0.32, "grad_norm": 1.7777823671018818, "learning_rate": 4.1379310344827587e-05, "loss": 0.1124, "step": 600 }, { "epoch": 0.33, "grad_norm": 1.737579831138191, "learning_rate": 4.120689655172414e-05, "loss": 0.086, "step": 610 }, { "epoch": 0.33, "grad_norm": 2.708453961232997, "learning_rate": 4.103448275862069e-05, "loss": 0.0933, "step": 620 }, { "epoch": 0.34, "grad_norm": 1.8871805824236731, "learning_rate": 4.086206896551724e-05, "loss": 0.1407, "step": 630 }, { "epoch": 0.34, "grad_norm": 1.7300112722427339, "learning_rate": 4.0689655172413795e-05, "loss": 0.1224, "step": 640 }, { "epoch": 0.35, "grad_norm": 1.4631236252240614, "learning_rate": 4.0517241379310344e-05, "loss": 0.1014, "step": 650 }, { "epoch": 0.35, "grad_norm": 1.2602431597419264, "learning_rate": 4.03448275862069e-05, "loss": 0.1583, "step": 660 }, { "epoch": 0.36, "grad_norm": 1.2077937041919453, "learning_rate": 4.0172413793103455e-05, "loss": 0.1209, "step": 670 }, { "epoch": 0.36, "grad_norm": 1.4386184566429954, "learning_rate": 4e-05, "loss": 0.1016, "step": 680 }, { "epoch": 0.37, "grad_norm": 2.6160358835758584, "learning_rate": 3.982758620689656e-05, "loss": 0.1062, "step": 690 }, { "epoch": 0.37, "grad_norm": 1.9278794640498955, "learning_rate": 3.965517241379311e-05, "loss": 0.1037, "step": 700 }, { "epoch": 0.38, "grad_norm": 1.2872571900237024, "learning_rate": 3.9482758620689656e-05, "loss": 0.096, "step": 710 }, { "epoch": 0.39, "grad_norm": 1.243554309347296, "learning_rate": 3.931034482758621e-05, "loss": 0.1084, "step": 720 }, { "epoch": 0.39, "grad_norm": 1.5090589714253309, "learning_rate": 3.913793103448276e-05, "loss": 0.0877, "step": 730 }, { "epoch": 0.4, "grad_norm": 2.1419550623025168, "learning_rate": 3.896551724137931e-05, "loss": 0.0994, "step": 740 }, { "epoch": 0.4, "grad_norm": 1.7807417973632438, "learning_rate": 3.8793103448275865e-05, "loss": 0.1029, "step": 750 }, { "epoch": 0.41, "grad_norm": 1.3342960263057682, "learning_rate": 3.862068965517241e-05, "loss": 0.1072, "step": 760 }, { "epoch": 0.41, "grad_norm": 2.3865282340158136, "learning_rate": 3.844827586206897e-05, "loss": 0.1193, "step": 770 }, { "epoch": 0.42, "grad_norm": 1.5428742248459941, "learning_rate": 3.827586206896552e-05, "loss": 0.1156, "step": 780 }, { "epoch": 0.42, "grad_norm": 1.7660532115509044, "learning_rate": 3.8103448275862066e-05, "loss": 0.122, "step": 790 }, { "epoch": 0.43, "grad_norm": 1.8149742752994733, "learning_rate": 3.793103448275862e-05, "loss": 0.1346, "step": 800 }, { "epoch": 0.43, "grad_norm": 1.7456408876472995, "learning_rate": 3.775862068965517e-05, "loss": 0.1223, "step": 810 }, { "epoch": 0.44, "grad_norm": 1.10163248244056, "learning_rate": 3.7586206896551726e-05, "loss": 0.1031, "step": 820 }, { "epoch": 0.44, "grad_norm": 1.6441057737088702, "learning_rate": 3.741379310344828e-05, "loss": 0.1059, "step": 830 }, { "epoch": 0.45, "grad_norm": 2.3999279790163484, "learning_rate": 3.724137931034483e-05, "loss": 0.1125, "step": 840 }, { "epoch": 0.45, "grad_norm": 2.2081477934156903, "learning_rate": 3.7068965517241385e-05, "loss": 0.1266, "step": 850 }, { "epoch": 0.46, "grad_norm": 2.366783771480017, "learning_rate": 3.6896551724137934e-05, "loss": 0.1127, "step": 860 }, { "epoch": 0.47, "grad_norm": 1.3077873674136173, "learning_rate": 3.672413793103448e-05, "loss": 0.1095, "step": 870 }, { "epoch": 0.47, "grad_norm": 1.8197812508114701, "learning_rate": 3.655172413793104e-05, "loss": 0.0932, "step": 880 }, { "epoch": 0.48, "grad_norm": 1.0806192057981219, "learning_rate": 3.637931034482759e-05, "loss": 0.1166, "step": 890 }, { "epoch": 0.48, "grad_norm": 1.520666439337001, "learning_rate": 3.620689655172414e-05, "loss": 0.0883, "step": 900 }, { "epoch": 0.49, "grad_norm": 1.690002270629302, "learning_rate": 3.603448275862069e-05, "loss": 0.1199, "step": 910 }, { "epoch": 0.49, "grad_norm": 1.4319374130118003, "learning_rate": 3.586206896551724e-05, "loss": 0.0991, "step": 920 }, { "epoch": 0.5, "grad_norm": 1.0626084369653164, "learning_rate": 3.5689655172413795e-05, "loss": 0.0923, "step": 930 }, { "epoch": 0.5, "grad_norm": 2.0848060597460902, "learning_rate": 3.5517241379310344e-05, "loss": 0.0979, "step": 940 }, { "epoch": 0.51, "grad_norm": 1.4997189461483256, "learning_rate": 3.53448275862069e-05, "loss": 0.0949, "step": 950 }, { "epoch": 0.51, "grad_norm": 1.7887817042743388, "learning_rate": 3.517241379310345e-05, "loss": 0.1135, "step": 960 }, { "epoch": 0.52, "grad_norm": 3.242965692388458, "learning_rate": 3.5e-05, "loss": 0.1315, "step": 970 }, { "epoch": 0.52, "grad_norm": 1.5034762176322083, "learning_rate": 3.482758620689655e-05, "loss": 0.1177, "step": 980 }, { "epoch": 0.53, "grad_norm": 1.6679474444200848, "learning_rate": 3.465517241379311e-05, "loss": 0.1182, "step": 990 }, { "epoch": 0.54, "grad_norm": 2.814574507251776, "learning_rate": 3.4482758620689657e-05, "loss": 0.0912, "step": 1000 }, { "epoch": 0.54, "eval_loss": 0.6101276874542236, "eval_runtime": 113.9995, "eval_samples_per_second": 11.57, "eval_steps_per_second": 2.895, "step": 1000 }, { "epoch": 0.54, "grad_norm": 2.1321319681580535, "learning_rate": 3.431034482758621e-05, "loss": 0.0983, "step": 1010 }, { "epoch": 0.55, "grad_norm": 1.8915101367452352, "learning_rate": 3.413793103448276e-05, "loss": 0.1113, "step": 1020 }, { "epoch": 0.55, "grad_norm": 1.4160985095468477, "learning_rate": 3.3965517241379316e-05, "loss": 0.1076, "step": 1030 }, { "epoch": 0.56, "grad_norm": 1.8562788974878586, "learning_rate": 3.3793103448275865e-05, "loss": 0.1011, "step": 1040 }, { "epoch": 0.56, "grad_norm": 1.3793192563691294, "learning_rate": 3.3620689655172414e-05, "loss": 0.0978, "step": 1050 }, { "epoch": 0.57, "grad_norm": 1.4606563129628805, "learning_rate": 3.344827586206897e-05, "loss": 0.1069, "step": 1060 }, { "epoch": 0.57, "grad_norm": 1.5680856211032999, "learning_rate": 3.327586206896552e-05, "loss": 0.0988, "step": 1070 }, { "epoch": 0.58, "grad_norm": 1.5829345931951275, "learning_rate": 3.310344827586207e-05, "loss": 0.1256, "step": 1080 }, { "epoch": 0.58, "grad_norm": 1.6200852939319585, "learning_rate": 3.293103448275862e-05, "loss": 0.097, "step": 1090 }, { "epoch": 0.59, "grad_norm": 2.259656836213122, "learning_rate": 3.275862068965517e-05, "loss": 0.1137, "step": 1100 }, { "epoch": 0.59, "grad_norm": 2.2483622341560645, "learning_rate": 3.2586206896551726e-05, "loss": 0.0999, "step": 1110 }, { "epoch": 0.6, "grad_norm": 1.168198861956421, "learning_rate": 3.2413793103448275e-05, "loss": 0.1018, "step": 1120 }, { "epoch": 0.6, "grad_norm": 2.4699824799031482, "learning_rate": 3.2241379310344824e-05, "loss": 0.1132, "step": 1130 }, { "epoch": 0.61, "grad_norm": 1.2571654549549751, "learning_rate": 3.206896551724138e-05, "loss": 0.1054, "step": 1140 }, { "epoch": 0.62, "grad_norm": 0.5559534032307631, "learning_rate": 3.1896551724137935e-05, "loss": 0.0789, "step": 1150 }, { "epoch": 0.62, "grad_norm": 1.636369759504475, "learning_rate": 3.172413793103448e-05, "loss": 0.0902, "step": 1160 }, { "epoch": 0.63, "grad_norm": 1.6137142935446496, "learning_rate": 3.155172413793104e-05, "loss": 0.1199, "step": 1170 }, { "epoch": 0.63, "grad_norm": 1.7448003760796802, "learning_rate": 3.137931034482759e-05, "loss": 0.1295, "step": 1180 }, { "epoch": 0.64, "grad_norm": 1.3261005358227276, "learning_rate": 3.120689655172414e-05, "loss": 0.1117, "step": 1190 }, { "epoch": 0.64, "grad_norm": 1.7353127177901462, "learning_rate": 3.103448275862069e-05, "loss": 0.0951, "step": 1200 }, { "epoch": 0.65, "grad_norm": 2.8569975913367074, "learning_rate": 3.086206896551724e-05, "loss": 0.112, "step": 1210 }, { "epoch": 0.65, "grad_norm": 1.3481947218871082, "learning_rate": 3.0689655172413796e-05, "loss": 0.0876, "step": 1220 }, { "epoch": 0.66, "grad_norm": 2.015933141613929, "learning_rate": 3.0517241379310348e-05, "loss": 0.0993, "step": 1230 }, { "epoch": 0.66, "grad_norm": 1.0588164394448019, "learning_rate": 3.0344827586206897e-05, "loss": 0.1034, "step": 1240 }, { "epoch": 0.67, "grad_norm": 1.3594986645993228, "learning_rate": 3.017241379310345e-05, "loss": 0.1004, "step": 1250 }, { "epoch": 0.67, "grad_norm": 1.333098402625009, "learning_rate": 3e-05, "loss": 0.1259, "step": 1260 }, { "epoch": 0.68, "grad_norm": 1.1324206075196583, "learning_rate": 2.9827586206896553e-05, "loss": 0.1155, "step": 1270 }, { "epoch": 0.68, "grad_norm": 1.2270687927795876, "learning_rate": 2.96551724137931e-05, "loss": 0.092, "step": 1280 }, { "epoch": 0.69, "grad_norm": 1.3085362507403875, "learning_rate": 2.9482758620689654e-05, "loss": 0.1064, "step": 1290 }, { "epoch": 0.7, "grad_norm": 1.7135250277750762, "learning_rate": 2.9310344827586206e-05, "loss": 0.1132, "step": 1300 }, { "epoch": 0.7, "grad_norm": 1.6121189707451158, "learning_rate": 2.913793103448276e-05, "loss": 0.1006, "step": 1310 }, { "epoch": 0.71, "grad_norm": 1.3958680925504208, "learning_rate": 2.8965517241379313e-05, "loss": 0.0956, "step": 1320 }, { "epoch": 0.71, "grad_norm": 1.546226150121884, "learning_rate": 2.8793103448275865e-05, "loss": 0.1142, "step": 1330 }, { "epoch": 0.72, "grad_norm": 2.1837739995965415, "learning_rate": 2.8620689655172417e-05, "loss": 0.1127, "step": 1340 }, { "epoch": 0.72, "grad_norm": 1.9402402206909504, "learning_rate": 2.844827586206897e-05, "loss": 0.0922, "step": 1350 }, { "epoch": 0.73, "grad_norm": 1.7914401953164802, "learning_rate": 2.8275862068965518e-05, "loss": 0.1038, "step": 1360 }, { "epoch": 0.73, "grad_norm": 1.505804090650568, "learning_rate": 2.810344827586207e-05, "loss": 0.1034, "step": 1370 }, { "epoch": 0.74, "grad_norm": 1.9907350713586716, "learning_rate": 2.7931034482758622e-05, "loss": 0.103, "step": 1380 }, { "epoch": 0.74, "grad_norm": 1.6948381773166858, "learning_rate": 2.7758620689655175e-05, "loss": 0.1091, "step": 1390 }, { "epoch": 0.75, "grad_norm": 1.3995985437024723, "learning_rate": 2.7586206896551727e-05, "loss": 0.0852, "step": 1400 }, { "epoch": 0.75, "grad_norm": 1.9347024029069393, "learning_rate": 2.7413793103448275e-05, "loss": 0.1393, "step": 1410 }, { "epoch": 0.76, "grad_norm": 1.608776792445342, "learning_rate": 2.7241379310344827e-05, "loss": 0.0951, "step": 1420 }, { "epoch": 0.77, "grad_norm": 1.6005483580619249, "learning_rate": 2.706896551724138e-05, "loss": 0.1037, "step": 1430 }, { "epoch": 0.77, "grad_norm": 2.374208686020403, "learning_rate": 2.689655172413793e-05, "loss": 0.0926, "step": 1440 }, { "epoch": 0.78, "grad_norm": 1.7554923995400171, "learning_rate": 2.672413793103448e-05, "loss": 0.1164, "step": 1450 }, { "epoch": 0.78, "grad_norm": 1.2965114220197742, "learning_rate": 2.6551724137931032e-05, "loss": 0.1102, "step": 1460 }, { "epoch": 0.79, "grad_norm": 1.8857303249108055, "learning_rate": 2.637931034482759e-05, "loss": 0.0868, "step": 1470 }, { "epoch": 0.79, "grad_norm": 1.406207551120988, "learning_rate": 2.620689655172414e-05, "loss": 0.1179, "step": 1480 }, { "epoch": 0.8, "grad_norm": 1.275728362064451, "learning_rate": 2.6034482758620692e-05, "loss": 0.1128, "step": 1490 }, { "epoch": 0.8, "grad_norm": 1.7122434387720797, "learning_rate": 2.5862068965517244e-05, "loss": 0.1045, "step": 1500 }, { "epoch": 0.8, "eval_loss": 0.6075221300125122, "eval_runtime": 114.4297, "eval_samples_per_second": 11.527, "eval_steps_per_second": 2.884, "step": 1500 }, { "epoch": 0.81, "grad_norm": 1.3880467648229133, "learning_rate": 2.5689655172413796e-05, "loss": 0.1031, "step": 1510 }, { "epoch": 0.81, "grad_norm": 2.3475935500456657, "learning_rate": 2.551724137931035e-05, "loss": 0.1064, "step": 1520 }, { "epoch": 0.82, "grad_norm": 1.9746447047097486, "learning_rate": 2.5344827586206897e-05, "loss": 0.0995, "step": 1530 }, { "epoch": 0.82, "grad_norm": 1.2244623226185645, "learning_rate": 2.517241379310345e-05, "loss": 0.1044, "step": 1540 }, { "epoch": 0.83, "grad_norm": 2.2605012043265216, "learning_rate": 2.5e-05, "loss": 0.1017, "step": 1550 }, { "epoch": 0.83, "grad_norm": 1.624612730097256, "learning_rate": 2.4827586206896553e-05, "loss": 0.0973, "step": 1560 }, { "epoch": 0.84, "grad_norm": 1.3648662151461801, "learning_rate": 2.4655172413793105e-05, "loss": 0.0836, "step": 1570 }, { "epoch": 0.85, "grad_norm": 1.4642386177378814, "learning_rate": 2.4482758620689654e-05, "loss": 0.1058, "step": 1580 }, { "epoch": 0.85, "grad_norm": 1.3057087388796036, "learning_rate": 2.4310344827586206e-05, "loss": 0.1092, "step": 1590 }, { "epoch": 0.86, "grad_norm": 2.092348689417081, "learning_rate": 2.413793103448276e-05, "loss": 0.0958, "step": 1600 }, { "epoch": 0.86, "grad_norm": 1.3731115281087607, "learning_rate": 2.3965517241379314e-05, "loss": 0.0883, "step": 1610 }, { "epoch": 0.87, "grad_norm": 1.3983722722394134, "learning_rate": 2.3793103448275862e-05, "loss": 0.0944, "step": 1620 }, { "epoch": 0.87, "grad_norm": 2.4163442262002937, "learning_rate": 2.3620689655172415e-05, "loss": 0.1062, "step": 1630 }, { "epoch": 0.88, "grad_norm": 1.5006079944233688, "learning_rate": 2.3448275862068967e-05, "loss": 0.1129, "step": 1640 }, { "epoch": 0.88, "grad_norm": 1.54283270803711, "learning_rate": 2.327586206896552e-05, "loss": 0.0861, "step": 1650 }, { "epoch": 0.89, "grad_norm": 2.269678144223195, "learning_rate": 2.3103448275862067e-05, "loss": 0.0978, "step": 1660 }, { "epoch": 0.89, "grad_norm": 1.8105959776831768, "learning_rate": 2.293103448275862e-05, "loss": 0.1141, "step": 1670 }, { "epoch": 0.9, "grad_norm": 1.9135579713863027, "learning_rate": 2.2758620689655175e-05, "loss": 0.0918, "step": 1680 }, { "epoch": 0.9, "grad_norm": 1.6604450253018581, "learning_rate": 2.2586206896551727e-05, "loss": 0.096, "step": 1690 }, { "epoch": 0.91, "grad_norm": 2.115987565053461, "learning_rate": 2.2413793103448276e-05, "loss": 0.1166, "step": 1700 }, { "epoch": 0.91, "grad_norm": 1.4279927513160544, "learning_rate": 2.2241379310344828e-05, "loss": 0.1052, "step": 1710 }, { "epoch": 0.92, "grad_norm": 1.185880441960968, "learning_rate": 2.206896551724138e-05, "loss": 0.1053, "step": 1720 }, { "epoch": 0.93, "grad_norm": 1.6969029997828415, "learning_rate": 2.1896551724137932e-05, "loss": 0.106, "step": 1730 }, { "epoch": 0.93, "grad_norm": 2.0330998697970286, "learning_rate": 2.1724137931034484e-05, "loss": 0.1073, "step": 1740 }, { "epoch": 0.94, "grad_norm": 1.19027851417408, "learning_rate": 2.1551724137931033e-05, "loss": 0.091, "step": 1750 }, { "epoch": 0.94, "grad_norm": 1.2470713090698218, "learning_rate": 2.137931034482759e-05, "loss": 0.0898, "step": 1760 }, { "epoch": 0.95, "grad_norm": 2.235740059996042, "learning_rate": 2.120689655172414e-05, "loss": 0.1327, "step": 1770 }, { "epoch": 0.95, "grad_norm": 1.5741742016710085, "learning_rate": 2.1034482758620692e-05, "loss": 0.1126, "step": 1780 }, { "epoch": 0.96, "grad_norm": 0.9343547126371113, "learning_rate": 2.086206896551724e-05, "loss": 0.0819, "step": 1790 }, { "epoch": 0.96, "grad_norm": 2.2764271447338937, "learning_rate": 2.0689655172413793e-05, "loss": 0.1204, "step": 1800 }, { "epoch": 0.97, "grad_norm": 1.981384842073209, "learning_rate": 2.0517241379310345e-05, "loss": 0.1182, "step": 1810 }, { "epoch": 0.97, "grad_norm": 1.044063198588911, "learning_rate": 2.0344827586206897e-05, "loss": 0.1005, "step": 1820 }, { "epoch": 0.98, "grad_norm": 2.370183172473789, "learning_rate": 2.017241379310345e-05, "loss": 0.1174, "step": 1830 }, { "epoch": 0.98, "grad_norm": 1.9052733799672823, "learning_rate": 2e-05, "loss": 0.1125, "step": 1840 }, { "epoch": 0.99, "grad_norm": 1.628277530114902, "learning_rate": 1.9827586206896554e-05, "loss": 0.1015, "step": 1850 }, { "epoch": 1.0, "grad_norm": 1.2522124245986863, "learning_rate": 1.9655172413793106e-05, "loss": 0.0924, "step": 1860 }, { "epoch": 1.0, "grad_norm": 0.6911426489002421, "learning_rate": 1.9482758620689655e-05, "loss": 0.0965, "step": 1870 }, { "epoch": 1.01, "grad_norm": 1.821890613342771, "learning_rate": 1.9310344827586207e-05, "loss": 0.081, "step": 1880 }, { "epoch": 1.01, "grad_norm": 0.7643588179781782, "learning_rate": 1.913793103448276e-05, "loss": 0.0761, "step": 1890 }, { "epoch": 1.02, "grad_norm": 1.1095002263403428, "learning_rate": 1.896551724137931e-05, "loss": 0.0871, "step": 1900 }, { "epoch": 1.02, "grad_norm": 1.4639820667455608, "learning_rate": 1.8793103448275863e-05, "loss": 0.0805, "step": 1910 }, { "epoch": 1.03, "grad_norm": 1.6214269161589794, "learning_rate": 1.8620689655172415e-05, "loss": 0.0902, "step": 1920 }, { "epoch": 1.03, "grad_norm": 1.5979085316952373, "learning_rate": 1.8448275862068967e-05, "loss": 0.0967, "step": 1930 }, { "epoch": 1.04, "grad_norm": 1.2001043976090235, "learning_rate": 1.827586206896552e-05, "loss": 0.069, "step": 1940 }, { "epoch": 1.04, "grad_norm": 2.100190633629739, "learning_rate": 1.810344827586207e-05, "loss": 0.1024, "step": 1950 }, { "epoch": 1.05, "grad_norm": 1.7393396532511867, "learning_rate": 1.793103448275862e-05, "loss": 0.0728, "step": 1960 }, { "epoch": 1.05, "grad_norm": 1.873599965711283, "learning_rate": 1.7758620689655172e-05, "loss": 0.0735, "step": 1970 }, { "epoch": 1.06, "grad_norm": 1.4460752726376342, "learning_rate": 1.7586206896551724e-05, "loss": 0.1, "step": 1980 }, { "epoch": 1.06, "grad_norm": 0.8772715867399261, "learning_rate": 1.7413793103448276e-05, "loss": 0.0794, "step": 1990 }, { "epoch": 1.07, "grad_norm": 1.398173054729605, "learning_rate": 1.7241379310344828e-05, "loss": 0.078, "step": 2000 }, { "epoch": 1.07, "eval_loss": 0.6523420810699463, "eval_runtime": 115.4048, "eval_samples_per_second": 11.429, "eval_steps_per_second": 2.859, "step": 2000 }, { "epoch": 1.08, "grad_norm": 1.8544231422617872, "learning_rate": 1.706896551724138e-05, "loss": 0.1015, "step": 2010 }, { "epoch": 1.08, "grad_norm": 1.0455645860511078, "learning_rate": 1.6896551724137932e-05, "loss": 0.0957, "step": 2020 }, { "epoch": 1.09, "grad_norm": 2.2782017103278265, "learning_rate": 1.6724137931034485e-05, "loss": 0.1054, "step": 2030 }, { "epoch": 1.09, "grad_norm": 1.7712576675859857, "learning_rate": 1.6551724137931037e-05, "loss": 0.1054, "step": 2040 }, { "epoch": 1.1, "grad_norm": 1.0902741267616887, "learning_rate": 1.6379310344827585e-05, "loss": 0.0807, "step": 2050 }, { "epoch": 1.1, "grad_norm": 1.3438284680174908, "learning_rate": 1.6206896551724137e-05, "loss": 0.0686, "step": 2060 }, { "epoch": 1.11, "grad_norm": 1.0087838105436577, "learning_rate": 1.603448275862069e-05, "loss": 0.0906, "step": 2070 }, { "epoch": 1.11, "grad_norm": 0.9291493020789636, "learning_rate": 1.586206896551724e-05, "loss": 0.0777, "step": 2080 }, { "epoch": 1.12, "grad_norm": 1.3059876158609884, "learning_rate": 1.5689655172413794e-05, "loss": 0.1005, "step": 2090 }, { "epoch": 1.12, "grad_norm": 0.7541194076462521, "learning_rate": 1.5517241379310346e-05, "loss": 0.0711, "step": 2100 }, { "epoch": 1.13, "grad_norm": 0.976743599620672, "learning_rate": 1.5344827586206898e-05, "loss": 0.0734, "step": 2110 }, { "epoch": 1.13, "grad_norm": 1.5352620896805385, "learning_rate": 1.5172413793103448e-05, "loss": 0.0966, "step": 2120 }, { "epoch": 1.14, "grad_norm": 1.165805963424826, "learning_rate": 1.5e-05, "loss": 0.0804, "step": 2130 }, { "epoch": 1.14, "grad_norm": 1.5398844769347386, "learning_rate": 1.482758620689655e-05, "loss": 0.076, "step": 2140 }, { "epoch": 1.15, "grad_norm": 0.9475538051861888, "learning_rate": 1.4655172413793103e-05, "loss": 0.0771, "step": 2150 }, { "epoch": 1.16, "grad_norm": 1.3122536092219965, "learning_rate": 1.4482758620689657e-05, "loss": 0.0652, "step": 2160 }, { "epoch": 1.16, "grad_norm": 0.8569841144318953, "learning_rate": 1.4310344827586209e-05, "loss": 0.0666, "step": 2170 }, { "epoch": 1.17, "grad_norm": 1.0744088797395874, "learning_rate": 1.4137931034482759e-05, "loss": 0.0563, "step": 2180 }, { "epoch": 1.17, "grad_norm": 0.7306709750154786, "learning_rate": 1.3965517241379311e-05, "loss": 0.0724, "step": 2190 }, { "epoch": 1.18, "grad_norm": 1.1617485610561908, "learning_rate": 1.3793103448275863e-05, "loss": 0.0592, "step": 2200 }, { "epoch": 1.18, "grad_norm": 1.6124055507643191, "learning_rate": 1.3620689655172414e-05, "loss": 0.0755, "step": 2210 }, { "epoch": 1.19, "grad_norm": 1.0636933402698119, "learning_rate": 1.3448275862068966e-05, "loss": 0.0807, "step": 2220 }, { "epoch": 1.19, "grad_norm": 2.5428572768451514, "learning_rate": 1.3275862068965516e-05, "loss": 0.0894, "step": 2230 }, { "epoch": 1.2, "grad_norm": 1.6519996193761897, "learning_rate": 1.310344827586207e-05, "loss": 0.0796, "step": 2240 }, { "epoch": 1.2, "grad_norm": 1.7011145463555521, "learning_rate": 1.2931034482758622e-05, "loss": 0.0671, "step": 2250 }, { "epoch": 1.21, "grad_norm": 1.441376178837708, "learning_rate": 1.2758620689655174e-05, "loss": 0.0799, "step": 2260 }, { "epoch": 1.21, "grad_norm": 0.9345990828016713, "learning_rate": 1.2586206896551725e-05, "loss": 0.0684, "step": 2270 }, { "epoch": 1.22, "grad_norm": 1.3573336012486517, "learning_rate": 1.2413793103448277e-05, "loss": 0.0776, "step": 2280 }, { "epoch": 1.23, "grad_norm": 2.1805616160045, "learning_rate": 1.2241379310344827e-05, "loss": 0.0819, "step": 2290 }, { "epoch": 1.23, "grad_norm": 0.8476103473594272, "learning_rate": 1.206896551724138e-05, "loss": 0.0761, "step": 2300 }, { "epoch": 1.24, "grad_norm": 2.132874306113927, "learning_rate": 1.1896551724137931e-05, "loss": 0.1027, "step": 2310 }, { "epoch": 1.24, "grad_norm": 1.1904353387368432, "learning_rate": 1.1724137931034483e-05, "loss": 0.0623, "step": 2320 }, { "epoch": 1.25, "grad_norm": 1.6470612203805093, "learning_rate": 1.1551724137931034e-05, "loss": 0.0872, "step": 2330 }, { "epoch": 1.25, "grad_norm": 1.8675118975708371, "learning_rate": 1.1379310344827587e-05, "loss": 0.0728, "step": 2340 }, { "epoch": 1.26, "grad_norm": 1.2991706254841981, "learning_rate": 1.1206896551724138e-05, "loss": 0.079, "step": 2350 }, { "epoch": 1.26, "grad_norm": 1.9364824626463861, "learning_rate": 1.103448275862069e-05, "loss": 0.0783, "step": 2360 }, { "epoch": 1.27, "grad_norm": 1.6597471057325688, "learning_rate": 1.0862068965517242e-05, "loss": 0.0902, "step": 2370 }, { "epoch": 1.27, "grad_norm": 1.454588875045873, "learning_rate": 1.0689655172413794e-05, "loss": 0.0774, "step": 2380 }, { "epoch": 1.28, "grad_norm": 1.1670103110646668, "learning_rate": 1.0517241379310346e-05, "loss": 0.0659, "step": 2390 }, { "epoch": 1.28, "grad_norm": 1.4612075298157572, "learning_rate": 1.0344827586206897e-05, "loss": 0.0775, "step": 2400 }, { "epoch": 1.29, "grad_norm": 3.932323942941103, "learning_rate": 1.0172413793103449e-05, "loss": 0.0671, "step": 2410 }, { "epoch": 1.29, "grad_norm": 0.9979331315685359, "learning_rate": 1e-05, "loss": 0.0695, "step": 2420 }, { "epoch": 1.3, "grad_norm": 1.9070655793934348, "learning_rate": 9.827586206896553e-06, "loss": 0.0692, "step": 2430 }, { "epoch": 1.31, "grad_norm": 1.5364730689079074, "learning_rate": 9.655172413793103e-06, "loss": 0.0738, "step": 2440 }, { "epoch": 1.31, "grad_norm": 1.1024432074823722, "learning_rate": 9.482758620689655e-06, "loss": 0.0855, "step": 2450 }, { "epoch": 1.32, "grad_norm": 1.8514817180884215, "learning_rate": 9.310344827586207e-06, "loss": 0.076, "step": 2460 }, { "epoch": 1.32, "grad_norm": 1.1275915444574418, "learning_rate": 9.13793103448276e-06, "loss": 0.0688, "step": 2470 }, { "epoch": 1.33, "grad_norm": 2.1236224333360454, "learning_rate": 8.96551724137931e-06, "loss": 0.0985, "step": 2480 }, { "epoch": 1.33, "grad_norm": 1.5716827436886402, "learning_rate": 8.793103448275862e-06, "loss": 0.0827, "step": 2490 }, { "epoch": 1.34, "grad_norm": 0.9799310640151211, "learning_rate": 8.620689655172414e-06, "loss": 0.0807, "step": 2500 }, { "epoch": 1.34, "eval_loss": 0.6613200306892395, "eval_runtime": 115.1632, "eval_samples_per_second": 11.453, "eval_steps_per_second": 2.865, "step": 2500 }, { "epoch": 1.34, "grad_norm": 1.429365154274089, "learning_rate": 8.448275862068966e-06, "loss": 0.0803, "step": 2510 }, { "epoch": 1.35, "grad_norm": 2.034554738355009, "learning_rate": 8.275862068965518e-06, "loss": 0.0771, "step": 2520 }, { "epoch": 1.35, "grad_norm": 1.3116032945356366, "learning_rate": 8.103448275862069e-06, "loss": 0.102, "step": 2530 }, { "epoch": 1.36, "grad_norm": 0.538270992356044, "learning_rate": 7.93103448275862e-06, "loss": 0.0781, "step": 2540 }, { "epoch": 1.36, "grad_norm": 1.359815780671966, "learning_rate": 7.758620689655173e-06, "loss": 0.082, "step": 2550 }, { "epoch": 1.37, "grad_norm": 1.3555113208274487, "learning_rate": 7.586206896551724e-06, "loss": 0.0662, "step": 2560 }, { "epoch": 1.38, "grad_norm": 0.9126598313656492, "learning_rate": 7.413793103448275e-06, "loss": 0.0809, "step": 2570 }, { "epoch": 1.38, "grad_norm": 1.0362500054468016, "learning_rate": 7.241379310344828e-06, "loss": 0.0763, "step": 2580 }, { "epoch": 1.39, "grad_norm": 1.4486705272957705, "learning_rate": 7.0689655172413796e-06, "loss": 0.0772, "step": 2590 }, { "epoch": 1.39, "grad_norm": 1.942597284762883, "learning_rate": 6.896551724137932e-06, "loss": 0.0627, "step": 2600 }, { "epoch": 1.4, "grad_norm": 1.3527152216583775, "learning_rate": 6.724137931034483e-06, "loss": 0.0742, "step": 2610 }, { "epoch": 1.4, "grad_norm": 0.7183775146684962, "learning_rate": 6.551724137931035e-06, "loss": 0.0478, "step": 2620 }, { "epoch": 1.41, "grad_norm": 1.081708916490155, "learning_rate": 6.379310344827587e-06, "loss": 0.0858, "step": 2630 }, { "epoch": 1.41, "grad_norm": 2.083585945496064, "learning_rate": 6.206896551724138e-06, "loss": 0.094, "step": 2640 }, { "epoch": 1.42, "grad_norm": 1.3628676873699075, "learning_rate": 6.03448275862069e-06, "loss": 0.0996, "step": 2650 }, { "epoch": 1.42, "grad_norm": 1.1584513648567698, "learning_rate": 5.862068965517242e-06, "loss": 0.0678, "step": 2660 }, { "epoch": 1.43, "grad_norm": 1.239472995186361, "learning_rate": 5.689655172413794e-06, "loss": 0.0745, "step": 2670 }, { "epoch": 1.43, "grad_norm": 1.0885278290149767, "learning_rate": 5.517241379310345e-06, "loss": 0.0755, "step": 2680 }, { "epoch": 1.44, "grad_norm": 1.4830428953890444, "learning_rate": 5.344827586206897e-06, "loss": 0.0799, "step": 2690 }, { "epoch": 1.44, "grad_norm": 1.6819930232389664, "learning_rate": 5.172413793103448e-06, "loss": 0.0849, "step": 2700 }, { "epoch": 1.45, "grad_norm": 1.5372127437839038, "learning_rate": 5e-06, "loss": 0.0758, "step": 2710 }, { "epoch": 1.46, "grad_norm": 1.367779239832988, "learning_rate": 4.827586206896552e-06, "loss": 0.073, "step": 2720 }, { "epoch": 1.46, "grad_norm": 1.4887137481978274, "learning_rate": 4.655172413793104e-06, "loss": 0.061, "step": 2730 }, { "epoch": 1.47, "grad_norm": 2.1404166354958325, "learning_rate": 4.482758620689655e-06, "loss": 0.0957, "step": 2740 }, { "epoch": 1.47, "grad_norm": 1.8642358306515179, "learning_rate": 4.310344827586207e-06, "loss": 0.0782, "step": 2750 }, { "epoch": 1.48, "grad_norm": 1.3698287268201832, "learning_rate": 4.137931034482759e-06, "loss": 0.0647, "step": 2760 }, { "epoch": 1.48, "grad_norm": 1.4319268282045232, "learning_rate": 3.96551724137931e-06, "loss": 0.0787, "step": 2770 }, { "epoch": 1.49, "grad_norm": 1.383030067235673, "learning_rate": 3.793103448275862e-06, "loss": 0.0681, "step": 2780 }, { "epoch": 1.49, "grad_norm": 2.161260735181774, "learning_rate": 3.620689655172414e-06, "loss": 0.1013, "step": 2790 }, { "epoch": 1.5, "grad_norm": 1.6657048923514055, "learning_rate": 3.448275862068966e-06, "loss": 0.062, "step": 2800 }, { "epoch": 1.5, "grad_norm": 0.9193769932816829, "learning_rate": 3.2758620689655175e-06, "loss": 0.0696, "step": 2810 }, { "epoch": 1.51, "grad_norm": 1.2415022440375736, "learning_rate": 3.103448275862069e-06, "loss": 0.103, "step": 2820 }, { "epoch": 1.51, "grad_norm": 1.8151340036133554, "learning_rate": 2.931034482758621e-06, "loss": 0.0821, "step": 2830 }, { "epoch": 1.52, "grad_norm": 0.928286704694044, "learning_rate": 2.7586206896551725e-06, "loss": 0.057, "step": 2840 }, { "epoch": 1.52, "grad_norm": 1.3442506927303919, "learning_rate": 2.586206896551724e-06, "loss": 0.0621, "step": 2850 }, { "epoch": 1.53, "grad_norm": 2.1790836959559523, "learning_rate": 2.413793103448276e-06, "loss": 0.074, "step": 2860 }, { "epoch": 1.54, "grad_norm": 0.6594054854326988, "learning_rate": 2.2413793103448275e-06, "loss": 0.0814, "step": 2870 }, { "epoch": 1.54, "grad_norm": 1.3726881004102018, "learning_rate": 2.0689655172413796e-06, "loss": 0.0712, "step": 2880 }, { "epoch": 1.55, "grad_norm": 1.679214299803237, "learning_rate": 1.896551724137931e-06, "loss": 0.0627, "step": 2890 }, { "epoch": 1.55, "grad_norm": 1.0321949433654767, "learning_rate": 1.724137931034483e-06, "loss": 0.1165, "step": 2900 }, { "epoch": 1.56, "grad_norm": 0.6973352883003652, "learning_rate": 1.5517241379310346e-06, "loss": 0.0779, "step": 2910 }, { "epoch": 1.56, "grad_norm": 1.2300329446905003, "learning_rate": 1.3793103448275862e-06, "loss": 0.0871, "step": 2920 }, { "epoch": 1.57, "grad_norm": 1.7134185607297843, "learning_rate": 1.206896551724138e-06, "loss": 0.0746, "step": 2930 }, { "epoch": 1.57, "grad_norm": 1.0698704299983892, "learning_rate": 1.0344827586206898e-06, "loss": 0.0801, "step": 2940 }, { "epoch": 1.58, "grad_norm": 2.2453660227340957, "learning_rate": 8.620689655172415e-07, "loss": 0.0867, "step": 2950 }, { "epoch": 1.58, "grad_norm": 1.1712923935242836, "learning_rate": 6.896551724137931e-07, "loss": 0.1029, "step": 2960 }, { "epoch": 1.59, "grad_norm": 1.1626149131019259, "learning_rate": 5.172413793103449e-07, "loss": 0.0875, "step": 2970 }, { "epoch": 1.59, "grad_norm": 1.5972056170209479, "learning_rate": 3.4482758620689656e-07, "loss": 0.0786, "step": 2980 }, { "epoch": 1.6, "grad_norm": 1.4660799065028676, "learning_rate": 1.7241379310344828e-07, "loss": 0.0916, "step": 2990 }, { "epoch": 1.61, "grad_norm": 1.1438897568043958, "learning_rate": 0.0, "loss": 0.0669, "step": 3000 }, { "epoch": 1.61, "eval_loss": 0.6566838622093201, "eval_runtime": 114.3635, "eval_samples_per_second": 11.533, "eval_steps_per_second": 2.886, "step": 3000 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 86675375849472.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }