diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,34883 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9998192662208565, + "eval_steps": 500, + "global_step": 24897, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.06317149847745895, + "learning_rate": 7.499999999999999e-07, + "loss": 1.0918, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.06490806490182877, + "learning_rate": 1.4999999999999998e-06, + "loss": 1.1945, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.06881864368915558, + "learning_rate": 2.2499999999999996e-06, + "loss": 1.1409, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 0.07844685763120651, + "learning_rate": 2.9999999999999997e-06, + "loss": 1.2207, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 0.08064830303192139, + "learning_rate": 3.7499999999999997e-06, + "loss": 1.1945, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 0.07337764650583267, + "learning_rate": 4.499999999999999e-06, + "loss": 1.1097, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 0.07508231699466705, + "learning_rate": 5.25e-06, + "loss": 1.1413, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 0.0660620704293251, + "learning_rate": 5.999999999999999e-06, + "loss": 1.0751, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.0696869045495987, + "learning_rate": 6.749999999999999e-06, + "loss": 1.1048, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 0.0708666518330574, + "learning_rate": 7.499999999999999e-06, + "loss": 1.1369, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 0.07739271968603134, + "learning_rate": 8.249999999999999e-06, + "loss": 1.1332, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 0.0700831338763237, + "learning_rate": 8.999999999999999e-06, + "loss": 1.2094, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 0.07708919048309326, + "learning_rate": 9.75e-06, + "loss": 1.1344, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 0.07628925144672394, + "learning_rate": 1.05e-05, + "loss": 1.0817, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 0.08013926446437836, + "learning_rate": 1.1249999999999999e-05, + "loss": 1.0898, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 0.09165573865175247, + "learning_rate": 1.1999999999999999e-05, + "loss": 1.1173, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 0.08685644716024399, + "learning_rate": 1.275e-05, + "loss": 1.1268, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 0.08818597346544266, + "learning_rate": 1.3499999999999998e-05, + "loss": 1.1928, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 0.0848746970295906, + "learning_rate": 1.4249999999999999e-05, + "loss": 1.1179, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.09676993638277054, + "learning_rate": 1.4999999999999999e-05, + "loss": 1.1478, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.09682285040616989, + "learning_rate": 1.5749999999999997e-05, + "loss": 1.0813, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.09817025810480118, + "learning_rate": 1.6499999999999998e-05, + "loss": 1.096, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.10309538245201111, + "learning_rate": 1.725e-05, + "loss": 1.1418, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.10393266379833221, + "learning_rate": 1.7999999999999997e-05, + "loss": 1.1351, + "step": 120 + }, + { + "epoch": 0.02, + "grad_norm": 0.10402549803256989, + "learning_rate": 1.875e-05, + "loss": 1.0959, + "step": 125 + }, + { + "epoch": 0.02, + "grad_norm": 0.10404133796691895, + "learning_rate": 1.95e-05, + "loss": 1.1869, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 0.12272530794143677, + "learning_rate": 2.025e-05, + "loss": 1.1436, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 0.1208878755569458, + "learning_rate": 2.1e-05, + "loss": 1.1444, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 0.12307845801115036, + "learning_rate": 2.1749999999999997e-05, + "loss": 1.0762, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 0.11315160989761353, + "learning_rate": 2.2499999999999998e-05, + "loss": 1.0269, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 0.10878825187683105, + "learning_rate": 2.325e-05, + "loss": 1.1535, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 0.11939235776662827, + "learning_rate": 2.3999999999999997e-05, + "loss": 1.1705, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 0.11615506559610367, + "learning_rate": 2.475e-05, + "loss": 1.1406, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 0.11318700015544891, + "learning_rate": 2.55e-05, + "loss": 1.0874, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 0.1265944391489029, + "learning_rate": 2.6249999999999998e-05, + "loss": 1.1317, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 0.153924360871315, + "learning_rate": 2.6999999999999996e-05, + "loss": 1.102, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 0.1322435736656189, + "learning_rate": 2.7749999999999997e-05, + "loss": 1.0387, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 0.1285255402326584, + "learning_rate": 2.8499999999999998e-05, + "loss": 1.1082, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 0.13724276423454285, + "learning_rate": 2.925e-05, + "loss": 1.082, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 0.13925635814666748, + "learning_rate": 2.9999999999999997e-05, + "loss": 1.2066, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 0.16304455697536469, + "learning_rate": 3.0749999999999995e-05, + "loss": 1.1404, + "step": 205 + }, + { + "epoch": 0.03, + "grad_norm": 0.1349136382341385, + "learning_rate": 3.149999999999999e-05, + "loss": 1.0513, + "step": 210 + }, + { + "epoch": 0.03, + "grad_norm": 0.150814950466156, + "learning_rate": 3.225e-05, + "loss": 1.0785, + "step": 215 + }, + { + "epoch": 0.03, + "grad_norm": 0.13001343607902527, + "learning_rate": 3.2999999999999996e-05, + "loss": 1.121, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 0.1322905719280243, + "learning_rate": 3.375e-05, + "loss": 1.1, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 0.1382080763578415, + "learning_rate": 3.45e-05, + "loss": 1.1415, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 0.14844882488250732, + "learning_rate": 3.5249999999999996e-05, + "loss": 1.1337, + "step": 235 + }, + { + "epoch": 0.03, + "grad_norm": 0.13436168432235718, + "learning_rate": 3.5999999999999994e-05, + "loss": 1.1736, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 0.1420661062002182, + "learning_rate": 3.675e-05, + "loss": 1.0822, + "step": 245 + }, + { + "epoch": 0.03, + "grad_norm": 0.14159414172172546, + "learning_rate": 3.75e-05, + "loss": 1.0864, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 0.15307262539863586, + "learning_rate": 3.8249999999999995e-05, + "loss": 1.114, + "step": 255 + }, + { + "epoch": 0.03, + "grad_norm": 0.1432732194662094, + "learning_rate": 3.9e-05, + "loss": 1.1041, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 0.14971235394477844, + "learning_rate": 3.975e-05, + "loss": 1.1015, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 0.13618604838848114, + "learning_rate": 4.05e-05, + "loss": 1.1041, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 0.15167336165905, + "learning_rate": 4.125e-05, + "loss": 1.1905, + "step": 275 + }, + { + "epoch": 0.03, + "grad_norm": 0.17375151813030243, + "learning_rate": 4.2e-05, + "loss": 1.0554, + "step": 280 + }, + { + "epoch": 0.03, + "grad_norm": 0.15129323303699493, + "learning_rate": 4.2749999999999996e-05, + "loss": 1.1595, + "step": 285 + }, + { + "epoch": 0.03, + "grad_norm": 0.13002634048461914, + "learning_rate": 4.3499999999999993e-05, + "loss": 1.0993, + "step": 290 + }, + { + "epoch": 0.04, + "grad_norm": 0.15022237598896027, + "learning_rate": 4.424999999999999e-05, + "loss": 1.1154, + "step": 295 + }, + { + "epoch": 0.04, + "grad_norm": 0.15329715609550476, + "learning_rate": 4.4999999999999996e-05, + "loss": 1.1181, + "step": 300 + }, + { + "epoch": 0.04, + "grad_norm": 0.14781033992767334, + "learning_rate": 4.5749999999999994e-05, + "loss": 1.1708, + "step": 305 + }, + { + "epoch": 0.04, + "grad_norm": 0.13594377040863037, + "learning_rate": 4.65e-05, + "loss": 1.0517, + "step": 310 + }, + { + "epoch": 0.04, + "grad_norm": 0.14029279351234436, + "learning_rate": 4.7249999999999997e-05, + "loss": 1.0883, + "step": 315 + }, + { + "epoch": 0.04, + "grad_norm": 0.14294695854187012, + "learning_rate": 4.7999999999999994e-05, + "loss": 1.166, + "step": 320 + }, + { + "epoch": 0.04, + "grad_norm": 0.1437993049621582, + "learning_rate": 4.875e-05, + "loss": 1.0838, + "step": 325 + }, + { + "epoch": 0.04, + "grad_norm": 0.13464024662971497, + "learning_rate": 4.95e-05, + "loss": 1.076, + "step": 330 + }, + { + "epoch": 0.04, + "grad_norm": 0.15120777487754822, + "learning_rate": 5.025e-05, + "loss": 1.1465, + "step": 335 + }, + { + "epoch": 0.04, + "grad_norm": 0.13077160716056824, + "learning_rate": 5.1e-05, + "loss": 0.9944, + "step": 340 + }, + { + "epoch": 0.04, + "grad_norm": 0.14656583964824677, + "learning_rate": 5.174999999999999e-05, + "loss": 1.142, + "step": 345 + }, + { + "epoch": 0.04, + "grad_norm": 0.14403922855854034, + "learning_rate": 5.2499999999999995e-05, + "loss": 1.1545, + "step": 350 + }, + { + "epoch": 0.04, + "grad_norm": 0.14558084309101105, + "learning_rate": 5.324999999999999e-05, + "loss": 1.1135, + "step": 355 + }, + { + "epoch": 0.04, + "grad_norm": 0.13940554857254028, + "learning_rate": 5.399999999999999e-05, + "loss": 1.1143, + "step": 360 + }, + { + "epoch": 0.04, + "grad_norm": 0.12889456748962402, + "learning_rate": 5.4749999999999996e-05, + "loss": 1.0995, + "step": 365 + }, + { + "epoch": 0.04, + "grad_norm": 0.13410042226314545, + "learning_rate": 5.5499999999999994e-05, + "loss": 1.0377, + "step": 370 + }, + { + "epoch": 0.05, + "grad_norm": 0.1537715345621109, + "learning_rate": 5.625e-05, + "loss": 1.1654, + "step": 375 + }, + { + "epoch": 0.05, + "grad_norm": 0.14372758567333221, + "learning_rate": 5.6999999999999996e-05, + "loss": 1.0393, + "step": 380 + }, + { + "epoch": 0.05, + "grad_norm": 0.14559227228164673, + "learning_rate": 5.7749999999999994e-05, + "loss": 1.1712, + "step": 385 + }, + { + "epoch": 0.05, + "grad_norm": 0.14143848419189453, + "learning_rate": 5.85e-05, + "loss": 1.0294, + "step": 390 + }, + { + "epoch": 0.05, + "grad_norm": 0.13969220221042633, + "learning_rate": 5.925e-05, + "loss": 1.1678, + "step": 395 + }, + { + "epoch": 0.05, + "grad_norm": 0.14875860512256622, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.0829, + "step": 400 + }, + { + "epoch": 0.05, + "grad_norm": 0.13244011998176575, + "learning_rate": 6.075e-05, + "loss": 1.173, + "step": 405 + }, + { + "epoch": 0.05, + "grad_norm": 0.14143739640712738, + "learning_rate": 6.149999999999999e-05, + "loss": 1.0446, + "step": 410 + }, + { + "epoch": 0.05, + "grad_norm": 0.13888464868068695, + "learning_rate": 6.225e-05, + "loss": 1.1222, + "step": 415 + }, + { + "epoch": 0.05, + "grad_norm": 0.1454809457063675, + "learning_rate": 6.299999999999999e-05, + "loss": 1.0266, + "step": 420 + }, + { + "epoch": 0.05, + "grad_norm": 0.1294557899236679, + "learning_rate": 6.374999999999999e-05, + "loss": 1.0773, + "step": 425 + }, + { + "epoch": 0.05, + "grad_norm": 0.13563519716262817, + "learning_rate": 6.45e-05, + "loss": 1.0808, + "step": 430 + }, + { + "epoch": 0.05, + "grad_norm": 0.1332767903804779, + "learning_rate": 6.525e-05, + "loss": 1.1356, + "step": 435 + }, + { + "epoch": 0.05, + "grad_norm": 0.14194917678833008, + "learning_rate": 6.599999999999999e-05, + "loss": 1.0592, + "step": 440 + }, + { + "epoch": 0.05, + "grad_norm": 0.1359052062034607, + "learning_rate": 6.675e-05, + "loss": 1.1103, + "step": 445 + }, + { + "epoch": 0.05, + "grad_norm": 0.12448085099458694, + "learning_rate": 6.75e-05, + "loss": 1.0912, + "step": 450 + }, + { + "epoch": 0.05, + "grad_norm": 0.14592844247817993, + "learning_rate": 6.824999999999999e-05, + "loss": 1.0549, + "step": 455 + }, + { + "epoch": 0.06, + "grad_norm": 0.14550206065177917, + "learning_rate": 6.9e-05, + "loss": 1.0997, + "step": 460 + }, + { + "epoch": 0.06, + "grad_norm": 0.13571631908416748, + "learning_rate": 6.975e-05, + "loss": 1.1559, + "step": 465 + }, + { + "epoch": 0.06, + "grad_norm": 0.12359726428985596, + "learning_rate": 7.049999999999999e-05, + "loss": 1.1352, + "step": 470 + }, + { + "epoch": 0.06, + "grad_norm": 0.1370527297258377, + "learning_rate": 7.125e-05, + "loss": 1.1114, + "step": 475 + }, + { + "epoch": 0.06, + "grad_norm": 0.1392049938440323, + "learning_rate": 7.199999999999999e-05, + "loss": 1.1121, + "step": 480 + }, + { + "epoch": 0.06, + "grad_norm": 0.13348931074142456, + "learning_rate": 7.274999999999999e-05, + "loss": 1.1471, + "step": 485 + }, + { + "epoch": 0.06, + "grad_norm": 0.1287151575088501, + "learning_rate": 7.35e-05, + "loss": 1.1812, + "step": 490 + }, + { + "epoch": 0.06, + "grad_norm": 0.13077159225940704, + "learning_rate": 7.424999999999999e-05, + "loss": 1.0183, + "step": 495 + }, + { + "epoch": 0.06, + "grad_norm": 0.13283278048038483, + "learning_rate": 7.5e-05, + "loss": 1.1351, + "step": 500 + }, + { + "epoch": 0.06, + "grad_norm": 0.14058330655097961, + "learning_rate": 7.575e-05, + "loss": 1.0175, + "step": 505 + }, + { + "epoch": 0.06, + "grad_norm": 0.1260114163160324, + "learning_rate": 7.649999999999999e-05, + "loss": 1.0918, + "step": 510 + }, + { + "epoch": 0.06, + "grad_norm": 0.12077819555997849, + "learning_rate": 7.725e-05, + "loss": 1.2125, + "step": 515 + }, + { + "epoch": 0.06, + "grad_norm": 0.13014502823352814, + "learning_rate": 7.8e-05, + "loss": 1.0959, + "step": 520 + }, + { + "epoch": 0.06, + "grad_norm": 0.13080598413944244, + "learning_rate": 7.874999999999999e-05, + "loss": 1.1083, + "step": 525 + }, + { + "epoch": 0.06, + "grad_norm": 0.12461178004741669, + "learning_rate": 7.95e-05, + "loss": 1.0968, + "step": 530 + }, + { + "epoch": 0.06, + "grad_norm": 0.13079579174518585, + "learning_rate": 8.025e-05, + "loss": 1.1039, + "step": 535 + }, + { + "epoch": 0.07, + "grad_norm": 0.12605786323547363, + "learning_rate": 8.1e-05, + "loss": 1.0558, + "step": 540 + }, + { + "epoch": 0.07, + "grad_norm": 0.13258296251296997, + "learning_rate": 8.175e-05, + "loss": 1.0635, + "step": 545 + }, + { + "epoch": 0.07, + "grad_norm": 0.1320507675409317, + "learning_rate": 8.25e-05, + "loss": 1.042, + "step": 550 + }, + { + "epoch": 0.07, + "grad_norm": 0.11775451898574829, + "learning_rate": 8.325e-05, + "loss": 1.1299, + "step": 555 + }, + { + "epoch": 0.07, + "grad_norm": 0.12879742681980133, + "learning_rate": 8.4e-05, + "loss": 1.0587, + "step": 560 + }, + { + "epoch": 0.07, + "grad_norm": 0.1299828737974167, + "learning_rate": 8.474999999999999e-05, + "loss": 1.0773, + "step": 565 + }, + { + "epoch": 0.07, + "grad_norm": 0.1389782577753067, + "learning_rate": 8.549999999999999e-05, + "loss": 1.066, + "step": 570 + }, + { + "epoch": 0.07, + "grad_norm": 0.12050167471170425, + "learning_rate": 8.624999999999998e-05, + "loss": 1.0786, + "step": 575 + }, + { + "epoch": 0.07, + "grad_norm": 0.14948152005672455, + "learning_rate": 8.699999999999999e-05, + "loss": 1.1208, + "step": 580 + }, + { + "epoch": 0.07, + "grad_norm": 0.13367661833763123, + "learning_rate": 8.774999999999999e-05, + "loss": 1.0754, + "step": 585 + }, + { + "epoch": 0.07, + "grad_norm": 0.12811419367790222, + "learning_rate": 8.849999999999998e-05, + "loss": 1.1275, + "step": 590 + }, + { + "epoch": 0.07, + "grad_norm": 0.12895521521568298, + "learning_rate": 8.924999999999999e-05, + "loss": 1.112, + "step": 595 + }, + { + "epoch": 0.07, + "grad_norm": 0.1275772899389267, + "learning_rate": 8.999999999999999e-05, + "loss": 1.1171, + "step": 600 + }, + { + "epoch": 0.07, + "grad_norm": 0.12136266380548477, + "learning_rate": 9.074999999999998e-05, + "loss": 1.119, + "step": 605 + }, + { + "epoch": 0.07, + "grad_norm": 0.12577873468399048, + "learning_rate": 9.149999999999999e-05, + "loss": 1.059, + "step": 610 + }, + { + "epoch": 0.07, + "grad_norm": 0.11611288040876389, + "learning_rate": 9.224999999999999e-05, + "loss": 1.093, + "step": 615 + }, + { + "epoch": 0.07, + "grad_norm": 0.1267736703157425, + "learning_rate": 9.3e-05, + "loss": 1.1074, + "step": 620 + }, + { + "epoch": 0.08, + "grad_norm": 0.12328934669494629, + "learning_rate": 9.374999999999999e-05, + "loss": 1.0373, + "step": 625 + }, + { + "epoch": 0.08, + "grad_norm": 0.13137248158454895, + "learning_rate": 9.449999999999999e-05, + "loss": 1.0127, + "step": 630 + }, + { + "epoch": 0.08, + "grad_norm": 0.13046279549598694, + "learning_rate": 9.525e-05, + "loss": 1.0549, + "step": 635 + }, + { + "epoch": 0.08, + "grad_norm": 0.11722920835018158, + "learning_rate": 9.599999999999999e-05, + "loss": 1.0051, + "step": 640 + }, + { + "epoch": 0.08, + "grad_norm": 0.14764057099819183, + "learning_rate": 9.675e-05, + "loss": 1.1092, + "step": 645 + }, + { + "epoch": 0.08, + "grad_norm": 0.12383400648832321, + "learning_rate": 9.75e-05, + "loss": 1.1019, + "step": 650 + }, + { + "epoch": 0.08, + "grad_norm": 0.13521508872509003, + "learning_rate": 9.824999999999999e-05, + "loss": 1.0963, + "step": 655 + }, + { + "epoch": 0.08, + "grad_norm": 0.12719859182834625, + "learning_rate": 9.9e-05, + "loss": 1.1604, + "step": 660 + }, + { + "epoch": 0.08, + "grad_norm": 0.14219126105308533, + "learning_rate": 9.975e-05, + "loss": 1.0636, + "step": 665 + }, + { + "epoch": 0.08, + "grad_norm": 0.1256975382566452, + "learning_rate": 0.0001005, + "loss": 1.1046, + "step": 670 + }, + { + "epoch": 0.08, + "grad_norm": 0.12861751019954681, + "learning_rate": 0.00010125, + "loss": 1.0016, + "step": 675 + }, + { + "epoch": 0.08, + "grad_norm": 0.1278897374868393, + "learning_rate": 0.000102, + "loss": 1.108, + "step": 680 + }, + { + "epoch": 0.08, + "grad_norm": 0.11729888617992401, + "learning_rate": 0.00010275, + "loss": 1.1411, + "step": 685 + }, + { + "epoch": 0.08, + "grad_norm": 0.12147238105535507, + "learning_rate": 0.00010349999999999998, + "loss": 1.094, + "step": 690 + }, + { + "epoch": 0.08, + "grad_norm": 0.12336426228284836, + "learning_rate": 0.00010424999999999999, + "loss": 1.0397, + "step": 695 + }, + { + "epoch": 0.08, + "grad_norm": 0.12023478001356125, + "learning_rate": 0.00010499999999999999, + "loss": 1.0665, + "step": 700 + }, + { + "epoch": 0.08, + "grad_norm": 0.11755697429180145, + "learning_rate": 0.00010574999999999998, + "loss": 1.061, + "step": 705 + }, + { + "epoch": 0.09, + "grad_norm": 0.11740587651729584, + "learning_rate": 0.00010649999999999999, + "loss": 1.0637, + "step": 710 + }, + { + "epoch": 0.09, + "grad_norm": 0.13212309777736664, + "learning_rate": 0.00010724999999999999, + "loss": 1.0128, + "step": 715 + }, + { + "epoch": 0.09, + "grad_norm": 0.13335004448890686, + "learning_rate": 0.00010799999999999998, + "loss": 1.064, + "step": 720 + }, + { + "epoch": 0.09, + "grad_norm": 0.12249240279197693, + "learning_rate": 0.00010874999999999999, + "loss": 1.0939, + "step": 725 + }, + { + "epoch": 0.09, + "grad_norm": 0.13641893863677979, + "learning_rate": 0.00010949999999999999, + "loss": 1.1127, + "step": 730 + }, + { + "epoch": 0.09, + "grad_norm": 0.127156600356102, + "learning_rate": 0.00011024999999999998, + "loss": 1.0527, + "step": 735 + }, + { + "epoch": 0.09, + "grad_norm": 0.1096034049987793, + "learning_rate": 0.00011099999999999999, + "loss": 1.1304, + "step": 740 + }, + { + "epoch": 0.09, + "grad_norm": 0.12487472593784332, + "learning_rate": 0.00011174999999999999, + "loss": 1.0862, + "step": 745 + }, + { + "epoch": 0.09, + "grad_norm": 0.12395402044057846, + "learning_rate": 0.0001125, + "loss": 1.0641, + "step": 750 + }, + { + "epoch": 0.09, + "grad_norm": 0.12146257609128952, + "learning_rate": 0.00011324999999999999, + "loss": 1.1303, + "step": 755 + }, + { + "epoch": 0.09, + "grad_norm": 0.12403228133916855, + "learning_rate": 0.00011399999999999999, + "loss": 1.0214, + "step": 760 + }, + { + "epoch": 0.09, + "grad_norm": 0.11013570427894592, + "learning_rate": 0.00011475, + "loss": 1.0696, + "step": 765 + }, + { + "epoch": 0.09, + "grad_norm": 0.11409767717123032, + "learning_rate": 0.00011549999999999999, + "loss": 1.2188, + "step": 770 + }, + { + "epoch": 0.09, + "grad_norm": 0.12632502615451813, + "learning_rate": 0.00011624999999999999, + "loss": 1.113, + "step": 775 + }, + { + "epoch": 0.09, + "grad_norm": 0.11955476552248001, + "learning_rate": 0.000117, + "loss": 1.1429, + "step": 780 + }, + { + "epoch": 0.09, + "grad_norm": 0.13300061225891113, + "learning_rate": 0.00011774999999999999, + "loss": 1.1171, + "step": 785 + }, + { + "epoch": 0.1, + "grad_norm": 0.12019680440425873, + "learning_rate": 0.0001185, + "loss": 1.0675, + "step": 790 + }, + { + "epoch": 0.1, + "grad_norm": 0.12077269703149796, + "learning_rate": 0.00011925, + "loss": 1.0694, + "step": 795 + }, + { + "epoch": 0.1, + "grad_norm": 0.12203273177146912, + "learning_rate": 0.00011999999999999999, + "loss": 1.0378, + "step": 800 + }, + { + "epoch": 0.1, + "grad_norm": 0.1268533170223236, + "learning_rate": 0.00012075, + "loss": 1.1294, + "step": 805 + }, + { + "epoch": 0.1, + "grad_norm": 0.1281730681657791, + "learning_rate": 0.0001215, + "loss": 1.024, + "step": 810 + }, + { + "epoch": 0.1, + "grad_norm": 0.11289986968040466, + "learning_rate": 0.00012225, + "loss": 1.0776, + "step": 815 + }, + { + "epoch": 0.1, + "grad_norm": 0.11376605927944183, + "learning_rate": 0.00012299999999999998, + "loss": 1.107, + "step": 820 + }, + { + "epoch": 0.1, + "grad_norm": 0.1271873265504837, + "learning_rate": 0.00012374999999999997, + "loss": 1.1817, + "step": 825 + }, + { + "epoch": 0.1, + "grad_norm": 0.10974998027086258, + "learning_rate": 0.0001245, + "loss": 1.0228, + "step": 830 + }, + { + "epoch": 0.1, + "grad_norm": 0.13189394772052765, + "learning_rate": 0.00012524999999999998, + "loss": 1.1298, + "step": 835 + }, + { + "epoch": 0.1, + "grad_norm": 0.10656459629535675, + "learning_rate": 0.00012599999999999997, + "loss": 1.0833, + "step": 840 + }, + { + "epoch": 0.1, + "grad_norm": 0.14083336293697357, + "learning_rate": 0.00012675, + "loss": 1.0027, + "step": 845 + }, + { + "epoch": 0.1, + "grad_norm": 0.1123281791806221, + "learning_rate": 0.00012749999999999998, + "loss": 1.0953, + "step": 850 + }, + { + "epoch": 0.1, + "grad_norm": 0.12783868610858917, + "learning_rate": 0.00012824999999999997, + "loss": 1.0793, + "step": 855 + }, + { + "epoch": 0.1, + "grad_norm": 0.12001547962427139, + "learning_rate": 0.000129, + "loss": 1.0685, + "step": 860 + }, + { + "epoch": 0.1, + "grad_norm": 0.11165610700845718, + "learning_rate": 0.00012974999999999998, + "loss": 1.111, + "step": 865 + }, + { + "epoch": 0.1, + "grad_norm": 0.12007506936788559, + "learning_rate": 0.0001305, + "loss": 1.0681, + "step": 870 + }, + { + "epoch": 0.11, + "grad_norm": 0.12255184352397919, + "learning_rate": 0.00013125, + "loss": 1.1185, + "step": 875 + }, + { + "epoch": 0.11, + "grad_norm": 0.12230581790208817, + "learning_rate": 0.00013199999999999998, + "loss": 1.0196, + "step": 880 + }, + { + "epoch": 0.11, + "grad_norm": 0.11624744534492493, + "learning_rate": 0.00013275, + "loss": 1.0478, + "step": 885 + }, + { + "epoch": 0.11, + "grad_norm": 0.11575216054916382, + "learning_rate": 0.0001335, + "loss": 1.0387, + "step": 890 + }, + { + "epoch": 0.11, + "grad_norm": 0.1169903427362442, + "learning_rate": 0.00013424999999999998, + "loss": 1.1127, + "step": 895 + }, + { + "epoch": 0.11, + "grad_norm": 0.11636984348297119, + "learning_rate": 0.000135, + "loss": 1.0555, + "step": 900 + }, + { + "epoch": 0.11, + "grad_norm": 0.12122248113155365, + "learning_rate": 0.00013575, + "loss": 1.1497, + "step": 905 + }, + { + "epoch": 0.11, + "grad_norm": 0.1243358924984932, + "learning_rate": 0.00013649999999999998, + "loss": 1.0675, + "step": 910 + }, + { + "epoch": 0.11, + "grad_norm": 0.11892006546258926, + "learning_rate": 0.00013725, + "loss": 1.1499, + "step": 915 + }, + { + "epoch": 0.11, + "grad_norm": 0.12357810884714127, + "learning_rate": 0.000138, + "loss": 1.0508, + "step": 920 + }, + { + "epoch": 0.11, + "grad_norm": 0.11074083298444748, + "learning_rate": 0.00013874999999999998, + "loss": 0.9855, + "step": 925 + }, + { + "epoch": 0.11, + "grad_norm": 0.1215604916214943, + "learning_rate": 0.0001395, + "loss": 1.0723, + "step": 930 + }, + { + "epoch": 0.11, + "grad_norm": 0.11568805575370789, + "learning_rate": 0.00014025, + "loss": 1.141, + "step": 935 + }, + { + "epoch": 0.11, + "grad_norm": 0.1251344233751297, + "learning_rate": 0.00014099999999999998, + "loss": 1.0817, + "step": 940 + }, + { + "epoch": 0.11, + "grad_norm": 0.12408124655485153, + "learning_rate": 0.00014174999999999998, + "loss": 0.9859, + "step": 945 + }, + { + "epoch": 0.11, + "grad_norm": 0.13382701575756073, + "learning_rate": 0.0001425, + "loss": 1.0902, + "step": 950 + }, + { + "epoch": 0.12, + "grad_norm": 0.12058982998132706, + "learning_rate": 0.00014324999999999999, + "loss": 1.0817, + "step": 955 + }, + { + "epoch": 0.12, + "grad_norm": 0.12119226902723312, + "learning_rate": 0.00014399999999999998, + "loss": 1.0502, + "step": 960 + }, + { + "epoch": 0.12, + "grad_norm": 0.14532142877578735, + "learning_rate": 0.00014475, + "loss": 1.0244, + "step": 965 + }, + { + "epoch": 0.12, + "grad_norm": 0.12690915167331696, + "learning_rate": 0.00014549999999999999, + "loss": 1.1313, + "step": 970 + }, + { + "epoch": 0.12, + "grad_norm": 0.11626715213060379, + "learning_rate": 0.00014624999999999998, + "loss": 1.088, + "step": 975 + }, + { + "epoch": 0.12, + "grad_norm": 0.11747872829437256, + "learning_rate": 0.000147, + "loss": 1.0855, + "step": 980 + }, + { + "epoch": 0.12, + "grad_norm": 0.12771141529083252, + "learning_rate": 0.00014774999999999999, + "loss": 1.1304, + "step": 985 + }, + { + "epoch": 0.12, + "grad_norm": 0.1297111064195633, + "learning_rate": 0.00014849999999999998, + "loss": 1.0307, + "step": 990 + }, + { + "epoch": 0.12, + "grad_norm": 0.12018551677465439, + "learning_rate": 0.00014925, + "loss": 1.0946, + "step": 995 + }, + { + "epoch": 0.12, + "grad_norm": 0.11594726890325546, + "learning_rate": 0.00015, + "loss": 1.0314, + "step": 1000 + }, + { + "epoch": 0.12, + "grad_norm": 0.133875772356987, + "learning_rate": 0.00015074999999999998, + "loss": 1.0018, + "step": 1005 + }, + { + "epoch": 0.12, + "grad_norm": 0.1387271285057068, + "learning_rate": 0.0001515, + "loss": 1.0417, + "step": 1010 + }, + { + "epoch": 0.12, + "grad_norm": 0.11644894629716873, + "learning_rate": 0.00015224999999999996, + "loss": 1.045, + "step": 1015 + }, + { + "epoch": 0.12, + "grad_norm": 0.13126513361930847, + "learning_rate": 0.00015299999999999998, + "loss": 1.0666, + "step": 1020 + }, + { + "epoch": 0.12, + "grad_norm": 0.12716297805309296, + "learning_rate": 0.00015374999999999997, + "loss": 1.0578, + "step": 1025 + }, + { + "epoch": 0.12, + "grad_norm": 0.11895552277565002, + "learning_rate": 0.0001545, + "loss": 1.1052, + "step": 1030 + }, + { + "epoch": 0.12, + "grad_norm": 0.12736555933952332, + "learning_rate": 0.00015524999999999998, + "loss": 0.9881, + "step": 1035 + }, + { + "epoch": 0.13, + "grad_norm": 0.11497914791107178, + "learning_rate": 0.000156, + "loss": 1.0462, + "step": 1040 + }, + { + "epoch": 0.13, + "grad_norm": 0.13537730276584625, + "learning_rate": 0.00015675, + "loss": 0.9891, + "step": 1045 + }, + { + "epoch": 0.13, + "grad_norm": 0.12179003655910492, + "learning_rate": 0.00015749999999999998, + "loss": 1.0565, + "step": 1050 + }, + { + "epoch": 0.13, + "grad_norm": 0.12696489691734314, + "learning_rate": 0.00015824999999999997, + "loss": 1.0551, + "step": 1055 + }, + { + "epoch": 0.13, + "grad_norm": 0.11586569994688034, + "learning_rate": 0.000159, + "loss": 1.1354, + "step": 1060 + }, + { + "epoch": 0.13, + "grad_norm": 0.1284131109714508, + "learning_rate": 0.00015974999999999998, + "loss": 1.1317, + "step": 1065 + }, + { + "epoch": 0.13, + "grad_norm": 0.13363447785377502, + "learning_rate": 0.0001605, + "loss": 1.0358, + "step": 1070 + }, + { + "epoch": 0.13, + "grad_norm": 0.13264328241348267, + "learning_rate": 0.00016125, + "loss": 1.0852, + "step": 1075 + }, + { + "epoch": 0.13, + "grad_norm": 0.12372339516878128, + "learning_rate": 0.000162, + "loss": 0.997, + "step": 1080 + }, + { + "epoch": 0.13, + "grad_norm": 0.12511901557445526, + "learning_rate": 0.00016274999999999997, + "loss": 1.0837, + "step": 1085 + }, + { + "epoch": 0.13, + "grad_norm": 0.12333539128303528, + "learning_rate": 0.0001635, + "loss": 1.0565, + "step": 1090 + }, + { + "epoch": 0.13, + "grad_norm": 0.1194729283452034, + "learning_rate": 0.00016424999999999998, + "loss": 1.0099, + "step": 1095 + }, + { + "epoch": 0.13, + "grad_norm": 0.12635278701782227, + "learning_rate": 0.000165, + "loss": 1.039, + "step": 1100 + }, + { + "epoch": 0.13, + "grad_norm": 0.124270498752594, + "learning_rate": 0.00016575, + "loss": 1.165, + "step": 1105 + }, + { + "epoch": 0.13, + "grad_norm": 0.12829731404781342, + "learning_rate": 0.0001665, + "loss": 1.0653, + "step": 1110 + }, + { + "epoch": 0.13, + "grad_norm": 0.13601495325565338, + "learning_rate": 0.00016724999999999997, + "loss": 1.0509, + "step": 1115 + }, + { + "epoch": 0.13, + "grad_norm": 0.1377023309469223, + "learning_rate": 0.000168, + "loss": 1.0376, + "step": 1120 + }, + { + "epoch": 0.14, + "grad_norm": 0.12421699613332748, + "learning_rate": 0.00016874999999999998, + "loss": 1.0082, + "step": 1125 + }, + { + "epoch": 0.14, + "grad_norm": 0.12038341909646988, + "learning_rate": 0.00016949999999999997, + "loss": 1.0865, + "step": 1130 + }, + { + "epoch": 0.14, + "grad_norm": 0.11900181323289871, + "learning_rate": 0.00017025, + "loss": 1.1051, + "step": 1135 + }, + { + "epoch": 0.14, + "grad_norm": 0.12548121809959412, + "learning_rate": 0.00017099999999999998, + "loss": 1.1436, + "step": 1140 + }, + { + "epoch": 0.14, + "grad_norm": 0.12389590591192245, + "learning_rate": 0.00017175, + "loss": 0.9946, + "step": 1145 + }, + { + "epoch": 0.14, + "grad_norm": 0.13046157360076904, + "learning_rate": 0.00017249999999999996, + "loss": 1.0448, + "step": 1150 + }, + { + "epoch": 0.14, + "grad_norm": 0.13118329644203186, + "learning_rate": 0.00017324999999999998, + "loss": 1.0698, + "step": 1155 + }, + { + "epoch": 0.14, + "grad_norm": 0.11619829386472702, + "learning_rate": 0.00017399999999999997, + "loss": 1.0619, + "step": 1160 + }, + { + "epoch": 0.14, + "grad_norm": 0.12934699654579163, + "learning_rate": 0.00017475, + "loss": 1.0755, + "step": 1165 + }, + { + "epoch": 0.14, + "grad_norm": 0.13581542670726776, + "learning_rate": 0.00017549999999999998, + "loss": 1.0456, + "step": 1170 + }, + { + "epoch": 0.14, + "grad_norm": 0.1390586644411087, + "learning_rate": 0.00017625, + "loss": 1.0822, + "step": 1175 + }, + { + "epoch": 0.14, + "grad_norm": 0.13362278044223785, + "learning_rate": 0.00017699999999999997, + "loss": 1.0588, + "step": 1180 + }, + { + "epoch": 0.14, + "grad_norm": 0.1188979223370552, + "learning_rate": 0.00017774999999999998, + "loss": 1.0941, + "step": 1185 + }, + { + "epoch": 0.14, + "grad_norm": 0.11467181891202927, + "learning_rate": 0.00017849999999999997, + "loss": 1.0485, + "step": 1190 + }, + { + "epoch": 0.14, + "grad_norm": 0.13196997344493866, + "learning_rate": 0.00017925, + "loss": 1.1128, + "step": 1195 + }, + { + "epoch": 0.14, + "grad_norm": 0.13005390763282776, + "learning_rate": 0.00017999999999999998, + "loss": 1.0536, + "step": 1200 + }, + { + "epoch": 0.15, + "grad_norm": 0.13372902572155, + "learning_rate": 0.00018075, + "loss": 1.1123, + "step": 1205 + }, + { + "epoch": 0.15, + "grad_norm": 0.13549335300922394, + "learning_rate": 0.00018149999999999997, + "loss": 1.0998, + "step": 1210 + }, + { + "epoch": 0.15, + "grad_norm": 0.13829483091831207, + "learning_rate": 0.00018224999999999998, + "loss": 1.0174, + "step": 1215 + }, + { + "epoch": 0.15, + "grad_norm": 0.1173296719789505, + "learning_rate": 0.00018299999999999998, + "loss": 1.0898, + "step": 1220 + }, + { + "epoch": 0.15, + "grad_norm": 0.12226750701665878, + "learning_rate": 0.00018375, + "loss": 1.0516, + "step": 1225 + }, + { + "epoch": 0.15, + "grad_norm": 0.12172769755125046, + "learning_rate": 0.00018449999999999999, + "loss": 1.0495, + "step": 1230 + }, + { + "epoch": 0.15, + "grad_norm": 0.12832506000995636, + "learning_rate": 0.00018525, + "loss": 1.0816, + "step": 1235 + }, + { + "epoch": 0.15, + "grad_norm": 0.12533822655677795, + "learning_rate": 0.000186, + "loss": 1.0122, + "step": 1240 + }, + { + "epoch": 0.15, + "grad_norm": 0.13508310914039612, + "learning_rate": 0.00018675, + "loss": 1.0122, + "step": 1245 + }, + { + "epoch": 0.15, + "grad_norm": 0.11473240703344345, + "learning_rate": 0.00018749999999999998, + "loss": 1.0813, + "step": 1250 + }, + { + "epoch": 0.15, + "grad_norm": 0.12293410301208496, + "learning_rate": 0.00018824999999999997, + "loss": 1.03, + "step": 1255 + }, + { + "epoch": 0.15, + "grad_norm": 0.14231853187084198, + "learning_rate": 0.00018899999999999999, + "loss": 1.0724, + "step": 1260 + }, + { + "epoch": 0.15, + "grad_norm": 0.1204795315861702, + "learning_rate": 0.00018974999999999998, + "loss": 1.0449, + "step": 1265 + }, + { + "epoch": 0.15, + "grad_norm": 0.12521016597747803, + "learning_rate": 0.0001905, + "loss": 1.1008, + "step": 1270 + }, + { + "epoch": 0.15, + "grad_norm": 0.12301227450370789, + "learning_rate": 0.00019124999999999996, + "loss": 1.1471, + "step": 1275 + }, + { + "epoch": 0.15, + "grad_norm": 0.1287146359682083, + "learning_rate": 0.00019199999999999998, + "loss": 1.0501, + "step": 1280 + }, + { + "epoch": 0.15, + "grad_norm": 0.13978512585163116, + "learning_rate": 0.00019274999999999997, + "loss": 1.03, + "step": 1285 + }, + { + "epoch": 0.16, + "grad_norm": 0.12224988639354706, + "learning_rate": 0.0001935, + "loss": 1.0996, + "step": 1290 + }, + { + "epoch": 0.16, + "grad_norm": 0.1454314887523651, + "learning_rate": 0.00019424999999999998, + "loss": 1.0529, + "step": 1295 + }, + { + "epoch": 0.16, + "grad_norm": 0.12719061970710754, + "learning_rate": 0.000195, + "loss": 1.0984, + "step": 1300 + }, + { + "epoch": 0.16, + "grad_norm": 0.13518300652503967, + "learning_rate": 0.00019574999999999996, + "loss": 1.0336, + "step": 1305 + }, + { + "epoch": 0.16, + "grad_norm": 0.1353674679994583, + "learning_rate": 0.00019649999999999998, + "loss": 1.1046, + "step": 1310 + }, + { + "epoch": 0.16, + "grad_norm": 0.13263052701950073, + "learning_rate": 0.00019724999999999997, + "loss": 0.9654, + "step": 1315 + }, + { + "epoch": 0.16, + "grad_norm": 0.13989952206611633, + "learning_rate": 0.000198, + "loss": 1.0994, + "step": 1320 + }, + { + "epoch": 0.16, + "grad_norm": 0.13030202686786652, + "learning_rate": 0.00019874999999999998, + "loss": 1.1445, + "step": 1325 + }, + { + "epoch": 0.16, + "grad_norm": 0.12201202660799026, + "learning_rate": 0.0001995, + "loss": 1.0605, + "step": 1330 + }, + { + "epoch": 0.16, + "grad_norm": 0.1385253220796585, + "learning_rate": 0.00020025, + "loss": 1.0405, + "step": 1335 + }, + { + "epoch": 0.16, + "grad_norm": 0.14180108904838562, + "learning_rate": 0.000201, + "loss": 1.0612, + "step": 1340 + }, + { + "epoch": 0.16, + "grad_norm": 0.1332985907793045, + "learning_rate": 0.00020174999999999997, + "loss": 1.0653, + "step": 1345 + }, + { + "epoch": 0.16, + "grad_norm": 0.14803795516490936, + "learning_rate": 0.0002025, + "loss": 1.1237, + "step": 1350 + }, + { + "epoch": 0.16, + "grad_norm": 0.13238713145256042, + "learning_rate": 0.00020324999999999998, + "loss": 1.1257, + "step": 1355 + }, + { + "epoch": 0.16, + "grad_norm": 0.12412701547145844, + "learning_rate": 0.000204, + "loss": 1.0947, + "step": 1360 + }, + { + "epoch": 0.16, + "grad_norm": 0.13152962923049927, + "learning_rate": 0.00020475, + "loss": 1.1676, + "step": 1365 + }, + { + "epoch": 0.17, + "grad_norm": 0.13383346796035767, + "learning_rate": 0.0002055, + "loss": 1.0314, + "step": 1370 + }, + { + "epoch": 0.17, + "grad_norm": 0.12599609792232513, + "learning_rate": 0.00020624999999999997, + "loss": 0.9767, + "step": 1375 + }, + { + "epoch": 0.17, + "grad_norm": 0.12485210597515106, + "learning_rate": 0.00020699999999999996, + "loss": 1.1578, + "step": 1380 + }, + { + "epoch": 0.17, + "grad_norm": 0.15362361073493958, + "learning_rate": 0.00020774999999999998, + "loss": 1.0263, + "step": 1385 + }, + { + "epoch": 0.17, + "grad_norm": 0.13580173254013062, + "learning_rate": 0.00020849999999999997, + "loss": 1.0553, + "step": 1390 + }, + { + "epoch": 0.17, + "grad_norm": 0.13229776918888092, + "learning_rate": 0.00020925, + "loss": 1.0242, + "step": 1395 + }, + { + "epoch": 0.17, + "grad_norm": 0.14048846065998077, + "learning_rate": 0.00020999999999999998, + "loss": 1.0541, + "step": 1400 + }, + { + "epoch": 0.17, + "grad_norm": 0.1278860718011856, + "learning_rate": 0.00021074999999999997, + "loss": 1.0295, + "step": 1405 + }, + { + "epoch": 0.17, + "grad_norm": 0.12558186054229736, + "learning_rate": 0.00021149999999999996, + "loss": 0.9833, + "step": 1410 + }, + { + "epoch": 0.17, + "grad_norm": 0.13044369220733643, + "learning_rate": 0.00021224999999999998, + "loss": 1.0244, + "step": 1415 + }, + { + "epoch": 0.17, + "grad_norm": 0.12153918296098709, + "learning_rate": 0.00021299999999999997, + "loss": 1.0513, + "step": 1420 + }, + { + "epoch": 0.17, + "grad_norm": 0.1474679559469223, + "learning_rate": 0.00021375, + "loss": 1.0771, + "step": 1425 + }, + { + "epoch": 0.17, + "grad_norm": 0.13985954225063324, + "learning_rate": 0.00021449999999999998, + "loss": 1.0419, + "step": 1430 + }, + { + "epoch": 0.17, + "grad_norm": 0.13685761392116547, + "learning_rate": 0.00021525, + "loss": 1.045, + "step": 1435 + }, + { + "epoch": 0.17, + "grad_norm": 0.13151884078979492, + "learning_rate": 0.00021599999999999996, + "loss": 1.0856, + "step": 1440 + }, + { + "epoch": 0.17, + "grad_norm": 0.12494616955518723, + "learning_rate": 0.00021674999999999998, + "loss": 1.093, + "step": 1445 + }, + { + "epoch": 0.17, + "grad_norm": 0.1363985389471054, + "learning_rate": 0.00021749999999999997, + "loss": 1.0921, + "step": 1450 + }, + { + "epoch": 0.18, + "grad_norm": 0.13812515139579773, + "learning_rate": 0.00021825, + "loss": 1.0832, + "step": 1455 + }, + { + "epoch": 0.18, + "grad_norm": 0.1363314837217331, + "learning_rate": 0.00021899999999999998, + "loss": 1.0995, + "step": 1460 + }, + { + "epoch": 0.18, + "grad_norm": 0.12492544949054718, + "learning_rate": 0.00021975, + "loss": 1.0806, + "step": 1465 + }, + { + "epoch": 0.18, + "grad_norm": 0.12725725769996643, + "learning_rate": 0.00022049999999999997, + "loss": 1.0276, + "step": 1470 + }, + { + "epoch": 0.18, + "grad_norm": 0.11777817457914352, + "learning_rate": 0.00022124999999999998, + "loss": 0.9419, + "step": 1475 + }, + { + "epoch": 0.18, + "grad_norm": 0.13982278108596802, + "learning_rate": 0.00022199999999999998, + "loss": 1.0673, + "step": 1480 + }, + { + "epoch": 0.18, + "grad_norm": 0.15291635692119598, + "learning_rate": 0.00022275, + "loss": 1.0753, + "step": 1485 + }, + { + "epoch": 0.18, + "grad_norm": 0.1316549926996231, + "learning_rate": 0.00022349999999999998, + "loss": 1.0793, + "step": 1490 + }, + { + "epoch": 0.18, + "grad_norm": 0.13658076524734497, + "learning_rate": 0.00022425, + "loss": 1.0742, + "step": 1495 + }, + { + "epoch": 0.18, + "grad_norm": 0.12889617681503296, + "learning_rate": 0.000225, + "loss": 1.0272, + "step": 1500 + }, + { + "epoch": 0.18, + "grad_norm": 0.1431025117635727, + "learning_rate": 0.00022574999999999996, + "loss": 1.056, + "step": 1505 + }, + { + "epoch": 0.18, + "grad_norm": 0.12882299721240997, + "learning_rate": 0.00022649999999999998, + "loss": 1.0965, + "step": 1510 + }, + { + "epoch": 0.18, + "grad_norm": 0.1326344758272171, + "learning_rate": 0.00022724999999999997, + "loss": 1.1601, + "step": 1515 + }, + { + "epoch": 0.18, + "grad_norm": 0.1312105506658554, + "learning_rate": 0.00022799999999999999, + "loss": 1.0286, + "step": 1520 + }, + { + "epoch": 0.18, + "grad_norm": 0.13419494032859802, + "learning_rate": 0.00022874999999999998, + "loss": 1.0998, + "step": 1525 + }, + { + "epoch": 0.18, + "grad_norm": 0.12598495185375214, + "learning_rate": 0.0002295, + "loss": 1.0099, + "step": 1530 + }, + { + "epoch": 0.18, + "grad_norm": 0.12234248965978622, + "learning_rate": 0.00023024999999999996, + "loss": 1.0512, + "step": 1535 + }, + { + "epoch": 0.19, + "grad_norm": 0.1486140936613083, + "learning_rate": 0.00023099999999999998, + "loss": 1.0563, + "step": 1540 + }, + { + "epoch": 0.19, + "grad_norm": 0.13529768586158752, + "learning_rate": 0.00023174999999999997, + "loss": 0.9952, + "step": 1545 + }, + { + "epoch": 0.19, + "grad_norm": 0.1367512345314026, + "learning_rate": 0.00023249999999999999, + "loss": 0.9924, + "step": 1550 + }, + { + "epoch": 0.19, + "grad_norm": 0.14160625636577606, + "learning_rate": 0.00023324999999999998, + "loss": 0.9568, + "step": 1555 + }, + { + "epoch": 0.19, + "grad_norm": 0.13413269817829132, + "learning_rate": 0.000234, + "loss": 1.0429, + "step": 1560 + }, + { + "epoch": 0.19, + "grad_norm": 0.13511444628238678, + "learning_rate": 0.00023474999999999996, + "loss": 1.0548, + "step": 1565 + }, + { + "epoch": 0.19, + "grad_norm": 0.13897758722305298, + "learning_rate": 0.00023549999999999998, + "loss": 1.0312, + "step": 1570 + }, + { + "epoch": 0.19, + "grad_norm": 0.13271476328372955, + "learning_rate": 0.00023624999999999997, + "loss": 1.1496, + "step": 1575 + }, + { + "epoch": 0.19, + "grad_norm": 0.12695930898189545, + "learning_rate": 0.000237, + "loss": 1.0083, + "step": 1580 + }, + { + "epoch": 0.19, + "grad_norm": 0.1538165807723999, + "learning_rate": 0.00023774999999999998, + "loss": 1.1104, + "step": 1585 + }, + { + "epoch": 0.19, + "grad_norm": 0.13440971076488495, + "learning_rate": 0.0002385, + "loss": 1.0746, + "step": 1590 + }, + { + "epoch": 0.19, + "grad_norm": 0.15520578622817993, + "learning_rate": 0.00023925, + "loss": 1.0702, + "step": 1595 + }, + { + "epoch": 0.19, + "grad_norm": 0.1417379379272461, + "learning_rate": 0.00023999999999999998, + "loss": 1.0313, + "step": 1600 + }, + { + "epoch": 0.19, + "grad_norm": 0.1348305493593216, + "learning_rate": 0.00024074999999999997, + "loss": 1.0489, + "step": 1605 + }, + { + "epoch": 0.19, + "grad_norm": 0.13864970207214355, + "learning_rate": 0.0002415, + "loss": 1.0563, + "step": 1610 + }, + { + "epoch": 0.19, + "grad_norm": 0.12686601281166077, + "learning_rate": 0.00024224999999999998, + "loss": 1.1186, + "step": 1615 + }, + { + "epoch": 0.2, + "grad_norm": 0.1524277776479721, + "learning_rate": 0.000243, + "loss": 1.0285, + "step": 1620 + }, + { + "epoch": 0.2, + "grad_norm": 0.13417251408100128, + "learning_rate": 0.00024375, + "loss": 1.0816, + "step": 1625 + }, + { + "epoch": 0.2, + "grad_norm": 0.13841432332992554, + "learning_rate": 0.0002445, + "loss": 0.9558, + "step": 1630 + }, + { + "epoch": 0.2, + "grad_norm": 0.14064748585224152, + "learning_rate": 0.00024524999999999997, + "loss": 1.0487, + "step": 1635 + }, + { + "epoch": 0.2, + "grad_norm": 0.15057429671287537, + "learning_rate": 0.00024599999999999996, + "loss": 1.0382, + "step": 1640 + }, + { + "epoch": 0.2, + "grad_norm": 0.140291228890419, + "learning_rate": 0.00024675, + "loss": 1.0297, + "step": 1645 + }, + { + "epoch": 0.2, + "grad_norm": 0.1320800930261612, + "learning_rate": 0.00024749999999999994, + "loss": 1.1047, + "step": 1650 + }, + { + "epoch": 0.2, + "grad_norm": 0.13335539400577545, + "learning_rate": 0.00024825, + "loss": 1.0456, + "step": 1655 + }, + { + "epoch": 0.2, + "grad_norm": 0.14252524077892303, + "learning_rate": 0.000249, + "loss": 1.1067, + "step": 1660 + }, + { + "epoch": 0.2, + "grad_norm": 0.14384162425994873, + "learning_rate": 0.00024974999999999997, + "loss": 1.0627, + "step": 1665 + }, + { + "epoch": 0.2, + "grad_norm": 0.1295383721590042, + "learning_rate": 0.00025049999999999996, + "loss": 0.9831, + "step": 1670 + }, + { + "epoch": 0.2, + "grad_norm": 0.14580777287483215, + "learning_rate": 0.00025125, + "loss": 1.1085, + "step": 1675 + }, + { + "epoch": 0.2, + "grad_norm": 0.1554543673992157, + "learning_rate": 0.00025199999999999995, + "loss": 1.0539, + "step": 1680 + }, + { + "epoch": 0.2, + "grad_norm": 0.1461145579814911, + "learning_rate": 0.00025275, + "loss": 1.0761, + "step": 1685 + }, + { + "epoch": 0.2, + "grad_norm": 0.12733665108680725, + "learning_rate": 0.0002535, + "loss": 1.0143, + "step": 1690 + }, + { + "epoch": 0.2, + "grad_norm": 0.16144566237926483, + "learning_rate": 0.00025425, + "loss": 0.9641, + "step": 1695 + }, + { + "epoch": 0.2, + "grad_norm": 0.14675092697143555, + "learning_rate": 0.00025499999999999996, + "loss": 1.029, + "step": 1700 + }, + { + "epoch": 0.21, + "grad_norm": 0.15714582800865173, + "learning_rate": 0.00025575, + "loss": 1.0251, + "step": 1705 + }, + { + "epoch": 0.21, + "grad_norm": 0.1321059912443161, + "learning_rate": 0.00025649999999999995, + "loss": 1.0629, + "step": 1710 + }, + { + "epoch": 0.21, + "grad_norm": 0.13915494084358215, + "learning_rate": 0.00025725, + "loss": 1.0581, + "step": 1715 + }, + { + "epoch": 0.21, + "grad_norm": 0.1294393241405487, + "learning_rate": 0.000258, + "loss": 0.939, + "step": 1720 + }, + { + "epoch": 0.21, + "grad_norm": 0.14472445845603943, + "learning_rate": 0.00025875, + "loss": 1.0293, + "step": 1725 + }, + { + "epoch": 0.21, + "grad_norm": 0.1563960462808609, + "learning_rate": 0.00025949999999999997, + "loss": 1.0984, + "step": 1730 + }, + { + "epoch": 0.21, + "grad_norm": 0.13787269592285156, + "learning_rate": 0.00026025, + "loss": 1.0645, + "step": 1735 + }, + { + "epoch": 0.21, + "grad_norm": 0.142653688788414, + "learning_rate": 0.000261, + "loss": 1.0271, + "step": 1740 + }, + { + "epoch": 0.21, + "grad_norm": 0.16362793743610382, + "learning_rate": 0.00026175, + "loss": 0.9973, + "step": 1745 + }, + { + "epoch": 0.21, + "grad_norm": 0.13521093130111694, + "learning_rate": 0.0002625, + "loss": 1.0968, + "step": 1750 + }, + { + "epoch": 0.21, + "grad_norm": 0.1348579227924347, + "learning_rate": 0.00026325, + "loss": 1.084, + "step": 1755 + }, + { + "epoch": 0.21, + "grad_norm": 0.15295952558517456, + "learning_rate": 0.00026399999999999997, + "loss": 1.0843, + "step": 1760 + }, + { + "epoch": 0.21, + "grad_norm": 0.12325224280357361, + "learning_rate": 0.00026474999999999996, + "loss": 1.0448, + "step": 1765 + }, + { + "epoch": 0.21, + "grad_norm": 0.13043299317359924, + "learning_rate": 0.0002655, + "loss": 1.0078, + "step": 1770 + }, + { + "epoch": 0.21, + "grad_norm": 0.13307566940784454, + "learning_rate": 0.00026624999999999994, + "loss": 1.0496, + "step": 1775 + }, + { + "epoch": 0.21, + "grad_norm": 0.1296854168176651, + "learning_rate": 0.000267, + "loss": 1.0945, + "step": 1780 + }, + { + "epoch": 0.22, + "grad_norm": 0.14310450851917267, + "learning_rate": 0.00026775, + "loss": 0.9254, + "step": 1785 + }, + { + "epoch": 0.22, + "grad_norm": 0.14738260209560394, + "learning_rate": 0.00026849999999999997, + "loss": 1.0589, + "step": 1790 + }, + { + "epoch": 0.22, + "grad_norm": 0.1367303431034088, + "learning_rate": 0.00026924999999999996, + "loss": 1.1256, + "step": 1795 + }, + { + "epoch": 0.22, + "grad_norm": 0.1518729031085968, + "learning_rate": 0.00027, + "loss": 1.1417, + "step": 1800 + }, + { + "epoch": 0.22, + "grad_norm": 0.14660024642944336, + "learning_rate": 0.00027074999999999994, + "loss": 1.0219, + "step": 1805 + }, + { + "epoch": 0.22, + "grad_norm": 0.14216068387031555, + "learning_rate": 0.0002715, + "loss": 1.0597, + "step": 1810 + }, + { + "epoch": 0.22, + "grad_norm": 0.1290411353111267, + "learning_rate": 0.00027225, + "loss": 1.0887, + "step": 1815 + }, + { + "epoch": 0.22, + "grad_norm": 0.14220093190670013, + "learning_rate": 0.00027299999999999997, + "loss": 1.0228, + "step": 1820 + }, + { + "epoch": 0.22, + "grad_norm": 0.15115365386009216, + "learning_rate": 0.00027374999999999996, + "loss": 1.0877, + "step": 1825 + }, + { + "epoch": 0.22, + "grad_norm": 0.12890294194221497, + "learning_rate": 0.0002745, + "loss": 1.1167, + "step": 1830 + }, + { + "epoch": 0.22, + "grad_norm": 0.14485861361026764, + "learning_rate": 0.00027525, + "loss": 1.0937, + "step": 1835 + }, + { + "epoch": 0.22, + "grad_norm": 0.1425914615392685, + "learning_rate": 0.000276, + "loss": 1.0565, + "step": 1840 + }, + { + "epoch": 0.22, + "grad_norm": 0.16691897809505463, + "learning_rate": 0.00027675, + "loss": 1.0881, + "step": 1845 + }, + { + "epoch": 0.22, + "grad_norm": 0.1356891691684723, + "learning_rate": 0.00027749999999999997, + "loss": 0.9696, + "step": 1850 + }, + { + "epoch": 0.22, + "grad_norm": 0.14420901238918304, + "learning_rate": 0.00027824999999999996, + "loss": 0.9847, + "step": 1855 + }, + { + "epoch": 0.22, + "grad_norm": 0.1378943920135498, + "learning_rate": 0.000279, + "loss": 0.9863, + "step": 1860 + }, + { + "epoch": 0.22, + "grad_norm": 0.13317394256591797, + "learning_rate": 0.00027975, + "loss": 1.0724, + "step": 1865 + }, + { + "epoch": 0.23, + "grad_norm": 0.14231087267398834, + "learning_rate": 0.0002805, + "loss": 1.0886, + "step": 1870 + }, + { + "epoch": 0.23, + "grad_norm": 0.13940946757793427, + "learning_rate": 0.00028125, + "loss": 1.047, + "step": 1875 + }, + { + "epoch": 0.23, + "grad_norm": 0.13190115988254547, + "learning_rate": 0.00028199999999999997, + "loss": 1.0192, + "step": 1880 + }, + { + "epoch": 0.23, + "grad_norm": 0.16745632886886597, + "learning_rate": 0.00028274999999999996, + "loss": 0.9855, + "step": 1885 + }, + { + "epoch": 0.23, + "grad_norm": 0.1373772770166397, + "learning_rate": 0.00028349999999999995, + "loss": 1.0242, + "step": 1890 + }, + { + "epoch": 0.23, + "grad_norm": 0.13555435836315155, + "learning_rate": 0.00028425, + "loss": 1.075, + "step": 1895 + }, + { + "epoch": 0.23, + "grad_norm": 0.145888552069664, + "learning_rate": 0.000285, + "loss": 1.0011, + "step": 1900 + }, + { + "epoch": 0.23, + "grad_norm": 0.14030475914478302, + "learning_rate": 0.00028575, + "loss": 1.0065, + "step": 1905 + }, + { + "epoch": 0.23, + "grad_norm": 0.13973671197891235, + "learning_rate": 0.00028649999999999997, + "loss": 0.9631, + "step": 1910 + }, + { + "epoch": 0.23, + "grad_norm": 0.15829509496688843, + "learning_rate": 0.00028724999999999996, + "loss": 1.0565, + "step": 1915 + }, + { + "epoch": 0.23, + "grad_norm": 0.14472809433937073, + "learning_rate": 0.00028799999999999995, + "loss": 0.9725, + "step": 1920 + }, + { + "epoch": 0.23, + "grad_norm": 0.16352474689483643, + "learning_rate": 0.00028875, + "loss": 0.9782, + "step": 1925 + }, + { + "epoch": 0.23, + "grad_norm": 0.14885759353637695, + "learning_rate": 0.0002895, + "loss": 1.0521, + "step": 1930 + }, + { + "epoch": 0.23, + "grad_norm": 0.14806737005710602, + "learning_rate": 0.00029025, + "loss": 1.0596, + "step": 1935 + }, + { + "epoch": 0.23, + "grad_norm": 0.15192674100399017, + "learning_rate": 0.00029099999999999997, + "loss": 1.0307, + "step": 1940 + }, + { + "epoch": 0.23, + "grad_norm": 0.15374353528022766, + "learning_rate": 0.00029174999999999996, + "loss": 1.0676, + "step": 1945 + }, + { + "epoch": 0.23, + "grad_norm": 0.1534864455461502, + "learning_rate": 0.00029249999999999995, + "loss": 0.9044, + "step": 1950 + }, + { + "epoch": 0.24, + "grad_norm": 0.14087004959583282, + "learning_rate": 0.00029325, + "loss": 1.0849, + "step": 1955 + }, + { + "epoch": 0.24, + "grad_norm": 0.13771598041057587, + "learning_rate": 0.000294, + "loss": 1.0698, + "step": 1960 + }, + { + "epoch": 0.24, + "grad_norm": 0.17788104712963104, + "learning_rate": 0.00029475, + "loss": 1.0747, + "step": 1965 + }, + { + "epoch": 0.24, + "grad_norm": 0.13944664597511292, + "learning_rate": 0.00029549999999999997, + "loss": 1.0289, + "step": 1970 + }, + { + "epoch": 0.24, + "grad_norm": 0.14483226835727692, + "learning_rate": 0.00029624999999999996, + "loss": 1.0552, + "step": 1975 + }, + { + "epoch": 0.24, + "grad_norm": 0.16384923458099365, + "learning_rate": 0.00029699999999999996, + "loss": 1.103, + "step": 1980 + }, + { + "epoch": 0.24, + "grad_norm": 0.17012694478034973, + "learning_rate": 0.00029775, + "loss": 1.0814, + "step": 1985 + }, + { + "epoch": 0.24, + "grad_norm": 0.1577819287776947, + "learning_rate": 0.0002985, + "loss": 0.9313, + "step": 1990 + }, + { + "epoch": 0.24, + "grad_norm": 0.13799041509628296, + "learning_rate": 0.00029925, + "loss": 0.9898, + "step": 1995 + }, + { + "epoch": 0.24, + "grad_norm": 0.15773248672485352, + "learning_rate": 0.0003, + "loss": 1.0522, + "step": 2000 + }, + { + "epoch": 0.24, + "grad_norm": 0.1652466058731079, + "learning_rate": 0.0002999999647025093, + "loss": 1.1033, + "step": 2005 + }, + { + "epoch": 0.24, + "grad_norm": 0.14737606048583984, + "learning_rate": 0.0002999998588100539, + "loss": 1.024, + "step": 2010 + }, + { + "epoch": 0.24, + "grad_norm": 0.15708868205547333, + "learning_rate": 0.0002999996823226836, + "loss": 1.055, + "step": 2015 + }, + { + "epoch": 0.24, + "grad_norm": 0.13661877810955048, + "learning_rate": 0.00029999943524048147, + "loss": 1.0728, + "step": 2020 + }, + { + "epoch": 0.24, + "grad_norm": 0.13963155448436737, + "learning_rate": 0.00029999911756356377, + "loss": 1.0962, + "step": 2025 + }, + { + "epoch": 0.24, + "grad_norm": 0.14523880183696747, + "learning_rate": 0.00029999872929208, + "loss": 0.9992, + "step": 2030 + }, + { + "epoch": 0.25, + "grad_norm": 0.1487034410238266, + "learning_rate": 0.000299998270426213, + "loss": 0.9902, + "step": 2035 + }, + { + "epoch": 0.25, + "grad_norm": 0.14978134632110596, + "learning_rate": 0.0002999977409661786, + "loss": 1.0294, + "step": 2040 + }, + { + "epoch": 0.25, + "grad_norm": 0.15391825139522552, + "learning_rate": 0.00029999714091222604, + "loss": 0.9604, + "step": 2045 + }, + { + "epoch": 0.25, + "grad_norm": 0.15368473529815674, + "learning_rate": 0.0002999964702646377, + "loss": 0.9744, + "step": 2050 + }, + { + "epoch": 0.25, + "grad_norm": 0.15019111335277557, + "learning_rate": 0.00029999572902372925, + "loss": 0.9179, + "step": 2055 + }, + { + "epoch": 0.25, + "grad_norm": 0.151919886469841, + "learning_rate": 0.00029999491718984945, + "loss": 1.0588, + "step": 2060 + }, + { + "epoch": 0.25, + "grad_norm": 0.14386044442653656, + "learning_rate": 0.00029999403476338053, + "loss": 0.9972, + "step": 2065 + }, + { + "epoch": 0.25, + "grad_norm": 0.1599544882774353, + "learning_rate": 0.0002999930817447377, + "loss": 1.01, + "step": 2070 + }, + { + "epoch": 0.25, + "grad_norm": 0.14225023984909058, + "learning_rate": 0.00029999205813436945, + "loss": 1.1015, + "step": 2075 + }, + { + "epoch": 0.25, + "grad_norm": 0.1291441023349762, + "learning_rate": 0.00029999096393275754, + "loss": 1.0002, + "step": 2080 + }, + { + "epoch": 0.25, + "grad_norm": 0.140071839094162, + "learning_rate": 0.000299989799140417, + "loss": 1.1162, + "step": 2085 + }, + { + "epoch": 0.25, + "grad_norm": 0.1789652705192566, + "learning_rate": 0.00029998856375789594, + "loss": 0.9408, + "step": 2090 + }, + { + "epoch": 0.25, + "grad_norm": 0.13465484976768494, + "learning_rate": 0.00029998725778577584, + "loss": 1.0336, + "step": 2095 + }, + { + "epoch": 0.25, + "grad_norm": 0.13943049311637878, + "learning_rate": 0.0002999858812246713, + "loss": 1.0623, + "step": 2100 + }, + { + "epoch": 0.25, + "grad_norm": 0.15316350758075714, + "learning_rate": 0.0002999844340752302, + "loss": 1.0793, + "step": 2105 + }, + { + "epoch": 0.25, + "grad_norm": 0.15929552912712097, + "learning_rate": 0.00029998291633813353, + "loss": 1.0003, + "step": 2110 + }, + { + "epoch": 0.25, + "grad_norm": 0.14730267226696014, + "learning_rate": 0.00029998132801409565, + "loss": 1.0309, + "step": 2115 + }, + { + "epoch": 0.26, + "grad_norm": 0.1534448117017746, + "learning_rate": 0.00029997966910386413, + "loss": 1.0758, + "step": 2120 + }, + { + "epoch": 0.26, + "grad_norm": 0.14620815217494965, + "learning_rate": 0.00029997793960821967, + "loss": 1.1534, + "step": 2125 + }, + { + "epoch": 0.26, + "grad_norm": 0.16130419075489044, + "learning_rate": 0.00029997613952797617, + "loss": 0.9605, + "step": 2130 + }, + { + "epoch": 0.26, + "grad_norm": 0.13983474671840668, + "learning_rate": 0.00029997426886398094, + "loss": 1.0272, + "step": 2135 + }, + { + "epoch": 0.26, + "grad_norm": 0.13644209504127502, + "learning_rate": 0.00029997232761711423, + "loss": 1.0924, + "step": 2140 + }, + { + "epoch": 0.26, + "grad_norm": 0.15886028110980988, + "learning_rate": 0.0002999703157882897, + "loss": 1.1073, + "step": 2145 + }, + { + "epoch": 0.26, + "grad_norm": 0.13936792314052582, + "learning_rate": 0.0002999682333784542, + "loss": 1.0108, + "step": 2150 + }, + { + "epoch": 0.26, + "grad_norm": 0.1483025848865509, + "learning_rate": 0.0002999660803885878, + "loss": 1.0614, + "step": 2155 + }, + { + "epoch": 0.26, + "grad_norm": 0.16978415846824646, + "learning_rate": 0.00029996385681970377, + "loss": 1.0051, + "step": 2160 + }, + { + "epoch": 0.26, + "grad_norm": 0.1511315107345581, + "learning_rate": 0.0002999615626728486, + "loss": 0.9862, + "step": 2165 + }, + { + "epoch": 0.26, + "grad_norm": 0.16898401081562042, + "learning_rate": 0.00029995919794910186, + "loss": 1.0046, + "step": 2170 + }, + { + "epoch": 0.26, + "grad_norm": 0.1466691642999649, + "learning_rate": 0.00029995676264957667, + "loss": 1.0005, + "step": 2175 + }, + { + "epoch": 0.26, + "grad_norm": 0.16119731962680817, + "learning_rate": 0.000299954256775419, + "loss": 0.9917, + "step": 2180 + }, + { + "epoch": 0.26, + "grad_norm": 0.17459401488304138, + "learning_rate": 0.00029995168032780826, + "loss": 1.0336, + "step": 2185 + }, + { + "epoch": 0.26, + "grad_norm": 0.14910408854484558, + "learning_rate": 0.0002999490333079571, + "loss": 0.9864, + "step": 2190 + }, + { + "epoch": 0.26, + "grad_norm": 0.16221760213375092, + "learning_rate": 0.00029994631571711114, + "loss": 1.0274, + "step": 2195 + }, + { + "epoch": 0.27, + "grad_norm": 0.1858038604259491, + "learning_rate": 0.0002999435275565495, + "loss": 1.0693, + "step": 2200 + }, + { + "epoch": 0.27, + "grad_norm": 0.1565389484167099, + "learning_rate": 0.00029994066882758425, + "loss": 0.958, + "step": 2205 + }, + { + "epoch": 0.27, + "grad_norm": 0.1515437662601471, + "learning_rate": 0.00029993773953156095, + "loss": 1.1011, + "step": 2210 + }, + { + "epoch": 0.27, + "grad_norm": 0.16269664466381073, + "learning_rate": 0.0002999347396698581, + "loss": 1.0161, + "step": 2215 + }, + { + "epoch": 0.27, + "grad_norm": 0.18648435175418854, + "learning_rate": 0.00029993166924388755, + "loss": 0.9996, + "step": 2220 + }, + { + "epoch": 0.27, + "grad_norm": 0.15165026485919952, + "learning_rate": 0.00029992852825509443, + "loss": 1.0186, + "step": 2225 + }, + { + "epoch": 0.27, + "grad_norm": 0.15864279866218567, + "learning_rate": 0.00029992531670495695, + "loss": 1.1264, + "step": 2230 + }, + { + "epoch": 0.27, + "grad_norm": 0.13871487975120544, + "learning_rate": 0.00029992203459498654, + "loss": 1.0645, + "step": 2235 + }, + { + "epoch": 0.27, + "grad_norm": 0.15242180228233337, + "learning_rate": 0.0002999186819267279, + "loss": 1.004, + "step": 2240 + }, + { + "epoch": 0.27, + "grad_norm": 0.16039399802684784, + "learning_rate": 0.0002999152587017589, + "loss": 1.0212, + "step": 2245 + }, + { + "epoch": 0.27, + "grad_norm": 0.1544654369354248, + "learning_rate": 0.0002999117649216906, + "loss": 1.0053, + "step": 2250 + }, + { + "epoch": 0.27, + "grad_norm": 0.15874332189559937, + "learning_rate": 0.0002999082005881673, + "loss": 0.991, + "step": 2255 + }, + { + "epoch": 0.27, + "grad_norm": 0.1567724049091339, + "learning_rate": 0.0002999045657028666, + "loss": 0.9868, + "step": 2260 + }, + { + "epoch": 0.27, + "grad_norm": 0.168124258518219, + "learning_rate": 0.000299900860267499, + "loss": 0.9966, + "step": 2265 + }, + { + "epoch": 0.27, + "grad_norm": 0.14472615718841553, + "learning_rate": 0.0002998970842838086, + "loss": 1.0268, + "step": 2270 + }, + { + "epoch": 0.27, + "grad_norm": 0.1609342396259308, + "learning_rate": 0.0002998932377535723, + "loss": 1.1297, + "step": 2275 + }, + { + "epoch": 0.27, + "grad_norm": 0.14865976572036743, + "learning_rate": 0.0002998893206786006, + "loss": 1.0403, + "step": 2280 + }, + { + "epoch": 0.28, + "grad_norm": 0.162654310464859, + "learning_rate": 0.0002998853330607369, + "loss": 1.0732, + "step": 2285 + }, + { + "epoch": 0.28, + "grad_norm": 0.15120600163936615, + "learning_rate": 0.0002998812749018579, + "loss": 1.0225, + "step": 2290 + }, + { + "epoch": 0.28, + "grad_norm": 0.14963847398757935, + "learning_rate": 0.0002998771462038735, + "loss": 0.9712, + "step": 2295 + }, + { + "epoch": 0.28, + "grad_norm": 0.14909087121486664, + "learning_rate": 0.00029987294696872687, + "loss": 1.0717, + "step": 2300 + }, + { + "epoch": 0.28, + "grad_norm": 0.15834349393844604, + "learning_rate": 0.00029986867719839427, + "loss": 1.1024, + "step": 2305 + }, + { + "epoch": 0.28, + "grad_norm": 0.14275187253952026, + "learning_rate": 0.00029986433689488515, + "loss": 1.0141, + "step": 2310 + }, + { + "epoch": 0.28, + "grad_norm": 0.148543119430542, + "learning_rate": 0.0002998599260602423, + "loss": 1.0205, + "step": 2315 + }, + { + "epoch": 0.28, + "grad_norm": 0.14403221011161804, + "learning_rate": 0.00029985544469654155, + "loss": 1.0232, + "step": 2320 + }, + { + "epoch": 0.28, + "grad_norm": 0.1535564363002777, + "learning_rate": 0.000299850892805892, + "loss": 0.9793, + "step": 2325 + }, + { + "epoch": 0.28, + "grad_norm": 0.143857941031456, + "learning_rate": 0.00029984627039043583, + "loss": 0.9961, + "step": 2330 + }, + { + "epoch": 0.28, + "grad_norm": 0.17255234718322754, + "learning_rate": 0.0002998415774523486, + "loss": 0.9796, + "step": 2335 + }, + { + "epoch": 0.28, + "grad_norm": 0.15468095242977142, + "learning_rate": 0.00029983681399383896, + "loss": 1.0223, + "step": 2340 + }, + { + "epoch": 0.28, + "grad_norm": 0.14770975708961487, + "learning_rate": 0.00029983198001714873, + "loss": 1.0694, + "step": 2345 + }, + { + "epoch": 0.28, + "grad_norm": 0.14611324667930603, + "learning_rate": 0.00029982707552455293, + "loss": 1.0747, + "step": 2350 + }, + { + "epoch": 0.28, + "grad_norm": 0.19976738095283508, + "learning_rate": 0.0002998221005183598, + "loss": 1.0519, + "step": 2355 + }, + { + "epoch": 0.28, + "grad_norm": 0.1997351050376892, + "learning_rate": 0.0002998170550009107, + "loss": 1.1093, + "step": 2360 + }, + { + "epoch": 0.28, + "grad_norm": 0.15991829335689545, + "learning_rate": 0.0002998119389745802, + "loss": 1.029, + "step": 2365 + }, + { + "epoch": 0.29, + "grad_norm": 0.1683609038591385, + "learning_rate": 0.0002998067524417762, + "loss": 0.8991, + "step": 2370 + }, + { + "epoch": 0.29, + "grad_norm": 0.1613713800907135, + "learning_rate": 0.00029980149540493955, + "loss": 1.0849, + "step": 2375 + }, + { + "epoch": 0.29, + "grad_norm": 0.15293876826763153, + "learning_rate": 0.0002997961678665444, + "loss": 1.0498, + "step": 2380 + }, + { + "epoch": 0.29, + "grad_norm": 0.15544572472572327, + "learning_rate": 0.0002997907698290981, + "loss": 0.9695, + "step": 2385 + }, + { + "epoch": 0.29, + "grad_norm": 0.16265787184238434, + "learning_rate": 0.0002997853012951411, + "loss": 0.9936, + "step": 2390 + }, + { + "epoch": 0.29, + "grad_norm": 0.144724041223526, + "learning_rate": 0.00029977976226724706, + "loss": 1.0893, + "step": 2395 + }, + { + "epoch": 0.29, + "grad_norm": 0.1512562781572342, + "learning_rate": 0.00029977415274802294, + "loss": 1.0602, + "step": 2400 + }, + { + "epoch": 0.29, + "grad_norm": 0.135329931974411, + "learning_rate": 0.0002997684727401086, + "loss": 1.0371, + "step": 2405 + }, + { + "epoch": 0.29, + "grad_norm": 0.15818721055984497, + "learning_rate": 0.00029976272224617744, + "loss": 1.0003, + "step": 2410 + }, + { + "epoch": 0.29, + "grad_norm": 0.16431140899658203, + "learning_rate": 0.00029975690126893566, + "loss": 1.017, + "step": 2415 + }, + { + "epoch": 0.29, + "grad_norm": 0.1531086415052414, + "learning_rate": 0.00029975100981112284, + "loss": 1.0058, + "step": 2420 + }, + { + "epoch": 0.29, + "grad_norm": 0.15581297874450684, + "learning_rate": 0.0002997450478755118, + "loss": 0.9911, + "step": 2425 + }, + { + "epoch": 0.29, + "grad_norm": 0.14527705311775208, + "learning_rate": 0.0002997390154649083, + "loss": 1.0665, + "step": 2430 + }, + { + "epoch": 0.29, + "grad_norm": 0.1660381704568863, + "learning_rate": 0.0002997329125821515, + "loss": 0.9779, + "step": 2435 + }, + { + "epoch": 0.29, + "grad_norm": 0.18006062507629395, + "learning_rate": 0.0002997267392301135, + "loss": 1.0496, + "step": 2440 + }, + { + "epoch": 0.29, + "grad_norm": 0.16907352209091187, + "learning_rate": 0.00029972049541169974, + "loss": 0.9899, + "step": 2445 + }, + { + "epoch": 0.3, + "grad_norm": 0.15815675258636475, + "learning_rate": 0.00029971418112984883, + "loss": 1.061, + "step": 2450 + }, + { + "epoch": 0.3, + "grad_norm": 0.15669722855091095, + "learning_rate": 0.0002997077963875324, + "loss": 1.0424, + "step": 2455 + }, + { + "epoch": 0.3, + "grad_norm": 0.14894799888134003, + "learning_rate": 0.00029970134118775533, + "loss": 0.9602, + "step": 2460 + }, + { + "epoch": 0.3, + "grad_norm": 0.16584117710590363, + "learning_rate": 0.00029969481553355565, + "loss": 0.9836, + "step": 2465 + }, + { + "epoch": 0.3, + "grad_norm": 0.15635208785533905, + "learning_rate": 0.0002996882194280046, + "loss": 0.9357, + "step": 2470 + }, + { + "epoch": 0.3, + "grad_norm": 0.16133363544940948, + "learning_rate": 0.0002996815528742065, + "loss": 0.9737, + "step": 2475 + }, + { + "epoch": 0.3, + "grad_norm": 0.1643681675195694, + "learning_rate": 0.00029967481587529884, + "loss": 1.0039, + "step": 2480 + }, + { + "epoch": 0.3, + "grad_norm": 0.16857954859733582, + "learning_rate": 0.0002996680084344523, + "loss": 1.0575, + "step": 2485 + }, + { + "epoch": 0.3, + "grad_norm": 0.16123433411121368, + "learning_rate": 0.0002996611305548707, + "loss": 0.9418, + "step": 2490 + }, + { + "epoch": 0.3, + "grad_norm": 0.1680585891008377, + "learning_rate": 0.0002996541822397909, + "loss": 0.9875, + "step": 2495 + }, + { + "epoch": 0.3, + "grad_norm": 0.1484622061252594, + "learning_rate": 0.00029964716349248306, + "loss": 0.824, + "step": 2500 + }, + { + "epoch": 0.3, + "grad_norm": 0.14574463665485382, + "learning_rate": 0.0002996400743162505, + "loss": 1.0491, + "step": 2505 + }, + { + "epoch": 0.3, + "grad_norm": 0.15741370618343353, + "learning_rate": 0.0002996329147144296, + "loss": 0.9946, + "step": 2510 + }, + { + "epoch": 0.3, + "grad_norm": 0.15442806482315063, + "learning_rate": 0.0002996256846903898, + "loss": 0.9688, + "step": 2515 + }, + { + "epoch": 0.3, + "grad_norm": 0.15500134229660034, + "learning_rate": 0.00029961838424753394, + "loss": 1.0, + "step": 2520 + }, + { + "epoch": 0.3, + "grad_norm": 0.13550515472888947, + "learning_rate": 0.0002996110133892978, + "loss": 0.9205, + "step": 2525 + }, + { + "epoch": 0.3, + "grad_norm": 0.14668314158916473, + "learning_rate": 0.00029960357211915024, + "loss": 0.9877, + "step": 2530 + }, + { + "epoch": 0.31, + "grad_norm": 0.1662401258945465, + "learning_rate": 0.0002995960604405935, + "loss": 1.0079, + "step": 2535 + }, + { + "epoch": 0.31, + "grad_norm": 0.14927972853183746, + "learning_rate": 0.00029958847835716285, + "loss": 0.8561, + "step": 2540 + }, + { + "epoch": 0.31, + "grad_norm": 0.14817696809768677, + "learning_rate": 0.0002995808258724265, + "loss": 0.9245, + "step": 2545 + }, + { + "epoch": 0.31, + "grad_norm": 0.1539267748594284, + "learning_rate": 0.00029957310298998614, + "loss": 1.0536, + "step": 2550 + }, + { + "epoch": 0.31, + "grad_norm": 0.16719764471054077, + "learning_rate": 0.00029956530971347634, + "loss": 0.9806, + "step": 2555 + }, + { + "epoch": 0.31, + "grad_norm": 0.1492961347103119, + "learning_rate": 0.0002995574460465648, + "loss": 0.9849, + "step": 2560 + }, + { + "epoch": 0.31, + "grad_norm": 0.16492590308189392, + "learning_rate": 0.00029954951199295257, + "loss": 1.0159, + "step": 2565 + }, + { + "epoch": 0.31, + "grad_norm": 0.16332361102104187, + "learning_rate": 0.0002995415075563736, + "loss": 1.1316, + "step": 2570 + }, + { + "epoch": 0.31, + "grad_norm": 0.15671640634536743, + "learning_rate": 0.0002995334327405951, + "loss": 0.9887, + "step": 2575 + }, + { + "epoch": 0.31, + "grad_norm": 0.1556137055158615, + "learning_rate": 0.00029952528754941725, + "loss": 1.0926, + "step": 2580 + }, + { + "epoch": 0.31, + "grad_norm": 0.1703471690416336, + "learning_rate": 0.00029951707198667347, + "loss": 0.9838, + "step": 2585 + }, + { + "epoch": 0.31, + "grad_norm": 0.15181928873062134, + "learning_rate": 0.0002995087860562304, + "loss": 0.9467, + "step": 2590 + }, + { + "epoch": 0.31, + "grad_norm": 0.16134661436080933, + "learning_rate": 0.0002995004297619875, + "loss": 1.0854, + "step": 2595 + }, + { + "epoch": 0.31, + "grad_norm": 0.17149412631988525, + "learning_rate": 0.0002994920031078776, + "loss": 1.0174, + "step": 2600 + }, + { + "epoch": 0.31, + "grad_norm": 0.15041255950927734, + "learning_rate": 0.0002994835060978666, + "loss": 1.0738, + "step": 2605 + }, + { + "epoch": 0.31, + "grad_norm": 0.1523531824350357, + "learning_rate": 0.0002994749387359534, + "loss": 1.0006, + "step": 2610 + }, + { + "epoch": 0.32, + "grad_norm": 0.15003371238708496, + "learning_rate": 0.0002994663010261701, + "loss": 1.0203, + "step": 2615 + }, + { + "epoch": 0.32, + "grad_norm": 0.1510554850101471, + "learning_rate": 0.000299457592972582, + "loss": 0.9488, + "step": 2620 + }, + { + "epoch": 0.32, + "grad_norm": 0.1823001652956009, + "learning_rate": 0.0002994488145792872, + "loss": 0.9754, + "step": 2625 + }, + { + "epoch": 0.32, + "grad_norm": 0.15630380809307098, + "learning_rate": 0.00029943996585041736, + "loss": 1.1082, + "step": 2630 + }, + { + "epoch": 0.32, + "grad_norm": 0.1587289720773697, + "learning_rate": 0.0002994310467901367, + "loss": 1.0511, + "step": 2635 + }, + { + "epoch": 0.32, + "grad_norm": 0.1543097347021103, + "learning_rate": 0.00029942205740264306, + "loss": 1.0272, + "step": 2640 + }, + { + "epoch": 0.32, + "grad_norm": 0.16899822652339935, + "learning_rate": 0.00029941299769216704, + "loss": 1.0257, + "step": 2645 + }, + { + "epoch": 0.32, + "grad_norm": 0.14778625965118408, + "learning_rate": 0.00029940386766297246, + "loss": 0.965, + "step": 2650 + }, + { + "epoch": 0.32, + "grad_norm": 0.15615618228912354, + "learning_rate": 0.00029939466731935616, + "loss": 1.0942, + "step": 2655 + }, + { + "epoch": 0.32, + "grad_norm": 0.15470904111862183, + "learning_rate": 0.0002993853966656482, + "loss": 0.9682, + "step": 2660 + }, + { + "epoch": 0.32, + "grad_norm": 0.16802319884300232, + "learning_rate": 0.0002993760557062117, + "loss": 1.1369, + "step": 2665 + }, + { + "epoch": 0.32, + "grad_norm": 0.15009041130542755, + "learning_rate": 0.0002993666444454426, + "loss": 0.9354, + "step": 2670 + }, + { + "epoch": 0.32, + "grad_norm": 0.1728777289390564, + "learning_rate": 0.0002993571628877704, + "loss": 1.0449, + "step": 2675 + }, + { + "epoch": 0.32, + "grad_norm": 0.1533242017030716, + "learning_rate": 0.0002993476110376574, + "loss": 0.9436, + "step": 2680 + }, + { + "epoch": 0.32, + "grad_norm": 0.15997876226902008, + "learning_rate": 0.0002993379888995989, + "loss": 1.0483, + "step": 2685 + }, + { + "epoch": 0.32, + "grad_norm": 0.16369123756885529, + "learning_rate": 0.00029932829647812354, + "loss": 0.9826, + "step": 2690 + }, + { + "epoch": 0.32, + "grad_norm": 0.1492471545934677, + "learning_rate": 0.0002993185337777927, + "loss": 1.0283, + "step": 2695 + }, + { + "epoch": 0.33, + "grad_norm": 0.16298407316207886, + "learning_rate": 0.00029930870080320125, + "loss": 0.9464, + "step": 2700 + }, + { + "epoch": 0.33, + "grad_norm": 0.20998337864875793, + "learning_rate": 0.00029929879755897674, + "loss": 1.0735, + "step": 2705 + }, + { + "epoch": 0.33, + "grad_norm": 0.15956445038318634, + "learning_rate": 0.0002992888240497801, + "loss": 1.037, + "step": 2710 + }, + { + "epoch": 0.33, + "grad_norm": 0.15296678245067596, + "learning_rate": 0.0002992787802803051, + "loss": 1.0028, + "step": 2715 + }, + { + "epoch": 0.33, + "grad_norm": 0.1596369743347168, + "learning_rate": 0.0002992686662552787, + "loss": 1.0732, + "step": 2720 + }, + { + "epoch": 0.33, + "grad_norm": 0.16030800342559814, + "learning_rate": 0.0002992584819794609, + "loss": 1.0009, + "step": 2725 + }, + { + "epoch": 0.33, + "grad_norm": 0.16185402870178223, + "learning_rate": 0.00029924822745764485, + "loss": 0.9573, + "step": 2730 + }, + { + "epoch": 0.33, + "grad_norm": 0.15265031158924103, + "learning_rate": 0.0002992379026946565, + "loss": 1.0878, + "step": 2735 + }, + { + "epoch": 0.33, + "grad_norm": 0.17246966063976288, + "learning_rate": 0.00029922750769535505, + "loss": 0.9797, + "step": 2740 + }, + { + "epoch": 0.33, + "grad_norm": 0.16208067536354065, + "learning_rate": 0.00029921704246463284, + "loss": 1.0398, + "step": 2745 + }, + { + "epoch": 0.33, + "grad_norm": 0.15843920409679413, + "learning_rate": 0.0002992065070074151, + "loss": 0.9905, + "step": 2750 + }, + { + "epoch": 0.33, + "grad_norm": 0.15435267984867096, + "learning_rate": 0.0002991959013286602, + "loss": 1.0302, + "step": 2755 + }, + { + "epoch": 0.33, + "grad_norm": 0.14609667658805847, + "learning_rate": 0.00029918522543335947, + "loss": 0.9949, + "step": 2760 + }, + { + "epoch": 0.33, + "grad_norm": 0.1628682017326355, + "learning_rate": 0.00029917447932653737, + "loss": 0.9568, + "step": 2765 + }, + { + "epoch": 0.33, + "grad_norm": 0.1572001874446869, + "learning_rate": 0.0002991636630132513, + "loss": 1.0415, + "step": 2770 + }, + { + "epoch": 0.33, + "grad_norm": 0.1506408005952835, + "learning_rate": 0.0002991527764985919, + "loss": 0.9368, + "step": 2775 + }, + { + "epoch": 0.33, + "grad_norm": 0.1670454889535904, + "learning_rate": 0.00029914181978768267, + "loss": 1.0478, + "step": 2780 + }, + { + "epoch": 0.34, + "grad_norm": 0.16997024416923523, + "learning_rate": 0.0002991307928856802, + "loss": 1.0219, + "step": 2785 + }, + { + "epoch": 0.34, + "grad_norm": 0.15132923424243927, + "learning_rate": 0.00029911969579777414, + "loss": 1.0253, + "step": 2790 + }, + { + "epoch": 0.34, + "grad_norm": 0.1776067316532135, + "learning_rate": 0.00029910852852918713, + "loss": 1.0261, + "step": 2795 + }, + { + "epoch": 0.34, + "grad_norm": 0.16810853779315948, + "learning_rate": 0.0002990972910851748, + "loss": 1.1051, + "step": 2800 + }, + { + "epoch": 0.34, + "grad_norm": 0.16523437201976776, + "learning_rate": 0.0002990859834710259, + "loss": 1.0066, + "step": 2805 + }, + { + "epoch": 0.34, + "grad_norm": 0.15905992686748505, + "learning_rate": 0.0002990746056920623, + "loss": 1.0197, + "step": 2810 + }, + { + "epoch": 0.34, + "grad_norm": 0.16057956218719482, + "learning_rate": 0.00029906315775363857, + "loss": 0.9492, + "step": 2815 + }, + { + "epoch": 0.34, + "grad_norm": 0.1586558222770691, + "learning_rate": 0.0002990516396611425, + "loss": 1.0266, + "step": 2820 + }, + { + "epoch": 0.34, + "grad_norm": 0.17754611372947693, + "learning_rate": 0.0002990400514199951, + "loss": 0.9497, + "step": 2825 + }, + { + "epoch": 0.34, + "grad_norm": 0.1540936827659607, + "learning_rate": 0.00029902839303564994, + "loss": 0.9285, + "step": 2830 + }, + { + "epoch": 0.34, + "grad_norm": 0.1709967404603958, + "learning_rate": 0.00029901666451359393, + "loss": 0.9819, + "step": 2835 + }, + { + "epoch": 0.34, + "grad_norm": 0.16606543958187103, + "learning_rate": 0.0002990048658593469, + "loss": 0.999, + "step": 2840 + }, + { + "epoch": 0.34, + "grad_norm": 0.16725273430347443, + "learning_rate": 0.0002989929970784618, + "loss": 1.0302, + "step": 2845 + }, + { + "epoch": 0.34, + "grad_norm": 0.16970454156398773, + "learning_rate": 0.0002989810581765243, + "loss": 1.0709, + "step": 2850 + }, + { + "epoch": 0.34, + "grad_norm": 0.18702805042266846, + "learning_rate": 0.0002989690491591533, + "loss": 0.9305, + "step": 2855 + }, + { + "epoch": 0.34, + "grad_norm": 0.16489920020103455, + "learning_rate": 0.0002989569700320007, + "loss": 0.9604, + "step": 2860 + }, + { + "epoch": 0.35, + "grad_norm": 0.21395571529865265, + "learning_rate": 0.0002989448208007513, + "loss": 0.9137, + "step": 2865 + }, + { + "epoch": 0.35, + "grad_norm": 0.157196506857872, + "learning_rate": 0.00029893260147112287, + "loss": 1.0027, + "step": 2870 + }, + { + "epoch": 0.35, + "grad_norm": 0.16530407965183258, + "learning_rate": 0.0002989203120488663, + "loss": 1.0449, + "step": 2875 + }, + { + "epoch": 0.35, + "grad_norm": 0.15222971141338348, + "learning_rate": 0.0002989079525397654, + "loss": 0.985, + "step": 2880 + }, + { + "epoch": 0.35, + "grad_norm": 0.1723487377166748, + "learning_rate": 0.00029889552294963697, + "loss": 0.9842, + "step": 2885 + }, + { + "epoch": 0.35, + "grad_norm": 0.15952201187610626, + "learning_rate": 0.0002988830232843308, + "loss": 1.0918, + "step": 2890 + }, + { + "epoch": 0.35, + "grad_norm": 0.1670973002910614, + "learning_rate": 0.00029887045354972953, + "loss": 1.0285, + "step": 2895 + }, + { + "epoch": 0.35, + "grad_norm": 0.16919025778770447, + "learning_rate": 0.00029885781375174906, + "loss": 0.9887, + "step": 2900 + }, + { + "epoch": 0.35, + "grad_norm": 0.15993523597717285, + "learning_rate": 0.000298845103896338, + "loss": 1.0421, + "step": 2905 + }, + { + "epoch": 0.35, + "grad_norm": 0.15911990404129028, + "learning_rate": 0.00029883232398947806, + "loss": 1.0282, + "step": 2910 + }, + { + "epoch": 0.35, + "grad_norm": 0.15578506886959076, + "learning_rate": 0.0002988194740371839, + "loss": 1.0445, + "step": 2915 + }, + { + "epoch": 0.35, + "grad_norm": 0.1739857941865921, + "learning_rate": 0.0002988065540455031, + "loss": 1.1877, + "step": 2920 + }, + { + "epoch": 0.35, + "grad_norm": 0.15756095945835114, + "learning_rate": 0.0002987935640205162, + "loss": 0.997, + "step": 2925 + }, + { + "epoch": 0.35, + "grad_norm": 0.1599845141172409, + "learning_rate": 0.00029878050396833685, + "loss": 0.998, + "step": 2930 + }, + { + "epoch": 0.35, + "grad_norm": 0.15747900307178497, + "learning_rate": 0.0002987673738951115, + "loss": 0.9975, + "step": 2935 + }, + { + "epoch": 0.35, + "grad_norm": 0.17130185663700104, + "learning_rate": 0.00029875417380701954, + "loss": 1.0535, + "step": 2940 + }, + { + "epoch": 0.35, + "grad_norm": 0.17812122404575348, + "learning_rate": 0.0002987409037102734, + "loss": 0.9872, + "step": 2945 + }, + { + "epoch": 0.36, + "grad_norm": 0.16236154735088348, + "learning_rate": 0.0002987275636111185, + "loss": 1.0886, + "step": 2950 + }, + { + "epoch": 0.36, + "grad_norm": 0.16785535216331482, + "learning_rate": 0.0002987141535158331, + "loss": 0.9917, + "step": 2955 + }, + { + "epoch": 0.36, + "grad_norm": 0.1586502194404602, + "learning_rate": 0.0002987006734307283, + "loss": 1.0217, + "step": 2960 + }, + { + "epoch": 0.36, + "grad_norm": 0.15011291205883026, + "learning_rate": 0.0002986871233621484, + "loss": 1.0619, + "step": 2965 + }, + { + "epoch": 0.36, + "grad_norm": 0.14755620062351227, + "learning_rate": 0.0002986735033164706, + "loss": 1.0555, + "step": 2970 + }, + { + "epoch": 0.36, + "grad_norm": 0.1600685715675354, + "learning_rate": 0.0002986598133001048, + "loss": 0.9612, + "step": 2975 + }, + { + "epoch": 0.36, + "grad_norm": 0.15799719095230103, + "learning_rate": 0.00029864605331949396, + "loss": 0.9002, + "step": 2980 + }, + { + "epoch": 0.36, + "grad_norm": 0.16441842913627625, + "learning_rate": 0.0002986322233811141, + "loss": 0.9336, + "step": 2985 + }, + { + "epoch": 0.36, + "grad_norm": 0.16389460861682892, + "learning_rate": 0.000298618323491474, + "loss": 0.8653, + "step": 2990 + }, + { + "epoch": 0.36, + "grad_norm": 0.16231980919837952, + "learning_rate": 0.00029860435365711537, + "loss": 1.0476, + "step": 2995 + }, + { + "epoch": 0.36, + "grad_norm": 0.15994654595851898, + "learning_rate": 0.00029859031388461296, + "loss": 0.9567, + "step": 3000 + }, + { + "epoch": 0.36, + "grad_norm": 0.16480427980422974, + "learning_rate": 0.00029857620418057424, + "loss": 0.9938, + "step": 3005 + }, + { + "epoch": 0.36, + "grad_norm": 0.16214433312416077, + "learning_rate": 0.0002985620245516398, + "loss": 0.9345, + "step": 3010 + }, + { + "epoch": 0.36, + "grad_norm": 0.1897832155227661, + "learning_rate": 0.00029854777500448303, + "loss": 0.9927, + "step": 3015 + }, + { + "epoch": 0.36, + "grad_norm": 0.1605633646249771, + "learning_rate": 0.00029853345554581024, + "loss": 0.9714, + "step": 3020 + }, + { + "epoch": 0.36, + "grad_norm": 0.17080001533031464, + "learning_rate": 0.0002985190661823606, + "loss": 1.0147, + "step": 3025 + }, + { + "epoch": 0.37, + "grad_norm": 0.16237355768680573, + "learning_rate": 0.0002985046069209062, + "loss": 1.0514, + "step": 3030 + }, + { + "epoch": 0.37, + "grad_norm": 0.17489008605480194, + "learning_rate": 0.0002984900777682522, + "loss": 0.9702, + "step": 3035 + }, + { + "epoch": 0.37, + "grad_norm": 0.1539561152458191, + "learning_rate": 0.00029847547873123627, + "loss": 0.9965, + "step": 3040 + }, + { + "epoch": 0.37, + "grad_norm": 0.15789541602134705, + "learning_rate": 0.0002984608098167295, + "loss": 1.0079, + "step": 3045 + }, + { + "epoch": 0.37, + "grad_norm": 0.17130644619464874, + "learning_rate": 0.0002984460710316353, + "loss": 1.006, + "step": 3050 + }, + { + "epoch": 0.37, + "grad_norm": 0.19519822299480438, + "learning_rate": 0.0002984312623828903, + "loss": 0.9245, + "step": 3055 + }, + { + "epoch": 0.37, + "grad_norm": 0.16348280012607574, + "learning_rate": 0.000298416383877464, + "loss": 1.042, + "step": 3060 + }, + { + "epoch": 0.37, + "grad_norm": 0.14846982061862946, + "learning_rate": 0.0002984014355223587, + "loss": 1.0291, + "step": 3065 + }, + { + "epoch": 0.37, + "grad_norm": 0.18363744020462036, + "learning_rate": 0.0002983864173246096, + "loss": 1.0021, + "step": 3070 + }, + { + "epoch": 0.37, + "grad_norm": 0.15816280245780945, + "learning_rate": 0.00029837132929128474, + "loss": 0.9942, + "step": 3075 + }, + { + "epoch": 0.37, + "grad_norm": 0.1437516212463379, + "learning_rate": 0.00029835617142948503, + "loss": 1.0911, + "step": 3080 + }, + { + "epoch": 0.37, + "grad_norm": 0.16180740296840668, + "learning_rate": 0.0002983409437463443, + "loss": 1.0143, + "step": 3085 + }, + { + "epoch": 0.37, + "grad_norm": 0.1789834052324295, + "learning_rate": 0.0002983256462490292, + "loss": 1.0422, + "step": 3090 + }, + { + "epoch": 0.37, + "grad_norm": 0.17736496031284332, + "learning_rate": 0.00029831027894473925, + "loss": 1.028, + "step": 3095 + }, + { + "epoch": 0.37, + "grad_norm": 0.16583746671676636, + "learning_rate": 0.00029829484184070674, + "loss": 1.0083, + "step": 3100 + }, + { + "epoch": 0.37, + "grad_norm": 0.1849256455898285, + "learning_rate": 0.000298279334944197, + "loss": 1.0822, + "step": 3105 + }, + { + "epoch": 0.37, + "grad_norm": 0.16508488357067108, + "learning_rate": 0.000298263758262508, + "loss": 0.9659, + "step": 3110 + }, + { + "epoch": 0.38, + "grad_norm": 0.1658620983362198, + "learning_rate": 0.0002982481118029707, + "loss": 1.0032, + "step": 3115 + }, + { + "epoch": 0.38, + "grad_norm": 0.16383464634418488, + "learning_rate": 0.0002982323955729488, + "loss": 1.012, + "step": 3120 + }, + { + "epoch": 0.38, + "grad_norm": 0.16278916597366333, + "learning_rate": 0.0002982166095798389, + "loss": 1.0042, + "step": 3125 + }, + { + "epoch": 0.38, + "grad_norm": 0.16901999711990356, + "learning_rate": 0.0002982007538310704, + "loss": 0.9857, + "step": 3130 + }, + { + "epoch": 0.38, + "grad_norm": 0.15740418434143066, + "learning_rate": 0.0002981848283341056, + "loss": 0.9621, + "step": 3135 + }, + { + "epoch": 0.38, + "grad_norm": 0.1794396936893463, + "learning_rate": 0.00029816883309643946, + "loss": 0.9642, + "step": 3140 + }, + { + "epoch": 0.38, + "grad_norm": 0.16382227838039398, + "learning_rate": 0.0002981527681255999, + "loss": 1.0157, + "step": 3145 + }, + { + "epoch": 0.38, + "grad_norm": 0.17343173921108246, + "learning_rate": 0.00029813663342914774, + "loss": 1.0122, + "step": 3150 + }, + { + "epoch": 0.38, + "grad_norm": 0.16225513815879822, + "learning_rate": 0.0002981204290146764, + "loss": 0.9324, + "step": 3155 + }, + { + "epoch": 0.38, + "grad_norm": 0.1837487518787384, + "learning_rate": 0.00029810415488981223, + "loss": 0.9521, + "step": 3160 + }, + { + "epoch": 0.38, + "grad_norm": 0.16996806859970093, + "learning_rate": 0.0002980878110622144, + "loss": 0.9372, + "step": 3165 + }, + { + "epoch": 0.38, + "grad_norm": 0.1729620099067688, + "learning_rate": 0.0002980713975395748, + "loss": 1.0272, + "step": 3170 + }, + { + "epoch": 0.38, + "grad_norm": 0.15893127024173737, + "learning_rate": 0.0002980549143296182, + "loss": 1.0605, + "step": 3175 + }, + { + "epoch": 0.38, + "grad_norm": 0.19714705646038055, + "learning_rate": 0.0002980383614401023, + "loss": 1.0335, + "step": 3180 + }, + { + "epoch": 0.38, + "grad_norm": 0.17942222952842712, + "learning_rate": 0.0002980217388788172, + "loss": 1.0134, + "step": 3185 + }, + { + "epoch": 0.38, + "grad_norm": 0.1684209406375885, + "learning_rate": 0.0002980050466535861, + "loss": 1.0344, + "step": 3190 + }, + { + "epoch": 0.38, + "grad_norm": 0.16573862731456757, + "learning_rate": 0.000297988284772265, + "loss": 1.0624, + "step": 3195 + }, + { + "epoch": 0.39, + "grad_norm": 0.16821695864200592, + "learning_rate": 0.00029797145324274256, + "loss": 0.9756, + "step": 3200 + }, + { + "epoch": 0.39, + "grad_norm": 0.15828034281730652, + "learning_rate": 0.0002979545520729402, + "loss": 0.9553, + "step": 3205 + }, + { + "epoch": 0.39, + "grad_norm": 0.17686916887760162, + "learning_rate": 0.00029793758127081226, + "loss": 0.9576, + "step": 3210 + }, + { + "epoch": 0.39, + "grad_norm": 0.16397157311439514, + "learning_rate": 0.00029792054084434573, + "loss": 0.9603, + "step": 3215 + }, + { + "epoch": 0.39, + "grad_norm": 0.16657304763793945, + "learning_rate": 0.0002979034308015603, + "loss": 1.0867, + "step": 3220 + }, + { + "epoch": 0.39, + "grad_norm": 0.16404330730438232, + "learning_rate": 0.00029788625115050873, + "loss": 1.0548, + "step": 3225 + }, + { + "epoch": 0.39, + "grad_norm": 0.16127341985702515, + "learning_rate": 0.0002978690018992761, + "loss": 1.0754, + "step": 3230 + }, + { + "epoch": 0.39, + "grad_norm": 0.1616549789905548, + "learning_rate": 0.0002978516830559807, + "loss": 0.9912, + "step": 3235 + }, + { + "epoch": 0.39, + "grad_norm": 0.15977180004119873, + "learning_rate": 0.0002978342946287732, + "loss": 0.9411, + "step": 3240 + }, + { + "epoch": 0.39, + "grad_norm": 0.18043774366378784, + "learning_rate": 0.00029781683662583725, + "loss": 1.0057, + "step": 3245 + }, + { + "epoch": 0.39, + "grad_norm": 0.16147902607917786, + "learning_rate": 0.00029779930905538915, + "loss": 0.9489, + "step": 3250 + }, + { + "epoch": 0.39, + "grad_norm": 0.17931294441223145, + "learning_rate": 0.0002977817119256779, + "loss": 1.0077, + "step": 3255 + }, + { + "epoch": 0.39, + "grad_norm": 0.16193710267543793, + "learning_rate": 0.00029776404524498533, + "loss": 1.0576, + "step": 3260 + }, + { + "epoch": 0.39, + "grad_norm": 0.17987242341041565, + "learning_rate": 0.00029774630902162604, + "loss": 0.9772, + "step": 3265 + }, + { + "epoch": 0.39, + "grad_norm": 0.15659894049167633, + "learning_rate": 0.0002977285032639472, + "loss": 0.9922, + "step": 3270 + }, + { + "epoch": 0.39, + "grad_norm": 0.17230859398841858, + "learning_rate": 0.0002977106279803288, + "loss": 1.0216, + "step": 3275 + }, + { + "epoch": 0.4, + "grad_norm": 0.16389912366867065, + "learning_rate": 0.00029769268317918354, + "loss": 0.9859, + "step": 3280 + }, + { + "epoch": 0.4, + "grad_norm": 0.1696440726518631, + "learning_rate": 0.00029767466886895685, + "loss": 1.0755, + "step": 3285 + }, + { + "epoch": 0.4, + "grad_norm": 0.18958187103271484, + "learning_rate": 0.0002976565850581269, + "loss": 0.9506, + "step": 3290 + }, + { + "epoch": 0.4, + "grad_norm": 0.1598709523677826, + "learning_rate": 0.0002976384317552044, + "loss": 0.9734, + "step": 3295 + }, + { + "epoch": 0.4, + "grad_norm": 0.17023463547229767, + "learning_rate": 0.0002976202089687331, + "loss": 0.9537, + "step": 3300 + }, + { + "epoch": 0.4, + "grad_norm": 0.19109511375427246, + "learning_rate": 0.0002976019167072891, + "loss": 0.9778, + "step": 3305 + }, + { + "epoch": 0.4, + "grad_norm": 0.1838565170764923, + "learning_rate": 0.00029758355497948145, + "loss": 1.0258, + "step": 3310 + }, + { + "epoch": 0.4, + "grad_norm": 0.16939514875411987, + "learning_rate": 0.0002975651237939517, + "loss": 1.0244, + "step": 3315 + }, + { + "epoch": 0.4, + "grad_norm": 0.1645985096693039, + "learning_rate": 0.0002975466231593742, + "loss": 0.9729, + "step": 3320 + }, + { + "epoch": 0.4, + "grad_norm": 0.17347556352615356, + "learning_rate": 0.0002975280530844559, + "loss": 1.0492, + "step": 3325 + }, + { + "epoch": 0.4, + "grad_norm": 0.21827654540538788, + "learning_rate": 0.00029750941357793666, + "loss": 0.9775, + "step": 3330 + }, + { + "epoch": 0.4, + "grad_norm": 0.1544768363237381, + "learning_rate": 0.00029749070464858875, + "loss": 1.0207, + "step": 3335 + }, + { + "epoch": 0.4, + "grad_norm": 0.15815982222557068, + "learning_rate": 0.00029747192630521715, + "loss": 1.0502, + "step": 3340 + }, + { + "epoch": 0.4, + "grad_norm": 0.17267605662345886, + "learning_rate": 0.0002974530785566597, + "loss": 0.9846, + "step": 3345 + }, + { + "epoch": 0.4, + "grad_norm": 0.1674361526966095, + "learning_rate": 0.00029743416141178667, + "loss": 0.9606, + "step": 3350 + }, + { + "epoch": 0.4, + "grad_norm": 0.16751405596733093, + "learning_rate": 0.00029741517487950116, + "loss": 1.0469, + "step": 3355 + }, + { + "epoch": 0.4, + "grad_norm": 0.20405547320842743, + "learning_rate": 0.00029739611896873884, + "loss": 0.9714, + "step": 3360 + }, + { + "epoch": 0.41, + "grad_norm": 0.15737318992614746, + "learning_rate": 0.00029737699368846806, + "loss": 0.982, + "step": 3365 + }, + { + "epoch": 0.41, + "grad_norm": 0.16428694128990173, + "learning_rate": 0.0002973577990476899, + "loss": 1.0685, + "step": 3370 + }, + { + "epoch": 0.41, + "grad_norm": 0.16110824048519135, + "learning_rate": 0.0002973385350554378, + "loss": 0.947, + "step": 3375 + }, + { + "epoch": 0.41, + "grad_norm": 0.1924237459897995, + "learning_rate": 0.00029731920172077815, + "loss": 0.9267, + "step": 3380 + }, + { + "epoch": 0.41, + "grad_norm": 0.17750895023345947, + "learning_rate": 0.00029729979905280987, + "loss": 1.0132, + "step": 3385 + }, + { + "epoch": 0.41, + "grad_norm": 0.16638635098934174, + "learning_rate": 0.0002972803270606645, + "loss": 0.9343, + "step": 3390 + }, + { + "epoch": 0.41, + "grad_norm": 0.16308121383190155, + "learning_rate": 0.00029726078575350613, + "loss": 1.0279, + "step": 3395 + }, + { + "epoch": 0.41, + "grad_norm": 0.16061437129974365, + "learning_rate": 0.00029724117514053164, + "loss": 0.9822, + "step": 3400 + }, + { + "epoch": 0.41, + "grad_norm": 0.1641559600830078, + "learning_rate": 0.00029722149523097046, + "loss": 1.014, + "step": 3405 + }, + { + "epoch": 0.41, + "grad_norm": 0.16438022255897522, + "learning_rate": 0.0002972017460340845, + "loss": 1.0781, + "step": 3410 + }, + { + "epoch": 0.41, + "grad_norm": 0.16575871407985687, + "learning_rate": 0.0002971819275591684, + "loss": 1.0313, + "step": 3415 + }, + { + "epoch": 0.41, + "grad_norm": 0.1603136956691742, + "learning_rate": 0.00029716203981554947, + "loss": 0.9883, + "step": 3420 + }, + { + "epoch": 0.41, + "grad_norm": 0.1538233458995819, + "learning_rate": 0.0002971420828125875, + "loss": 1.0134, + "step": 3425 + }, + { + "epoch": 0.41, + "grad_norm": 0.15688014030456543, + "learning_rate": 0.0002971220565596749, + "loss": 0.9029, + "step": 3430 + }, + { + "epoch": 0.41, + "grad_norm": 0.17302924394607544, + "learning_rate": 0.0002971019610662367, + "loss": 0.9459, + "step": 3435 + }, + { + "epoch": 0.41, + "grad_norm": 0.17244279384613037, + "learning_rate": 0.00029708179634173055, + "loss": 0.9259, + "step": 3440 + }, + { + "epoch": 0.42, + "grad_norm": 0.17056743800640106, + "learning_rate": 0.00029706156239564665, + "loss": 0.8916, + "step": 3445 + }, + { + "epoch": 0.42, + "grad_norm": 0.18607419729232788, + "learning_rate": 0.00029704125923750766, + "loss": 0.9067, + "step": 3450 + }, + { + "epoch": 0.42, + "grad_norm": 0.18427631258964539, + "learning_rate": 0.000297020886876869, + "loss": 0.9818, + "step": 3455 + }, + { + "epoch": 0.42, + "grad_norm": 0.17508511245250702, + "learning_rate": 0.00029700044532331854, + "loss": 0.9721, + "step": 3460 + }, + { + "epoch": 0.42, + "grad_norm": 0.1710289865732193, + "learning_rate": 0.0002969799345864768, + "loss": 1.0841, + "step": 3465 + }, + { + "epoch": 0.42, + "grad_norm": 0.16923823952674866, + "learning_rate": 0.00029695935467599676, + "loss": 1.036, + "step": 3470 + }, + { + "epoch": 0.42, + "grad_norm": 0.1833118349313736, + "learning_rate": 0.00029693870560156406, + "loss": 1.0369, + "step": 3475 + }, + { + "epoch": 0.42, + "grad_norm": 0.158515065908432, + "learning_rate": 0.0002969179873728968, + "loss": 0.9474, + "step": 3480 + }, + { + "epoch": 0.42, + "grad_norm": 0.18448443710803986, + "learning_rate": 0.0002968971999997458, + "loss": 1.036, + "step": 3485 + }, + { + "epoch": 0.42, + "grad_norm": 0.17607718706130981, + "learning_rate": 0.000296876343491894, + "loss": 1.1111, + "step": 3490 + }, + { + "epoch": 0.42, + "grad_norm": 0.16396880149841309, + "learning_rate": 0.0002968554178591575, + "loss": 1.0609, + "step": 3495 + }, + { + "epoch": 0.42, + "grad_norm": 0.1581658273935318, + "learning_rate": 0.00029683442311138436, + "loss": 1.0826, + "step": 3500 + }, + { + "epoch": 0.42, + "grad_norm": 0.16231244802474976, + "learning_rate": 0.00029681335925845544, + "loss": 0.9405, + "step": 3505 + }, + { + "epoch": 0.42, + "grad_norm": 0.17052994668483734, + "learning_rate": 0.0002967922263102842, + "loss": 1.057, + "step": 3510 + }, + { + "epoch": 0.42, + "grad_norm": 0.1752976030111313, + "learning_rate": 0.00029677102427681643, + "loss": 0.9852, + "step": 3515 + }, + { + "epoch": 0.42, + "grad_norm": 0.1679937094449997, + "learning_rate": 0.00029674975316803056, + "loss": 1.053, + "step": 3520 + }, + { + "epoch": 0.42, + "grad_norm": 0.17589174211025238, + "learning_rate": 0.00029672841299393734, + "loss": 1.0338, + "step": 3525 + }, + { + "epoch": 0.43, + "grad_norm": 0.17791646718978882, + "learning_rate": 0.00029670700376458034, + "loss": 0.9818, + "step": 3530 + }, + { + "epoch": 0.43, + "grad_norm": 0.1609509289264679, + "learning_rate": 0.0002966855254900353, + "loss": 0.9841, + "step": 3535 + }, + { + "epoch": 0.43, + "grad_norm": 0.1620442420244217, + "learning_rate": 0.0002966639781804108, + "loss": 1.0825, + "step": 3540 + }, + { + "epoch": 0.43, + "grad_norm": 0.18047846853733063, + "learning_rate": 0.00029664236184584757, + "loss": 1.0661, + "step": 3545 + }, + { + "epoch": 0.43, + "grad_norm": 0.16510747373104095, + "learning_rate": 0.00029662067649651895, + "loss": 0.8942, + "step": 3550 + }, + { + "epoch": 0.43, + "grad_norm": 0.18629179894924164, + "learning_rate": 0.00029659892214263094, + "loss": 1.0014, + "step": 3555 + }, + { + "epoch": 0.43, + "grad_norm": 0.18440277874469757, + "learning_rate": 0.0002965770987944217, + "loss": 1.0379, + "step": 3560 + }, + { + "epoch": 0.43, + "grad_norm": 0.1701250523328781, + "learning_rate": 0.00029655520646216214, + "loss": 0.9512, + "step": 3565 + }, + { + "epoch": 0.43, + "grad_norm": 0.17910198867321014, + "learning_rate": 0.00029653324515615544, + "loss": 1.0178, + "step": 3570 + }, + { + "epoch": 0.43, + "grad_norm": 0.15879999101161957, + "learning_rate": 0.00029651121488673737, + "loss": 0.9555, + "step": 3575 + }, + { + "epoch": 0.43, + "grad_norm": 0.17738574743270874, + "learning_rate": 0.00029648911566427605, + "loss": 0.9916, + "step": 3580 + }, + { + "epoch": 0.43, + "grad_norm": 0.1952105015516281, + "learning_rate": 0.00029646694749917217, + "loss": 1.0, + "step": 3585 + }, + { + "epoch": 0.43, + "grad_norm": 0.1668039858341217, + "learning_rate": 0.0002964447104018588, + "loss": 0.9486, + "step": 3590 + }, + { + "epoch": 0.43, + "grad_norm": 0.15947400033473969, + "learning_rate": 0.0002964224043828014, + "loss": 0.9834, + "step": 3595 + }, + { + "epoch": 0.43, + "grad_norm": 0.18073715269565582, + "learning_rate": 0.000296400029452498, + "loss": 0.9501, + "step": 3600 + }, + { + "epoch": 0.43, + "grad_norm": 0.18462218344211578, + "learning_rate": 0.00029637758562147895, + "loss": 1.0168, + "step": 3605 + }, + { + "epoch": 0.43, + "grad_norm": 0.1783631145954132, + "learning_rate": 0.000296355072900307, + "loss": 0.9695, + "step": 3610 + }, + { + "epoch": 0.44, + "grad_norm": 0.1757393628358841, + "learning_rate": 0.00029633249129957747, + "loss": 0.9556, + "step": 3615 + }, + { + "epoch": 0.44, + "grad_norm": 0.17686530947685242, + "learning_rate": 0.000296309840829918, + "loss": 0.9561, + "step": 3620 + }, + { + "epoch": 0.44, + "grad_norm": 0.16874971985816956, + "learning_rate": 0.00029628712150198865, + "loss": 0.9509, + "step": 3625 + }, + { + "epoch": 0.44, + "grad_norm": 0.17538826167583466, + "learning_rate": 0.00029626433332648183, + "loss": 1.0664, + "step": 3630 + }, + { + "epoch": 0.44, + "grad_norm": 0.16366077959537506, + "learning_rate": 0.00029624147631412246, + "loss": 0.9833, + "step": 3635 + }, + { + "epoch": 0.44, + "grad_norm": 0.17649723589420319, + "learning_rate": 0.0002962185504756678, + "loss": 0.9077, + "step": 3640 + }, + { + "epoch": 0.44, + "grad_norm": 0.1709517389535904, + "learning_rate": 0.0002961955558219076, + "loss": 0.9386, + "step": 3645 + }, + { + "epoch": 0.44, + "grad_norm": 0.1735381931066513, + "learning_rate": 0.0002961724923636637, + "loss": 1.0339, + "step": 3650 + }, + { + "epoch": 0.44, + "grad_norm": 0.17340226471424103, + "learning_rate": 0.00029614936011179076, + "loss": 0.9962, + "step": 3655 + }, + { + "epoch": 0.44, + "grad_norm": 0.1836249977350235, + "learning_rate": 0.0002961261590771755, + "loss": 0.8727, + "step": 3660 + }, + { + "epoch": 0.44, + "grad_norm": 0.16805151104927063, + "learning_rate": 0.000296102889270737, + "loss": 0.986, + "step": 3665 + }, + { + "epoch": 0.44, + "grad_norm": 0.1725672036409378, + "learning_rate": 0.00029607955070342685, + "loss": 1.0204, + "step": 3670 + }, + { + "epoch": 0.44, + "grad_norm": 0.17753121256828308, + "learning_rate": 0.00029605614338622905, + "loss": 0.9744, + "step": 3675 + }, + { + "epoch": 0.44, + "grad_norm": 0.18001006543636322, + "learning_rate": 0.00029603266733015983, + "loss": 0.9984, + "step": 3680 + }, + { + "epoch": 0.44, + "grad_norm": 0.18695253133773804, + "learning_rate": 0.0002960091225462677, + "loss": 1.0663, + "step": 3685 + }, + { + "epoch": 0.44, + "grad_norm": 0.18563799560070038, + "learning_rate": 0.00029598550904563374, + "loss": 0.9452, + "step": 3690 + }, + { + "epoch": 0.45, + "grad_norm": 0.18646039068698883, + "learning_rate": 0.0002959618268393712, + "loss": 1.0318, + "step": 3695 + }, + { + "epoch": 0.45, + "grad_norm": 0.16049382090568542, + "learning_rate": 0.00029593807593862565, + "loss": 1.0137, + "step": 3700 + }, + { + "epoch": 0.45, + "grad_norm": 0.17187224328517914, + "learning_rate": 0.00029591425635457514, + "loss": 0.9957, + "step": 3705 + }, + { + "epoch": 0.45, + "grad_norm": 0.18150264024734497, + "learning_rate": 0.00029589036809842987, + "loss": 0.9998, + "step": 3710 + }, + { + "epoch": 0.45, + "grad_norm": 0.18180853128433228, + "learning_rate": 0.0002958664111814326, + "loss": 0.9865, + "step": 3715 + }, + { + "epoch": 0.45, + "grad_norm": 0.17173470556735992, + "learning_rate": 0.0002958423856148581, + "loss": 0.9939, + "step": 3720 + }, + { + "epoch": 0.45, + "grad_norm": 0.18035240471363068, + "learning_rate": 0.0002958182914100137, + "loss": 0.9806, + "step": 3725 + }, + { + "epoch": 0.45, + "grad_norm": 0.18080519139766693, + "learning_rate": 0.00029579412857823887, + "loss": 0.9165, + "step": 3730 + }, + { + "epoch": 0.45, + "grad_norm": 0.18667446076869965, + "learning_rate": 0.0002957698971309054, + "loss": 0.9465, + "step": 3735 + }, + { + "epoch": 0.45, + "grad_norm": 0.17435042560100555, + "learning_rate": 0.0002957455970794175, + "loss": 0.9562, + "step": 3740 + }, + { + "epoch": 0.45, + "grad_norm": 0.19074921309947968, + "learning_rate": 0.0002957212284352116, + "loss": 0.9912, + "step": 3745 + }, + { + "epoch": 0.45, + "grad_norm": 0.1836334615945816, + "learning_rate": 0.0002956967912097563, + "loss": 0.9995, + "step": 3750 + }, + { + "epoch": 0.45, + "grad_norm": 0.19965748488903046, + "learning_rate": 0.00029567228541455264, + "loss": 0.9773, + "step": 3755 + }, + { + "epoch": 0.45, + "grad_norm": 0.19002239406108856, + "learning_rate": 0.0002956477110611338, + "loss": 0.978, + "step": 3760 + }, + { + "epoch": 0.45, + "grad_norm": 0.1769300252199173, + "learning_rate": 0.00029562306816106535, + "loss": 0.9428, + "step": 3765 + }, + { + "epoch": 0.45, + "grad_norm": 0.1742662787437439, + "learning_rate": 0.000295598356725945, + "loss": 0.9801, + "step": 3770 + }, + { + "epoch": 0.45, + "grad_norm": 0.17609982192516327, + "learning_rate": 0.00029557357676740286, + "loss": 0.9655, + "step": 3775 + }, + { + "epoch": 0.46, + "grad_norm": 0.152922123670578, + "learning_rate": 0.00029554872829710114, + "loss": 1.0099, + "step": 3780 + }, + { + "epoch": 0.46, + "grad_norm": 0.18085353076457977, + "learning_rate": 0.0002955238113267344, + "loss": 1.0337, + "step": 3785 + }, + { + "epoch": 0.46, + "grad_norm": 0.17463679611682892, + "learning_rate": 0.00029549882586802923, + "loss": 0.9719, + "step": 3790 + }, + { + "epoch": 0.46, + "grad_norm": 0.1799176186323166, + "learning_rate": 0.0002954737719327448, + "loss": 0.9933, + "step": 3795 + }, + { + "epoch": 0.46, + "grad_norm": 0.18026649951934814, + "learning_rate": 0.00029544864953267224, + "loss": 1.0829, + "step": 3800 + }, + { + "epoch": 0.46, + "grad_norm": 0.18664585053920746, + "learning_rate": 0.000295423458679635, + "loss": 0.9727, + "step": 3805 + }, + { + "epoch": 0.46, + "grad_norm": 0.1912483125925064, + "learning_rate": 0.0002953981993854888, + "loss": 0.9602, + "step": 3810 + }, + { + "epoch": 0.46, + "grad_norm": 0.17371563613414764, + "learning_rate": 0.00029537287166212146, + "loss": 0.9695, + "step": 3815 + }, + { + "epoch": 0.46, + "grad_norm": 0.17529870569705963, + "learning_rate": 0.00029534747552145295, + "loss": 1.0112, + "step": 3820 + }, + { + "epoch": 0.46, + "grad_norm": 0.20106241106987, + "learning_rate": 0.00029532201097543566, + "loss": 1.045, + "step": 3825 + }, + { + "epoch": 0.46, + "grad_norm": 0.16965335607528687, + "learning_rate": 0.00029529647803605406, + "loss": 0.9602, + "step": 3830 + }, + { + "epoch": 0.46, + "grad_norm": 0.16712799668312073, + "learning_rate": 0.00029527087671532467, + "loss": 0.8681, + "step": 3835 + }, + { + "epoch": 0.46, + "grad_norm": 0.17524246871471405, + "learning_rate": 0.00029524520702529645, + "loss": 1.0298, + "step": 3840 + }, + { + "epoch": 0.46, + "grad_norm": 0.16729198396205902, + "learning_rate": 0.00029521946897805034, + "loss": 1.0393, + "step": 3845 + }, + { + "epoch": 0.46, + "grad_norm": 0.1771865338087082, + "learning_rate": 0.00029519366258569954, + "loss": 0.9163, + "step": 3850 + }, + { + "epoch": 0.46, + "grad_norm": 0.19516459107398987, + "learning_rate": 0.0002951677878603894, + "loss": 1.0929, + "step": 3855 + }, + { + "epoch": 0.47, + "grad_norm": 0.16571563482284546, + "learning_rate": 0.0002951418448142974, + "loss": 0.9895, + "step": 3860 + }, + { + "epoch": 0.47, + "grad_norm": 0.17341572046279907, + "learning_rate": 0.00029511583345963327, + "loss": 0.951, + "step": 3865 + }, + { + "epoch": 0.47, + "grad_norm": 0.16998711228370667, + "learning_rate": 0.00029508975380863867, + "loss": 0.9239, + "step": 3870 + }, + { + "epoch": 0.47, + "grad_norm": 0.17954027652740479, + "learning_rate": 0.0002950636058735877, + "loss": 0.9932, + "step": 3875 + }, + { + "epoch": 0.47, + "grad_norm": 0.17167799174785614, + "learning_rate": 0.0002950373896667864, + "loss": 1.0185, + "step": 3880 + }, + { + "epoch": 0.47, + "grad_norm": 0.16167834401130676, + "learning_rate": 0.000295011105200573, + "loss": 1.0683, + "step": 3885 + }, + { + "epoch": 0.47, + "grad_norm": 0.1931363195180893, + "learning_rate": 0.0002949847524873178, + "loss": 0.8858, + "step": 3890 + }, + { + "epoch": 0.47, + "grad_norm": 0.1638021469116211, + "learning_rate": 0.0002949583315394233, + "loss": 0.9769, + "step": 3895 + }, + { + "epoch": 0.47, + "grad_norm": 0.17214910686016083, + "learning_rate": 0.00029493184236932405, + "loss": 1.0506, + "step": 3900 + }, + { + "epoch": 0.47, + "grad_norm": 0.18750952184200287, + "learning_rate": 0.0002949052849894867, + "loss": 0.9815, + "step": 3905 + }, + { + "epoch": 0.47, + "grad_norm": 0.16336201131343842, + "learning_rate": 0.00029487865941241014, + "loss": 0.9287, + "step": 3910 + }, + { + "epoch": 0.47, + "grad_norm": 0.17103566229343414, + "learning_rate": 0.00029485196565062516, + "loss": 0.9731, + "step": 3915 + }, + { + "epoch": 0.47, + "grad_norm": 0.16729870438575745, + "learning_rate": 0.0002948252037166948, + "loss": 0.9477, + "step": 3920 + }, + { + "epoch": 0.47, + "grad_norm": 0.1882411539554596, + "learning_rate": 0.00029479837362321405, + "loss": 0.9904, + "step": 3925 + }, + { + "epoch": 0.47, + "grad_norm": 0.18706083297729492, + "learning_rate": 0.00029477147538281004, + "loss": 0.997, + "step": 3930 + }, + { + "epoch": 0.47, + "grad_norm": 0.18847805261611938, + "learning_rate": 0.000294744509008142, + "loss": 0.9378, + "step": 3935 + }, + { + "epoch": 0.47, + "grad_norm": 0.1840084046125412, + "learning_rate": 0.00029471747451190124, + "loss": 0.9559, + "step": 3940 + }, + { + "epoch": 0.48, + "grad_norm": 0.1757187843322754, + "learning_rate": 0.000294690371906811, + "loss": 1.011, + "step": 3945 + }, + { + "epoch": 0.48, + "grad_norm": 0.16202908754348755, + "learning_rate": 0.00029466320120562683, + "loss": 0.9077, + "step": 3950 + }, + { + "epoch": 0.48, + "grad_norm": 0.18490107357501984, + "learning_rate": 0.00029463596242113596, + "loss": 1.0702, + "step": 3955 + }, + { + "epoch": 0.48, + "grad_norm": 0.1752914935350418, + "learning_rate": 0.000294608655566158, + "loss": 0.9612, + "step": 3960 + }, + { + "epoch": 0.48, + "grad_norm": 0.17405173182487488, + "learning_rate": 0.00029458128065354444, + "loss": 1.0131, + "step": 3965 + }, + { + "epoch": 0.48, + "grad_norm": 0.18187612295150757, + "learning_rate": 0.0002945538376961788, + "loss": 0.9848, + "step": 3970 + }, + { + "epoch": 0.48, + "grad_norm": 0.19082453846931458, + "learning_rate": 0.0002945263267069766, + "loss": 0.9559, + "step": 3975 + }, + { + "epoch": 0.48, + "grad_norm": 0.17413316667079926, + "learning_rate": 0.0002944987476988855, + "loss": 1.0772, + "step": 3980 + }, + { + "epoch": 0.48, + "grad_norm": 0.15954262018203735, + "learning_rate": 0.00029447110068488516, + "loss": 0.9037, + "step": 3985 + }, + { + "epoch": 0.48, + "grad_norm": 0.17299918830394745, + "learning_rate": 0.000294443385677987, + "loss": 0.9624, + "step": 3990 + }, + { + "epoch": 0.48, + "grad_norm": 0.179169163107872, + "learning_rate": 0.00029441560269123483, + "loss": 0.983, + "step": 3995 + }, + { + "epoch": 0.48, + "grad_norm": 0.16181403398513794, + "learning_rate": 0.00029438775173770405, + "loss": 0.9705, + "step": 4000 + }, + { + "epoch": 0.48, + "grad_norm": 0.1745695024728775, + "learning_rate": 0.0002943598328305024, + "loss": 0.9664, + "step": 4005 + }, + { + "epoch": 0.48, + "grad_norm": 0.1865403801202774, + "learning_rate": 0.0002943318459827693, + "loss": 1.0259, + "step": 4010 + }, + { + "epoch": 0.48, + "grad_norm": 0.21468792855739594, + "learning_rate": 0.0002943037912076764, + "loss": 0.9311, + "step": 4015 + }, + { + "epoch": 0.48, + "grad_norm": 0.18576794862747192, + "learning_rate": 0.0002942756685184272, + "loss": 1.07, + "step": 4020 + }, + { + "epoch": 0.48, + "grad_norm": 0.18512584269046783, + "learning_rate": 0.0002942474779282571, + "loss": 0.9204, + "step": 4025 + }, + { + "epoch": 0.49, + "grad_norm": 0.18075776100158691, + "learning_rate": 0.00029421921945043365, + "loss": 0.9853, + "step": 4030 + }, + { + "epoch": 0.49, + "grad_norm": 0.17702947556972504, + "learning_rate": 0.0002941908930982561, + "loss": 1.0537, + "step": 4035 + }, + { + "epoch": 0.49, + "grad_norm": 0.17525263130664825, + "learning_rate": 0.0002941624988850558, + "loss": 0.9713, + "step": 4040 + }, + { + "epoch": 0.49, + "grad_norm": 0.16947941482067108, + "learning_rate": 0.00029413403682419613, + "loss": 0.9988, + "step": 4045 + }, + { + "epoch": 0.49, + "grad_norm": 0.17657147347927094, + "learning_rate": 0.0002941055069290721, + "loss": 0.9946, + "step": 4050 + }, + { + "epoch": 0.49, + "grad_norm": 0.17668572068214417, + "learning_rate": 0.00029407690921311094, + "loss": 0.9288, + "step": 4055 + }, + { + "epoch": 0.49, + "grad_norm": 0.18631796538829803, + "learning_rate": 0.0002940482436897717, + "loss": 0.9218, + "step": 4060 + }, + { + "epoch": 0.49, + "grad_norm": 0.1784496307373047, + "learning_rate": 0.00029401951037254524, + "loss": 1.059, + "step": 4065 + }, + { + "epoch": 0.49, + "grad_norm": 0.19037973880767822, + "learning_rate": 0.00029399070927495447, + "loss": 0.9622, + "step": 4070 + }, + { + "epoch": 0.49, + "grad_norm": 0.17048045992851257, + "learning_rate": 0.0002939618404105541, + "loss": 0.8903, + "step": 4075 + }, + { + "epoch": 0.49, + "grad_norm": 0.1878838837146759, + "learning_rate": 0.00029393290379293085, + "loss": 1.0106, + "step": 4080 + }, + { + "epoch": 0.49, + "grad_norm": 0.18040554225444794, + "learning_rate": 0.0002939038994357032, + "loss": 0.9517, + "step": 4085 + }, + { + "epoch": 0.49, + "grad_norm": 0.19907140731811523, + "learning_rate": 0.0002938748273525216, + "loss": 0.9628, + "step": 4090 + }, + { + "epoch": 0.49, + "grad_norm": 0.18067404627799988, + "learning_rate": 0.00029384568755706824, + "loss": 0.9726, + "step": 4095 + }, + { + "epoch": 0.49, + "grad_norm": 0.16621145606040955, + "learning_rate": 0.0002938164800630574, + "loss": 1.0339, + "step": 4100 + }, + { + "epoch": 0.49, + "grad_norm": 0.17213256657123566, + "learning_rate": 0.00029378720488423506, + "loss": 0.933, + "step": 4105 + }, + { + "epoch": 0.5, + "grad_norm": 0.1835915744304657, + "learning_rate": 0.00029375786203437906, + "loss": 0.9645, + "step": 4110 + }, + { + "epoch": 0.5, + "grad_norm": 0.19219228625297546, + "learning_rate": 0.00029372845152729916, + "loss": 1.0066, + "step": 4115 + }, + { + "epoch": 0.5, + "grad_norm": 0.1753523349761963, + "learning_rate": 0.0002936989733768368, + "loss": 0.98, + "step": 4120 + }, + { + "epoch": 0.5, + "grad_norm": 0.16420799493789673, + "learning_rate": 0.00029366942759686556, + "loss": 0.9817, + "step": 4125 + }, + { + "epoch": 0.5, + "grad_norm": 0.17462868988513947, + "learning_rate": 0.0002936398142012906, + "loss": 1.024, + "step": 4130 + }, + { + "epoch": 0.5, + "grad_norm": 0.1853259801864624, + "learning_rate": 0.0002936101332040489, + "loss": 0.9538, + "step": 4135 + }, + { + "epoch": 0.5, + "grad_norm": 0.17520953714847565, + "learning_rate": 0.00029358038461910934, + "loss": 0.9453, + "step": 4140 + }, + { + "epoch": 0.5, + "grad_norm": 0.18350806832313538, + "learning_rate": 0.00029355056846047266, + "loss": 0.9445, + "step": 4145 + }, + { + "epoch": 0.5, + "grad_norm": 0.17712032794952393, + "learning_rate": 0.0002935206847421713, + "loss": 0.9821, + "step": 4150 + }, + { + "epoch": 0.5, + "grad_norm": 0.17345529794692993, + "learning_rate": 0.00029349073347826953, + "loss": 1.0079, + "step": 4155 + }, + { + "epoch": 0.5, + "grad_norm": 0.18202108144760132, + "learning_rate": 0.0002934607146828634, + "loss": 1.0103, + "step": 4160 + }, + { + "epoch": 0.5, + "grad_norm": 0.17905019223690033, + "learning_rate": 0.00029343062837008076, + "loss": 0.9744, + "step": 4165 + }, + { + "epoch": 0.5, + "grad_norm": 0.16872918605804443, + "learning_rate": 0.0002934004745540812, + "loss": 1.0276, + "step": 4170 + }, + { + "epoch": 0.5, + "grad_norm": 0.1752139776945114, + "learning_rate": 0.00029337025324905616, + "loss": 0.9822, + "step": 4175 + }, + { + "epoch": 0.5, + "grad_norm": 0.18080003559589386, + "learning_rate": 0.0002933399644692287, + "loss": 1.0102, + "step": 4180 + }, + { + "epoch": 0.5, + "grad_norm": 0.22213223576545715, + "learning_rate": 0.00029330960822885385, + "loss": 0.9913, + "step": 4185 + }, + { + "epoch": 0.5, + "grad_norm": 0.18527017533779144, + "learning_rate": 0.0002932791845422182, + "loss": 0.9575, + "step": 4190 + }, + { + "epoch": 0.51, + "grad_norm": 0.18203973770141602, + "learning_rate": 0.00029324869342364014, + "loss": 0.9891, + "step": 4195 + }, + { + "epoch": 0.51, + "grad_norm": 0.17969951033592224, + "learning_rate": 0.00029321813488746983, + "loss": 0.994, + "step": 4200 + }, + { + "epoch": 0.51, + "grad_norm": 0.18661588430404663, + "learning_rate": 0.0002931875089480891, + "loss": 1.0039, + "step": 4205 + }, + { + "epoch": 0.51, + "grad_norm": 0.18751640617847443, + "learning_rate": 0.0002931568156199115, + "loss": 0.9442, + "step": 4210 + }, + { + "epoch": 0.51, + "grad_norm": 0.16617530584335327, + "learning_rate": 0.0002931260549173825, + "loss": 0.885, + "step": 4215 + }, + { + "epoch": 0.51, + "grad_norm": 0.1970120072364807, + "learning_rate": 0.00029309522685497886, + "loss": 1.0264, + "step": 4220 + }, + { + "epoch": 0.51, + "grad_norm": 0.1805301010608673, + "learning_rate": 0.00029306433144720947, + "loss": 1.0025, + "step": 4225 + }, + { + "epoch": 0.51, + "grad_norm": 0.1933399736881256, + "learning_rate": 0.0002930333687086147, + "loss": 0.9908, + "step": 4230 + }, + { + "epoch": 0.51, + "grad_norm": 0.18400885164737701, + "learning_rate": 0.0002930023386537666, + "loss": 0.988, + "step": 4235 + }, + { + "epoch": 0.51, + "grad_norm": 0.18429554998874664, + "learning_rate": 0.000292971241297269, + "loss": 0.9955, + "step": 4240 + }, + { + "epoch": 0.51, + "grad_norm": 0.17350837588310242, + "learning_rate": 0.0002929400766537573, + "loss": 0.9956, + "step": 4245 + }, + { + "epoch": 0.51, + "grad_norm": 0.17750461399555206, + "learning_rate": 0.0002929088447378986, + "loss": 0.9968, + "step": 4250 + }, + { + "epoch": 0.51, + "grad_norm": 0.16923023760318756, + "learning_rate": 0.0002928775455643917, + "loss": 0.9371, + "step": 4255 + }, + { + "epoch": 0.51, + "grad_norm": 0.17826791107654572, + "learning_rate": 0.0002928461791479671, + "loss": 0.999, + "step": 4260 + }, + { + "epoch": 0.51, + "grad_norm": 0.1948651373386383, + "learning_rate": 0.0002928147455033868, + "loss": 0.9618, + "step": 4265 + }, + { + "epoch": 0.51, + "grad_norm": 0.16941259801387787, + "learning_rate": 0.00029278324464544455, + "loss": 0.9693, + "step": 4270 + }, + { + "epoch": 0.52, + "grad_norm": 0.1716768741607666, + "learning_rate": 0.00029275167658896563, + "loss": 1.0408, + "step": 4275 + }, + { + "epoch": 0.52, + "grad_norm": 0.17600005865097046, + "learning_rate": 0.00029272004134880714, + "loss": 1.0234, + "step": 4280 + }, + { + "epoch": 0.52, + "grad_norm": 0.18008339405059814, + "learning_rate": 0.00029268833893985756, + "loss": 0.9356, + "step": 4285 + }, + { + "epoch": 0.52, + "grad_norm": 0.18551653623580933, + "learning_rate": 0.00029265656937703713, + "loss": 0.9183, + "step": 4290 + }, + { + "epoch": 0.52, + "grad_norm": 0.18553385138511658, + "learning_rate": 0.0002926247326752977, + "loss": 0.9736, + "step": 4295 + }, + { + "epoch": 0.52, + "grad_norm": 0.18531882762908936, + "learning_rate": 0.00029259282884962266, + "loss": 1.0085, + "step": 4300 + }, + { + "epoch": 0.52, + "grad_norm": 0.17965206503868103, + "learning_rate": 0.000292560857915027, + "loss": 0.9765, + "step": 4305 + }, + { + "epoch": 0.52, + "grad_norm": 0.1778474599123001, + "learning_rate": 0.0002925288198865573, + "loss": 0.9646, + "step": 4310 + }, + { + "epoch": 0.52, + "grad_norm": 0.19227425754070282, + "learning_rate": 0.0002924967147792917, + "loss": 0.996, + "step": 4315 + }, + { + "epoch": 0.52, + "grad_norm": 0.20739661157131195, + "learning_rate": 0.00029246454260833997, + "loss": 0.9635, + "step": 4320 + }, + { + "epoch": 0.52, + "grad_norm": 0.17460639774799347, + "learning_rate": 0.0002924323033888434, + "loss": 0.9606, + "step": 4325 + }, + { + "epoch": 0.52, + "grad_norm": 0.20777195692062378, + "learning_rate": 0.0002923999971359748, + "loss": 1.0319, + "step": 4330 + }, + { + "epoch": 0.52, + "grad_norm": 0.19790644943714142, + "learning_rate": 0.00029236762386493863, + "loss": 1.0417, + "step": 4335 + }, + { + "epoch": 0.52, + "grad_norm": 0.17253664135932922, + "learning_rate": 0.0002923351835909707, + "loss": 0.9815, + "step": 4340 + }, + { + "epoch": 0.52, + "grad_norm": 0.18922512233257294, + "learning_rate": 0.0002923026763293387, + "loss": 1.0032, + "step": 4345 + }, + { + "epoch": 0.52, + "grad_norm": 0.1988437920808792, + "learning_rate": 0.0002922701020953414, + "loss": 0.9955, + "step": 4350 + }, + { + "epoch": 0.52, + "grad_norm": 0.19901318848133087, + "learning_rate": 0.00029223746090430944, + "loss": 0.9678, + "step": 4355 + }, + { + "epoch": 0.53, + "grad_norm": 0.1846579760313034, + "learning_rate": 0.0002922047527716048, + "loss": 0.8918, + "step": 4360 + }, + { + "epoch": 0.53, + "grad_norm": 0.18826933205127716, + "learning_rate": 0.000292171977712621, + "loss": 0.9736, + "step": 4365 + }, + { + "epoch": 0.53, + "grad_norm": 0.1800205409526825, + "learning_rate": 0.00029213913574278324, + "loss": 0.9471, + "step": 4370 + }, + { + "epoch": 0.53, + "grad_norm": 0.184101402759552, + "learning_rate": 0.00029210622687754777, + "loss": 1.0363, + "step": 4375 + }, + { + "epoch": 0.53, + "grad_norm": 0.18093040585517883, + "learning_rate": 0.0002920732511324028, + "loss": 0.8811, + "step": 4380 + }, + { + "epoch": 0.53, + "grad_norm": 0.16744713485240936, + "learning_rate": 0.0002920402085228677, + "loss": 0.9148, + "step": 4385 + }, + { + "epoch": 0.53, + "grad_norm": 0.3018034100532532, + "learning_rate": 0.0002920070990644935, + "loss": 1.0029, + "step": 4390 + }, + { + "epoch": 0.53, + "grad_norm": 0.2019168734550476, + "learning_rate": 0.0002919739227728625, + "loss": 0.8603, + "step": 4395 + }, + { + "epoch": 0.53, + "grad_norm": 0.20707763731479645, + "learning_rate": 0.0002919406796635887, + "loss": 1.0115, + "step": 4400 + }, + { + "epoch": 0.53, + "grad_norm": 0.18676406145095825, + "learning_rate": 0.00029190736975231736, + "loss": 1.0022, + "step": 4405 + }, + { + "epoch": 0.53, + "grad_norm": 0.1945050209760666, + "learning_rate": 0.00029187399305472515, + "loss": 1.068, + "step": 4410 + }, + { + "epoch": 0.53, + "grad_norm": 0.17335373163223267, + "learning_rate": 0.0002918405495865203, + "loss": 0.9242, + "step": 4415 + }, + { + "epoch": 0.53, + "grad_norm": 0.17637088894844055, + "learning_rate": 0.0002918070393634425, + "loss": 0.9241, + "step": 4420 + }, + { + "epoch": 0.53, + "grad_norm": 0.18394945561885834, + "learning_rate": 0.00029177346240126273, + "loss": 1.0194, + "step": 4425 + }, + { + "epoch": 0.53, + "grad_norm": 0.21276423335075378, + "learning_rate": 0.0002917398187157834, + "loss": 1.0077, + "step": 4430 + }, + { + "epoch": 0.53, + "grad_norm": 0.19257494807243347, + "learning_rate": 0.0002917061083228383, + "loss": 0.982, + "step": 4435 + }, + { + "epoch": 0.53, + "grad_norm": 0.2055777907371521, + "learning_rate": 0.0002916723312382927, + "loss": 1.0397, + "step": 4440 + }, + { + "epoch": 0.54, + "grad_norm": 0.18082697689533234, + "learning_rate": 0.00029163848747804327, + "loss": 0.963, + "step": 4445 + }, + { + "epoch": 0.54, + "grad_norm": 0.18980878591537476, + "learning_rate": 0.00029160457705801796, + "loss": 0.9314, + "step": 4450 + }, + { + "epoch": 0.54, + "grad_norm": 0.1710338592529297, + "learning_rate": 0.0002915705999941761, + "loss": 1.0261, + "step": 4455 + }, + { + "epoch": 0.54, + "grad_norm": 0.17998263239860535, + "learning_rate": 0.0002915365563025085, + "loss": 0.9295, + "step": 4460 + }, + { + "epoch": 0.54, + "grad_norm": 0.19266332685947418, + "learning_rate": 0.00029150244599903725, + "loss": 0.8856, + "step": 4465 + }, + { + "epoch": 0.54, + "grad_norm": 0.1885053515434265, + "learning_rate": 0.0002914682690998157, + "loss": 0.9803, + "step": 4470 + }, + { + "epoch": 0.54, + "grad_norm": 0.19506719708442688, + "learning_rate": 0.00029143402562092875, + "loss": 0.9873, + "step": 4475 + }, + { + "epoch": 0.54, + "grad_norm": 0.20998205244541168, + "learning_rate": 0.0002913997155784924, + "loss": 0.9647, + "step": 4480 + }, + { + "epoch": 0.54, + "grad_norm": 0.21117228269577026, + "learning_rate": 0.00029136533898865423, + "loss": 0.9814, + "step": 4485 + }, + { + "epoch": 0.54, + "grad_norm": 0.18329034745693207, + "learning_rate": 0.0002913308958675929, + "loss": 0.9783, + "step": 4490 + }, + { + "epoch": 0.54, + "grad_norm": 0.18641537427902222, + "learning_rate": 0.0002912963862315185, + "loss": 0.8872, + "step": 4495 + }, + { + "epoch": 0.54, + "grad_norm": 0.19384218752384186, + "learning_rate": 0.0002912618100966725, + "loss": 0.9641, + "step": 4500 + }, + { + "epoch": 0.54, + "grad_norm": 0.19041642546653748, + "learning_rate": 0.00029122716747932747, + "loss": 0.9446, + "step": 4505 + }, + { + "epoch": 0.54, + "grad_norm": 0.20249037444591522, + "learning_rate": 0.0002911924583957874, + "loss": 0.9998, + "step": 4510 + }, + { + "epoch": 0.54, + "grad_norm": 0.1947818547487259, + "learning_rate": 0.00029115768286238757, + "loss": 0.9414, + "step": 4515 + }, + { + "epoch": 0.54, + "grad_norm": 0.1864546537399292, + "learning_rate": 0.00029112284089549445, + "loss": 1.0308, + "step": 4520 + }, + { + "epoch": 0.55, + "grad_norm": 0.18546593189239502, + "learning_rate": 0.0002910879325115059, + "loss": 0.9426, + "step": 4525 + }, + { + "epoch": 0.55, + "grad_norm": 0.19408635795116425, + "learning_rate": 0.0002910529577268509, + "loss": 0.9535, + "step": 4530 + }, + { + "epoch": 0.55, + "grad_norm": 0.19161821901798248, + "learning_rate": 0.0002910179165579898, + "loss": 0.9453, + "step": 4535 + }, + { + "epoch": 0.55, + "grad_norm": 0.1930609792470932, + "learning_rate": 0.00029098280902141406, + "loss": 1.051, + "step": 4540 + }, + { + "epoch": 0.55, + "grad_norm": 0.17416320741176605, + "learning_rate": 0.0002909476351336465, + "loss": 0.9667, + "step": 4545 + }, + { + "epoch": 0.55, + "grad_norm": 0.1923515945672989, + "learning_rate": 0.0002909123949112412, + "loss": 0.9336, + "step": 4550 + }, + { + "epoch": 0.55, + "grad_norm": 0.19930413365364075, + "learning_rate": 0.0002908770883707832, + "loss": 1.032, + "step": 4555 + }, + { + "epoch": 0.55, + "grad_norm": 0.19108958542346954, + "learning_rate": 0.00029084171552888914, + "loss": 0.9452, + "step": 4560 + }, + { + "epoch": 0.55, + "grad_norm": 0.20645657181739807, + "learning_rate": 0.00029080627640220647, + "loss": 0.9045, + "step": 4565 + }, + { + "epoch": 0.55, + "grad_norm": 0.21397635340690613, + "learning_rate": 0.0002907707710074141, + "loss": 0.8976, + "step": 4570 + }, + { + "epoch": 0.55, + "grad_norm": 0.1848032921552658, + "learning_rate": 0.000290735199361222, + "loss": 0.8218, + "step": 4575 + }, + { + "epoch": 0.55, + "grad_norm": 0.18126817047595978, + "learning_rate": 0.0002906995614803715, + "loss": 1.0663, + "step": 4580 + }, + { + "epoch": 0.55, + "grad_norm": 0.1907825469970703, + "learning_rate": 0.0002906638573816348, + "loss": 0.9277, + "step": 4585 + }, + { + "epoch": 0.55, + "grad_norm": 0.18289636075496674, + "learning_rate": 0.0002906280870818156, + "loss": 1.0149, + "step": 4590 + }, + { + "epoch": 0.55, + "grad_norm": 0.17690010368824005, + "learning_rate": 0.00029059225059774844, + "loss": 1.0116, + "step": 4595 + }, + { + "epoch": 0.55, + "grad_norm": 0.19581712782382965, + "learning_rate": 0.00029055634794629924, + "loss": 0.9966, + "step": 4600 + }, + { + "epoch": 0.55, + "grad_norm": 0.20098170638084412, + "learning_rate": 0.00029052037914436494, + "loss": 1.0375, + "step": 4605 + }, + { + "epoch": 0.56, + "grad_norm": 0.20016787946224213, + "learning_rate": 0.00029048434420887373, + "loss": 1.0045, + "step": 4610 + }, + { + "epoch": 0.56, + "grad_norm": 0.1856658011674881, + "learning_rate": 0.0002904482431567847, + "loss": 1.0191, + "step": 4615 + }, + { + "epoch": 0.56, + "grad_norm": 0.19540125131607056, + "learning_rate": 0.0002904120760050884, + "loss": 1.0104, + "step": 4620 + }, + { + "epoch": 0.56, + "grad_norm": 0.20216308534145355, + "learning_rate": 0.00029037584277080616, + "loss": 1.0659, + "step": 4625 + }, + { + "epoch": 0.56, + "grad_norm": 0.224759042263031, + "learning_rate": 0.00029033954347099057, + "loss": 0.9496, + "step": 4630 + }, + { + "epoch": 0.56, + "grad_norm": 0.16096115112304688, + "learning_rate": 0.0002903031781227253, + "loss": 0.9887, + "step": 4635 + }, + { + "epoch": 0.56, + "grad_norm": 0.20530791580677032, + "learning_rate": 0.00029026674674312503, + "loss": 0.9417, + "step": 4640 + }, + { + "epoch": 0.56, + "grad_norm": 0.17820000648498535, + "learning_rate": 0.0002902302493493357, + "loss": 0.9157, + "step": 4645 + }, + { + "epoch": 0.56, + "grad_norm": 0.20749807357788086, + "learning_rate": 0.00029019368595853407, + "loss": 0.958, + "step": 4650 + }, + { + "epoch": 0.56, + "grad_norm": 0.19266118109226227, + "learning_rate": 0.00029015705658792817, + "loss": 1.05, + "step": 4655 + }, + { + "epoch": 0.56, + "grad_norm": 0.2041333168745041, + "learning_rate": 0.00029012036125475695, + "loss": 0.9891, + "step": 4660 + }, + { + "epoch": 0.56, + "grad_norm": 0.19730497896671295, + "learning_rate": 0.00029008359997629045, + "loss": 1.0394, + "step": 4665 + }, + { + "epoch": 0.56, + "grad_norm": 0.17330877482891083, + "learning_rate": 0.00029004677276982986, + "loss": 1.0221, + "step": 4670 + }, + { + "epoch": 0.56, + "grad_norm": 0.1951434314250946, + "learning_rate": 0.0002900098796527071, + "loss": 1.0018, + "step": 4675 + }, + { + "epoch": 0.56, + "grad_norm": 0.21202173829078674, + "learning_rate": 0.00028997292064228544, + "loss": 0.9171, + "step": 4680 + }, + { + "epoch": 0.56, + "grad_norm": 0.19243790209293365, + "learning_rate": 0.00028993589575595894, + "loss": 0.9833, + "step": 4685 + }, + { + "epoch": 0.57, + "grad_norm": 0.1826474815607071, + "learning_rate": 0.00028989880501115276, + "loss": 0.875, + "step": 4690 + }, + { + "epoch": 0.57, + "grad_norm": 0.20149312913417816, + "learning_rate": 0.0002898616484253231, + "loss": 1.0055, + "step": 4695 + }, + { + "epoch": 0.57, + "grad_norm": 0.19697706401348114, + "learning_rate": 0.000289824426015957, + "loss": 0.9464, + "step": 4700 + }, + { + "epoch": 0.57, + "grad_norm": 0.19809427857398987, + "learning_rate": 0.00028978713780057256, + "loss": 0.9814, + "step": 4705 + }, + { + "epoch": 0.57, + "grad_norm": 0.206287682056427, + "learning_rate": 0.00028974978379671894, + "loss": 0.9652, + "step": 4710 + }, + { + "epoch": 0.57, + "grad_norm": 0.18428125977516174, + "learning_rate": 0.0002897123640219761, + "loss": 0.942, + "step": 4715 + }, + { + "epoch": 0.57, + "grad_norm": 0.16364052891731262, + "learning_rate": 0.000289674878493955, + "loss": 0.9741, + "step": 4720 + }, + { + "epoch": 0.57, + "grad_norm": 0.20780855417251587, + "learning_rate": 0.0002896373272302977, + "loss": 0.9654, + "step": 4725 + }, + { + "epoch": 0.57, + "grad_norm": 0.2168758362531662, + "learning_rate": 0.0002895997102486769, + "loss": 0.9247, + "step": 4730 + }, + { + "epoch": 0.57, + "grad_norm": 0.1811356246471405, + "learning_rate": 0.0002895620275667966, + "loss": 0.9142, + "step": 4735 + }, + { + "epoch": 0.57, + "grad_norm": 0.18936580419540405, + "learning_rate": 0.00028952427920239134, + "loss": 0.9091, + "step": 4740 + }, + { + "epoch": 0.57, + "grad_norm": 0.19724291563034058, + "learning_rate": 0.00028948646517322686, + "loss": 0.9266, + "step": 4745 + }, + { + "epoch": 0.57, + "grad_norm": 0.17332862317562103, + "learning_rate": 0.0002894485854970997, + "loss": 1.0734, + "step": 4750 + }, + { + "epoch": 0.57, + "grad_norm": 0.1730174571275711, + "learning_rate": 0.00028941064019183713, + "loss": 1.0088, + "step": 4755 + }, + { + "epoch": 0.57, + "grad_norm": 0.18203985691070557, + "learning_rate": 0.0002893726292752977, + "loss": 0.9595, + "step": 4760 + }, + { + "epoch": 0.57, + "grad_norm": 0.18378064036369324, + "learning_rate": 0.0002893345527653705, + "loss": 0.9354, + "step": 4765 + }, + { + "epoch": 0.57, + "grad_norm": 0.18141499161720276, + "learning_rate": 0.0002892964106799757, + "loss": 0.9606, + "step": 4770 + }, + { + "epoch": 0.58, + "grad_norm": 0.19755259156227112, + "learning_rate": 0.000289258203037064, + "loss": 1.054, + "step": 4775 + }, + { + "epoch": 0.58, + "grad_norm": 0.213453009724617, + "learning_rate": 0.0002892199298546174, + "loss": 0.8363, + "step": 4780 + }, + { + "epoch": 0.58, + "grad_norm": 0.18598608672618866, + "learning_rate": 0.00028918159115064846, + "loss": 0.9375, + "step": 4785 + }, + { + "epoch": 0.58, + "grad_norm": 0.19149407744407654, + "learning_rate": 0.0002891431869432006, + "loss": 0.97, + "step": 4790 + }, + { + "epoch": 0.58, + "grad_norm": 0.20365601778030396, + "learning_rate": 0.0002891047172503482, + "loss": 0.9405, + "step": 4795 + }, + { + "epoch": 0.58, + "grad_norm": 0.18206772208213806, + "learning_rate": 0.0002890661820901963, + "loss": 1.011, + "step": 4800 + }, + { + "epoch": 0.58, + "grad_norm": 0.18795382976531982, + "learning_rate": 0.00028902758148088094, + "loss": 0.854, + "step": 4805 + }, + { + "epoch": 0.58, + "grad_norm": 0.18891260027885437, + "learning_rate": 0.00028898891544056873, + "loss": 0.9468, + "step": 4810 + }, + { + "epoch": 0.58, + "grad_norm": 0.19970501959323883, + "learning_rate": 0.0002889501839874572, + "loss": 0.9918, + "step": 4815 + }, + { + "epoch": 0.58, + "grad_norm": 0.20287778973579407, + "learning_rate": 0.00028891138713977476, + "loss": 0.9039, + "step": 4820 + }, + { + "epoch": 0.58, + "grad_norm": 0.17449329793453217, + "learning_rate": 0.0002888725249157804, + "loss": 0.928, + "step": 4825 + }, + { + "epoch": 0.58, + "grad_norm": 0.22723489999771118, + "learning_rate": 0.000288833597333764, + "loss": 0.9622, + "step": 4830 + }, + { + "epoch": 0.58, + "grad_norm": 0.17067822813987732, + "learning_rate": 0.0002887946044120461, + "loss": 0.9349, + "step": 4835 + }, + { + "epoch": 0.58, + "grad_norm": 0.19400516152381897, + "learning_rate": 0.00028875554616897823, + "loss": 0.9223, + "step": 4840 + }, + { + "epoch": 0.58, + "grad_norm": 0.1980266571044922, + "learning_rate": 0.00028871642262294234, + "loss": 0.9518, + "step": 4845 + }, + { + "epoch": 0.58, + "grad_norm": 0.1934380978345871, + "learning_rate": 0.0002886772337923514, + "loss": 1.0257, + "step": 4850 + }, + { + "epoch": 0.58, + "grad_norm": 0.1748974621295929, + "learning_rate": 0.0002886379796956489, + "loss": 1.0209, + "step": 4855 + }, + { + "epoch": 0.59, + "grad_norm": 0.17561843991279602, + "learning_rate": 0.0002885986603513091, + "loss": 0.9646, + "step": 4860 + }, + { + "epoch": 0.59, + "grad_norm": 0.19227540493011475, + "learning_rate": 0.00028855927577783706, + "loss": 0.9638, + "step": 4865 + }, + { + "epoch": 0.59, + "grad_norm": 0.20389790832996368, + "learning_rate": 0.0002885198259937684, + "loss": 0.9399, + "step": 4870 + }, + { + "epoch": 0.59, + "grad_norm": 0.18054336309432983, + "learning_rate": 0.0002884803110176695, + "loss": 0.9318, + "step": 4875 + }, + { + "epoch": 0.59, + "grad_norm": 0.1733674854040146, + "learning_rate": 0.00028844073086813744, + "loss": 0.8768, + "step": 4880 + }, + { + "epoch": 0.59, + "grad_norm": 0.185361847281456, + "learning_rate": 0.00028840108556380006, + "loss": 0.9853, + "step": 4885 + }, + { + "epoch": 0.59, + "grad_norm": 0.1938917487859726, + "learning_rate": 0.00028836137512331555, + "loss": 0.9558, + "step": 4890 + }, + { + "epoch": 0.59, + "grad_norm": 0.18514370918273926, + "learning_rate": 0.00028832159956537306, + "loss": 0.9225, + "step": 4895 + }, + { + "epoch": 0.59, + "grad_norm": 0.1769241839647293, + "learning_rate": 0.0002882817589086924, + "loss": 1.0125, + "step": 4900 + }, + { + "epoch": 0.59, + "grad_norm": 0.18880169093608856, + "learning_rate": 0.0002882418531720237, + "loss": 0.9748, + "step": 4905 + }, + { + "epoch": 0.59, + "grad_norm": 0.20545773208141327, + "learning_rate": 0.0002882018823741481, + "loss": 0.972, + "step": 4910 + }, + { + "epoch": 0.59, + "grad_norm": 0.17919191718101501, + "learning_rate": 0.00028816184653387706, + "loss": 0.9201, + "step": 4915 + }, + { + "epoch": 0.59, + "grad_norm": 0.17469099164009094, + "learning_rate": 0.00028812174567005285, + "loss": 0.9619, + "step": 4920 + }, + { + "epoch": 0.59, + "grad_norm": 0.1765175461769104, + "learning_rate": 0.0002880815798015483, + "loss": 0.9707, + "step": 4925 + }, + { + "epoch": 0.59, + "grad_norm": 0.18801696598529816, + "learning_rate": 0.00028804134894726665, + "loss": 0.9369, + "step": 4930 + }, + { + "epoch": 0.59, + "grad_norm": 0.19199238717556, + "learning_rate": 0.00028800105312614196, + "loss": 0.8942, + "step": 4935 + }, + { + "epoch": 0.6, + "grad_norm": 0.17167691886425018, + "learning_rate": 0.00028796069235713893, + "loss": 0.8596, + "step": 4940 + }, + { + "epoch": 0.6, + "grad_norm": 0.18981756269931793, + "learning_rate": 0.00028792026665925245, + "loss": 1.0821, + "step": 4945 + }, + { + "epoch": 0.6, + "grad_norm": 0.18595005571842194, + "learning_rate": 0.0002878797760515083, + "loss": 1.0417, + "step": 4950 + }, + { + "epoch": 0.6, + "grad_norm": 0.20380659401416779, + "learning_rate": 0.0002878392205529627, + "loss": 0.9396, + "step": 4955 + }, + { + "epoch": 0.6, + "grad_norm": 0.21177223324775696, + "learning_rate": 0.0002877986001827024, + "loss": 0.9777, + "step": 4960 + }, + { + "epoch": 0.6, + "grad_norm": 0.18329915404319763, + "learning_rate": 0.00028775791495984474, + "loss": 1.068, + "step": 4965 + }, + { + "epoch": 0.6, + "grad_norm": 0.2002885490655899, + "learning_rate": 0.0002877171649035375, + "loss": 0.9144, + "step": 4970 + }, + { + "epoch": 0.6, + "grad_norm": 0.18435192108154297, + "learning_rate": 0.00028767635003295895, + "loss": 0.8893, + "step": 4975 + }, + { + "epoch": 0.6, + "grad_norm": 0.18662065267562866, + "learning_rate": 0.000287635470367318, + "loss": 0.9352, + "step": 4980 + }, + { + "epoch": 0.6, + "grad_norm": 0.19780193269252777, + "learning_rate": 0.000287594525925854, + "loss": 0.9593, + "step": 4985 + }, + { + "epoch": 0.6, + "grad_norm": 0.18069545924663544, + "learning_rate": 0.0002875535167278367, + "loss": 1.0398, + "step": 4990 + }, + { + "epoch": 0.6, + "grad_norm": 0.1925516277551651, + "learning_rate": 0.00028751244279256647, + "loss": 0.9382, + "step": 4995 + }, + { + "epoch": 0.6, + "grad_norm": 0.1789090633392334, + "learning_rate": 0.0002874713041393739, + "loss": 1.0408, + "step": 5000 + }, + { + "epoch": 0.6, + "grad_norm": 0.20351684093475342, + "learning_rate": 0.0002874301007876204, + "loss": 1.0124, + "step": 5005 + }, + { + "epoch": 0.6, + "grad_norm": 0.20231549441814423, + "learning_rate": 0.00028738883275669755, + "loss": 1.0305, + "step": 5010 + }, + { + "epoch": 0.6, + "grad_norm": 0.17243053019046783, + "learning_rate": 0.00028734750006602746, + "loss": 0.9991, + "step": 5015 + }, + { + "epoch": 0.6, + "grad_norm": 0.17361502349376678, + "learning_rate": 0.0002873061027350627, + "loss": 0.971, + "step": 5020 + }, + { + "epoch": 0.61, + "grad_norm": 0.1816028356552124, + "learning_rate": 0.00028726464078328615, + "loss": 0.9863, + "step": 5025 + }, + { + "epoch": 0.61, + "grad_norm": 0.18330328166484833, + "learning_rate": 0.00028722311423021125, + "loss": 1.0478, + "step": 5030 + }, + { + "epoch": 0.61, + "grad_norm": 0.20598195493221283, + "learning_rate": 0.00028718152309538175, + "loss": 0.9304, + "step": 5035 + }, + { + "epoch": 0.61, + "grad_norm": 0.19323958456516266, + "learning_rate": 0.00028713986739837183, + "loss": 0.9557, + "step": 5040 + }, + { + "epoch": 0.61, + "grad_norm": 0.17913362383842468, + "learning_rate": 0.0002870981471587861, + "loss": 0.8907, + "step": 5045 + }, + { + "epoch": 0.61, + "grad_norm": 0.1934191733598709, + "learning_rate": 0.0002870563623962593, + "loss": 0.9355, + "step": 5050 + }, + { + "epoch": 0.61, + "grad_norm": 0.1913587599992752, + "learning_rate": 0.00028701451313045695, + "loss": 0.9905, + "step": 5055 + }, + { + "epoch": 0.61, + "grad_norm": 0.19491787254810333, + "learning_rate": 0.0002869725993810746, + "loss": 0.9745, + "step": 5060 + }, + { + "epoch": 0.61, + "grad_norm": 0.20128677785396576, + "learning_rate": 0.00028693062116783816, + "loss": 0.9302, + "step": 5065 + }, + { + "epoch": 0.61, + "grad_norm": 0.18914251029491425, + "learning_rate": 0.00028688857851050416, + "loss": 0.891, + "step": 5070 + }, + { + "epoch": 0.61, + "grad_norm": 0.19719275832176208, + "learning_rate": 0.0002868464714288592, + "loss": 0.9716, + "step": 5075 + }, + { + "epoch": 0.61, + "grad_norm": 0.1894945651292801, + "learning_rate": 0.0002868042999427202, + "loss": 0.9554, + "step": 5080 + }, + { + "epoch": 0.61, + "grad_norm": 0.1891942322254181, + "learning_rate": 0.0002867620640719346, + "loss": 0.8918, + "step": 5085 + }, + { + "epoch": 0.61, + "grad_norm": 0.1876845359802246, + "learning_rate": 0.00028671976383637984, + "loss": 0.98, + "step": 5090 + }, + { + "epoch": 0.61, + "grad_norm": 0.17926909029483795, + "learning_rate": 0.00028667739925596395, + "loss": 0.8958, + "step": 5095 + }, + { + "epoch": 0.61, + "grad_norm": 0.203196182847023, + "learning_rate": 0.000286634970350625, + "loss": 0.9356, + "step": 5100 + }, + { + "epoch": 0.62, + "grad_norm": 0.17668934166431427, + "learning_rate": 0.00028659247714033154, + "loss": 0.9355, + "step": 5105 + }, + { + "epoch": 0.62, + "grad_norm": 0.1789340376853943, + "learning_rate": 0.00028654991964508224, + "loss": 1.0171, + "step": 5110 + }, + { + "epoch": 0.62, + "grad_norm": 0.17405730485916138, + "learning_rate": 0.00028650729788490606, + "loss": 0.9634, + "step": 5115 + }, + { + "epoch": 0.62, + "grad_norm": 0.1780971735715866, + "learning_rate": 0.0002864646118798622, + "loss": 0.9728, + "step": 5120 + }, + { + "epoch": 0.62, + "grad_norm": 0.2179042249917984, + "learning_rate": 0.0002864218616500402, + "loss": 1.0805, + "step": 5125 + }, + { + "epoch": 0.62, + "grad_norm": 0.20955954492092133, + "learning_rate": 0.00028637904721555966, + "loss": 0.9261, + "step": 5130 + }, + { + "epoch": 0.62, + "grad_norm": 0.18898724019527435, + "learning_rate": 0.00028633616859657045, + "loss": 1.0134, + "step": 5135 + }, + { + "epoch": 0.62, + "grad_norm": 0.20803610980510712, + "learning_rate": 0.0002862932258132527, + "loss": 0.9414, + "step": 5140 + }, + { + "epoch": 0.62, + "grad_norm": 0.21270735561847687, + "learning_rate": 0.00028625021888581685, + "loss": 0.9779, + "step": 5145 + }, + { + "epoch": 0.62, + "grad_norm": 0.20544709265232086, + "learning_rate": 0.0002862071478345031, + "loss": 0.9707, + "step": 5150 + }, + { + "epoch": 0.62, + "grad_norm": 0.19069305062294006, + "learning_rate": 0.00028616401267958237, + "loss": 0.9954, + "step": 5155 + }, + { + "epoch": 0.62, + "grad_norm": 0.19101087749004364, + "learning_rate": 0.00028612081344135546, + "loss": 0.9348, + "step": 5160 + }, + { + "epoch": 0.62, + "grad_norm": 0.17791791260242462, + "learning_rate": 0.0002860775501401532, + "loss": 0.9205, + "step": 5165 + }, + { + "epoch": 0.62, + "grad_norm": 0.21044310927391052, + "learning_rate": 0.00028603422279633694, + "loss": 0.9649, + "step": 5170 + }, + { + "epoch": 0.62, + "grad_norm": 0.1965063214302063, + "learning_rate": 0.0002859908314302978, + "loss": 0.9648, + "step": 5175 + }, + { + "epoch": 0.62, + "grad_norm": 0.19343961775302887, + "learning_rate": 0.00028594737606245726, + "loss": 0.932, + "step": 5180 + }, + { + "epoch": 0.62, + "grad_norm": 0.18944412469863892, + "learning_rate": 0.00028590385671326695, + "loss": 0.9144, + "step": 5185 + }, + { + "epoch": 0.63, + "grad_norm": 0.20036911964416504, + "learning_rate": 0.0002858602734032084, + "loss": 0.965, + "step": 5190 + }, + { + "epoch": 0.63, + "grad_norm": 0.21332001686096191, + "learning_rate": 0.00028581662615279345, + "loss": 0.9766, + "step": 5195 + }, + { + "epoch": 0.63, + "grad_norm": 0.17393194139003754, + "learning_rate": 0.00028577291498256384, + "loss": 0.9954, + "step": 5200 + }, + { + "epoch": 0.63, + "grad_norm": 0.1827279031276703, + "learning_rate": 0.0002857291399130916, + "loss": 0.9646, + "step": 5205 + }, + { + "epoch": 0.63, + "grad_norm": 0.190265491604805, + "learning_rate": 0.0002856853009649787, + "loss": 1.0151, + "step": 5210 + }, + { + "epoch": 0.63, + "grad_norm": 0.18640612065792084, + "learning_rate": 0.0002856413981588572, + "loss": 1.0085, + "step": 5215 + }, + { + "epoch": 0.63, + "grad_norm": 0.21478848159313202, + "learning_rate": 0.00028559743151538913, + "loss": 0.9588, + "step": 5220 + }, + { + "epoch": 0.63, + "grad_norm": 0.18051999807357788, + "learning_rate": 0.00028555340105526676, + "loss": 0.9699, + "step": 5225 + }, + { + "epoch": 0.63, + "grad_norm": 0.1969325840473175, + "learning_rate": 0.0002855093067992123, + "loss": 0.9939, + "step": 5230 + }, + { + "epoch": 0.63, + "grad_norm": 0.2040921300649643, + "learning_rate": 0.00028546514876797796, + "loss": 0.8854, + "step": 5235 + }, + { + "epoch": 0.63, + "grad_norm": 0.1841362863779068, + "learning_rate": 0.0002854209269823459, + "loss": 0.9255, + "step": 5240 + }, + { + "epoch": 0.63, + "grad_norm": 0.19539733231067657, + "learning_rate": 0.0002853766414631285, + "loss": 1.0226, + "step": 5245 + }, + { + "epoch": 0.63, + "grad_norm": 0.19817125797271729, + "learning_rate": 0.0002853322922311678, + "loss": 0.9626, + "step": 5250 + }, + { + "epoch": 0.63, + "grad_norm": 0.21229805052280426, + "learning_rate": 0.0002852878793073362, + "loss": 0.957, + "step": 5255 + }, + { + "epoch": 0.63, + "grad_norm": 0.19763095676898956, + "learning_rate": 0.0002852434027125358, + "loss": 0.8457, + "step": 5260 + }, + { + "epoch": 0.63, + "grad_norm": 0.19969575107097626, + "learning_rate": 0.00028519886246769884, + "loss": 1.0371, + "step": 5265 + }, + { + "epoch": 0.63, + "grad_norm": 0.1981430947780609, + "learning_rate": 0.0002851542585937873, + "loss": 0.9438, + "step": 5270 + }, + { + "epoch": 0.64, + "grad_norm": 0.20760881900787354, + "learning_rate": 0.0002851095911117934, + "loss": 1.0244, + "step": 5275 + }, + { + "epoch": 0.64, + "grad_norm": 0.21313069760799408, + "learning_rate": 0.00028506486004273903, + "loss": 1.0125, + "step": 5280 + }, + { + "epoch": 0.64, + "grad_norm": 0.18353518843650818, + "learning_rate": 0.00028502006540767616, + "loss": 0.9716, + "step": 5285 + }, + { + "epoch": 0.64, + "grad_norm": 0.1756833791732788, + "learning_rate": 0.0002849752072276867, + "loss": 1.0393, + "step": 5290 + }, + { + "epoch": 0.64, + "grad_norm": 0.20792143046855927, + "learning_rate": 0.00028493028552388223, + "loss": 1.0315, + "step": 5295 + }, + { + "epoch": 0.64, + "grad_norm": 0.2053552269935608, + "learning_rate": 0.00028488530031740454, + "loss": 1.0352, + "step": 5300 + }, + { + "epoch": 0.64, + "grad_norm": 0.1828060746192932, + "learning_rate": 0.00028484025162942516, + "loss": 0.8927, + "step": 5305 + }, + { + "epoch": 0.64, + "grad_norm": 0.19744126498699188, + "learning_rate": 0.0002847951394811454, + "loss": 0.9503, + "step": 5310 + }, + { + "epoch": 0.64, + "grad_norm": 0.22091755270957947, + "learning_rate": 0.0002847499638937966, + "loss": 0.9118, + "step": 5315 + }, + { + "epoch": 0.64, + "grad_norm": 0.19735994935035706, + "learning_rate": 0.0002847047248886399, + "loss": 0.9983, + "step": 5320 + }, + { + "epoch": 0.64, + "grad_norm": 0.17844171822071075, + "learning_rate": 0.00028465942248696624, + "loss": 0.9512, + "step": 5325 + }, + { + "epoch": 0.64, + "grad_norm": 0.20808811485767365, + "learning_rate": 0.00028461405671009645, + "loss": 0.9539, + "step": 5330 + }, + { + "epoch": 0.64, + "grad_norm": 0.18508221209049225, + "learning_rate": 0.00028456862757938117, + "loss": 0.9328, + "step": 5335 + }, + { + "epoch": 0.64, + "grad_norm": 0.1841544657945633, + "learning_rate": 0.0002845231351162009, + "loss": 0.9095, + "step": 5340 + }, + { + "epoch": 0.64, + "grad_norm": 0.1901453733444214, + "learning_rate": 0.0002844775793419659, + "loss": 0.8944, + "step": 5345 + }, + { + "epoch": 0.64, + "grad_norm": 0.2000284641981125, + "learning_rate": 0.00028443196027811617, + "loss": 0.9468, + "step": 5350 + }, + { + "epoch": 0.65, + "grad_norm": 0.2096180021762848, + "learning_rate": 0.0002843862779461216, + "loss": 0.9388, + "step": 5355 + }, + { + "epoch": 0.65, + "grad_norm": 0.18580293655395508, + "learning_rate": 0.00028434053236748175, + "loss": 0.9352, + "step": 5360 + }, + { + "epoch": 0.65, + "grad_norm": 0.21264111995697021, + "learning_rate": 0.00028429472356372606, + "loss": 1.0138, + "step": 5365 + }, + { + "epoch": 0.65, + "grad_norm": 0.19658392667770386, + "learning_rate": 0.00028424885155641373, + "loss": 0.9843, + "step": 5370 + }, + { + "epoch": 0.65, + "grad_norm": 0.19093391299247742, + "learning_rate": 0.00028420291636713354, + "loss": 0.9271, + "step": 5375 + }, + { + "epoch": 0.65, + "grad_norm": 0.21088851988315582, + "learning_rate": 0.00028415691801750417, + "loss": 1.0146, + "step": 5380 + }, + { + "epoch": 0.65, + "grad_norm": 0.17928850650787354, + "learning_rate": 0.000284110856529174, + "loss": 0.9973, + "step": 5385 + }, + { + "epoch": 0.65, + "grad_norm": 0.18797273933887482, + "learning_rate": 0.000284064731923821, + "loss": 1.0008, + "step": 5390 + }, + { + "epoch": 0.65, + "grad_norm": 0.19398821890354156, + "learning_rate": 0.00028401854422315306, + "loss": 0.8467, + "step": 5395 + }, + { + "epoch": 0.65, + "grad_norm": 0.20564605295658112, + "learning_rate": 0.0002839722934489076, + "loss": 0.9809, + "step": 5400 + }, + { + "epoch": 0.65, + "grad_norm": 0.2124805599451065, + "learning_rate": 0.0002839259796228517, + "loss": 0.9094, + "step": 5405 + }, + { + "epoch": 0.65, + "grad_norm": 0.16906094551086426, + "learning_rate": 0.0002838796027667823, + "loss": 0.9083, + "step": 5410 + }, + { + "epoch": 0.65, + "grad_norm": 0.21367835998535156, + "learning_rate": 0.0002838331629025258, + "loss": 0.9278, + "step": 5415 + }, + { + "epoch": 0.65, + "grad_norm": 0.20226390659809113, + "learning_rate": 0.00028378666005193846, + "loss": 0.9629, + "step": 5420 + }, + { + "epoch": 0.65, + "grad_norm": 0.21728989481925964, + "learning_rate": 0.0002837400942369059, + "loss": 0.952, + "step": 5425 + }, + { + "epoch": 0.65, + "grad_norm": 0.19252964854240417, + "learning_rate": 0.0002836934654793436, + "loss": 0.9497, + "step": 5430 + }, + { + "epoch": 0.65, + "grad_norm": 0.21155524253845215, + "learning_rate": 0.00028364677380119665, + "loss": 0.9298, + "step": 5435 + }, + { + "epoch": 0.66, + "grad_norm": 0.18080827593803406, + "learning_rate": 0.0002836000192244397, + "loss": 0.9502, + "step": 5440 + }, + { + "epoch": 0.66, + "grad_norm": 0.19508054852485657, + "learning_rate": 0.000283553201771077, + "loss": 0.9299, + "step": 5445 + }, + { + "epoch": 0.66, + "grad_norm": 0.19555960595607758, + "learning_rate": 0.00028350632146314234, + "loss": 0.8338, + "step": 5450 + }, + { + "epoch": 0.66, + "grad_norm": 0.19276316463947296, + "learning_rate": 0.00028345937832269924, + "loss": 1.0148, + "step": 5455 + }, + { + "epoch": 0.66, + "grad_norm": 0.19060495495796204, + "learning_rate": 0.0002834123723718406, + "loss": 0.9975, + "step": 5460 + }, + { + "epoch": 0.66, + "grad_norm": 0.18638461828231812, + "learning_rate": 0.00028336530363268903, + "loss": 1.1051, + "step": 5465 + }, + { + "epoch": 0.66, + "grad_norm": 0.21244901418685913, + "learning_rate": 0.00028331817212739666, + "loss": 0.9824, + "step": 5470 + }, + { + "epoch": 0.66, + "grad_norm": 0.1781817078590393, + "learning_rate": 0.0002832709778781451, + "loss": 0.9437, + "step": 5475 + }, + { + "epoch": 0.66, + "grad_norm": 0.21162748336791992, + "learning_rate": 0.00028322372090714565, + "loss": 0.8513, + "step": 5480 + }, + { + "epoch": 0.66, + "grad_norm": 0.20514048635959625, + "learning_rate": 0.00028317640123663886, + "loss": 0.9249, + "step": 5485 + }, + { + "epoch": 0.66, + "grad_norm": 0.1924484670162201, + "learning_rate": 0.000283129018888895, + "loss": 0.9966, + "step": 5490 + }, + { + "epoch": 0.66, + "grad_norm": 0.2287726253271103, + "learning_rate": 0.00028308157388621375, + "loss": 1.0219, + "step": 5495 + }, + { + "epoch": 0.66, + "grad_norm": 0.1756589561700821, + "learning_rate": 0.00028303406625092433, + "loss": 0.9488, + "step": 5500 + }, + { + "epoch": 0.66, + "grad_norm": 0.23016351461410522, + "learning_rate": 0.00028298649600538546, + "loss": 0.9064, + "step": 5505 + }, + { + "epoch": 0.66, + "grad_norm": 0.17148062586784363, + "learning_rate": 0.0002829388631719852, + "loss": 1.0173, + "step": 5510 + }, + { + "epoch": 0.66, + "grad_norm": 0.2095944881439209, + "learning_rate": 0.0002828911677731411, + "loss": 0.9834, + "step": 5515 + }, + { + "epoch": 0.67, + "grad_norm": 0.1759607195854187, + "learning_rate": 0.00028284340983130037, + "loss": 0.935, + "step": 5520 + }, + { + "epoch": 0.67, + "grad_norm": 0.2132364809513092, + "learning_rate": 0.0002827955893689393, + "loss": 0.9313, + "step": 5525 + }, + { + "epoch": 0.67, + "grad_norm": 0.18752391636371613, + "learning_rate": 0.00028274770640856394, + "loss": 0.8606, + "step": 5530 + }, + { + "epoch": 0.67, + "grad_norm": 0.1804865151643753, + "learning_rate": 0.00028269976097270946, + "loss": 1.0291, + "step": 5535 + }, + { + "epoch": 0.67, + "grad_norm": 0.17835399508476257, + "learning_rate": 0.0002826517530839407, + "loss": 0.9643, + "step": 5540 + }, + { + "epoch": 0.67, + "grad_norm": 0.21218432486057281, + "learning_rate": 0.0002826036827648517, + "loss": 0.9989, + "step": 5545 + }, + { + "epoch": 0.67, + "grad_norm": 0.20228761434555054, + "learning_rate": 0.0002825555500380659, + "loss": 0.9734, + "step": 5550 + }, + { + "epoch": 0.67, + "grad_norm": 0.19665905833244324, + "learning_rate": 0.0002825073549262363, + "loss": 0.9381, + "step": 5555 + }, + { + "epoch": 0.67, + "grad_norm": 0.20172518491744995, + "learning_rate": 0.000282459097452045, + "loss": 1.0319, + "step": 5560 + }, + { + "epoch": 0.67, + "grad_norm": 0.19320924580097198, + "learning_rate": 0.0002824107776382036, + "loss": 1.0349, + "step": 5565 + }, + { + "epoch": 0.67, + "grad_norm": 0.1959851086139679, + "learning_rate": 0.00028236239550745305, + "loss": 0.9762, + "step": 5570 + }, + { + "epoch": 0.67, + "grad_norm": 0.18430382013320923, + "learning_rate": 0.00028231395108256353, + "loss": 1.0082, + "step": 5575 + }, + { + "epoch": 0.67, + "grad_norm": 0.18709762394428253, + "learning_rate": 0.0002822654443863346, + "loss": 0.9993, + "step": 5580 + }, + { + "epoch": 0.67, + "grad_norm": 0.20324410498142242, + "learning_rate": 0.00028221687544159515, + "loss": 0.9235, + "step": 5585 + }, + { + "epoch": 0.67, + "grad_norm": 0.1890849769115448, + "learning_rate": 0.0002821682442712033, + "loss": 1.0188, + "step": 5590 + }, + { + "epoch": 0.67, + "grad_norm": 0.19836723804473877, + "learning_rate": 0.00028211955089804664, + "loss": 0.9934, + "step": 5595 + }, + { + "epoch": 0.67, + "grad_norm": 0.17568111419677734, + "learning_rate": 0.00028207079534504166, + "loss": 0.8919, + "step": 5600 + }, + { + "epoch": 0.68, + "grad_norm": 0.20651723444461823, + "learning_rate": 0.0002820219776351345, + "loss": 0.9338, + "step": 5605 + }, + { + "epoch": 0.68, + "grad_norm": 0.18715310096740723, + "learning_rate": 0.00028197309779130037, + "loss": 0.9869, + "step": 5610 + }, + { + "epoch": 0.68, + "grad_norm": 0.20862345397472382, + "learning_rate": 0.0002819241558365437, + "loss": 0.9206, + "step": 5615 + }, + { + "epoch": 0.68, + "grad_norm": 0.19458311796188354, + "learning_rate": 0.00028187515179389826, + "loss": 0.9267, + "step": 5620 + }, + { + "epoch": 0.68, + "grad_norm": 0.17458118498325348, + "learning_rate": 0.0002818260856864269, + "loss": 0.879, + "step": 5625 + }, + { + "epoch": 0.68, + "grad_norm": 0.19164584577083588, + "learning_rate": 0.0002817769575372218, + "loss": 0.9952, + "step": 5630 + }, + { + "epoch": 0.68, + "grad_norm": 0.1988450139760971, + "learning_rate": 0.00028172776736940436, + "loss": 0.8229, + "step": 5635 + }, + { + "epoch": 0.68, + "grad_norm": 0.2000226527452469, + "learning_rate": 0.000281678515206125, + "loss": 0.9074, + "step": 5640 + }, + { + "epoch": 0.68, + "grad_norm": 0.21437396109104156, + "learning_rate": 0.0002816292010705635, + "loss": 0.9317, + "step": 5645 + }, + { + "epoch": 0.68, + "grad_norm": 0.18313203752040863, + "learning_rate": 0.0002815798249859287, + "loss": 0.9046, + "step": 5650 + }, + { + "epoch": 0.68, + "grad_norm": 0.17462490499019623, + "learning_rate": 0.00028153038697545867, + "loss": 1.0031, + "step": 5655 + }, + { + "epoch": 0.68, + "grad_norm": 0.2010510116815567, + "learning_rate": 0.0002814808870624205, + "loss": 0.9259, + "step": 5660 + }, + { + "epoch": 0.68, + "grad_norm": 0.21494421362876892, + "learning_rate": 0.00028143132527011055, + "loss": 0.961, + "step": 5665 + }, + { + "epoch": 0.68, + "grad_norm": 0.20886385440826416, + "learning_rate": 0.00028138170162185424, + "loss": 0.863, + "step": 5670 + }, + { + "epoch": 0.68, + "grad_norm": 0.204929918050766, + "learning_rate": 0.00028133201614100604, + "loss": 0.948, + "step": 5675 + }, + { + "epoch": 0.68, + "grad_norm": 0.21892932057380676, + "learning_rate": 0.0002812822688509497, + "loss": 0.9707, + "step": 5680 + }, + { + "epoch": 0.68, + "grad_norm": 0.2091946005821228, + "learning_rate": 0.00028123245977509784, + "loss": 0.9417, + "step": 5685 + }, + { + "epoch": 0.69, + "grad_norm": 0.21841882169246674, + "learning_rate": 0.0002811825889368924, + "loss": 1.0073, + "step": 5690 + }, + { + "epoch": 0.69, + "grad_norm": 0.17423883080482483, + "learning_rate": 0.0002811326563598041, + "loss": 0.9885, + "step": 5695 + }, + { + "epoch": 0.69, + "grad_norm": 0.22584623098373413, + "learning_rate": 0.0002810826620673329, + "loss": 0.9202, + "step": 5700 + }, + { + "epoch": 0.69, + "grad_norm": 0.2022750824689865, + "learning_rate": 0.00028103260608300785, + "loss": 1.1009, + "step": 5705 + }, + { + "epoch": 0.69, + "grad_norm": 0.20764531195163727, + "learning_rate": 0.00028098248843038694, + "loss": 0.9689, + "step": 5710 + }, + { + "epoch": 0.69, + "grad_norm": 0.2145623415708542, + "learning_rate": 0.00028093230913305716, + "loss": 0.9474, + "step": 5715 + }, + { + "epoch": 0.69, + "grad_norm": 0.1971713900566101, + "learning_rate": 0.00028088206821463456, + "loss": 0.8979, + "step": 5720 + }, + { + "epoch": 0.69, + "grad_norm": 0.18897688388824463, + "learning_rate": 0.00028083176569876426, + "loss": 0.9777, + "step": 5725 + }, + { + "epoch": 0.69, + "grad_norm": 0.20765312016010284, + "learning_rate": 0.00028078140160912017, + "loss": 0.8788, + "step": 5730 + }, + { + "epoch": 0.69, + "grad_norm": 0.2097761631011963, + "learning_rate": 0.00028073097596940537, + "loss": 0.9324, + "step": 5735 + }, + { + "epoch": 0.69, + "grad_norm": 0.19260716438293457, + "learning_rate": 0.0002806804888033519, + "loss": 0.8419, + "step": 5740 + }, + { + "epoch": 0.69, + "grad_norm": 0.209737628698349, + "learning_rate": 0.0002806299401347206, + "loss": 0.8578, + "step": 5745 + }, + { + "epoch": 0.69, + "grad_norm": 0.1828380972146988, + "learning_rate": 0.00028057932998730136, + "loss": 0.9546, + "step": 5750 + }, + { + "epoch": 0.69, + "grad_norm": 0.19403524696826935, + "learning_rate": 0.00028052865838491304, + "loss": 1.0046, + "step": 5755 + }, + { + "epoch": 0.69, + "grad_norm": 0.21876457333564758, + "learning_rate": 0.00028047792535140336, + "loss": 0.9358, + "step": 5760 + }, + { + "epoch": 0.69, + "grad_norm": 0.182986319065094, + "learning_rate": 0.00028042713091064894, + "loss": 0.9362, + "step": 5765 + }, + { + "epoch": 0.7, + "grad_norm": 0.1893165558576584, + "learning_rate": 0.00028037627508655546, + "loss": 0.9647, + "step": 5770 + }, + { + "epoch": 0.7, + "grad_norm": 0.20949877798557281, + "learning_rate": 0.0002803253579030572, + "loss": 0.9254, + "step": 5775 + }, + { + "epoch": 0.7, + "grad_norm": 0.20711813867092133, + "learning_rate": 0.0002802743793841175, + "loss": 0.9423, + "step": 5780 + }, + { + "epoch": 0.7, + "grad_norm": 0.20353513956069946, + "learning_rate": 0.00028022333955372863, + "loss": 0.9021, + "step": 5785 + }, + { + "epoch": 0.7, + "grad_norm": 0.17357411980628967, + "learning_rate": 0.0002801722384359116, + "loss": 1.0085, + "step": 5790 + }, + { + "epoch": 0.7, + "grad_norm": 0.20761722326278687, + "learning_rate": 0.0002801210760547162, + "loss": 0.9346, + "step": 5795 + }, + { + "epoch": 0.7, + "grad_norm": 0.2074555903673172, + "learning_rate": 0.00028006985243422133, + "loss": 0.9919, + "step": 5800 + }, + { + "epoch": 0.7, + "grad_norm": 0.17981496453285217, + "learning_rate": 0.00028001856759853435, + "loss": 0.9016, + "step": 5805 + }, + { + "epoch": 0.7, + "grad_norm": 0.19092728197574615, + "learning_rate": 0.0002799672215717917, + "loss": 0.9511, + "step": 5810 + }, + { + "epoch": 0.7, + "grad_norm": 0.20729589462280273, + "learning_rate": 0.0002799158143781585, + "loss": 0.9397, + "step": 5815 + }, + { + "epoch": 0.7, + "grad_norm": 0.19325435161590576, + "learning_rate": 0.0002798643460418286, + "loss": 1.0048, + "step": 5820 + }, + { + "epoch": 0.7, + "grad_norm": 0.2112487405538559, + "learning_rate": 0.0002798128165870249, + "loss": 1.0018, + "step": 5825 + }, + { + "epoch": 0.7, + "grad_norm": 0.20866331458091736, + "learning_rate": 0.0002797612260379987, + "loss": 1.0327, + "step": 5830 + }, + { + "epoch": 0.7, + "grad_norm": 0.2057882696390152, + "learning_rate": 0.0002797095744190303, + "loss": 0.9205, + "step": 5835 + }, + { + "epoch": 0.7, + "grad_norm": 0.22236908972263336, + "learning_rate": 0.00027965786175442866, + "loss": 0.9787, + "step": 5840 + }, + { + "epoch": 0.7, + "grad_norm": 0.22123856842517853, + "learning_rate": 0.0002796060880685315, + "loss": 0.9991, + "step": 5845 + }, + { + "epoch": 0.7, + "grad_norm": 0.19182252883911133, + "learning_rate": 0.0002795542533857052, + "loss": 0.9982, + "step": 5850 + }, + { + "epoch": 0.71, + "grad_norm": 0.1859193593263626, + "learning_rate": 0.00027950235773034485, + "loss": 0.8689, + "step": 5855 + }, + { + "epoch": 0.71, + "grad_norm": 0.20256547629833221, + "learning_rate": 0.0002794504011268744, + "loss": 0.9262, + "step": 5860 + }, + { + "epoch": 0.71, + "grad_norm": 0.19035175442695618, + "learning_rate": 0.0002793983835997462, + "loss": 0.8788, + "step": 5865 + }, + { + "epoch": 0.71, + "grad_norm": 0.22139105200767517, + "learning_rate": 0.00027934630517344145, + "loss": 0.9814, + "step": 5870 + }, + { + "epoch": 0.71, + "grad_norm": 0.19183175265789032, + "learning_rate": 0.00027929416587247014, + "loss": 0.8917, + "step": 5875 + }, + { + "epoch": 0.71, + "grad_norm": 0.2111898511648178, + "learning_rate": 0.00027924196572137055, + "loss": 0.9915, + "step": 5880 + }, + { + "epoch": 0.71, + "grad_norm": 0.21190297603607178, + "learning_rate": 0.00027918970474470996, + "loss": 1.0738, + "step": 5885 + }, + { + "epoch": 0.71, + "grad_norm": 0.18288253247737885, + "learning_rate": 0.00027913738296708404, + "loss": 0.9588, + "step": 5890 + }, + { + "epoch": 0.71, + "grad_norm": 0.2102237343788147, + "learning_rate": 0.0002790850004131171, + "loss": 0.8978, + "step": 5895 + }, + { + "epoch": 0.71, + "grad_norm": 0.22349348664283752, + "learning_rate": 0.0002790325571074623, + "loss": 0.8554, + "step": 5900 + }, + { + "epoch": 0.71, + "grad_norm": 0.19603189826011658, + "learning_rate": 0.000278980053074801, + "loss": 0.8539, + "step": 5905 + }, + { + "epoch": 0.71, + "grad_norm": 0.19490835070610046, + "learning_rate": 0.0002789274883398435, + "loss": 0.9656, + "step": 5910 + }, + { + "epoch": 0.71, + "grad_norm": 0.19723361730575562, + "learning_rate": 0.0002788748629273284, + "loss": 0.8647, + "step": 5915 + }, + { + "epoch": 0.71, + "grad_norm": 0.224490687251091, + "learning_rate": 0.00027882217686202304, + "loss": 1.0071, + "step": 5920 + }, + { + "epoch": 0.71, + "grad_norm": 0.19954900443553925, + "learning_rate": 0.0002787694301687231, + "loss": 1.0931, + "step": 5925 + }, + { + "epoch": 0.71, + "grad_norm": 0.20417670905590057, + "learning_rate": 0.00027871662287225316, + "loss": 0.9013, + "step": 5930 + }, + { + "epoch": 0.72, + "grad_norm": 0.20717184245586395, + "learning_rate": 0.00027866375499746587, + "loss": 1.0245, + "step": 5935 + }, + { + "epoch": 0.72, + "grad_norm": 0.1946130394935608, + "learning_rate": 0.0002786108265692427, + "loss": 1.0055, + "step": 5940 + }, + { + "epoch": 0.72, + "grad_norm": 0.19675332307815552, + "learning_rate": 0.0002785578376124935, + "loss": 0.9285, + "step": 5945 + }, + { + "epoch": 0.72, + "grad_norm": 0.20337584614753723, + "learning_rate": 0.00027850478815215666, + "loss": 0.9615, + "step": 5950 + }, + { + "epoch": 0.72, + "grad_norm": 0.2144574224948883, + "learning_rate": 0.000278451678213199, + "loss": 0.9641, + "step": 5955 + }, + { + "epoch": 0.72, + "grad_norm": 0.19476327300071716, + "learning_rate": 0.0002783985078206158, + "loss": 0.8523, + "step": 5960 + }, + { + "epoch": 0.72, + "grad_norm": 0.21178047358989716, + "learning_rate": 0.00027834527699943087, + "loss": 0.9104, + "step": 5965 + }, + { + "epoch": 0.72, + "grad_norm": 0.20542141795158386, + "learning_rate": 0.00027829198577469636, + "loss": 0.9744, + "step": 5970 + }, + { + "epoch": 0.72, + "grad_norm": 0.18979620933532715, + "learning_rate": 0.0002782386341714929, + "loss": 0.9429, + "step": 5975 + }, + { + "epoch": 0.72, + "grad_norm": 0.18054763972759247, + "learning_rate": 0.00027818522221492953, + "loss": 0.9203, + "step": 5980 + }, + { + "epoch": 0.72, + "grad_norm": 0.1968124806880951, + "learning_rate": 0.0002781317499301437, + "loss": 0.974, + "step": 5985 + }, + { + "epoch": 0.72, + "grad_norm": 0.2041236311197281, + "learning_rate": 0.0002780782173423012, + "loss": 0.9027, + "step": 5990 + }, + { + "epoch": 0.72, + "grad_norm": 0.19817684590816498, + "learning_rate": 0.0002780246244765963, + "loss": 0.9473, + "step": 5995 + }, + { + "epoch": 0.72, + "grad_norm": 0.19076892733573914, + "learning_rate": 0.0002779709713582515, + "loss": 0.9915, + "step": 6000 + }, + { + "epoch": 0.72, + "grad_norm": 0.1970764398574829, + "learning_rate": 0.00027791725801251785, + "loss": 0.9481, + "step": 6005 + }, + { + "epoch": 0.72, + "grad_norm": 0.20971329510211945, + "learning_rate": 0.00027786348446467453, + "loss": 0.996, + "step": 6010 + }, + { + "epoch": 0.72, + "grad_norm": 0.18908850848674774, + "learning_rate": 0.00027780965074002925, + "loss": 0.9819, + "step": 6015 + }, + { + "epoch": 0.73, + "grad_norm": 0.1883561760187149, + "learning_rate": 0.0002777557568639179, + "loss": 0.915, + "step": 6020 + }, + { + "epoch": 0.73, + "grad_norm": 0.20280209183692932, + "learning_rate": 0.0002777018028617047, + "loss": 0.8848, + "step": 6025 + }, + { + "epoch": 0.73, + "grad_norm": 0.23292230069637299, + "learning_rate": 0.00027764778875878225, + "loss": 1.0372, + "step": 6030 + }, + { + "epoch": 0.73, + "grad_norm": 0.20740561187267303, + "learning_rate": 0.0002775937145805714, + "loss": 0.8555, + "step": 6035 + }, + { + "epoch": 0.73, + "grad_norm": 0.22304105758666992, + "learning_rate": 0.0002775395803525211, + "loss": 1.0331, + "step": 6040 + }, + { + "epoch": 0.73, + "grad_norm": 0.24742744863033295, + "learning_rate": 0.0002774853861001089, + "loss": 0.9875, + "step": 6045 + }, + { + "epoch": 0.73, + "grad_norm": 0.19389699399471283, + "learning_rate": 0.0002774311318488404, + "loss": 0.9054, + "step": 6050 + }, + { + "epoch": 0.73, + "grad_norm": 0.20694305002689362, + "learning_rate": 0.00027737681762424936, + "loss": 0.9755, + "step": 6055 + }, + { + "epoch": 0.73, + "grad_norm": 0.21031732857227325, + "learning_rate": 0.0002773224434518978, + "loss": 0.9667, + "step": 6060 + }, + { + "epoch": 0.73, + "grad_norm": 0.22336505353450775, + "learning_rate": 0.0002772680093573762, + "loss": 0.9697, + "step": 6065 + }, + { + "epoch": 0.73, + "grad_norm": 0.2182660698890686, + "learning_rate": 0.0002772135153663029, + "loss": 1.0385, + "step": 6070 + }, + { + "epoch": 0.73, + "grad_norm": 0.1784467101097107, + "learning_rate": 0.00027715896150432473, + "loss": 0.868, + "step": 6075 + }, + { + "epoch": 0.73, + "grad_norm": 0.2188597470521927, + "learning_rate": 0.0002771043477971164, + "loss": 0.8924, + "step": 6080 + }, + { + "epoch": 0.73, + "grad_norm": 0.2050686776638031, + "learning_rate": 0.000277049674270381, + "loss": 0.9832, + "step": 6085 + }, + { + "epoch": 0.73, + "grad_norm": 0.1878337264060974, + "learning_rate": 0.00027699494094984964, + "loss": 0.935, + "step": 6090 + }, + { + "epoch": 0.73, + "grad_norm": 0.20712505280971527, + "learning_rate": 0.00027694014786128175, + "loss": 0.9392, + "step": 6095 + }, + { + "epoch": 0.73, + "grad_norm": 0.2205507606267929, + "learning_rate": 0.00027688529503046473, + "loss": 0.9543, + "step": 6100 + }, + { + "epoch": 0.74, + "grad_norm": 0.21329660713672638, + "learning_rate": 0.0002768303824832141, + "loss": 0.9957, + "step": 6105 + }, + { + "epoch": 0.74, + "grad_norm": 0.22187063097953796, + "learning_rate": 0.00027677541024537363, + "loss": 0.8959, + "step": 6110 + }, + { + "epoch": 0.74, + "grad_norm": 0.2496599406003952, + "learning_rate": 0.00027672037834281497, + "loss": 0.9699, + "step": 6115 + }, + { + "epoch": 0.74, + "grad_norm": 0.20457853376865387, + "learning_rate": 0.000276665286801438, + "loss": 0.9096, + "step": 6120 + }, + { + "epoch": 0.74, + "grad_norm": 0.2040635645389557, + "learning_rate": 0.00027661013564717063, + "loss": 0.9936, + "step": 6125 + }, + { + "epoch": 0.74, + "grad_norm": 0.203216552734375, + "learning_rate": 0.00027655492490596885, + "loss": 0.9246, + "step": 6130 + }, + { + "epoch": 0.74, + "grad_norm": 0.18687255680561066, + "learning_rate": 0.0002764996546038167, + "loss": 0.8891, + "step": 6135 + }, + { + "epoch": 0.74, + "grad_norm": 0.2031632959842682, + "learning_rate": 0.00027644432476672614, + "loss": 1.0533, + "step": 6140 + }, + { + "epoch": 0.74, + "grad_norm": 0.19905006885528564, + "learning_rate": 0.00027638893542073726, + "loss": 0.9701, + "step": 6145 + }, + { + "epoch": 0.74, + "grad_norm": 0.21555280685424805, + "learning_rate": 0.0002763334865919181, + "loss": 0.8389, + "step": 6150 + }, + { + "epoch": 0.74, + "grad_norm": 0.2201685756444931, + "learning_rate": 0.00027627797830636475, + "loss": 0.9749, + "step": 6155 + }, + { + "epoch": 0.74, + "grad_norm": 0.20311476290225983, + "learning_rate": 0.00027622241059020123, + "loss": 0.914, + "step": 6160 + }, + { + "epoch": 0.74, + "grad_norm": 0.2248125523328781, + "learning_rate": 0.0002761667834695796, + "loss": 0.9121, + "step": 6165 + }, + { + "epoch": 0.74, + "grad_norm": 0.19576215744018555, + "learning_rate": 0.00027611109697067975, + "loss": 0.8951, + "step": 6170 + }, + { + "epoch": 0.74, + "grad_norm": 0.22087323665618896, + "learning_rate": 0.00027605535111970974, + "loss": 0.8982, + "step": 6175 + }, + { + "epoch": 0.74, + "grad_norm": 0.19735217094421387, + "learning_rate": 0.0002759995459429053, + "loss": 1.0688, + "step": 6180 + }, + { + "epoch": 0.75, + "grad_norm": 0.20120544731616974, + "learning_rate": 0.00027594368146653013, + "loss": 0.8408, + "step": 6185 + }, + { + "epoch": 0.75, + "grad_norm": 0.18868596851825714, + "learning_rate": 0.0002758877577168761, + "loss": 0.9014, + "step": 6190 + }, + { + "epoch": 0.75, + "grad_norm": 0.19567087292671204, + "learning_rate": 0.00027583177472026264, + "loss": 0.9612, + "step": 6195 + }, + { + "epoch": 0.75, + "grad_norm": 0.19370317459106445, + "learning_rate": 0.00027577573250303725, + "loss": 0.9728, + "step": 6200 + }, + { + "epoch": 0.75, + "grad_norm": 0.20244130492210388, + "learning_rate": 0.00027571963109157533, + "loss": 1.0169, + "step": 6205 + }, + { + "epoch": 0.75, + "grad_norm": 0.19868247210979462, + "learning_rate": 0.0002756634705122799, + "loss": 0.9771, + "step": 6210 + }, + { + "epoch": 0.75, + "grad_norm": 0.20510122179985046, + "learning_rate": 0.00027560725079158214, + "loss": 0.958, + "step": 6215 + }, + { + "epoch": 0.75, + "grad_norm": 0.21019263565540314, + "learning_rate": 0.00027555097195594086, + "loss": 0.9748, + "step": 6220 + }, + { + "epoch": 0.75, + "grad_norm": 0.21073760092258453, + "learning_rate": 0.0002754946340318428, + "loss": 0.9534, + "step": 6225 + }, + { + "epoch": 0.75, + "grad_norm": 0.23775875568389893, + "learning_rate": 0.0002754382370458024, + "loss": 0.9429, + "step": 6230 + }, + { + "epoch": 0.75, + "grad_norm": 0.19734768569469452, + "learning_rate": 0.00027538178102436194, + "loss": 0.9563, + "step": 6235 + }, + { + "epoch": 0.75, + "grad_norm": 0.20335164666175842, + "learning_rate": 0.00027532526599409154, + "loss": 0.9426, + "step": 6240 + }, + { + "epoch": 0.75, + "grad_norm": 0.20909333229064941, + "learning_rate": 0.000275268691981589, + "loss": 0.9798, + "step": 6245 + }, + { + "epoch": 0.75, + "grad_norm": 0.21021656692028046, + "learning_rate": 0.00027521205901348, + "loss": 0.9096, + "step": 6250 + }, + { + "epoch": 0.75, + "grad_norm": 0.17861585319042206, + "learning_rate": 0.0002751553671164179, + "loss": 0.9817, + "step": 6255 + }, + { + "epoch": 0.75, + "grad_norm": 0.2392309159040451, + "learning_rate": 0.00027509861631708373, + "loss": 0.9609, + "step": 6260 + }, + { + "epoch": 0.75, + "grad_norm": 0.2181014120578766, + "learning_rate": 0.0002750418066421863, + "loss": 0.9862, + "step": 6265 + }, + { + "epoch": 0.76, + "grad_norm": 0.21585321426391602, + "learning_rate": 0.0002749849381184622, + "loss": 0.9432, + "step": 6270 + }, + { + "epoch": 0.76, + "grad_norm": 0.203776016831398, + "learning_rate": 0.00027492801077267563, + "loss": 0.9256, + "step": 6275 + }, + { + "epoch": 0.76, + "grad_norm": 0.19271011650562286, + "learning_rate": 0.00027487102463161835, + "loss": 0.9535, + "step": 6280 + }, + { + "epoch": 0.76, + "grad_norm": 0.20827655494213104, + "learning_rate": 0.00027481397972211013, + "loss": 1.0035, + "step": 6285 + }, + { + "epoch": 0.76, + "grad_norm": 0.2051791101694107, + "learning_rate": 0.0002747568760709981, + "loss": 0.9953, + "step": 6290 + }, + { + "epoch": 0.76, + "grad_norm": 0.2057846039533615, + "learning_rate": 0.0002746997137051571, + "loss": 0.9733, + "step": 6295 + }, + { + "epoch": 0.76, + "grad_norm": 0.1999838501214981, + "learning_rate": 0.00027464249265148966, + "loss": 0.9024, + "step": 6300 + }, + { + "epoch": 0.76, + "grad_norm": 0.22202569246292114, + "learning_rate": 0.000274585212936926, + "loss": 1.0046, + "step": 6305 + }, + { + "epoch": 0.76, + "grad_norm": 0.20919722318649292, + "learning_rate": 0.00027452787458842376, + "loss": 0.9343, + "step": 6310 + }, + { + "epoch": 0.76, + "grad_norm": 0.19470633566379547, + "learning_rate": 0.0002744704776329683, + "loss": 0.9979, + "step": 6315 + }, + { + "epoch": 0.76, + "grad_norm": 0.21001891791820526, + "learning_rate": 0.0002744130220975725, + "loss": 0.9753, + "step": 6320 + }, + { + "epoch": 0.76, + "grad_norm": 0.21184568107128143, + "learning_rate": 0.0002743555080092769, + "loss": 1.0369, + "step": 6325 + }, + { + "epoch": 0.76, + "grad_norm": 0.18605372309684753, + "learning_rate": 0.00027429793539514953, + "loss": 0.9481, + "step": 6330 + }, + { + "epoch": 0.76, + "grad_norm": 0.23986585438251495, + "learning_rate": 0.0002742403042822859, + "loss": 0.9951, + "step": 6335 + }, + { + "epoch": 0.76, + "grad_norm": 0.20543424785137177, + "learning_rate": 0.00027418261469780924, + "loss": 0.9394, + "step": 6340 + }, + { + "epoch": 0.76, + "grad_norm": 0.2040044069290161, + "learning_rate": 0.00027412486666887007, + "loss": 0.9354, + "step": 6345 + }, + { + "epoch": 0.77, + "grad_norm": 0.22892433404922485, + "learning_rate": 0.0002740670602226466, + "loss": 0.9778, + "step": 6350 + }, + { + "epoch": 0.77, + "grad_norm": 0.19521504640579224, + "learning_rate": 0.00027400919538634444, + "loss": 0.958, + "step": 6355 + }, + { + "epoch": 0.77, + "grad_norm": 0.23373307287693024, + "learning_rate": 0.00027395127218719675, + "loss": 0.9722, + "step": 6360 + }, + { + "epoch": 0.77, + "grad_norm": 0.1768098771572113, + "learning_rate": 0.0002738932906524641, + "loss": 0.9762, + "step": 6365 + }, + { + "epoch": 0.77, + "grad_norm": 0.22005042433738708, + "learning_rate": 0.00027383525080943447, + "loss": 0.8488, + "step": 6370 + }, + { + "epoch": 0.77, + "grad_norm": 0.1887035220861435, + "learning_rate": 0.00027377715268542334, + "loss": 0.9401, + "step": 6375 + }, + { + "epoch": 0.77, + "grad_norm": 0.192814439535141, + "learning_rate": 0.0002737189963077737, + "loss": 0.8912, + "step": 6380 + }, + { + "epoch": 0.77, + "grad_norm": 0.2182081937789917, + "learning_rate": 0.00027366078170385573, + "loss": 0.9403, + "step": 6385 + }, + { + "epoch": 0.77, + "grad_norm": 0.1897241324186325, + "learning_rate": 0.0002736025089010673, + "loss": 0.9351, + "step": 6390 + }, + { + "epoch": 0.77, + "grad_norm": 0.21122263371944427, + "learning_rate": 0.0002735441779268335, + "loss": 1.0048, + "step": 6395 + }, + { + "epoch": 0.77, + "grad_norm": 0.19820435345172882, + "learning_rate": 0.00027348578880860677, + "loss": 0.9342, + "step": 6400 + }, + { + "epoch": 0.77, + "grad_norm": 0.1979910135269165, + "learning_rate": 0.0002734273415738669, + "loss": 0.9044, + "step": 6405 + }, + { + "epoch": 0.77, + "grad_norm": 0.19765061140060425, + "learning_rate": 0.0002733688362501213, + "loss": 1.0196, + "step": 6410 + }, + { + "epoch": 0.77, + "grad_norm": 0.19544093310832977, + "learning_rate": 0.0002733102728649044, + "loss": 0.8445, + "step": 6415 + }, + { + "epoch": 0.77, + "grad_norm": 0.20427776873111725, + "learning_rate": 0.00027325165144577804, + "loss": 0.924, + "step": 6420 + }, + { + "epoch": 0.77, + "grad_norm": 0.26921671628952026, + "learning_rate": 0.0002731929720203315, + "loss": 0.9489, + "step": 6425 + }, + { + "epoch": 0.77, + "grad_norm": 0.21001920104026794, + "learning_rate": 0.00027313423461618116, + "loss": 0.9385, + "step": 6430 + }, + { + "epoch": 0.78, + "grad_norm": 0.21902385354042053, + "learning_rate": 0.0002730754392609708, + "loss": 0.9673, + "step": 6435 + }, + { + "epoch": 0.78, + "grad_norm": 0.22075380384922028, + "learning_rate": 0.0002730165859823716, + "loss": 0.8973, + "step": 6440 + }, + { + "epoch": 0.78, + "grad_norm": 0.22786179184913635, + "learning_rate": 0.0002729576748080818, + "loss": 0.9234, + "step": 6445 + }, + { + "epoch": 0.78, + "grad_norm": 0.20489779114723206, + "learning_rate": 0.00027289870576582695, + "loss": 0.92, + "step": 6450 + }, + { + "epoch": 0.78, + "grad_norm": 0.19824951887130737, + "learning_rate": 0.0002728396788833598, + "loss": 0.9119, + "step": 6455 + }, + { + "epoch": 0.78, + "grad_norm": 0.23278149962425232, + "learning_rate": 0.0002727805941884603, + "loss": 1.0518, + "step": 6460 + }, + { + "epoch": 0.78, + "grad_norm": 0.22010761499404907, + "learning_rate": 0.00027272145170893585, + "loss": 0.9255, + "step": 6465 + }, + { + "epoch": 0.78, + "grad_norm": 0.22987917065620422, + "learning_rate": 0.00027266225147262073, + "loss": 0.9481, + "step": 6470 + }, + { + "epoch": 0.78, + "grad_norm": 0.19295156002044678, + "learning_rate": 0.00027260299350737656, + "loss": 0.9497, + "step": 6475 + }, + { + "epoch": 0.78, + "grad_norm": 0.18979798257350922, + "learning_rate": 0.0002725436778410922, + "loss": 0.8827, + "step": 6480 + }, + { + "epoch": 0.78, + "grad_norm": 0.21442236006259918, + "learning_rate": 0.00027248430450168345, + "loss": 0.8832, + "step": 6485 + }, + { + "epoch": 0.78, + "grad_norm": 0.2078474760055542, + "learning_rate": 0.0002724248735170934, + "loss": 0.9101, + "step": 6490 + }, + { + "epoch": 0.78, + "grad_norm": 0.1883726716041565, + "learning_rate": 0.00027236538491529235, + "loss": 0.8445, + "step": 6495 + }, + { + "epoch": 0.78, + "grad_norm": 0.1775909662246704, + "learning_rate": 0.0002723058387242775, + "loss": 0.9911, + "step": 6500 + }, + { + "epoch": 0.78, + "grad_norm": 0.2114265412092209, + "learning_rate": 0.0002722462349720733, + "loss": 1.0032, + "step": 6505 + }, + { + "epoch": 0.78, + "grad_norm": 0.18201524019241333, + "learning_rate": 0.0002721865736867312, + "loss": 1.0234, + "step": 6510 + }, + { + "epoch": 0.78, + "grad_norm": 0.2337150126695633, + "learning_rate": 0.00027212685489632986, + "loss": 0.9353, + "step": 6515 + }, + { + "epoch": 0.79, + "grad_norm": 0.19889463484287262, + "learning_rate": 0.0002720670786289749, + "loss": 0.8353, + "step": 6520 + }, + { + "epoch": 0.79, + "grad_norm": 0.19570806622505188, + "learning_rate": 0.000272007244912799, + "loss": 0.9103, + "step": 6525 + }, + { + "epoch": 0.79, + "grad_norm": 0.216067835688591, + "learning_rate": 0.0002719473537759619, + "loss": 0.9829, + "step": 6530 + }, + { + "epoch": 0.79, + "grad_norm": 0.23183125257492065, + "learning_rate": 0.0002718874052466504, + "loss": 0.9251, + "step": 6535 + }, + { + "epoch": 0.79, + "grad_norm": 0.22221823036670685, + "learning_rate": 0.00027182739935307826, + "loss": 0.9561, + "step": 6540 + }, + { + "epoch": 0.79, + "grad_norm": 0.22014914453029633, + "learning_rate": 0.00027176733612348616, + "loss": 0.9561, + "step": 6545 + }, + { + "epoch": 0.79, + "grad_norm": 0.20240789651870728, + "learning_rate": 0.0002717072155861419, + "loss": 0.9529, + "step": 6550 + }, + { + "epoch": 0.79, + "grad_norm": 0.181975319981575, + "learning_rate": 0.0002716470377693403, + "loss": 0.8618, + "step": 6555 + }, + { + "epoch": 0.79, + "grad_norm": 0.18764451146125793, + "learning_rate": 0.0002715868027014029, + "loss": 1.0196, + "step": 6560 + }, + { + "epoch": 0.79, + "grad_norm": 0.20951347053050995, + "learning_rate": 0.0002715265104106784, + "loss": 0.952, + "step": 6565 + }, + { + "epoch": 0.79, + "grad_norm": 0.2056897133588791, + "learning_rate": 0.0002714661609255423, + "loss": 0.9513, + "step": 6570 + }, + { + "epoch": 0.79, + "grad_norm": 0.22014079988002777, + "learning_rate": 0.0002714057542743971, + "loss": 0.9351, + "step": 6575 + }, + { + "epoch": 0.79, + "grad_norm": 0.20651625096797943, + "learning_rate": 0.00027134529048567223, + "loss": 0.9362, + "step": 6580 + }, + { + "epoch": 0.79, + "grad_norm": 0.2232450395822525, + "learning_rate": 0.00027128476958782386, + "loss": 1.0102, + "step": 6585 + }, + { + "epoch": 0.79, + "grad_norm": 0.2110331654548645, + "learning_rate": 0.00027122419160933515, + "loss": 0.9414, + "step": 6590 + }, + { + "epoch": 0.79, + "grad_norm": 0.20717830955982208, + "learning_rate": 0.0002711635565787162, + "loss": 0.9147, + "step": 6595 + }, + { + "epoch": 0.8, + "grad_norm": 0.20119911432266235, + "learning_rate": 0.00027110286452450375, + "loss": 0.892, + "step": 6600 + }, + { + "epoch": 0.8, + "grad_norm": 0.22192524373531342, + "learning_rate": 0.0002710421154752616, + "loss": 0.9916, + "step": 6605 + }, + { + "epoch": 0.8, + "grad_norm": 0.19518253207206726, + "learning_rate": 0.0002709813094595802, + "loss": 0.8616, + "step": 6610 + }, + { + "epoch": 0.8, + "grad_norm": 0.21804095804691315, + "learning_rate": 0.0002709204465060769, + "loss": 0.939, + "step": 6615 + }, + { + "epoch": 0.8, + "grad_norm": 0.23485125601291656, + "learning_rate": 0.00027085952664339586, + "loss": 0.9235, + "step": 6620 + }, + { + "epoch": 0.8, + "grad_norm": 0.21770937740802765, + "learning_rate": 0.00027079854990020793, + "loss": 0.9558, + "step": 6625 + }, + { + "epoch": 0.8, + "grad_norm": 0.19482004642486572, + "learning_rate": 0.00027073751630521083, + "loss": 0.9604, + "step": 6630 + }, + { + "epoch": 0.8, + "grad_norm": 0.1984790414571762, + "learning_rate": 0.000270676425887129, + "loss": 0.9706, + "step": 6635 + }, + { + "epoch": 0.8, + "grad_norm": 0.1953192949295044, + "learning_rate": 0.0002706152786747136, + "loss": 1.0016, + "step": 6640 + }, + { + "epoch": 0.8, + "grad_norm": 0.19475625455379486, + "learning_rate": 0.00027055407469674255, + "loss": 0.9146, + "step": 6645 + }, + { + "epoch": 0.8, + "grad_norm": 0.2099728286266327, + "learning_rate": 0.00027049281398202046, + "loss": 0.9263, + "step": 6650 + }, + { + "epoch": 0.8, + "grad_norm": 0.2410629391670227, + "learning_rate": 0.00027043149655937864, + "loss": 0.9945, + "step": 6655 + }, + { + "epoch": 0.8, + "grad_norm": 0.21162976324558258, + "learning_rate": 0.0002703701224576752, + "loss": 0.8995, + "step": 6660 + }, + { + "epoch": 0.8, + "grad_norm": 0.20428724586963654, + "learning_rate": 0.0002703086917057947, + "loss": 0.899, + "step": 6665 + }, + { + "epoch": 0.8, + "grad_norm": 0.22366644442081451, + "learning_rate": 0.0002702472043326486, + "loss": 1.0311, + "step": 6670 + }, + { + "epoch": 0.8, + "grad_norm": 0.20753750205039978, + "learning_rate": 0.00027018566036717483, + "loss": 0.9162, + "step": 6675 + }, + { + "epoch": 0.8, + "grad_norm": 0.20095977187156677, + "learning_rate": 0.00027012405983833806, + "loss": 0.9609, + "step": 6680 + }, + { + "epoch": 0.81, + "grad_norm": 0.1888018697500229, + "learning_rate": 0.00027006240277512955, + "loss": 0.9741, + "step": 6685 + }, + { + "epoch": 0.81, + "grad_norm": 0.21836860477924347, + "learning_rate": 0.0002700006892065671, + "loss": 0.9016, + "step": 6690 + }, + { + "epoch": 0.81, + "grad_norm": 0.23613397777080536, + "learning_rate": 0.0002699389191616952, + "loss": 0.8804, + "step": 6695 + }, + { + "epoch": 0.81, + "grad_norm": 0.2188320904970169, + "learning_rate": 0.00026987709266958497, + "loss": 0.8731, + "step": 6700 + }, + { + "epoch": 0.81, + "grad_norm": 0.21378405392169952, + "learning_rate": 0.0002698152097593339, + "loss": 1.0018, + "step": 6705 + }, + { + "epoch": 0.81, + "grad_norm": 0.2068474143743515, + "learning_rate": 0.0002697532704600662, + "loss": 0.8768, + "step": 6710 + }, + { + "epoch": 0.81, + "grad_norm": 0.20965330302715302, + "learning_rate": 0.0002696912748009325, + "loss": 1.071, + "step": 6715 + }, + { + "epoch": 0.81, + "grad_norm": 0.2051534354686737, + "learning_rate": 0.00026962922281111, + "loss": 0.8599, + "step": 6720 + }, + { + "epoch": 0.81, + "grad_norm": 0.20214852690696716, + "learning_rate": 0.0002695671145198026, + "loss": 0.974, + "step": 6725 + }, + { + "epoch": 0.81, + "grad_norm": 0.23140518367290497, + "learning_rate": 0.00026950494995624035, + "loss": 0.8391, + "step": 6730 + }, + { + "epoch": 0.81, + "grad_norm": 0.22693967819213867, + "learning_rate": 0.00026944272914968, + "loss": 0.9452, + "step": 6735 + }, + { + "epoch": 0.81, + "grad_norm": 0.19972123205661774, + "learning_rate": 0.00026938045212940477, + "loss": 0.8612, + "step": 6740 + }, + { + "epoch": 0.81, + "grad_norm": 0.1956896334886551, + "learning_rate": 0.00026931811892472423, + "loss": 0.9701, + "step": 6745 + }, + { + "epoch": 0.81, + "grad_norm": 0.21115869283676147, + "learning_rate": 0.00026925572956497455, + "loss": 0.8717, + "step": 6750 + }, + { + "epoch": 0.81, + "grad_norm": 0.22253437340259552, + "learning_rate": 0.00026919328407951814, + "loss": 0.9054, + "step": 6755 + }, + { + "epoch": 0.81, + "grad_norm": 0.2643227279186249, + "learning_rate": 0.0002691307824977439, + "loss": 0.961, + "step": 6760 + }, + { + "epoch": 0.82, + "grad_norm": 0.20802360773086548, + "learning_rate": 0.0002690682248490673, + "loss": 0.8992, + "step": 6765 + }, + { + "epoch": 0.82, + "grad_norm": 0.25810569524765015, + "learning_rate": 0.00026900561116292995, + "loss": 0.9011, + "step": 6770 + }, + { + "epoch": 0.82, + "grad_norm": 0.20976845920085907, + "learning_rate": 0.0002689429414687999, + "loss": 0.9485, + "step": 6775 + }, + { + "epoch": 0.82, + "grad_norm": 0.17168466746807098, + "learning_rate": 0.0002688802157961716, + "loss": 1.0081, + "step": 6780 + }, + { + "epoch": 0.82, + "grad_norm": 0.21908418834209442, + "learning_rate": 0.00026881743417456585, + "loss": 0.9562, + "step": 6785 + }, + { + "epoch": 0.82, + "grad_norm": 0.2043706625699997, + "learning_rate": 0.0002687545966335298, + "loss": 1.0096, + "step": 6790 + }, + { + "epoch": 0.82, + "grad_norm": 0.20396995544433594, + "learning_rate": 0.00026869170320263685, + "loss": 1.0249, + "step": 6795 + }, + { + "epoch": 0.82, + "grad_norm": 0.2096068263053894, + "learning_rate": 0.00026862875391148676, + "loss": 1.0383, + "step": 6800 + }, + { + "epoch": 0.82, + "grad_norm": 0.23566322028636932, + "learning_rate": 0.0002685657487897055, + "loss": 0.9589, + "step": 6805 + }, + { + "epoch": 0.82, + "grad_norm": 0.20193132758140564, + "learning_rate": 0.0002685026878669455, + "loss": 0.9232, + "step": 6810 + }, + { + "epoch": 0.82, + "grad_norm": 0.21133491396903992, + "learning_rate": 0.0002684395711728851, + "loss": 0.8744, + "step": 6815 + }, + { + "epoch": 0.82, + "grad_norm": 0.2161945253610611, + "learning_rate": 0.00026837639873722934, + "loss": 0.9788, + "step": 6820 + }, + { + "epoch": 0.82, + "grad_norm": 0.21006715297698975, + "learning_rate": 0.0002683131705897092, + "loss": 0.9007, + "step": 6825 + }, + { + "epoch": 0.82, + "grad_norm": 0.19389329850673676, + "learning_rate": 0.0002682498867600819, + "loss": 0.9501, + "step": 6830 + }, + { + "epoch": 0.82, + "grad_norm": 0.2149161845445633, + "learning_rate": 0.00026818654727813086, + "loss": 0.9366, + "step": 6835 + }, + { + "epoch": 0.82, + "grad_norm": 0.19821734726428986, + "learning_rate": 0.0002681231521736659, + "loss": 0.9437, + "step": 6840 + }, + { + "epoch": 0.82, + "grad_norm": 0.21618692576885223, + "learning_rate": 0.00026805970147652277, + "loss": 0.8672, + "step": 6845 + }, + { + "epoch": 0.83, + "grad_norm": 0.21498163044452667, + "learning_rate": 0.00026799619521656346, + "loss": 1.0109, + "step": 6850 + }, + { + "epoch": 0.83, + "grad_norm": 0.2292798012495041, + "learning_rate": 0.00026793263342367616, + "loss": 0.9201, + "step": 6855 + }, + { + "epoch": 0.83, + "grad_norm": 0.19892308115959167, + "learning_rate": 0.0002678690161277752, + "loss": 0.9206, + "step": 6860 + }, + { + "epoch": 0.83, + "grad_norm": 0.18660472333431244, + "learning_rate": 0.00026780534335880084, + "loss": 0.9114, + "step": 6865 + }, + { + "epoch": 0.83, + "grad_norm": 0.2231675237417221, + "learning_rate": 0.00026774161514671975, + "loss": 0.9933, + "step": 6870 + }, + { + "epoch": 0.83, + "grad_norm": 0.2067699432373047, + "learning_rate": 0.0002676778315215245, + "loss": 1.0153, + "step": 6875 + }, + { + "epoch": 0.83, + "grad_norm": 0.19282564520835876, + "learning_rate": 0.00026761399251323375, + "loss": 0.8656, + "step": 6880 + }, + { + "epoch": 0.83, + "grad_norm": 0.21129785478115082, + "learning_rate": 0.0002675500981518923, + "loss": 0.9499, + "step": 6885 + }, + { + "epoch": 0.83, + "grad_norm": 0.19059066474437714, + "learning_rate": 0.0002674861484675709, + "loss": 1.0097, + "step": 6890 + }, + { + "epoch": 0.83, + "grad_norm": 0.23170576989650726, + "learning_rate": 0.0002674221434903665, + "loss": 0.8877, + "step": 6895 + }, + { + "epoch": 0.83, + "grad_norm": 0.21638576686382294, + "learning_rate": 0.0002673580832504019, + "loss": 0.987, + "step": 6900 + }, + { + "epoch": 0.83, + "grad_norm": 0.22118517756462097, + "learning_rate": 0.00026729396777782597, + "loss": 0.9849, + "step": 6905 + }, + { + "epoch": 0.83, + "grad_norm": 0.20184585452079773, + "learning_rate": 0.0002672297971028136, + "loss": 0.9106, + "step": 6910 + }, + { + "epoch": 0.83, + "grad_norm": 0.21392621099948883, + "learning_rate": 0.0002671655712555656, + "loss": 1.0239, + "step": 6915 + }, + { + "epoch": 0.83, + "grad_norm": 0.2100532501935959, + "learning_rate": 0.0002671012902663089, + "loss": 0.994, + "step": 6920 + }, + { + "epoch": 0.83, + "grad_norm": 0.21326220035552979, + "learning_rate": 0.0002670369541652961, + "loss": 0.9117, + "step": 6925 + }, + { + "epoch": 0.83, + "grad_norm": 0.2225876897573471, + "learning_rate": 0.000266972562982806, + "loss": 0.9071, + "step": 6930 + }, + { + "epoch": 0.84, + "grad_norm": 0.18916994333267212, + "learning_rate": 0.00026690811674914323, + "loss": 0.9641, + "step": 6935 + }, + { + "epoch": 0.84, + "grad_norm": 0.2092379480600357, + "learning_rate": 0.0002668436154946383, + "loss": 1.061, + "step": 6940 + }, + { + "epoch": 0.84, + "grad_norm": 0.18153078854084015, + "learning_rate": 0.0002667790592496477, + "loss": 1.0148, + "step": 6945 + }, + { + "epoch": 0.84, + "grad_norm": 0.20708273351192474, + "learning_rate": 0.00026671444804455375, + "loss": 0.9417, + "step": 6950 + }, + { + "epoch": 0.84, + "grad_norm": 0.2154036909341812, + "learning_rate": 0.0002666497819097645, + "loss": 0.8497, + "step": 6955 + }, + { + "epoch": 0.84, + "grad_norm": 0.22335532307624817, + "learning_rate": 0.0002665850608757141, + "loss": 0.884, + "step": 6960 + }, + { + "epoch": 0.84, + "grad_norm": 0.2135249823331833, + "learning_rate": 0.0002665202849728624, + "loss": 0.9287, + "step": 6965 + }, + { + "epoch": 0.84, + "grad_norm": 0.2231944501399994, + "learning_rate": 0.000266455454231695, + "loss": 0.9385, + "step": 6970 + }, + { + "epoch": 0.84, + "grad_norm": 0.19477687776088715, + "learning_rate": 0.0002663905686827235, + "loss": 0.9468, + "step": 6975 + }, + { + "epoch": 0.84, + "grad_norm": 0.21901415288448334, + "learning_rate": 0.0002663256283564852, + "loss": 0.8921, + "step": 6980 + }, + { + "epoch": 0.84, + "grad_norm": 0.21255704760551453, + "learning_rate": 0.00026626063328354316, + "loss": 0.9014, + "step": 6985 + }, + { + "epoch": 0.84, + "grad_norm": 0.20495054125785828, + "learning_rate": 0.0002661955834944861, + "loss": 1.0034, + "step": 6990 + }, + { + "epoch": 0.84, + "grad_norm": 0.1916796863079071, + "learning_rate": 0.0002661304790199288, + "loss": 1.0317, + "step": 6995 + }, + { + "epoch": 0.84, + "grad_norm": 0.21294309198856354, + "learning_rate": 0.0002660653198905115, + "loss": 0.9128, + "step": 7000 + }, + { + "epoch": 0.84, + "grad_norm": 0.22985853254795074, + "learning_rate": 0.00026600010613690023, + "loss": 0.9521, + "step": 7005 + }, + { + "epoch": 0.84, + "grad_norm": 0.24661943316459656, + "learning_rate": 0.00026593483778978677, + "loss": 0.9456, + "step": 7010 + }, + { + "epoch": 0.85, + "grad_norm": 0.21790087223052979, + "learning_rate": 0.0002658695148798886, + "loss": 0.9558, + "step": 7015 + }, + { + "epoch": 0.85, + "grad_norm": 0.18873614072799683, + "learning_rate": 0.00026580413743794874, + "loss": 0.914, + "step": 7020 + }, + { + "epoch": 0.85, + "grad_norm": 0.21589027345180511, + "learning_rate": 0.0002657387054947361, + "loss": 0.9285, + "step": 7025 + }, + { + "epoch": 0.85, + "grad_norm": 0.1877526193857193, + "learning_rate": 0.0002656732190810451, + "loss": 0.9537, + "step": 7030 + }, + { + "epoch": 0.85, + "grad_norm": 0.22506104409694672, + "learning_rate": 0.0002656076782276958, + "loss": 0.8938, + "step": 7035 + }, + { + "epoch": 0.85, + "grad_norm": 0.23874376714229584, + "learning_rate": 0.000265542082965534, + "loss": 0.8486, + "step": 7040 + }, + { + "epoch": 0.85, + "grad_norm": 0.20064063370227814, + "learning_rate": 0.00026547643332543077, + "loss": 0.98, + "step": 7045 + }, + { + "epoch": 0.85, + "grad_norm": 0.22124479711055756, + "learning_rate": 0.0002654107293382833, + "loss": 0.9002, + "step": 7050 + }, + { + "epoch": 0.85, + "grad_norm": 0.1762978434562683, + "learning_rate": 0.00026534497103501383, + "loss": 0.9397, + "step": 7055 + }, + { + "epoch": 0.85, + "grad_norm": 0.21579968929290771, + "learning_rate": 0.0002652791584465706, + "loss": 0.9606, + "step": 7060 + }, + { + "epoch": 0.85, + "grad_norm": 0.2093055099248886, + "learning_rate": 0.000265213291603927, + "loss": 0.9967, + "step": 7065 + }, + { + "epoch": 0.85, + "grad_norm": 0.2238216996192932, + "learning_rate": 0.00026514737053808234, + "loss": 0.9985, + "step": 7070 + }, + { + "epoch": 0.85, + "grad_norm": 0.20095250010490417, + "learning_rate": 0.0002650813952800611, + "loss": 0.8892, + "step": 7075 + }, + { + "epoch": 0.85, + "grad_norm": 0.2064996361732483, + "learning_rate": 0.00026501536586091357, + "loss": 0.9211, + "step": 7080 + }, + { + "epoch": 0.85, + "grad_norm": 0.20481903851032257, + "learning_rate": 0.0002649492823117153, + "loss": 0.8601, + "step": 7085 + }, + { + "epoch": 0.85, + "grad_norm": 0.19760817289352417, + "learning_rate": 0.0002648831446635674, + "loss": 0.9275, + "step": 7090 + }, + { + "epoch": 0.85, + "grad_norm": 0.2292260080575943, + "learning_rate": 0.0002648169529475965, + "loss": 1.0424, + "step": 7095 + }, + { + "epoch": 0.86, + "grad_norm": 0.2114565074443817, + "learning_rate": 0.0002647507071949546, + "loss": 0.874, + "step": 7100 + }, + { + "epoch": 0.86, + "grad_norm": 0.19804421067237854, + "learning_rate": 0.00026468440743681915, + "loss": 0.9876, + "step": 7105 + }, + { + "epoch": 0.86, + "grad_norm": 0.21388956904411316, + "learning_rate": 0.000264618053704393, + "loss": 0.8863, + "step": 7110 + }, + { + "epoch": 0.86, + "grad_norm": 0.19273947179317474, + "learning_rate": 0.0002645516460289044, + "loss": 1.0307, + "step": 7115 + }, + { + "epoch": 0.86, + "grad_norm": 0.22840675711631775, + "learning_rate": 0.000264485184441607, + "loss": 0.8876, + "step": 7120 + }, + { + "epoch": 0.86, + "grad_norm": 0.22079254686832428, + "learning_rate": 0.00026441866897378, + "loss": 0.9526, + "step": 7125 + }, + { + "epoch": 0.86, + "grad_norm": 0.1948045939207077, + "learning_rate": 0.00026435209965672756, + "loss": 0.9624, + "step": 7130 + }, + { + "epoch": 0.86, + "grad_norm": 0.19133198261260986, + "learning_rate": 0.0002642854765217795, + "loss": 0.9241, + "step": 7135 + }, + { + "epoch": 0.86, + "grad_norm": 0.2186049222946167, + "learning_rate": 0.00026421879960029096, + "loss": 0.9452, + "step": 7140 + }, + { + "epoch": 0.86, + "grad_norm": 0.19075119495391846, + "learning_rate": 0.00026415206892364216, + "loss": 0.9007, + "step": 7145 + }, + { + "epoch": 0.86, + "grad_norm": 0.20891107618808746, + "learning_rate": 0.00026408528452323885, + "loss": 0.9266, + "step": 7150 + }, + { + "epoch": 0.86, + "grad_norm": 0.20901614427566528, + "learning_rate": 0.000264018446430512, + "loss": 0.9023, + "step": 7155 + }, + { + "epoch": 0.86, + "grad_norm": 0.20765335857868195, + "learning_rate": 0.0002639515546769179, + "loss": 0.9053, + "step": 7160 + }, + { + "epoch": 0.86, + "grad_norm": 0.21316955983638763, + "learning_rate": 0.0002638846092939379, + "loss": 0.8916, + "step": 7165 + }, + { + "epoch": 0.86, + "grad_norm": 0.20992538332939148, + "learning_rate": 0.00026381761031307873, + "loss": 0.968, + "step": 7170 + }, + { + "epoch": 0.86, + "grad_norm": 0.2142067402601242, + "learning_rate": 0.0002637505577658725, + "loss": 0.9669, + "step": 7175 + }, + { + "epoch": 0.87, + "grad_norm": 0.2190820276737213, + "learning_rate": 0.0002636834516838761, + "loss": 0.9586, + "step": 7180 + }, + { + "epoch": 0.87, + "grad_norm": 0.18383167684078217, + "learning_rate": 0.0002636162920986721, + "loss": 0.9392, + "step": 7185 + }, + { + "epoch": 0.87, + "grad_norm": 0.24156947433948517, + "learning_rate": 0.00026354907904186796, + "loss": 0.942, + "step": 7190 + }, + { + "epoch": 0.87, + "grad_norm": 0.2070099264383316, + "learning_rate": 0.00026348181254509635, + "loss": 0.9375, + "step": 7195 + }, + { + "epoch": 0.87, + "grad_norm": 0.22052322328090668, + "learning_rate": 0.00026341449264001516, + "loss": 0.8665, + "step": 7200 + }, + { + "epoch": 0.87, + "grad_norm": 0.20919539034366608, + "learning_rate": 0.00026334711935830735, + "loss": 0.8609, + "step": 7205 + }, + { + "epoch": 0.87, + "grad_norm": 0.24203932285308838, + "learning_rate": 0.00026327969273168104, + "loss": 0.9312, + "step": 7210 + }, + { + "epoch": 0.87, + "grad_norm": 0.19879262149333954, + "learning_rate": 0.00026321221279186944, + "loss": 0.7982, + "step": 7215 + }, + { + "epoch": 0.87, + "grad_norm": 0.190317302942276, + "learning_rate": 0.0002631446795706308, + "loss": 0.9479, + "step": 7220 + }, + { + "epoch": 0.87, + "grad_norm": 0.20319460332393646, + "learning_rate": 0.0002630770930997486, + "loss": 0.9041, + "step": 7225 + }, + { + "epoch": 0.87, + "grad_norm": 0.20878246426582336, + "learning_rate": 0.00026300945341103113, + "loss": 0.9042, + "step": 7230 + }, + { + "epoch": 0.87, + "grad_norm": 0.208158478140831, + "learning_rate": 0.000262941760536312, + "loss": 0.91, + "step": 7235 + }, + { + "epoch": 0.87, + "grad_norm": 0.2144281417131424, + "learning_rate": 0.0002628740145074497, + "loss": 0.8517, + "step": 7240 + }, + { + "epoch": 0.87, + "grad_norm": 0.1934666633605957, + "learning_rate": 0.0002628062153563277, + "loss": 0.9417, + "step": 7245 + }, + { + "epoch": 0.87, + "grad_norm": 0.2240288108587265, + "learning_rate": 0.0002627383631148546, + "loss": 0.9912, + "step": 7250 + }, + { + "epoch": 0.87, + "grad_norm": 0.2270900011062622, + "learning_rate": 0.00026267045781496384, + "loss": 0.8963, + "step": 7255 + }, + { + "epoch": 0.87, + "grad_norm": 0.2111656367778778, + "learning_rate": 0.00026260249948861406, + "loss": 0.9598, + "step": 7260 + }, + { + "epoch": 0.88, + "grad_norm": 0.21797975897789001, + "learning_rate": 0.0002625344881677885, + "loss": 0.9216, + "step": 7265 + }, + { + "epoch": 0.88, + "grad_norm": 0.20986701548099518, + "learning_rate": 0.00026246642388449575, + "loss": 0.8921, + "step": 7270 + }, + { + "epoch": 0.88, + "grad_norm": 0.21102838218212128, + "learning_rate": 0.00026239830667076897, + "loss": 1.0191, + "step": 7275 + }, + { + "epoch": 0.88, + "grad_norm": 0.2326158732175827, + "learning_rate": 0.00026233013655866646, + "loss": 0.9563, + "step": 7280 + }, + { + "epoch": 0.88, + "grad_norm": 0.1865602284669876, + "learning_rate": 0.0002622619135802713, + "loss": 0.974, + "step": 7285 + }, + { + "epoch": 0.88, + "grad_norm": 0.1943947672843933, + "learning_rate": 0.00026219363776769155, + "loss": 0.9563, + "step": 7290 + }, + { + "epoch": 0.88, + "grad_norm": 0.21199694275856018, + "learning_rate": 0.00026212530915306, + "loss": 0.9788, + "step": 7295 + }, + { + "epoch": 0.88, + "grad_norm": 0.20368671417236328, + "learning_rate": 0.0002620569277685344, + "loss": 0.9533, + "step": 7300 + }, + { + "epoch": 0.88, + "grad_norm": 0.2417079508304596, + "learning_rate": 0.00026198849364629723, + "loss": 0.9253, + "step": 7305 + }, + { + "epoch": 0.88, + "grad_norm": 0.20021148025989532, + "learning_rate": 0.00026192000681855604, + "loss": 0.9328, + "step": 7310 + }, + { + "epoch": 0.88, + "grad_norm": 0.21492332220077515, + "learning_rate": 0.00026185146731754285, + "loss": 0.9173, + "step": 7315 + }, + { + "epoch": 0.88, + "grad_norm": 0.22613677382469177, + "learning_rate": 0.00026178287517551464, + "loss": 0.9277, + "step": 7320 + }, + { + "epoch": 0.88, + "grad_norm": 0.21383176743984222, + "learning_rate": 0.0002617142304247532, + "loss": 0.9311, + "step": 7325 + }, + { + "epoch": 0.88, + "grad_norm": 0.20949169993400574, + "learning_rate": 0.00026164553309756497, + "loss": 0.9854, + "step": 7330 + }, + { + "epoch": 0.88, + "grad_norm": 0.23615580797195435, + "learning_rate": 0.00026157678322628127, + "loss": 0.9295, + "step": 7335 + }, + { + "epoch": 0.88, + "grad_norm": 0.18629109859466553, + "learning_rate": 0.00026150798084325803, + "loss": 0.9045, + "step": 7340 + }, + { + "epoch": 0.88, + "grad_norm": 0.2189890593290329, + "learning_rate": 0.00026143912598087593, + "loss": 0.9756, + "step": 7345 + }, + { + "epoch": 0.89, + "grad_norm": 0.1957770138978958, + "learning_rate": 0.00026137021867154043, + "loss": 0.8535, + "step": 7350 + }, + { + "epoch": 0.89, + "grad_norm": 0.20852722227573395, + "learning_rate": 0.00026130125894768146, + "loss": 0.8852, + "step": 7355 + }, + { + "epoch": 0.89, + "grad_norm": 0.2296961098909378, + "learning_rate": 0.0002612322468417538, + "loss": 0.929, + "step": 7360 + }, + { + "epoch": 0.89, + "grad_norm": 0.21166066825389862, + "learning_rate": 0.00026116318238623694, + "loss": 1.0121, + "step": 7365 + }, + { + "epoch": 0.89, + "grad_norm": 0.20803911983966827, + "learning_rate": 0.0002610940656136348, + "loss": 0.9132, + "step": 7370 + }, + { + "epoch": 0.89, + "grad_norm": 0.26100292801856995, + "learning_rate": 0.0002610248965564761, + "loss": 0.9956, + "step": 7375 + }, + { + "epoch": 0.89, + "grad_norm": 0.19871069490909576, + "learning_rate": 0.000260955675247314, + "loss": 0.904, + "step": 7380 + }, + { + "epoch": 0.89, + "grad_norm": 0.21508464217185974, + "learning_rate": 0.0002608864017187264, + "loss": 0.8962, + "step": 7385 + }, + { + "epoch": 0.89, + "grad_norm": 0.22008737921714783, + "learning_rate": 0.0002608170760033158, + "loss": 0.8767, + "step": 7390 + }, + { + "epoch": 0.89, + "grad_norm": 0.2107255458831787, + "learning_rate": 0.0002607476981337091, + "loss": 0.9574, + "step": 7395 + }, + { + "epoch": 0.89, + "grad_norm": 0.2372090220451355, + "learning_rate": 0.00026067826814255777, + "loss": 0.9463, + "step": 7400 + }, + { + "epoch": 0.89, + "grad_norm": 0.20857949554920197, + "learning_rate": 0.000260608786062538, + "loss": 0.9372, + "step": 7405 + }, + { + "epoch": 0.89, + "grad_norm": 0.21467188000679016, + "learning_rate": 0.0002605392519263503, + "loss": 0.9589, + "step": 7410 + }, + { + "epoch": 0.89, + "grad_norm": 0.20547367632389069, + "learning_rate": 0.0002604696657667197, + "loss": 0.9673, + "step": 7415 + }, + { + "epoch": 0.89, + "grad_norm": 0.23634222149848938, + "learning_rate": 0.00026040002761639586, + "loss": 0.8479, + "step": 7420 + }, + { + "epoch": 0.89, + "grad_norm": 0.19742855429649353, + "learning_rate": 0.0002603303375081527, + "loss": 0.9117, + "step": 7425 + }, + { + "epoch": 0.9, + "grad_norm": 0.2211044728755951, + "learning_rate": 0.0002602605954747888, + "loss": 0.9479, + "step": 7430 + }, + { + "epoch": 0.9, + "grad_norm": 0.20085452497005463, + "learning_rate": 0.000260190801549127, + "loss": 0.96, + "step": 7435 + }, + { + "epoch": 0.9, + "grad_norm": 0.20444773137569427, + "learning_rate": 0.0002601209557640147, + "loss": 0.8778, + "step": 7440 + }, + { + "epoch": 0.9, + "grad_norm": 0.21985496580600739, + "learning_rate": 0.00026005105815232364, + "loss": 0.9513, + "step": 7445 + }, + { + "epoch": 0.9, + "grad_norm": 0.22137680649757385, + "learning_rate": 0.0002599811087469498, + "loss": 1.0321, + "step": 7450 + }, + { + "epoch": 0.9, + "grad_norm": 0.20409676432609558, + "learning_rate": 0.0002599111075808139, + "loss": 0.9039, + "step": 7455 + }, + { + "epoch": 0.9, + "grad_norm": 0.20805245637893677, + "learning_rate": 0.0002598410546868608, + "loss": 0.9834, + "step": 7460 + }, + { + "epoch": 0.9, + "grad_norm": 0.23089753091335297, + "learning_rate": 0.00025977095009805957, + "loss": 0.9394, + "step": 7465 + }, + { + "epoch": 0.9, + "grad_norm": 0.2073964923620224, + "learning_rate": 0.0002597007938474039, + "loss": 0.8757, + "step": 7470 + }, + { + "epoch": 0.9, + "grad_norm": 0.22039000689983368, + "learning_rate": 0.00025963058596791157, + "loss": 0.8823, + "step": 7475 + }, + { + "epoch": 0.9, + "grad_norm": 0.20339925587177277, + "learning_rate": 0.00025956032649262475, + "loss": 0.9434, + "step": 7480 + }, + { + "epoch": 0.9, + "grad_norm": 0.2380754053592682, + "learning_rate": 0.0002594900154546099, + "loss": 0.9431, + "step": 7485 + }, + { + "epoch": 0.9, + "grad_norm": 0.22166424989700317, + "learning_rate": 0.00025941965288695776, + "loss": 0.8625, + "step": 7490 + }, + { + "epoch": 0.9, + "grad_norm": 0.19690659642219543, + "learning_rate": 0.00025934923882278325, + "loss": 0.8757, + "step": 7495 + }, + { + "epoch": 0.9, + "grad_norm": 0.225230872631073, + "learning_rate": 0.00025927877329522554, + "loss": 0.9952, + "step": 7500 + }, + { + "epoch": 0.9, + "grad_norm": 0.23189792037010193, + "learning_rate": 0.00025920825633744815, + "loss": 1.0034, + "step": 7505 + }, + { + "epoch": 0.9, + "grad_norm": 0.22297634184360504, + "learning_rate": 0.0002591376879826386, + "loss": 0.9525, + "step": 7510 + }, + { + "epoch": 0.91, + "grad_norm": 0.2259618490934372, + "learning_rate": 0.00025906706826400863, + "loss": 0.9325, + "step": 7515 + }, + { + "epoch": 0.91, + "grad_norm": 0.21782998740673065, + "learning_rate": 0.0002589963972147945, + "loss": 0.9086, + "step": 7520 + }, + { + "epoch": 0.91, + "grad_norm": 0.2169419527053833, + "learning_rate": 0.000258925674868256, + "loss": 0.9848, + "step": 7525 + }, + { + "epoch": 0.91, + "grad_norm": 0.20681601762771606, + "learning_rate": 0.00025885490125767774, + "loss": 0.9753, + "step": 7530 + }, + { + "epoch": 0.91, + "grad_norm": 0.21519158780574799, + "learning_rate": 0.00025878407641636794, + "loss": 0.9679, + "step": 7535 + }, + { + "epoch": 0.91, + "grad_norm": 0.22115248441696167, + "learning_rate": 0.00025871320037765917, + "loss": 0.8951, + "step": 7540 + }, + { + "epoch": 0.91, + "grad_norm": 0.21856175363063812, + "learning_rate": 0.0002586422731749081, + "loss": 0.9302, + "step": 7545 + }, + { + "epoch": 0.91, + "grad_norm": 0.21080365777015686, + "learning_rate": 0.0002585712948414953, + "loss": 0.9291, + "step": 7550 + }, + { + "epoch": 0.91, + "grad_norm": 0.210664302110672, + "learning_rate": 0.0002585002654108257, + "loss": 0.9287, + "step": 7555 + }, + { + "epoch": 0.91, + "grad_norm": 0.19781038165092468, + "learning_rate": 0.0002584291849163279, + "loss": 0.9384, + "step": 7560 + }, + { + "epoch": 0.91, + "grad_norm": 0.22275352478027344, + "learning_rate": 0.0002583580533914549, + "loss": 0.824, + "step": 7565 + }, + { + "epoch": 0.91, + "grad_norm": 0.2083931863307953, + "learning_rate": 0.00025828687086968354, + "loss": 0.9764, + "step": 7570 + }, + { + "epoch": 0.91, + "grad_norm": 0.20877456665039062, + "learning_rate": 0.00025821563738451464, + "loss": 0.9347, + "step": 7575 + }, + { + "epoch": 0.91, + "grad_norm": 0.1873340606689453, + "learning_rate": 0.00025814435296947307, + "loss": 0.9142, + "step": 7580 + }, + { + "epoch": 0.91, + "grad_norm": 0.2477707713842392, + "learning_rate": 0.0002580730176581076, + "loss": 0.9027, + "step": 7585 + }, + { + "epoch": 0.91, + "grad_norm": 0.23362557590007782, + "learning_rate": 0.000258001631483991, + "loss": 0.8038, + "step": 7590 + }, + { + "epoch": 0.92, + "grad_norm": 0.22810354828834534, + "learning_rate": 0.00025793019448072007, + "loss": 0.9889, + "step": 7595 + }, + { + "epoch": 0.92, + "grad_norm": 0.2291039526462555, + "learning_rate": 0.0002578587066819153, + "loss": 0.9264, + "step": 7600 + }, + { + "epoch": 0.92, + "grad_norm": 0.22235728800296783, + "learning_rate": 0.00025778716812122136, + "loss": 0.9701, + "step": 7605 + }, + { + "epoch": 0.92, + "grad_norm": 0.21946272253990173, + "learning_rate": 0.00025771557883230657, + "loss": 0.8935, + "step": 7610 + }, + { + "epoch": 0.92, + "grad_norm": 0.21064096689224243, + "learning_rate": 0.00025764393884886324, + "loss": 0.9104, + "step": 7615 + }, + { + "epoch": 0.92, + "grad_norm": 0.2479812055826187, + "learning_rate": 0.00025757224820460755, + "loss": 0.8872, + "step": 7620 + }, + { + "epoch": 0.92, + "grad_norm": 0.2186814844608307, + "learning_rate": 0.0002575005069332795, + "loss": 0.9364, + "step": 7625 + }, + { + "epoch": 0.92, + "grad_norm": 0.2108563780784607, + "learning_rate": 0.00025742871506864295, + "loss": 0.8766, + "step": 7630 + }, + { + "epoch": 0.92, + "grad_norm": 0.213986337184906, + "learning_rate": 0.0002573568726444854, + "loss": 0.9053, + "step": 7635 + }, + { + "epoch": 0.92, + "grad_norm": 0.23144182562828064, + "learning_rate": 0.0002572849796946184, + "loss": 0.9933, + "step": 7640 + }, + { + "epoch": 0.92, + "grad_norm": 0.2001713067293167, + "learning_rate": 0.00025721303625287717, + "loss": 0.9321, + "step": 7645 + }, + { + "epoch": 0.92, + "grad_norm": 0.19496455788612366, + "learning_rate": 0.00025714104235312064, + "loss": 0.861, + "step": 7650 + }, + { + "epoch": 0.92, + "grad_norm": 0.19990824162960052, + "learning_rate": 0.0002570689980292315, + "loss": 0.9007, + "step": 7655 + }, + { + "epoch": 0.92, + "grad_norm": 0.2162327915430069, + "learning_rate": 0.0002569969033151163, + "loss": 0.9936, + "step": 7660 + }, + { + "epoch": 0.92, + "grad_norm": 0.20773164927959442, + "learning_rate": 0.00025692475824470504, + "loss": 0.9124, + "step": 7665 + }, + { + "epoch": 0.92, + "grad_norm": 0.19005447626113892, + "learning_rate": 0.0002568525628519518, + "loss": 1.0014, + "step": 7670 + }, + { + "epoch": 0.92, + "grad_norm": 0.18565884232521057, + "learning_rate": 0.00025678031717083394, + "loss": 0.8888, + "step": 7675 + }, + { + "epoch": 0.93, + "grad_norm": 0.21810932457447052, + "learning_rate": 0.0002567080212353528, + "loss": 0.8517, + "step": 7680 + }, + { + "epoch": 0.93, + "grad_norm": 0.21368250250816345, + "learning_rate": 0.00025663567507953314, + "loss": 0.9296, + "step": 7685 + }, + { + "epoch": 0.93, + "grad_norm": 0.19680459797382355, + "learning_rate": 0.0002565632787374236, + "loss": 1.0182, + "step": 7690 + }, + { + "epoch": 0.93, + "grad_norm": 0.23982003331184387, + "learning_rate": 0.00025649083224309617, + "loss": 0.9503, + "step": 7695 + }, + { + "epoch": 0.93, + "grad_norm": 0.24432729184627533, + "learning_rate": 0.00025641833563064666, + "loss": 0.903, + "step": 7700 + }, + { + "epoch": 0.93, + "grad_norm": 0.20875149965286255, + "learning_rate": 0.00025634578893419434, + "loss": 0.8951, + "step": 7705 + }, + { + "epoch": 0.93, + "grad_norm": 0.21281814575195312, + "learning_rate": 0.00025627319218788215, + "loss": 0.9059, + "step": 7710 + }, + { + "epoch": 0.93, + "grad_norm": 0.226210355758667, + "learning_rate": 0.0002562005454258765, + "loss": 0.9195, + "step": 7715 + }, + { + "epoch": 0.93, + "grad_norm": 0.22133582830429077, + "learning_rate": 0.0002561278486823673, + "loss": 0.8901, + "step": 7720 + }, + { + "epoch": 0.93, + "grad_norm": 0.24061179161071777, + "learning_rate": 0.00025605510199156817, + "loss": 0.9548, + "step": 7725 + }, + { + "epoch": 0.93, + "grad_norm": 0.20201779901981354, + "learning_rate": 0.000255982305387716, + "loss": 0.8928, + "step": 7730 + }, + { + "epoch": 0.93, + "grad_norm": 0.23118892312049866, + "learning_rate": 0.00025590945890507146, + "loss": 0.9098, + "step": 7735 + }, + { + "epoch": 0.93, + "grad_norm": 0.3037243187427521, + "learning_rate": 0.00025583656257791834, + "loss": 0.8707, + "step": 7740 + }, + { + "epoch": 0.93, + "grad_norm": 0.2243233621120453, + "learning_rate": 0.0002557636164405641, + "loss": 0.8642, + "step": 7745 + }, + { + "epoch": 0.93, + "grad_norm": 0.1916697919368744, + "learning_rate": 0.0002556906205273398, + "loss": 0.9297, + "step": 7750 + }, + { + "epoch": 0.93, + "grad_norm": 0.18961936235427856, + "learning_rate": 0.00025561757487259953, + "loss": 0.8665, + "step": 7755 + }, + { + "epoch": 0.93, + "grad_norm": 0.21484404802322388, + "learning_rate": 0.000255544479510721, + "loss": 0.8957, + "step": 7760 + }, + { + "epoch": 0.94, + "grad_norm": 0.2340548038482666, + "learning_rate": 0.0002554713344761055, + "loss": 0.8395, + "step": 7765 + }, + { + "epoch": 0.94, + "grad_norm": 0.20856556296348572, + "learning_rate": 0.00025539813980317733, + "loss": 0.9225, + "step": 7770 + }, + { + "epoch": 0.94, + "grad_norm": 0.21993188560009003, + "learning_rate": 0.00025532489552638446, + "loss": 0.9975, + "step": 7775 + }, + { + "epoch": 0.94, + "grad_norm": 0.21606536209583282, + "learning_rate": 0.000255251601680198, + "loss": 0.9617, + "step": 7780 + }, + { + "epoch": 0.94, + "grad_norm": 0.23577193915843964, + "learning_rate": 0.00025517825829911246, + "loss": 0.9713, + "step": 7785 + }, + { + "epoch": 0.94, + "grad_norm": 0.2203446924686432, + "learning_rate": 0.0002551048654176457, + "loss": 0.984, + "step": 7790 + }, + { + "epoch": 0.94, + "grad_norm": 0.23059968650341034, + "learning_rate": 0.0002550314230703389, + "loss": 0.9334, + "step": 7795 + }, + { + "epoch": 0.94, + "grad_norm": 0.2229011207818985, + "learning_rate": 0.0002549579312917564, + "loss": 1.0343, + "step": 7800 + }, + { + "epoch": 0.94, + "grad_norm": 0.18573442101478577, + "learning_rate": 0.0002548843901164859, + "loss": 0.8456, + "step": 7805 + }, + { + "epoch": 0.94, + "grad_norm": 0.2154776155948639, + "learning_rate": 0.00025481079957913826, + "loss": 0.983, + "step": 7810 + }, + { + "epoch": 0.94, + "grad_norm": 0.20241975784301758, + "learning_rate": 0.0002547371597143477, + "loss": 1.0003, + "step": 7815 + }, + { + "epoch": 0.94, + "grad_norm": 0.2103583961725235, + "learning_rate": 0.0002546634705567716, + "loss": 0.9616, + "step": 7820 + }, + { + "epoch": 0.94, + "grad_norm": 0.1959349662065506, + "learning_rate": 0.0002545897321410905, + "loss": 1.0201, + "step": 7825 + }, + { + "epoch": 0.94, + "grad_norm": 0.22472964227199554, + "learning_rate": 0.00025451594450200804, + "loss": 0.997, + "step": 7830 + }, + { + "epoch": 0.94, + "grad_norm": 0.2131294161081314, + "learning_rate": 0.0002544421076742513, + "loss": 0.9673, + "step": 7835 + }, + { + "epoch": 0.94, + "grad_norm": 0.21508903801441193, + "learning_rate": 0.00025436822169257027, + "loss": 0.9775, + "step": 7840 + }, + { + "epoch": 0.95, + "grad_norm": 0.2155522108078003, + "learning_rate": 0.00025429428659173815, + "loss": 0.7657, + "step": 7845 + }, + { + "epoch": 0.95, + "grad_norm": 0.22796915471553802, + "learning_rate": 0.00025422030240655123, + "loss": 0.9161, + "step": 7850 + }, + { + "epoch": 0.95, + "grad_norm": 0.19367897510528564, + "learning_rate": 0.000254146269171829, + "loss": 0.9141, + "step": 7855 + }, + { + "epoch": 0.95, + "grad_norm": 0.2098265439271927, + "learning_rate": 0.00025407218692241384, + "loss": 1.0136, + "step": 7860 + }, + { + "epoch": 0.95, + "grad_norm": 0.20742662250995636, + "learning_rate": 0.00025399805569317145, + "loss": 1.0097, + "step": 7865 + }, + { + "epoch": 0.95, + "grad_norm": 0.19661657512187958, + "learning_rate": 0.00025392387551899034, + "loss": 0.8654, + "step": 7870 + }, + { + "epoch": 0.95, + "grad_norm": 0.2003818154335022, + "learning_rate": 0.0002538496464347822, + "loss": 0.9325, + "step": 7875 + }, + { + "epoch": 0.95, + "grad_norm": 0.21950916945934296, + "learning_rate": 0.0002537753684754817, + "loss": 0.9218, + "step": 7880 + }, + { + "epoch": 0.95, + "grad_norm": 0.2205251157283783, + "learning_rate": 0.00025370104167604657, + "loss": 0.9599, + "step": 7885 + }, + { + "epoch": 0.95, + "grad_norm": 0.21949486434459686, + "learning_rate": 0.00025362666607145744, + "loss": 0.8522, + "step": 7890 + }, + { + "epoch": 0.95, + "grad_norm": 0.20743881165981293, + "learning_rate": 0.00025355224169671786, + "loss": 0.9354, + "step": 7895 + }, + { + "epoch": 0.95, + "grad_norm": 0.19251248240470886, + "learning_rate": 0.0002534777685868545, + "loss": 0.9454, + "step": 7900 + }, + { + "epoch": 0.95, + "grad_norm": 0.22613319754600525, + "learning_rate": 0.00025340324677691685, + "loss": 0.929, + "step": 7905 + }, + { + "epoch": 0.95, + "grad_norm": 0.206687793135643, + "learning_rate": 0.00025332867630197735, + "loss": 0.8953, + "step": 7910 + }, + { + "epoch": 0.95, + "grad_norm": 0.19443422555923462, + "learning_rate": 0.0002532540571971313, + "loss": 0.9731, + "step": 7915 + }, + { + "epoch": 0.95, + "grad_norm": 0.20105896890163422, + "learning_rate": 0.00025317938949749705, + "loss": 0.924, + "step": 7920 + }, + { + "epoch": 0.95, + "grad_norm": 0.2265363335609436, + "learning_rate": 0.0002531046732382156, + "loss": 0.9835, + "step": 7925 + }, + { + "epoch": 0.96, + "grad_norm": 0.19356288015842438, + "learning_rate": 0.00025302990845445087, + "loss": 1.0531, + "step": 7930 + }, + { + "epoch": 0.96, + "grad_norm": 0.1920010894536972, + "learning_rate": 0.00025295509518138975, + "loss": 0.9466, + "step": 7935 + }, + { + "epoch": 0.96, + "grad_norm": 0.22366103529930115, + "learning_rate": 0.00025288023345424176, + "loss": 0.9589, + "step": 7940 + }, + { + "epoch": 0.96, + "grad_norm": 0.22013331949710846, + "learning_rate": 0.00025280532330823944, + "loss": 1.0145, + "step": 7945 + }, + { + "epoch": 0.96, + "grad_norm": 0.24709923565387726, + "learning_rate": 0.00025273036477863785, + "loss": 1.0086, + "step": 7950 + }, + { + "epoch": 0.96, + "grad_norm": 0.2514353096485138, + "learning_rate": 0.00025265535790071505, + "loss": 0.9726, + "step": 7955 + }, + { + "epoch": 0.96, + "grad_norm": 0.2117491513490677, + "learning_rate": 0.0002525803027097717, + "loss": 0.8978, + "step": 7960 + }, + { + "epoch": 0.96, + "grad_norm": 0.21775074303150177, + "learning_rate": 0.0002525051992411314, + "loss": 0.8716, + "step": 7965 + }, + { + "epoch": 0.96, + "grad_norm": 0.20569385588169098, + "learning_rate": 0.0002524300475301402, + "loss": 0.8722, + "step": 7970 + }, + { + "epoch": 0.96, + "grad_norm": 0.19833216071128845, + "learning_rate": 0.00025235484761216697, + "loss": 0.859, + "step": 7975 + }, + { + "epoch": 0.96, + "grad_norm": 0.19562938809394836, + "learning_rate": 0.00025227959952260344, + "loss": 0.9029, + "step": 7980 + }, + { + "epoch": 0.96, + "grad_norm": 0.19179534912109375, + "learning_rate": 0.00025220430329686377, + "loss": 0.9367, + "step": 7985 + }, + { + "epoch": 0.96, + "grad_norm": 0.20224545896053314, + "learning_rate": 0.0002521289589703848, + "loss": 0.8961, + "step": 7990 + }, + { + "epoch": 0.96, + "grad_norm": 0.2402665615081787, + "learning_rate": 0.0002520535665786262, + "loss": 0.8604, + "step": 7995 + }, + { + "epoch": 0.96, + "grad_norm": 0.24251817166805267, + "learning_rate": 0.00025197812615707007, + "loss": 0.9031, + "step": 8000 + }, + { + "epoch": 0.96, + "grad_norm": 0.23049946129322052, + "learning_rate": 0.00025190263774122113, + "loss": 0.8619, + "step": 8005 + }, + { + "epoch": 0.97, + "grad_norm": 0.2142931967973709, + "learning_rate": 0.0002518271013666068, + "loss": 0.8672, + "step": 8010 + }, + { + "epoch": 0.97, + "grad_norm": 0.21534845232963562, + "learning_rate": 0.0002517515170687771, + "loss": 0.9955, + "step": 8015 + }, + { + "epoch": 0.97, + "grad_norm": 0.19614648818969727, + "learning_rate": 0.0002516758848833043, + "loss": 0.8943, + "step": 8020 + }, + { + "epoch": 0.97, + "grad_norm": 0.21292729675769806, + "learning_rate": 0.0002516002048457835, + "loss": 0.9365, + "step": 8025 + }, + { + "epoch": 0.97, + "grad_norm": 0.20765379071235657, + "learning_rate": 0.0002515244769918323, + "loss": 0.8914, + "step": 8030 + }, + { + "epoch": 0.97, + "grad_norm": 0.24531885981559753, + "learning_rate": 0.00025144870135709077, + "loss": 0.9454, + "step": 8035 + }, + { + "epoch": 0.97, + "grad_norm": 0.21342940628528595, + "learning_rate": 0.0002513728779772213, + "loss": 0.9629, + "step": 8040 + }, + { + "epoch": 0.97, + "grad_norm": 0.20636995136737823, + "learning_rate": 0.00025129700688790896, + "loss": 0.9343, + "step": 8045 + }, + { + "epoch": 0.97, + "grad_norm": 0.20889271795749664, + "learning_rate": 0.00025122108812486124, + "loss": 0.9349, + "step": 8050 + }, + { + "epoch": 0.97, + "grad_norm": 0.2028619647026062, + "learning_rate": 0.000251145121723808, + "loss": 0.9775, + "step": 8055 + }, + { + "epoch": 0.97, + "grad_norm": 0.20538374781608582, + "learning_rate": 0.0002510691077205015, + "loss": 0.8841, + "step": 8060 + }, + { + "epoch": 0.97, + "grad_norm": 0.23155823349952698, + "learning_rate": 0.0002509930461507166, + "loss": 0.9182, + "step": 8065 + }, + { + "epoch": 0.97, + "grad_norm": 0.21618753671646118, + "learning_rate": 0.00025091693705025023, + "loss": 0.8398, + "step": 8070 + }, + { + "epoch": 0.97, + "grad_norm": 0.2337218075990677, + "learning_rate": 0.00025084078045492194, + "loss": 0.9557, + "step": 8075 + }, + { + "epoch": 0.97, + "grad_norm": 0.2220161408185959, + "learning_rate": 0.0002507645764005736, + "loss": 1.008, + "step": 8080 + }, + { + "epoch": 0.97, + "grad_norm": 0.20406211912631989, + "learning_rate": 0.00025068832492306924, + "loss": 0.8859, + "step": 8085 + }, + { + "epoch": 0.97, + "grad_norm": 0.22060614824295044, + "learning_rate": 0.0002506120260582955, + "loss": 0.936, + "step": 8090 + }, + { + "epoch": 0.98, + "grad_norm": 0.2314573973417282, + "learning_rate": 0.000250535679842161, + "loss": 0.9284, + "step": 8095 + }, + { + "epoch": 0.98, + "grad_norm": 0.22508397698402405, + "learning_rate": 0.00025045928631059694, + "loss": 0.9302, + "step": 8100 + }, + { + "epoch": 0.98, + "grad_norm": 0.2177351713180542, + "learning_rate": 0.00025038284549955655, + "loss": 0.9366, + "step": 8105 + }, + { + "epoch": 0.98, + "grad_norm": 0.2541705369949341, + "learning_rate": 0.0002503063574450155, + "loss": 0.9312, + "step": 8110 + }, + { + "epoch": 0.98, + "grad_norm": 0.22795630991458893, + "learning_rate": 0.0002502298221829715, + "loss": 0.8796, + "step": 8115 + }, + { + "epoch": 0.98, + "grad_norm": 0.23147457838058472, + "learning_rate": 0.0002501532397494447, + "loss": 0.9392, + "step": 8120 + }, + { + "epoch": 0.98, + "grad_norm": 0.21217665076255798, + "learning_rate": 0.0002500766101804773, + "loss": 0.9313, + "step": 8125 + }, + { + "epoch": 0.98, + "grad_norm": 0.2149830311536789, + "learning_rate": 0.0002499999335121337, + "loss": 0.8611, + "step": 8130 + }, + { + "epoch": 0.98, + "grad_norm": 0.22693344950675964, + "learning_rate": 0.0002499232097805004, + "loss": 0.9367, + "step": 8135 + }, + { + "epoch": 0.98, + "grad_norm": 0.23371291160583496, + "learning_rate": 0.0002498464390216864, + "loss": 0.9207, + "step": 8140 + }, + { + "epoch": 0.98, + "grad_norm": 0.2417604625225067, + "learning_rate": 0.00024976962127182224, + "loss": 0.9299, + "step": 8145 + }, + { + "epoch": 0.98, + "grad_norm": 0.2232208102941513, + "learning_rate": 0.00024969275656706115, + "loss": 0.9032, + "step": 8150 + }, + { + "epoch": 0.98, + "grad_norm": 0.23492024838924408, + "learning_rate": 0.0002496158449435781, + "loss": 0.8478, + "step": 8155 + }, + { + "epoch": 0.98, + "grad_norm": 0.22294215857982635, + "learning_rate": 0.00024953888643757026, + "loss": 0.9258, + "step": 8160 + }, + { + "epoch": 0.98, + "grad_norm": 0.20344236493110657, + "learning_rate": 0.0002494618810852569, + "loss": 0.8963, + "step": 8165 + }, + { + "epoch": 0.98, + "grad_norm": 0.1954520046710968, + "learning_rate": 0.0002493848289228793, + "loss": 0.8931, + "step": 8170 + }, + { + "epoch": 0.98, + "grad_norm": 0.2296024113893509, + "learning_rate": 0.00024930772998670074, + "loss": 0.931, + "step": 8175 + }, + { + "epoch": 0.99, + "grad_norm": 0.19687330722808838, + "learning_rate": 0.00024923058431300653, + "loss": 0.8976, + "step": 8180 + }, + { + "epoch": 0.99, + "grad_norm": 0.20918093621730804, + "learning_rate": 0.000249153391938104, + "loss": 0.9273, + "step": 8185 + }, + { + "epoch": 0.99, + "grad_norm": 0.2444303333759308, + "learning_rate": 0.0002490761528983224, + "loss": 0.9052, + "step": 8190 + }, + { + "epoch": 0.99, + "grad_norm": 0.2560185194015503, + "learning_rate": 0.00024899886723001307, + "loss": 0.8845, + "step": 8195 + }, + { + "epoch": 0.99, + "grad_norm": 0.20742972195148468, + "learning_rate": 0.00024892153496954917, + "loss": 0.8958, + "step": 8200 + }, + { + "epoch": 0.99, + "grad_norm": 0.25156137347221375, + "learning_rate": 0.0002488441561533258, + "loss": 0.862, + "step": 8205 + }, + { + "epoch": 0.99, + "grad_norm": 0.22066602110862732, + "learning_rate": 0.00024876673081776, + "loss": 0.9099, + "step": 8210 + }, + { + "epoch": 0.99, + "grad_norm": 0.2297365367412567, + "learning_rate": 0.00024868925899929084, + "loss": 0.8339, + "step": 8215 + }, + { + "epoch": 0.99, + "grad_norm": 0.23268838226795197, + "learning_rate": 0.0002486117407343789, + "loss": 0.8629, + "step": 8220 + }, + { + "epoch": 0.99, + "grad_norm": 0.20442204177379608, + "learning_rate": 0.0002485341760595071, + "loss": 0.929, + "step": 8225 + }, + { + "epoch": 0.99, + "grad_norm": 0.2177409529685974, + "learning_rate": 0.0002484565650111798, + "loss": 0.9028, + "step": 8230 + }, + { + "epoch": 0.99, + "grad_norm": 0.2090945988893509, + "learning_rate": 0.0002483789076259233, + "loss": 0.9519, + "step": 8235 + }, + { + "epoch": 0.99, + "grad_norm": 0.20811286568641663, + "learning_rate": 0.00024830120394028586, + "loss": 0.9343, + "step": 8240 + }, + { + "epoch": 0.99, + "grad_norm": 0.21751175820827484, + "learning_rate": 0.0002482234539908374, + "loss": 0.9037, + "step": 8245 + }, + { + "epoch": 0.99, + "grad_norm": 0.20378360152244568, + "learning_rate": 0.0002481456578141695, + "loss": 0.9319, + "step": 8250 + }, + { + "epoch": 0.99, + "grad_norm": 0.21400323510169983, + "learning_rate": 0.00024806781544689575, + "loss": 0.8737, + "step": 8255 + }, + { + "epoch": 1.0, + "grad_norm": 0.19827169179916382, + "learning_rate": 0.00024798992692565136, + "loss": 0.9123, + "step": 8260 + }, + { + "epoch": 1.0, + "grad_norm": 0.23193618655204773, + "learning_rate": 0.00024791199228709317, + "loss": 0.9127, + "step": 8265 + }, + { + "epoch": 1.0, + "grad_norm": 0.24364005029201508, + "learning_rate": 0.00024783401156789985, + "loss": 0.9055, + "step": 8270 + }, + { + "epoch": 1.0, + "grad_norm": 0.23433059453964233, + "learning_rate": 0.00024775598480477175, + "loss": 1.01, + "step": 8275 + }, + { + "epoch": 1.0, + "grad_norm": 0.20962095260620117, + "learning_rate": 0.0002476779120344308, + "loss": 0.8084, + "step": 8280 + }, + { + "epoch": 1.0, + "grad_norm": 0.2257934808731079, + "learning_rate": 0.00024759979329362067, + "loss": 0.9176, + "step": 8285 + }, + { + "epoch": 1.0, + "grad_norm": 0.21964751183986664, + "learning_rate": 0.0002475216286191067, + "loss": 0.8642, + "step": 8290 + }, + { + "epoch": 1.0, + "grad_norm": 0.21779011189937592, + "learning_rate": 0.00024744341804767555, + "loss": 0.9282, + "step": 8295 + }, + { + "epoch": 1.0, + "grad_norm": 0.21334509551525116, + "learning_rate": 0.000247365161616136, + "loss": 0.8526, + "step": 8300 + }, + { + "epoch": 1.0, + "grad_norm": 0.21431876718997955, + "learning_rate": 0.00024728685936131794, + "loss": 0.8295, + "step": 8305 + }, + { + "epoch": 1.0, + "grad_norm": 0.22023451328277588, + "learning_rate": 0.0002472085113200731, + "loss": 0.9086, + "step": 8310 + }, + { + "epoch": 1.0, + "grad_norm": 0.20858727395534515, + "learning_rate": 0.0002471301175292746, + "loss": 0.9956, + "step": 8315 + }, + { + "epoch": 1.0, + "grad_norm": 0.19900618493556976, + "learning_rate": 0.00024705167802581727, + "loss": 0.9715, + "step": 8320 + }, + { + "epoch": 1.0, + "grad_norm": 0.21103012561798096, + "learning_rate": 0.0002469731928466172, + "loss": 0.8762, + "step": 8325 + }, + { + "epoch": 1.0, + "grad_norm": 0.2619311809539795, + "learning_rate": 0.0002468946620286122, + "loss": 0.9099, + "step": 8330 + }, + { + "epoch": 1.0, + "grad_norm": 0.22611823678016663, + "learning_rate": 0.0002468160856087615, + "loss": 0.9334, + "step": 8335 + }, + { + "epoch": 1.0, + "grad_norm": 0.20762339234352112, + "learning_rate": 0.0002467374636240458, + "loss": 0.9715, + "step": 8340 + }, + { + "epoch": 1.01, + "grad_norm": 0.20186099410057068, + "learning_rate": 0.0002466587961114671, + "loss": 0.8772, + "step": 8345 + }, + { + "epoch": 1.01, + "grad_norm": 0.2584278881549835, + "learning_rate": 0.000246580083108049, + "loss": 0.9342, + "step": 8350 + }, + { + "epoch": 1.01, + "grad_norm": 0.20932774245738983, + "learning_rate": 0.0002465013246508365, + "loss": 0.9424, + "step": 8355 + }, + { + "epoch": 1.01, + "grad_norm": 0.22265595197677612, + "learning_rate": 0.0002464225207768959, + "loss": 0.8854, + "step": 8360 + }, + { + "epoch": 1.01, + "grad_norm": 0.24274423718452454, + "learning_rate": 0.0002463436715233149, + "loss": 0.9185, + "step": 8365 + }, + { + "epoch": 1.01, + "grad_norm": 0.22588522732257843, + "learning_rate": 0.0002462647769272027, + "loss": 0.8415, + "step": 8370 + }, + { + "epoch": 1.01, + "grad_norm": 0.21932215988636017, + "learning_rate": 0.00024618583702568954, + "loss": 0.9426, + "step": 8375 + }, + { + "epoch": 1.01, + "grad_norm": 0.2024673968553543, + "learning_rate": 0.0002461068518559273, + "loss": 0.833, + "step": 8380 + }, + { + "epoch": 1.01, + "grad_norm": 0.22020402550697327, + "learning_rate": 0.00024602782145508885, + "loss": 0.8884, + "step": 8385 + }, + { + "epoch": 1.01, + "grad_norm": 0.24328754842281342, + "learning_rate": 0.00024594874586036876, + "loss": 0.8266, + "step": 8390 + }, + { + "epoch": 1.01, + "grad_norm": 0.23017850518226624, + "learning_rate": 0.00024586962510898244, + "loss": 0.96, + "step": 8395 + }, + { + "epoch": 1.01, + "grad_norm": 0.22810372710227966, + "learning_rate": 0.0002457904592381668, + "loss": 0.9248, + "step": 8400 + }, + { + "epoch": 1.01, + "grad_norm": 0.23655059933662415, + "learning_rate": 0.00024571124828518003, + "loss": 0.9204, + "step": 8405 + }, + { + "epoch": 1.01, + "grad_norm": 0.24605585634708405, + "learning_rate": 0.0002456319922873013, + "loss": 0.8909, + "step": 8410 + }, + { + "epoch": 1.01, + "grad_norm": 0.22107426822185516, + "learning_rate": 0.00024555269128183116, + "loss": 0.8827, + "step": 8415 + }, + { + "epoch": 1.01, + "grad_norm": 0.24414613842964172, + "learning_rate": 0.00024547334530609124, + "loss": 0.8416, + "step": 8420 + }, + { + "epoch": 1.02, + "grad_norm": 0.23650889098644257, + "learning_rate": 0.00024539395439742453, + "loss": 0.8868, + "step": 8425 + }, + { + "epoch": 1.02, + "grad_norm": 0.23410624265670776, + "learning_rate": 0.000245314518593195, + "loss": 1.0127, + "step": 8430 + }, + { + "epoch": 1.02, + "grad_norm": 0.2250613421201706, + "learning_rate": 0.0002452350379307876, + "loss": 0.9244, + "step": 8435 + }, + { + "epoch": 1.02, + "grad_norm": 0.22231152653694153, + "learning_rate": 0.00024515551244760865, + "loss": 0.9534, + "step": 8440 + }, + { + "epoch": 1.02, + "grad_norm": 0.2265365868806839, + "learning_rate": 0.0002450759421810856, + "loss": 0.8905, + "step": 8445 + }, + { + "epoch": 1.02, + "grad_norm": 0.21347838640213013, + "learning_rate": 0.0002449963271686668, + "loss": 0.9037, + "step": 8450 + }, + { + "epoch": 1.02, + "grad_norm": 0.23790526390075684, + "learning_rate": 0.0002449166674478217, + "loss": 0.8819, + "step": 8455 + }, + { + "epoch": 1.02, + "grad_norm": 0.2057124823331833, + "learning_rate": 0.0002448369630560408, + "loss": 0.9506, + "step": 8460 + }, + { + "epoch": 1.02, + "grad_norm": 0.22661954164505005, + "learning_rate": 0.00024475721403083566, + "loss": 0.8741, + "step": 8465 + }, + { + "epoch": 1.02, + "grad_norm": 0.2240198701620102, + "learning_rate": 0.0002446774204097388, + "loss": 0.8722, + "step": 8470 + }, + { + "epoch": 1.02, + "grad_norm": 0.19682292640209198, + "learning_rate": 0.0002445975822303038, + "loss": 0.8477, + "step": 8475 + }, + { + "epoch": 1.02, + "grad_norm": 0.20487092435359955, + "learning_rate": 0.00024451769953010504, + "loss": 0.8088, + "step": 8480 + }, + { + "epoch": 1.02, + "grad_norm": 0.23260267078876495, + "learning_rate": 0.00024443777234673807, + "loss": 0.8667, + "step": 8485 + }, + { + "epoch": 1.02, + "grad_norm": 0.2150842696428299, + "learning_rate": 0.00024435780071781926, + "loss": 0.8324, + "step": 8490 + }, + { + "epoch": 1.02, + "grad_norm": 0.24172092974185944, + "learning_rate": 0.00024427778468098587, + "loss": 0.8618, + "step": 8495 + }, + { + "epoch": 1.02, + "grad_norm": 0.23562046885490417, + "learning_rate": 0.0002441977242738962, + "loss": 0.9769, + "step": 8500 + }, + { + "epoch": 1.02, + "grad_norm": 0.2472938448190689, + "learning_rate": 0.00024411761953422922, + "loss": 0.8776, + "step": 8505 + }, + { + "epoch": 1.03, + "grad_norm": 0.23211072385311127, + "learning_rate": 0.000244037470499685, + "loss": 0.8499, + "step": 8510 + }, + { + "epoch": 1.03, + "grad_norm": 0.2152828425168991, + "learning_rate": 0.00024395727720798424, + "loss": 0.8764, + "step": 8515 + }, + { + "epoch": 1.03, + "grad_norm": 0.21999038755893707, + "learning_rate": 0.0002438770396968686, + "loss": 0.8729, + "step": 8520 + }, + { + "epoch": 1.03, + "grad_norm": 0.21073397994041443, + "learning_rate": 0.0002437967580041005, + "loss": 0.8665, + "step": 8525 + }, + { + "epoch": 1.03, + "grad_norm": 0.21868467330932617, + "learning_rate": 0.00024371643216746324, + "loss": 0.8653, + "step": 8530 + }, + { + "epoch": 1.03, + "grad_norm": 0.22046944499015808, + "learning_rate": 0.00024363606222476075, + "loss": 0.8832, + "step": 8535 + }, + { + "epoch": 1.03, + "grad_norm": 0.23462894558906555, + "learning_rate": 0.00024355564821381784, + "loss": 0.8693, + "step": 8540 + }, + { + "epoch": 1.03, + "grad_norm": 0.20430710911750793, + "learning_rate": 0.00024347519017247995, + "loss": 0.9157, + "step": 8545 + }, + { + "epoch": 1.03, + "grad_norm": 0.2538154721260071, + "learning_rate": 0.00024339468813861336, + "loss": 0.8323, + "step": 8550 + }, + { + "epoch": 1.03, + "grad_norm": 0.2316758930683136, + "learning_rate": 0.000243314142150105, + "loss": 0.8574, + "step": 8555 + }, + { + "epoch": 1.03, + "grad_norm": 0.239527627825737, + "learning_rate": 0.0002432335522448625, + "loss": 0.9551, + "step": 8560 + }, + { + "epoch": 1.03, + "grad_norm": 0.21243155002593994, + "learning_rate": 0.00024315291846081406, + "loss": 0.9939, + "step": 8565 + }, + { + "epoch": 1.03, + "grad_norm": 0.23001927137374878, + "learning_rate": 0.00024307224083590874, + "loss": 0.8392, + "step": 8570 + }, + { + "epoch": 1.03, + "grad_norm": 0.21947364509105682, + "learning_rate": 0.00024299151940811606, + "loss": 0.8661, + "step": 8575 + }, + { + "epoch": 1.03, + "grad_norm": 0.23186787962913513, + "learning_rate": 0.0002429107542154261, + "loss": 0.8679, + "step": 8580 + }, + { + "epoch": 1.03, + "grad_norm": 0.2559497058391571, + "learning_rate": 0.00024282994529584983, + "loss": 0.8598, + "step": 8585 + }, + { + "epoch": 1.04, + "grad_norm": 0.23815637826919556, + "learning_rate": 0.00024274909268741848, + "loss": 0.8349, + "step": 8590 + }, + { + "epoch": 1.04, + "grad_norm": 0.2084992676973343, + "learning_rate": 0.00024266819642818405, + "loss": 0.8345, + "step": 8595 + }, + { + "epoch": 1.04, + "grad_norm": 0.2263403832912445, + "learning_rate": 0.0002425872565562189, + "loss": 0.8879, + "step": 8600 + }, + { + "epoch": 1.04, + "grad_norm": 0.23247919976711273, + "learning_rate": 0.00024250627310961614, + "loss": 0.7877, + "step": 8605 + }, + { + "epoch": 1.04, + "grad_norm": 0.21611854434013367, + "learning_rate": 0.00024242524612648917, + "loss": 0.8724, + "step": 8610 + }, + { + "epoch": 1.04, + "grad_norm": 0.2197057604789734, + "learning_rate": 0.00024234417564497206, + "loss": 0.8138, + "step": 8615 + }, + { + "epoch": 1.04, + "grad_norm": 0.2542968988418579, + "learning_rate": 0.00024226306170321924, + "loss": 0.9583, + "step": 8620 + }, + { + "epoch": 1.04, + "grad_norm": 0.21506857872009277, + "learning_rate": 0.00024218190433940558, + "loss": 0.9456, + "step": 8625 + }, + { + "epoch": 1.04, + "grad_norm": 0.21320503950119019, + "learning_rate": 0.0002421007035917265, + "loss": 0.9033, + "step": 8630 + }, + { + "epoch": 1.04, + "grad_norm": 0.23590131103992462, + "learning_rate": 0.00024201945949839775, + "loss": 0.9297, + "step": 8635 + }, + { + "epoch": 1.04, + "grad_norm": 0.23142153024673462, + "learning_rate": 0.0002419381720976555, + "loss": 0.9025, + "step": 8640 + }, + { + "epoch": 1.04, + "grad_norm": 0.2024351954460144, + "learning_rate": 0.00024185684142775623, + "loss": 0.9095, + "step": 8645 + }, + { + "epoch": 1.04, + "grad_norm": 0.2539699077606201, + "learning_rate": 0.00024177546752697697, + "loss": 0.8968, + "step": 8650 + }, + { + "epoch": 1.04, + "grad_norm": 0.22273805737495422, + "learning_rate": 0.00024169405043361492, + "loss": 0.8366, + "step": 8655 + }, + { + "epoch": 1.04, + "grad_norm": 0.2413063645362854, + "learning_rate": 0.00024161259018598764, + "loss": 0.8781, + "step": 8660 + }, + { + "epoch": 1.04, + "grad_norm": 0.21893925964832306, + "learning_rate": 0.00024153108682243307, + "loss": 0.8827, + "step": 8665 + }, + { + "epoch": 1.04, + "grad_norm": 0.23175576329231262, + "learning_rate": 0.00024144954038130936, + "loss": 0.8959, + "step": 8670 + }, + { + "epoch": 1.05, + "grad_norm": 0.2520297169685364, + "learning_rate": 0.00024136795090099502, + "loss": 0.9922, + "step": 8675 + }, + { + "epoch": 1.05, + "grad_norm": 0.21685057878494263, + "learning_rate": 0.00024128631841988877, + "loss": 0.9727, + "step": 8680 + }, + { + "epoch": 1.05, + "grad_norm": 0.22948361933231354, + "learning_rate": 0.00024120464297640955, + "loss": 0.8525, + "step": 8685 + }, + { + "epoch": 1.05, + "grad_norm": 0.24964188039302826, + "learning_rate": 0.00024112292460899652, + "loss": 0.8168, + "step": 8690 + }, + { + "epoch": 1.05, + "grad_norm": 0.27692463994026184, + "learning_rate": 0.00024104116335610905, + "loss": 0.957, + "step": 8695 + }, + { + "epoch": 1.05, + "grad_norm": 0.24565111100673676, + "learning_rate": 0.00024095935925622675, + "loss": 0.8839, + "step": 8700 + }, + { + "epoch": 1.05, + "grad_norm": 0.20882341265678406, + "learning_rate": 0.00024087751234784933, + "loss": 0.9486, + "step": 8705 + }, + { + "epoch": 1.05, + "grad_norm": 0.23391057550907135, + "learning_rate": 0.0002407956226694966, + "loss": 0.9281, + "step": 8710 + }, + { + "epoch": 1.05, + "grad_norm": 0.2399512678384781, + "learning_rate": 0.00024071369025970867, + "loss": 0.7803, + "step": 8715 + }, + { + "epoch": 1.05, + "grad_norm": 0.21823789179325104, + "learning_rate": 0.00024063171515704556, + "loss": 0.8684, + "step": 8720 + }, + { + "epoch": 1.05, + "grad_norm": 0.21568119525909424, + "learning_rate": 0.0002405496974000875, + "loss": 0.7765, + "step": 8725 + }, + { + "epoch": 1.05, + "grad_norm": 0.2163381576538086, + "learning_rate": 0.00024046763702743478, + "loss": 0.8652, + "step": 8730 + }, + { + "epoch": 1.05, + "grad_norm": 0.22184047102928162, + "learning_rate": 0.00024038553407770778, + "loss": 0.8446, + "step": 8735 + }, + { + "epoch": 1.05, + "grad_norm": 0.22127588093280792, + "learning_rate": 0.00024030338858954678, + "loss": 0.8153, + "step": 8740 + }, + { + "epoch": 1.05, + "grad_norm": 0.25217580795288086, + "learning_rate": 0.0002402212006016123, + "loss": 0.8407, + "step": 8745 + }, + { + "epoch": 1.05, + "grad_norm": 0.22362253069877625, + "learning_rate": 0.0002401389701525846, + "loss": 0.7756, + "step": 8750 + }, + { + "epoch": 1.05, + "grad_norm": 0.2407907396554947, + "learning_rate": 0.00024005669728116417, + "loss": 0.8441, + "step": 8755 + }, + { + "epoch": 1.06, + "grad_norm": 0.23379315435886383, + "learning_rate": 0.00023997438202607124, + "loss": 0.8697, + "step": 8760 + }, + { + "epoch": 1.06, + "grad_norm": 0.23524151742458344, + "learning_rate": 0.00023989202442604621, + "loss": 0.9377, + "step": 8765 + }, + { + "epoch": 1.06, + "grad_norm": 0.22088152170181274, + "learning_rate": 0.00023980962451984928, + "loss": 0.8814, + "step": 8770 + }, + { + "epoch": 1.06, + "grad_norm": 0.24777436256408691, + "learning_rate": 0.00023972718234626056, + "loss": 0.7183, + "step": 8775 + }, + { + "epoch": 1.06, + "grad_norm": 0.22386722266674042, + "learning_rate": 0.00023964469794408006, + "loss": 0.9437, + "step": 8780 + }, + { + "epoch": 1.06, + "grad_norm": 0.2572818100452423, + "learning_rate": 0.0002395621713521277, + "loss": 0.9118, + "step": 8785 + }, + { + "epoch": 1.06, + "grad_norm": 0.23130157589912415, + "learning_rate": 0.00023947960260924326, + "loss": 0.8401, + "step": 8790 + }, + { + "epoch": 1.06, + "grad_norm": 0.26878032088279724, + "learning_rate": 0.0002393969917542863, + "loss": 0.9485, + "step": 8795 + }, + { + "epoch": 1.06, + "grad_norm": 0.21913166344165802, + "learning_rate": 0.00023931433882613617, + "loss": 0.8719, + "step": 8800 + }, + { + "epoch": 1.06, + "grad_norm": 0.2042398601770401, + "learning_rate": 0.00023923164386369225, + "loss": 0.8471, + "step": 8805 + }, + { + "epoch": 1.06, + "grad_norm": NaN, + "learning_rate": 0.00023916545765519838, + "loss": 1.0099, + "step": 8810 + }, + { + "epoch": 1.06, + "grad_norm": 0.2109871208667755, + "learning_rate": 0.0002390826871291146, + "loss": 0.9514, + "step": 8815 + }, + { + "epoch": 1.06, + "grad_norm": 0.24620996415615082, + "learning_rate": 0.00023899987467775985, + "loss": 0.794, + "step": 8820 + }, + { + "epoch": 1.06, + "grad_norm": 0.21983186900615692, + "learning_rate": 0.00023891702034010856, + "loss": 0.8971, + "step": 8825 + }, + { + "epoch": 1.06, + "grad_norm": 0.23902229964733124, + "learning_rate": 0.00023883412415515458, + "loss": 0.874, + "step": 8830 + }, + { + "epoch": 1.06, + "grad_norm": 0.2428964078426361, + "learning_rate": 0.0002387511861619117, + "loss": 0.9071, + "step": 8835 + }, + { + "epoch": 1.07, + "grad_norm": 0.22053857147693634, + "learning_rate": 0.00023866820639941328, + "loss": 0.8574, + "step": 8840 + }, + { + "epoch": 1.07, + "grad_norm": 0.20824211835861206, + "learning_rate": 0.0002385851849067124, + "loss": 0.9146, + "step": 8845 + }, + { + "epoch": 1.07, + "grad_norm": 0.21827155351638794, + "learning_rate": 0.0002385021217228816, + "loss": 0.8853, + "step": 8850 + }, + { + "epoch": 1.07, + "grad_norm": 0.24931156635284424, + "learning_rate": 0.0002384190168870133, + "loss": 0.8394, + "step": 8855 + }, + { + "epoch": 1.07, + "grad_norm": 0.23989886045455933, + "learning_rate": 0.00023833587043821933, + "loss": 0.8461, + "step": 8860 + }, + { + "epoch": 1.07, + "grad_norm": 0.25101611018180847, + "learning_rate": 0.00023825268241563121, + "loss": 0.8247, + "step": 8865 + }, + { + "epoch": 1.07, + "grad_norm": 0.22489690780639648, + "learning_rate": 0.00023816945285839994, + "loss": 0.918, + "step": 8870 + }, + { + "epoch": 1.07, + "grad_norm": 0.22989100217819214, + "learning_rate": 0.00023808618180569613, + "loss": 0.8747, + "step": 8875 + }, + { + "epoch": 1.07, + "grad_norm": 0.22614827752113342, + "learning_rate": 0.00023800286929670996, + "loss": 0.8885, + "step": 8880 + }, + { + "epoch": 1.07, + "grad_norm": 0.213409423828125, + "learning_rate": 0.00023791951537065098, + "loss": 0.9846, + "step": 8885 + }, + { + "epoch": 1.07, + "grad_norm": 0.22785250842571259, + "learning_rate": 0.00023783612006674835, + "loss": 0.9043, + "step": 8890 + }, + { + "epoch": 1.07, + "grad_norm": 0.2558668553829193, + "learning_rate": 0.00023775268342425071, + "loss": 0.8575, + "step": 8895 + }, + { + "epoch": 1.07, + "grad_norm": 0.2358073592185974, + "learning_rate": 0.00023766920548242597, + "loss": 0.8788, + "step": 8900 + }, + { + "epoch": 1.07, + "grad_norm": 0.2330961525440216, + "learning_rate": 0.00023758568628056185, + "loss": 0.8552, + "step": 8905 + }, + { + "epoch": 1.07, + "grad_norm": 0.26288071274757385, + "learning_rate": 0.00023750212585796506, + "loss": 0.9431, + "step": 8910 + }, + { + "epoch": 1.07, + "grad_norm": 0.22953157126903534, + "learning_rate": 0.000237418524253962, + "loss": 0.8415, + "step": 8915 + }, + { + "epoch": 1.07, + "grad_norm": 0.23770654201507568, + "learning_rate": 0.00023733488150789832, + "loss": 0.9387, + "step": 8920 + }, + { + "epoch": 1.08, + "grad_norm": 0.24671205878257751, + "learning_rate": 0.00023725119765913915, + "loss": 0.8804, + "step": 8925 + }, + { + "epoch": 1.08, + "grad_norm": 0.2374102622270584, + "learning_rate": 0.00023716747274706886, + "loss": 1.0063, + "step": 8930 + }, + { + "epoch": 1.08, + "grad_norm": 0.24478651583194733, + "learning_rate": 0.0002370837068110911, + "loss": 0.9604, + "step": 8935 + }, + { + "epoch": 1.08, + "grad_norm": 0.22993561625480652, + "learning_rate": 0.00023699989989062892, + "loss": 0.861, + "step": 8940 + }, + { + "epoch": 1.08, + "grad_norm": 0.2684665620326996, + "learning_rate": 0.0002369160520251248, + "loss": 0.8511, + "step": 8945 + }, + { + "epoch": 1.08, + "grad_norm": 0.25119537115097046, + "learning_rate": 0.00023683216325404009, + "loss": 0.8144, + "step": 8950 + }, + { + "epoch": 1.08, + "grad_norm": 0.22731202840805054, + "learning_rate": 0.0002367482336168558, + "loss": 0.8949, + "step": 8955 + }, + { + "epoch": 1.08, + "grad_norm": 0.23801319301128387, + "learning_rate": 0.00023666426315307188, + "loss": 0.9336, + "step": 8960 + }, + { + "epoch": 1.08, + "grad_norm": 0.24626471102237701, + "learning_rate": 0.00023658025190220774, + "loss": 0.8794, + "step": 8965 + }, + { + "epoch": 1.08, + "grad_norm": 0.235481858253479, + "learning_rate": 0.00023649619990380184, + "loss": 0.8264, + "step": 8970 + }, + { + "epoch": 1.08, + "grad_norm": 0.2578223645687103, + "learning_rate": 0.00023641210719741175, + "loss": 0.9651, + "step": 8975 + }, + { + "epoch": 1.08, + "grad_norm": 0.24087263643741608, + "learning_rate": 0.00023632797382261441, + "loss": 0.8688, + "step": 8980 + }, + { + "epoch": 1.08, + "grad_norm": 0.2264793962240219, + "learning_rate": 0.00023624379981900572, + "loss": 0.8988, + "step": 8985 + }, + { + "epoch": 1.08, + "grad_norm": 0.21337303519248962, + "learning_rate": 0.00023615958522620078, + "loss": 0.8212, + "step": 8990 + }, + { + "epoch": 1.08, + "grad_norm": 0.24498280882835388, + "learning_rate": 0.00023607533008383373, + "loss": 0.8061, + "step": 8995 + }, + { + "epoch": 1.08, + "grad_norm": 0.23145338892936707, + "learning_rate": 0.00023599103443155788, + "loss": 0.8282, + "step": 9000 + }, + { + "epoch": 1.09, + "grad_norm": 0.2749665379524231, + "learning_rate": 0.00023590669830904554, + "loss": 0.9067, + "step": 9005 + }, + { + "epoch": 1.09, + "grad_norm": 0.23750852048397064, + "learning_rate": 0.00023582232175598812, + "loss": 0.9132, + "step": 9010 + }, + { + "epoch": 1.09, + "grad_norm": 0.22623078525066376, + "learning_rate": 0.000235737904812096, + "loss": 0.9233, + "step": 9015 + }, + { + "epoch": 1.09, + "grad_norm": 0.24327704310417175, + "learning_rate": 0.00023567034220227885, + "loss": 0.9254, + "step": 9020 + }, + { + "epoch": 1.09, + "grad_norm": 0.23452426493167877, + "learning_rate": 0.00023558585265501518, + "loss": 0.789, + "step": 9025 + }, + { + "epoch": 1.09, + "grad_norm": 0.24461832642555237, + "learning_rate": 0.00023550132282820706, + "loss": 0.9243, + "step": 9030 + }, + { + "epoch": 1.09, + "grad_norm": 0.22294314205646515, + "learning_rate": 0.00023541675276163697, + "loss": 0.8918, + "step": 9035 + }, + { + "epoch": 1.09, + "grad_norm": 0.2230527549982071, + "learning_rate": 0.00023533214249510647, + "loss": 0.8376, + "step": 9040 + }, + { + "epoch": 1.09, + "grad_norm": 0.26348450779914856, + "learning_rate": 0.00023524749206843586, + "loss": 0.9469, + "step": 9045 + }, + { + "epoch": 1.09, + "grad_norm": 0.2523627281188965, + "learning_rate": 0.00023516280152146454, + "loss": 0.881, + "step": 9050 + }, + { + "epoch": 1.09, + "grad_norm": 0.2316828817129135, + "learning_rate": 0.00023507807089405064, + "loss": 0.8915, + "step": 9055 + }, + { + "epoch": 1.09, + "grad_norm": 0.21704962849617004, + "learning_rate": 0.00023499330022607124, + "loss": 0.8677, + "step": 9060 + }, + { + "epoch": 1.09, + "grad_norm": 0.26538321375846863, + "learning_rate": 0.0002349084895574222, + "loss": 0.8538, + "step": 9065 + }, + { + "epoch": 1.09, + "grad_norm": 0.21919500827789307, + "learning_rate": 0.00023482363892801827, + "loss": 0.8222, + "step": 9070 + }, + { + "epoch": 1.09, + "grad_norm": 0.27171579003334045, + "learning_rate": 0.00023473874837779294, + "loss": 0.8996, + "step": 9075 + }, + { + "epoch": 1.09, + "grad_norm": 0.20920461416244507, + "learning_rate": 0.0002346538179466985, + "loss": 0.8708, + "step": 9080 + }, + { + "epoch": 1.09, + "grad_norm": 0.23751945793628693, + "learning_rate": 0.00023456884767470614, + "loss": 0.9058, + "step": 9085 + }, + { + "epoch": 1.1, + "grad_norm": 0.23487117886543274, + "learning_rate": 0.0002344838376018056, + "loss": 0.9029, + "step": 9090 + }, + { + "epoch": 1.1, + "grad_norm": 0.20502355694770813, + "learning_rate": 0.00023439878776800542, + "loss": 0.9057, + "step": 9095 + }, + { + "epoch": 1.1, + "grad_norm": 0.26424431800842285, + "learning_rate": 0.00023431369821333293, + "loss": 0.9132, + "step": 9100 + }, + { + "epoch": 1.1, + "grad_norm": 0.2559512257575989, + "learning_rate": 0.00023422856897783412, + "loss": 0.8964, + "step": 9105 + }, + { + "epoch": 1.1, + "grad_norm": 0.21677064895629883, + "learning_rate": 0.0002341434001015736, + "loss": 0.851, + "step": 9110 + }, + { + "epoch": 1.1, + "grad_norm": 0.2509647607803345, + "learning_rate": 0.00023405819162463466, + "loss": 0.8652, + "step": 9115 + }, + { + "epoch": 1.1, + "grad_norm": 0.20928482711315155, + "learning_rate": 0.00023397294358711924, + "loss": 0.8989, + "step": 9120 + }, + { + "epoch": 1.1, + "grad_norm": 0.2391272932291031, + "learning_rate": 0.00023388765602914792, + "loss": 1.0029, + "step": 9125 + }, + { + "epoch": 1.1, + "grad_norm": 0.21468204259872437, + "learning_rate": 0.0002338023289908599, + "loss": 0.817, + "step": 9130 + }, + { + "epoch": 1.1, + "grad_norm": 0.2177235335111618, + "learning_rate": 0.00023371696251241279, + "loss": 0.8765, + "step": 9135 + }, + { + "epoch": 1.1, + "grad_norm": 0.2142833024263382, + "learning_rate": 0.000233631556633983, + "loss": 0.8559, + "step": 9140 + }, + { + "epoch": 1.1, + "grad_norm": 0.22310085594654083, + "learning_rate": 0.00023354611139576536, + "loss": 0.8448, + "step": 9145 + }, + { + "epoch": 1.1, + "grad_norm": 0.2540875971317291, + "learning_rate": 0.00023346062683797324, + "loss": 0.7871, + "step": 9150 + }, + { + "epoch": 1.1, + "grad_norm": 0.2475668340921402, + "learning_rate": 0.0002333751030008384, + "loss": 0.923, + "step": 9155 + }, + { + "epoch": 1.1, + "grad_norm": 0.24676388502120972, + "learning_rate": 0.0002332895399246114, + "loss": 0.792, + "step": 9160 + }, + { + "epoch": 1.1, + "grad_norm": 0.22518321871757507, + "learning_rate": 0.00023320393764956086, + "loss": 0.8875, + "step": 9165 + }, + { + "epoch": 1.1, + "grad_norm": 0.24502132833003998, + "learning_rate": 0.00023311829621597418, + "loss": 0.9119, + "step": 9170 + }, + { + "epoch": 1.11, + "grad_norm": 0.21606354415416718, + "learning_rate": 0.00023303261566415704, + "loss": 0.908, + "step": 9175 + }, + { + "epoch": 1.11, + "grad_norm": 0.2341851145029068, + "learning_rate": 0.00023294689603443352, + "loss": 0.8705, + "step": 9180 + }, + { + "epoch": 1.11, + "grad_norm": 0.23818925023078918, + "learning_rate": 0.00023286113736714612, + "loss": 0.9589, + "step": 9185 + }, + { + "epoch": 1.11, + "grad_norm": 0.2910539507865906, + "learning_rate": 0.0002327753397026558, + "loss": 0.8729, + "step": 9190 + }, + { + "epoch": 1.11, + "grad_norm": 0.24293243885040283, + "learning_rate": 0.0002326895030813417, + "loss": 0.8785, + "step": 9195 + }, + { + "epoch": 1.11, + "grad_norm": 0.22408191859722137, + "learning_rate": 0.0002326036275436014, + "loss": 0.8641, + "step": 9200 + }, + { + "epoch": 1.11, + "grad_norm": 0.2291467785835266, + "learning_rate": 0.00023251771312985084, + "loss": 0.8185, + "step": 9205 + }, + { + "epoch": 1.11, + "grad_norm": 0.24368716776371002, + "learning_rate": 0.0002324317598805241, + "loss": 0.8201, + "step": 9210 + }, + { + "epoch": 1.11, + "grad_norm": 0.24222686886787415, + "learning_rate": 0.00023234576783607373, + "loss": 0.9468, + "step": 9215 + }, + { + "epoch": 1.11, + "grad_norm": 0.23429812490940094, + "learning_rate": 0.00023225973703697037, + "loss": 0.9275, + "step": 9220 + }, + { + "epoch": 1.11, + "grad_norm": 0.23759447038173676, + "learning_rate": 0.000232173667523703, + "loss": 0.8471, + "step": 9225 + }, + { + "epoch": 1.11, + "grad_norm": 0.2510417401790619, + "learning_rate": 0.00023208755933677881, + "loss": 0.8939, + "step": 9230 + }, + { + "epoch": 1.11, + "grad_norm": 0.23719042539596558, + "learning_rate": 0.00023200141251672314, + "loss": 0.8509, + "step": 9235 + }, + { + "epoch": 1.11, + "grad_norm": 0.21512798964977264, + "learning_rate": 0.0002319152271040796, + "loss": 0.8789, + "step": 9240 + }, + { + "epoch": 1.11, + "grad_norm": 0.2508089542388916, + "learning_rate": 0.00023182900313940979, + "loss": 0.8954, + "step": 9245 + }, + { + "epoch": 1.11, + "grad_norm": 0.22558368742465973, + "learning_rate": 0.00023174274066329367, + "loss": 0.9115, + "step": 9250 + }, + { + "epoch": 1.12, + "grad_norm": 0.2333495169878006, + "learning_rate": 0.00023165643971632924, + "loss": 0.9437, + "step": 9255 + }, + { + "epoch": 1.12, + "grad_norm": 0.27782970666885376, + "learning_rate": 0.00023157010033913252, + "loss": 0.9699, + "step": 9260 + }, + { + "epoch": 1.12, + "grad_norm": 0.2564479410648346, + "learning_rate": 0.0002314837225723377, + "loss": 0.8747, + "step": 9265 + }, + { + "epoch": 1.12, + "grad_norm": 0.2404770702123642, + "learning_rate": 0.0002313973064565971, + "loss": 0.891, + "step": 9270 + }, + { + "epoch": 1.12, + "grad_norm": 0.22722746431827545, + "learning_rate": 0.00023131085203258092, + "loss": 0.8386, + "step": 9275 + }, + { + "epoch": 1.12, + "grad_norm": 0.21286796033382416, + "learning_rate": 0.00023122435934097755, + "loss": 0.9262, + "step": 9280 + }, + { + "epoch": 1.12, + "grad_norm": 0.22385799884796143, + "learning_rate": 0.00023113782842249328, + "loss": 0.857, + "step": 9285 + }, + { + "epoch": 1.12, + "grad_norm": 0.222075417637825, + "learning_rate": 0.00023105125931785245, + "loss": 0.894, + "step": 9290 + }, + { + "epoch": 1.12, + "grad_norm": 0.23088407516479492, + "learning_rate": 0.00023096465206779736, + "loss": 0.8705, + "step": 9295 + }, + { + "epoch": 1.12, + "grad_norm": 0.2188183218240738, + "learning_rate": 0.00023087800671308826, + "loss": 0.8935, + "step": 9300 + }, + { + "epoch": 1.12, + "grad_norm": 0.2047409862279892, + "learning_rate": 0.0002307913232945033, + "loss": 0.8515, + "step": 9305 + }, + { + "epoch": 1.12, + "grad_norm": 0.23035292327404022, + "learning_rate": 0.00023070460185283862, + "loss": 0.9546, + "step": 9310 + }, + { + "epoch": 1.12, + "grad_norm": 0.2521415054798126, + "learning_rate": 0.00023061784242890817, + "loss": 0.8708, + "step": 9315 + }, + { + "epoch": 1.12, + "grad_norm": 0.2409847378730774, + "learning_rate": 0.00023053104506354387, + "loss": 0.857, + "step": 9320 + }, + { + "epoch": 1.12, + "grad_norm": 0.31675606966018677, + "learning_rate": 0.0002304442097975954, + "loss": 0.9459, + "step": 9325 + }, + { + "epoch": 1.12, + "grad_norm": 0.30147692561149597, + "learning_rate": 0.00023035733667193034, + "loss": 0.8456, + "step": 9330 + }, + { + "epoch": 1.12, + "grad_norm": 0.24489833414554596, + "learning_rate": 0.00023027042572743405, + "loss": 0.8259, + "step": 9335 + }, + { + "epoch": 1.13, + "grad_norm": 0.2613428831100464, + "learning_rate": 0.00023018347700500973, + "loss": 1.0109, + "step": 9340 + }, + { + "epoch": 1.13, + "grad_norm": 0.24886786937713623, + "learning_rate": 0.00023009649054557828, + "loss": 0.8927, + "step": 9345 + }, + { + "epoch": 1.13, + "grad_norm": 0.2450346052646637, + "learning_rate": 0.00023000946639007848, + "loss": 0.9878, + "step": 9350 + }, + { + "epoch": 1.13, + "grad_norm": 0.2285212278366089, + "learning_rate": 0.00022992240457946674, + "loss": 0.8819, + "step": 9355 + }, + { + "epoch": 1.13, + "grad_norm": 0.22266454994678497, + "learning_rate": 0.00022983530515471732, + "loss": 0.8673, + "step": 9360 + }, + { + "epoch": 1.13, + "grad_norm": 0.228920578956604, + "learning_rate": 0.000229748168156822, + "loss": 0.9104, + "step": 9365 + }, + { + "epoch": 1.13, + "grad_norm": 0.22747859358787537, + "learning_rate": 0.00022966099362679038, + "loss": 0.8806, + "step": 9370 + }, + { + "epoch": 1.13, + "grad_norm": 0.24505122005939484, + "learning_rate": 0.00022957378160564973, + "loss": 0.9054, + "step": 9375 + }, + { + "epoch": 1.13, + "grad_norm": 0.22118686139583588, + "learning_rate": 0.00022948653213444487, + "loss": 0.8939, + "step": 9380 + }, + { + "epoch": 1.13, + "grad_norm": 0.21411919593811035, + "learning_rate": 0.00022939924525423834, + "loss": 0.9658, + "step": 9385 + }, + { + "epoch": 1.13, + "grad_norm": 0.25308582186698914, + "learning_rate": 0.00022931192100611022, + "loss": 0.9179, + "step": 9390 + }, + { + "epoch": 1.13, + "grad_norm": 0.24361754953861237, + "learning_rate": 0.0002292245594311582, + "loss": 0.95, + "step": 9395 + }, + { + "epoch": 1.13, + "grad_norm": 0.23733997344970703, + "learning_rate": 0.00022913716057049757, + "loss": 0.8376, + "step": 9400 + }, + { + "epoch": 1.13, + "grad_norm": 0.23433391749858856, + "learning_rate": 0.0002290497244652611, + "loss": 0.9291, + "step": 9405 + }, + { + "epoch": 1.13, + "grad_norm": 0.22158537805080414, + "learning_rate": 0.00022896225115659913, + "loss": 0.8927, + "step": 9410 + }, + { + "epoch": 1.13, + "grad_norm": 0.21899032592773438, + "learning_rate": 0.00022887474068567952, + "loss": 0.8168, + "step": 9415 + }, + { + "epoch": 1.14, + "grad_norm": 0.23201830685138702, + "learning_rate": 0.0002287871930936876, + "loss": 0.9301, + "step": 9420 + }, + { + "epoch": 1.14, + "grad_norm": 0.22301717102527618, + "learning_rate": 0.00022869960842182614, + "loss": 0.8611, + "step": 9425 + }, + { + "epoch": 1.14, + "grad_norm": 0.2502361238002777, + "learning_rate": 0.00022861198671131542, + "loss": 0.9485, + "step": 9430 + }, + { + "epoch": 1.14, + "grad_norm": 0.24774372577667236, + "learning_rate": 0.00022852432800339313, + "loss": 0.8949, + "step": 9435 + }, + { + "epoch": 1.14, + "grad_norm": 0.2235736846923828, + "learning_rate": 0.00022843663233931442, + "loss": 0.8333, + "step": 9440 + }, + { + "epoch": 1.14, + "grad_norm": 0.23626188933849335, + "learning_rate": 0.0002283488997603517, + "loss": 0.8102, + "step": 9445 + }, + { + "epoch": 1.14, + "grad_norm": 0.25719600915908813, + "learning_rate": 0.00022826113030779486, + "loss": 0.8408, + "step": 9450 + }, + { + "epoch": 1.14, + "grad_norm": 0.2473081797361374, + "learning_rate": 0.00022817332402295113, + "loss": 0.81, + "step": 9455 + }, + { + "epoch": 1.14, + "grad_norm": 0.20688697695732117, + "learning_rate": 0.00022808548094714506, + "loss": 0.896, + "step": 9460 + }, + { + "epoch": 1.14, + "grad_norm": 0.26973938941955566, + "learning_rate": 0.0002279976011217185, + "loss": 0.878, + "step": 9465 + }, + { + "epoch": 1.14, + "grad_norm": 0.24812696874141693, + "learning_rate": 0.00022790968458803065, + "loss": 0.7833, + "step": 9470 + }, + { + "epoch": 1.14, + "grad_norm": 0.22464975714683533, + "learning_rate": 0.00022782173138745793, + "loss": 0.8938, + "step": 9475 + }, + { + "epoch": 1.14, + "grad_norm": 0.23503178358078003, + "learning_rate": 0.00022773374156139406, + "loss": 0.8603, + "step": 9480 + }, + { + "epoch": 1.14, + "grad_norm": 0.236893430352211, + "learning_rate": 0.00022764571515124994, + "loss": 0.8637, + "step": 9485 + }, + { + "epoch": 1.14, + "grad_norm": 0.24112752079963684, + "learning_rate": 0.00022755765219845372, + "loss": 0.893, + "step": 9490 + }, + { + "epoch": 1.14, + "grad_norm": 0.2311786562204361, + "learning_rate": 0.00022746955274445077, + "loss": 0.8524, + "step": 9495 + }, + { + "epoch": 1.14, + "grad_norm": 0.22384856641292572, + "learning_rate": 0.0002273814168307036, + "loss": 0.8552, + "step": 9500 + }, + { + "epoch": 1.15, + "grad_norm": 0.2306017130613327, + "learning_rate": 0.00022729324449869198, + "loss": 0.9581, + "step": 9505 + }, + { + "epoch": 1.15, + "grad_norm": 0.23824800550937653, + "learning_rate": 0.0002272050357899126, + "loss": 0.8444, + "step": 9510 + }, + { + "epoch": 1.15, + "grad_norm": 0.22070693969726562, + "learning_rate": 0.00022711679074587953, + "loss": 0.9218, + "step": 9515 + }, + { + "epoch": 1.15, + "grad_norm": 0.25660625100135803, + "learning_rate": 0.0002270285094081237, + "loss": 0.9183, + "step": 9520 + }, + { + "epoch": 1.15, + "grad_norm": 0.22683514654636383, + "learning_rate": 0.00022694019181819337, + "loss": 0.8944, + "step": 9525 + }, + { + "epoch": 1.15, + "grad_norm": 0.2356630563735962, + "learning_rate": 0.0002268518380176536, + "loss": 0.86, + "step": 9530 + }, + { + "epoch": 1.15, + "grad_norm": 0.2331274300813675, + "learning_rate": 0.00022676344804808675, + "loss": 0.9143, + "step": 9535 + }, + { + "epoch": 1.15, + "grad_norm": 0.22305701673030853, + "learning_rate": 0.00022667502195109198, + "loss": 0.8527, + "step": 9540 + }, + { + "epoch": 1.15, + "grad_norm": 0.23889246582984924, + "learning_rate": 0.00022658655976828557, + "loss": 0.9313, + "step": 9545 + }, + { + "epoch": 1.15, + "grad_norm": 0.25855275988578796, + "learning_rate": 0.00022649806154130078, + "loss": 0.9187, + "step": 9550 + }, + { + "epoch": 1.15, + "grad_norm": 0.21481330692768097, + "learning_rate": 0.00022640952731178786, + "loss": 0.8408, + "step": 9555 + }, + { + "epoch": 1.15, + "grad_norm": 0.2593211531639099, + "learning_rate": 0.00022632095712141382, + "loss": 0.8638, + "step": 9560 + }, + { + "epoch": 1.15, + "grad_norm": 0.2373592108488083, + "learning_rate": 0.0002262323510118629, + "loss": 0.8963, + "step": 9565 + }, + { + "epoch": 1.15, + "grad_norm": 0.2230243980884552, + "learning_rate": 0.0002261437090248359, + "loss": 0.8791, + "step": 9570 + }, + { + "epoch": 1.15, + "grad_norm": 0.21514588594436646, + "learning_rate": 0.00022605503120205086, + "loss": 0.8647, + "step": 9575 + }, + { + "epoch": 1.15, + "grad_norm": 0.22944727540016174, + "learning_rate": 0.0002259663175852424, + "loss": 0.9185, + "step": 9580 + }, + { + "epoch": 1.15, + "grad_norm": 0.2357485294342041, + "learning_rate": 0.00022587756821616214, + "loss": 0.8481, + "step": 9585 + }, + { + "epoch": 1.16, + "grad_norm": 0.21694447100162506, + "learning_rate": 0.00022578878313657844, + "loss": 0.9085, + "step": 9590 + }, + { + "epoch": 1.16, + "grad_norm": 0.23980222642421722, + "learning_rate": 0.00022569996238827654, + "loss": 0.8727, + "step": 9595 + }, + { + "epoch": 1.16, + "grad_norm": 0.21908119320869446, + "learning_rate": 0.00022561110601305838, + "loss": 0.8425, + "step": 9600 + }, + { + "epoch": 1.16, + "grad_norm": 0.2655925452709198, + "learning_rate": 0.0002255222140527428, + "loss": 0.9436, + "step": 9605 + }, + { + "epoch": 1.16, + "grad_norm": 0.25513389706611633, + "learning_rate": 0.0002254332865491653, + "loss": 0.9691, + "step": 9610 + }, + { + "epoch": 1.16, + "grad_norm": 0.2476007342338562, + "learning_rate": 0.000225344323544178, + "loss": 0.9306, + "step": 9615 + }, + { + "epoch": 1.16, + "grad_norm": 0.2203802466392517, + "learning_rate": 0.00022525532507965004, + "loss": 0.9796, + "step": 9620 + }, + { + "epoch": 1.16, + "grad_norm": 0.21193300187587738, + "learning_rate": 0.0002251662911974669, + "loss": 0.8913, + "step": 9625 + }, + { + "epoch": 1.16, + "grad_norm": 0.21106500923633575, + "learning_rate": 0.0002250772219395309, + "loss": 0.8984, + "step": 9630 + }, + { + "epoch": 1.16, + "grad_norm": 0.24506857991218567, + "learning_rate": 0.00022498811734776103, + "loss": 0.8861, + "step": 9635 + }, + { + "epoch": 1.16, + "grad_norm": 0.22095774114131927, + "learning_rate": 0.0002248989774640929, + "loss": 0.8482, + "step": 9640 + }, + { + "epoch": 1.16, + "grad_norm": 0.24854597449302673, + "learning_rate": 0.0002248098023304786, + "loss": 0.8741, + "step": 9645 + }, + { + "epoch": 1.16, + "grad_norm": 0.22818954288959503, + "learning_rate": 0.00022472059198888698, + "loss": 0.8486, + "step": 9650 + }, + { + "epoch": 1.16, + "grad_norm": 0.24147124588489532, + "learning_rate": 0.0002246313464813034, + "loss": 0.9675, + "step": 9655 + }, + { + "epoch": 1.16, + "grad_norm": 0.22634311020374298, + "learning_rate": 0.00022454206584972971, + "loss": 0.8038, + "step": 9660 + }, + { + "epoch": 1.16, + "grad_norm": 0.23374037444591522, + "learning_rate": 0.00022445275013618444, + "loss": 0.9025, + "step": 9665 + }, + { + "epoch": 1.17, + "grad_norm": 0.22729557752609253, + "learning_rate": 0.00022436339938270236, + "loss": 0.8884, + "step": 9670 + }, + { + "epoch": 1.17, + "grad_norm": 0.24079249799251556, + "learning_rate": 0.00022427401363133502, + "loss": 0.8695, + "step": 9675 + }, + { + "epoch": 1.17, + "grad_norm": 0.22827574610710144, + "learning_rate": 0.0002241845929241503, + "loss": 0.8461, + "step": 9680 + }, + { + "epoch": 1.17, + "grad_norm": 0.22621886432170868, + "learning_rate": 0.00022409513730323256, + "loss": 1.0162, + "step": 9685 + }, + { + "epoch": 1.17, + "grad_norm": 0.24320857226848602, + "learning_rate": 0.00022400564681068264, + "loss": 0.899, + "step": 9690 + }, + { + "epoch": 1.17, + "grad_norm": 0.24638378620147705, + "learning_rate": 0.00022391612148861764, + "loss": 0.8847, + "step": 9695 + }, + { + "epoch": 1.17, + "grad_norm": 0.25771698355674744, + "learning_rate": 0.00022382656137917117, + "loss": 0.9079, + "step": 9700 + }, + { + "epoch": 1.17, + "grad_norm": 0.2110559642314911, + "learning_rate": 0.0002237369665244932, + "loss": 0.9551, + "step": 9705 + }, + { + "epoch": 1.17, + "grad_norm": 0.220799520611763, + "learning_rate": 0.00022364733696675007, + "loss": 0.9265, + "step": 9710 + }, + { + "epoch": 1.17, + "grad_norm": 0.23551765084266663, + "learning_rate": 0.00022355767274812442, + "loss": 0.9568, + "step": 9715 + }, + { + "epoch": 1.17, + "grad_norm": 0.22385844588279724, + "learning_rate": 0.0002234679739108152, + "loss": 0.8029, + "step": 9720 + }, + { + "epoch": 1.17, + "grad_norm": 0.22753435373306274, + "learning_rate": 0.00022337824049703764, + "loss": 0.9675, + "step": 9725 + }, + { + "epoch": 1.17, + "grad_norm": 0.2253807932138443, + "learning_rate": 0.00022328847254902333, + "loss": 0.8846, + "step": 9730 + }, + { + "epoch": 1.17, + "grad_norm": 0.20921418070793152, + "learning_rate": 0.00022319867010901998, + "loss": 0.8575, + "step": 9735 + }, + { + "epoch": 1.17, + "grad_norm": 0.22992300987243652, + "learning_rate": 0.0002231088332192916, + "loss": 0.8715, + "step": 9740 + }, + { + "epoch": 1.17, + "grad_norm": 0.23045524954795837, + "learning_rate": 0.00022301896192211847, + "loss": 0.8612, + "step": 9745 + }, + { + "epoch": 1.17, + "grad_norm": 0.2747400999069214, + "learning_rate": 0.00022292905625979694, + "loss": 0.8996, + "step": 9750 + }, + { + "epoch": 1.18, + "grad_norm": 0.2662562131881714, + "learning_rate": 0.0002228391162746397, + "loss": 0.8955, + "step": 9755 + }, + { + "epoch": 1.18, + "grad_norm": 0.2570030689239502, + "learning_rate": 0.00022274914200897533, + "loss": 0.9389, + "step": 9760 + }, + { + "epoch": 1.18, + "grad_norm": 0.21170185506343842, + "learning_rate": 0.0002226591335051489, + "loss": 0.8996, + "step": 9765 + }, + { + "epoch": 1.18, + "grad_norm": 0.2330474704504013, + "learning_rate": 0.00022256909080552127, + "loss": 0.855, + "step": 9770 + }, + { + "epoch": 1.18, + "grad_norm": 0.23178677260875702, + "learning_rate": 0.00022247901395246956, + "loss": 0.908, + "step": 9775 + }, + { + "epoch": 1.18, + "grad_norm": 0.2605157494544983, + "learning_rate": 0.00022238890298838696, + "loss": 0.9045, + "step": 9780 + }, + { + "epoch": 1.18, + "grad_norm": 0.2496998906135559, + "learning_rate": 0.00022229875795568262, + "loss": 0.8768, + "step": 9785 + }, + { + "epoch": 1.18, + "grad_norm": 0.24557536840438843, + "learning_rate": 0.00022220857889678177, + "loss": 0.8546, + "step": 9790 + }, + { + "epoch": 1.18, + "grad_norm": 0.22727739810943604, + "learning_rate": 0.00022211836585412582, + "loss": 0.8804, + "step": 9795 + }, + { + "epoch": 1.18, + "grad_norm": 0.21401318907737732, + "learning_rate": 0.00022202811887017188, + "loss": 0.8564, + "step": 9800 + }, + { + "epoch": 1.18, + "grad_norm": 0.25362133979797363, + "learning_rate": 0.00022193783798739325, + "loss": 0.925, + "step": 9805 + }, + { + "epoch": 1.18, + "grad_norm": 0.24248819053173065, + "learning_rate": 0.00022184752324827902, + "loss": 0.8071, + "step": 9810 + }, + { + "epoch": 1.18, + "grad_norm": 0.2690879702568054, + "learning_rate": 0.0002217571746953344, + "loss": 0.926, + "step": 9815 + }, + { + "epoch": 1.18, + "grad_norm": 0.24345597624778748, + "learning_rate": 0.00022166679237108037, + "loss": 0.9282, + "step": 9820 + }, + { + "epoch": 1.18, + "grad_norm": 0.26057952642440796, + "learning_rate": 0.0002215763763180539, + "loss": 0.9823, + "step": 9825 + }, + { + "epoch": 1.18, + "grad_norm": 0.23440931737422943, + "learning_rate": 0.00022148592657880768, + "loss": 0.8077, + "step": 9830 + }, + { + "epoch": 1.19, + "grad_norm": 0.258858323097229, + "learning_rate": 0.00022139544319591052, + "loss": 0.8348, + "step": 9835 + }, + { + "epoch": 1.19, + "grad_norm": 0.24155326187610626, + "learning_rate": 0.00022130492621194681, + "loss": 0.854, + "step": 9840 + }, + { + "epoch": 1.19, + "grad_norm": 0.23018091917037964, + "learning_rate": 0.00022121437566951686, + "loss": 1.018, + "step": 9845 + }, + { + "epoch": 1.19, + "grad_norm": 0.21002855896949768, + "learning_rate": 0.00022112379161123673, + "loss": 0.8137, + "step": 9850 + }, + { + "epoch": 1.19, + "grad_norm": 0.28612980246543884, + "learning_rate": 0.00022103317407973837, + "loss": 0.8465, + "step": 9855 + }, + { + "epoch": 1.19, + "grad_norm": 0.20760983228683472, + "learning_rate": 0.00022094252311766929, + "loss": 0.9207, + "step": 9860 + }, + { + "epoch": 1.19, + "grad_norm": 0.23730003833770752, + "learning_rate": 0.00022085183876769293, + "loss": 0.9247, + "step": 9865 + }, + { + "epoch": 1.19, + "grad_norm": 0.2215205430984497, + "learning_rate": 0.00022076112107248833, + "loss": 0.901, + "step": 9870 + }, + { + "epoch": 1.19, + "grad_norm": 0.2527283728122711, + "learning_rate": 0.00022067037007475026, + "loss": 0.9053, + "step": 9875 + }, + { + "epoch": 1.19, + "grad_norm": 0.20407399535179138, + "learning_rate": 0.00022057958581718915, + "loss": 0.9464, + "step": 9880 + }, + { + "epoch": 1.19, + "grad_norm": 0.23334871232509613, + "learning_rate": 0.00022048876834253103, + "loss": 0.8626, + "step": 9885 + }, + { + "epoch": 1.19, + "grad_norm": 0.22306807339191437, + "learning_rate": 0.00022039791769351772, + "loss": 0.8706, + "step": 9890 + }, + { + "epoch": 1.19, + "grad_norm": 0.21715085208415985, + "learning_rate": 0.00022030703391290646, + "loss": 0.847, + "step": 9895 + }, + { + "epoch": 1.19, + "grad_norm": 0.24487926065921783, + "learning_rate": 0.00022021611704347026, + "loss": 0.9132, + "step": 9900 + }, + { + "epoch": 1.19, + "grad_norm": 0.24505330622196198, + "learning_rate": 0.00022012516712799756, + "loss": 0.9286, + "step": 9905 + }, + { + "epoch": 1.19, + "grad_norm": 0.2754554748535156, + "learning_rate": 0.00022003418420929243, + "loss": 0.885, + "step": 9910 + }, + { + "epoch": 1.19, + "grad_norm": 0.2651885449886322, + "learning_rate": 0.00021994316833017443, + "loss": 0.9331, + "step": 9915 + }, + { + "epoch": 1.2, + "grad_norm": 0.25558674335479736, + "learning_rate": 0.00021987033192416743, + "loss": 1.0044, + "step": 9920 + }, + { + "epoch": 1.2, + "grad_norm": 0.2470509111881256, + "learning_rate": 0.00021977925682426118, + "loss": 0.8667, + "step": 9925 + }, + { + "epoch": 1.2, + "grad_norm": 0.22508101165294647, + "learning_rate": 0.00021968814888391935, + "loss": 0.9218, + "step": 9930 + }, + { + "epoch": 1.2, + "grad_norm": 0.22299574315547943, + "learning_rate": 0.00021959700814602035, + "loss": 0.9041, + "step": 9935 + }, + { + "epoch": 1.2, + "grad_norm": 0.25101420283317566, + "learning_rate": 0.0002195058346534581, + "loss": 0.8797, + "step": 9940 + }, + { + "epoch": 1.2, + "grad_norm": 0.23478543758392334, + "learning_rate": 0.00021941462844914182, + "loss": 0.9141, + "step": 9945 + }, + { + "epoch": 1.2, + "grad_norm": 0.25883424282073975, + "learning_rate": 0.00021932338957599625, + "loss": 0.9015, + "step": 9950 + }, + { + "epoch": 1.2, + "grad_norm": 0.22861811518669128, + "learning_rate": 0.00021923211807696133, + "loss": 0.8923, + "step": 9955 + }, + { + "epoch": 1.2, + "grad_norm": 0.24520093202590942, + "learning_rate": 0.00021914081399499258, + "loss": 0.8958, + "step": 9960 + }, + { + "epoch": 1.2, + "grad_norm": 0.23438018560409546, + "learning_rate": 0.00021904947737306065, + "loss": 1.023, + "step": 9965 + }, + { + "epoch": 1.2, + "grad_norm": 0.24439287185668945, + "learning_rate": 0.0002189581082541516, + "loss": 0.9076, + "step": 9970 + }, + { + "epoch": 1.2, + "grad_norm": 0.23474149405956268, + "learning_rate": 0.00021886670668126674, + "loss": 0.8026, + "step": 9975 + }, + { + "epoch": 1.2, + "grad_norm": 0.25041329860687256, + "learning_rate": 0.00021877527269742277, + "loss": 0.9282, + "step": 9980 + }, + { + "epoch": 1.2, + "grad_norm": 0.2247781604528427, + "learning_rate": 0.00021868380634565147, + "loss": 0.907, + "step": 9985 + }, + { + "epoch": 1.2, + "grad_norm": 0.22819451987743378, + "learning_rate": 0.00021859230766900004, + "loss": 0.8245, + "step": 9990 + }, + { + "epoch": 1.2, + "grad_norm": 0.2385624200105667, + "learning_rate": 0.00021850077671053072, + "loss": 0.9177, + "step": 9995 + }, + { + "epoch": 1.2, + "grad_norm": 0.21760748326778412, + "learning_rate": 0.00021840921351332107, + "loss": 0.8566, + "step": 10000 + }, + { + "epoch": 1.21, + "grad_norm": 0.22546535730361938, + "learning_rate": 0.00021831761812046363, + "loss": 0.8671, + "step": 10005 + }, + { + "epoch": 1.21, + "grad_norm": 0.2704804539680481, + "learning_rate": 0.00021822599057506648, + "loss": 0.8348, + "step": 10010 + }, + { + "epoch": 1.21, + "grad_norm": 0.2751447558403015, + "learning_rate": 0.00021813433092025236, + "loss": 0.9228, + "step": 10015 + }, + { + "epoch": 1.21, + "grad_norm": 0.241837278008461, + "learning_rate": 0.00021804263919915947, + "loss": 0.9471, + "step": 10020 + }, + { + "epoch": 1.21, + "grad_norm": 0.24710319936275482, + "learning_rate": 0.00021795091545494092, + "loss": 0.959, + "step": 10025 + }, + { + "epoch": 1.21, + "grad_norm": 0.2651970684528351, + "learning_rate": 0.000217859159730765, + "loss": 0.8134, + "step": 10030 + }, + { + "epoch": 1.21, + "grad_norm": 0.2539055347442627, + "learning_rate": 0.00021776737206981498, + "loss": 0.9223, + "step": 10035 + }, + { + "epoch": 1.21, + "grad_norm": 0.31177031993865967, + "learning_rate": 0.0002176755525152892, + "loss": 0.9064, + "step": 10040 + }, + { + "epoch": 1.21, + "grad_norm": 0.22030460834503174, + "learning_rate": 0.00021758370111040094, + "loss": 0.8532, + "step": 10045 + }, + { + "epoch": 1.21, + "grad_norm": 0.2701607346534729, + "learning_rate": 0.00021749181789837858, + "loss": 0.8357, + "step": 10050 + }, + { + "epoch": 1.21, + "grad_norm": 0.24372340738773346, + "learning_rate": 0.00021739990292246535, + "loss": 0.929, + "step": 10055 + }, + { + "epoch": 1.21, + "grad_norm": 0.26125940680503845, + "learning_rate": 0.00021730795622591952, + "loss": 0.9623, + "step": 10060 + }, + { + "epoch": 1.21, + "grad_norm": 0.23417995870113373, + "learning_rate": 0.00021721597785201427, + "loss": 0.7909, + "step": 10065 + }, + { + "epoch": 1.21, + "grad_norm": 0.22926466166973114, + "learning_rate": 0.00021712396784403772, + "loss": 0.8785, + "step": 10070 + }, + { + "epoch": 1.21, + "grad_norm": 0.24475359916687012, + "learning_rate": 0.00021703192624529272, + "loss": 0.902, + "step": 10075 + }, + { + "epoch": 1.21, + "grad_norm": 0.2308359593153, + "learning_rate": 0.0002169398530990972, + "loss": 0.9083, + "step": 10080 + }, + { + "epoch": 1.22, + "grad_norm": 0.27559104561805725, + "learning_rate": 0.00021684774844878376, + "loss": 0.8948, + "step": 10085 + }, + { + "epoch": 1.22, + "grad_norm": 0.22297874093055725, + "learning_rate": 0.0002167556123377, + "loss": 0.8743, + "step": 10090 + }, + { + "epoch": 1.22, + "grad_norm": 0.25241953134536743, + "learning_rate": 0.00021666344480920818, + "loss": 0.8831, + "step": 10095 + }, + { + "epoch": 1.22, + "grad_norm": 0.2630932033061981, + "learning_rate": 0.0002165712459066854, + "loss": 0.8839, + "step": 10100 + }, + { + "epoch": 1.22, + "grad_norm": 0.24986648559570312, + "learning_rate": 0.00021647901567352357, + "loss": 0.9109, + "step": 10105 + }, + { + "epoch": 1.22, + "grad_norm": 0.24380378425121307, + "learning_rate": 0.00021638675415312924, + "loss": 0.9033, + "step": 10110 + }, + { + "epoch": 1.22, + "grad_norm": 0.22641584277153015, + "learning_rate": 0.00021629446138892377, + "loss": 0.9176, + "step": 10115 + }, + { + "epoch": 1.22, + "grad_norm": 0.2435508668422699, + "learning_rate": 0.0002162021374243432, + "loss": 0.8718, + "step": 10120 + }, + { + "epoch": 1.22, + "grad_norm": 0.25140294432640076, + "learning_rate": 0.00021610978230283823, + "loss": 0.9289, + "step": 10125 + }, + { + "epoch": 1.22, + "grad_norm": 0.23567943274974823, + "learning_rate": 0.0002160173960678743, + "loss": 0.8465, + "step": 10130 + }, + { + "epoch": 1.22, + "grad_norm": 0.2589253783226013, + "learning_rate": 0.00021592497876293137, + "loss": 0.9386, + "step": 10135 + }, + { + "epoch": 1.22, + "grad_norm": 0.2508923411369324, + "learning_rate": 0.0002158325304315042, + "loss": 0.8488, + "step": 10140 + }, + { + "epoch": 1.22, + "grad_norm": 0.2547130584716797, + "learning_rate": 0.00021574005111710192, + "loss": 0.8462, + "step": 10145 + }, + { + "epoch": 1.22, + "grad_norm": 0.22627414762973785, + "learning_rate": 0.00021564754086324844, + "loss": 0.8633, + "step": 10150 + }, + { + "epoch": 1.22, + "grad_norm": 0.21264758706092834, + "learning_rate": 0.00021555499971348215, + "loss": 0.8839, + "step": 10155 + }, + { + "epoch": 1.22, + "grad_norm": 0.21670518815517426, + "learning_rate": 0.00021546242771135597, + "loss": 0.8753, + "step": 10160 + }, + { + "epoch": 1.22, + "grad_norm": 0.21713611483573914, + "learning_rate": 0.0002153698249004374, + "loss": 0.838, + "step": 10165 + }, + { + "epoch": 1.23, + "grad_norm": 0.20740459859371185, + "learning_rate": 0.00021527719132430833, + "loss": 0.9157, + "step": 10170 + }, + { + "epoch": 1.23, + "grad_norm": 0.23185713589191437, + "learning_rate": 0.00021518452702656528, + "loss": 0.8678, + "step": 10175 + }, + { + "epoch": 1.23, + "grad_norm": 0.24864669144153595, + "learning_rate": 0.00021509183205081905, + "loss": 0.8134, + "step": 10180 + }, + { + "epoch": 1.23, + "grad_norm": 0.21126706898212433, + "learning_rate": 0.00021499910644069502, + "loss": 0.8509, + "step": 10185 + }, + { + "epoch": 1.23, + "grad_norm": 0.21727898716926575, + "learning_rate": 0.000214906350239833, + "loss": 0.8654, + "step": 10190 + }, + { + "epoch": 1.23, + "grad_norm": 0.2389363795518875, + "learning_rate": 0.00021481356349188705, + "loss": 0.8315, + "step": 10195 + }, + { + "epoch": 1.23, + "grad_norm": 0.22045078873634338, + "learning_rate": 0.00021472074624052573, + "loss": 0.851, + "step": 10200 + }, + { + "epoch": 1.23, + "grad_norm": 0.22910571098327637, + "learning_rate": 0.0002146278985294319, + "loss": 0.9826, + "step": 10205 + }, + { + "epoch": 1.23, + "grad_norm": 0.2421579509973526, + "learning_rate": 0.0002145350204023028, + "loss": 0.8577, + "step": 10210 + }, + { + "epoch": 1.23, + "grad_norm": 0.2479030340909958, + "learning_rate": 0.00021444211190285001, + "loss": 0.9059, + "step": 10215 + }, + { + "epoch": 1.23, + "grad_norm": 0.22180233895778656, + "learning_rate": 0.00021434917307479927, + "loss": 0.9248, + "step": 10220 + }, + { + "epoch": 1.23, + "grad_norm": 0.23422615230083466, + "learning_rate": 0.00021425620396189071, + "loss": 0.8011, + "step": 10225 + }, + { + "epoch": 1.23, + "grad_norm": 0.23456484079360962, + "learning_rate": 0.0002141632046078787, + "loss": 0.8772, + "step": 10230 + }, + { + "epoch": 1.23, + "grad_norm": 0.24434050917625427, + "learning_rate": 0.00021407017505653176, + "loss": 0.8983, + "step": 10235 + }, + { + "epoch": 1.23, + "grad_norm": 0.26233410835266113, + "learning_rate": 0.00021397711535163275, + "loss": 0.7669, + "step": 10240 + }, + { + "epoch": 1.23, + "grad_norm": 0.23086436092853546, + "learning_rate": 0.00021388402553697863, + "loss": 0.85, + "step": 10245 + }, + { + "epoch": 1.24, + "grad_norm": 0.2340136170387268, + "learning_rate": 0.00021379090565638064, + "loss": 0.8472, + "step": 10250 + }, + { + "epoch": 1.24, + "grad_norm": 0.2060890644788742, + "learning_rate": 0.00021369775575366397, + "loss": 0.791, + "step": 10255 + }, + { + "epoch": 1.24, + "grad_norm": 0.22005440294742584, + "learning_rate": 0.00021360457587266812, + "loss": 0.7792, + "step": 10260 + }, + { + "epoch": 1.24, + "grad_norm": 0.220989391207695, + "learning_rate": 0.00021351136605724658, + "loss": 0.9087, + "step": 10265 + }, + { + "epoch": 1.24, + "grad_norm": 0.20939388871192932, + "learning_rate": 0.00021341812635126706, + "loss": 0.8171, + "step": 10270 + }, + { + "epoch": 1.24, + "grad_norm": 0.22090451419353485, + "learning_rate": 0.00021332485679861123, + "loss": 0.831, + "step": 10275 + }, + { + "epoch": 1.24, + "grad_norm": 0.25128301978111267, + "learning_rate": 0.0002132315574431748, + "loss": 0.8665, + "step": 10280 + }, + { + "epoch": 1.24, + "grad_norm": 0.23706404864788055, + "learning_rate": 0.00021313822832886762, + "loss": 0.8653, + "step": 10285 + }, + { + "epoch": 1.24, + "grad_norm": 0.21622657775878906, + "learning_rate": 0.00021304486949961344, + "loss": 0.9074, + "step": 10290 + }, + { + "epoch": 1.24, + "grad_norm": 0.20798739790916443, + "learning_rate": 0.00021295148099935, + "loss": 0.9282, + "step": 10295 + }, + { + "epoch": 1.24, + "grad_norm": 0.226746067404747, + "learning_rate": 0.00021285806287202902, + "loss": 0.8985, + "step": 10300 + }, + { + "epoch": 1.24, + "grad_norm": 0.2492750883102417, + "learning_rate": 0.00021276461516161622, + "loss": 0.8985, + "step": 10305 + }, + { + "epoch": 1.24, + "grad_norm": 0.252853125333786, + "learning_rate": 0.0002126711379120912, + "loss": 0.9173, + "step": 10310 + }, + { + "epoch": 1.24, + "grad_norm": 0.23097646236419678, + "learning_rate": 0.00021257763116744744, + "loss": 0.9335, + "step": 10315 + }, + { + "epoch": 1.24, + "grad_norm": 0.25283390283584595, + "learning_rate": 0.0002124840949716923, + "loss": 0.9115, + "step": 10320 + }, + { + "epoch": 1.24, + "grad_norm": 0.2866900861263275, + "learning_rate": 0.00021239052936884703, + "loss": 0.9016, + "step": 10325 + }, + { + "epoch": 1.24, + "grad_norm": 0.23940923810005188, + "learning_rate": 0.0002122969344029467, + "loss": 0.8282, + "step": 10330 + }, + { + "epoch": 1.25, + "grad_norm": 0.26236215233802795, + "learning_rate": 0.00021220331011804022, + "loss": 0.8244, + "step": 10335 + }, + { + "epoch": 1.25, + "grad_norm": 0.23621851205825806, + "learning_rate": 0.00021210965655819031, + "loss": 0.8655, + "step": 10340 + }, + { + "epoch": 1.25, + "grad_norm": 0.21117086708545685, + "learning_rate": 0.00021201597376747344, + "loss": 0.9017, + "step": 10345 + }, + { + "epoch": 1.25, + "grad_norm": 0.2675582468509674, + "learning_rate": 0.00021192226178997977, + "loss": 0.8432, + "step": 10350 + }, + { + "epoch": 1.25, + "grad_norm": 0.2619032561779022, + "learning_rate": 0.0002118285206698134, + "loss": 0.8931, + "step": 10355 + }, + { + "epoch": 1.25, + "grad_norm": 0.2394731044769287, + "learning_rate": 0.0002117347504510919, + "loss": 0.8431, + "step": 10360 + }, + { + "epoch": 1.25, + "grad_norm": 0.22042964398860931, + "learning_rate": 0.00021164095117794674, + "loss": 0.8538, + "step": 10365 + }, + { + "epoch": 1.25, + "grad_norm": 0.22833536565303802, + "learning_rate": 0.00021154712289452285, + "loss": 0.9355, + "step": 10370 + }, + { + "epoch": 1.25, + "grad_norm": 0.24898435175418854, + "learning_rate": 0.00021145326564497903, + "loss": 0.9127, + "step": 10375 + }, + { + "epoch": 1.25, + "grad_norm": 0.2540437579154968, + "learning_rate": 0.00021135937947348757, + "loss": 0.9235, + "step": 10380 + }, + { + "epoch": 1.25, + "grad_norm": 0.26430103182792664, + "learning_rate": 0.00021126546442423453, + "loss": 0.8293, + "step": 10385 + }, + { + "epoch": 1.25, + "grad_norm": 0.24272888898849487, + "learning_rate": 0.0002111715205414193, + "loss": 0.9249, + "step": 10390 + }, + { + "epoch": 1.25, + "grad_norm": 0.2834715247154236, + "learning_rate": 0.00021107754786925512, + "loss": 0.8638, + "step": 10395 + }, + { + "epoch": 1.25, + "grad_norm": 0.23184886574745178, + "learning_rate": 0.0002109835464519685, + "loss": 0.8379, + "step": 10400 + }, + { + "epoch": 1.25, + "grad_norm": 0.2805626392364502, + "learning_rate": 0.00021088951633379982, + "loss": 0.9148, + "step": 10405 + }, + { + "epoch": 1.25, + "grad_norm": 0.2405596673488617, + "learning_rate": 0.0002107954575590026, + "loss": 0.9326, + "step": 10410 + }, + { + "epoch": 1.25, + "grad_norm": 0.21941906213760376, + "learning_rate": 0.00021070137017184415, + "loss": 0.8786, + "step": 10415 + }, + { + "epoch": 1.26, + "grad_norm": 0.24544595181941986, + "learning_rate": 0.000210607254216605, + "loss": 0.8619, + "step": 10420 + }, + { + "epoch": 1.26, + "grad_norm": 0.22791095077991486, + "learning_rate": 0.00021051310973757936, + "loss": 0.8595, + "step": 10425 + }, + { + "epoch": 1.26, + "grad_norm": 0.2604474127292633, + "learning_rate": 0.00021041893677907473, + "loss": 0.8809, + "step": 10430 + }, + { + "epoch": 1.26, + "grad_norm": 0.22478339076042175, + "learning_rate": 0.00021032473538541195, + "loss": 0.8569, + "step": 10435 + }, + { + "epoch": 1.26, + "grad_norm": 0.25225207209587097, + "learning_rate": 0.0002102305056009254, + "loss": 0.8429, + "step": 10440 + }, + { + "epoch": 1.26, + "grad_norm": 0.24379763007164001, + "learning_rate": 0.00021013624746996272, + "loss": 0.8249, + "step": 10445 + }, + { + "epoch": 1.26, + "grad_norm": 0.22271931171417236, + "learning_rate": 0.00021004196103688487, + "loss": 0.9641, + "step": 10450 + }, + { + "epoch": 1.26, + "grad_norm": 0.2445717453956604, + "learning_rate": 0.00020994764634606628, + "loss": 0.8815, + "step": 10455 + }, + { + "epoch": 1.26, + "grad_norm": 0.22217918932437897, + "learning_rate": 0.0002098533034418945, + "loss": 0.8908, + "step": 10460 + }, + { + "epoch": 1.26, + "grad_norm": 0.24231913685798645, + "learning_rate": 0.00020975893236877048, + "loss": 0.864, + "step": 10465 + }, + { + "epoch": 1.26, + "grad_norm": 0.23367081582546234, + "learning_rate": 0.0002096645331711083, + "loss": 0.8971, + "step": 10470 + }, + { + "epoch": 1.26, + "grad_norm": 0.2660883963108063, + "learning_rate": 0.00020957010589333546, + "loss": 0.8435, + "step": 10475 + }, + { + "epoch": 1.26, + "grad_norm": 0.2160717397928238, + "learning_rate": 0.00020947565057989249, + "loss": 0.8609, + "step": 10480 + }, + { + "epoch": 1.26, + "grad_norm": 0.26367098093032837, + "learning_rate": 0.00020938116727523324, + "loss": 0.9108, + "step": 10485 + }, + { + "epoch": 1.26, + "grad_norm": 0.23195737600326538, + "learning_rate": 0.0002092866560238247, + "loss": 0.8359, + "step": 10490 + }, + { + "epoch": 1.26, + "grad_norm": 0.2410704344511032, + "learning_rate": 0.00020919211687014697, + "loss": 0.8616, + "step": 10495 + }, + { + "epoch": 1.27, + "grad_norm": 0.23946736752986908, + "learning_rate": 0.00020909754985869335, + "loss": 0.7911, + "step": 10500 + }, + { + "epoch": 1.27, + "grad_norm": 0.23445159196853638, + "learning_rate": 0.0002090029550339702, + "loss": 0.8216, + "step": 10505 + }, + { + "epoch": 1.27, + "grad_norm": 0.2318773865699768, + "learning_rate": 0.00020890833244049695, + "loss": 0.7892, + "step": 10510 + }, + { + "epoch": 1.27, + "grad_norm": 0.22648349404335022, + "learning_rate": 0.0002088136821228062, + "loss": 0.9097, + "step": 10515 + }, + { + "epoch": 1.27, + "grad_norm": 0.23851384222507477, + "learning_rate": 0.00020871900412544345, + "loss": 0.8725, + "step": 10520 + }, + { + "epoch": 1.27, + "grad_norm": 0.22477130591869354, + "learning_rate": 0.00020862429849296743, + "loss": 0.8717, + "step": 10525 + }, + { + "epoch": 1.27, + "grad_norm": 0.22048017382621765, + "learning_rate": 0.00020852956526994963, + "loss": 0.8755, + "step": 10530 + }, + { + "epoch": 1.27, + "grad_norm": 0.2534314692020416, + "learning_rate": 0.00020843480450097473, + "loss": 0.8789, + "step": 10535 + }, + { + "epoch": 1.27, + "grad_norm": 0.2241785079240799, + "learning_rate": 0.0002083400162306403, + "loss": 0.818, + "step": 10540 + }, + { + "epoch": 1.27, + "grad_norm": 0.25126171112060547, + "learning_rate": 0.00020824520050355681, + "loss": 0.8867, + "step": 10545 + }, + { + "epoch": 1.27, + "grad_norm": 0.24423153698444366, + "learning_rate": 0.00020815035736434766, + "loss": 0.9067, + "step": 10550 + }, + { + "epoch": 1.27, + "grad_norm": 0.2558138072490692, + "learning_rate": 0.00020805548685764923, + "loss": 0.8161, + "step": 10555 + }, + { + "epoch": 1.27, + "grad_norm": 0.23357480764389038, + "learning_rate": 0.00020796058902811075, + "loss": 0.8864, + "step": 10560 + }, + { + "epoch": 1.27, + "grad_norm": 0.2539251446723938, + "learning_rate": 0.00020786566392039428, + "loss": 0.881, + "step": 10565 + }, + { + "epoch": 1.27, + "grad_norm": 0.2158096432685852, + "learning_rate": 0.00020777071157917468, + "loss": 0.8784, + "step": 10570 + }, + { + "epoch": 1.27, + "grad_norm": 0.22970278561115265, + "learning_rate": 0.0002076757320491397, + "loss": 0.9085, + "step": 10575 + }, + { + "epoch": 1.27, + "grad_norm": 0.2667364776134491, + "learning_rate": 0.00020758072537498985, + "loss": 0.896, + "step": 10580 + }, + { + "epoch": 1.28, + "grad_norm": 0.22878745198249817, + "learning_rate": 0.00020748569160143845, + "loss": 0.9321, + "step": 10585 + }, + { + "epoch": 1.28, + "grad_norm": 0.2596031725406647, + "learning_rate": 0.00020739063077321155, + "loss": 0.9034, + "step": 10590 + }, + { + "epoch": 1.28, + "grad_norm": 0.23091496527194977, + "learning_rate": 0.00020729554293504794, + "loss": 0.9254, + "step": 10595 + }, + { + "epoch": 1.28, + "grad_norm": 0.258047491312027, + "learning_rate": 0.00020720042813169906, + "loss": 0.7867, + "step": 10600 + }, + { + "epoch": 1.28, + "grad_norm": 0.23771882057189941, + "learning_rate": 0.00020710528640792916, + "loss": 0.8768, + "step": 10605 + }, + { + "epoch": 1.28, + "grad_norm": 0.22498424351215363, + "learning_rate": 0.0002070101178085151, + "loss": 0.9163, + "step": 10610 + }, + { + "epoch": 1.28, + "grad_norm": 0.2267744243144989, + "learning_rate": 0.0002069149223782463, + "loss": 0.872, + "step": 10615 + }, + { + "epoch": 1.28, + "grad_norm": 0.23287959396839142, + "learning_rate": 0.00020681970016192495, + "loss": 0.9396, + "step": 10620 + }, + { + "epoch": 1.28, + "grad_norm": 0.23463228344917297, + "learning_rate": 0.00020672445120436582, + "loss": 0.9393, + "step": 10625 + }, + { + "epoch": 1.28, + "grad_norm": 0.24994787573814392, + "learning_rate": 0.00020662917555039616, + "loss": 0.827, + "step": 10630 + }, + { + "epoch": 1.28, + "grad_norm": 0.24779953062534332, + "learning_rate": 0.00020653387324485588, + "loss": 0.9111, + "step": 10635 + }, + { + "epoch": 1.28, + "grad_norm": 0.27712857723236084, + "learning_rate": 0.00020643854433259742, + "loss": 0.9207, + "step": 10640 + }, + { + "epoch": 1.28, + "grad_norm": 0.25297531485557556, + "learning_rate": 0.0002063431888584858, + "loss": 0.7711, + "step": 10645 + }, + { + "epoch": 1.28, + "grad_norm": 0.22088488936424255, + "learning_rate": 0.0002062478068673983, + "loss": 0.8474, + "step": 10650 + }, + { + "epoch": 1.28, + "grad_norm": 0.2587786018848419, + "learning_rate": 0.00020615239840422506, + "loss": 0.9111, + "step": 10655 + }, + { + "epoch": 1.28, + "grad_norm": 0.2736605107784271, + "learning_rate": 0.00020605696351386828, + "loss": 0.9088, + "step": 10660 + }, + { + "epoch": 1.29, + "grad_norm": 0.275921493768692, + "learning_rate": 0.00020596150224124293, + "loss": 0.8737, + "step": 10665 + }, + { + "epoch": 1.29, + "grad_norm": 0.24181650578975677, + "learning_rate": 0.00020586601463127611, + "loss": 0.9555, + "step": 10670 + }, + { + "epoch": 1.29, + "grad_norm": 0.25569936633110046, + "learning_rate": 0.00020577050072890764, + "loss": 0.9392, + "step": 10675 + }, + { + "epoch": 1.29, + "grad_norm": 0.22888949513435364, + "learning_rate": 0.00020567496057908948, + "loss": 0.8406, + "step": 10680 + }, + { + "epoch": 1.29, + "grad_norm": 0.22984679043293, + "learning_rate": 0.0002055793942267859, + "loss": 0.7473, + "step": 10685 + }, + { + "epoch": 1.29, + "grad_norm": 0.21859197318553925, + "learning_rate": 0.00020548380171697366, + "loss": 0.9393, + "step": 10690 + }, + { + "epoch": 1.29, + "grad_norm": 0.23983286321163177, + "learning_rate": 0.00020538818309464178, + "loss": 0.7875, + "step": 10695 + }, + { + "epoch": 1.29, + "grad_norm": 0.23376217484474182, + "learning_rate": 0.00020529253840479155, + "loss": 0.9696, + "step": 10700 + }, + { + "epoch": 1.29, + "grad_norm": 0.2256166785955429, + "learning_rate": 0.00020519686769243653, + "loss": 0.9457, + "step": 10705 + }, + { + "epoch": 1.29, + "grad_norm": 0.24345454573631287, + "learning_rate": 0.00020510117100260255, + "loss": 0.77, + "step": 10710 + }, + { + "epoch": 1.29, + "grad_norm": 0.2544359862804413, + "learning_rate": 0.00020500544838032765, + "loss": 0.895, + "step": 10715 + }, + { + "epoch": 1.29, + "grad_norm": 0.2471429854631424, + "learning_rate": 0.00020490969987066207, + "loss": 0.9008, + "step": 10720 + }, + { + "epoch": 1.29, + "grad_norm": 0.22353385388851166, + "learning_rate": 0.00020481392551866827, + "loss": 0.9069, + "step": 10725 + }, + { + "epoch": 1.29, + "grad_norm": 0.24593974649906158, + "learning_rate": 0.00020471812536942074, + "loss": 1.057, + "step": 10730 + }, + { + "epoch": 1.29, + "grad_norm": 0.2564097046852112, + "learning_rate": 0.00020462229946800634, + "loss": 0.8346, + "step": 10735 + }, + { + "epoch": 1.29, + "grad_norm": 0.22804971039295197, + "learning_rate": 0.0002045264478595238, + "loss": 0.8141, + "step": 10740 + }, + { + "epoch": 1.29, + "grad_norm": 0.25203844904899597, + "learning_rate": 0.00020443057058908417, + "loss": 0.8955, + "step": 10745 + }, + { + "epoch": 1.3, + "grad_norm": 0.24008525907993317, + "learning_rate": 0.0002043346677018104, + "loss": 0.8148, + "step": 10750 + }, + { + "epoch": 1.3, + "grad_norm": 0.27215099334716797, + "learning_rate": 0.00020423873924283763, + "loss": 0.8765, + "step": 10755 + }, + { + "epoch": 1.3, + "grad_norm": 0.2237570732831955, + "learning_rate": 0.000204142785257313, + "loss": 0.9221, + "step": 10760 + }, + { + "epoch": 1.3, + "grad_norm": 0.24160721898078918, + "learning_rate": 0.00020404680579039558, + "loss": 0.8637, + "step": 10765 + }, + { + "epoch": 1.3, + "grad_norm": 0.23897403478622437, + "learning_rate": 0.0002039508008872565, + "loss": 0.7704, + "step": 10770 + }, + { + "epoch": 1.3, + "grad_norm": 0.23746059834957123, + "learning_rate": 0.00020385477059307885, + "loss": 0.8945, + "step": 10775 + }, + { + "epoch": 1.3, + "grad_norm": 0.23488637804985046, + "learning_rate": 0.0002037587149530577, + "loss": 0.8504, + "step": 10780 + }, + { + "epoch": 1.3, + "grad_norm": 0.26902300119400024, + "learning_rate": 0.00020366263401240005, + "loss": 0.8331, + "step": 10785 + }, + { + "epoch": 1.3, + "grad_norm": 0.25453630089759827, + "learning_rate": 0.00020356652781632477, + "loss": 0.913, + "step": 10790 + }, + { + "epoch": 1.3, + "grad_norm": 0.23822472989559174, + "learning_rate": 0.00020347039641006257, + "loss": 0.8158, + "step": 10795 + }, + { + "epoch": 1.3, + "grad_norm": 0.25680580735206604, + "learning_rate": 0.00020337423983885617, + "loss": 0.7857, + "step": 10800 + }, + { + "epoch": 1.3, + "grad_norm": 0.2617832124233246, + "learning_rate": 0.00020327805814795993, + "loss": 0.9449, + "step": 10805 + }, + { + "epoch": 1.3, + "grad_norm": 0.23507827520370483, + "learning_rate": 0.00020318185138264026, + "loss": 0.9813, + "step": 10810 + }, + { + "epoch": 1.3, + "grad_norm": 0.2346469610929489, + "learning_rate": 0.00020308561958817518, + "loss": 0.8555, + "step": 10815 + }, + { + "epoch": 1.3, + "grad_norm": 0.2674165666103363, + "learning_rate": 0.00020298936280985466, + "loss": 0.9193, + "step": 10820 + }, + { + "epoch": 1.3, + "grad_norm": 0.21533791720867157, + "learning_rate": 0.00020289308109298028, + "loss": 0.8939, + "step": 10825 + }, + { + "epoch": 1.3, + "grad_norm": 0.23552724719047546, + "learning_rate": 0.0002027967744828654, + "loss": 0.8654, + "step": 10830 + }, + { + "epoch": 1.31, + "grad_norm": 0.26331043243408203, + "learning_rate": 0.00020270044302483505, + "loss": 0.801, + "step": 10835 + }, + { + "epoch": 1.31, + "grad_norm": 0.240268275141716, + "learning_rate": 0.00020260408676422615, + "loss": 0.7792, + "step": 10840 + }, + { + "epoch": 1.31, + "grad_norm": 0.23092767596244812, + "learning_rate": 0.00020250770574638708, + "loss": 0.9452, + "step": 10845 + }, + { + "epoch": 1.31, + "grad_norm": 0.2468583583831787, + "learning_rate": 0.00020241130001667797, + "loss": 0.8753, + "step": 10850 + }, + { + "epoch": 1.31, + "grad_norm": 0.27682459354400635, + "learning_rate": 0.0002023148696204705, + "loss": 0.8244, + "step": 10855 + }, + { + "epoch": 1.31, + "grad_norm": 0.27396732568740845, + "learning_rate": 0.00020221841460314814, + "loss": 0.8651, + "step": 10860 + }, + { + "epoch": 1.31, + "grad_norm": 0.24095529317855835, + "learning_rate": 0.00020212193501010573, + "loss": 0.8664, + "step": 10865 + }, + { + "epoch": 1.31, + "grad_norm": 0.24401700496673584, + "learning_rate": 0.00020202543088674977, + "loss": 0.8406, + "step": 10870 + }, + { + "epoch": 1.31, + "grad_norm": 0.22764812409877777, + "learning_rate": 0.0002019289022784983, + "loss": 0.9253, + "step": 10875 + }, + { + "epoch": 1.31, + "grad_norm": 0.2579391598701477, + "learning_rate": 0.00020183234923078092, + "loss": 0.9783, + "step": 10880 + }, + { + "epoch": 1.31, + "grad_norm": 0.23062071204185486, + "learning_rate": 0.0002017357717890387, + "loss": 0.8766, + "step": 10885 + }, + { + "epoch": 1.31, + "grad_norm": 0.23495079576969147, + "learning_rate": 0.00020163916999872418, + "loss": 0.8313, + "step": 10890 + }, + { + "epoch": 1.31, + "grad_norm": 0.2631068527698517, + "learning_rate": 0.00020154254390530142, + "loss": 0.9304, + "step": 10895 + }, + { + "epoch": 1.31, + "grad_norm": 0.2535402178764343, + "learning_rate": 0.00020144589355424578, + "loss": 0.8889, + "step": 10900 + }, + { + "epoch": 1.31, + "grad_norm": 0.2067861258983612, + "learning_rate": 0.00020134921899104416, + "loss": 0.8527, + "step": 10905 + }, + { + "epoch": 1.31, + "grad_norm": 0.2404996007680893, + "learning_rate": 0.00020125252026119487, + "loss": 0.8413, + "step": 10910 + }, + { + "epoch": 1.32, + "grad_norm": 0.27104899287223816, + "learning_rate": 0.00020115579741020745, + "loss": 0.8226, + "step": 10915 + }, + { + "epoch": 1.32, + "grad_norm": 0.21938621997833252, + "learning_rate": 0.000201059050483603, + "loss": 0.85, + "step": 10920 + }, + { + "epoch": 1.32, + "grad_norm": 0.2280419021844864, + "learning_rate": 0.00020096227952691366, + "loss": 0.8882, + "step": 10925 + }, + { + "epoch": 1.32, + "grad_norm": 0.23701699078083038, + "learning_rate": 0.00020086548458568326, + "loss": 0.8571, + "step": 10930 + }, + { + "epoch": 1.32, + "grad_norm": 0.22839820384979248, + "learning_rate": 0.00020076866570546662, + "loss": 0.797, + "step": 10935 + }, + { + "epoch": 1.32, + "grad_norm": 0.23596982657909393, + "learning_rate": 0.00020067182293182994, + "loss": 0.9124, + "step": 10940 + }, + { + "epoch": 1.32, + "grad_norm": 0.22004413604736328, + "learning_rate": 0.00020057495631035056, + "loss": 0.9069, + "step": 10945 + }, + { + "epoch": 1.32, + "grad_norm": 0.23677165806293488, + "learning_rate": 0.00020047806588661726, + "loss": 0.8417, + "step": 10950 + }, + { + "epoch": 1.32, + "grad_norm": 0.2435624748468399, + "learning_rate": 0.00020038115170622982, + "loss": 0.7833, + "step": 10955 + }, + { + "epoch": 1.32, + "grad_norm": 0.2496495395898819, + "learning_rate": 0.00020028421381479926, + "loss": 0.8015, + "step": 10960 + }, + { + "epoch": 1.32, + "grad_norm": 0.23711077868938446, + "learning_rate": 0.0002001872522579478, + "loss": 0.8105, + "step": 10965 + }, + { + "epoch": 1.32, + "grad_norm": 0.23245902359485626, + "learning_rate": 0.0002000902670813088, + "loss": 0.973, + "step": 10970 + }, + { + "epoch": 1.32, + "grad_norm": 0.23252607882022858, + "learning_rate": 0.0001999932583305266, + "loss": 0.8141, + "step": 10975 + }, + { + "epoch": 1.32, + "grad_norm": 0.22429288923740387, + "learning_rate": 0.00019989622605125684, + "loss": 0.9489, + "step": 10980 + }, + { + "epoch": 1.32, + "grad_norm": 0.2248774915933609, + "learning_rate": 0.00019979917028916606, + "loss": 0.9385, + "step": 10985 + }, + { + "epoch": 1.32, + "grad_norm": 0.2719941735267639, + "learning_rate": 0.00019970209108993202, + "loss": 0.8649, + "step": 10990 + }, + { + "epoch": 1.32, + "grad_norm": 0.2678004503250122, + "learning_rate": 0.0001996049884992433, + "loss": 0.8, + "step": 10995 + }, + { + "epoch": 1.33, + "grad_norm": 0.2112882286310196, + "learning_rate": 0.0001995078625627997, + "loss": 0.8478, + "step": 11000 + }, + { + "epoch": 1.33, + "grad_norm": 0.2869125306606293, + "learning_rate": 0.00019941071332631188, + "loss": 0.8296, + "step": 11005 + }, + { + "epoch": 1.33, + "grad_norm": 0.21323542296886444, + "learning_rate": 0.00019931354083550147, + "loss": 0.8881, + "step": 11010 + }, + { + "epoch": 1.33, + "grad_norm": 0.23022828996181488, + "learning_rate": 0.0001992163451361011, + "loss": 0.8604, + "step": 11015 + }, + { + "epoch": 1.33, + "grad_norm": 0.23515096306800842, + "learning_rate": 0.00019911912627385426, + "loss": 0.9111, + "step": 11020 + }, + { + "epoch": 1.33, + "grad_norm": 0.2547999918460846, + "learning_rate": 0.00019902188429451542, + "loss": 0.881, + "step": 11025 + }, + { + "epoch": 1.33, + "grad_norm": 0.28279218077659607, + "learning_rate": 0.0001989246192438499, + "loss": 0.908, + "step": 11030 + }, + { + "epoch": 1.33, + "grad_norm": 0.24193325638771057, + "learning_rate": 0.00019882733116763376, + "loss": 0.8769, + "step": 11035 + }, + { + "epoch": 1.33, + "grad_norm": 0.2332753986120224, + "learning_rate": 0.00019873002011165409, + "loss": 0.803, + "step": 11040 + }, + { + "epoch": 1.33, + "grad_norm": 0.24493838846683502, + "learning_rate": 0.00019863268612170873, + "loss": 0.825, + "step": 11045 + }, + { + "epoch": 1.33, + "grad_norm": 0.22939081490039825, + "learning_rate": 0.00019853532924360618, + "loss": 0.9395, + "step": 11050 + }, + { + "epoch": 1.33, + "grad_norm": 0.24580281972885132, + "learning_rate": 0.0001984379495231659, + "loss": 0.8964, + "step": 11055 + }, + { + "epoch": 1.33, + "grad_norm": 0.22950120270252228, + "learning_rate": 0.00019834054700621802, + "loss": 0.8883, + "step": 11060 + }, + { + "epoch": 1.33, + "grad_norm": 0.2450743019580841, + "learning_rate": 0.00019824312173860332, + "loss": 0.8721, + "step": 11065 + }, + { + "epoch": 1.33, + "grad_norm": 0.2725335955619812, + "learning_rate": 0.00019814567376617346, + "loss": 0.8063, + "step": 11070 + }, + { + "epoch": 1.33, + "grad_norm": 0.27576860785484314, + "learning_rate": 0.00019804820313479066, + "loss": 0.9648, + "step": 11075 + }, + { + "epoch": 1.34, + "grad_norm": 0.23889948427677155, + "learning_rate": 0.00019795070989032788, + "loss": 0.8958, + "step": 11080 + }, + { + "epoch": 1.34, + "grad_norm": 0.25442391633987427, + "learning_rate": 0.00019785319407866853, + "loss": 0.838, + "step": 11085 + }, + { + "epoch": 1.34, + "grad_norm": 0.26583176851272583, + "learning_rate": 0.00019775565574570698, + "loss": 0.8578, + "step": 11090 + }, + { + "epoch": 1.34, + "grad_norm": 0.25874045491218567, + "learning_rate": 0.00019765809493734786, + "loss": 0.8606, + "step": 11095 + }, + { + "epoch": 1.34, + "grad_norm": 0.24192367494106293, + "learning_rate": 0.00019756051169950663, + "loss": 0.7974, + "step": 11100 + }, + { + "epoch": 1.34, + "grad_norm": 0.23584434390068054, + "learning_rate": 0.0001974629060781091, + "loss": 0.8465, + "step": 11105 + }, + { + "epoch": 1.34, + "grad_norm": 0.28704845905303955, + "learning_rate": 0.00019736527811909185, + "loss": 0.9114, + "step": 11110 + }, + { + "epoch": 1.34, + "grad_norm": 0.21719807386398315, + "learning_rate": 0.00019726762786840177, + "loss": 0.9526, + "step": 11115 + }, + { + "epoch": 1.34, + "grad_norm": 0.23290511965751648, + "learning_rate": 0.00019716995537199624, + "loss": 0.8695, + "step": 11120 + }, + { + "epoch": 1.34, + "grad_norm": 0.26695218682289124, + "learning_rate": 0.00019707226067584326, + "loss": 0.8904, + "step": 11125 + }, + { + "epoch": 1.34, + "grad_norm": 0.225587397813797, + "learning_rate": 0.0001969745438259212, + "loss": 0.7754, + "step": 11130 + }, + { + "epoch": 1.34, + "grad_norm": 0.2279675006866455, + "learning_rate": 0.00019687680486821883, + "loss": 0.9336, + "step": 11135 + }, + { + "epoch": 1.34, + "grad_norm": 0.2307017594575882, + "learning_rate": 0.00019677904384873536, + "loss": 0.8192, + "step": 11140 + }, + { + "epoch": 1.34, + "grad_norm": 0.2410012036561966, + "learning_rate": 0.00019668126081348036, + "loss": 0.8694, + "step": 11145 + }, + { + "epoch": 1.34, + "grad_norm": 0.26555535197257996, + "learning_rate": 0.00019658345580847382, + "loss": 0.8598, + "step": 11150 + }, + { + "epoch": 1.34, + "grad_norm": 0.225142240524292, + "learning_rate": 0.00019648562887974598, + "loss": 0.7988, + "step": 11155 + }, + { + "epoch": 1.34, + "grad_norm": 0.2711564600467682, + "learning_rate": 0.00019638778007333743, + "loss": 0.8018, + "step": 11160 + }, + { + "epoch": 1.35, + "grad_norm": 0.2621377408504486, + "learning_rate": 0.00019628990943529909, + "loss": 0.8682, + "step": 11165 + }, + { + "epoch": 1.35, + "grad_norm": 0.23893457651138306, + "learning_rate": 0.00019619201701169217, + "loss": 0.8165, + "step": 11170 + }, + { + "epoch": 1.35, + "grad_norm": 0.2528732419013977, + "learning_rate": 0.00019609410284858797, + "loss": 0.8636, + "step": 11175 + }, + { + "epoch": 1.35, + "grad_norm": 0.24171330034732819, + "learning_rate": 0.0001959961669920683, + "loss": 0.9089, + "step": 11180 + }, + { + "epoch": 1.35, + "grad_norm": 0.24632695317268372, + "learning_rate": 0.00019589820948822493, + "loss": 0.92, + "step": 11185 + }, + { + "epoch": 1.35, + "grad_norm": 0.2687559723854065, + "learning_rate": 0.00019580023038316, + "loss": 0.8794, + "step": 11190 + }, + { + "epoch": 1.35, + "grad_norm": 0.2685549259185791, + "learning_rate": 0.0001957022297229856, + "loss": 0.8629, + "step": 11195 + }, + { + "epoch": 1.35, + "grad_norm": 0.23067381978034973, + "learning_rate": 0.00019560420755382416, + "loss": 0.836, + "step": 11200 + }, + { + "epoch": 1.35, + "grad_norm": 0.2574182450771332, + "learning_rate": 0.0001955061639218082, + "loss": 0.8133, + "step": 11205 + }, + { + "epoch": 1.35, + "grad_norm": 0.2490427941083908, + "learning_rate": 0.00019540809887308032, + "loss": 0.7957, + "step": 11210 + }, + { + "epoch": 1.35, + "grad_norm": 0.24531954526901245, + "learning_rate": 0.00019531001245379312, + "loss": 0.9497, + "step": 11215 + }, + { + "epoch": 1.35, + "grad_norm": 0.2284311205148697, + "learning_rate": 0.00019521190471010936, + "loss": 0.8048, + "step": 11220 + }, + { + "epoch": 1.35, + "grad_norm": 0.23735016584396362, + "learning_rate": 0.00019511377568820184, + "loss": 0.7849, + "step": 11225 + }, + { + "epoch": 1.35, + "grad_norm": 0.24056896567344666, + "learning_rate": 0.00019501562543425329, + "loss": 0.8545, + "step": 11230 + }, + { + "epoch": 1.35, + "grad_norm": 0.2434719055891037, + "learning_rate": 0.00019491745399445644, + "loss": 0.8467, + "step": 11235 + }, + { + "epoch": 1.35, + "grad_norm": 0.24002546072006226, + "learning_rate": 0.0001948192614150141, + "loss": 0.8546, + "step": 11240 + }, + { + "epoch": 1.35, + "grad_norm": 0.2705593705177307, + "learning_rate": 0.00019472104774213893, + "loss": 0.9254, + "step": 11245 + }, + { + "epoch": 1.36, + "grad_norm": 0.2447061389684677, + "learning_rate": 0.00019462281302205355, + "loss": 0.946, + "step": 11250 + }, + { + "epoch": 1.36, + "grad_norm": 0.24118636548519135, + "learning_rate": 0.0001945245573009905, + "loss": 0.8431, + "step": 11255 + }, + { + "epoch": 1.36, + "grad_norm": 0.22377237677574158, + "learning_rate": 0.0001944262806251922, + "loss": 0.9679, + "step": 11260 + }, + { + "epoch": 1.36, + "grad_norm": 0.24068385362625122, + "learning_rate": 0.00019432798304091085, + "loss": 0.928, + "step": 11265 + }, + { + "epoch": 1.36, + "grad_norm": 0.2337529957294464, + "learning_rate": 0.0001942296645944086, + "loss": 0.9368, + "step": 11270 + }, + { + "epoch": 1.36, + "grad_norm": 0.23204030096530914, + "learning_rate": 0.00019413132533195737, + "loss": 0.9575, + "step": 11275 + }, + { + "epoch": 1.36, + "grad_norm": 0.23348142206668854, + "learning_rate": 0.00019403296529983888, + "loss": 0.8327, + "step": 11280 + }, + { + "epoch": 1.36, + "grad_norm": 0.22873573005199432, + "learning_rate": 0.00019393458454434464, + "loss": 0.7914, + "step": 11285 + }, + { + "epoch": 1.36, + "grad_norm": 0.2327519953250885, + "learning_rate": 0.00019383618311177587, + "loss": 0.8401, + "step": 11290 + }, + { + "epoch": 1.36, + "grad_norm": 0.2520465850830078, + "learning_rate": 0.00019373776104844362, + "loss": 0.8922, + "step": 11295 + }, + { + "epoch": 1.36, + "grad_norm": 0.2645134925842285, + "learning_rate": 0.00019363931840066847, + "loss": 0.8806, + "step": 11300 + }, + { + "epoch": 1.36, + "grad_norm": 0.2500244677066803, + "learning_rate": 0.00019354085521478088, + "loss": 0.9306, + "step": 11305 + }, + { + "epoch": 1.36, + "grad_norm": 0.2344416230916977, + "learning_rate": 0.0001934423715371209, + "loss": 0.9232, + "step": 11310 + }, + { + "epoch": 1.36, + "grad_norm": 0.2218599170446396, + "learning_rate": 0.0001933438674140382, + "loss": 0.8308, + "step": 11315 + }, + { + "epoch": 1.36, + "grad_norm": 0.25029993057250977, + "learning_rate": 0.00019324534289189203, + "loss": 0.861, + "step": 11320 + }, + { + "epoch": 1.36, + "grad_norm": 0.25338882207870483, + "learning_rate": 0.00019314679801705144, + "loss": 0.8986, + "step": 11325 + }, + { + "epoch": 1.37, + "grad_norm": 0.22876478731632233, + "learning_rate": 0.00019304823283589482, + "loss": 1.0461, + "step": 11330 + }, + { + "epoch": 1.37, + "grad_norm": 0.2699691355228424, + "learning_rate": 0.00019294964739481024, + "loss": 0.8773, + "step": 11335 + }, + { + "epoch": 1.37, + "grad_norm": 0.23028436303138733, + "learning_rate": 0.00019285104174019527, + "loss": 0.9431, + "step": 11340 + }, + { + "epoch": 1.37, + "grad_norm": 0.21594227850437164, + "learning_rate": 0.00019275241591845704, + "loss": 0.904, + "step": 11345 + }, + { + "epoch": 1.37, + "grad_norm": 0.25375184416770935, + "learning_rate": 0.00019265376997601205, + "loss": 0.9063, + "step": 11350 + }, + { + "epoch": 1.37, + "grad_norm": 0.24144120514392853, + "learning_rate": 0.0001925551039592865, + "loss": 0.8952, + "step": 11355 + }, + { + "epoch": 1.37, + "grad_norm": 0.23259896039962769, + "learning_rate": 0.00019245641791471577, + "loss": 0.9056, + "step": 11360 + }, + { + "epoch": 1.37, + "grad_norm": 0.26159223914146423, + "learning_rate": 0.00019235771188874485, + "loss": 0.8856, + "step": 11365 + }, + { + "epoch": 1.37, + "grad_norm": 0.22818419337272644, + "learning_rate": 0.0001922589859278281, + "loss": 0.8179, + "step": 11370 + }, + { + "epoch": 1.37, + "grad_norm": 0.24243588745594025, + "learning_rate": 0.00019216024007842915, + "loss": 0.8159, + "step": 11375 + }, + { + "epoch": 1.37, + "grad_norm": 0.22796067595481873, + "learning_rate": 0.00019206147438702108, + "loss": 0.9193, + "step": 11380 + }, + { + "epoch": 1.37, + "grad_norm": 0.25152865052223206, + "learning_rate": 0.00019196268890008642, + "loss": 0.938, + "step": 11385 + }, + { + "epoch": 1.37, + "grad_norm": 0.22121083736419678, + "learning_rate": 0.0001918638836641168, + "loss": 0.82, + "step": 11390 + }, + { + "epoch": 1.37, + "grad_norm": 0.22968332469463348, + "learning_rate": 0.00019176505872561326, + "loss": 0.9606, + "step": 11395 + }, + { + "epoch": 1.37, + "grad_norm": 0.2671717405319214, + "learning_rate": 0.00019166621413108613, + "loss": 0.9147, + "step": 11400 + }, + { + "epoch": 1.37, + "grad_norm": 0.24766571819782257, + "learning_rate": 0.00019156734992705496, + "loss": 0.8639, + "step": 11405 + }, + { + "epoch": 1.37, + "grad_norm": 0.244247704744339, + "learning_rate": 0.00019146846616004842, + "loss": 0.7505, + "step": 11410 + }, + { + "epoch": 1.38, + "grad_norm": 0.23957106471061707, + "learning_rate": 0.00019136956287660464, + "loss": 0.8708, + "step": 11415 + }, + { + "epoch": 1.38, + "grad_norm": 0.2419036477804184, + "learning_rate": 0.00019127064012327072, + "loss": 0.9644, + "step": 11420 + }, + { + "epoch": 1.38, + "grad_norm": 0.2512461245059967, + "learning_rate": 0.000191171697946603, + "loss": 0.8277, + "step": 11425 + }, + { + "epoch": 1.38, + "grad_norm": 0.24636131525039673, + "learning_rate": 0.00019107273639316696, + "loss": 0.877, + "step": 11430 + }, + { + "epoch": 1.38, + "grad_norm": 0.21014370024204254, + "learning_rate": 0.0001909737555095372, + "loss": 0.8611, + "step": 11435 + }, + { + "epoch": 1.38, + "grad_norm": 0.2295524775981903, + "learning_rate": 0.0001908747553422974, + "loss": 0.9243, + "step": 11440 + }, + { + "epoch": 1.38, + "grad_norm": 0.24215233325958252, + "learning_rate": 0.0001907757359380403, + "loss": 0.9948, + "step": 11445 + }, + { + "epoch": 1.38, + "grad_norm": 0.2857617735862732, + "learning_rate": 0.00019067669734336776, + "loss": 0.8859, + "step": 11450 + }, + { + "epoch": 1.38, + "grad_norm": 0.25199002027511597, + "learning_rate": 0.00019057763960489063, + "loss": 0.9734, + "step": 11455 + }, + { + "epoch": 1.38, + "grad_norm": 0.2548152506351471, + "learning_rate": 0.00019047856276922873, + "loss": 0.8183, + "step": 11460 + }, + { + "epoch": 1.38, + "grad_norm": 0.23145854473114014, + "learning_rate": 0.00019037946688301097, + "loss": 0.9237, + "step": 11465 + }, + { + "epoch": 1.38, + "grad_norm": 0.24691297113895416, + "learning_rate": 0.00019028035199287512, + "loss": 0.8724, + "step": 11470 + }, + { + "epoch": 1.38, + "grad_norm": 0.2465231567621231, + "learning_rate": 0.00019018121814546799, + "loss": 0.8638, + "step": 11475 + }, + { + "epoch": 1.38, + "grad_norm": 0.2617138922214508, + "learning_rate": 0.00019008206538744516, + "loss": 0.9152, + "step": 11480 + }, + { + "epoch": 1.38, + "grad_norm": 0.23858489096164703, + "learning_rate": 0.00018998289376547135, + "loss": 0.7177, + "step": 11485 + }, + { + "epoch": 1.38, + "grad_norm": 0.2825676202774048, + "learning_rate": 0.00018988370332621987, + "loss": 0.9155, + "step": 11490 + }, + { + "epoch": 1.39, + "grad_norm": 0.21846085786819458, + "learning_rate": 0.00018978449411637317, + "loss": 0.8946, + "step": 11495 + }, + { + "epoch": 1.39, + "grad_norm": 0.23341843485832214, + "learning_rate": 0.00018968526618262226, + "loss": 0.833, + "step": 11500 + }, + { + "epoch": 1.39, + "grad_norm": 0.24220559000968933, + "learning_rate": 0.00018958601957166725, + "loss": 0.8047, + "step": 11505 + }, + { + "epoch": 1.39, + "grad_norm": 0.2510910630226135, + "learning_rate": 0.0001894867543302168, + "loss": 0.8861, + "step": 11510 + }, + { + "epoch": 1.39, + "grad_norm": 0.24948173761367798, + "learning_rate": 0.00018938747050498847, + "loss": 0.8644, + "step": 11515 + }, + { + "epoch": 1.39, + "grad_norm": 0.25031885504722595, + "learning_rate": 0.00018928816814270844, + "loss": 0.8713, + "step": 11520 + }, + { + "epoch": 1.39, + "grad_norm": 0.2391744703054428, + "learning_rate": 0.0001891888472901118, + "loss": 0.8059, + "step": 11525 + }, + { + "epoch": 1.39, + "grad_norm": 0.26208725571632385, + "learning_rate": 0.0001890895079939422, + "loss": 0.8532, + "step": 11530 + }, + { + "epoch": 1.39, + "grad_norm": 0.2613973021507263, + "learning_rate": 0.00018899015030095201, + "loss": 0.8387, + "step": 11535 + }, + { + "epoch": 1.39, + "grad_norm": 0.2578382194042206, + "learning_rate": 0.00018889077425790225, + "loss": 0.9407, + "step": 11540 + }, + { + "epoch": 1.39, + "grad_norm": 0.22165289521217346, + "learning_rate": 0.00018879137991156263, + "loss": 0.817, + "step": 11545 + }, + { + "epoch": 1.39, + "grad_norm": 0.2351749688386917, + "learning_rate": 0.0001886919673087114, + "loss": 0.8507, + "step": 11550 + }, + { + "epoch": 1.39, + "grad_norm": 0.2208326905965805, + "learning_rate": 0.00018859253649613545, + "loss": 0.7727, + "step": 11555 + }, + { + "epoch": 1.39, + "grad_norm": 0.21380893886089325, + "learning_rate": 0.00018849308752063017, + "loss": 0.829, + "step": 11560 + }, + { + "epoch": 1.39, + "grad_norm": 0.25803694128990173, + "learning_rate": 0.00018839362042899962, + "loss": 0.8363, + "step": 11565 + }, + { + "epoch": 1.39, + "grad_norm": 0.28940629959106445, + "learning_rate": 0.0001882941352680562, + "loss": 0.9299, + "step": 11570 + }, + { + "epoch": 1.39, + "grad_norm": 0.2648991346359253, + "learning_rate": 0.00018819463208462112, + "loss": 0.8416, + "step": 11575 + }, + { + "epoch": 1.4, + "grad_norm": 0.22129850089550018, + "learning_rate": 0.00018809511092552372, + "loss": 0.8448, + "step": 11580 + }, + { + "epoch": 1.4, + "grad_norm": 0.26116523146629333, + "learning_rate": 0.00018799557183760206, + "loss": 0.9321, + "step": 11585 + }, + { + "epoch": 1.4, + "grad_norm": 0.2547585964202881, + "learning_rate": 0.00018789601486770245, + "loss": 0.8947, + "step": 11590 + }, + { + "epoch": 1.4, + "grad_norm": 0.28379014134407043, + "learning_rate": 0.0001877964400626798, + "loss": 0.8113, + "step": 11595 + }, + { + "epoch": 1.4, + "grad_norm": 0.21314746141433716, + "learning_rate": 0.00018769684746939723, + "loss": 0.89, + "step": 11600 + }, + { + "epoch": 1.4, + "grad_norm": 0.2194174975156784, + "learning_rate": 0.00018759723713472642, + "loss": 0.8691, + "step": 11605 + }, + { + "epoch": 1.4, + "grad_norm": 0.2298017144203186, + "learning_rate": 0.00018749760910554715, + "loss": 0.8259, + "step": 11610 + }, + { + "epoch": 1.4, + "grad_norm": 0.2847272455692291, + "learning_rate": 0.0001873979634287479, + "loss": 0.8773, + "step": 11615 + }, + { + "epoch": 1.4, + "grad_norm": 0.23975825309753418, + "learning_rate": 0.00018729830015122508, + "loss": 0.9002, + "step": 11620 + }, + { + "epoch": 1.4, + "grad_norm": 0.22761796414852142, + "learning_rate": 0.00018719861931988354, + "loss": 0.8456, + "step": 11625 + }, + { + "epoch": 1.4, + "grad_norm": 0.24914436042308807, + "learning_rate": 0.0001870989209816364, + "loss": 0.8834, + "step": 11630 + }, + { + "epoch": 1.4, + "grad_norm": 0.2555813491344452, + "learning_rate": 0.0001869992051834051, + "loss": 0.7416, + "step": 11635 + }, + { + "epoch": 1.4, + "grad_norm": 0.2689943015575409, + "learning_rate": 0.00018689947197211908, + "loss": 0.85, + "step": 11640 + }, + { + "epoch": 1.4, + "grad_norm": 0.21290338039398193, + "learning_rate": 0.00018679972139471615, + "loss": 0.9248, + "step": 11645 + }, + { + "epoch": 1.4, + "grad_norm": 0.21777038276195526, + "learning_rate": 0.00018669995349814228, + "loss": 0.9654, + "step": 11650 + }, + { + "epoch": 1.4, + "grad_norm": 0.2189740687608719, + "learning_rate": 0.0001866001683293515, + "loss": 0.8877, + "step": 11655 + }, + { + "epoch": 1.4, + "grad_norm": 0.2227373570203781, + "learning_rate": 0.00018650036593530607, + "loss": 0.9332, + "step": 11660 + }, + { + "epoch": 1.41, + "grad_norm": 0.2457219511270523, + "learning_rate": 0.00018640054636297632, + "loss": 0.7944, + "step": 11665 + }, + { + "epoch": 1.41, + "grad_norm": 0.2539142072200775, + "learning_rate": 0.00018630070965934057, + "loss": 0.7875, + "step": 11670 + }, + { + "epoch": 1.41, + "grad_norm": 0.23336252570152283, + "learning_rate": 0.0001862008558713854, + "loss": 0.7474, + "step": 11675 + }, + { + "epoch": 1.41, + "grad_norm": 0.33268702030181885, + "learning_rate": 0.00018610098504610523, + "loss": 0.944, + "step": 11680 + }, + { + "epoch": 1.41, + "grad_norm": 0.23532938957214355, + "learning_rate": 0.0001860010972305026, + "loss": 0.998, + "step": 11685 + }, + { + "epoch": 1.41, + "grad_norm": 0.24364040791988373, + "learning_rate": 0.0001859011924715881, + "loss": 0.8007, + "step": 11690 + }, + { + "epoch": 1.41, + "grad_norm": 0.26851868629455566, + "learning_rate": 0.00018580127081638018, + "loss": 0.8973, + "step": 11695 + }, + { + "epoch": 1.41, + "grad_norm": 0.27191677689552307, + "learning_rate": 0.00018570133231190524, + "loss": 0.9319, + "step": 11700 + }, + { + "epoch": 1.41, + "grad_norm": 0.24935273826122284, + "learning_rate": 0.00018560137700519776, + "loss": 0.8381, + "step": 11705 + }, + { + "epoch": 1.41, + "grad_norm": 0.19971820712089539, + "learning_rate": 0.00018550140494329995, + "loss": 0.9152, + "step": 11710 + }, + { + "epoch": 1.41, + "grad_norm": 0.2567872703075409, + "learning_rate": 0.000185401416173262, + "loss": 0.8468, + "step": 11715 + }, + { + "epoch": 1.41, + "grad_norm": 0.2599542438983917, + "learning_rate": 0.00018530141074214195, + "loss": 0.8679, + "step": 11720 + }, + { + "epoch": 1.41, + "grad_norm": 0.2495017796754837, + "learning_rate": 0.00018520138869700573, + "loss": 0.8317, + "step": 11725 + }, + { + "epoch": 1.41, + "grad_norm": 0.25889188051223755, + "learning_rate": 0.00018510135008492696, + "loss": 0.9075, + "step": 11730 + }, + { + "epoch": 1.41, + "grad_norm": 0.24831044673919678, + "learning_rate": 0.00018500129495298718, + "loss": 0.8721, + "step": 11735 + }, + { + "epoch": 1.41, + "grad_norm": 0.23168979585170746, + "learning_rate": 0.0001849012233482756, + "loss": 0.9592, + "step": 11740 + }, + { + "epoch": 1.42, + "grad_norm": 0.27312368154525757, + "learning_rate": 0.0001848011353178893, + "loss": 0.8826, + "step": 11745 + }, + { + "epoch": 1.42, + "grad_norm": 0.23228977620601654, + "learning_rate": 0.00018470103090893297, + "loss": 0.8053, + "step": 11750 + }, + { + "epoch": 1.42, + "grad_norm": 0.23857393860816956, + "learning_rate": 0.00018460091016851915, + "loss": 0.8638, + "step": 11755 + }, + { + "epoch": 1.42, + "grad_norm": 0.24902932345867157, + "learning_rate": 0.00018450077314376793, + "loss": 0.8719, + "step": 11760 + }, + { + "epoch": 1.42, + "grad_norm": 0.29293930530548096, + "learning_rate": 0.0001844006198818072, + "loss": 0.9337, + "step": 11765 + }, + { + "epoch": 1.42, + "grad_norm": 0.2380332350730896, + "learning_rate": 0.00018430045042977224, + "loss": 0.8136, + "step": 11770 + }, + { + "epoch": 1.42, + "grad_norm": 0.2724967896938324, + "learning_rate": 0.0001842002648348063, + "loss": 0.9473, + "step": 11775 + }, + { + "epoch": 1.42, + "grad_norm": 0.24302633106708527, + "learning_rate": 0.00018410006314405992, + "loss": 0.7744, + "step": 11780 + }, + { + "epoch": 1.42, + "grad_norm": 0.242751345038414, + "learning_rate": 0.0001839998454046914, + "loss": 0.8238, + "step": 11785 + }, + { + "epoch": 1.42, + "grad_norm": 0.23499782383441925, + "learning_rate": 0.00018389961166386657, + "loss": 0.8673, + "step": 11790 + }, + { + "epoch": 1.42, + "grad_norm": 0.22136856615543365, + "learning_rate": 0.00018379936196875868, + "loss": 0.9232, + "step": 11795 + }, + { + "epoch": 1.42, + "grad_norm": 0.22664988040924072, + "learning_rate": 0.00018369909636654867, + "loss": 0.8296, + "step": 11800 + }, + { + "epoch": 1.42, + "grad_norm": 0.2627767026424408, + "learning_rate": 0.00018359881490442471, + "loss": 0.8778, + "step": 11805 + }, + { + "epoch": 1.42, + "grad_norm": 0.22755451500415802, + "learning_rate": 0.00018349851762958274, + "loss": 0.949, + "step": 11810 + }, + { + "epoch": 1.42, + "grad_norm": 0.2317892462015152, + "learning_rate": 0.00018339820458922589, + "loss": 0.8828, + "step": 11815 + }, + { + "epoch": 1.42, + "grad_norm": 0.23136401176452637, + "learning_rate": 0.00018329787583056486, + "loss": 0.7622, + "step": 11820 + }, + { + "epoch": 1.42, + "grad_norm": 0.24994751811027527, + "learning_rate": 0.00018319753140081765, + "loss": 0.8811, + "step": 11825 + }, + { + "epoch": 1.43, + "grad_norm": 0.2566693425178528, + "learning_rate": 0.00018309717134720974, + "loss": 0.9607, + "step": 11830 + }, + { + "epoch": 1.43, + "grad_norm": 0.2577987313270569, + "learning_rate": 0.00018299679571697383, + "loss": 0.8902, + "step": 11835 + }, + { + "epoch": 1.43, + "grad_norm": 0.2891867160797119, + "learning_rate": 0.00018289640455735012, + "loss": 0.8107, + "step": 11840 + }, + { + "epoch": 1.43, + "grad_norm": 0.24823874235153198, + "learning_rate": 0.0001827959979155859, + "loss": 0.8493, + "step": 11845 + }, + { + "epoch": 1.43, + "grad_norm": 0.23691007494926453, + "learning_rate": 0.00018269557583893602, + "loss": 0.8704, + "step": 11850 + }, + { + "epoch": 1.43, + "grad_norm": 0.2653874158859253, + "learning_rate": 0.00018259513837466228, + "loss": 0.8665, + "step": 11855 + }, + { + "epoch": 1.43, + "grad_norm": 0.3187112510204315, + "learning_rate": 0.00018249468557003404, + "loss": 0.7894, + "step": 11860 + }, + { + "epoch": 1.43, + "grad_norm": 0.2333473116159439, + "learning_rate": 0.00018239421747232758, + "loss": 0.7926, + "step": 11865 + }, + { + "epoch": 1.43, + "grad_norm": 0.2679731547832489, + "learning_rate": 0.0001822937341288267, + "loss": 0.848, + "step": 11870 + }, + { + "epoch": 1.43, + "grad_norm": 0.2512493431568146, + "learning_rate": 0.00018219323558682203, + "loss": 0.9605, + "step": 11875 + }, + { + "epoch": 1.43, + "grad_norm": 0.241338312625885, + "learning_rate": 0.0001820927218936116, + "loss": 0.8209, + "step": 11880 + }, + { + "epoch": 1.43, + "grad_norm": 0.2403215616941452, + "learning_rate": 0.0001819921930965005, + "loss": 0.808, + "step": 11885 + }, + { + "epoch": 1.43, + "grad_norm": 0.24776634573936462, + "learning_rate": 0.00018189164924280088, + "loss": 0.8508, + "step": 11890 + }, + { + "epoch": 1.43, + "grad_norm": 0.23758898675441742, + "learning_rate": 0.00018179109037983203, + "loss": 0.9911, + "step": 11895 + }, + { + "epoch": 1.43, + "grad_norm": 0.24971432983875275, + "learning_rate": 0.00018169051655492034, + "loss": 0.8894, + "step": 11900 + }, + { + "epoch": 1.43, + "grad_norm": 0.22912085056304932, + "learning_rate": 0.0001815899278153991, + "loss": 0.8912, + "step": 11905 + }, + { + "epoch": 1.44, + "grad_norm": 0.271433562040329, + "learning_rate": 0.0001814893242086088, + "loss": 0.8364, + "step": 11910 + }, + { + "epoch": 1.44, + "grad_norm": 0.2268996387720108, + "learning_rate": 0.00018138870578189676, + "loss": 0.8419, + "step": 11915 + }, + { + "epoch": 1.44, + "grad_norm": 0.2744608521461487, + "learning_rate": 0.0001812880725826174, + "loss": 0.8018, + "step": 11920 + }, + { + "epoch": 1.44, + "grad_norm": 0.2408338189125061, + "learning_rate": 0.00018118742465813206, + "loss": 0.84, + "step": 11925 + }, + { + "epoch": 1.44, + "grad_norm": 0.27505186200141907, + "learning_rate": 0.00018108676205580895, + "loss": 0.8555, + "step": 11930 + }, + { + "epoch": 1.44, + "grad_norm": 0.23788927495479584, + "learning_rate": 0.00018098608482302328, + "loss": 0.8613, + "step": 11935 + }, + { + "epoch": 1.44, + "grad_norm": 0.2505749762058258, + "learning_rate": 0.00018088539300715705, + "loss": 0.7991, + "step": 11940 + }, + { + "epoch": 1.44, + "grad_norm": 0.22999998927116394, + "learning_rate": 0.00018078468665559924, + "loss": 0.7935, + "step": 11945 + }, + { + "epoch": 1.44, + "grad_norm": 0.2383863478899002, + "learning_rate": 0.00018068396581574553, + "loss": 0.878, + "step": 11950 + }, + { + "epoch": 1.44, + "grad_norm": 0.22946636378765106, + "learning_rate": 0.00018058323053499854, + "loss": 0.8047, + "step": 11955 + }, + { + "epoch": 1.44, + "grad_norm": 0.23430262506008148, + "learning_rate": 0.0001804824808607676, + "loss": 0.8075, + "step": 11960 + }, + { + "epoch": 1.44, + "grad_norm": 0.2360125333070755, + "learning_rate": 0.0001803817168404689, + "loss": 0.8581, + "step": 11965 + }, + { + "epoch": 1.44, + "grad_norm": 0.24007384479045868, + "learning_rate": 0.00018028093852152528, + "loss": 0.8953, + "step": 11970 + }, + { + "epoch": 1.44, + "grad_norm": 0.2502374053001404, + "learning_rate": 0.00018018014595136644, + "loss": 0.8884, + "step": 11975 + }, + { + "epoch": 1.44, + "grad_norm": 0.24452511966228485, + "learning_rate": 0.00018007933917742864, + "loss": 1.0623, + "step": 11980 + }, + { + "epoch": 1.44, + "grad_norm": 0.23154529929161072, + "learning_rate": 0.0001799785182471549, + "loss": 0.8695, + "step": 11985 + }, + { + "epoch": 1.44, + "grad_norm": 0.2401302307844162, + "learning_rate": 0.00017987768320799495, + "loss": 0.9001, + "step": 11990 + }, + { + "epoch": 1.45, + "grad_norm": 0.37644556164741516, + "learning_rate": 0.00017977683410740503, + "loss": 0.8526, + "step": 11995 + }, + { + "epoch": 1.45, + "grad_norm": 0.2637287974357605, + "learning_rate": 0.00017967597099284813, + "loss": 0.9424, + "step": 12000 + }, + { + "epoch": 1.45, + "grad_norm": 0.23815138638019562, + "learning_rate": 0.0001795750939117938, + "loss": 0.8184, + "step": 12005 + }, + { + "epoch": 1.45, + "grad_norm": 0.2669789493083954, + "learning_rate": 0.00017947420291171813, + "loss": 0.9174, + "step": 12010 + }, + { + "epoch": 1.45, + "grad_norm": 0.3020137548446655, + "learning_rate": 0.00017937329804010372, + "loss": 0.9216, + "step": 12015 + }, + { + "epoch": 1.45, + "grad_norm": 0.22852839529514313, + "learning_rate": 0.00017927237934443983, + "loss": 0.8869, + "step": 12020 + }, + { + "epoch": 1.45, + "grad_norm": 0.27569183707237244, + "learning_rate": 0.00017917144687222206, + "loss": 0.8327, + "step": 12025 + }, + { + "epoch": 1.45, + "grad_norm": 0.2452722042798996, + "learning_rate": 0.0001790705006709527, + "loss": 0.9104, + "step": 12030 + }, + { + "epoch": 1.45, + "grad_norm": 0.24271848797798157, + "learning_rate": 0.00017896954078814028, + "loss": 0.9341, + "step": 12035 + }, + { + "epoch": 1.45, + "grad_norm": 0.21451996266841888, + "learning_rate": 0.00017886856727129994, + "loss": 0.8143, + "step": 12040 + }, + { + "epoch": 1.45, + "grad_norm": 0.23725618422031403, + "learning_rate": 0.00017876758016795313, + "loss": 0.8801, + "step": 12045 + }, + { + "epoch": 1.45, + "grad_norm": 0.24408486485481262, + "learning_rate": 0.00017866657952562778, + "loss": 0.8149, + "step": 12050 + }, + { + "epoch": 1.45, + "grad_norm": 0.22583889961242676, + "learning_rate": 0.0001785655653918581, + "loss": 0.8196, + "step": 12055 + }, + { + "epoch": 1.45, + "grad_norm": 0.23950889706611633, + "learning_rate": 0.00017846453781418474, + "loss": 0.7369, + "step": 12060 + }, + { + "epoch": 1.45, + "grad_norm": 0.24430805444717407, + "learning_rate": 0.00017836349684015456, + "loss": 0.9414, + "step": 12065 + }, + { + "epoch": 1.45, + "grad_norm": 0.22092491388320923, + "learning_rate": 0.00017826244251732088, + "loss": 0.9288, + "step": 12070 + }, + { + "epoch": 1.45, + "grad_norm": 0.23350967466831207, + "learning_rate": 0.00017816137489324314, + "loss": 0.8764, + "step": 12075 + }, + { + "epoch": 1.46, + "grad_norm": 0.25949931144714355, + "learning_rate": 0.0001780602940154872, + "loss": 0.7207, + "step": 12080 + }, + { + "epoch": 1.46, + "grad_norm": 0.23288941383361816, + "learning_rate": 0.00017795919993162504, + "loss": 0.8304, + "step": 12085 + }, + { + "epoch": 1.46, + "grad_norm": 0.261445015668869, + "learning_rate": 0.00017785809268923493, + "loss": 0.8202, + "step": 12090 + }, + { + "epoch": 1.46, + "grad_norm": 0.26035887002944946, + "learning_rate": 0.0001777569723359012, + "loss": 0.8119, + "step": 12095 + }, + { + "epoch": 1.46, + "grad_norm": 0.24832282960414886, + "learning_rate": 0.00017765583891921454, + "loss": 0.8939, + "step": 12100 + }, + { + "epoch": 1.46, + "grad_norm": 0.24832552671432495, + "learning_rate": 0.00017755469248677163, + "loss": 0.8043, + "step": 12105 + }, + { + "epoch": 1.46, + "grad_norm": 0.24635636806488037, + "learning_rate": 0.0001774535330861754, + "loss": 0.9287, + "step": 12110 + }, + { + "epoch": 1.46, + "grad_norm": 0.26285749673843384, + "learning_rate": 0.0001773523607650348, + "loss": 0.903, + "step": 12115 + }, + { + "epoch": 1.46, + "grad_norm": 0.23266449570655823, + "learning_rate": 0.0001772511755709649, + "loss": 0.8647, + "step": 12120 + }, + { + "epoch": 1.46, + "grad_norm": 0.23919335007667542, + "learning_rate": 0.00017714997755158675, + "loss": 0.8978, + "step": 12125 + }, + { + "epoch": 1.46, + "grad_norm": 0.247425839304924, + "learning_rate": 0.0001770487667545276, + "loss": 0.8336, + "step": 12130 + }, + { + "epoch": 1.46, + "grad_norm": 0.28573018312454224, + "learning_rate": 0.00017694754322742048, + "loss": 0.9016, + "step": 12135 + }, + { + "epoch": 1.46, + "grad_norm": 0.23751410841941833, + "learning_rate": 0.00017684630701790468, + "loss": 0.8116, + "step": 12140 + }, + { + "epoch": 1.46, + "grad_norm": 0.2392253428697586, + "learning_rate": 0.0001767450581736252, + "loss": 0.9136, + "step": 12145 + }, + { + "epoch": 1.46, + "grad_norm": 0.23572461307048798, + "learning_rate": 0.00017664379674223318, + "loss": 0.8182, + "step": 12150 + }, + { + "epoch": 1.46, + "grad_norm": 0.2536904513835907, + "learning_rate": 0.0001765425227713856, + "loss": 0.8735, + "step": 12155 + }, + { + "epoch": 1.47, + "grad_norm": 0.2494330257177353, + "learning_rate": 0.00017644123630874539, + "loss": 0.888, + "step": 12160 + }, + { + "epoch": 1.47, + "grad_norm": 0.2419360876083374, + "learning_rate": 0.00017633993740198128, + "loss": 0.8367, + "step": 12165 + }, + { + "epoch": 1.47, + "grad_norm": 0.22948452830314636, + "learning_rate": 0.0001762386260987679, + "loss": 0.903, + "step": 12170 + }, + { + "epoch": 1.47, + "grad_norm": 0.2524193227291107, + "learning_rate": 0.0001761373024467857, + "loss": 0.8555, + "step": 12175 + }, + { + "epoch": 1.47, + "grad_norm": 0.2861385941505432, + "learning_rate": 0.000176035966493721, + "loss": 0.9068, + "step": 12180 + }, + { + "epoch": 1.47, + "grad_norm": 0.2484591007232666, + "learning_rate": 0.0001759346182872658, + "loss": 0.9578, + "step": 12185 + }, + { + "epoch": 1.47, + "grad_norm": 0.24529612064361572, + "learning_rate": 0.000175833257875118, + "loss": 0.8648, + "step": 12190 + }, + { + "epoch": 1.47, + "grad_norm": 0.23602719604969025, + "learning_rate": 0.00017573188530498117, + "loss": 0.9002, + "step": 12195 + }, + { + "epoch": 1.47, + "grad_norm": 0.24331901967525482, + "learning_rate": 0.0001756305006245646, + "loss": 0.8624, + "step": 12200 + }, + { + "epoch": 1.47, + "grad_norm": 0.2478303611278534, + "learning_rate": 0.0001755291038815832, + "loss": 0.8435, + "step": 12205 + }, + { + "epoch": 1.47, + "grad_norm": 0.26991724967956543, + "learning_rate": 0.0001754276951237578, + "loss": 0.8139, + "step": 12210 + }, + { + "epoch": 1.47, + "grad_norm": 0.2541549503803253, + "learning_rate": 0.00017532627439881458, + "loss": 0.85, + "step": 12215 + }, + { + "epoch": 1.47, + "grad_norm": 0.24864061176776886, + "learning_rate": 0.0001752248417544856, + "loss": 0.8312, + "step": 12220 + }, + { + "epoch": 1.47, + "grad_norm": 0.26080724596977234, + "learning_rate": 0.00017512339723850835, + "loss": 0.8774, + "step": 12225 + }, + { + "epoch": 1.47, + "grad_norm": 0.27303215861320496, + "learning_rate": 0.00017502194089862608, + "loss": 0.8132, + "step": 12230 + }, + { + "epoch": 1.47, + "grad_norm": 0.24449607729911804, + "learning_rate": 0.00017492047278258748, + "loss": 0.9219, + "step": 12235 + }, + { + "epoch": 1.47, + "grad_norm": 0.24350832402706146, + "learning_rate": 0.0001748189929381468, + "loss": 0.814, + "step": 12240 + }, + { + "epoch": 1.48, + "grad_norm": 0.22966676950454712, + "learning_rate": 0.0001747175014130638, + "loss": 0.8465, + "step": 12245 + }, + { + "epoch": 1.48, + "grad_norm": 0.2714262008666992, + "learning_rate": 0.00017461599825510386, + "loss": 0.885, + "step": 12250 + }, + { + "epoch": 1.48, + "grad_norm": 0.2457001507282257, + "learning_rate": 0.00017451448351203758, + "loss": 0.8376, + "step": 12255 + }, + { + "epoch": 1.48, + "grad_norm": 0.22185452282428741, + "learning_rate": 0.00017441295723164132, + "loss": 0.8311, + "step": 12260 + }, + { + "epoch": 1.48, + "grad_norm": 0.2851621210575104, + "learning_rate": 0.00017431141946169662, + "loss": 0.7891, + "step": 12265 + }, + { + "epoch": 1.48, + "grad_norm": 0.2442905604839325, + "learning_rate": 0.00017420987024999065, + "loss": 0.9465, + "step": 12270 + }, + { + "epoch": 1.48, + "grad_norm": 0.26960885524749756, + "learning_rate": 0.00017410830964431566, + "loss": 0.8436, + "step": 12275 + }, + { + "epoch": 1.48, + "grad_norm": 0.23894372582435608, + "learning_rate": 0.0001740067376924696, + "loss": 0.8469, + "step": 12280 + }, + { + "epoch": 1.48, + "grad_norm": 0.295200914144516, + "learning_rate": 0.00017390515444225548, + "loss": 0.907, + "step": 12285 + }, + { + "epoch": 1.48, + "grad_norm": 0.24998031556606293, + "learning_rate": 0.00017380355994148187, + "loss": 0.8642, + "step": 12290 + }, + { + "epoch": 1.48, + "grad_norm": 0.25072336196899414, + "learning_rate": 0.0001737019542379624, + "loss": 0.8299, + "step": 12295 + }, + { + "epoch": 1.48, + "grad_norm": 0.23040717840194702, + "learning_rate": 0.00017360033737951622, + "loss": 0.8464, + "step": 12300 + }, + { + "epoch": 1.48, + "grad_norm": 0.21226970851421356, + "learning_rate": 0.0001734987094139675, + "loss": 0.8759, + "step": 12305 + }, + { + "epoch": 1.48, + "grad_norm": 0.26856541633605957, + "learning_rate": 0.0001733970703891457, + "loss": 0.8438, + "step": 12310 + }, + { + "epoch": 1.48, + "grad_norm": 0.2336094230413437, + "learning_rate": 0.00017329542035288565, + "loss": 0.8019, + "step": 12315 + }, + { + "epoch": 1.48, + "grad_norm": 0.2525746524333954, + "learning_rate": 0.00017319375935302713, + "loss": 0.9302, + "step": 12320 + }, + { + "epoch": 1.49, + "grad_norm": 0.21425200998783112, + "learning_rate": 0.00017309208743741526, + "loss": 0.835, + "step": 12325 + }, + { + "epoch": 1.49, + "grad_norm": 0.2605019509792328, + "learning_rate": 0.00017299040465390013, + "loss": 0.9248, + "step": 12330 + }, + { + "epoch": 1.49, + "grad_norm": 0.22529718279838562, + "learning_rate": 0.00017288871105033713, + "loss": 0.8703, + "step": 12335 + }, + { + "epoch": 1.49, + "grad_norm": 0.24195663630962372, + "learning_rate": 0.00017278700667458657, + "loss": 0.836, + "step": 12340 + }, + { + "epoch": 1.49, + "grad_norm": 0.25822916626930237, + "learning_rate": 0.00017268529157451394, + "loss": 0.8901, + "step": 12345 + }, + { + "epoch": 1.49, + "grad_norm": 0.2364194244146347, + "learning_rate": 0.00017258356579798973, + "loss": 0.8438, + "step": 12350 + }, + { + "epoch": 1.49, + "grad_norm": 0.26082637906074524, + "learning_rate": 0.0001724818293928895, + "loss": 0.9269, + "step": 12355 + }, + { + "epoch": 1.49, + "grad_norm": 0.2297302931547165, + "learning_rate": 0.00017238008240709374, + "loss": 0.9076, + "step": 12360 + }, + { + "epoch": 1.49, + "grad_norm": 0.2835592031478882, + "learning_rate": 0.00017227832488848799, + "loss": 0.8485, + "step": 12365 + }, + { + "epoch": 1.49, + "grad_norm": 0.2650296986103058, + "learning_rate": 0.0001721765568849627, + "loss": 0.9593, + "step": 12370 + }, + { + "epoch": 1.49, + "grad_norm": 0.22768686711788177, + "learning_rate": 0.00017207477844441335, + "loss": 0.8921, + "step": 12375 + }, + { + "epoch": 1.49, + "grad_norm": 0.22558462619781494, + "learning_rate": 0.00017197298961474006, + "loss": 0.8421, + "step": 12380 + }, + { + "epoch": 1.49, + "grad_norm": 0.224419966340065, + "learning_rate": 0.00017187119044384823, + "loss": 0.8392, + "step": 12385 + }, + { + "epoch": 1.49, + "grad_norm": 0.2719701826572418, + "learning_rate": 0.00017176938097964784, + "loss": 0.9365, + "step": 12390 + }, + { + "epoch": 1.49, + "grad_norm": 0.24385976791381836, + "learning_rate": 0.00017166756127005384, + "loss": 0.8264, + "step": 12395 + }, + { + "epoch": 1.49, + "grad_norm": 0.23817993700504303, + "learning_rate": 0.00017156573136298592, + "loss": 0.9154, + "step": 12400 + }, + { + "epoch": 1.49, + "grad_norm": 0.25745269656181335, + "learning_rate": 0.00017146389130636864, + "loss": 0.8911, + "step": 12405 + }, + { + "epoch": 1.5, + "grad_norm": 0.29501572251319885, + "learning_rate": 0.0001713620411481314, + "loss": 0.8841, + "step": 12410 + }, + { + "epoch": 1.5, + "grad_norm": 0.263081431388855, + "learning_rate": 0.00017126018093620808, + "loss": 0.8334, + "step": 12415 + }, + { + "epoch": 1.5, + "grad_norm": 0.22736036777496338, + "learning_rate": 0.0001711583107185376, + "loss": 0.88, + "step": 12420 + }, + { + "epoch": 1.5, + "grad_norm": 0.23410391807556152, + "learning_rate": 0.00017105643054306352, + "loss": 0.8257, + "step": 12425 + }, + { + "epoch": 1.5, + "grad_norm": 0.26745110750198364, + "learning_rate": 0.00017095454045773387, + "loss": 0.9252, + "step": 12430 + }, + { + "epoch": 1.5, + "grad_norm": 0.24385030567646027, + "learning_rate": 0.00017085264051050166, + "loss": 0.8395, + "step": 12435 + }, + { + "epoch": 1.5, + "grad_norm": 0.2342156320810318, + "learning_rate": 0.0001707507307493243, + "loss": 0.8266, + "step": 12440 + }, + { + "epoch": 1.5, + "grad_norm": 0.2477523386478424, + "learning_rate": 0.00017064881122216398, + "loss": 0.8427, + "step": 12445 + }, + { + "epoch": 1.5, + "grad_norm": 0.252871572971344, + "learning_rate": 0.00017054688197698736, + "loss": 0.8368, + "step": 12450 + }, + { + "epoch": 1.5, + "grad_norm": 0.22697529196739197, + "learning_rate": 0.00017044494306176576, + "loss": 0.7964, + "step": 12455 + }, + { + "epoch": 1.5, + "grad_norm": 0.27553001046180725, + "learning_rate": 0.00017034299452447493, + "loss": 0.8546, + "step": 12460 + }, + { + "epoch": 1.5, + "grad_norm": 0.26646605134010315, + "learning_rate": 0.00017024103641309537, + "loss": 0.8383, + "step": 12465 + }, + { + "epoch": 1.5, + "grad_norm": 0.255669504404068, + "learning_rate": 0.00017013906877561187, + "loss": 0.7589, + "step": 12470 + }, + { + "epoch": 1.5, + "grad_norm": 0.25287455320358276, + "learning_rate": 0.0001700370916600138, + "loss": 0.8203, + "step": 12475 + }, + { + "epoch": 1.5, + "grad_norm": 0.2946148216724396, + "learning_rate": 0.000169935105114295, + "loss": 0.9016, + "step": 12480 + }, + { + "epoch": 1.5, + "grad_norm": 0.2425968050956726, + "learning_rate": 0.0001698331091864537, + "loss": 0.786, + "step": 12485 + }, + { + "epoch": 1.5, + "grad_norm": 0.24818405508995056, + "learning_rate": 0.00016973110392449255, + "loss": 0.8466, + "step": 12490 + }, + { + "epoch": 1.51, + "grad_norm": 0.2572382688522339, + "learning_rate": 0.00016962908937641873, + "loss": 0.8854, + "step": 12495 + }, + { + "epoch": 1.51, + "grad_norm": 0.22933310270309448, + "learning_rate": 0.0001695270655902435, + "loss": 0.8843, + "step": 12500 + }, + { + "epoch": 1.51, + "grad_norm": 0.2477482110261917, + "learning_rate": 0.00016942503261398276, + "loss": 0.8412, + "step": 12505 + }, + { + "epoch": 1.51, + "grad_norm": 0.2406783103942871, + "learning_rate": 0.00016932299049565657, + "loss": 0.7979, + "step": 12510 + }, + { + "epoch": 1.51, + "grad_norm": 0.2689076364040375, + "learning_rate": 0.00016922093928328937, + "loss": 0.8078, + "step": 12515 + }, + { + "epoch": 1.51, + "grad_norm": 0.2702324092388153, + "learning_rate": 0.00016911887902490986, + "loss": 0.8463, + "step": 12520 + }, + { + "epoch": 1.51, + "grad_norm": 0.27444717288017273, + "learning_rate": 0.00016901680976855096, + "loss": 0.8428, + "step": 12525 + }, + { + "epoch": 1.51, + "grad_norm": 0.24861744046211243, + "learning_rate": 0.00016891473156224976, + "loss": 0.9324, + "step": 12530 + }, + { + "epoch": 1.51, + "grad_norm": 0.2355334758758545, + "learning_rate": 0.00016881264445404786, + "loss": 0.8497, + "step": 12535 + }, + { + "epoch": 1.51, + "grad_norm": 0.24266460537910461, + "learning_rate": 0.00016871054849199068, + "loss": 0.8034, + "step": 12540 + }, + { + "epoch": 1.51, + "grad_norm": 0.24508504569530487, + "learning_rate": 0.00016860844372412802, + "loss": 0.8216, + "step": 12545 + }, + { + "epoch": 1.51, + "grad_norm": 0.23630234599113464, + "learning_rate": 0.00016850633019851378, + "loss": 0.8422, + "step": 12550 + }, + { + "epoch": 1.51, + "grad_norm": 0.2652592658996582, + "learning_rate": 0.00016840420796320602, + "loss": 0.9161, + "step": 12555 + }, + { + "epoch": 1.51, + "grad_norm": 0.21832112967967987, + "learning_rate": 0.00016830207706626675, + "loss": 0.7847, + "step": 12560 + }, + { + "epoch": 1.51, + "grad_norm": 0.2525995671749115, + "learning_rate": 0.00016819993755576225, + "loss": 0.8049, + "step": 12565 + }, + { + "epoch": 1.51, + "grad_norm": 0.2583009898662567, + "learning_rate": 0.00016809778947976273, + "loss": 0.8428, + "step": 12570 + }, + { + "epoch": 1.52, + "grad_norm": 0.2520254850387573, + "learning_rate": 0.00016799563288634247, + "loss": 0.8095, + "step": 12575 + }, + { + "epoch": 1.52, + "grad_norm": 0.2578343152999878, + "learning_rate": 0.00016789346782357975, + "loss": 0.7919, + "step": 12580 + }, + { + "epoch": 1.52, + "grad_norm": 0.26357656717300415, + "learning_rate": 0.00016779129433955686, + "loss": 0.8591, + "step": 12585 + }, + { + "epoch": 1.52, + "grad_norm": 0.256157249212265, + "learning_rate": 0.00016768911248236001, + "loss": 0.8812, + "step": 12590 + }, + { + "epoch": 1.52, + "grad_norm": 0.25076690316200256, + "learning_rate": 0.00016758692230007939, + "loss": 0.8109, + "step": 12595 + }, + { + "epoch": 1.52, + "grad_norm": 0.26250481605529785, + "learning_rate": 0.00016748472384080912, + "loss": 0.944, + "step": 12600 + }, + { + "epoch": 1.52, + "grad_norm": 0.25288984179496765, + "learning_rate": 0.0001673825171526471, + "loss": 0.9143, + "step": 12605 + }, + { + "epoch": 1.52, + "grad_norm": 0.26439496874809265, + "learning_rate": 0.0001672803022836953, + "loss": 0.9372, + "step": 12610 + }, + { + "epoch": 1.52, + "grad_norm": 0.23811744153499603, + "learning_rate": 0.00016717807928205936, + "loss": 0.816, + "step": 12615 + }, + { + "epoch": 1.52, + "grad_norm": 0.2801905870437622, + "learning_rate": 0.00016707584819584885, + "loss": 0.7269, + "step": 12620 + }, + { + "epoch": 1.52, + "grad_norm": 0.2476351410150528, + "learning_rate": 0.00016697360907317712, + "loss": 0.7855, + "step": 12625 + }, + { + "epoch": 1.52, + "grad_norm": 0.2882719933986664, + "learning_rate": 0.0001668713619621613, + "loss": 0.9298, + "step": 12630 + }, + { + "epoch": 1.52, + "grad_norm": 0.27787694334983826, + "learning_rate": 0.00016676910691092224, + "loss": 0.8385, + "step": 12635 + }, + { + "epoch": 1.52, + "grad_norm": 0.28809016942977905, + "learning_rate": 0.00016666684396758459, + "loss": 0.7968, + "step": 12640 + }, + { + "epoch": 1.52, + "grad_norm": 0.24893060326576233, + "learning_rate": 0.00016656457318027667, + "loss": 0.8045, + "step": 12645 + }, + { + "epoch": 1.52, + "grad_norm": 0.22097425162792206, + "learning_rate": 0.0001664622945971305, + "loss": 0.8332, + "step": 12650 + }, + { + "epoch": 1.52, + "grad_norm": 0.241326704621315, + "learning_rate": 0.00016636000826628184, + "loss": 0.8432, + "step": 12655 + }, + { + "epoch": 1.53, + "grad_norm": 0.2555682063102722, + "learning_rate": 0.00016625771423586991, + "loss": 0.998, + "step": 12660 + }, + { + "epoch": 1.53, + "grad_norm": 0.2659325897693634, + "learning_rate": 0.00016615541255403786, + "loss": 0.796, + "step": 12665 + }, + { + "epoch": 1.53, + "grad_norm": 0.24846947193145752, + "learning_rate": 0.0001660531032689321, + "loss": 0.8544, + "step": 12670 + }, + { + "epoch": 1.53, + "grad_norm": 0.2357734739780426, + "learning_rate": 0.0001659507864287029, + "loss": 0.8947, + "step": 12675 + }, + { + "epoch": 1.53, + "grad_norm": 0.22256316244602203, + "learning_rate": 0.00016584846208150383, + "loss": 0.7654, + "step": 12680 + }, + { + "epoch": 1.53, + "grad_norm": 0.2255926877260208, + "learning_rate": 0.00016574613027549217, + "loss": 0.9368, + "step": 12685 + }, + { + "epoch": 1.53, + "grad_norm": 0.2199205905199051, + "learning_rate": 0.00016564379105882873, + "loss": 0.8522, + "step": 12690 + }, + { + "epoch": 1.53, + "grad_norm": 0.28501272201538086, + "learning_rate": 0.0001655414444796777, + "loss": 0.9038, + "step": 12695 + }, + { + "epoch": 1.53, + "grad_norm": 0.24840682744979858, + "learning_rate": 0.0001654390905862068, + "loss": 0.8622, + "step": 12700 + }, + { + "epoch": 1.53, + "grad_norm": 0.25936436653137207, + "learning_rate": 0.00016533672942658717, + "loss": 0.7946, + "step": 12705 + }, + { + "epoch": 1.53, + "grad_norm": 0.2659285068511963, + "learning_rate": 0.0001652343610489933, + "loss": 0.8187, + "step": 12710 + }, + { + "epoch": 1.53, + "grad_norm": 0.24160555005073547, + "learning_rate": 0.00016513198550160326, + "loss": 0.7751, + "step": 12715 + }, + { + "epoch": 1.53, + "grad_norm": 0.26907095313072205, + "learning_rate": 0.00016502960283259823, + "loss": 0.807, + "step": 12720 + }, + { + "epoch": 1.53, + "grad_norm": 0.2532917559146881, + "learning_rate": 0.00016492721309016307, + "loss": 0.8671, + "step": 12725 + }, + { + "epoch": 1.53, + "grad_norm": 0.22019241750240326, + "learning_rate": 0.00016482481632248568, + "loss": 0.8272, + "step": 12730 + }, + { + "epoch": 1.53, + "grad_norm": 0.22375644743442535, + "learning_rate": 0.00016472241257775743, + "loss": 0.7704, + "step": 12735 + }, + { + "epoch": 1.54, + "grad_norm": 0.2382332682609558, + "learning_rate": 0.00016462000190417292, + "loss": 0.9716, + "step": 12740 + }, + { + "epoch": 1.54, + "grad_norm": 0.279604971408844, + "learning_rate": 0.00016451758434992997, + "loss": 0.8371, + "step": 12745 + }, + { + "epoch": 1.54, + "grad_norm": 0.2584107220172882, + "learning_rate": 0.00016441515996322973, + "loss": 0.8307, + "step": 12750 + }, + { + "epoch": 1.54, + "grad_norm": 0.24332301318645477, + "learning_rate": 0.00016431272879227648, + "loss": 0.8213, + "step": 12755 + }, + { + "epoch": 1.54, + "grad_norm": 0.2555186450481415, + "learning_rate": 0.00016421029088527775, + "loss": 0.8662, + "step": 12760 + }, + { + "epoch": 1.54, + "grad_norm": 0.21609874069690704, + "learning_rate": 0.00016410784629044422, + "loss": 0.9062, + "step": 12765 + }, + { + "epoch": 1.54, + "grad_norm": 0.23835386335849762, + "learning_rate": 0.00016400539505598974, + "loss": 0.9274, + "step": 12770 + }, + { + "epoch": 1.54, + "grad_norm": 0.24308866262435913, + "learning_rate": 0.00016390293723013124, + "loss": 0.9382, + "step": 12775 + }, + { + "epoch": 1.54, + "grad_norm": 0.2690812647342682, + "learning_rate": 0.00016380047286108874, + "loss": 0.8704, + "step": 12780 + }, + { + "epoch": 1.54, + "grad_norm": 0.24690578877925873, + "learning_rate": 0.00016369800199708546, + "loss": 0.9505, + "step": 12785 + }, + { + "epoch": 1.54, + "grad_norm": 0.27407118678092957, + "learning_rate": 0.00016359552468634748, + "loss": 0.806, + "step": 12790 + }, + { + "epoch": 1.54, + "grad_norm": 0.23908410966396332, + "learning_rate": 0.00016349304097710416, + "loss": 0.8412, + "step": 12795 + }, + { + "epoch": 1.54, + "grad_norm": 0.21888162195682526, + "learning_rate": 0.00016339055091758764, + "loss": 0.91, + "step": 12800 + }, + { + "epoch": 1.54, + "grad_norm": 0.2692570984363556, + "learning_rate": 0.00016328805455603315, + "loss": 0.8517, + "step": 12805 + }, + { + "epoch": 1.54, + "grad_norm": 0.21654078364372253, + "learning_rate": 0.00016318555194067892, + "loss": 0.9092, + "step": 12810 + }, + { + "epoch": 1.54, + "grad_norm": 0.22260205447673798, + "learning_rate": 0.00016308304311976604, + "loss": 0.7538, + "step": 12815 + }, + { + "epoch": 1.54, + "grad_norm": 0.2802688479423523, + "learning_rate": 0.00016298052814153866, + "loss": 0.9275, + "step": 12820 + }, + { + "epoch": 1.55, + "grad_norm": 0.24824702739715576, + "learning_rate": 0.00016287800705424362, + "loss": 0.8563, + "step": 12825 + }, + { + "epoch": 1.55, + "grad_norm": 0.2404843270778656, + "learning_rate": 0.00016277547990613083, + "loss": 0.9009, + "step": 12830 + }, + { + "epoch": 1.55, + "grad_norm": 0.24702101945877075, + "learning_rate": 0.00016267294674545286, + "loss": 0.9061, + "step": 12835 + }, + { + "epoch": 1.55, + "grad_norm": 0.24450421333312988, + "learning_rate": 0.0001625704076204654, + "loss": 0.9004, + "step": 12840 + }, + { + "epoch": 1.55, + "grad_norm": 0.2563953399658203, + "learning_rate": 0.00016246786257942658, + "loss": 0.9063, + "step": 12845 + }, + { + "epoch": 1.55, + "grad_norm": 0.2544556260108948, + "learning_rate": 0.00016236531167059762, + "loss": 0.8174, + "step": 12850 + }, + { + "epoch": 1.55, + "grad_norm": 0.23205184936523438, + "learning_rate": 0.00016226275494224233, + "loss": 0.8916, + "step": 12855 + }, + { + "epoch": 1.55, + "grad_norm": 0.2657812237739563, + "learning_rate": 0.00016216019244262735, + "loss": 0.8149, + "step": 12860 + }, + { + "epoch": 1.55, + "grad_norm": 0.21930214762687683, + "learning_rate": 0.00016205762422002198, + "loss": 0.8223, + "step": 12865 + }, + { + "epoch": 1.55, + "grad_norm": 0.2352408766746521, + "learning_rate": 0.00016195505032269821, + "loss": 0.8929, + "step": 12870 + }, + { + "epoch": 1.55, + "grad_norm": 0.24339045584201813, + "learning_rate": 0.00016185247079893075, + "loss": 0.919, + "step": 12875 + }, + { + "epoch": 1.55, + "grad_norm": 0.27827414870262146, + "learning_rate": 0.00016174988569699696, + "loss": 0.8859, + "step": 12880 + }, + { + "epoch": 1.55, + "grad_norm": 0.2651593089103699, + "learning_rate": 0.0001616472950651767, + "loss": 0.7699, + "step": 12885 + }, + { + "epoch": 1.55, + "grad_norm": 0.24380792677402496, + "learning_rate": 0.00016154469895175266, + "loss": 0.9605, + "step": 12890 + }, + { + "epoch": 1.55, + "grad_norm": 0.2639029920101166, + "learning_rate": 0.00016144209740500982, + "loss": 0.8723, + "step": 12895 + }, + { + "epoch": 1.55, + "grad_norm": 0.2522783577442169, + "learning_rate": 0.000161339490473236, + "loss": 0.8958, + "step": 12900 + }, + { + "epoch": 1.55, + "grad_norm": 0.2770918309688568, + "learning_rate": 0.00016123687820472139, + "loss": 0.797, + "step": 12905 + }, + { + "epoch": 1.56, + "grad_norm": 0.22492796182632446, + "learning_rate": 0.00016113426064775875, + "loss": 0.8496, + "step": 12910 + }, + { + "epoch": 1.56, + "grad_norm": 0.2644830644130707, + "learning_rate": 0.00016103163785064327, + "loss": 0.8458, + "step": 12915 + }, + { + "epoch": 1.56, + "grad_norm": 0.2638452649116516, + "learning_rate": 0.0001609290098616727, + "loss": 0.8408, + "step": 12920 + }, + { + "epoch": 1.56, + "grad_norm": 0.23867619037628174, + "learning_rate": 0.0001608263767291471, + "loss": 0.8522, + "step": 12925 + }, + { + "epoch": 1.56, + "grad_norm": 0.2165214717388153, + "learning_rate": 0.00016072373850136912, + "loss": 0.9328, + "step": 12930 + }, + { + "epoch": 1.56, + "grad_norm": 0.2513158619403839, + "learning_rate": 0.00016062109522664366, + "loss": 0.8523, + "step": 12935 + }, + { + "epoch": 1.56, + "grad_norm": 0.22370438277721405, + "learning_rate": 0.00016051844695327806, + "loss": 0.8542, + "step": 12940 + }, + { + "epoch": 1.56, + "grad_norm": 0.2550356090068817, + "learning_rate": 0.000160415793729582, + "loss": 0.9122, + "step": 12945 + }, + { + "epoch": 1.56, + "grad_norm": 0.23933614790439606, + "learning_rate": 0.00016031313560386758, + "loss": 0.8236, + "step": 12950 + }, + { + "epoch": 1.56, + "grad_norm": 0.2359176129102707, + "learning_rate": 0.000160210472624449, + "loss": 0.8815, + "step": 12955 + }, + { + "epoch": 1.56, + "grad_norm": 0.2673303186893463, + "learning_rate": 0.00016010780483964295, + "loss": 0.8111, + "step": 12960 + }, + { + "epoch": 1.56, + "grad_norm": 0.2548558712005615, + "learning_rate": 0.00016000513229776826, + "loss": 0.919, + "step": 12965 + }, + { + "epoch": 1.56, + "grad_norm": 0.27557799220085144, + "learning_rate": 0.00015990245504714608, + "loss": 0.7909, + "step": 12970 + }, + { + "epoch": 1.56, + "grad_norm": 0.285546213388443, + "learning_rate": 0.00015979977313609965, + "loss": 0.8933, + "step": 12975 + }, + { + "epoch": 1.56, + "grad_norm": 0.2725813090801239, + "learning_rate": 0.00015969708661295456, + "loss": 0.8644, + "step": 12980 + }, + { + "epoch": 1.56, + "grad_norm": 0.23822170495986938, + "learning_rate": 0.0001595943955260385, + "loss": 0.8707, + "step": 12985 + }, + { + "epoch": 1.57, + "grad_norm": 0.2242768406867981, + "learning_rate": 0.0001594916999236813, + "loss": 0.7392, + "step": 12990 + }, + { + "epoch": 1.57, + "grad_norm": 0.21174441277980804, + "learning_rate": 0.00015938899985421486, + "loss": 0.794, + "step": 12995 + }, + { + "epoch": 1.57, + "grad_norm": 0.24703176319599152, + "learning_rate": 0.00015928629536597332, + "loss": 0.8355, + "step": 13000 + }, + { + "epoch": 1.57, + "grad_norm": 0.22385214269161224, + "learning_rate": 0.00015918358650729276, + "loss": 0.763, + "step": 13005 + }, + { + "epoch": 1.57, + "grad_norm": 0.24205328524112701, + "learning_rate": 0.00015908087332651142, + "loss": 0.86, + "step": 13010 + }, + { + "epoch": 1.57, + "grad_norm": 0.256166011095047, + "learning_rate": 0.00015897815587196954, + "loss": 0.8247, + "step": 13015 + }, + { + "epoch": 1.57, + "grad_norm": 0.2543538212776184, + "learning_rate": 0.00015887543419200936, + "loss": 0.9216, + "step": 13020 + }, + { + "epoch": 1.57, + "grad_norm": 0.2570268511772156, + "learning_rate": 0.0001587727083349751, + "loss": 0.8575, + "step": 13025 + }, + { + "epoch": 1.57, + "grad_norm": 0.29525551199913025, + "learning_rate": 0.000158669978349213, + "loss": 0.9212, + "step": 13030 + }, + { + "epoch": 1.57, + "grad_norm": 0.23510275781154633, + "learning_rate": 0.0001585672442830711, + "loss": 0.8684, + "step": 13035 + }, + { + "epoch": 1.57, + "grad_norm": 0.23046576976776123, + "learning_rate": 0.00015846450618489958, + "loss": 0.7974, + "step": 13040 + }, + { + "epoch": 1.57, + "grad_norm": 0.2453136444091797, + "learning_rate": 0.0001583617641030503, + "loss": 0.832, + "step": 13045 + }, + { + "epoch": 1.57, + "grad_norm": 0.27528902888298035, + "learning_rate": 0.0001582590180858772, + "loss": 0.7578, + "step": 13050 + }, + { + "epoch": 1.57, + "grad_norm": 0.2460784763097763, + "learning_rate": 0.0001581562681817359, + "loss": 0.8231, + "step": 13055 + }, + { + "epoch": 1.57, + "grad_norm": 0.29714435338974, + "learning_rate": 0.00015805351443898388, + "loss": 0.7481, + "step": 13060 + }, + { + "epoch": 1.57, + "grad_norm": 0.22743813693523407, + "learning_rate": 0.0001579507569059806, + "loss": 0.7635, + "step": 13065 + }, + { + "epoch": 1.57, + "grad_norm": 0.24308447539806366, + "learning_rate": 0.00015784799563108706, + "loss": 0.8822, + "step": 13070 + }, + { + "epoch": 1.58, + "grad_norm": 0.2288735806941986, + "learning_rate": 0.00015774523066266612, + "loss": 0.8427, + "step": 13075 + }, + { + "epoch": 1.58, + "grad_norm": 0.23703986406326294, + "learning_rate": 0.00015764246204908245, + "loss": 0.9294, + "step": 13080 + }, + { + "epoch": 1.58, + "grad_norm": 0.2489616721868515, + "learning_rate": 0.0001575396898387023, + "loss": 0.9377, + "step": 13085 + }, + { + "epoch": 1.58, + "grad_norm": 0.25076568126678467, + "learning_rate": 0.00015743691407989378, + "loss": 0.9284, + "step": 13090 + }, + { + "epoch": 1.58, + "grad_norm": 0.247811421751976, + "learning_rate": 0.00015733413482102652, + "loss": 0.867, + "step": 13095 + }, + { + "epoch": 1.58, + "grad_norm": 0.2610926330089569, + "learning_rate": 0.00015723135211047186, + "loss": 0.8203, + "step": 13100 + }, + { + "epoch": 1.58, + "grad_norm": 0.2661215662956238, + "learning_rate": 0.00015712856599660267, + "loss": 0.9256, + "step": 13105 + }, + { + "epoch": 1.58, + "grad_norm": 0.22630742192268372, + "learning_rate": 0.00015702577652779368, + "loss": 0.8556, + "step": 13110 + }, + { + "epoch": 1.58, + "grad_norm": 0.3097597360610962, + "learning_rate": 0.00015692298375242087, + "loss": 0.8452, + "step": 13115 + }, + { + "epoch": 1.58, + "grad_norm": 0.2603763937950134, + "learning_rate": 0.00015682018771886203, + "loss": 0.8142, + "step": 13120 + }, + { + "epoch": 1.58, + "grad_norm": 0.2540077865123749, + "learning_rate": 0.00015671738847549633, + "loss": 0.9639, + "step": 13125 + }, + { + "epoch": 1.58, + "grad_norm": 0.24496349692344666, + "learning_rate": 0.0001566145860707046, + "loss": 1.0137, + "step": 13130 + }, + { + "epoch": 1.58, + "grad_norm": 0.2169584482908249, + "learning_rate": 0.00015651178055286897, + "loss": 0.9417, + "step": 13135 + }, + { + "epoch": 1.58, + "grad_norm": 0.23268476128578186, + "learning_rate": 0.0001564089719703732, + "loss": 0.91, + "step": 13140 + }, + { + "epoch": 1.58, + "grad_norm": 0.21907760202884674, + "learning_rate": 0.0001563061603716023, + "loss": 0.8338, + "step": 13145 + }, + { + "epoch": 1.58, + "grad_norm": 0.24328762292861938, + "learning_rate": 0.00015620334580494297, + "loss": 0.872, + "step": 13150 + }, + { + "epoch": 1.59, + "grad_norm": 0.26593923568725586, + "learning_rate": 0.00015610052831878304, + "loss": 0.9121, + "step": 13155 + }, + { + "epoch": 1.59, + "grad_norm": 0.2493075728416443, + "learning_rate": 0.00015599770796151196, + "loss": 0.8829, + "step": 13160 + }, + { + "epoch": 1.59, + "grad_norm": 0.2412213832139969, + "learning_rate": 0.00015589488478152027, + "loss": 0.8247, + "step": 13165 + }, + { + "epoch": 1.59, + "grad_norm": 0.2626934349536896, + "learning_rate": 0.00015579205882720014, + "loss": 0.8603, + "step": 13170 + }, + { + "epoch": 1.59, + "grad_norm": 0.2839190363883972, + "learning_rate": 0.0001556892301469447, + "loss": 0.8257, + "step": 13175 + }, + { + "epoch": 1.59, + "grad_norm": 0.28261008858680725, + "learning_rate": 0.0001555863987891486, + "loss": 0.7699, + "step": 13180 + }, + { + "epoch": 1.59, + "grad_norm": 0.2586536109447479, + "learning_rate": 0.00015548356480220773, + "loss": 0.9528, + "step": 13185 + }, + { + "epoch": 1.59, + "grad_norm": 0.25561654567718506, + "learning_rate": 0.0001553807282345192, + "loss": 0.809, + "step": 13190 + }, + { + "epoch": 1.59, + "grad_norm": 0.24702797830104828, + "learning_rate": 0.00015527788913448124, + "loss": 0.9084, + "step": 13195 + }, + { + "epoch": 1.59, + "grad_norm": 0.22921974956989288, + "learning_rate": 0.0001551750475504934, + "loss": 0.8321, + "step": 13200 + }, + { + "epoch": 1.59, + "grad_norm": 0.2360827922821045, + "learning_rate": 0.0001550722035309563, + "loss": 0.8252, + "step": 13205 + }, + { + "epoch": 1.59, + "grad_norm": 0.24088476598262787, + "learning_rate": 0.00015496935712427183, + "loss": 0.7994, + "step": 13210 + }, + { + "epoch": 1.59, + "grad_norm": 0.22839505970478058, + "learning_rate": 0.00015486650837884277, + "loss": 0.891, + "step": 13215 + }, + { + "epoch": 1.59, + "grad_norm": 0.257959246635437, + "learning_rate": 0.00015476365734307335, + "loss": 0.853, + "step": 13220 + }, + { + "epoch": 1.59, + "grad_norm": 0.23323047161102295, + "learning_rate": 0.00015466080406536853, + "loss": 0.8453, + "step": 13225 + }, + { + "epoch": 1.59, + "grad_norm": 0.24002352356910706, + "learning_rate": 0.00015455794859413458, + "loss": 0.7921, + "step": 13230 + }, + { + "epoch": 1.59, + "grad_norm": 0.270093709230423, + "learning_rate": 0.0001544550909777786, + "loss": 0.8082, + "step": 13235 + }, + { + "epoch": 1.6, + "grad_norm": 0.25525951385498047, + "learning_rate": 0.0001543522312647089, + "loss": 0.9131, + "step": 13240 + }, + { + "epoch": 1.6, + "grad_norm": 0.26229140162467957, + "learning_rate": 0.00015424936950333463, + "loss": 0.9213, + "step": 13245 + }, + { + "epoch": 1.6, + "grad_norm": 0.25442442297935486, + "learning_rate": 0.00015414650574206595, + "loss": 0.8611, + "step": 13250 + }, + { + "epoch": 1.6, + "grad_norm": 0.2250799685716629, + "learning_rate": 0.00015404364002931397, + "loss": 0.8195, + "step": 13255 + }, + { + "epoch": 1.6, + "grad_norm": 0.2337421178817749, + "learning_rate": 0.00015394077241349073, + "loss": 0.8752, + "step": 13260 + }, + { + "epoch": 1.6, + "grad_norm": 0.2822083830833435, + "learning_rate": 0.00015383790294300908, + "loss": 0.8054, + "step": 13265 + }, + { + "epoch": 1.6, + "grad_norm": 0.25542697310447693, + "learning_rate": 0.00015373503166628288, + "loss": 0.8523, + "step": 13270 + }, + { + "epoch": 1.6, + "grad_norm": 0.23330925405025482, + "learning_rate": 0.00015363215863172671, + "loss": 0.9308, + "step": 13275 + }, + { + "epoch": 1.6, + "grad_norm": 0.3322324752807617, + "learning_rate": 0.00015352928388775612, + "loss": 0.8854, + "step": 13280 + }, + { + "epoch": 1.6, + "grad_norm": 0.2551574110984802, + "learning_rate": 0.00015342640748278725, + "loss": 0.8225, + "step": 13285 + }, + { + "epoch": 1.6, + "grad_norm": 0.2288314402103424, + "learning_rate": 0.00015332352946523733, + "loss": 0.8141, + "step": 13290 + }, + { + "epoch": 1.6, + "grad_norm": 0.28480198979377747, + "learning_rate": 0.000153220649883524, + "loss": 0.7063, + "step": 13295 + }, + { + "epoch": 1.6, + "grad_norm": 0.26448380947113037, + "learning_rate": 0.0001531177687860659, + "loss": 0.8755, + "step": 13300 + }, + { + "epoch": 1.6, + "grad_norm": 0.29811567068099976, + "learning_rate": 0.00015301488622128224, + "loss": 0.867, + "step": 13305 + }, + { + "epoch": 1.6, + "grad_norm": 0.2261502593755722, + "learning_rate": 0.00015291200223759306, + "loss": 0.881, + "step": 13310 + }, + { + "epoch": 1.6, + "grad_norm": 0.26991018652915955, + "learning_rate": 0.0001528091168834189, + "loss": 0.835, + "step": 13315 + }, + { + "epoch": 1.6, + "grad_norm": 0.2542611062526703, + "learning_rate": 0.00015270623020718102, + "loss": 0.8976, + "step": 13320 + }, + { + "epoch": 1.61, + "grad_norm": 0.2485206127166748, + "learning_rate": 0.00015260334225730137, + "loss": 0.8939, + "step": 13325 + }, + { + "epoch": 1.61, + "grad_norm": 0.26317083835601807, + "learning_rate": 0.00015250045308220236, + "loss": 0.8537, + "step": 13330 + }, + { + "epoch": 1.61, + "grad_norm": 0.23447948694229126, + "learning_rate": 0.00015239756273030715, + "loss": 0.8983, + "step": 13335 + }, + { + "epoch": 1.61, + "grad_norm": 0.25760382413864136, + "learning_rate": 0.00015229467125003925, + "loss": 0.7641, + "step": 13340 + }, + { + "epoch": 1.61, + "grad_norm": 0.24973775446414948, + "learning_rate": 0.00015219177868982286, + "loss": 0.8753, + "step": 13345 + }, + { + "epoch": 1.61, + "grad_norm": 0.26316988468170166, + "learning_rate": 0.00015208888509808267, + "loss": 0.9377, + "step": 13350 + }, + { + "epoch": 1.61, + "grad_norm": 0.23706527054309845, + "learning_rate": 0.00015198599052324377, + "loss": 0.8637, + "step": 13355 + }, + { + "epoch": 1.61, + "grad_norm": 0.2416650950908661, + "learning_rate": 0.00015188309501373175, + "loss": 0.8773, + "step": 13360 + }, + { + "epoch": 1.61, + "grad_norm": 0.24447450041770935, + "learning_rate": 0.0001517801986179727, + "loss": 0.8913, + "step": 13365 + }, + { + "epoch": 1.61, + "grad_norm": 0.24984611570835114, + "learning_rate": 0.00015167730138439305, + "loss": 0.8512, + "step": 13370 + }, + { + "epoch": 1.61, + "grad_norm": 0.2462444007396698, + "learning_rate": 0.00015157440336141967, + "loss": 0.934, + "step": 13375 + }, + { + "epoch": 1.61, + "grad_norm": 0.23950566351413727, + "learning_rate": 0.0001514715045974798, + "loss": 0.8259, + "step": 13380 + }, + { + "epoch": 1.61, + "grad_norm": 0.23663043975830078, + "learning_rate": 0.000151368605141001, + "loss": 0.7744, + "step": 13385 + }, + { + "epoch": 1.61, + "grad_norm": 0.2686599791049957, + "learning_rate": 0.00015126570504041115, + "loss": 0.8617, + "step": 13390 + }, + { + "epoch": 1.61, + "grad_norm": 0.22998297214508057, + "learning_rate": 0.0001511628043441385, + "loss": 0.8539, + "step": 13395 + }, + { + "epoch": 1.61, + "grad_norm": 0.2632577121257782, + "learning_rate": 0.00015105990310061146, + "loss": 0.8452, + "step": 13400 + }, + { + "epoch": 1.62, + "grad_norm": 0.2295655459165573, + "learning_rate": 0.00015095700135825887, + "loss": 0.9384, + "step": 13405 + }, + { + "epoch": 1.62, + "grad_norm": 0.24616779386997223, + "learning_rate": 0.00015085409916550961, + "loss": 0.8459, + "step": 13410 + }, + { + "epoch": 1.62, + "grad_norm": 0.25910162925720215, + "learning_rate": 0.00015075119657079298, + "loss": 0.8525, + "step": 13415 + }, + { + "epoch": 1.62, + "grad_norm": 0.22823968529701233, + "learning_rate": 0.00015064829362253828, + "loss": 0.8893, + "step": 13420 + }, + { + "epoch": 1.62, + "grad_norm": 0.2842884063720703, + "learning_rate": 0.0001505453903691751, + "loss": 0.806, + "step": 13425 + }, + { + "epoch": 1.62, + "grad_norm": 0.24456340074539185, + "learning_rate": 0.00015044248685913304, + "loss": 0.871, + "step": 13430 + }, + { + "epoch": 1.62, + "grad_norm": 0.23443590104579926, + "learning_rate": 0.00015033958314084202, + "loss": 0.8712, + "step": 13435 + }, + { + "epoch": 1.62, + "grad_norm": 0.28807491064071655, + "learning_rate": 0.00015023667926273183, + "loss": 0.8278, + "step": 13440 + }, + { + "epoch": 1.62, + "grad_norm": 0.28618142008781433, + "learning_rate": 0.00015013377527323257, + "loss": 0.9076, + "step": 13445 + }, + { + "epoch": 1.62, + "grad_norm": 0.2491266131401062, + "learning_rate": 0.0001500308712207742, + "loss": 0.8047, + "step": 13450 + }, + { + "epoch": 1.62, + "grad_norm": 0.25134310126304626, + "learning_rate": 0.00014992796715378686, + "loss": 0.8135, + "step": 13455 + }, + { + "epoch": 1.62, + "grad_norm": 0.2570821940898895, + "learning_rate": 0.00014982506312070053, + "loss": 0.8727, + "step": 13460 + }, + { + "epoch": 1.62, + "grad_norm": 0.23932205140590668, + "learning_rate": 0.0001497221591699453, + "loss": 0.8599, + "step": 13465 + }, + { + "epoch": 1.62, + "grad_norm": 0.274517685174942, + "learning_rate": 0.00014961925534995118, + "loss": 0.8436, + "step": 13470 + }, + { + "epoch": 1.62, + "grad_norm": 0.24874992668628693, + "learning_rate": 0.0001495163517091482, + "loss": 0.8423, + "step": 13475 + }, + { + "epoch": 1.62, + "grad_norm": 0.23968589305877686, + "learning_rate": 0.00014941344829596612, + "loss": 0.9005, + "step": 13480 + }, + { + "epoch": 1.62, + "grad_norm": 0.25740721821784973, + "learning_rate": 0.00014931054515883473, + "loss": 0.8737, + "step": 13485 + }, + { + "epoch": 1.63, + "grad_norm": 0.22568397223949432, + "learning_rate": 0.00014920764234618373, + "loss": 0.9345, + "step": 13490 + }, + { + "epoch": 1.63, + "grad_norm": 0.2677795886993408, + "learning_rate": 0.00014910473990644254, + "loss": 0.8573, + "step": 13495 + }, + { + "epoch": 1.63, + "grad_norm": 0.2688808739185333, + "learning_rate": 0.00014900183788804048, + "loss": 0.8759, + "step": 13500 + }, + { + "epoch": 1.63, + "grad_norm": 0.25429514050483704, + "learning_rate": 0.0001488989363394066, + "loss": 0.7835, + "step": 13505 + }, + { + "epoch": 1.63, + "grad_norm": 0.3000656068325043, + "learning_rate": 0.00014879603530896992, + "loss": 0.8517, + "step": 13510 + }, + { + "epoch": 1.63, + "grad_norm": 0.2686077356338501, + "learning_rate": 0.00014869313484515897, + "loss": 0.9386, + "step": 13515 + }, + { + "epoch": 1.63, + "grad_norm": 0.2108505368232727, + "learning_rate": 0.0001485902349964022, + "loss": 0.8535, + "step": 13520 + }, + { + "epoch": 1.63, + "grad_norm": 0.2740882635116577, + "learning_rate": 0.0001484873358111276, + "loss": 0.9123, + "step": 13525 + }, + { + "epoch": 1.63, + "grad_norm": 0.234098881483078, + "learning_rate": 0.00014838443733776306, + "loss": 0.8044, + "step": 13530 + }, + { + "epoch": 1.63, + "grad_norm": 0.2779216468334198, + "learning_rate": 0.00014828153962473593, + "loss": 0.8323, + "step": 13535 + }, + { + "epoch": 1.63, + "grad_norm": 0.24527031183242798, + "learning_rate": 0.00014817864272047334, + "loss": 0.8261, + "step": 13540 + }, + { + "epoch": 1.63, + "grad_norm": 0.2469957321882248, + "learning_rate": 0.00014807574667340188, + "loss": 0.836, + "step": 13545 + }, + { + "epoch": 1.63, + "grad_norm": 0.24512287974357605, + "learning_rate": 0.00014797285153194805, + "loss": 0.799, + "step": 13550 + }, + { + "epoch": 1.63, + "grad_norm": 0.26496610045433044, + "learning_rate": 0.00014786995734453756, + "loss": 0.8458, + "step": 13555 + }, + { + "epoch": 1.63, + "grad_norm": 0.2510302662849426, + "learning_rate": 0.0001477670641595959, + "loss": 0.8904, + "step": 13560 + }, + { + "epoch": 1.63, + "grad_norm": 0.22821520268917084, + "learning_rate": 0.00014766417202554798, + "loss": 0.8772, + "step": 13565 + }, + { + "epoch": 1.64, + "grad_norm": 0.27513188123703003, + "learning_rate": 0.0001475612809908183, + "loss": 0.81, + "step": 13570 + }, + { + "epoch": 1.64, + "grad_norm": 0.24558843672275543, + "learning_rate": 0.00014745839110383077, + "loss": 0.7985, + "step": 13575 + }, + { + "epoch": 1.64, + "grad_norm": 0.26508501172065735, + "learning_rate": 0.0001473555024130088, + "loss": 0.8178, + "step": 13580 + }, + { + "epoch": 1.64, + "grad_norm": 0.23892903327941895, + "learning_rate": 0.00014725261496677513, + "loss": 0.8101, + "step": 13585 + }, + { + "epoch": 1.64, + "grad_norm": 0.24258942902088165, + "learning_rate": 0.00014714972881355216, + "loss": 0.875, + "step": 13590 + }, + { + "epoch": 1.64, + "grad_norm": 0.26198121905326843, + "learning_rate": 0.0001470468440017615, + "loss": 0.8671, + "step": 13595 + }, + { + "epoch": 1.64, + "grad_norm": 0.22335876524448395, + "learning_rate": 0.0001469439605798241, + "loss": 0.7895, + "step": 13600 + }, + { + "epoch": 1.64, + "grad_norm": 0.21281731128692627, + "learning_rate": 0.0001468410785961603, + "loss": 0.8186, + "step": 13605 + }, + { + "epoch": 1.64, + "grad_norm": 0.26256877183914185, + "learning_rate": 0.00014673819809918985, + "loss": 0.8658, + "step": 13610 + }, + { + "epoch": 1.64, + "grad_norm": 0.2902090549468994, + "learning_rate": 0.0001466353191373317, + "loss": 0.8785, + "step": 13615 + }, + { + "epoch": 1.64, + "grad_norm": 0.2701566815376282, + "learning_rate": 0.0001465324417590041, + "loss": 0.9397, + "step": 13620 + }, + { + "epoch": 1.64, + "grad_norm": 0.2777699828147888, + "learning_rate": 0.00014642956601262452, + "loss": 0.8809, + "step": 13625 + }, + { + "epoch": 1.64, + "grad_norm": 0.24040934443473816, + "learning_rate": 0.0001463266919466098, + "loss": 0.9082, + "step": 13630 + }, + { + "epoch": 1.64, + "grad_norm": 0.2828356623649597, + "learning_rate": 0.0001462238196093758, + "loss": 0.8458, + "step": 13635 + }, + { + "epoch": 1.64, + "grad_norm": 0.24517838656902313, + "learning_rate": 0.00014612094904933772, + "loss": 0.9284, + "step": 13640 + }, + { + "epoch": 1.64, + "grad_norm": 0.23123225569725037, + "learning_rate": 0.00014601808031490982, + "loss": 0.7244, + "step": 13645 + }, + { + "epoch": 1.64, + "grad_norm": 0.23790529370307922, + "learning_rate": 0.00014591521345450558, + "loss": 0.7898, + "step": 13650 + }, + { + "epoch": 1.65, + "grad_norm": 0.2609568238258362, + "learning_rate": 0.00014581234851653753, + "loss": 0.8663, + "step": 13655 + }, + { + "epoch": 1.65, + "grad_norm": 0.36788272857666016, + "learning_rate": 0.00014570948554941735, + "loss": 0.8454, + "step": 13660 + }, + { + "epoch": 1.65, + "grad_norm": 0.23802486062049866, + "learning_rate": 0.0001456066246015557, + "loss": 0.9379, + "step": 13665 + }, + { + "epoch": 1.65, + "grad_norm": 0.26445284485816956, + "learning_rate": 0.00014550376572136246, + "loss": 0.9054, + "step": 13670 + }, + { + "epoch": 1.65, + "grad_norm": 0.23677241802215576, + "learning_rate": 0.0001454009089572464, + "loss": 0.7262, + "step": 13675 + }, + { + "epoch": 1.65, + "grad_norm": 0.2473301738500595, + "learning_rate": 0.0001452980543576153, + "loss": 0.7921, + "step": 13680 + }, + { + "epoch": 1.65, + "grad_norm": 0.25561195611953735, + "learning_rate": 0.0001451952019708759, + "loss": 0.8822, + "step": 13685 + }, + { + "epoch": 1.65, + "grad_norm": 0.27183815836906433, + "learning_rate": 0.0001450923518454341, + "loss": 0.9218, + "step": 13690 + }, + { + "epoch": 1.65, + "grad_norm": 0.22998422384262085, + "learning_rate": 0.0001449895040296945, + "loss": 0.909, + "step": 13695 + }, + { + "epoch": 1.65, + "grad_norm": 0.2734430730342865, + "learning_rate": 0.00014488665857206065, + "loss": 0.8962, + "step": 13700 + }, + { + "epoch": 1.65, + "grad_norm": 0.23932689428329468, + "learning_rate": 0.0001447838155209351, + "loss": 0.8714, + "step": 13705 + }, + { + "epoch": 1.65, + "grad_norm": 0.25987210869789124, + "learning_rate": 0.0001446809749247192, + "loss": 0.7435, + "step": 13710 + }, + { + "epoch": 1.65, + "grad_norm": 0.2543168365955353, + "learning_rate": 0.00014457813683181316, + "loss": 0.8738, + "step": 13715 + }, + { + "epoch": 1.65, + "grad_norm": 0.3010767698287964, + "learning_rate": 0.00014447530129061597, + "loss": 0.7546, + "step": 13720 + }, + { + "epoch": 1.65, + "grad_norm": 0.24551789462566376, + "learning_rate": 0.00014437246834952537, + "loss": 0.8564, + "step": 13725 + }, + { + "epoch": 1.65, + "grad_norm": 0.22315555810928345, + "learning_rate": 0.00014426963805693816, + "loss": 0.8453, + "step": 13730 + }, + { + "epoch": 1.65, + "grad_norm": 0.2511034905910492, + "learning_rate": 0.00014416681046124953, + "loss": 0.7888, + "step": 13735 + }, + { + "epoch": 1.66, + "grad_norm": 0.28927719593048096, + "learning_rate": 0.00014406398561085364, + "loss": 0.8495, + "step": 13740 + }, + { + "epoch": 1.66, + "grad_norm": 0.23084275424480438, + "learning_rate": 0.00014396116355414322, + "loss": 0.8212, + "step": 13745 + }, + { + "epoch": 1.66, + "grad_norm": 0.29675766825675964, + "learning_rate": 0.0001438583443395098, + "loss": 0.97, + "step": 13750 + }, + { + "epoch": 1.66, + "grad_norm": 0.27987009286880493, + "learning_rate": 0.00014375552801534352, + "loss": 0.8982, + "step": 13755 + }, + { + "epoch": 1.66, + "grad_norm": 0.21575410664081573, + "learning_rate": 0.00014365271463003307, + "loss": 0.9205, + "step": 13760 + }, + { + "epoch": 1.66, + "grad_norm": 0.27890732884407043, + "learning_rate": 0.0001435499042319659, + "loss": 0.8773, + "step": 13765 + }, + { + "epoch": 1.66, + "grad_norm": 0.23023445904254913, + "learning_rate": 0.00014344709686952802, + "loss": 0.9438, + "step": 13770 + }, + { + "epoch": 1.66, + "grad_norm": 0.2396959662437439, + "learning_rate": 0.000143344292591104, + "loss": 0.8498, + "step": 13775 + }, + { + "epoch": 1.66, + "grad_norm": 0.2347070574760437, + "learning_rate": 0.0001432414914450769, + "loss": 0.8571, + "step": 13780 + }, + { + "epoch": 1.66, + "grad_norm": 0.2968361973762512, + "learning_rate": 0.00014313869347982831, + "loss": 0.7974, + "step": 13785 + }, + { + "epoch": 1.66, + "grad_norm": 0.26392439007759094, + "learning_rate": 0.0001430358987437385, + "loss": 0.8104, + "step": 13790 + }, + { + "epoch": 1.66, + "grad_norm": 0.25702106952667236, + "learning_rate": 0.000142933107285186, + "loss": 0.8631, + "step": 13795 + }, + { + "epoch": 1.66, + "grad_norm": 0.25434088706970215, + "learning_rate": 0.0001428303191525479, + "loss": 0.8464, + "step": 13800 + }, + { + "epoch": 1.66, + "grad_norm": 0.23187103867530823, + "learning_rate": 0.00014272753439419962, + "loss": 0.8209, + "step": 13805 + }, + { + "epoch": 1.66, + "grad_norm": 0.2695891261100769, + "learning_rate": 0.00014262475305851523, + "loss": 0.8192, + "step": 13810 + }, + { + "epoch": 1.66, + "grad_norm": 0.24711039662361145, + "learning_rate": 0.000142521975193867, + "loss": 0.9271, + "step": 13815 + }, + { + "epoch": 1.67, + "grad_norm": 0.25541651248931885, + "learning_rate": 0.00014241920084862554, + "loss": 0.8063, + "step": 13820 + }, + { + "epoch": 1.67, + "grad_norm": 0.22565728425979614, + "learning_rate": 0.00014231643007115994, + "loss": 0.8552, + "step": 13825 + }, + { + "epoch": 1.67, + "grad_norm": 0.24936147034168243, + "learning_rate": 0.0001422136629098375, + "loss": 0.872, + "step": 13830 + }, + { + "epoch": 1.67, + "grad_norm": 0.29638171195983887, + "learning_rate": 0.0001421108994130239, + "loss": 0.9048, + "step": 13835 + }, + { + "epoch": 1.67, + "grad_norm": 0.23596297204494476, + "learning_rate": 0.00014200813962908293, + "loss": 0.7751, + "step": 13840 + }, + { + "epoch": 1.67, + "grad_norm": 0.23561915755271912, + "learning_rate": 0.00014190538360637695, + "loss": 0.8732, + "step": 13845 + }, + { + "epoch": 1.67, + "grad_norm": 0.24926547706127167, + "learning_rate": 0.00014180263139326624, + "loss": 0.8895, + "step": 13850 + }, + { + "epoch": 1.67, + "grad_norm": 0.30811989307403564, + "learning_rate": 0.00014169988303810942, + "loss": 0.8066, + "step": 13855 + }, + { + "epoch": 1.67, + "grad_norm": 0.243475079536438, + "learning_rate": 0.00014159713858926323, + "loss": 0.8425, + "step": 13860 + }, + { + "epoch": 1.67, + "grad_norm": 0.23428189754486084, + "learning_rate": 0.00014149439809508273, + "loss": 0.8981, + "step": 13865 + }, + { + "epoch": 1.67, + "grad_norm": 0.26283276081085205, + "learning_rate": 0.00014139166160392094, + "loss": 0.8915, + "step": 13870 + }, + { + "epoch": 1.67, + "grad_norm": 0.2469039112329483, + "learning_rate": 0.00014128892916412907, + "loss": 0.7645, + "step": 13875 + }, + { + "epoch": 1.67, + "grad_norm": 0.2410871535539627, + "learning_rate": 0.00014118620082405637, + "loss": 0.811, + "step": 13880 + }, + { + "epoch": 1.67, + "grad_norm": 0.23918049037456512, + "learning_rate": 0.00014108347663205033, + "loss": 0.7641, + "step": 13885 + }, + { + "epoch": 1.67, + "grad_norm": 0.2121327817440033, + "learning_rate": 0.00014098075663645628, + "loss": 0.9609, + "step": 13890 + }, + { + "epoch": 1.67, + "grad_norm": 0.2228885442018509, + "learning_rate": 0.0001408780408856177, + "loss": 0.9062, + "step": 13895 + }, + { + "epoch": 1.67, + "grad_norm": 0.23560738563537598, + "learning_rate": 0.0001407753294278759, + "loss": 0.7888, + "step": 13900 + }, + { + "epoch": 1.68, + "grad_norm": 0.25033679604530334, + "learning_rate": 0.0001406726223115705, + "loss": 0.8781, + "step": 13905 + }, + { + "epoch": 1.68, + "grad_norm": 0.26739415526390076, + "learning_rate": 0.00014056991958503882, + "loss": 0.8365, + "step": 13910 + }, + { + "epoch": 1.68, + "grad_norm": 0.24519668519496918, + "learning_rate": 0.0001404672212966161, + "loss": 0.9437, + "step": 13915 + }, + { + "epoch": 1.68, + "grad_norm": 0.232150599360466, + "learning_rate": 0.0001403645274946356, + "loss": 0.7994, + "step": 13920 + }, + { + "epoch": 1.68, + "grad_norm": 0.25735652446746826, + "learning_rate": 0.00014026183822742847, + "loss": 0.8427, + "step": 13925 + }, + { + "epoch": 1.68, + "grad_norm": 0.2597762644290924, + "learning_rate": 0.00014015915354332367, + "loss": 0.8662, + "step": 13930 + }, + { + "epoch": 1.68, + "grad_norm": 0.2447414994239807, + "learning_rate": 0.000140056473490648, + "loss": 0.8525, + "step": 13935 + }, + { + "epoch": 1.68, + "grad_norm": 0.24776887893676758, + "learning_rate": 0.0001399537981177261, + "loss": 0.7445, + "step": 13940 + }, + { + "epoch": 1.68, + "grad_norm": 0.2430184930562973, + "learning_rate": 0.00013985112747288048, + "loss": 0.8798, + "step": 13945 + }, + { + "epoch": 1.68, + "grad_norm": 0.26686951518058777, + "learning_rate": 0.00013974846160443128, + "loss": 0.8167, + "step": 13950 + }, + { + "epoch": 1.68, + "grad_norm": 0.2600416839122772, + "learning_rate": 0.0001396458005606965, + "loss": 0.7542, + "step": 13955 + }, + { + "epoch": 1.68, + "grad_norm": 0.262658953666687, + "learning_rate": 0.0001395431443899918, + "loss": 0.8568, + "step": 13960 + }, + { + "epoch": 1.68, + "grad_norm": 0.24480225145816803, + "learning_rate": 0.00013944049314063063, + "loss": 0.8143, + "step": 13965 + }, + { + "epoch": 1.68, + "grad_norm": 0.25501397252082825, + "learning_rate": 0.0001393378468609241, + "loss": 0.7804, + "step": 13970 + }, + { + "epoch": 1.68, + "grad_norm": 0.24708352982997894, + "learning_rate": 0.00013923520559918086, + "loss": 0.8356, + "step": 13975 + }, + { + "epoch": 1.68, + "grad_norm": 0.2388540506362915, + "learning_rate": 0.00013913256940370733, + "loss": 0.876, + "step": 13980 + }, + { + "epoch": 1.69, + "grad_norm": 0.2332463413476944, + "learning_rate": 0.00013902993832280757, + "loss": 0.7916, + "step": 13985 + }, + { + "epoch": 1.69, + "grad_norm": 0.2795470058917999, + "learning_rate": 0.00013892731240478317, + "loss": 0.8881, + "step": 13990 + }, + { + "epoch": 1.69, + "grad_norm": 0.2751898169517517, + "learning_rate": 0.00013882469169793324, + "loss": 0.7779, + "step": 13995 + }, + { + "epoch": 1.69, + "grad_norm": 0.27980801463127136, + "learning_rate": 0.00013872207625055449, + "loss": 0.8326, + "step": 14000 + }, + { + "epoch": 1.69, + "grad_norm": 0.2469402402639389, + "learning_rate": 0.00013861946611094125, + "loss": 0.85, + "step": 14005 + }, + { + "epoch": 1.69, + "grad_norm": 0.23618659377098083, + "learning_rate": 0.00013851686132738516, + "loss": 0.7895, + "step": 14010 + }, + { + "epoch": 1.69, + "grad_norm": 0.25577905774116516, + "learning_rate": 0.00013841426194817548, + "loss": 0.8099, + "step": 14015 + }, + { + "epoch": 1.69, + "grad_norm": 0.23130281269550323, + "learning_rate": 0.0001383116680215988, + "loss": 0.9164, + "step": 14020 + }, + { + "epoch": 1.69, + "grad_norm": 0.2609565258026123, + "learning_rate": 0.00013820907959593938, + "loss": 0.8081, + "step": 14025 + }, + { + "epoch": 1.69, + "grad_norm": 0.24102197587490082, + "learning_rate": 0.00013810649671947868, + "loss": 0.7962, + "step": 14030 + }, + { + "epoch": 1.69, + "grad_norm": 0.23725546896457672, + "learning_rate": 0.0001380039194404956, + "loss": 0.813, + "step": 14035 + }, + { + "epoch": 1.69, + "grad_norm": 0.24073052406311035, + "learning_rate": 0.00013790134780726634, + "loss": 0.7779, + "step": 14040 + }, + { + "epoch": 1.69, + "grad_norm": 0.2828931510448456, + "learning_rate": 0.00013779878186806463, + "loss": 0.8943, + "step": 14045 + }, + { + "epoch": 1.69, + "grad_norm": 0.28520870208740234, + "learning_rate": 0.00013769622167116138, + "loss": 0.8855, + "step": 14050 + }, + { + "epoch": 1.69, + "grad_norm": 0.2628406882286072, + "learning_rate": 0.0001375936672648248, + "loss": 0.796, + "step": 14055 + }, + { + "epoch": 1.69, + "grad_norm": 0.23413865268230438, + "learning_rate": 0.00013749111869732034, + "loss": 0.9203, + "step": 14060 + }, + { + "epoch": 1.69, + "grad_norm": 0.24607300758361816, + "learning_rate": 0.0001373885760169109, + "loss": 0.8389, + "step": 14065 + }, + { + "epoch": 1.7, + "grad_norm": 0.23336441814899445, + "learning_rate": 0.00013728603927185644, + "loss": 0.8227, + "step": 14070 + }, + { + "epoch": 1.7, + "grad_norm": 0.284675270318985, + "learning_rate": 0.00013718350851041407, + "loss": 0.8254, + "step": 14075 + }, + { + "epoch": 1.7, + "grad_norm": 0.2564062178134918, + "learning_rate": 0.00013708098378083813, + "loss": 0.7736, + "step": 14080 + }, + { + "epoch": 1.7, + "grad_norm": 0.2292274683713913, + "learning_rate": 0.00013697846513138035, + "loss": 0.8294, + "step": 14085 + }, + { + "epoch": 1.7, + "grad_norm": 0.2765873074531555, + "learning_rate": 0.0001368759526102893, + "loss": 0.8284, + "step": 14090 + }, + { + "epoch": 1.7, + "grad_norm": 0.2501067519187927, + "learning_rate": 0.0001367734462658108, + "loss": 0.9865, + "step": 14095 + }, + { + "epoch": 1.7, + "grad_norm": 0.26736894249916077, + "learning_rate": 0.00013667094614618766, + "loss": 0.7624, + "step": 14100 + }, + { + "epoch": 1.7, + "grad_norm": 0.24033088982105255, + "learning_rate": 0.00013656845229965996, + "loss": 0.8987, + "step": 14105 + }, + { + "epoch": 1.7, + "grad_norm": 0.2716580927371979, + "learning_rate": 0.00013646596477446467, + "loss": 0.9425, + "step": 14110 + }, + { + "epoch": 1.7, + "grad_norm": 0.2522694766521454, + "learning_rate": 0.00013636348361883578, + "loss": 0.9069, + "step": 14115 + }, + { + "epoch": 1.7, + "grad_norm": 0.25670236349105835, + "learning_rate": 0.00013626100888100432, + "loss": 0.9238, + "step": 14120 + }, + { + "epoch": 1.7, + "grad_norm": 0.26054003834724426, + "learning_rate": 0.00013615854060919838, + "loss": 0.8625, + "step": 14125 + }, + { + "epoch": 1.7, + "grad_norm": 0.23610499501228333, + "learning_rate": 0.0001360560788516429, + "loss": 0.7802, + "step": 14130 + }, + { + "epoch": 1.7, + "grad_norm": 0.2758851647377014, + "learning_rate": 0.0001359536236565598, + "loss": 0.8364, + "step": 14135 + }, + { + "epoch": 1.7, + "grad_norm": 0.26497259736061096, + "learning_rate": 0.0001358511750721678, + "loss": 0.8287, + "step": 14140 + }, + { + "epoch": 1.7, + "grad_norm": 0.24857574701309204, + "learning_rate": 0.0001357487331466827, + "loss": 0.862, + "step": 14145 + }, + { + "epoch": 1.7, + "grad_norm": 0.28877443075180054, + "learning_rate": 0.0001356462979283171, + "loss": 0.8711, + "step": 14150 + }, + { + "epoch": 1.71, + "grad_norm": 0.2738126218318939, + "learning_rate": 0.00013554386946528033, + "loss": 0.9011, + "step": 14155 + }, + { + "epoch": 1.71, + "grad_norm": 0.26807695627212524, + "learning_rate": 0.0001354414478057786, + "loss": 0.7915, + "step": 14160 + }, + { + "epoch": 1.71, + "grad_norm": 0.269781231880188, + "learning_rate": 0.0001353390329980151, + "loss": 0.8746, + "step": 14165 + }, + { + "epoch": 1.71, + "grad_norm": 0.2851022183895111, + "learning_rate": 0.00013523662509018952, + "loss": 0.8851, + "step": 14170 + }, + { + "epoch": 1.71, + "grad_norm": 0.26294824481010437, + "learning_rate": 0.00013513422413049847, + "loss": 0.7998, + "step": 14175 + }, + { + "epoch": 1.71, + "grad_norm": 0.23697425425052643, + "learning_rate": 0.00013503183016713518, + "loss": 0.8183, + "step": 14180 + }, + { + "epoch": 1.71, + "grad_norm": 0.22184935212135315, + "learning_rate": 0.0001349294432482897, + "loss": 0.7641, + "step": 14185 + }, + { + "epoch": 1.71, + "grad_norm": 0.23600788414478302, + "learning_rate": 0.00013482706342214873, + "loss": 0.8828, + "step": 14190 + }, + { + "epoch": 1.71, + "grad_norm": 0.26765936613082886, + "learning_rate": 0.0001347246907368956, + "loss": 0.8785, + "step": 14195 + }, + { + "epoch": 1.71, + "grad_norm": 0.2510226368904114, + "learning_rate": 0.00013462232524071022, + "loss": 0.9082, + "step": 14200 + }, + { + "epoch": 1.71, + "grad_norm": 0.2888829708099365, + "learning_rate": 0.0001345199669817693, + "loss": 0.8412, + "step": 14205 + }, + { + "epoch": 1.71, + "grad_norm": 0.22700832784175873, + "learning_rate": 0.00013441761600824602, + "loss": 0.8552, + "step": 14210 + }, + { + "epoch": 1.71, + "grad_norm": 0.2521451711654663, + "learning_rate": 0.0001343152723683101, + "loss": 0.7209, + "step": 14215 + }, + { + "epoch": 1.71, + "grad_norm": 0.24360665678977966, + "learning_rate": 0.00013421293611012784, + "loss": 0.8414, + "step": 14220 + }, + { + "epoch": 1.71, + "grad_norm": 0.2193845957517624, + "learning_rate": 0.00013411060728186217, + "loss": 0.7894, + "step": 14225 + }, + { + "epoch": 1.71, + "grad_norm": 0.2685592472553253, + "learning_rate": 0.00013400828593167238, + "loss": 0.8794, + "step": 14230 + }, + { + "epoch": 1.72, + "grad_norm": 0.23406338691711426, + "learning_rate": 0.0001339059721077143, + "loss": 0.8955, + "step": 14235 + }, + { + "epoch": 1.72, + "grad_norm": 0.2340015023946762, + "learning_rate": 0.00013380366585814016, + "loss": 0.7859, + "step": 14240 + }, + { + "epoch": 1.72, + "grad_norm": 0.2922716438770294, + "learning_rate": 0.00013370136723109876, + "loss": 0.7737, + "step": 14245 + }, + { + "epoch": 1.72, + "grad_norm": 0.26283466815948486, + "learning_rate": 0.0001335990762747352, + "loss": 0.8702, + "step": 14250 + }, + { + "epoch": 1.72, + "grad_norm": 0.2611430287361145, + "learning_rate": 0.00013349679303719105, + "loss": 0.7799, + "step": 14255 + }, + { + "epoch": 1.72, + "grad_norm": 0.2265879213809967, + "learning_rate": 0.00013339451756660408, + "loss": 0.8453, + "step": 14260 + }, + { + "epoch": 1.72, + "grad_norm": 0.2740434408187866, + "learning_rate": 0.00013329224991110865, + "loss": 0.854, + "step": 14265 + }, + { + "epoch": 1.72, + "grad_norm": 0.23583091795444489, + "learning_rate": 0.00013318999011883526, + "loss": 0.8633, + "step": 14270 + }, + { + "epoch": 1.72, + "grad_norm": 0.21470926702022552, + "learning_rate": 0.00013308773823791074, + "loss": 0.8227, + "step": 14275 + }, + { + "epoch": 1.72, + "grad_norm": 0.2593861520290375, + "learning_rate": 0.0001329854943164582, + "loss": 0.7781, + "step": 14280 + }, + { + "epoch": 1.72, + "grad_norm": 0.27231550216674805, + "learning_rate": 0.00013288325840259715, + "loss": 0.8061, + "step": 14285 + }, + { + "epoch": 1.72, + "grad_norm": 0.2605496644973755, + "learning_rate": 0.0001327810305444431, + "loss": 0.9153, + "step": 14290 + }, + { + "epoch": 1.72, + "grad_norm": 0.280644953250885, + "learning_rate": 0.0001326788107901079, + "loss": 0.8612, + "step": 14295 + }, + { + "epoch": 1.72, + "grad_norm": 0.2497917264699936, + "learning_rate": 0.0001325765991876995, + "loss": 0.7707, + "step": 14300 + }, + { + "epoch": 1.72, + "grad_norm": 0.26758888363838196, + "learning_rate": 0.0001324743957853222, + "loss": 0.8533, + "step": 14305 + }, + { + "epoch": 1.72, + "grad_norm": 0.2774585485458374, + "learning_rate": 0.00013237220063107625, + "loss": 0.9308, + "step": 14310 + }, + { + "epoch": 1.72, + "grad_norm": 0.21268567442893982, + "learning_rate": 0.00013227001377305806, + "loss": 0.8137, + "step": 14315 + }, + { + "epoch": 1.73, + "grad_norm": 0.2505970001220703, + "learning_rate": 0.0001321678352593602, + "loss": 0.7499, + "step": 14320 + }, + { + "epoch": 1.73, + "grad_norm": 0.2653083801269531, + "learning_rate": 0.00013206566513807125, + "loss": 0.8927, + "step": 14325 + }, + { + "epoch": 1.73, + "grad_norm": 0.2813301086425781, + "learning_rate": 0.0001319635034572759, + "loss": 0.8058, + "step": 14330 + }, + { + "epoch": 1.73, + "grad_norm": 0.23401866853237152, + "learning_rate": 0.0001318613502650547, + "loss": 0.8581, + "step": 14335 + }, + { + "epoch": 1.73, + "grad_norm": 0.24355721473693848, + "learning_rate": 0.0001317592056094845, + "loss": 0.9558, + "step": 14340 + }, + { + "epoch": 1.73, + "grad_norm": 0.23587583005428314, + "learning_rate": 0.0001316570695386379, + "loss": 0.8461, + "step": 14345 + }, + { + "epoch": 1.73, + "grad_norm": 0.2816319763660431, + "learning_rate": 0.00013155494210058353, + "loss": 0.8245, + "step": 14350 + }, + { + "epoch": 1.73, + "grad_norm": 0.2452281266450882, + "learning_rate": 0.0001314528233433859, + "loss": 0.8102, + "step": 14355 + }, + { + "epoch": 1.73, + "grad_norm": 0.2361225038766861, + "learning_rate": 0.0001313507133151056, + "loss": 0.7548, + "step": 14360 + }, + { + "epoch": 1.73, + "grad_norm": 0.2706858217716217, + "learning_rate": 0.0001312486120637989, + "loss": 0.8989, + "step": 14365 + }, + { + "epoch": 1.73, + "grad_norm": 0.25683704018592834, + "learning_rate": 0.0001311465196375181, + "loss": 0.87, + "step": 14370 + }, + { + "epoch": 1.73, + "grad_norm": 0.2547888457775116, + "learning_rate": 0.0001310444360843112, + "loss": 0.8649, + "step": 14375 + }, + { + "epoch": 1.73, + "grad_norm": 0.28074026107788086, + "learning_rate": 0.00013094236145222223, + "loss": 0.8258, + "step": 14380 + }, + { + "epoch": 1.73, + "grad_norm": 0.2704260051250458, + "learning_rate": 0.00013084029578929086, + "loss": 0.8283, + "step": 14385 + }, + { + "epoch": 1.73, + "grad_norm": 0.22389455139636993, + "learning_rate": 0.00013073823914355257, + "loss": 0.7883, + "step": 14390 + }, + { + "epoch": 1.73, + "grad_norm": 0.29739248752593994, + "learning_rate": 0.00013063619156303854, + "loss": 0.9081, + "step": 14395 + }, + { + "epoch": 1.74, + "grad_norm": 0.25847524404525757, + "learning_rate": 0.00013053415309577588, + "loss": 0.8948, + "step": 14400 + }, + { + "epoch": 1.74, + "grad_norm": 0.2561867833137512, + "learning_rate": 0.0001304321237897872, + "loss": 0.9301, + "step": 14405 + }, + { + "epoch": 1.74, + "grad_norm": 0.23929399251937866, + "learning_rate": 0.00013033010369309088, + "loss": 0.853, + "step": 14410 + }, + { + "epoch": 1.74, + "grad_norm": 0.2782881557941437, + "learning_rate": 0.0001302280928537009, + "loss": 0.8784, + "step": 14415 + }, + { + "epoch": 1.74, + "grad_norm": 0.2864670753479004, + "learning_rate": 0.00013012609131962712, + "loss": 0.888, + "step": 14420 + }, + { + "epoch": 1.74, + "grad_norm": 0.25334519147872925, + "learning_rate": 0.00013002409913887475, + "loss": 0.8426, + "step": 14425 + }, + { + "epoch": 1.74, + "grad_norm": 0.2505394518375397, + "learning_rate": 0.00012992211635944474, + "loss": 0.7904, + "step": 14430 + }, + { + "epoch": 1.74, + "grad_norm": 0.2567066252231598, + "learning_rate": 0.00012982014302933347, + "loss": 0.838, + "step": 14435 + }, + { + "epoch": 1.74, + "grad_norm": 0.2711067199707031, + "learning_rate": 0.00012971817919653307, + "loss": 0.8792, + "step": 14440 + }, + { + "epoch": 1.74, + "grad_norm": 0.25377967953681946, + "learning_rate": 0.00012961622490903108, + "loss": 0.7812, + "step": 14445 + }, + { + "epoch": 1.74, + "grad_norm": 0.2506578862667084, + "learning_rate": 0.00012951428021481056, + "loss": 0.9277, + "step": 14450 + }, + { + "epoch": 1.74, + "grad_norm": 0.22411175072193146, + "learning_rate": 0.00012941234516185003, + "loss": 0.9082, + "step": 14455 + }, + { + "epoch": 1.74, + "grad_norm": 0.252632200717926, + "learning_rate": 0.00012931041979812364, + "loss": 0.7627, + "step": 14460 + }, + { + "epoch": 1.74, + "grad_norm": 0.2694997489452362, + "learning_rate": 0.00012920850417160078, + "loss": 0.9178, + "step": 14465 + }, + { + "epoch": 1.74, + "grad_norm": 0.2513391077518463, + "learning_rate": 0.0001291065983302463, + "loss": 0.8679, + "step": 14470 + }, + { + "epoch": 1.74, + "grad_norm": 0.2643163800239563, + "learning_rate": 0.00012900470232202045, + "loss": 0.8087, + "step": 14475 + }, + { + "epoch": 1.74, + "grad_norm": 0.2633654773235321, + "learning_rate": 0.00012890281619487898, + "loss": 0.8016, + "step": 14480 + }, + { + "epoch": 1.75, + "grad_norm": 0.24543441832065582, + "learning_rate": 0.00012880093999677282, + "loss": 0.8115, + "step": 14485 + }, + { + "epoch": 1.75, + "grad_norm": 0.2534711956977844, + "learning_rate": 0.00012869907377564827, + "loss": 0.7686, + "step": 14490 + }, + { + "epoch": 1.75, + "grad_norm": 0.24713997542858124, + "learning_rate": 0.00012859721757944696, + "loss": 0.8995, + "step": 14495 + }, + { + "epoch": 1.75, + "grad_norm": 0.2603427469730377, + "learning_rate": 0.00012849537145610587, + "loss": 0.9065, + "step": 14500 + }, + { + "epoch": 1.75, + "grad_norm": 0.26300033926963806, + "learning_rate": 0.00012839353545355712, + "loss": 0.8299, + "step": 14505 + }, + { + "epoch": 1.75, + "grad_norm": 0.2567319869995117, + "learning_rate": 0.0001282917096197281, + "loss": 0.8677, + "step": 14510 + }, + { + "epoch": 1.75, + "grad_norm": 0.2563308775424957, + "learning_rate": 0.0001281898940025414, + "loss": 0.7826, + "step": 14515 + }, + { + "epoch": 1.75, + "grad_norm": 0.2692480683326721, + "learning_rate": 0.0001280880886499149, + "loss": 0.8732, + "step": 14520 + }, + { + "epoch": 1.75, + "grad_norm": 0.26520323753356934, + "learning_rate": 0.0001279862936097616, + "loss": 0.8239, + "step": 14525 + }, + { + "epoch": 1.75, + "grad_norm": 0.2513013780117035, + "learning_rate": 0.00012788450892998952, + "loss": 0.7887, + "step": 14530 + }, + { + "epoch": 1.75, + "grad_norm": 0.2527408003807068, + "learning_rate": 0.000127782734658502, + "loss": 0.9332, + "step": 14535 + }, + { + "epoch": 1.75, + "grad_norm": 0.2598322629928589, + "learning_rate": 0.00012768097084319736, + "loss": 0.8357, + "step": 14540 + }, + { + "epoch": 1.75, + "grad_norm": 0.257907509803772, + "learning_rate": 0.00012757921753196906, + "loss": 0.7879, + "step": 14545 + }, + { + "epoch": 1.75, + "grad_norm": 0.2565610110759735, + "learning_rate": 0.00012747747477270552, + "loss": 0.9305, + "step": 14550 + }, + { + "epoch": 1.75, + "grad_norm": 0.25154614448547363, + "learning_rate": 0.00012737574261329027, + "loss": 0.734, + "step": 14555 + }, + { + "epoch": 1.75, + "grad_norm": 0.24832646548748016, + "learning_rate": 0.00012727402110160194, + "loss": 0.853, + "step": 14560 + }, + { + "epoch": 1.75, + "grad_norm": 0.2888548672199249, + "learning_rate": 0.00012717231028551397, + "loss": 0.8503, + "step": 14565 + }, + { + "epoch": 1.76, + "grad_norm": 0.2529507279396057, + "learning_rate": 0.00012707061021289485, + "loss": 0.8343, + "step": 14570 + }, + { + "epoch": 1.76, + "grad_norm": 0.27451494336128235, + "learning_rate": 0.00012696892093160803, + "loss": 0.7783, + "step": 14575 + }, + { + "epoch": 1.76, + "grad_norm": 0.2568528652191162, + "learning_rate": 0.00012686724248951189, + "loss": 0.9118, + "step": 14580 + }, + { + "epoch": 1.76, + "grad_norm": 0.2533772587776184, + "learning_rate": 0.00012676557493445962, + "loss": 0.7751, + "step": 14585 + }, + { + "epoch": 1.76, + "grad_norm": 0.272686630487442, + "learning_rate": 0.0001266639183142994, + "loss": 0.8448, + "step": 14590 + }, + { + "epoch": 1.76, + "grad_norm": 0.25127992033958435, + "learning_rate": 0.0001265622726768741, + "loss": 0.8768, + "step": 14595 + }, + { + "epoch": 1.76, + "grad_norm": 0.2625642418861389, + "learning_rate": 0.00012646063807002168, + "loss": 0.9275, + "step": 14600 + }, + { + "epoch": 1.76, + "grad_norm": 0.26558226346969604, + "learning_rate": 0.00012635901454157472, + "loss": 0.7573, + "step": 14605 + }, + { + "epoch": 1.76, + "grad_norm": 0.2596949338912964, + "learning_rate": 0.00012625740213936064, + "loss": 0.8072, + "step": 14610 + }, + { + "epoch": 1.76, + "grad_norm": 0.25480490922927856, + "learning_rate": 0.0001261558009112015, + "loss": 0.9022, + "step": 14615 + }, + { + "epoch": 1.76, + "grad_norm": 0.2669465243816376, + "learning_rate": 0.00012605421090491434, + "loss": 0.978, + "step": 14620 + }, + { + "epoch": 1.76, + "grad_norm": 0.22800852358341217, + "learning_rate": 0.00012595263216831076, + "loss": 0.6918, + "step": 14625 + }, + { + "epoch": 1.76, + "grad_norm": 0.2479008287191391, + "learning_rate": 0.00012585106474919704, + "loss": 0.8508, + "step": 14630 + }, + { + "epoch": 1.76, + "grad_norm": 0.24926097691059113, + "learning_rate": 0.00012574950869537419, + "loss": 0.7538, + "step": 14635 + }, + { + "epoch": 1.76, + "grad_norm": 0.23529042303562164, + "learning_rate": 0.0001256479640546379, + "loss": 0.8676, + "step": 14640 + }, + { + "epoch": 1.76, + "grad_norm": 0.24101610481739044, + "learning_rate": 0.00012554643087477844, + "loss": 0.9266, + "step": 14645 + }, + { + "epoch": 1.77, + "grad_norm": 0.28825363516807556, + "learning_rate": 0.00012544490920358072, + "loss": 0.9265, + "step": 14650 + }, + { + "epoch": 1.77, + "grad_norm": 0.2545327842235565, + "learning_rate": 0.00012534339908882412, + "loss": 0.8619, + "step": 14655 + }, + { + "epoch": 1.77, + "grad_norm": 0.3079511523246765, + "learning_rate": 0.00012524190057828277, + "loss": 0.853, + "step": 14660 + }, + { + "epoch": 1.77, + "grad_norm": 0.2799226939678192, + "learning_rate": 0.00012514041371972518, + "loss": 0.9301, + "step": 14665 + }, + { + "epoch": 1.77, + "grad_norm": 0.25301575660705566, + "learning_rate": 0.00012503893856091448, + "loss": 0.8581, + "step": 14670 + }, + { + "epoch": 1.77, + "grad_norm": 0.26482102274894714, + "learning_rate": 0.00012493747514960815, + "loss": 0.9563, + "step": 14675 + }, + { + "epoch": 1.77, + "grad_norm": 0.23120246827602386, + "learning_rate": 0.0001248360235335584, + "loss": 0.8965, + "step": 14680 + }, + { + "epoch": 1.77, + "grad_norm": 0.2709689736366272, + "learning_rate": 0.00012473458376051163, + "loss": 0.7622, + "step": 14685 + }, + { + "epoch": 1.77, + "grad_norm": 0.254367470741272, + "learning_rate": 0.00012463315587820878, + "loss": 0.9004, + "step": 14690 + }, + { + "epoch": 1.77, + "grad_norm": 0.25970759987831116, + "learning_rate": 0.0001245317399343851, + "loss": 0.7974, + "step": 14695 + }, + { + "epoch": 1.77, + "grad_norm": 0.23521548509597778, + "learning_rate": 0.00012443033597677047, + "loss": 0.7991, + "step": 14700 + }, + { + "epoch": 1.77, + "grad_norm": 0.2456759810447693, + "learning_rate": 0.00012432894405308887, + "loss": 0.9068, + "step": 14705 + }, + { + "epoch": 1.77, + "grad_norm": 0.3220599889755249, + "learning_rate": 0.00012422756421105868, + "loss": 0.8967, + "step": 14710 + }, + { + "epoch": 1.77, + "grad_norm": 0.25704309344291687, + "learning_rate": 0.00012412619649839263, + "loss": 0.9185, + "step": 14715 + }, + { + "epoch": 1.77, + "grad_norm": 0.2799650728702545, + "learning_rate": 0.0001240248409627978, + "loss": 0.8382, + "step": 14720 + }, + { + "epoch": 1.77, + "grad_norm": 0.22531536221504211, + "learning_rate": 0.00012392349765197541, + "loss": 0.7933, + "step": 14725 + }, + { + "epoch": 1.77, + "grad_norm": 0.2780819535255432, + "learning_rate": 0.00012382216661362098, + "loss": 0.8583, + "step": 14730 + }, + { + "epoch": 1.78, + "grad_norm": 0.2472156137228012, + "learning_rate": 0.00012372084789542424, + "loss": 0.7335, + "step": 14735 + }, + { + "epoch": 1.78, + "grad_norm": 0.22671552002429962, + "learning_rate": 0.00012361954154506926, + "loss": 0.8889, + "step": 14740 + }, + { + "epoch": 1.78, + "grad_norm": 0.23616662621498108, + "learning_rate": 0.00012351824761023405, + "loss": 0.9454, + "step": 14745 + }, + { + "epoch": 1.78, + "grad_norm": 0.24710243940353394, + "learning_rate": 0.00012341696613859098, + "loss": 0.7842, + "step": 14750 + }, + { + "epoch": 1.78, + "grad_norm": 0.27012568712234497, + "learning_rate": 0.0001233156971778064, + "loss": 0.7432, + "step": 14755 + }, + { + "epoch": 1.78, + "grad_norm": 0.30002421140670776, + "learning_rate": 0.00012321444077554095, + "loss": 0.9517, + "step": 14760 + }, + { + "epoch": 1.78, + "grad_norm": 0.2851850688457489, + "learning_rate": 0.00012311319697944913, + "loss": 0.8708, + "step": 14765 + }, + { + "epoch": 1.78, + "grad_norm": 0.2681834101676941, + "learning_rate": 0.00012301196583717973, + "loss": 0.9541, + "step": 14770 + }, + { + "epoch": 1.78, + "grad_norm": 0.2770148515701294, + "learning_rate": 0.0001229107473963754, + "loss": 0.7985, + "step": 14775 + }, + { + "epoch": 1.78, + "grad_norm": 0.23922573029994965, + "learning_rate": 0.000122809541704673, + "loss": 0.8304, + "step": 14780 + }, + { + "epoch": 1.78, + "grad_norm": 0.38129690289497375, + "learning_rate": 0.00012270834880970323, + "loss": 0.8315, + "step": 14785 + }, + { + "epoch": 1.78, + "grad_norm": 0.2657111585140228, + "learning_rate": 0.00012260716875909085, + "loss": 0.792, + "step": 14790 + }, + { + "epoch": 1.78, + "grad_norm": 0.2710582911968231, + "learning_rate": 0.0001225060016004545, + "loss": 0.7308, + "step": 14795 + }, + { + "epoch": 1.78, + "grad_norm": 0.2484636902809143, + "learning_rate": 0.0001224048473814069, + "loss": 0.852, + "step": 14800 + }, + { + "epoch": 1.78, + "grad_norm": 0.24950478971004486, + "learning_rate": 0.0001223037061495545, + "loss": 0.8909, + "step": 14805 + }, + { + "epoch": 1.78, + "grad_norm": 0.2482597976922989, + "learning_rate": 0.00012220257795249778, + "loss": 0.8531, + "step": 14810 + }, + { + "epoch": 1.79, + "grad_norm": 0.2781403362751007, + "learning_rate": 0.00012210146283783092, + "loss": 0.855, + "step": 14815 + }, + { + "epoch": 1.79, + "grad_norm": 0.26472207903862, + "learning_rate": 0.00012200036085314218, + "loss": 0.8358, + "step": 14820 + }, + { + "epoch": 1.79, + "grad_norm": 0.25163668394088745, + "learning_rate": 0.00012189927204601348, + "loss": 0.8767, + "step": 14825 + }, + { + "epoch": 1.79, + "grad_norm": 0.23350991308689117, + "learning_rate": 0.00012179819646402052, + "loss": 0.8239, + "step": 14830 + }, + { + "epoch": 1.79, + "grad_norm": 0.275762677192688, + "learning_rate": 0.00012169713415473288, + "loss": 0.7539, + "step": 14835 + }, + { + "epoch": 1.79, + "grad_norm": 0.2697180509567261, + "learning_rate": 0.00012159608516571383, + "loss": 0.8765, + "step": 14840 + }, + { + "epoch": 1.79, + "grad_norm": 0.25511398911476135, + "learning_rate": 0.00012149504954452036, + "loss": 0.9266, + "step": 14845 + }, + { + "epoch": 1.79, + "grad_norm": 0.26254042983055115, + "learning_rate": 0.0001213940273387031, + "loss": 0.8991, + "step": 14850 + }, + { + "epoch": 1.79, + "grad_norm": 0.27790650725364685, + "learning_rate": 0.00012129301859580665, + "loss": 0.8492, + "step": 14855 + }, + { + "epoch": 1.79, + "grad_norm": 0.2523384690284729, + "learning_rate": 0.00012119202336336897, + "loss": 0.8993, + "step": 14860 + }, + { + "epoch": 1.79, + "grad_norm": 0.23431278765201569, + "learning_rate": 0.00012109104168892177, + "loss": 0.9201, + "step": 14865 + }, + { + "epoch": 1.79, + "grad_norm": 0.2700154185295105, + "learning_rate": 0.00012099007361999037, + "loss": 0.8558, + "step": 14870 + }, + { + "epoch": 1.79, + "grad_norm": 0.2271653562784195, + "learning_rate": 0.00012088911920409374, + "loss": 0.8236, + "step": 14875 + }, + { + "epoch": 1.79, + "grad_norm": 0.24218647181987762, + "learning_rate": 0.00012078817848874434, + "loss": 0.7495, + "step": 14880 + }, + { + "epoch": 1.79, + "grad_norm": 0.25467637181282043, + "learning_rate": 0.00012068725152144827, + "loss": 0.8275, + "step": 14885 + }, + { + "epoch": 1.79, + "grad_norm": 0.277286559343338, + "learning_rate": 0.00012058633834970502, + "loss": 0.8756, + "step": 14890 + }, + { + "epoch": 1.79, + "grad_norm": 0.26092687249183655, + "learning_rate": 0.00012048543902100779, + "loss": 0.8228, + "step": 14895 + }, + { + "epoch": 1.8, + "grad_norm": 0.24140042066574097, + "learning_rate": 0.00012038455358284309, + "loss": 0.8216, + "step": 14900 + }, + { + "epoch": 1.8, + "grad_norm": 0.2441730499267578, + "learning_rate": 0.00012028368208269097, + "loss": 0.873, + "step": 14905 + }, + { + "epoch": 1.8, + "grad_norm": 0.242695614695549, + "learning_rate": 0.00012018282456802487, + "loss": 0.734, + "step": 14910 + }, + { + "epoch": 1.8, + "grad_norm": 0.27291175723075867, + "learning_rate": 0.00012008198108631176, + "loss": 1.0009, + "step": 14915 + }, + { + "epoch": 1.8, + "grad_norm": 0.25119420886039734, + "learning_rate": 0.00011998115168501192, + "loss": 1.0065, + "step": 14920 + }, + { + "epoch": 1.8, + "grad_norm": 0.2412206530570984, + "learning_rate": 0.00011988033641157898, + "loss": 0.8785, + "step": 14925 + }, + { + "epoch": 1.8, + "grad_norm": 0.24067923426628113, + "learning_rate": 0.00011977953531345996, + "loss": 0.9148, + "step": 14930 + }, + { + "epoch": 1.8, + "grad_norm": 0.2587399184703827, + "learning_rate": 0.00011967874843809522, + "loss": 0.8226, + "step": 14935 + }, + { + "epoch": 1.8, + "grad_norm": 0.26349955797195435, + "learning_rate": 0.00011957797583291841, + "loss": 0.7778, + "step": 14940 + }, + { + "epoch": 1.8, + "grad_norm": 0.2319829910993576, + "learning_rate": 0.00011947721754535645, + "loss": 0.7828, + "step": 14945 + }, + { + "epoch": 1.8, + "grad_norm": 0.27520623803138733, + "learning_rate": 0.00011937647362282948, + "loss": 0.8186, + "step": 14950 + }, + { + "epoch": 1.8, + "grad_norm": 0.23423904180526733, + "learning_rate": 0.00011927574411275107, + "loss": 0.773, + "step": 14955 + }, + { + "epoch": 1.8, + "grad_norm": 0.2540885806083679, + "learning_rate": 0.0001191750290625278, + "loss": 0.7941, + "step": 14960 + }, + { + "epoch": 1.8, + "grad_norm": 0.26657721400260925, + "learning_rate": 0.00011907432851955952, + "loss": 0.848, + "step": 14965 + }, + { + "epoch": 1.8, + "grad_norm": 0.22718775272369385, + "learning_rate": 0.00011897364253123921, + "loss": 0.7955, + "step": 14970 + }, + { + "epoch": 1.8, + "grad_norm": 0.252336323261261, + "learning_rate": 0.00011887297114495312, + "loss": 0.894, + "step": 14975 + }, + { + "epoch": 1.8, + "grad_norm": 0.2825542390346527, + "learning_rate": 0.0001187723144080805, + "loss": 0.9111, + "step": 14980 + }, + { + "epoch": 1.81, + "grad_norm": 0.2752092182636261, + "learning_rate": 0.00011867167236799376, + "loss": 0.839, + "step": 14985 + }, + { + "epoch": 1.81, + "grad_norm": 0.2676568925380707, + "learning_rate": 0.00011857104507205831, + "loss": 0.9122, + "step": 14990 + }, + { + "epoch": 1.81, + "grad_norm": 0.24729391932487488, + "learning_rate": 0.00011847043256763285, + "loss": 0.7432, + "step": 14995 + }, + { + "epoch": 1.81, + "grad_norm": 0.24606819450855255, + "learning_rate": 0.00011836983490206889, + "loss": 0.7616, + "step": 15000 + }, + { + "epoch": 1.81, + "grad_norm": 0.27244672179222107, + "learning_rate": 0.00011826925212271102, + "loss": 0.8588, + "step": 15005 + }, + { + "epoch": 1.81, + "grad_norm": 0.26637476682662964, + "learning_rate": 0.00011816868427689683, + "loss": 0.9134, + "step": 15010 + }, + { + "epoch": 1.81, + "grad_norm": 0.27990347146987915, + "learning_rate": 0.00011806813141195691, + "loss": 0.8513, + "step": 15015 + }, + { + "epoch": 1.81, + "grad_norm": 0.33465734124183655, + "learning_rate": 0.0001179675935752148, + "loss": 0.8542, + "step": 15020 + }, + { + "epoch": 1.81, + "grad_norm": 0.22314803302288055, + "learning_rate": 0.0001178670708139869, + "loss": 0.8547, + "step": 15025 + }, + { + "epoch": 1.81, + "grad_norm": 0.2743472754955292, + "learning_rate": 0.00011776656317558251, + "loss": 0.8579, + "step": 15030 + }, + { + "epoch": 1.81, + "grad_norm": 0.23580728471279144, + "learning_rate": 0.000117666070707304, + "loss": 0.9977, + "step": 15035 + }, + { + "epoch": 1.81, + "grad_norm": 0.2850353419780731, + "learning_rate": 0.0001175655934564464, + "loss": 0.7897, + "step": 15040 + }, + { + "epoch": 1.81, + "grad_norm": 0.2732098698616028, + "learning_rate": 0.00011746513147029762, + "loss": 0.6902, + "step": 15045 + }, + { + "epoch": 1.81, + "grad_norm": 0.26247236132621765, + "learning_rate": 0.00011736468479613841, + "loss": 0.74, + "step": 15050 + }, + { + "epoch": 1.81, + "grad_norm": 0.23917360603809357, + "learning_rate": 0.00011726425348124232, + "loss": 0.8132, + "step": 15055 + }, + { + "epoch": 1.81, + "grad_norm": 0.2416449338197708, + "learning_rate": 0.00011716383757287568, + "loss": 0.94, + "step": 15060 + }, + { + "epoch": 1.82, + "grad_norm": 0.26987558603286743, + "learning_rate": 0.00011706343711829753, + "loss": 0.8469, + "step": 15065 + }, + { + "epoch": 1.82, + "grad_norm": 0.2562698721885681, + "learning_rate": 0.0001169630521647596, + "loss": 0.7888, + "step": 15070 + }, + { + "epoch": 1.82, + "grad_norm": 0.2376549392938614, + "learning_rate": 0.0001168626827595065, + "loss": 0.8179, + "step": 15075 + }, + { + "epoch": 1.82, + "grad_norm": 0.23913566768169403, + "learning_rate": 0.0001167623289497754, + "loss": 0.9149, + "step": 15080 + }, + { + "epoch": 1.82, + "grad_norm": 0.2533036172389984, + "learning_rate": 0.00011666199078279604, + "loss": 0.8483, + "step": 15085 + }, + { + "epoch": 1.82, + "grad_norm": 0.23935401439666748, + "learning_rate": 0.00011656166830579087, + "loss": 0.8704, + "step": 15090 + }, + { + "epoch": 1.82, + "grad_norm": 0.243647962808609, + "learning_rate": 0.00011646136156597513, + "loss": 0.8779, + "step": 15095 + }, + { + "epoch": 1.82, + "grad_norm": 0.27903953194618225, + "learning_rate": 0.0001163610706105564, + "loss": 0.8851, + "step": 15100 + }, + { + "epoch": 1.82, + "grad_norm": 0.25441214442253113, + "learning_rate": 0.00011626079548673496, + "loss": 0.8484, + "step": 15105 + }, + { + "epoch": 1.82, + "grad_norm": 0.23863151669502258, + "learning_rate": 0.00011616053624170359, + "loss": 0.8271, + "step": 15110 + }, + { + "epoch": 1.82, + "grad_norm": 0.271961510181427, + "learning_rate": 0.00011606029292264766, + "loss": 0.855, + "step": 15115 + }, + { + "epoch": 1.82, + "grad_norm": 0.25115951895713806, + "learning_rate": 0.00011596006557674497, + "loss": 0.8599, + "step": 15120 + }, + { + "epoch": 1.82, + "grad_norm": 0.21527209877967834, + "learning_rate": 0.00011585985425116589, + "loss": 0.8878, + "step": 15125 + }, + { + "epoch": 1.82, + "grad_norm": 0.2371513992547989, + "learning_rate": 0.00011575965899307306, + "loss": 0.8347, + "step": 15130 + }, + { + "epoch": 1.82, + "grad_norm": 0.26192623376846313, + "learning_rate": 0.00011565947984962187, + "loss": 0.814, + "step": 15135 + }, + { + "epoch": 1.82, + "grad_norm": 0.2839601933956146, + "learning_rate": 0.00011555931686795987, + "loss": 0.8186, + "step": 15140 + }, + { + "epoch": 1.82, + "grad_norm": 0.2582460045814514, + "learning_rate": 0.0001154591700952271, + "loss": 0.7673, + "step": 15145 + }, + { + "epoch": 1.83, + "grad_norm": 0.2260724902153015, + "learning_rate": 0.0001153590395785559, + "loss": 0.8031, + "step": 15150 + }, + { + "epoch": 1.83, + "grad_norm": 0.2650599777698517, + "learning_rate": 0.00011525892536507111, + "loss": 0.8469, + "step": 15155 + }, + { + "epoch": 1.83, + "grad_norm": 0.280758798122406, + "learning_rate": 0.00011515882750188976, + "loss": 0.8937, + "step": 15160 + }, + { + "epoch": 1.83, + "grad_norm": 0.23013721406459808, + "learning_rate": 0.00011505874603612122, + "loss": 0.8758, + "step": 15165 + }, + { + "epoch": 1.83, + "grad_norm": 0.23622775077819824, + "learning_rate": 0.0001149586810148671, + "loss": 0.9219, + "step": 15170 + }, + { + "epoch": 1.83, + "grad_norm": 0.23313015699386597, + "learning_rate": 0.00011485863248522144, + "loss": 0.8072, + "step": 15175 + }, + { + "epoch": 1.83, + "grad_norm": 0.27392974495887756, + "learning_rate": 0.00011475860049427036, + "loss": 0.7727, + "step": 15180 + }, + { + "epoch": 1.83, + "grad_norm": 0.24279291927814484, + "learning_rate": 0.00011465858508909219, + "loss": 0.8076, + "step": 15185 + }, + { + "epoch": 1.83, + "grad_norm": 0.22247040271759033, + "learning_rate": 0.00011455858631675752, + "loss": 0.8597, + "step": 15190 + }, + { + "epoch": 1.83, + "grad_norm": 0.23884879052639008, + "learning_rate": 0.00011445860422432913, + "loss": 0.9097, + "step": 15195 + }, + { + "epoch": 1.83, + "grad_norm": 0.22139222919940948, + "learning_rate": 0.00011435863885886188, + "loss": 0.8262, + "step": 15200 + }, + { + "epoch": 1.83, + "grad_norm": 0.20830343663692474, + "learning_rate": 0.00011425869026740278, + "loss": 0.8878, + "step": 15205 + }, + { + "epoch": 1.83, + "grad_norm": 0.23654043674468994, + "learning_rate": 0.00011415875849699094, + "loss": 0.8093, + "step": 15210 + }, + { + "epoch": 1.83, + "grad_norm": 0.2731797993183136, + "learning_rate": 0.00011405884359465766, + "loss": 0.8693, + "step": 15215 + }, + { + "epoch": 1.83, + "grad_norm": 0.2721179723739624, + "learning_rate": 0.00011395894560742612, + "loss": 0.7557, + "step": 15220 + }, + { + "epoch": 1.83, + "grad_norm": 0.2314218133687973, + "learning_rate": 0.0001138590645823117, + "loss": 0.911, + "step": 15225 + }, + { + "epoch": 1.84, + "grad_norm": 0.2549746036529541, + "learning_rate": 0.00011375920056632164, + "loss": 0.8295, + "step": 15230 + }, + { + "epoch": 1.84, + "grad_norm": 0.2613559365272522, + "learning_rate": 0.00011365935360645536, + "loss": 0.8176, + "step": 15235 + }, + { + "epoch": 1.84, + "grad_norm": 0.24376095831394196, + "learning_rate": 0.0001135595237497041, + "loss": 0.8712, + "step": 15240 + }, + { + "epoch": 1.84, + "grad_norm": 0.2682994604110718, + "learning_rate": 0.00011345971104305111, + "loss": 0.8861, + "step": 15245 + }, + { + "epoch": 1.84, + "grad_norm": 0.2671447992324829, + "learning_rate": 0.0001133599155334715, + "loss": 0.9091, + "step": 15250 + }, + { + "epoch": 1.84, + "grad_norm": 0.2641143500804901, + "learning_rate": 0.00011326013726793249, + "loss": 0.8653, + "step": 15255 + }, + { + "epoch": 1.84, + "grad_norm": 0.2844027280807495, + "learning_rate": 0.00011316037629339299, + "loss": 0.8379, + "step": 15260 + }, + { + "epoch": 1.84, + "grad_norm": 0.25467634201049805, + "learning_rate": 0.00011306063265680384, + "loss": 0.7981, + "step": 15265 + }, + { + "epoch": 1.84, + "grad_norm": 0.27292025089263916, + "learning_rate": 0.00011296090640510758, + "loss": 0.9155, + "step": 15270 + }, + { + "epoch": 1.84, + "grad_norm": 0.27698391675949097, + "learning_rate": 0.00011286119758523885, + "loss": 0.8058, + "step": 15275 + }, + { + "epoch": 1.84, + "grad_norm": 0.2688562870025635, + "learning_rate": 0.00011276150624412388, + "loss": 0.8207, + "step": 15280 + }, + { + "epoch": 1.84, + "grad_norm": 0.2935139238834381, + "learning_rate": 0.00011266183242868073, + "loss": 0.8824, + "step": 15285 + }, + { + "epoch": 1.84, + "grad_norm": 0.26238155364990234, + "learning_rate": 0.00011256217618581916, + "loss": 0.8231, + "step": 15290 + }, + { + "epoch": 1.84, + "grad_norm": 0.23228612542152405, + "learning_rate": 0.00011246253756244079, + "loss": 0.8126, + "step": 15295 + }, + { + "epoch": 1.84, + "grad_norm": 0.2749589681625366, + "learning_rate": 0.00011236291660543881, + "loss": 0.8331, + "step": 15300 + }, + { + "epoch": 1.84, + "grad_norm": 0.26084092259407043, + "learning_rate": 0.0001122633133616982, + "loss": 0.9848, + "step": 15305 + }, + { + "epoch": 1.84, + "grad_norm": 0.2661038041114807, + "learning_rate": 0.0001121637278780954, + "loss": 0.873, + "step": 15310 + }, + { + "epoch": 1.85, + "grad_norm": 0.27569863200187683, + "learning_rate": 0.00011206416020149887, + "loss": 0.8332, + "step": 15315 + }, + { + "epoch": 1.85, + "grad_norm": 0.23671133816242218, + "learning_rate": 0.00011196461037876834, + "loss": 0.8793, + "step": 15320 + }, + { + "epoch": 1.85, + "grad_norm": 0.26371756196022034, + "learning_rate": 0.00011186507845675527, + "loss": 0.8082, + "step": 15325 + }, + { + "epoch": 1.85, + "grad_norm": 0.2549302875995636, + "learning_rate": 0.00011176556448230271, + "loss": 0.7892, + "step": 15330 + }, + { + "epoch": 1.85, + "grad_norm": 0.2468237727880478, + "learning_rate": 0.00011166606850224524, + "loss": 0.7797, + "step": 15335 + }, + { + "epoch": 1.85, + "grad_norm": 0.2780306041240692, + "learning_rate": 0.00011156659056340898, + "loss": 0.7513, + "step": 15340 + }, + { + "epoch": 1.85, + "grad_norm": 0.264596551656723, + "learning_rate": 0.00011146713071261145, + "loss": 0.8805, + "step": 15345 + }, + { + "epoch": 1.85, + "grad_norm": 0.2658274173736572, + "learning_rate": 0.00011136768899666191, + "loss": 0.8718, + "step": 15350 + }, + { + "epoch": 1.85, + "grad_norm": 0.24554196000099182, + "learning_rate": 0.00011126826546236087, + "loss": 0.943, + "step": 15355 + }, + { + "epoch": 1.85, + "grad_norm": 0.24591578543186188, + "learning_rate": 0.00011116886015650035, + "loss": 0.7976, + "step": 15360 + }, + { + "epoch": 1.85, + "grad_norm": 0.2368362545967102, + "learning_rate": 0.00011106947312586373, + "loss": 0.867, + "step": 15365 + }, + { + "epoch": 1.85, + "grad_norm": 0.2802934944629669, + "learning_rate": 0.00011097010441722595, + "loss": 0.8927, + "step": 15370 + }, + { + "epoch": 1.85, + "grad_norm": 0.25117719173431396, + "learning_rate": 0.00011087075407735316, + "loss": 0.8307, + "step": 15375 + }, + { + "epoch": 1.85, + "grad_norm": 0.23962007462978363, + "learning_rate": 0.00011077142215300297, + "loss": 0.8692, + "step": 15380 + }, + { + "epoch": 1.85, + "grad_norm": 0.22337029874324799, + "learning_rate": 0.00011067210869092417, + "loss": 0.7974, + "step": 15385 + }, + { + "epoch": 1.85, + "grad_norm": 0.25178852677345276, + "learning_rate": 0.00011057281373785712, + "loss": 0.9266, + "step": 15390 + }, + { + "epoch": 1.85, + "grad_norm": 0.25446346402168274, + "learning_rate": 0.00011047353734053327, + "loss": 0.8743, + "step": 15395 + }, + { + "epoch": 1.86, + "grad_norm": 0.26338401436805725, + "learning_rate": 0.0001103742795456754, + "loss": 0.8066, + "step": 15400 + }, + { + "epoch": 1.86, + "grad_norm": 0.2573286294937134, + "learning_rate": 0.00011027504039999744, + "loss": 0.8641, + "step": 15405 + }, + { + "epoch": 1.86, + "grad_norm": 0.2608354091644287, + "learning_rate": 0.00011017581995020475, + "loss": 0.8588, + "step": 15410 + }, + { + "epoch": 1.86, + "grad_norm": 0.23720592260360718, + "learning_rate": 0.0001100766182429937, + "loss": 0.8744, + "step": 15415 + }, + { + "epoch": 1.86, + "grad_norm": 0.26355910301208496, + "learning_rate": 0.00010997743532505192, + "loss": 0.8457, + "step": 15420 + }, + { + "epoch": 1.86, + "grad_norm": 0.3250594139099121, + "learning_rate": 0.00010987827124305812, + "loss": 0.7379, + "step": 15425 + }, + { + "epoch": 1.86, + "grad_norm": 0.2539902329444885, + "learning_rate": 0.00010977912604368232, + "loss": 0.7225, + "step": 15430 + }, + { + "epoch": 1.86, + "grad_norm": 0.23028779029846191, + "learning_rate": 0.00010967999977358551, + "loss": 0.8119, + "step": 15435 + }, + { + "epoch": 1.86, + "grad_norm": 0.23592324554920197, + "learning_rate": 0.00010958089247941981, + "loss": 0.8199, + "step": 15440 + }, + { + "epoch": 1.86, + "grad_norm": 0.2703402638435364, + "learning_rate": 0.00010948180420782835, + "loss": 0.8367, + "step": 15445 + }, + { + "epoch": 1.86, + "grad_norm": 0.2614709734916687, + "learning_rate": 0.00010938273500544543, + "loss": 0.7616, + "step": 15450 + }, + { + "epoch": 1.86, + "grad_norm": 0.2527773976325989, + "learning_rate": 0.00010928368491889626, + "loss": 0.9269, + "step": 15455 + }, + { + "epoch": 1.86, + "grad_norm": 0.2695715129375458, + "learning_rate": 0.00010918465399479712, + "loss": 0.8234, + "step": 15460 + }, + { + "epoch": 1.86, + "grad_norm": 0.2826976478099823, + "learning_rate": 0.00010908564227975518, + "loss": 0.8455, + "step": 15465 + }, + { + "epoch": 1.86, + "grad_norm": 0.2363000065088272, + "learning_rate": 0.00010898664982036877, + "loss": 0.8312, + "step": 15470 + }, + { + "epoch": 1.86, + "grad_norm": 0.29471728205680847, + "learning_rate": 0.000108887676663227, + "loss": 0.8394, + "step": 15475 + }, + { + "epoch": 1.87, + "grad_norm": 0.24590782821178436, + "learning_rate": 0.00010878872285490984, + "loss": 0.8675, + "step": 15480 + }, + { + "epoch": 1.87, + "grad_norm": 0.26483914256095886, + "learning_rate": 0.00010868978844198827, + "loss": 0.7959, + "step": 15485 + }, + { + "epoch": 1.87, + "grad_norm": 0.25184696912765503, + "learning_rate": 0.00010859087347102416, + "loss": 0.8902, + "step": 15490 + }, + { + "epoch": 1.87, + "grad_norm": 0.23830364644527435, + "learning_rate": 0.00010849197798857015, + "loss": 0.9064, + "step": 15495 + }, + { + "epoch": 1.87, + "grad_norm": 0.24322542548179626, + "learning_rate": 0.00010839310204116975, + "loss": 0.854, + "step": 15500 + }, + { + "epoch": 1.87, + "grad_norm": 0.24347706139087677, + "learning_rate": 0.0001082942456753572, + "loss": 0.8495, + "step": 15505 + }, + { + "epoch": 1.87, + "grad_norm": 0.24465703964233398, + "learning_rate": 0.0001081954089376577, + "loss": 0.8317, + "step": 15510 + }, + { + "epoch": 1.87, + "grad_norm": 0.24925029277801514, + "learning_rate": 0.00010809659187458702, + "loss": 0.8258, + "step": 15515 + }, + { + "epoch": 1.87, + "grad_norm": 0.27092206478118896, + "learning_rate": 0.00010799779453265178, + "loss": 0.8398, + "step": 15520 + }, + { + "epoch": 1.87, + "grad_norm": 0.2629016041755676, + "learning_rate": 0.00010789901695834921, + "loss": 0.7819, + "step": 15525 + }, + { + "epoch": 1.87, + "grad_norm": 0.2438107430934906, + "learning_rate": 0.00010780025919816748, + "loss": 0.8834, + "step": 15530 + }, + { + "epoch": 1.87, + "grad_norm": 0.2341863512992859, + "learning_rate": 0.00010770152129858515, + "loss": 0.8463, + "step": 15535 + }, + { + "epoch": 1.87, + "grad_norm": 0.2474951595067978, + "learning_rate": 0.00010760280330607161, + "loss": 0.8869, + "step": 15540 + }, + { + "epoch": 1.87, + "grad_norm": 0.2520921528339386, + "learning_rate": 0.00010750410526708675, + "loss": 0.8906, + "step": 15545 + }, + { + "epoch": 1.87, + "grad_norm": 0.2502596974372864, + "learning_rate": 0.00010740542722808123, + "loss": 0.774, + "step": 15550 + }, + { + "epoch": 1.87, + "grad_norm": 0.2887754440307617, + "learning_rate": 0.0001073067692354962, + "loss": 0.7609, + "step": 15555 + }, + { + "epoch": 1.87, + "grad_norm": 0.26220768690109253, + "learning_rate": 0.00010720813133576336, + "loss": 0.855, + "step": 15560 + }, + { + "epoch": 1.88, + "grad_norm": 0.264213889837265, + "learning_rate": 0.00010710951357530489, + "loss": 0.7649, + "step": 15565 + }, + { + "epoch": 1.88, + "grad_norm": 0.26668137311935425, + "learning_rate": 0.00010701091600053379, + "loss": 0.8864, + "step": 15570 + }, + { + "epoch": 1.88, + "grad_norm": 0.2364049255847931, + "learning_rate": 0.00010691233865785321, + "loss": 0.8681, + "step": 15575 + }, + { + "epoch": 1.88, + "grad_norm": 0.2663012444972992, + "learning_rate": 0.00010681378159365696, + "loss": 0.9484, + "step": 15580 + }, + { + "epoch": 1.88, + "grad_norm": 0.25167354941368103, + "learning_rate": 0.00010671524485432926, + "loss": 0.8411, + "step": 15585 + }, + { + "epoch": 1.88, + "grad_norm": 0.2526310384273529, + "learning_rate": 0.00010661672848624477, + "loss": 0.8295, + "step": 15590 + }, + { + "epoch": 1.88, + "grad_norm": 0.25531667470932007, + "learning_rate": 0.0001065182325357686, + "loss": 0.8014, + "step": 15595 + }, + { + "epoch": 1.88, + "grad_norm": 0.24380475282669067, + "learning_rate": 0.00010641975704925615, + "loss": 0.9029, + "step": 15600 + }, + { + "epoch": 1.88, + "grad_norm": 0.25942888855934143, + "learning_rate": 0.00010632130207305324, + "loss": 0.8727, + "step": 15605 + }, + { + "epoch": 1.88, + "grad_norm": 0.26547759771347046, + "learning_rate": 0.00010622286765349618, + "loss": 0.7635, + "step": 15610 + }, + { + "epoch": 1.88, + "grad_norm": 0.27288028597831726, + "learning_rate": 0.00010612445383691137, + "loss": 0.7731, + "step": 15615 + }, + { + "epoch": 1.88, + "grad_norm": 0.2501557171344757, + "learning_rate": 0.00010602606066961564, + "loss": 0.8859, + "step": 15620 + }, + { + "epoch": 1.88, + "grad_norm": 0.25001439452171326, + "learning_rate": 0.00010592768819791608, + "loss": 0.8516, + "step": 15625 + }, + { + "epoch": 1.88, + "grad_norm": 0.2799443006515503, + "learning_rate": 0.00010582933646811008, + "loss": 0.8835, + "step": 15630 + }, + { + "epoch": 1.88, + "grad_norm": 0.2613951861858368, + "learning_rate": 0.00010573100552648517, + "loss": 0.9239, + "step": 15635 + }, + { + "epoch": 1.88, + "grad_norm": 0.27016937732696533, + "learning_rate": 0.00010563269541931922, + "loss": 0.8181, + "step": 15640 + }, + { + "epoch": 1.89, + "grad_norm": 0.22996383905410767, + "learning_rate": 0.00010553440619288014, + "loss": 0.8646, + "step": 15645 + }, + { + "epoch": 1.89, + "grad_norm": 0.29044246673583984, + "learning_rate": 0.00010543613789342621, + "loss": 0.8249, + "step": 15650 + }, + { + "epoch": 1.89, + "grad_norm": 0.2618498206138611, + "learning_rate": 0.00010533789056720571, + "loss": 0.7699, + "step": 15655 + }, + { + "epoch": 1.89, + "grad_norm": 0.25093433260917664, + "learning_rate": 0.00010523966426045709, + "loss": 0.7474, + "step": 15660 + }, + { + "epoch": 1.89, + "grad_norm": 0.2828831076622009, + "learning_rate": 0.00010514145901940887, + "loss": 0.7825, + "step": 15665 + }, + { + "epoch": 1.89, + "grad_norm": 0.2531161308288574, + "learning_rate": 0.0001050432748902798, + "loss": 0.8004, + "step": 15670 + }, + { + "epoch": 1.89, + "grad_norm": 0.23352687060832977, + "learning_rate": 0.0001049451119192785, + "loss": 0.7407, + "step": 15675 + }, + { + "epoch": 1.89, + "grad_norm": 0.22876708209514618, + "learning_rate": 0.00010484697015260379, + "loss": 0.7868, + "step": 15680 + }, + { + "epoch": 1.89, + "grad_norm": 0.27175286412239075, + "learning_rate": 0.00010474884963644434, + "loss": 0.7846, + "step": 15685 + }, + { + "epoch": 1.89, + "grad_norm": 0.2837650775909424, + "learning_rate": 0.00010465075041697908, + "loss": 0.7969, + "step": 15690 + }, + { + "epoch": 1.89, + "grad_norm": 0.2851541340351105, + "learning_rate": 0.00010455267254037663, + "loss": 0.7862, + "step": 15695 + }, + { + "epoch": 1.89, + "grad_norm": 0.2439732849597931, + "learning_rate": 0.00010445461605279579, + "loss": 0.8859, + "step": 15700 + }, + { + "epoch": 1.89, + "grad_norm": 0.24269287288188934, + "learning_rate": 0.00010435658100038505, + "loss": 0.7997, + "step": 15705 + }, + { + "epoch": 1.89, + "grad_norm": 0.24573330581188202, + "learning_rate": 0.00010425856742928313, + "loss": 1.0228, + "step": 15710 + }, + { + "epoch": 1.89, + "grad_norm": 0.25582510232925415, + "learning_rate": 0.00010416057538561842, + "loss": 0.8363, + "step": 15715 + }, + { + "epoch": 1.89, + "grad_norm": 0.21897505223751068, + "learning_rate": 0.00010406260491550918, + "loss": 0.8839, + "step": 15720 + }, + { + "epoch": 1.89, + "grad_norm": 0.23649723827838898, + "learning_rate": 0.0001039646560650636, + "loss": 0.8441, + "step": 15725 + }, + { + "epoch": 1.9, + "grad_norm": 0.2502078413963318, + "learning_rate": 0.00010386672888037969, + "loss": 0.8262, + "step": 15730 + }, + { + "epoch": 1.9, + "grad_norm": 0.2571605145931244, + "learning_rate": 0.00010376882340754519, + "loss": 0.8712, + "step": 15735 + }, + { + "epoch": 1.9, + "grad_norm": 0.26057517528533936, + "learning_rate": 0.0001036709396926377, + "loss": 0.8395, + "step": 15740 + }, + { + "epoch": 1.9, + "grad_norm": 0.2550731301307678, + "learning_rate": 0.00010357307778172445, + "loss": 0.8821, + "step": 15745 + }, + { + "epoch": 1.9, + "grad_norm": 0.2894580066204071, + "learning_rate": 0.00010347523772086268, + "loss": 0.8224, + "step": 15750 + }, + { + "epoch": 1.9, + "grad_norm": 0.24666161835193634, + "learning_rate": 0.00010337741955609907, + "loss": 0.7983, + "step": 15755 + }, + { + "epoch": 1.9, + "grad_norm": 0.26421642303466797, + "learning_rate": 0.00010327962333347008, + "loss": 0.7837, + "step": 15760 + }, + { + "epoch": 1.9, + "grad_norm": 0.27278557419776917, + "learning_rate": 0.00010318184909900188, + "loss": 0.757, + "step": 15765 + }, + { + "epoch": 1.9, + "grad_norm": 0.25606653094291687, + "learning_rate": 0.00010308409689871029, + "loss": 0.9554, + "step": 15770 + }, + { + "epoch": 1.9, + "grad_norm": 0.2995527982711792, + "learning_rate": 0.00010298636677860074, + "loss": 0.8754, + "step": 15775 + }, + { + "epoch": 1.9, + "grad_norm": 0.2641645669937134, + "learning_rate": 0.00010288865878466825, + "loss": 0.841, + "step": 15780 + }, + { + "epoch": 1.9, + "grad_norm": 0.2527436912059784, + "learning_rate": 0.00010279097296289741, + "loss": 0.7688, + "step": 15785 + }, + { + "epoch": 1.9, + "grad_norm": 0.25887179374694824, + "learning_rate": 0.0001026933093592625, + "loss": 0.7526, + "step": 15790 + }, + { + "epoch": 1.9, + "grad_norm": 0.2504650354385376, + "learning_rate": 0.00010259566801972721, + "loss": 0.8904, + "step": 15795 + }, + { + "epoch": 1.9, + "grad_norm": 0.235322505235672, + "learning_rate": 0.00010249804899024482, + "loss": 0.7883, + "step": 15800 + }, + { + "epoch": 1.9, + "grad_norm": 0.250555157661438, + "learning_rate": 0.00010240045231675802, + "loss": 0.8137, + "step": 15805 + }, + { + "epoch": 1.9, + "grad_norm": 0.2679235637187958, + "learning_rate": 0.00010230287804519914, + "loss": 0.8408, + "step": 15810 + }, + { + "epoch": 1.91, + "grad_norm": 0.2516213357448578, + "learning_rate": 0.00010220532622148982, + "loss": 0.8198, + "step": 15815 + }, + { + "epoch": 1.91, + "grad_norm": 0.26022079586982727, + "learning_rate": 0.00010210779689154118, + "loss": 0.8253, + "step": 15820 + }, + { + "epoch": 1.91, + "grad_norm": 0.3123502731323242, + "learning_rate": 0.0001020102901012537, + "loss": 0.8452, + "step": 15825 + }, + { + "epoch": 1.91, + "grad_norm": 0.24394842982292175, + "learning_rate": 0.00010191280589651746, + "loss": 0.8889, + "step": 15830 + }, + { + "epoch": 1.91, + "grad_norm": 0.24881669878959656, + "learning_rate": 0.00010181534432321171, + "loss": 0.8135, + "step": 15835 + }, + { + "epoch": 1.91, + "grad_norm": 0.2647436261177063, + "learning_rate": 0.00010171790542720504, + "loss": 0.8659, + "step": 15840 + }, + { + "epoch": 1.91, + "grad_norm": 0.2636089324951172, + "learning_rate": 0.00010162048925435549, + "loss": 0.891, + "step": 15845 + }, + { + "epoch": 1.91, + "grad_norm": 0.2648860216140747, + "learning_rate": 0.00010152309585051035, + "loss": 0.7962, + "step": 15850 + }, + { + "epoch": 1.91, + "grad_norm": 0.2279883474111557, + "learning_rate": 0.00010142572526150616, + "loss": 0.9005, + "step": 15855 + }, + { + "epoch": 1.91, + "grad_norm": 0.297305703163147, + "learning_rate": 0.0001013283775331687, + "loss": 0.7451, + "step": 15860 + }, + { + "epoch": 1.91, + "grad_norm": 0.2450573891401291, + "learning_rate": 0.00010123105271131319, + "loss": 0.8414, + "step": 15865 + }, + { + "epoch": 1.91, + "grad_norm": 0.2383514642715454, + "learning_rate": 0.00010113375084174382, + "loss": 0.8569, + "step": 15870 + }, + { + "epoch": 1.91, + "grad_norm": 0.23251613974571228, + "learning_rate": 0.00010103647197025414, + "loss": 0.8139, + "step": 15875 + }, + { + "epoch": 1.91, + "grad_norm": 0.2642490863800049, + "learning_rate": 0.0001009392161426267, + "loss": 0.8877, + "step": 15880 + }, + { + "epoch": 1.91, + "grad_norm": 0.25998982787132263, + "learning_rate": 0.00010084198340463345, + "loss": 0.8697, + "step": 15885 + }, + { + "epoch": 1.91, + "grad_norm": 0.2662920355796814, + "learning_rate": 0.00010074477380203529, + "loss": 0.7452, + "step": 15890 + }, + { + "epoch": 1.92, + "grad_norm": 0.24700447916984558, + "learning_rate": 0.00010064758738058231, + "loss": 0.856, + "step": 15895 + }, + { + "epoch": 1.92, + "grad_norm": 0.26831844449043274, + "learning_rate": 0.0001005504241860136, + "loss": 0.7415, + "step": 15900 + }, + { + "epoch": 1.92, + "grad_norm": 0.26766031980514526, + "learning_rate": 0.00010045328426405749, + "loss": 0.7633, + "step": 15905 + }, + { + "epoch": 1.92, + "grad_norm": 0.24661500751972198, + "learning_rate": 0.00010035616766043119, + "loss": 0.8742, + "step": 15910 + }, + { + "epoch": 1.92, + "grad_norm": 0.2439432591199875, + "learning_rate": 0.00010025907442084102, + "loss": 0.7972, + "step": 15915 + }, + { + "epoch": 1.92, + "grad_norm": 0.236423522233963, + "learning_rate": 0.0001001620045909822, + "loss": 0.7564, + "step": 15920 + }, + { + "epoch": 1.92, + "grad_norm": 0.26059049367904663, + "learning_rate": 0.00010006495821653914, + "loss": 0.824, + "step": 15925 + }, + { + "epoch": 1.92, + "grad_norm": 0.2574262022972107, + "learning_rate": 9.996793534318505e-05, + "loss": 0.8156, + "step": 15930 + }, + { + "epoch": 1.92, + "grad_norm": 0.2577226758003235, + "learning_rate": 9.987093601658209e-05, + "loss": 0.8548, + "step": 15935 + }, + { + "epoch": 1.92, + "grad_norm": 0.2414378523826599, + "learning_rate": 9.977396028238136e-05, + "loss": 0.8627, + "step": 15940 + }, + { + "epoch": 1.92, + "grad_norm": 0.2435644119977951, + "learning_rate": 9.969639671222088e-05, + "loss": 0.8817, + "step": 15945 + }, + { + "epoch": 1.92, + "grad_norm": 0.25886785984039307, + "learning_rate": 9.959946355934948e-05, + "loss": 0.7624, + "step": 15950 + }, + { + "epoch": 1.92, + "grad_norm": 0.22503872215747833, + "learning_rate": 9.950255412664435e-05, + "loss": 0.9167, + "step": 15955 + }, + { + "epoch": 1.92, + "grad_norm": 0.24574321508407593, + "learning_rate": 9.940566845971425e-05, + "loss": 0.7814, + "step": 15960 + }, + { + "epoch": 1.92, + "grad_norm": 0.23402521014213562, + "learning_rate": 9.93088066041567e-05, + "loss": 0.8389, + "step": 15965 + }, + { + "epoch": 1.92, + "grad_norm": 0.27619630098342896, + "learning_rate": 9.921196860555813e-05, + "loss": 0.8364, + "step": 15970 + }, + { + "epoch": 1.92, + "grad_norm": 0.2658560872077942, + "learning_rate": 9.91151545094938e-05, + "loss": 0.8762, + "step": 15975 + }, + { + "epoch": 1.93, + "grad_norm": 0.2881810963153839, + "learning_rate": 9.90183643615276e-05, + "loss": 0.8234, + "step": 15980 + }, + { + "epoch": 1.93, + "grad_norm": 0.2377784252166748, + "learning_rate": 9.892159820721216e-05, + "loss": 0.8424, + "step": 15985 + }, + { + "epoch": 1.93, + "grad_norm": 0.22696682810783386, + "learning_rate": 9.882485609208885e-05, + "loss": 0.7891, + "step": 15990 + }, + { + "epoch": 1.93, + "grad_norm": 0.2700740098953247, + "learning_rate": 9.872813806168778e-05, + "loss": 0.8931, + "step": 15995 + }, + { + "epoch": 1.93, + "grad_norm": 0.23319804668426514, + "learning_rate": 9.86314441615276e-05, + "loss": 0.7404, + "step": 16000 + }, + { + "epoch": 1.93, + "grad_norm": 0.23562254011631012, + "learning_rate": 9.853477443711572e-05, + "loss": 0.9749, + "step": 16005 + }, + { + "epoch": 1.93, + "grad_norm": 0.23507817089557648, + "learning_rate": 9.843812893394801e-05, + "loss": 0.8976, + "step": 16010 + }, + { + "epoch": 1.93, + "grad_norm": 0.25539952516555786, + "learning_rate": 9.834150769750921e-05, + "loss": 0.8241, + "step": 16015 + }, + { + "epoch": 1.93, + "grad_norm": 0.24266016483306885, + "learning_rate": 9.824491077327242e-05, + "loss": 0.8013, + "step": 16020 + }, + { + "epoch": 1.93, + "grad_norm": 0.29498210549354553, + "learning_rate": 9.814833820669934e-05, + "loss": 0.8838, + "step": 16025 + }, + { + "epoch": 1.93, + "grad_norm": 0.26613032817840576, + "learning_rate": 9.805179004324022e-05, + "loss": 0.8375, + "step": 16030 + }, + { + "epoch": 1.93, + "grad_norm": 0.2595424950122833, + "learning_rate": 9.795526632833388e-05, + "loss": 0.8382, + "step": 16035 + }, + { + "epoch": 1.93, + "grad_norm": 0.24384133517742157, + "learning_rate": 9.785876710740755e-05, + "loss": 0.9724, + "step": 16040 + }, + { + "epoch": 1.93, + "grad_norm": 0.2365197241306305, + "learning_rate": 9.776229242587701e-05, + "loss": 0.8506, + "step": 16045 + }, + { + "epoch": 1.93, + "grad_norm": 0.2736034095287323, + "learning_rate": 9.766584232914633e-05, + "loss": 0.818, + "step": 16050 + }, + { + "epoch": 1.93, + "grad_norm": 0.2698809504508972, + "learning_rate": 9.756941686260826e-05, + "loss": 0.7629, + "step": 16055 + }, + { + "epoch": 1.94, + "grad_norm": 0.24631649255752563, + "learning_rate": 9.747301607164378e-05, + "loss": 0.8116, + "step": 16060 + }, + { + "epoch": 1.94, + "grad_norm": 0.23516754806041718, + "learning_rate": 9.737664000162233e-05, + "loss": 0.8521, + "step": 16065 + }, + { + "epoch": 1.94, + "grad_norm": 0.22055235505104065, + "learning_rate": 9.728028869790162e-05, + "loss": 0.8474, + "step": 16070 + }, + { + "epoch": 1.94, + "grad_norm": 0.27016615867614746, + "learning_rate": 9.718396220582785e-05, + "loss": 0.8257, + "step": 16075 + }, + { + "epoch": 1.94, + "grad_norm": 0.26734715700149536, + "learning_rate": 9.708766057073543e-05, + "loss": 0.7563, + "step": 16080 + }, + { + "epoch": 1.94, + "grad_norm": 0.2916874587535858, + "learning_rate": 9.69913838379471e-05, + "loss": 0.7673, + "step": 16085 + }, + { + "epoch": 1.94, + "grad_norm": 0.23642754554748535, + "learning_rate": 9.689513205277387e-05, + "loss": 0.8493, + "step": 16090 + }, + { + "epoch": 1.94, + "grad_norm": 0.2527269721031189, + "learning_rate": 9.679890526051507e-05, + "loss": 0.8293, + "step": 16095 + }, + { + "epoch": 1.94, + "grad_norm": 0.2781146168708801, + "learning_rate": 9.670270350645823e-05, + "loss": 0.728, + "step": 16100 + }, + { + "epoch": 1.94, + "grad_norm": 0.24582897126674652, + "learning_rate": 9.660652683587907e-05, + "loss": 0.8191, + "step": 16105 + }, + { + "epoch": 1.94, + "grad_norm": 0.28193217515945435, + "learning_rate": 9.65103752940415e-05, + "loss": 0.8848, + "step": 16110 + }, + { + "epoch": 1.94, + "grad_norm": 0.2869209349155426, + "learning_rate": 9.641424892619766e-05, + "loss": 0.7661, + "step": 16115 + }, + { + "epoch": 1.94, + "grad_norm": 0.25964149832725525, + "learning_rate": 9.631814777758782e-05, + "loss": 0.8795, + "step": 16120 + }, + { + "epoch": 1.94, + "grad_norm": 0.2651737332344055, + "learning_rate": 9.622207189344035e-05, + "loss": 0.8498, + "step": 16125 + }, + { + "epoch": 1.94, + "grad_norm": 0.24850745499134064, + "learning_rate": 9.612602131897169e-05, + "loss": 0.8648, + "step": 16130 + }, + { + "epoch": 1.94, + "grad_norm": 0.2507181167602539, + "learning_rate": 9.602999609938658e-05, + "loss": 0.7739, + "step": 16135 + }, + { + "epoch": 1.94, + "grad_norm": 0.24325953423976898, + "learning_rate": 9.593399627987757e-05, + "loss": 0.8611, + "step": 16140 + }, + { + "epoch": 1.95, + "grad_norm": 0.2649693489074707, + "learning_rate": 9.58380219056254e-05, + "loss": 0.7544, + "step": 16145 + }, + { + "epoch": 1.95, + "grad_norm": 0.25593236088752747, + "learning_rate": 9.574207302179874e-05, + "loss": 0.8856, + "step": 16150 + }, + { + "epoch": 1.95, + "grad_norm": 0.22325646877288818, + "learning_rate": 9.56461496735544e-05, + "loss": 0.7696, + "step": 16155 + }, + { + "epoch": 1.95, + "grad_norm": 0.2551780343055725, + "learning_rate": 9.555025190603709e-05, + "loss": 0.8319, + "step": 16160 + }, + { + "epoch": 1.95, + "grad_norm": 0.2650226354598999, + "learning_rate": 9.54543797643794e-05, + "loss": 0.873, + "step": 16165 + }, + { + "epoch": 1.95, + "grad_norm": 0.25608155131340027, + "learning_rate": 9.5358533293702e-05, + "loss": 0.8617, + "step": 16170 + }, + { + "epoch": 1.95, + "grad_norm": 0.2500768303871155, + "learning_rate": 9.526271253911346e-05, + "loss": 0.8167, + "step": 16175 + }, + { + "epoch": 1.95, + "grad_norm": 0.24089059233665466, + "learning_rate": 9.516691754571015e-05, + "loss": 0.8327, + "step": 16180 + }, + { + "epoch": 1.95, + "grad_norm": 0.24258366227149963, + "learning_rate": 9.50711483585764e-05, + "loss": 0.8146, + "step": 16185 + }, + { + "epoch": 1.95, + "grad_norm": 0.2549149990081787, + "learning_rate": 9.49754050227843e-05, + "loss": 0.8334, + "step": 16190 + }, + { + "epoch": 1.95, + "grad_norm": 0.23933325707912445, + "learning_rate": 9.487968758339395e-05, + "loss": 0.8018, + "step": 16195 + }, + { + "epoch": 1.95, + "grad_norm": 0.24095794558525085, + "learning_rate": 9.478399608545314e-05, + "loss": 0.8354, + "step": 16200 + }, + { + "epoch": 1.95, + "grad_norm": 0.2553461492061615, + "learning_rate": 9.468833057399741e-05, + "loss": 0.8267, + "step": 16205 + }, + { + "epoch": 1.95, + "grad_norm": 0.22408875823020935, + "learning_rate": 9.459269109405017e-05, + "loss": 0.9216, + "step": 16210 + }, + { + "epoch": 1.95, + "grad_norm": 0.2684914469718933, + "learning_rate": 9.44970776906225e-05, + "loss": 0.7983, + "step": 16215 + }, + { + "epoch": 1.95, + "grad_norm": 0.26553064584732056, + "learning_rate": 9.440149040871329e-05, + "loss": 0.8494, + "step": 16220 + }, + { + "epoch": 1.95, + "grad_norm": 0.2991175055503845, + "learning_rate": 9.430592929330907e-05, + "loss": 0.9366, + "step": 16225 + }, + { + "epoch": 1.96, + "grad_norm": 0.2854938209056854, + "learning_rate": 9.421039438938399e-05, + "loss": 0.7927, + "step": 16230 + }, + { + "epoch": 1.96, + "grad_norm": 0.23396198451519012, + "learning_rate": 9.41148857419001e-05, + "loss": 0.8203, + "step": 16235 + }, + { + "epoch": 1.96, + "grad_norm": 0.2335088849067688, + "learning_rate": 9.401940339580687e-05, + "loss": 0.8674, + "step": 16240 + }, + { + "epoch": 1.96, + "grad_norm": 0.27896398305892944, + "learning_rate": 9.392394739604141e-05, + "loss": 0.8955, + "step": 16245 + }, + { + "epoch": 1.96, + "grad_norm": 0.27797919511795044, + "learning_rate": 9.382851778752858e-05, + "loss": 0.8088, + "step": 16250 + }, + { + "epoch": 1.96, + "grad_norm": 0.22393245995044708, + "learning_rate": 9.373311461518066e-05, + "loss": 0.7955, + "step": 16255 + }, + { + "epoch": 1.96, + "grad_norm": 0.2545322775840759, + "learning_rate": 9.363773792389759e-05, + "loss": 0.8558, + "step": 16260 + }, + { + "epoch": 1.96, + "grad_norm": 0.23659881949424744, + "learning_rate": 9.354238775856672e-05, + "loss": 0.9073, + "step": 16265 + }, + { + "epoch": 1.96, + "grad_norm": 0.2683257460594177, + "learning_rate": 9.344706416406312e-05, + "loss": 0.7528, + "step": 16270 + }, + { + "epoch": 1.96, + "grad_norm": 0.2514216899871826, + "learning_rate": 9.335176718524919e-05, + "loss": 0.8392, + "step": 16275 + }, + { + "epoch": 1.96, + "grad_norm": 0.22830678522586823, + "learning_rate": 9.325649686697485e-05, + "loss": 0.7843, + "step": 16280 + }, + { + "epoch": 1.96, + "grad_norm": 0.2406618446111679, + "learning_rate": 9.316125325407746e-05, + "loss": 0.8878, + "step": 16285 + }, + { + "epoch": 1.96, + "grad_norm": 0.24721916019916534, + "learning_rate": 9.306603639138187e-05, + "loss": 0.8405, + "step": 16290 + }, + { + "epoch": 1.96, + "grad_norm": 0.2820783853530884, + "learning_rate": 9.297084632370026e-05, + "loss": 0.8073, + "step": 16295 + }, + { + "epoch": 1.96, + "grad_norm": 0.24439501762390137, + "learning_rate": 9.287568309583227e-05, + "loss": 0.9226, + "step": 16300 + }, + { + "epoch": 1.96, + "grad_norm": 0.25293993949890137, + "learning_rate": 9.278054675256479e-05, + "loss": 0.7773, + "step": 16305 + }, + { + "epoch": 1.97, + "grad_norm": 0.26933521032333374, + "learning_rate": 9.268543733867225e-05, + "loss": 0.8361, + "step": 16310 + }, + { + "epoch": 1.97, + "grad_norm": 0.27007099986076355, + "learning_rate": 9.259035489891628e-05, + "loss": 0.7993, + "step": 16315 + }, + { + "epoch": 1.97, + "grad_norm": 0.25989168882369995, + "learning_rate": 9.24952994780458e-05, + "loss": 0.8354, + "step": 16320 + }, + { + "epoch": 1.97, + "grad_norm": 0.2629483640193939, + "learning_rate": 9.240027112079702e-05, + "loss": 0.7799, + "step": 16325 + }, + { + "epoch": 1.97, + "grad_norm": 0.24853236973285675, + "learning_rate": 9.230526987189351e-05, + "loss": 0.7944, + "step": 16330 + }, + { + "epoch": 1.97, + "grad_norm": 0.2612740099430084, + "learning_rate": 9.2210295776046e-05, + "loss": 0.7517, + "step": 16335 + }, + { + "epoch": 1.97, + "grad_norm": 0.2724323570728302, + "learning_rate": 9.211534887795241e-05, + "loss": 0.8058, + "step": 16340 + }, + { + "epoch": 1.97, + "grad_norm": 0.2580575942993164, + "learning_rate": 9.202042922229788e-05, + "loss": 0.8327, + "step": 16345 + }, + { + "epoch": 1.97, + "grad_norm": 0.29225781559944153, + "learning_rate": 9.192553685375488e-05, + "loss": 0.847, + "step": 16350 + }, + { + "epoch": 1.97, + "grad_norm": 0.26023924350738525, + "learning_rate": 9.18306718169828e-05, + "loss": 0.8871, + "step": 16355 + }, + { + "epoch": 1.97, + "grad_norm": 0.26399603486061096, + "learning_rate": 9.173583415662835e-05, + "loss": 0.8788, + "step": 16360 + }, + { + "epoch": 1.97, + "grad_norm": 0.2926609516143799, + "learning_rate": 9.164102391732514e-05, + "loss": 0.7472, + "step": 16365 + }, + { + "epoch": 1.97, + "grad_norm": 0.2964513897895813, + "learning_rate": 9.156519549902523e-05, + "loss": 0.88, + "step": 16370 + }, + { + "epoch": 1.97, + "grad_norm": 0.28954005241394043, + "learning_rate": 9.147043473005033e-05, + "loss": 0.7696, + "step": 16375 + }, + { + "epoch": 1.97, + "grad_norm": 0.2357548326253891, + "learning_rate": 9.137570150703257e-05, + "loss": 0.7794, + "step": 16380 + }, + { + "epoch": 1.97, + "grad_norm": 0.22681371867656708, + "learning_rate": 9.128099587455652e-05, + "loss": 0.8403, + "step": 16385 + }, + { + "epoch": 1.97, + "grad_norm": 0.26432034373283386, + "learning_rate": 9.118631787719381e-05, + "loss": 0.8527, + "step": 16390 + }, + { + "epoch": 1.98, + "grad_norm": 0.2536108195781708, + "learning_rate": 9.109166755950302e-05, + "loss": 0.8427, + "step": 16395 + }, + { + "epoch": 1.98, + "grad_norm": 0.22776862978935242, + "learning_rate": 9.101596726464934e-05, + "loss": 0.8992, + "step": 16400 + }, + { + "epoch": 1.98, + "grad_norm": 0.24553075432777405, + "learning_rate": 9.092136688261414e-05, + "loss": 0.8519, + "step": 16405 + }, + { + "epoch": 1.98, + "grad_norm": 0.25476738810539246, + "learning_rate": 9.082679430494567e-05, + "loss": 0.7777, + "step": 16410 + }, + { + "epoch": 1.98, + "grad_norm": 0.2561832368373871, + "learning_rate": 9.07322495761529e-05, + "loss": 0.8118, + "step": 16415 + }, + { + "epoch": 1.98, + "grad_norm": 0.26670464873313904, + "learning_rate": 9.06377327407317e-05, + "loss": 0.873, + "step": 16420 + }, + { + "epoch": 1.98, + "grad_norm": 0.261623352766037, + "learning_rate": 9.05432438431648e-05, + "loss": 0.7815, + "step": 16425 + }, + { + "epoch": 1.98, + "grad_norm": 0.283966600894928, + "learning_rate": 9.044878292792187e-05, + "loss": 0.8202, + "step": 16430 + }, + { + "epoch": 1.98, + "grad_norm": 0.23486657440662384, + "learning_rate": 9.035435003945933e-05, + "loss": 0.802, + "step": 16435 + }, + { + "epoch": 1.98, + "grad_norm": 0.233073890209198, + "learning_rate": 9.025994522222043e-05, + "loss": 0.8367, + "step": 16440 + }, + { + "epoch": 1.98, + "grad_norm": 0.24187245965003967, + "learning_rate": 9.016556852063515e-05, + "loss": 0.894, + "step": 16445 + }, + { + "epoch": 1.98, + "grad_norm": 0.2719327211380005, + "learning_rate": 9.007121997912044e-05, + "loss": 0.9086, + "step": 16450 + }, + { + "epoch": 1.98, + "grad_norm": 0.2707170844078064, + "learning_rate": 8.997689964207978e-05, + "loss": 0.7866, + "step": 16455 + }, + { + "epoch": 1.98, + "grad_norm": 0.2778245508670807, + "learning_rate": 8.988260755390346e-05, + "loss": 0.8574, + "step": 16460 + }, + { + "epoch": 1.98, + "grad_norm": 0.24011406302452087, + "learning_rate": 8.978834375896841e-05, + "loss": 0.8875, + "step": 16465 + }, + { + "epoch": 1.98, + "grad_norm": 0.26258188486099243, + "learning_rate": 8.96941083016384e-05, + "loss": 0.8013, + "step": 16470 + }, + { + "epoch": 1.99, + "grad_norm": 0.25577783584594727, + "learning_rate": 8.95999012262637e-05, + "loss": 0.7737, + "step": 16475 + }, + { + "epoch": 1.99, + "grad_norm": 0.2674656808376312, + "learning_rate": 8.950572257718132e-05, + "loss": 0.8443, + "step": 16480 + }, + { + "epoch": 1.99, + "grad_norm": 0.2525770962238312, + "learning_rate": 8.941157239871479e-05, + "loss": 0.8448, + "step": 16485 + }, + { + "epoch": 1.99, + "grad_norm": 0.23468473553657532, + "learning_rate": 8.931745073517443e-05, + "loss": 0.883, + "step": 16490 + }, + { + "epoch": 1.99, + "grad_norm": 0.23663926124572754, + "learning_rate": 8.922335763085696e-05, + "loss": 0.8718, + "step": 16495 + }, + { + "epoch": 1.99, + "grad_norm": 0.22883319854736328, + "learning_rate": 8.912929313004572e-05, + "loss": 0.9618, + "step": 16500 + }, + { + "epoch": 1.99, + "grad_norm": 0.24675635993480682, + "learning_rate": 8.903525727701054e-05, + "loss": 0.8928, + "step": 16505 + }, + { + "epoch": 1.99, + "grad_norm": 0.2842852771282196, + "learning_rate": 8.89412501160079e-05, + "loss": 0.8255, + "step": 16510 + }, + { + "epoch": 1.99, + "grad_norm": 0.26586049795150757, + "learning_rate": 8.884727169128066e-05, + "loss": 0.9482, + "step": 16515 + }, + { + "epoch": 1.99, + "grad_norm": 0.3201524615287781, + "learning_rate": 8.875332204705818e-05, + "loss": 0.8118, + "step": 16520 + }, + { + "epoch": 1.99, + "grad_norm": 0.2589406967163086, + "learning_rate": 8.865940122755623e-05, + "loss": 0.8802, + "step": 16525 + }, + { + "epoch": 1.99, + "grad_norm": 0.28572580218315125, + "learning_rate": 8.85655092769772e-05, + "loss": 0.9211, + "step": 16530 + }, + { + "epoch": 1.99, + "grad_norm": 0.24606913328170776, + "learning_rate": 8.847164623950965e-05, + "loss": 0.8589, + "step": 16535 + }, + { + "epoch": 1.99, + "grad_norm": 0.2393767237663269, + "learning_rate": 8.837781215932862e-05, + "loss": 0.8135, + "step": 16540 + }, + { + "epoch": 1.99, + "grad_norm": 0.26532062888145447, + "learning_rate": 8.828400708059567e-05, + "loss": 0.7389, + "step": 16545 + }, + { + "epoch": 1.99, + "grad_norm": 0.25288575887680054, + "learning_rate": 8.81902310474585e-05, + "loss": 0.8357, + "step": 16550 + }, + { + "epoch": 1.99, + "grad_norm": 0.23215311765670776, + "learning_rate": 8.809648410405123e-05, + "loss": 0.7262, + "step": 16555 + }, + { + "epoch": 2.0, + "grad_norm": 0.2512628734111786, + "learning_rate": 8.800276629449426e-05, + "loss": 0.7932, + "step": 16560 + }, + { + "epoch": 2.0, + "grad_norm": 0.23022831976413727, + "learning_rate": 8.790907766289437e-05, + "loss": 0.7226, + "step": 16565 + }, + { + "epoch": 2.0, + "grad_norm": 0.24034039676189423, + "learning_rate": 8.781541825334453e-05, + "loss": 0.8625, + "step": 16570 + }, + { + "epoch": 2.0, + "grad_norm": 0.23891344666481018, + "learning_rate": 8.772178810992392e-05, + "loss": 0.8287, + "step": 16575 + }, + { + "epoch": 2.0, + "grad_norm": 0.23461419343948364, + "learning_rate": 8.762818727669797e-05, + "loss": 0.7968, + "step": 16580 + }, + { + "epoch": 2.0, + "grad_norm": 0.24658669531345367, + "learning_rate": 8.753461579771846e-05, + "loss": 0.787, + "step": 16585 + }, + { + "epoch": 2.0, + "grad_norm": 0.23091553151607513, + "learning_rate": 8.744107371702315e-05, + "loss": 0.8718, + "step": 16590 + }, + { + "epoch": 2.0, + "grad_norm": 0.26409897208213806, + "learning_rate": 8.734756107863608e-05, + "loss": 0.8378, + "step": 16595 + }, + { + "epoch": 2.0, + "grad_norm": 0.27900955080986023, + "learning_rate": 8.725407792656731e-05, + "loss": 0.7441, + "step": 16600 + }, + { + "epoch": 2.0, + "grad_norm": 0.2287808060646057, + "learning_rate": 8.716062430481328e-05, + "loss": 0.8506, + "step": 16605 + }, + { + "epoch": 2.0, + "grad_norm": 0.26879292726516724, + "learning_rate": 8.706720025735627e-05, + "loss": 0.8325, + "step": 16610 + }, + { + "epoch": 2.0, + "grad_norm": 0.28211572766304016, + "learning_rate": 8.697380582816476e-05, + "loss": 0.8599, + "step": 16615 + }, + { + "epoch": 2.0, + "grad_norm": 0.2926715016365051, + "learning_rate": 8.688044106119325e-05, + "loss": 0.7717, + "step": 16620 + }, + { + "epoch": 2.0, + "grad_norm": 0.27610814571380615, + "learning_rate": 8.678710600038233e-05, + "loss": 0.8317, + "step": 16625 + }, + { + "epoch": 2.0, + "grad_norm": 0.2478606104850769, + "learning_rate": 8.669380068965856e-05, + "loss": 0.8358, + "step": 16630 + }, + { + "epoch": 2.0, + "grad_norm": 0.2299978882074356, + "learning_rate": 8.660052517293448e-05, + "loss": 0.8603, + "step": 16635 + }, + { + "epoch": 2.0, + "grad_norm": 0.27341413497924805, + "learning_rate": 8.650727949410867e-05, + "loss": 0.9561, + "step": 16640 + }, + { + "epoch": 2.01, + "grad_norm": 0.25213295221328735, + "learning_rate": 8.641406369706572e-05, + "loss": 0.8045, + "step": 16645 + }, + { + "epoch": 2.01, + "grad_norm": 0.2605089843273163, + "learning_rate": 8.6320877825676e-05, + "loss": 0.8112, + "step": 16650 + }, + { + "epoch": 2.01, + "grad_norm": 0.27594542503356934, + "learning_rate": 8.622772192379588e-05, + "loss": 0.8145, + "step": 16655 + }, + { + "epoch": 2.01, + "grad_norm": 0.2801823318004608, + "learning_rate": 8.61345960352676e-05, + "loss": 0.6998, + "step": 16660 + }, + { + "epoch": 2.01, + "grad_norm": 0.2606346011161804, + "learning_rate": 8.604150020391937e-05, + "loss": 0.7655, + "step": 16665 + }, + { + "epoch": 2.01, + "grad_norm": 0.23958322405815125, + "learning_rate": 8.594843447356517e-05, + "loss": 0.8022, + "step": 16670 + }, + { + "epoch": 2.01, + "grad_norm": 0.26263347268104553, + "learning_rate": 8.585539888800475e-05, + "loss": 0.7376, + "step": 16675 + }, + { + "epoch": 2.01, + "grad_norm": 0.23959647119045258, + "learning_rate": 8.576239349102375e-05, + "loss": 0.763, + "step": 16680 + }, + { + "epoch": 2.01, + "grad_norm": 0.25928083062171936, + "learning_rate": 8.56694183263937e-05, + "loss": 0.7902, + "step": 16685 + }, + { + "epoch": 2.01, + "grad_norm": 0.24009615182876587, + "learning_rate": 8.557647343787175e-05, + "loss": 0.8473, + "step": 16690 + }, + { + "epoch": 2.01, + "grad_norm": 0.2364853471517563, + "learning_rate": 8.548355886920084e-05, + "loss": 0.7173, + "step": 16695 + }, + { + "epoch": 2.01, + "grad_norm": 0.2518242299556732, + "learning_rate": 8.539067466410962e-05, + "loss": 0.8219, + "step": 16700 + }, + { + "epoch": 2.01, + "grad_norm": 0.28275394439697266, + "learning_rate": 8.529782086631254e-05, + "loss": 0.9054, + "step": 16705 + }, + { + "epoch": 2.01, + "grad_norm": 0.2858952283859253, + "learning_rate": 8.520499751950965e-05, + "loss": 0.8016, + "step": 16710 + }, + { + "epoch": 2.01, + "grad_norm": 0.2402840256690979, + "learning_rate": 8.51122046673867e-05, + "loss": 0.8295, + "step": 16715 + }, + { + "epoch": 2.01, + "grad_norm": 0.2762582004070282, + "learning_rate": 8.501944235361502e-05, + "loss": 0.733, + "step": 16720 + }, + { + "epoch": 2.02, + "grad_norm": 0.2743135094642639, + "learning_rate": 8.492671062185177e-05, + "loss": 0.7954, + "step": 16725 + }, + { + "epoch": 2.02, + "grad_norm": 0.24607881903648376, + "learning_rate": 8.483400951573954e-05, + "loss": 0.7754, + "step": 16730 + }, + { + "epoch": 2.02, + "grad_norm": 0.27087563276290894, + "learning_rate": 8.474133907890651e-05, + "loss": 0.7463, + "step": 16735 + }, + { + "epoch": 2.02, + "grad_norm": 0.24239374697208405, + "learning_rate": 8.464869935496641e-05, + "loss": 0.7516, + "step": 16740 + }, + { + "epoch": 2.02, + "grad_norm": 0.27605438232421875, + "learning_rate": 8.455609038751871e-05, + "loss": 0.7053, + "step": 16745 + }, + { + "epoch": 2.02, + "grad_norm": 0.29548242688179016, + "learning_rate": 8.446351222014822e-05, + "loss": 0.7657, + "step": 16750 + }, + { + "epoch": 2.02, + "grad_norm": 0.26143786311149597, + "learning_rate": 8.437096489642526e-05, + "loss": 0.7796, + "step": 16755 + }, + { + "epoch": 2.02, + "grad_norm": 0.26324915885925293, + "learning_rate": 8.42784484599057e-05, + "loss": 0.7717, + "step": 16760 + }, + { + "epoch": 2.02, + "grad_norm": 0.2451925128698349, + "learning_rate": 8.418596295413083e-05, + "loss": 0.8181, + "step": 16765 + }, + { + "epoch": 2.02, + "grad_norm": 0.26199206709861755, + "learning_rate": 8.409350842262741e-05, + "loss": 0.8205, + "step": 16770 + }, + { + "epoch": 2.02, + "grad_norm": 0.26254212856292725, + "learning_rate": 8.400108490890763e-05, + "loss": 0.7734, + "step": 16775 + }, + { + "epoch": 2.02, + "grad_norm": 0.24498294293880463, + "learning_rate": 8.390869245646897e-05, + "loss": 0.7043, + "step": 16780 + }, + { + "epoch": 2.02, + "grad_norm": 0.30816173553466797, + "learning_rate": 8.381633110879454e-05, + "loss": 0.8023, + "step": 16785 + }, + { + "epoch": 2.02, + "grad_norm": 0.24860282242298126, + "learning_rate": 8.372400090935256e-05, + "loss": 0.7682, + "step": 16790 + }, + { + "epoch": 2.02, + "grad_norm": 0.28883734345436096, + "learning_rate": 8.363170190159673e-05, + "loss": 0.8831, + "step": 16795 + }, + { + "epoch": 2.02, + "grad_norm": 0.28336241841316223, + "learning_rate": 8.353943412896596e-05, + "loss": 0.7443, + "step": 16800 + }, + { + "epoch": 2.02, + "grad_norm": 0.27060964703559875, + "learning_rate": 8.34471976348846e-05, + "loss": 0.8307, + "step": 16805 + }, + { + "epoch": 2.03, + "grad_norm": 0.27045443654060364, + "learning_rate": 8.33549924627622e-05, + "loss": 0.7417, + "step": 16810 + }, + { + "epoch": 2.03, + "grad_norm": 0.24936868250370026, + "learning_rate": 8.326281865599356e-05, + "loss": 0.801, + "step": 16815 + }, + { + "epoch": 2.03, + "grad_norm": 0.28478366136550903, + "learning_rate": 8.317067625795867e-05, + "loss": 0.7997, + "step": 16820 + }, + { + "epoch": 2.03, + "grad_norm": 0.28683024644851685, + "learning_rate": 8.307856531202295e-05, + "loss": 0.8765, + "step": 16825 + }, + { + "epoch": 2.03, + "grad_norm": 0.2502270042896271, + "learning_rate": 8.298648586153676e-05, + "loss": 0.8878, + "step": 16830 + }, + { + "epoch": 2.03, + "grad_norm": 0.25373736023902893, + "learning_rate": 8.289443794983578e-05, + "loss": 0.7722, + "step": 16835 + }, + { + "epoch": 2.03, + "grad_norm": 0.2576674818992615, + "learning_rate": 8.280242162024079e-05, + "loss": 0.8224, + "step": 16840 + }, + { + "epoch": 2.03, + "grad_norm": 0.26497557759284973, + "learning_rate": 8.271043691605778e-05, + "loss": 0.8216, + "step": 16845 + }, + { + "epoch": 2.03, + "grad_norm": 0.2641935646533966, + "learning_rate": 8.261848388057775e-05, + "loss": 0.755, + "step": 16850 + }, + { + "epoch": 2.03, + "grad_norm": 0.28792330622673035, + "learning_rate": 8.252656255707689e-05, + "loss": 0.7801, + "step": 16855 + }, + { + "epoch": 2.03, + "grad_norm": 0.2565497159957886, + "learning_rate": 8.243467298881636e-05, + "loss": 0.7938, + "step": 16860 + }, + { + "epoch": 2.03, + "grad_norm": 0.2724232077598572, + "learning_rate": 8.234281521904253e-05, + "loss": 0.8549, + "step": 16865 + }, + { + "epoch": 2.03, + "grad_norm": 0.24661044776439667, + "learning_rate": 8.225098929098673e-05, + "loss": 0.812, + "step": 16870 + }, + { + "epoch": 2.03, + "grad_norm": 0.23888921737670898, + "learning_rate": 8.215919524786521e-05, + "loss": 0.7724, + "step": 16875 + }, + { + "epoch": 2.03, + "grad_norm": 0.24567218124866486, + "learning_rate": 8.206743313287925e-05, + "loss": 0.7095, + "step": 16880 + }, + { + "epoch": 2.03, + "grad_norm": 0.2549126148223877, + "learning_rate": 8.197570298921533e-05, + "loss": 0.7106, + "step": 16885 + }, + { + "epoch": 2.04, + "grad_norm": 0.2611244022846222, + "learning_rate": 8.18840048600446e-05, + "loss": 0.8283, + "step": 16890 + }, + { + "epoch": 2.04, + "grad_norm": 0.24455487728118896, + "learning_rate": 8.179233878852323e-05, + "loss": 0.7387, + "step": 16895 + }, + { + "epoch": 2.04, + "grad_norm": 0.3053877651691437, + "learning_rate": 8.170070481779224e-05, + "loss": 0.8595, + "step": 16900 + }, + { + "epoch": 2.04, + "grad_norm": 0.2966110408306122, + "learning_rate": 8.160910299097782e-05, + "loss": 0.7955, + "step": 16905 + }, + { + "epoch": 2.04, + "grad_norm": 0.29260462522506714, + "learning_rate": 8.15175333511907e-05, + "loss": 0.8606, + "step": 16910 + }, + { + "epoch": 2.04, + "grad_norm": 0.27227169275283813, + "learning_rate": 8.14259959415267e-05, + "loss": 0.6951, + "step": 16915 + }, + { + "epoch": 2.04, + "grad_norm": 0.2581891417503357, + "learning_rate": 8.133449080506615e-05, + "loss": 0.8414, + "step": 16920 + }, + { + "epoch": 2.04, + "grad_norm": 0.2824326157569885, + "learning_rate": 8.124301798487458e-05, + "loss": 0.7542, + "step": 16925 + }, + { + "epoch": 2.04, + "grad_norm": 0.28472477197647095, + "learning_rate": 8.115157752400211e-05, + "loss": 0.7875, + "step": 16930 + }, + { + "epoch": 2.04, + "grad_norm": 0.3563947081565857, + "learning_rate": 8.106016946548365e-05, + "loss": 0.8582, + "step": 16935 + }, + { + "epoch": 2.04, + "grad_norm": 0.27918270230293274, + "learning_rate": 8.096879385233879e-05, + "loss": 0.8431, + "step": 16940 + }, + { + "epoch": 2.04, + "grad_norm": 0.31938403844833374, + "learning_rate": 8.087745072757208e-05, + "loss": 0.8991, + "step": 16945 + }, + { + "epoch": 2.04, + "grad_norm": 0.33040347695350647, + "learning_rate": 8.078614013417253e-05, + "loss": 0.7591, + "step": 16950 + }, + { + "epoch": 2.04, + "grad_norm": 0.28828245401382446, + "learning_rate": 8.069486211511394e-05, + "loss": 0.7754, + "step": 16955 + }, + { + "epoch": 2.04, + "grad_norm": 0.24763058125972748, + "learning_rate": 8.060361671335474e-05, + "loss": 0.7698, + "step": 16960 + }, + { + "epoch": 2.04, + "grad_norm": 0.23437534272670746, + "learning_rate": 8.051240397183818e-05, + "loss": 0.7253, + "step": 16965 + }, + { + "epoch": 2.04, + "grad_norm": 0.2842622995376587, + "learning_rate": 8.042122393349189e-05, + "loss": 0.773, + "step": 16970 + }, + { + "epoch": 2.05, + "grad_norm": 0.26870718598365784, + "learning_rate": 8.033007664122827e-05, + "loss": 0.7793, + "step": 16975 + }, + { + "epoch": 2.05, + "grad_norm": 0.26021209359169006, + "learning_rate": 8.023896213794425e-05, + "loss": 0.8217, + "step": 16980 + }, + { + "epoch": 2.05, + "grad_norm": 0.26792365312576294, + "learning_rate": 8.014788046652135e-05, + "loss": 0.8153, + "step": 16985 + }, + { + "epoch": 2.05, + "grad_norm": 0.2724539637565613, + "learning_rate": 8.00568316698256e-05, + "loss": 0.7681, + "step": 16990 + }, + { + "epoch": 2.05, + "grad_norm": 0.27169302105903625, + "learning_rate": 7.996581579070762e-05, + "loss": 0.7707, + "step": 16995 + }, + { + "epoch": 2.05, + "grad_norm": 0.25969168543815613, + "learning_rate": 7.987483287200243e-05, + "loss": 0.8198, + "step": 17000 + }, + { + "epoch": 2.05, + "grad_norm": 0.279259592294693, + "learning_rate": 7.978388295652974e-05, + "loss": 0.7626, + "step": 17005 + }, + { + "epoch": 2.05, + "grad_norm": 0.27438825368881226, + "learning_rate": 7.969296608709351e-05, + "loss": 0.7247, + "step": 17010 + }, + { + "epoch": 2.05, + "grad_norm": 0.30393609404563904, + "learning_rate": 7.96020823064823e-05, + "loss": 0.7484, + "step": 17015 + }, + { + "epoch": 2.05, + "grad_norm": 0.2659038007259369, + "learning_rate": 7.951123165746892e-05, + "loss": 0.9497, + "step": 17020 + }, + { + "epoch": 2.05, + "grad_norm": 0.2514190673828125, + "learning_rate": 7.942041418281086e-05, + "loss": 0.729, + "step": 17025 + }, + { + "epoch": 2.05, + "grad_norm": 0.2725908160209656, + "learning_rate": 7.932962992524974e-05, + "loss": 0.8552, + "step": 17030 + }, + { + "epoch": 2.05, + "grad_norm": 0.28015610575675964, + "learning_rate": 7.923887892751165e-05, + "loss": 0.7491, + "step": 17035 + }, + { + "epoch": 2.05, + "grad_norm": 0.2758082151412964, + "learning_rate": 7.914816123230703e-05, + "loss": 0.7509, + "step": 17040 + }, + { + "epoch": 2.05, + "grad_norm": 0.26611337065696716, + "learning_rate": 7.905747688233069e-05, + "loss": 0.7658, + "step": 17045 + }, + { + "epoch": 2.05, + "grad_norm": 0.24941769242286682, + "learning_rate": 7.896682592026164e-05, + "loss": 0.7546, + "step": 17050 + }, + { + "epoch": 2.05, + "grad_norm": 0.28314536809921265, + "learning_rate": 7.887620838876326e-05, + "loss": 0.7747, + "step": 17055 + }, + { + "epoch": 2.06, + "grad_norm": 0.29477396607398987, + "learning_rate": 7.878562433048316e-05, + "loss": 0.8356, + "step": 17060 + }, + { + "epoch": 2.06, + "grad_norm": 0.278089314699173, + "learning_rate": 7.869507378805321e-05, + "loss": 0.8421, + "step": 17065 + }, + { + "epoch": 2.06, + "grad_norm": 0.27571311593055725, + "learning_rate": 7.86045568040895e-05, + "loss": 0.8294, + "step": 17070 + }, + { + "epoch": 2.06, + "grad_norm": 0.27711910009384155, + "learning_rate": 7.851407342119226e-05, + "loss": 0.8208, + "step": 17075 + }, + { + "epoch": 2.06, + "grad_norm": 0.27659597992897034, + "learning_rate": 7.842362368194611e-05, + "loss": 0.7766, + "step": 17080 + }, + { + "epoch": 2.06, + "grad_norm": 0.28369686007499695, + "learning_rate": 7.833320762891964e-05, + "loss": 0.8258, + "step": 17085 + }, + { + "epoch": 2.06, + "grad_norm": 0.2623741626739502, + "learning_rate": 7.824282530466562e-05, + "loss": 0.836, + "step": 17090 + }, + { + "epoch": 2.06, + "grad_norm": 0.2581416368484497, + "learning_rate": 7.815247675172097e-05, + "loss": 0.7441, + "step": 17095 + }, + { + "epoch": 2.06, + "grad_norm": 0.2673351764678955, + "learning_rate": 7.806216201260677e-05, + "loss": 0.8585, + "step": 17100 + }, + { + "epoch": 2.06, + "grad_norm": 0.26765942573547363, + "learning_rate": 7.797188112982811e-05, + "loss": 0.7769, + "step": 17105 + }, + { + "epoch": 2.06, + "grad_norm": 0.28835397958755493, + "learning_rate": 7.788163414587417e-05, + "loss": 0.8654, + "step": 17110 + }, + { + "epoch": 2.06, + "grad_norm": 0.263345330953598, + "learning_rate": 7.779142110321812e-05, + "loss": 0.9097, + "step": 17115 + }, + { + "epoch": 2.06, + "grad_norm": 0.2905290126800537, + "learning_rate": 7.770124204431734e-05, + "loss": 0.8099, + "step": 17120 + }, + { + "epoch": 2.06, + "grad_norm": 0.2563280463218689, + "learning_rate": 7.761109701161308e-05, + "loss": 0.8323, + "step": 17125 + }, + { + "epoch": 2.06, + "grad_norm": 0.24418336153030396, + "learning_rate": 7.752098604753045e-05, + "loss": 0.7408, + "step": 17130 + }, + { + "epoch": 2.06, + "grad_norm": 0.267090767621994, + "learning_rate": 7.743090919447869e-05, + "loss": 0.6895, + "step": 17135 + }, + { + "epoch": 2.07, + "grad_norm": 0.2744917869567871, + "learning_rate": 7.734086649485109e-05, + "loss": 0.7986, + "step": 17140 + }, + { + "epoch": 2.07, + "grad_norm": 0.2881864011287689, + "learning_rate": 7.725085799102464e-05, + "loss": 0.7426, + "step": 17145 + }, + { + "epoch": 2.07, + "grad_norm": 0.32715755701065063, + "learning_rate": 7.716088372536035e-05, + "loss": 0.779, + "step": 17150 + }, + { + "epoch": 2.07, + "grad_norm": 0.2890661060810089, + "learning_rate": 7.707094374020302e-05, + "loss": 0.8839, + "step": 17155 + }, + { + "epoch": 2.07, + "grad_norm": 0.2906983196735382, + "learning_rate": 7.698103807788152e-05, + "loss": 0.9254, + "step": 17160 + }, + { + "epoch": 2.07, + "grad_norm": 0.27394556999206543, + "learning_rate": 7.689116678070839e-05, + "loss": 0.7887, + "step": 17165 + }, + { + "epoch": 2.07, + "grad_norm": 0.2804103195667267, + "learning_rate": 7.680132989098004e-05, + "loss": 0.7343, + "step": 17170 + }, + { + "epoch": 2.07, + "grad_norm": 0.2799692451953888, + "learning_rate": 7.671152745097664e-05, + "loss": 0.7547, + "step": 17175 + }, + { + "epoch": 2.07, + "grad_norm": 0.2532331943511963, + "learning_rate": 7.662175950296231e-05, + "loss": 0.8438, + "step": 17180 + }, + { + "epoch": 2.07, + "grad_norm": 0.2761310338973999, + "learning_rate": 7.653202608918479e-05, + "loss": 0.7216, + "step": 17185 + }, + { + "epoch": 2.07, + "grad_norm": 0.27380895614624023, + "learning_rate": 7.644232725187557e-05, + "loss": 0.7755, + "step": 17190 + }, + { + "epoch": 2.07, + "grad_norm": 0.3025756776332855, + "learning_rate": 7.635266303324993e-05, + "loss": 0.7884, + "step": 17195 + }, + { + "epoch": 2.07, + "grad_norm": 0.26098042726516724, + "learning_rate": 7.62630334755068e-05, + "loss": 0.7618, + "step": 17200 + }, + { + "epoch": 2.07, + "grad_norm": 0.2765859365463257, + "learning_rate": 7.617343862082887e-05, + "loss": 0.8837, + "step": 17205 + }, + { + "epoch": 2.07, + "grad_norm": 0.29430699348449707, + "learning_rate": 7.608387851138241e-05, + "loss": 0.8484, + "step": 17210 + }, + { + "epoch": 2.07, + "grad_norm": 0.2927020788192749, + "learning_rate": 7.599435318931737e-05, + "loss": 0.7411, + "step": 17215 + }, + { + "epoch": 2.07, + "grad_norm": 0.29646044969558716, + "learning_rate": 7.590486269676741e-05, + "loss": 0.7927, + "step": 17220 + }, + { + "epoch": 2.08, + "grad_norm": 0.2691041827201843, + "learning_rate": 7.58154070758497e-05, + "loss": 0.7709, + "step": 17225 + }, + { + "epoch": 2.08, + "grad_norm": 0.2560572922229767, + "learning_rate": 7.572598636866499e-05, + "loss": 0.8836, + "step": 17230 + }, + { + "epoch": 2.08, + "grad_norm": 0.27019980549812317, + "learning_rate": 7.563660061729763e-05, + "loss": 0.83, + "step": 17235 + }, + { + "epoch": 2.08, + "grad_norm": 0.2663953900337219, + "learning_rate": 7.554724986381558e-05, + "loss": 0.7005, + "step": 17240 + }, + { + "epoch": 2.08, + "grad_norm": 0.2575012743473053, + "learning_rate": 7.545793415027026e-05, + "loss": 0.8471, + "step": 17245 + }, + { + "epoch": 2.08, + "grad_norm": 0.3181961476802826, + "learning_rate": 7.53686535186966e-05, + "loss": 0.8156, + "step": 17250 + }, + { + "epoch": 2.08, + "grad_norm": 0.2710973024368286, + "learning_rate": 7.527940801111296e-05, + "loss": 0.7603, + "step": 17255 + }, + { + "epoch": 2.08, + "grad_norm": 0.23654502630233765, + "learning_rate": 7.519019766952135e-05, + "loss": 0.7897, + "step": 17260 + }, + { + "epoch": 2.08, + "grad_norm": 0.2502569258213043, + "learning_rate": 7.51010225359071e-05, + "loss": 0.7789, + "step": 17265 + }, + { + "epoch": 2.08, + "grad_norm": 0.2620236277580261, + "learning_rate": 7.501188265223893e-05, + "loss": 0.7543, + "step": 17270 + }, + { + "epoch": 2.08, + "grad_norm": 0.27987682819366455, + "learning_rate": 7.492277806046908e-05, + "loss": 0.8192, + "step": 17275 + }, + { + "epoch": 2.08, + "grad_norm": 0.23856423795223236, + "learning_rate": 7.483370880253311e-05, + "loss": 0.7938, + "step": 17280 + }, + { + "epoch": 2.08, + "grad_norm": 0.26246753334999084, + "learning_rate": 7.474467492034998e-05, + "loss": 0.7727, + "step": 17285 + }, + { + "epoch": 2.08, + "grad_norm": 0.2560681998729706, + "learning_rate": 7.465567645582199e-05, + "loss": 0.7481, + "step": 17290 + }, + { + "epoch": 2.08, + "grad_norm": 0.27870872616767883, + "learning_rate": 7.45667134508347e-05, + "loss": 0.7659, + "step": 17295 + }, + { + "epoch": 2.08, + "grad_norm": 0.2658903896808624, + "learning_rate": 7.447778594725717e-05, + "loss": 0.8235, + "step": 17300 + }, + { + "epoch": 2.09, + "grad_norm": 0.3086778223514557, + "learning_rate": 7.438889398694161e-05, + "loss": 0.8337, + "step": 17305 + }, + { + "epoch": 2.09, + "grad_norm": 0.2679433226585388, + "learning_rate": 7.430003761172349e-05, + "loss": 0.7323, + "step": 17310 + }, + { + "epoch": 2.09, + "grad_norm": 0.2665143311023712, + "learning_rate": 7.421121686342152e-05, + "loss": 0.8554, + "step": 17315 + }, + { + "epoch": 2.09, + "grad_norm": 0.24720510840415955, + "learning_rate": 7.412243178383784e-05, + "loss": 0.788, + "step": 17320 + }, + { + "epoch": 2.09, + "grad_norm": 0.28797394037246704, + "learning_rate": 7.403368241475757e-05, + "loss": 0.7654, + "step": 17325 + }, + { + "epoch": 2.09, + "grad_norm": 0.24396300315856934, + "learning_rate": 7.394496879794911e-05, + "loss": 0.8301, + "step": 17330 + }, + { + "epoch": 2.09, + "grad_norm": 0.26526421308517456, + "learning_rate": 7.385629097516407e-05, + "loss": 0.9021, + "step": 17335 + }, + { + "epoch": 2.09, + "grad_norm": 0.2767682671546936, + "learning_rate": 7.376764898813714e-05, + "loss": 0.7755, + "step": 17340 + }, + { + "epoch": 2.09, + "grad_norm": 0.3317005932331085, + "learning_rate": 7.367904287858618e-05, + "loss": 0.8482, + "step": 17345 + }, + { + "epoch": 2.09, + "grad_norm": 0.2770203948020935, + "learning_rate": 7.359047268821219e-05, + "loss": 0.8076, + "step": 17350 + }, + { + "epoch": 2.09, + "grad_norm": 0.2635518014431, + "learning_rate": 7.350193845869918e-05, + "loss": 0.8716, + "step": 17355 + }, + { + "epoch": 2.09, + "grad_norm": 0.2668205797672272, + "learning_rate": 7.341344023171441e-05, + "loss": 0.7338, + "step": 17360 + }, + { + "epoch": 2.09, + "grad_norm": 0.29583418369293213, + "learning_rate": 7.332497804890803e-05, + "loss": 0.7572, + "step": 17365 + }, + { + "epoch": 2.09, + "grad_norm": 0.26507455110549927, + "learning_rate": 7.323655195191328e-05, + "loss": 0.7844, + "step": 17370 + }, + { + "epoch": 2.09, + "grad_norm": 0.2781868278980255, + "learning_rate": 7.314816198234636e-05, + "loss": 0.7465, + "step": 17375 + }, + { + "epoch": 2.09, + "grad_norm": 0.2629784345626831, + "learning_rate": 7.305980818180663e-05, + "loss": 0.8417, + "step": 17380 + }, + { + "epoch": 2.09, + "grad_norm": 0.2834410071372986, + "learning_rate": 7.297149059187628e-05, + "loss": 0.7415, + "step": 17385 + }, + { + "epoch": 2.1, + "grad_norm": 0.3003019392490387, + "learning_rate": 7.28832092541205e-05, + "loss": 0.8308, + "step": 17390 + }, + { + "epoch": 2.1, + "grad_norm": 0.2973310351371765, + "learning_rate": 7.279496421008735e-05, + "loss": 0.7261, + "step": 17395 + }, + { + "epoch": 2.1, + "grad_norm": 0.2581186890602112, + "learning_rate": 7.2706755501308e-05, + "loss": 0.6732, + "step": 17400 + }, + { + "epoch": 2.1, + "grad_norm": 0.276574969291687, + "learning_rate": 7.261858316929634e-05, + "loss": 0.7984, + "step": 17405 + }, + { + "epoch": 2.1, + "grad_norm": 0.24236898124217987, + "learning_rate": 7.253044725554922e-05, + "loss": 0.7923, + "step": 17410 + }, + { + "epoch": 2.1, + "grad_norm": 0.26904451847076416, + "learning_rate": 7.244234780154627e-05, + "loss": 0.7706, + "step": 17415 + }, + { + "epoch": 2.1, + "grad_norm": 0.2632739841938019, + "learning_rate": 7.235428484875006e-05, + "loss": 0.748, + "step": 17420 + }, + { + "epoch": 2.1, + "grad_norm": 0.3082495629787445, + "learning_rate": 7.226625843860595e-05, + "loss": 0.8136, + "step": 17425 + }, + { + "epoch": 2.1, + "grad_norm": 0.25225359201431274, + "learning_rate": 7.217826861254208e-05, + "loss": 0.7853, + "step": 17430 + }, + { + "epoch": 2.1, + "grad_norm": 0.2693781554698944, + "learning_rate": 7.209031541196931e-05, + "loss": 0.8022, + "step": 17435 + }, + { + "epoch": 2.1, + "grad_norm": 0.2704066038131714, + "learning_rate": 7.200239887828147e-05, + "loss": 0.8619, + "step": 17440 + }, + { + "epoch": 2.1, + "grad_norm": 0.2754969000816345, + "learning_rate": 7.191451905285494e-05, + "loss": 0.7699, + "step": 17445 + }, + { + "epoch": 2.1, + "grad_norm": 0.2515884041786194, + "learning_rate": 7.182667597704889e-05, + "loss": 0.7937, + "step": 17450 + }, + { + "epoch": 2.1, + "grad_norm": 0.27935126423835754, + "learning_rate": 7.17388696922051e-05, + "loss": 0.7801, + "step": 17455 + }, + { + "epoch": 2.1, + "grad_norm": 0.24923402070999146, + "learning_rate": 7.165110023964828e-05, + "loss": 0.8244, + "step": 17460 + }, + { + "epoch": 2.1, + "grad_norm": 0.30999448895454407, + "learning_rate": 7.156336766068557e-05, + "loss": 0.7159, + "step": 17465 + }, + { + "epoch": 2.1, + "grad_norm": 0.2763430178165436, + "learning_rate": 7.147567199660684e-05, + "loss": 0.8173, + "step": 17470 + }, + { + "epoch": 2.11, + "grad_norm": 0.25914233922958374, + "learning_rate": 7.138801328868453e-05, + "loss": 0.7264, + "step": 17475 + }, + { + "epoch": 2.11, + "grad_norm": 0.28961434960365295, + "learning_rate": 7.130039157817384e-05, + "loss": 0.7886, + "step": 17480 + }, + { + "epoch": 2.11, + "grad_norm": 0.29315581917762756, + "learning_rate": 7.121280690631239e-05, + "loss": 0.7655, + "step": 17485 + }, + { + "epoch": 2.11, + "grad_norm": 0.24615170061588287, + "learning_rate": 7.112525931432047e-05, + "loss": 0.7012, + "step": 17490 + }, + { + "epoch": 2.11, + "grad_norm": 0.28100574016571045, + "learning_rate": 7.103774884340087e-05, + "loss": 0.8085, + "step": 17495 + }, + { + "epoch": 2.11, + "grad_norm": 0.25875842571258545, + "learning_rate": 7.095027553473891e-05, + "loss": 0.85, + "step": 17500 + }, + { + "epoch": 2.11, + "grad_norm": 0.28253424167633057, + "learning_rate": 7.086283942950246e-05, + "loss": 0.7207, + "step": 17505 + }, + { + "epoch": 2.11, + "grad_norm": 0.2582673728466034, + "learning_rate": 7.077544056884182e-05, + "loss": 0.8388, + "step": 17510 + }, + { + "epoch": 2.11, + "grad_norm": 0.2423962503671646, + "learning_rate": 7.068807899388976e-05, + "loss": 0.7351, + "step": 17515 + }, + { + "epoch": 2.11, + "grad_norm": 0.2518567144870758, + "learning_rate": 7.060075474576165e-05, + "loss": 0.755, + "step": 17520 + }, + { + "epoch": 2.11, + "grad_norm": 0.2877778708934784, + "learning_rate": 7.051346786555513e-05, + "loss": 0.7769, + "step": 17525 + }, + { + "epoch": 2.11, + "grad_norm": 0.2926938235759735, + "learning_rate": 7.042621839435029e-05, + "loss": 0.8798, + "step": 17530 + }, + { + "epoch": 2.11, + "grad_norm": 0.295379638671875, + "learning_rate": 7.033900637320958e-05, + "loss": 0.7104, + "step": 17535 + }, + { + "epoch": 2.11, + "grad_norm": 0.27940165996551514, + "learning_rate": 7.0251831843178e-05, + "loss": 0.732, + "step": 17540 + }, + { + "epoch": 2.11, + "grad_norm": 0.2717758119106293, + "learning_rate": 7.01646948452827e-05, + "loss": 0.8071, + "step": 17545 + }, + { + "epoch": 2.11, + "grad_norm": 0.26429474353790283, + "learning_rate": 7.007759542053324e-05, + "loss": 0.767, + "step": 17550 + }, + { + "epoch": 2.12, + "grad_norm": 0.3140299916267395, + "learning_rate": 6.999053360992155e-05, + "loss": 0.7958, + "step": 17555 + }, + { + "epoch": 2.12, + "grad_norm": 0.2404472976922989, + "learning_rate": 6.990350945442173e-05, + "loss": 0.8426, + "step": 17560 + }, + { + "epoch": 2.12, + "grad_norm": 0.2709415555000305, + "learning_rate": 6.981652299499032e-05, + "loss": 0.7802, + "step": 17565 + }, + { + "epoch": 2.12, + "grad_norm": 0.2728278338909149, + "learning_rate": 6.972957427256594e-05, + "loss": 0.7786, + "step": 17570 + }, + { + "epoch": 2.12, + "grad_norm": 0.27763307094573975, + "learning_rate": 6.964266332806966e-05, + "loss": 0.735, + "step": 17575 + }, + { + "epoch": 2.12, + "grad_norm": 0.27662762999534607, + "learning_rate": 6.955579020240459e-05, + "loss": 0.8434, + "step": 17580 + }, + { + "epoch": 2.12, + "grad_norm": 0.26812487840652466, + "learning_rate": 6.946895493645613e-05, + "loss": 0.7325, + "step": 17585 + }, + { + "epoch": 2.12, + "grad_norm": 0.27408894896507263, + "learning_rate": 6.938215757109176e-05, + "loss": 0.9187, + "step": 17590 + }, + { + "epoch": 2.12, + "grad_norm": 0.27132144570350647, + "learning_rate": 6.929539814716136e-05, + "loss": 0.7242, + "step": 17595 + }, + { + "epoch": 2.12, + "grad_norm": 0.3133727014064789, + "learning_rate": 6.920867670549668e-05, + "loss": 0.8118, + "step": 17600 + }, + { + "epoch": 2.12, + "grad_norm": 0.2706137001514435, + "learning_rate": 6.912199328691175e-05, + "loss": 0.7838, + "step": 17605 + }, + { + "epoch": 2.12, + "grad_norm": 0.26105305552482605, + "learning_rate": 6.90353479322026e-05, + "loss": 0.821, + "step": 17610 + }, + { + "epoch": 2.12, + "grad_norm": 0.26275086402893066, + "learning_rate": 6.894874068214751e-05, + "loss": 0.8778, + "step": 17615 + }, + { + "epoch": 2.12, + "grad_norm": 0.28266021609306335, + "learning_rate": 6.88621715775067e-05, + "loss": 0.91, + "step": 17620 + }, + { + "epoch": 2.12, + "grad_norm": 0.2602560520172119, + "learning_rate": 6.877564065902245e-05, + "loss": 0.8879, + "step": 17625 + }, + { + "epoch": 2.12, + "grad_norm": 0.2614297568798065, + "learning_rate": 6.868914796741907e-05, + "loss": 0.7482, + "step": 17630 + }, + { + "epoch": 2.12, + "grad_norm": 0.2839134633541107, + "learning_rate": 6.860269354340292e-05, + "loss": 0.7748, + "step": 17635 + }, + { + "epoch": 2.13, + "grad_norm": 0.2632676064968109, + "learning_rate": 6.85162774276623e-05, + "loss": 0.8696, + "step": 17640 + }, + { + "epoch": 2.13, + "grad_norm": 0.24448035657405853, + "learning_rate": 6.842989966086751e-05, + "loss": 0.7791, + "step": 17645 + }, + { + "epoch": 2.13, + "grad_norm": 0.289100706577301, + "learning_rate": 6.834356028367076e-05, + "loss": 0.7354, + "step": 17650 + }, + { + "epoch": 2.13, + "grad_norm": 0.27051296830177307, + "learning_rate": 6.82572593367063e-05, + "loss": 0.9232, + "step": 17655 + }, + { + "epoch": 2.13, + "grad_norm": 0.25651922821998596, + "learning_rate": 6.81709968605902e-05, + "loss": 0.8086, + "step": 17660 + }, + { + "epoch": 2.13, + "grad_norm": 0.24785958230495453, + "learning_rate": 6.808477289592045e-05, + "loss": 0.7559, + "step": 17665 + }, + { + "epoch": 2.13, + "grad_norm": 0.25814223289489746, + "learning_rate": 6.799858748327681e-05, + "loss": 0.8003, + "step": 17670 + }, + { + "epoch": 2.13, + "grad_norm": 0.2958534061908722, + "learning_rate": 6.791244066322115e-05, + "loss": 0.8522, + "step": 17675 + }, + { + "epoch": 2.13, + "grad_norm": 0.27761539816856384, + "learning_rate": 6.782633247629697e-05, + "loss": 0.8069, + "step": 17680 + }, + { + "epoch": 2.13, + "grad_norm": 0.33598434925079346, + "learning_rate": 6.774026296302963e-05, + "loss": 0.7955, + "step": 17685 + }, + { + "epoch": 2.13, + "grad_norm": 0.26065829396247864, + "learning_rate": 6.765423216392623e-05, + "loss": 0.751, + "step": 17690 + }, + { + "epoch": 2.13, + "grad_norm": 0.2681087553501129, + "learning_rate": 6.756824011947586e-05, + "loss": 0.7626, + "step": 17695 + }, + { + "epoch": 2.13, + "grad_norm": 0.27821341156959534, + "learning_rate": 6.748228687014915e-05, + "loss": 0.6555, + "step": 17700 + }, + { + "epoch": 2.13, + "grad_norm": 0.2935525178909302, + "learning_rate": 6.739637245639858e-05, + "loss": 0.8103, + "step": 17705 + }, + { + "epoch": 2.13, + "grad_norm": 0.29609110951423645, + "learning_rate": 6.73104969186583e-05, + "loss": 0.8574, + "step": 17710 + }, + { + "epoch": 2.13, + "grad_norm": 0.30124762654304504, + "learning_rate": 6.722466029734422e-05, + "loss": 0.865, + "step": 17715 + }, + { + "epoch": 2.14, + "grad_norm": 0.26538020372390747, + "learning_rate": 6.713886263285388e-05, + "loss": 0.7681, + "step": 17720 + }, + { + "epoch": 2.14, + "grad_norm": 0.26841890811920166, + "learning_rate": 6.705310396556651e-05, + "loss": 0.7141, + "step": 17725 + }, + { + "epoch": 2.14, + "grad_norm": 0.2693963050842285, + "learning_rate": 6.696738433584295e-05, + "loss": 0.7318, + "step": 17730 + }, + { + "epoch": 2.14, + "grad_norm": 0.3165275752544403, + "learning_rate": 6.688170378402581e-05, + "loss": 0.7613, + "step": 17735 + }, + { + "epoch": 2.14, + "grad_norm": 0.2781963050365448, + "learning_rate": 6.679606235043913e-05, + "loss": 0.7928, + "step": 17740 + }, + { + "epoch": 2.14, + "grad_norm": 0.2980717122554779, + "learning_rate": 6.671046007538862e-05, + "loss": 0.7451, + "step": 17745 + }, + { + "epoch": 2.14, + "grad_norm": 0.26822733879089355, + "learning_rate": 6.662489699916153e-05, + "loss": 0.8043, + "step": 17750 + }, + { + "epoch": 2.14, + "grad_norm": 0.2781360447406769, + "learning_rate": 6.653937316202675e-05, + "loss": 0.8289, + "step": 17755 + }, + { + "epoch": 2.14, + "grad_norm": 0.28398361802101135, + "learning_rate": 6.64538886042346e-05, + "loss": 0.7195, + "step": 17760 + }, + { + "epoch": 2.14, + "grad_norm": 0.28372859954833984, + "learning_rate": 6.636844336601695e-05, + "loss": 0.8084, + "step": 17765 + }, + { + "epoch": 2.14, + "grad_norm": 0.2855859696865082, + "learning_rate": 6.628303748758719e-05, + "loss": 0.7861, + "step": 17770 + }, + { + "epoch": 2.14, + "grad_norm": 0.2785280644893646, + "learning_rate": 6.619767100914013e-05, + "loss": 0.8347, + "step": 17775 + }, + { + "epoch": 2.14, + "grad_norm": 0.2754531800746918, + "learning_rate": 6.611234397085207e-05, + "loss": 0.8999, + "step": 17780 + }, + { + "epoch": 2.14, + "grad_norm": 0.2457299828529358, + "learning_rate": 6.602705641288078e-05, + "loss": 0.6812, + "step": 17785 + }, + { + "epoch": 2.14, + "grad_norm": 0.2855609357357025, + "learning_rate": 6.594180837536533e-05, + "loss": 0.7006, + "step": 17790 + }, + { + "epoch": 2.14, + "grad_norm": 0.23807214200496674, + "learning_rate": 6.585659989842641e-05, + "loss": 0.7315, + "step": 17795 + }, + { + "epoch": 2.14, + "grad_norm": 0.2731025815010071, + "learning_rate": 6.57714310221659e-05, + "loss": 0.8854, + "step": 17800 + }, + { + "epoch": 2.15, + "grad_norm": 0.2559722363948822, + "learning_rate": 6.568630178666706e-05, + "loss": 0.7751, + "step": 17805 + }, + { + "epoch": 2.15, + "grad_norm": 0.28620442748069763, + "learning_rate": 6.560121223199455e-05, + "loss": 0.85, + "step": 17810 + }, + { + "epoch": 2.15, + "grad_norm": 0.2663728892803192, + "learning_rate": 6.551616239819441e-05, + "loss": 0.9119, + "step": 17815 + }, + { + "epoch": 2.15, + "grad_norm": 0.2895793318748474, + "learning_rate": 6.543115232529386e-05, + "loss": 0.8386, + "step": 17820 + }, + { + "epoch": 2.15, + "grad_norm": 0.25901633501052856, + "learning_rate": 6.534618205330146e-05, + "loss": 0.8167, + "step": 17825 + }, + { + "epoch": 2.15, + "grad_norm": 0.2944163382053375, + "learning_rate": 6.526125162220702e-05, + "loss": 0.8255, + "step": 17830 + }, + { + "epoch": 2.15, + "grad_norm": 0.27901577949523926, + "learning_rate": 6.51763610719817e-05, + "loss": 0.7202, + "step": 17835 + }, + { + "epoch": 2.15, + "grad_norm": 0.25898268818855286, + "learning_rate": 6.509151044257776e-05, + "loss": 0.7927, + "step": 17840 + }, + { + "epoch": 2.15, + "grad_norm": 0.27605336904525757, + "learning_rate": 6.500669977392874e-05, + "loss": 0.9295, + "step": 17845 + }, + { + "epoch": 2.15, + "grad_norm": 0.3069280683994293, + "learning_rate": 6.492192910594933e-05, + "loss": 0.8324, + "step": 17850 + }, + { + "epoch": 2.15, + "grad_norm": 0.2427961677312851, + "learning_rate": 6.483719847853545e-05, + "loss": 0.8261, + "step": 17855 + }, + { + "epoch": 2.15, + "grad_norm": 0.31651127338409424, + "learning_rate": 6.475250793156412e-05, + "loss": 0.7048, + "step": 17860 + }, + { + "epoch": 2.15, + "grad_norm": 0.3037257492542267, + "learning_rate": 6.466785750489357e-05, + "loss": 0.7998, + "step": 17865 + }, + { + "epoch": 2.15, + "grad_norm": 0.29183289408683777, + "learning_rate": 6.458324723836299e-05, + "loss": 0.7498, + "step": 17870 + }, + { + "epoch": 2.15, + "grad_norm": 0.264967679977417, + "learning_rate": 6.449867717179293e-05, + "loss": 0.844, + "step": 17875 + }, + { + "epoch": 2.15, + "grad_norm": 0.319900780916214, + "learning_rate": 6.44141473449848e-05, + "loss": 0.7544, + "step": 17880 + }, + { + "epoch": 2.15, + "grad_norm": 0.2805964946746826, + "learning_rate": 6.432965779772115e-05, + "loss": 0.8429, + "step": 17885 + }, + { + "epoch": 2.16, + "grad_norm": 0.24578483402729034, + "learning_rate": 6.424520856976551e-05, + "loss": 0.7295, + "step": 17890 + }, + { + "epoch": 2.16, + "grad_norm": 0.27586978673934937, + "learning_rate": 6.416079970086259e-05, + "loss": 0.9162, + "step": 17895 + }, + { + "epoch": 2.16, + "grad_norm": 0.2891899347305298, + "learning_rate": 6.407643123073797e-05, + "loss": 0.7682, + "step": 17900 + }, + { + "epoch": 2.16, + "grad_norm": 0.26478612422943115, + "learning_rate": 6.399210319909824e-05, + "loss": 0.8028, + "step": 17905 + }, + { + "epoch": 2.16, + "grad_norm": 0.26194480061531067, + "learning_rate": 6.390781564563093e-05, + "loss": 0.8431, + "step": 17910 + }, + { + "epoch": 2.16, + "grad_norm": 0.2496415376663208, + "learning_rate": 6.382356861000466e-05, + "loss": 0.8244, + "step": 17915 + }, + { + "epoch": 2.16, + "grad_norm": 0.2828780710697174, + "learning_rate": 6.373936213186884e-05, + "loss": 0.8356, + "step": 17920 + }, + { + "epoch": 2.16, + "grad_norm": 0.3063983917236328, + "learning_rate": 6.365519625085388e-05, + "loss": 0.9442, + "step": 17925 + }, + { + "epoch": 2.16, + "grad_norm": 0.2884746491909027, + "learning_rate": 6.357107100657088e-05, + "loss": 0.7947, + "step": 17930 + }, + { + "epoch": 2.16, + "grad_norm": 0.2872295379638672, + "learning_rate": 6.348698643861213e-05, + "loss": 0.8708, + "step": 17935 + }, + { + "epoch": 2.16, + "grad_norm": 0.2698356509208679, + "learning_rate": 6.340294258655056e-05, + "loss": 0.7472, + "step": 17940 + }, + { + "epoch": 2.16, + "grad_norm": 0.2601388394832611, + "learning_rate": 6.331893948994003e-05, + "loss": 0.7562, + "step": 17945 + }, + { + "epoch": 2.16, + "grad_norm": 0.27124738693237305, + "learning_rate": 6.32349771883151e-05, + "loss": 0.8861, + "step": 17950 + }, + { + "epoch": 2.16, + "grad_norm": 0.2834950089454651, + "learning_rate": 6.315105572119134e-05, + "loss": 0.8054, + "step": 17955 + }, + { + "epoch": 2.16, + "grad_norm": 0.26736146211624146, + "learning_rate": 6.306717512806492e-05, + "loss": 0.8403, + "step": 17960 + }, + { + "epoch": 2.16, + "grad_norm": 0.2949734330177307, + "learning_rate": 6.298333544841284e-05, + "loss": 0.788, + "step": 17965 + }, + { + "epoch": 2.17, + "grad_norm": 0.2661183178424835, + "learning_rate": 6.289953672169276e-05, + "loss": 0.7755, + "step": 17970 + }, + { + "epoch": 2.17, + "grad_norm": 0.23854653537273407, + "learning_rate": 6.281577898734329e-05, + "loss": 0.7737, + "step": 17975 + }, + { + "epoch": 2.17, + "grad_norm": 0.27708199620246887, + "learning_rate": 6.273206228478351e-05, + "loss": 0.7334, + "step": 17980 + }, + { + "epoch": 2.17, + "grad_norm": 0.25264453887939453, + "learning_rate": 6.264838665341331e-05, + "loss": 0.796, + "step": 17985 + }, + { + "epoch": 2.17, + "grad_norm": 0.2900809645652771, + "learning_rate": 6.25647521326132e-05, + "loss": 0.78, + "step": 17990 + }, + { + "epoch": 2.17, + "grad_norm": 0.24564304947853088, + "learning_rate": 6.248115876174438e-05, + "loss": 0.7526, + "step": 17995 + }, + { + "epoch": 2.17, + "grad_norm": 0.29007548093795776, + "learning_rate": 6.239760658014865e-05, + "loss": 0.8331, + "step": 18000 + }, + { + "epoch": 2.17, + "grad_norm": 0.2651544213294983, + "learning_rate": 6.231409562714845e-05, + "loss": 0.7203, + "step": 18005 + }, + { + "epoch": 2.17, + "grad_norm": 0.28148752450942993, + "learning_rate": 6.223062594204676e-05, + "loss": 0.7299, + "step": 18010 + }, + { + "epoch": 2.17, + "grad_norm": 0.2735466957092285, + "learning_rate": 6.214719756412729e-05, + "loss": 0.6825, + "step": 18015 + }, + { + "epoch": 2.17, + "grad_norm": 0.2654740810394287, + "learning_rate": 6.206381053265412e-05, + "loss": 0.8268, + "step": 18020 + }, + { + "epoch": 2.17, + "grad_norm": 0.264364093542099, + "learning_rate": 6.198046488687201e-05, + "loss": 0.7943, + "step": 18025 + }, + { + "epoch": 2.17, + "grad_norm": 0.2690427601337433, + "learning_rate": 6.18971606660061e-05, + "loss": 0.7951, + "step": 18030 + }, + { + "epoch": 2.17, + "grad_norm": 0.3025277853012085, + "learning_rate": 6.181389790926224e-05, + "loss": 0.7487, + "step": 18035 + }, + { + "epoch": 2.17, + "grad_norm": 0.2653854489326477, + "learning_rate": 6.173067665582659e-05, + "loss": 0.8685, + "step": 18040 + }, + { + "epoch": 2.17, + "grad_norm": 0.2819320559501648, + "learning_rate": 6.164749694486579e-05, + "loss": 0.769, + "step": 18045 + }, + { + "epoch": 2.17, + "grad_norm": 0.28492966294288635, + "learning_rate": 6.156435881552708e-05, + "loss": 0.821, + "step": 18050 + }, + { + "epoch": 2.18, + "grad_norm": 0.24990375339984894, + "learning_rate": 6.148126230693796e-05, + "loss": 0.9141, + "step": 18055 + }, + { + "epoch": 2.18, + "grad_norm": 0.2689098119735718, + "learning_rate": 6.139820745820643e-05, + "loss": 0.7734, + "step": 18060 + }, + { + "epoch": 2.18, + "grad_norm": 0.2601291239261627, + "learning_rate": 6.131519430842083e-05, + "loss": 0.7885, + "step": 18065 + }, + { + "epoch": 2.18, + "grad_norm": 0.26659342646598816, + "learning_rate": 6.123222289664993e-05, + "loss": 0.8574, + "step": 18070 + }, + { + "epoch": 2.18, + "grad_norm": 0.260973185300827, + "learning_rate": 6.114929326194281e-05, + "loss": 0.7629, + "step": 18075 + }, + { + "epoch": 2.18, + "grad_norm": 0.2958158850669861, + "learning_rate": 6.106640544332894e-05, + "loss": 0.8406, + "step": 18080 + }, + { + "epoch": 2.18, + "grad_norm": 0.27698177099227905, + "learning_rate": 6.0983559479817986e-05, + "loss": 0.7944, + "step": 18085 + }, + { + "epoch": 2.18, + "grad_norm": 0.26226866245269775, + "learning_rate": 6.090075541040015e-05, + "loss": 0.761, + "step": 18090 + }, + { + "epoch": 2.18, + "grad_norm": 0.28929901123046875, + "learning_rate": 6.08179932740457e-05, + "loss": 0.7879, + "step": 18095 + }, + { + "epoch": 2.18, + "grad_norm": 0.2864818871021271, + "learning_rate": 6.073527310970527e-05, + "loss": 0.7526, + "step": 18100 + }, + { + "epoch": 2.18, + "grad_norm": 0.28640732169151306, + "learning_rate": 6.065259495630966e-05, + "loss": 0.7588, + "step": 18105 + }, + { + "epoch": 2.18, + "grad_norm": 0.25879958271980286, + "learning_rate": 6.0569958852770026e-05, + "loss": 0.8613, + "step": 18110 + }, + { + "epoch": 2.18, + "grad_norm": 0.26180025935173035, + "learning_rate": 6.048736483797765e-05, + "loss": 0.7988, + "step": 18115 + }, + { + "epoch": 2.18, + "grad_norm": 0.28200724720954895, + "learning_rate": 6.040481295080402e-05, + "loss": 0.7577, + "step": 18120 + }, + { + "epoch": 2.18, + "grad_norm": 0.27897992730140686, + "learning_rate": 6.0322303230100706e-05, + "loss": 0.7339, + "step": 18125 + }, + { + "epoch": 2.18, + "grad_norm": 0.2738398313522339, + "learning_rate": 6.0239835714699656e-05, + "loss": 0.7929, + "step": 18130 + }, + { + "epoch": 2.19, + "grad_norm": 0.2769078016281128, + "learning_rate": 6.015741044341282e-05, + "loss": 0.8469, + "step": 18135 + }, + { + "epoch": 2.19, + "grad_norm": 0.2504939138889313, + "learning_rate": 6.0075027455032154e-05, + "loss": 0.806, + "step": 18140 + }, + { + "epoch": 2.19, + "grad_norm": 0.27813395857810974, + "learning_rate": 5.999268678832982e-05, + "loss": 0.8322, + "step": 18145 + }, + { + "epoch": 2.19, + "grad_norm": 0.25341561436653137, + "learning_rate": 5.9910388482058196e-05, + "loss": 0.8823, + "step": 18150 + }, + { + "epoch": 2.19, + "grad_norm": 0.2485748827457428, + "learning_rate": 5.982813257494954e-05, + "loss": 0.7445, + "step": 18155 + }, + { + "epoch": 2.19, + "grad_norm": 0.2577187120914459, + "learning_rate": 5.9745919105716195e-05, + "loss": 0.7756, + "step": 18160 + }, + { + "epoch": 2.19, + "grad_norm": 0.28886187076568604, + "learning_rate": 5.966374811305051e-05, + "loss": 0.8127, + "step": 18165 + }, + { + "epoch": 2.19, + "grad_norm": 0.27744728326797485, + "learning_rate": 5.9581619635625014e-05, + "loss": 0.815, + "step": 18170 + }, + { + "epoch": 2.19, + "grad_norm": 0.2894884943962097, + "learning_rate": 5.9499533712092e-05, + "loss": 0.8005, + "step": 18175 + }, + { + "epoch": 2.19, + "grad_norm": 0.3021494448184967, + "learning_rate": 5.941749038108385e-05, + "loss": 0.8278, + "step": 18180 + }, + { + "epoch": 2.19, + "grad_norm": 0.2534330189228058, + "learning_rate": 5.9335489681212835e-05, + "loss": 0.8549, + "step": 18185 + }, + { + "epoch": 2.19, + "grad_norm": 0.3106016516685486, + "learning_rate": 5.92535316510713e-05, + "loss": 0.6661, + "step": 18190 + }, + { + "epoch": 2.19, + "grad_norm": 0.2976013422012329, + "learning_rate": 5.9171616329231364e-05, + "loss": 0.824, + "step": 18195 + }, + { + "epoch": 2.19, + "grad_norm": 0.2501593232154846, + "learning_rate": 5.90897437542451e-05, + "loss": 0.787, + "step": 18200 + }, + { + "epoch": 2.19, + "grad_norm": 0.28381872177124023, + "learning_rate": 5.900791396464445e-05, + "loss": 0.7354, + "step": 18205 + }, + { + "epoch": 2.19, + "grad_norm": 0.259772390127182, + "learning_rate": 5.892612699894127e-05, + "loss": 0.789, + "step": 18210 + }, + { + "epoch": 2.19, + "grad_norm": 0.24163563549518585, + "learning_rate": 5.884438289562717e-05, + "loss": 0.8713, + "step": 18215 + }, + { + "epoch": 2.2, + "grad_norm": 0.2834208011627197, + "learning_rate": 5.8762681693173675e-05, + "loss": 0.841, + "step": 18220 + }, + { + "epoch": 2.2, + "grad_norm": 0.2929078936576843, + "learning_rate": 5.868102343003201e-05, + "loss": 0.7945, + "step": 18225 + }, + { + "epoch": 2.2, + "grad_norm": 0.30455702543258667, + "learning_rate": 5.8599408144633405e-05, + "loss": 0.7069, + "step": 18230 + }, + { + "epoch": 2.2, + "grad_norm": 0.3452713191509247, + "learning_rate": 5.851783587538863e-05, + "loss": 0.8092, + "step": 18235 + }, + { + "epoch": 2.2, + "grad_norm": 0.26134809851646423, + "learning_rate": 5.843630666068832e-05, + "loss": 0.8087, + "step": 18240 + }, + { + "epoch": 2.2, + "grad_norm": 0.30902811884880066, + "learning_rate": 5.835482053890278e-05, + "loss": 0.8481, + "step": 18245 + }, + { + "epoch": 2.2, + "grad_norm": 0.31992846727371216, + "learning_rate": 5.827337754838218e-05, + "loss": 0.7905, + "step": 18250 + }, + { + "epoch": 2.2, + "grad_norm": 0.2618788182735443, + "learning_rate": 5.819197772745627e-05, + "loss": 0.7629, + "step": 18255 + }, + { + "epoch": 2.2, + "grad_norm": 0.28527355194091797, + "learning_rate": 5.811062111443447e-05, + "loss": 0.826, + "step": 18260 + }, + { + "epoch": 2.2, + "grad_norm": 0.31207770109176636, + "learning_rate": 5.8029307747605905e-05, + "loss": 0.7721, + "step": 18265 + }, + { + "epoch": 2.2, + "grad_norm": 0.250207781791687, + "learning_rate": 5.794803766523939e-05, + "loss": 0.7803, + "step": 18270 + }, + { + "epoch": 2.2, + "grad_norm": 0.29229018092155457, + "learning_rate": 5.786681090558332e-05, + "loss": 0.8023, + "step": 18275 + }, + { + "epoch": 2.2, + "grad_norm": 0.273379385471344, + "learning_rate": 5.778562750686568e-05, + "loss": 0.7844, + "step": 18280 + }, + { + "epoch": 2.2, + "grad_norm": 0.27281737327575684, + "learning_rate": 5.770448750729408e-05, + "loss": 0.8071, + "step": 18285 + }, + { + "epoch": 2.2, + "grad_norm": 0.2896445095539093, + "learning_rate": 5.762339094505569e-05, + "loss": 0.7641, + "step": 18290 + }, + { + "epoch": 2.2, + "grad_norm": 0.2542775571346283, + "learning_rate": 5.7542337858317257e-05, + "loss": 0.8674, + "step": 18295 + }, + { + "epoch": 2.2, + "grad_norm": 0.2890925705432892, + "learning_rate": 5.746132828522506e-05, + "loss": 0.7799, + "step": 18300 + }, + { + "epoch": 2.21, + "grad_norm": 0.303602397441864, + "learning_rate": 5.738036226390483e-05, + "loss": 0.7872, + "step": 18305 + }, + { + "epoch": 2.21, + "grad_norm": 0.31776705384254456, + "learning_rate": 5.729943983246198e-05, + "loss": 0.8462, + "step": 18310 + }, + { + "epoch": 2.21, + "grad_norm": 0.2585162818431854, + "learning_rate": 5.721856102898121e-05, + "loss": 0.8747, + "step": 18315 + }, + { + "epoch": 2.21, + "grad_norm": 0.2587442696094513, + "learning_rate": 5.713772589152682e-05, + "loss": 0.8068, + "step": 18320 + }, + { + "epoch": 2.21, + "grad_norm": 0.26109957695007324, + "learning_rate": 5.705693445814243e-05, + "loss": 0.7185, + "step": 18325 + }, + { + "epoch": 2.21, + "grad_norm": 0.27592533826828003, + "learning_rate": 5.697618676685127e-05, + "loss": 0.8076, + "step": 18330 + }, + { + "epoch": 2.21, + "grad_norm": 0.26613497734069824, + "learning_rate": 5.689548285565585e-05, + "loss": 0.803, + "step": 18335 + }, + { + "epoch": 2.21, + "grad_norm": 0.2756407558917999, + "learning_rate": 5.681482276253811e-05, + "loss": 0.836, + "step": 18340 + }, + { + "epoch": 2.21, + "grad_norm": 0.2865229845046997, + "learning_rate": 5.6734206525459355e-05, + "loss": 0.7962, + "step": 18345 + }, + { + "epoch": 2.21, + "grad_norm": 0.2565017342567444, + "learning_rate": 5.6653634182360267e-05, + "loss": 0.8133, + "step": 18350 + }, + { + "epoch": 2.21, + "grad_norm": 0.2665974497795105, + "learning_rate": 5.6573105771160875e-05, + "loss": 0.7833, + "step": 18355 + }, + { + "epoch": 2.21, + "grad_norm": 0.3112146556377411, + "learning_rate": 5.6492621329760524e-05, + "loss": 0.8103, + "step": 18360 + }, + { + "epoch": 2.21, + "grad_norm": 0.299140602350235, + "learning_rate": 5.641218089603779e-05, + "loss": 0.7744, + "step": 18365 + }, + { + "epoch": 2.21, + "grad_norm": 0.25373366475105286, + "learning_rate": 5.6331784507850744e-05, + "loss": 0.8986, + "step": 18370 + }, + { + "epoch": 2.21, + "grad_norm": 0.27519750595092773, + "learning_rate": 5.6251432203036544e-05, + "loss": 0.791, + "step": 18375 + }, + { + "epoch": 2.21, + "grad_norm": 0.253791481256485, + "learning_rate": 5.617112401941163e-05, + "loss": 0.7265, + "step": 18380 + }, + { + "epoch": 2.22, + "grad_norm": 0.26284411549568176, + "learning_rate": 5.609085999477166e-05, + "loss": 0.8349, + "step": 18385 + }, + { + "epoch": 2.22, + "grad_norm": 0.26994234323501587, + "learning_rate": 5.601064016689165e-05, + "loss": 0.7947, + "step": 18390 + }, + { + "epoch": 2.22, + "grad_norm": 0.3026762902736664, + "learning_rate": 5.593046457352568e-05, + "loss": 0.8011, + "step": 18395 + }, + { + "epoch": 2.22, + "grad_norm": 0.2768292725086212, + "learning_rate": 5.585033325240704e-05, + "loss": 0.7135, + "step": 18400 + }, + { + "epoch": 2.22, + "grad_norm": 0.28852131962776184, + "learning_rate": 5.5770246241248125e-05, + "loss": 0.8516, + "step": 18405 + }, + { + "epoch": 2.22, + "grad_norm": 0.28244367241859436, + "learning_rate": 5.5690203577740654e-05, + "loss": 0.7431, + "step": 18410 + }, + { + "epoch": 2.22, + "grad_norm": 0.2686423361301422, + "learning_rate": 5.561020529955531e-05, + "loss": 0.7382, + "step": 18415 + }, + { + "epoch": 2.22, + "grad_norm": 0.2730972170829773, + "learning_rate": 5.5530251444341936e-05, + "loss": 0.8299, + "step": 18420 + }, + { + "epoch": 2.22, + "grad_norm": 0.26289600133895874, + "learning_rate": 5.5450342049729475e-05, + "loss": 0.7141, + "step": 18425 + }, + { + "epoch": 2.22, + "grad_norm": 0.2697264552116394, + "learning_rate": 5.5370477153325945e-05, + "loss": 0.7229, + "step": 18430 + }, + { + "epoch": 2.22, + "grad_norm": 0.2662758231163025, + "learning_rate": 5.529065679271841e-05, + "loss": 0.8089, + "step": 18435 + }, + { + "epoch": 2.22, + "grad_norm": 0.2873300313949585, + "learning_rate": 5.5210881005473e-05, + "loss": 0.8195, + "step": 18440 + }, + { + "epoch": 2.22, + "grad_norm": 0.29123494029045105, + "learning_rate": 5.513114982913479e-05, + "loss": 0.8099, + "step": 18445 + }, + { + "epoch": 2.22, + "grad_norm": 0.2564477324485779, + "learning_rate": 5.5051463301228e-05, + "loss": 0.675, + "step": 18450 + }, + { + "epoch": 2.22, + "grad_norm": 0.29058778285980225, + "learning_rate": 5.497182145925575e-05, + "loss": 0.7853, + "step": 18455 + }, + { + "epoch": 2.22, + "grad_norm": 0.32175374031066895, + "learning_rate": 5.489222434070009e-05, + "loss": 0.8176, + "step": 18460 + }, + { + "epoch": 2.22, + "grad_norm": 0.291685551404953, + "learning_rate": 5.4812671983022046e-05, + "loss": 0.7884, + "step": 18465 + }, + { + "epoch": 2.23, + "grad_norm": 0.25728997588157654, + "learning_rate": 5.473316442366167e-05, + "loss": 0.824, + "step": 18470 + }, + { + "epoch": 2.23, + "grad_norm": 0.2623702883720398, + "learning_rate": 5.465370170003785e-05, + "loss": 0.8171, + "step": 18475 + }, + { + "epoch": 2.23, + "grad_norm": 0.24209091067314148, + "learning_rate": 5.4574283849548354e-05, + "loss": 0.8101, + "step": 18480 + }, + { + "epoch": 2.23, + "grad_norm": 0.2494385689496994, + "learning_rate": 5.449491090956982e-05, + "loss": 0.7616, + "step": 18485 + }, + { + "epoch": 2.23, + "grad_norm": 0.307625949382782, + "learning_rate": 5.44155829174579e-05, + "loss": 0.8873, + "step": 18490 + }, + { + "epoch": 2.23, + "grad_norm": 0.27069059014320374, + "learning_rate": 5.433629991054691e-05, + "loss": 0.8159, + "step": 18495 + }, + { + "epoch": 2.23, + "grad_norm": 0.2697925567626953, + "learning_rate": 5.425706192615007e-05, + "loss": 0.8165, + "step": 18500 + }, + { + "epoch": 2.23, + "grad_norm": 0.2592358887195587, + "learning_rate": 5.417786900155942e-05, + "loss": 0.7355, + "step": 18505 + }, + { + "epoch": 2.23, + "grad_norm": 0.28814736008644104, + "learning_rate": 5.409872117404577e-05, + "loss": 0.8358, + "step": 18510 + }, + { + "epoch": 2.23, + "grad_norm": 0.2974863350391388, + "learning_rate": 5.401961848085871e-05, + "loss": 0.7225, + "step": 18515 + }, + { + "epoch": 2.23, + "grad_norm": 0.28759467601776123, + "learning_rate": 5.394056095922662e-05, + "loss": 0.74, + "step": 18520 + }, + { + "epoch": 2.23, + "grad_norm": 0.3033956289291382, + "learning_rate": 5.3861548646356514e-05, + "loss": 0.7645, + "step": 18525 + }, + { + "epoch": 2.23, + "grad_norm": 0.2899126410484314, + "learning_rate": 5.3782581579434325e-05, + "loss": 0.7309, + "step": 18530 + }, + { + "epoch": 2.23, + "grad_norm": 0.2775956690311432, + "learning_rate": 5.370365979562453e-05, + "loss": 0.8238, + "step": 18535 + }, + { + "epoch": 2.23, + "grad_norm": 0.2776409983634949, + "learning_rate": 5.362478333207034e-05, + "loss": 0.84, + "step": 18540 + }, + { + "epoch": 2.23, + "grad_norm": 0.27701374888420105, + "learning_rate": 5.354595222589358e-05, + "loss": 0.7884, + "step": 18545 + }, + { + "epoch": 2.24, + "grad_norm": 0.3034186065196991, + "learning_rate": 5.34671665141949e-05, + "loss": 0.7565, + "step": 18550 + }, + { + "epoch": 2.24, + "grad_norm": 0.2622527480125427, + "learning_rate": 5.3388426234053414e-05, + "loss": 0.7612, + "step": 18555 + }, + { + "epoch": 2.24, + "grad_norm": 0.28101763129234314, + "learning_rate": 5.330973142252691e-05, + "loss": 0.7192, + "step": 18560 + }, + { + "epoch": 2.24, + "grad_norm": 0.2687520384788513, + "learning_rate": 5.323108211665178e-05, + "loss": 0.8082, + "step": 18565 + }, + { + "epoch": 2.24, + "grad_norm": 0.31704583764076233, + "learning_rate": 5.3152478353443006e-05, + "loss": 0.6957, + "step": 18570 + }, + { + "epoch": 2.24, + "grad_norm": 0.26655253767967224, + "learning_rate": 5.307392016989413e-05, + "loss": 0.8078, + "step": 18575 + }, + { + "epoch": 2.24, + "grad_norm": 0.27396515011787415, + "learning_rate": 5.299540760297717e-05, + "loss": 0.8209, + "step": 18580 + }, + { + "epoch": 2.24, + "grad_norm": 0.24590495228767395, + "learning_rate": 5.2916940689642887e-05, + "loss": 0.8348, + "step": 18585 + }, + { + "epoch": 2.24, + "grad_norm": 0.23811030387878418, + "learning_rate": 5.283851946682033e-05, + "loss": 0.771, + "step": 18590 + }, + { + "epoch": 2.24, + "grad_norm": 0.2999459505081177, + "learning_rate": 5.2760143971417155e-05, + "loss": 0.7364, + "step": 18595 + }, + { + "epoch": 2.24, + "grad_norm": 0.257988840341568, + "learning_rate": 5.2681814240319415e-05, + "loss": 0.7833, + "step": 18600 + }, + { + "epoch": 2.24, + "grad_norm": 0.27727702260017395, + "learning_rate": 5.260353031039177e-05, + "loss": 0.8248, + "step": 18605 + }, + { + "epoch": 2.24, + "grad_norm": 0.24502165615558624, + "learning_rate": 5.252529221847719e-05, + "loss": 0.7083, + "step": 18610 + }, + { + "epoch": 2.24, + "grad_norm": 0.2822337746620178, + "learning_rate": 5.244710000139714e-05, + "loss": 0.8521, + "step": 18615 + }, + { + "epoch": 2.24, + "grad_norm": 0.2731589674949646, + "learning_rate": 5.2368953695951405e-05, + "loss": 0.8219, + "step": 18620 + }, + { + "epoch": 2.24, + "grad_norm": 0.2679608464241028, + "learning_rate": 5.229085333891834e-05, + "loss": 0.73, + "step": 18625 + }, + { + "epoch": 2.24, + "grad_norm": 0.2607372999191284, + "learning_rate": 5.221279896705452e-05, + "loss": 0.7845, + "step": 18630 + }, + { + "epoch": 2.25, + "grad_norm": 0.2670726478099823, + "learning_rate": 5.213479061709492e-05, + "loss": 0.8545, + "step": 18635 + }, + { + "epoch": 2.25, + "grad_norm": 0.29974955320358276, + "learning_rate": 5.2056828325752855e-05, + "loss": 0.8346, + "step": 18640 + }, + { + "epoch": 2.25, + "grad_norm": 0.2804435193538666, + "learning_rate": 5.1978912129719956e-05, + "loss": 0.7781, + "step": 18645 + }, + { + "epoch": 2.25, + "grad_norm": 0.27671873569488525, + "learning_rate": 5.1901042065666214e-05, + "loss": 0.8077, + "step": 18650 + }, + { + "epoch": 2.25, + "grad_norm": 0.28457868099212646, + "learning_rate": 5.182321817023983e-05, + "loss": 0.8014, + "step": 18655 + }, + { + "epoch": 2.25, + "grad_norm": 0.2779267728328705, + "learning_rate": 5.174544048006726e-05, + "loss": 0.8311, + "step": 18660 + }, + { + "epoch": 2.25, + "grad_norm": 0.26867663860321045, + "learning_rate": 5.1667709031753406e-05, + "loss": 0.7116, + "step": 18665 + }, + { + "epoch": 2.25, + "grad_norm": 0.27532848715782166, + "learning_rate": 5.159002386188118e-05, + "loss": 0.9608, + "step": 18670 + }, + { + "epoch": 2.25, + "grad_norm": 0.2576589286327362, + "learning_rate": 5.151238500701184e-05, + "loss": 0.7806, + "step": 18675 + }, + { + "epoch": 2.25, + "grad_norm": 0.29367679357528687, + "learning_rate": 5.1434792503684716e-05, + "loss": 0.7674, + "step": 18680 + }, + { + "epoch": 2.25, + "grad_norm": 0.2842334806919098, + "learning_rate": 5.135724638841755e-05, + "loss": 0.7849, + "step": 18685 + }, + { + "epoch": 2.25, + "grad_norm": 0.305401474237442, + "learning_rate": 5.1279746697706055e-05, + "loss": 0.7973, + "step": 18690 + }, + { + "epoch": 2.25, + "grad_norm": 0.2906571328639984, + "learning_rate": 5.120229346802416e-05, + "loss": 0.808, + "step": 18695 + }, + { + "epoch": 2.25, + "grad_norm": 0.2675495445728302, + "learning_rate": 5.112488673582389e-05, + "loss": 0.7582, + "step": 18700 + }, + { + "epoch": 2.25, + "grad_norm": 0.2715294063091278, + "learning_rate": 5.1047526537535504e-05, + "loss": 0.779, + "step": 18705 + }, + { + "epoch": 2.25, + "grad_norm": 0.29356223344802856, + "learning_rate": 5.0970212909567236e-05, + "loss": 0.8705, + "step": 18710 + }, + { + "epoch": 2.25, + "grad_norm": 0.2562316656112671, + "learning_rate": 5.0892945888305446e-05, + "loss": 0.7458, + "step": 18715 + }, + { + "epoch": 2.26, + "grad_norm": 0.31263595819473267, + "learning_rate": 5.081572551011458e-05, + "loss": 0.8115, + "step": 18720 + }, + { + "epoch": 2.26, + "grad_norm": 0.31803205609321594, + "learning_rate": 5.07385518113371e-05, + "loss": 0.7623, + "step": 18725 + }, + { + "epoch": 2.26, + "grad_norm": 0.25509113073349, + "learning_rate": 5.06614248282935e-05, + "loss": 0.8839, + "step": 18730 + }, + { + "epoch": 2.26, + "grad_norm": 0.2617010474205017, + "learning_rate": 5.058434459728229e-05, + "loss": 0.7858, + "step": 18735 + }, + { + "epoch": 2.26, + "grad_norm": 0.29557377099990845, + "learning_rate": 5.0507311154579976e-05, + "loss": 0.8331, + "step": 18740 + }, + { + "epoch": 2.26, + "grad_norm": 0.28714311122894287, + "learning_rate": 5.043032453644113e-05, + "loss": 0.7831, + "step": 18745 + }, + { + "epoch": 2.26, + "grad_norm": 0.27089929580688477, + "learning_rate": 5.035338477909817e-05, + "loss": 0.8846, + "step": 18750 + }, + { + "epoch": 2.26, + "grad_norm": 0.2682959735393524, + "learning_rate": 5.027649191876147e-05, + "loss": 0.7482, + "step": 18755 + }, + { + "epoch": 2.26, + "grad_norm": 0.26213210821151733, + "learning_rate": 5.019964599161935e-05, + "loss": 0.7949, + "step": 18760 + }, + { + "epoch": 2.26, + "grad_norm": 0.30847010016441345, + "learning_rate": 5.0122847033838156e-05, + "loss": 0.7751, + "step": 18765 + }, + { + "epoch": 2.26, + "grad_norm": 0.25752073526382446, + "learning_rate": 5.004609508156196e-05, + "loss": 0.8719, + "step": 18770 + }, + { + "epoch": 2.26, + "grad_norm": 0.2857056260108948, + "learning_rate": 4.996939017091278e-05, + "loss": 0.7445, + "step": 18775 + }, + { + "epoch": 2.26, + "grad_norm": 0.2874182164669037, + "learning_rate": 4.989273233799051e-05, + "loss": 0.7717, + "step": 18780 + }, + { + "epoch": 2.26, + "grad_norm": 0.34927722811698914, + "learning_rate": 4.981612161887285e-05, + "loss": 0.7717, + "step": 18785 + }, + { + "epoch": 2.26, + "grad_norm": 0.2875741422176361, + "learning_rate": 4.973955804961536e-05, + "loss": 0.8041, + "step": 18790 + }, + { + "epoch": 2.26, + "grad_norm": 0.24138030409812927, + "learning_rate": 4.966304166625139e-05, + "loss": 0.8356, + "step": 18795 + }, + { + "epoch": 2.27, + "grad_norm": 0.28111588954925537, + "learning_rate": 4.958657250479208e-05, + "loss": 0.7551, + "step": 18800 + }, + { + "epoch": 2.27, + "grad_norm": 0.2540626525878906, + "learning_rate": 4.9510150601226394e-05, + "loss": 0.777, + "step": 18805 + }, + { + "epoch": 2.27, + "grad_norm": 0.26399436593055725, + "learning_rate": 4.9433775991521006e-05, + "loss": 0.8436, + "step": 18810 + }, + { + "epoch": 2.27, + "grad_norm": 0.27439209818840027, + "learning_rate": 4.9357448711620345e-05, + "loss": 0.7735, + "step": 18815 + }, + { + "epoch": 2.27, + "grad_norm": 0.2720988988876343, + "learning_rate": 4.9281168797446504e-05, + "loss": 0.8395, + "step": 18820 + }, + { + "epoch": 2.27, + "grad_norm": 0.30142033100128174, + "learning_rate": 4.920493628489946e-05, + "loss": 0.865, + "step": 18825 + }, + { + "epoch": 2.27, + "grad_norm": 0.27852845191955566, + "learning_rate": 4.9128751209856684e-05, + "loss": 0.8549, + "step": 18830 + }, + { + "epoch": 2.27, + "grad_norm": 0.2861536741256714, + "learning_rate": 4.9052613608173425e-05, + "loss": 0.77, + "step": 18835 + }, + { + "epoch": 2.27, + "grad_norm": 0.26632633805274963, + "learning_rate": 4.8976523515682505e-05, + "loss": 0.8792, + "step": 18840 + }, + { + "epoch": 2.27, + "grad_norm": 0.25601017475128174, + "learning_rate": 4.890048096819456e-05, + "loss": 0.6669, + "step": 18845 + }, + { + "epoch": 2.27, + "grad_norm": 0.29297733306884766, + "learning_rate": 4.882448600149767e-05, + "loss": 0.8335, + "step": 18850 + }, + { + "epoch": 2.27, + "grad_norm": 0.27074167132377625, + "learning_rate": 4.874853865135761e-05, + "loss": 0.7719, + "step": 18855 + }, + { + "epoch": 2.27, + "grad_norm": 0.2736327648162842, + "learning_rate": 4.8672638953517724e-05, + "loss": 0.7569, + "step": 18860 + }, + { + "epoch": 2.27, + "grad_norm": 0.2467767298221588, + "learning_rate": 4.859678694369892e-05, + "loss": 0.8277, + "step": 18865 + }, + { + "epoch": 2.27, + "grad_norm": 0.26508867740631104, + "learning_rate": 4.852098265759969e-05, + "loss": 0.7552, + "step": 18870 + }, + { + "epoch": 2.27, + "grad_norm": 0.295194149017334, + "learning_rate": 4.844522613089601e-05, + "loss": 0.7934, + "step": 18875 + }, + { + "epoch": 2.27, + "grad_norm": 0.2907750606536865, + "learning_rate": 4.836951739924141e-05, + "loss": 0.8823, + "step": 18880 + }, + { + "epoch": 2.28, + "grad_norm": 0.252069890499115, + "learning_rate": 4.829385649826702e-05, + "loss": 0.8101, + "step": 18885 + }, + { + "epoch": 2.28, + "grad_norm": 0.263295978307724, + "learning_rate": 4.82182434635813e-05, + "loss": 0.8344, + "step": 18890 + }, + { + "epoch": 2.28, + "grad_norm": 0.3141554892063141, + "learning_rate": 4.814267833077029e-05, + "loss": 0.8552, + "step": 18895 + }, + { + "epoch": 2.28, + "grad_norm": 0.2977995276451111, + "learning_rate": 4.806716113539737e-05, + "loss": 0.7844, + "step": 18900 + }, + { + "epoch": 2.28, + "grad_norm": 0.28713858127593994, + "learning_rate": 4.799169191300357e-05, + "loss": 0.7359, + "step": 18905 + }, + { + "epoch": 2.28, + "grad_norm": 0.265524297952652, + "learning_rate": 4.791627069910713e-05, + "loss": 0.7948, + "step": 18910 + }, + { + "epoch": 2.28, + "grad_norm": 0.26660722494125366, + "learning_rate": 4.784089752920381e-05, + "loss": 0.7537, + "step": 18915 + }, + { + "epoch": 2.28, + "grad_norm": 0.3201920688152313, + "learning_rate": 4.776557243876665e-05, + "loss": 0.6509, + "step": 18920 + }, + { + "epoch": 2.28, + "grad_norm": 0.25317713618278503, + "learning_rate": 4.7690295463246255e-05, + "loss": 0.8217, + "step": 18925 + }, + { + "epoch": 2.28, + "grad_norm": 0.2925649881362915, + "learning_rate": 4.761506663807047e-05, + "loss": 0.8093, + "step": 18930 + }, + { + "epoch": 2.28, + "grad_norm": 0.26825374364852905, + "learning_rate": 4.7539885998644365e-05, + "loss": 0.7968, + "step": 18935 + }, + { + "epoch": 2.28, + "grad_norm": 0.2798418402671814, + "learning_rate": 4.746475358035046e-05, + "loss": 0.8392, + "step": 18940 + }, + { + "epoch": 2.28, + "grad_norm": 0.25741979479789734, + "learning_rate": 4.738966941854866e-05, + "loss": 0.7922, + "step": 18945 + }, + { + "epoch": 2.28, + "grad_norm": 0.27443379163742065, + "learning_rate": 4.731463354857602e-05, + "loss": 0.8369, + "step": 18950 + }, + { + "epoch": 2.28, + "grad_norm": 0.2702893316745758, + "learning_rate": 4.7239646005746895e-05, + "loss": 0.7611, + "step": 18955 + }, + { + "epoch": 2.28, + "grad_norm": 0.38152143359184265, + "learning_rate": 4.716470682535289e-05, + "loss": 0.8706, + "step": 18960 + }, + { + "epoch": 2.29, + "grad_norm": 0.26766619086265564, + "learning_rate": 4.708981604266296e-05, + "loss": 0.8218, + "step": 18965 + }, + { + "epoch": 2.29, + "grad_norm": 0.25141772627830505, + "learning_rate": 4.701497369292313e-05, + "loss": 0.8075, + "step": 18970 + }, + { + "epoch": 2.29, + "grad_norm": 0.264624685049057, + "learning_rate": 4.694017981135671e-05, + "loss": 0.7692, + "step": 18975 + }, + { + "epoch": 2.29, + "grad_norm": 0.2943498194217682, + "learning_rate": 4.6865434433164125e-05, + "loss": 0.7622, + "step": 18980 + }, + { + "epoch": 2.29, + "grad_norm": 0.26615047454833984, + "learning_rate": 4.679073759352315e-05, + "loss": 0.7964, + "step": 18985 + }, + { + "epoch": 2.29, + "grad_norm": 0.24245823919773102, + "learning_rate": 4.671608932758853e-05, + "loss": 0.7825, + "step": 18990 + }, + { + "epoch": 2.29, + "grad_norm": 0.2933671772480011, + "learning_rate": 4.664148967049221e-05, + "loss": 0.8122, + "step": 18995 + }, + { + "epoch": 2.29, + "grad_norm": 0.2749022841453552, + "learning_rate": 4.656693865734331e-05, + "loss": 0.7729, + "step": 19000 + }, + { + "epoch": 2.29, + "grad_norm": 0.268289178609848, + "learning_rate": 4.649243632322796e-05, + "loss": 0.793, + "step": 19005 + }, + { + "epoch": 2.29, + "grad_norm": 0.28177735209465027, + "learning_rate": 4.641798270320948e-05, + "loss": 0.8523, + "step": 19010 + }, + { + "epoch": 2.29, + "grad_norm": 0.262329638004303, + "learning_rate": 4.6343577832328176e-05, + "loss": 0.8208, + "step": 19015 + }, + { + "epoch": 2.29, + "grad_norm": 0.31374141573905945, + "learning_rate": 4.626922174560142e-05, + "loss": 0.8154, + "step": 19020 + }, + { + "epoch": 2.29, + "grad_norm": 0.28644031286239624, + "learning_rate": 4.6194914478023754e-05, + "loss": 0.8359, + "step": 19025 + }, + { + "epoch": 2.29, + "grad_norm": 0.2689576745033264, + "learning_rate": 4.6120656064566604e-05, + "loss": 0.9421, + "step": 19030 + }, + { + "epoch": 2.29, + "grad_norm": 0.25812458992004395, + "learning_rate": 4.604644654017843e-05, + "loss": 0.8141, + "step": 19035 + }, + { + "epoch": 2.29, + "grad_norm": 0.2505223751068115, + "learning_rate": 4.597228593978464e-05, + "loss": 0.8084, + "step": 19040 + }, + { + "epoch": 2.29, + "grad_norm": 0.2971956431865692, + "learning_rate": 4.589817429828781e-05, + "loss": 0.7819, + "step": 19045 + }, + { + "epoch": 2.3, + "grad_norm": 0.274215430021286, + "learning_rate": 4.5824111650567264e-05, + "loss": 0.7431, + "step": 19050 + }, + { + "epoch": 2.3, + "grad_norm": 0.2821873724460602, + "learning_rate": 4.575009803147929e-05, + "loss": 0.8117, + "step": 19055 + }, + { + "epoch": 2.3, + "grad_norm": 0.2926546037197113, + "learning_rate": 4.567613347585727e-05, + "loss": 0.8768, + "step": 19060 + }, + { + "epoch": 2.3, + "grad_norm": 0.3123989701271057, + "learning_rate": 4.560221801851133e-05, + "loss": 0.7407, + "step": 19065 + }, + { + "epoch": 2.3, + "grad_norm": 0.3110482394695282, + "learning_rate": 4.552835169422854e-05, + "loss": 0.7504, + "step": 19070 + }, + { + "epoch": 2.3, + "grad_norm": 0.2745916545391083, + "learning_rate": 4.5454534537772825e-05, + "loss": 0.6989, + "step": 19075 + }, + { + "epoch": 2.3, + "grad_norm": 0.2758617401123047, + "learning_rate": 4.5380766583885016e-05, + "loss": 0.7739, + "step": 19080 + }, + { + "epoch": 2.3, + "grad_norm": 0.2954859733581543, + "learning_rate": 4.5307047867282734e-05, + "loss": 0.838, + "step": 19085 + }, + { + "epoch": 2.3, + "grad_norm": 0.285125732421875, + "learning_rate": 4.523337842266047e-05, + "loss": 0.7301, + "step": 19090 + }, + { + "epoch": 2.3, + "grad_norm": 0.2881282567977905, + "learning_rate": 4.515975828468949e-05, + "loss": 0.7589, + "step": 19095 + }, + { + "epoch": 2.3, + "grad_norm": 0.30516520142555237, + "learning_rate": 4.508618748801793e-05, + "loss": 0.8142, + "step": 19100 + }, + { + "epoch": 2.3, + "grad_norm": 0.315054714679718, + "learning_rate": 4.50126660672706e-05, + "loss": 0.7316, + "step": 19105 + }, + { + "epoch": 2.3, + "grad_norm": 0.2825736999511719, + "learning_rate": 4.493919405704917e-05, + "loss": 0.777, + "step": 19110 + }, + { + "epoch": 2.3, + "grad_norm": 0.2859044671058655, + "learning_rate": 4.486577149193191e-05, + "loss": 0.7306, + "step": 19115 + }, + { + "epoch": 2.3, + "grad_norm": 0.29017677903175354, + "learning_rate": 4.479239840647405e-05, + "loss": 0.8264, + "step": 19120 + }, + { + "epoch": 2.3, + "grad_norm": 0.28172457218170166, + "learning_rate": 4.471907483520732e-05, + "loss": 0.7622, + "step": 19125 + }, + { + "epoch": 2.3, + "grad_norm": 0.28697964549064636, + "learning_rate": 4.464580081264026e-05, + "loss": 0.8655, + "step": 19130 + }, + { + "epoch": 2.31, + "grad_norm": 0.290617972612381, + "learning_rate": 4.4572576373257986e-05, + "loss": 0.7366, + "step": 19135 + }, + { + "epoch": 2.31, + "grad_norm": 0.2692755460739136, + "learning_rate": 4.4499401551522504e-05, + "loss": 0.7704, + "step": 19140 + }, + { + "epoch": 2.31, + "grad_norm": 0.3180653750896454, + "learning_rate": 4.442627638187216e-05, + "loss": 0.7775, + "step": 19145 + }, + { + "epoch": 2.31, + "grad_norm": 0.3238975703716278, + "learning_rate": 4.435320089872217e-05, + "loss": 0.7988, + "step": 19150 + }, + { + "epoch": 2.31, + "grad_norm": 0.2899223864078522, + "learning_rate": 4.428017513646418e-05, + "loss": 0.7712, + "step": 19155 + }, + { + "epoch": 2.31, + "grad_norm": 0.2541576027870178, + "learning_rate": 4.4207199129466685e-05, + "loss": 0.8302, + "step": 19160 + }, + { + "epoch": 2.31, + "grad_norm": 0.3092675805091858, + "learning_rate": 4.4134272912074546e-05, + "loss": 0.7243, + "step": 19165 + }, + { + "epoch": 2.31, + "grad_norm": 0.2940780818462372, + "learning_rate": 4.406139651860928e-05, + "loss": 0.7353, + "step": 19170 + }, + { + "epoch": 2.31, + "grad_norm": 0.27896803617477417, + "learning_rate": 4.398856998336885e-05, + "loss": 0.7755, + "step": 19175 + }, + { + "epoch": 2.31, + "grad_norm": 0.275955468416214, + "learning_rate": 4.391579334062798e-05, + "loss": 0.6845, + "step": 19180 + }, + { + "epoch": 2.31, + "grad_norm": 0.27416709065437317, + "learning_rate": 4.3843066624637705e-05, + "loss": 0.8806, + "step": 19185 + }, + { + "epoch": 2.31, + "grad_norm": 0.2871681749820709, + "learning_rate": 4.3770389869625654e-05, + "loss": 0.8377, + "step": 19190 + }, + { + "epoch": 2.31, + "grad_norm": 0.2698437571525574, + "learning_rate": 4.3697763109795845e-05, + "loss": 0.7839, + "step": 19195 + }, + { + "epoch": 2.31, + "grad_norm": 0.2793125510215759, + "learning_rate": 4.362518637932896e-05, + "loss": 0.7334, + "step": 19200 + }, + { + "epoch": 2.31, + "grad_norm": 0.26299965381622314, + "learning_rate": 4.3552659712381965e-05, + "loss": 0.7914, + "step": 19205 + }, + { + "epoch": 2.31, + "grad_norm": 0.29598289728164673, + "learning_rate": 4.34801831430883e-05, + "loss": 0.7296, + "step": 19210 + }, + { + "epoch": 2.32, + "grad_norm": 0.30345529317855835, + "learning_rate": 4.340775670555787e-05, + "loss": 0.7574, + "step": 19215 + }, + { + "epoch": 2.32, + "grad_norm": 0.27294403314590454, + "learning_rate": 4.333538043387695e-05, + "loss": 0.8409, + "step": 19220 + }, + { + "epoch": 2.32, + "grad_norm": 0.2598150968551636, + "learning_rate": 4.326305436210823e-05, + "loss": 0.7326, + "step": 19225 + }, + { + "epoch": 2.32, + "grad_norm": 0.24395301938056946, + "learning_rate": 4.3190778524290755e-05, + "loss": 0.8482, + "step": 19230 + }, + { + "epoch": 2.32, + "grad_norm": 0.26434871554374695, + "learning_rate": 4.311855295443987e-05, + "loss": 0.7736, + "step": 19235 + }, + { + "epoch": 2.32, + "grad_norm": 0.3252696394920349, + "learning_rate": 4.304637768654744e-05, + "loss": 0.7654, + "step": 19240 + }, + { + "epoch": 2.32, + "grad_norm": 0.31384336948394775, + "learning_rate": 4.29742527545815e-05, + "loss": 0.7946, + "step": 19245 + }, + { + "epoch": 2.32, + "grad_norm": 0.2744503617286682, + "learning_rate": 4.290217819248641e-05, + "loss": 0.8248, + "step": 19250 + }, + { + "epoch": 2.32, + "grad_norm": 0.29785433411598206, + "learning_rate": 4.283015403418284e-05, + "loss": 0.777, + "step": 19255 + }, + { + "epoch": 2.32, + "grad_norm": 0.26613983511924744, + "learning_rate": 4.275818031356783e-05, + "loss": 0.7121, + "step": 19260 + }, + { + "epoch": 2.32, + "grad_norm": 0.2956700623035431, + "learning_rate": 4.268625706451454e-05, + "loss": 0.8807, + "step": 19265 + }, + { + "epoch": 2.32, + "grad_norm": 0.2700934112071991, + "learning_rate": 4.261438432087247e-05, + "loss": 0.703, + "step": 19270 + }, + { + "epoch": 2.32, + "grad_norm": 0.2920432984828949, + "learning_rate": 4.2542562116467245e-05, + "loss": 0.8218, + "step": 19275 + }, + { + "epoch": 2.32, + "grad_norm": 0.2755813002586365, + "learning_rate": 4.247079048510089e-05, + "loss": 0.7852, + "step": 19280 + }, + { + "epoch": 2.32, + "grad_norm": 0.3014310598373413, + "learning_rate": 4.239906946055148e-05, + "loss": 0.7117, + "step": 19285 + }, + { + "epoch": 2.32, + "grad_norm": 0.32181209325790405, + "learning_rate": 4.2327399076573284e-05, + "loss": 0.8007, + "step": 19290 + }, + { + "epoch": 2.32, + "grad_norm": 0.3137061297893524, + "learning_rate": 4.225577936689677e-05, + "loss": 0.7185, + "step": 19295 + }, + { + "epoch": 2.33, + "grad_norm": 0.24845640361309052, + "learning_rate": 4.2184210365228575e-05, + "loss": 0.733, + "step": 19300 + }, + { + "epoch": 2.33, + "grad_norm": 0.28396517038345337, + "learning_rate": 4.2112692105251446e-05, + "loss": 0.8673, + "step": 19305 + }, + { + "epoch": 2.33, + "grad_norm": 0.2980553209781647, + "learning_rate": 4.204122462062422e-05, + "loss": 0.7282, + "step": 19310 + }, + { + "epoch": 2.33, + "grad_norm": 0.31656941771507263, + "learning_rate": 4.196980794498185e-05, + "loss": 0.7511, + "step": 19315 + }, + { + "epoch": 2.33, + "grad_norm": 0.26247191429138184, + "learning_rate": 4.189844211193548e-05, + "loss": 0.6768, + "step": 19320 + }, + { + "epoch": 2.33, + "grad_norm": 0.26794061064720154, + "learning_rate": 4.182712715507217e-05, + "loss": 0.774, + "step": 19325 + }, + { + "epoch": 2.33, + "grad_norm": 0.289065420627594, + "learning_rate": 4.175586310795515e-05, + "loss": 0.7939, + "step": 19330 + }, + { + "epoch": 2.33, + "grad_norm": 0.27553194761276245, + "learning_rate": 4.168465000412359e-05, + "loss": 0.6774, + "step": 19335 + }, + { + "epoch": 2.33, + "grad_norm": 0.29777318239212036, + "learning_rate": 4.161348787709282e-05, + "loss": 0.6694, + "step": 19340 + }, + { + "epoch": 2.33, + "grad_norm": 0.289673388004303, + "learning_rate": 4.1542376760354045e-05, + "loss": 0.7335, + "step": 19345 + }, + { + "epoch": 2.33, + "grad_norm": 0.27125170826911926, + "learning_rate": 4.1471316687374556e-05, + "loss": 0.8134, + "step": 19350 + }, + { + "epoch": 2.33, + "grad_norm": 0.2760598063468933, + "learning_rate": 4.140030769159755e-05, + "loss": 0.8138, + "step": 19355 + }, + { + "epoch": 2.33, + "grad_norm": 0.28096047043800354, + "learning_rate": 4.132934980644223e-05, + "loss": 0.7309, + "step": 19360 + }, + { + "epoch": 2.33, + "grad_norm": 0.28885167837142944, + "learning_rate": 4.125844306530373e-05, + "loss": 0.752, + "step": 19365 + }, + { + "epoch": 2.33, + "grad_norm": 0.2942561209201813, + "learning_rate": 4.118758750155311e-05, + "loss": 0.7993, + "step": 19370 + }, + { + "epoch": 2.33, + "grad_norm": 0.24654746055603027, + "learning_rate": 4.111678314853732e-05, + "loss": 0.7266, + "step": 19375 + }, + { + "epoch": 2.34, + "grad_norm": 0.2839394807815552, + "learning_rate": 4.104603003957934e-05, + "loss": 0.7632, + "step": 19380 + }, + { + "epoch": 2.34, + "grad_norm": 0.27274009585380554, + "learning_rate": 4.097532820797786e-05, + "loss": 0.8847, + "step": 19385 + }, + { + "epoch": 2.34, + "grad_norm": 0.2987731397151947, + "learning_rate": 4.0904677687007524e-05, + "loss": 0.7926, + "step": 19390 + }, + { + "epoch": 2.34, + "grad_norm": 0.2836725413799286, + "learning_rate": 4.0834078509918756e-05, + "loss": 0.8595, + "step": 19395 + }, + { + "epoch": 2.34, + "grad_norm": 0.3034559190273285, + "learning_rate": 4.076353070993799e-05, + "loss": 0.8338, + "step": 19400 + }, + { + "epoch": 2.34, + "grad_norm": 0.3112902045249939, + "learning_rate": 4.0693034320267295e-05, + "loss": 0.7235, + "step": 19405 + }, + { + "epoch": 2.34, + "grad_norm": 0.3092862367630005, + "learning_rate": 4.062258937408464e-05, + "loss": 0.7693, + "step": 19410 + }, + { + "epoch": 2.34, + "grad_norm": 0.2575216591358185, + "learning_rate": 4.055219590454366e-05, + "loss": 0.769, + "step": 19415 + }, + { + "epoch": 2.34, + "grad_norm": 0.28645190596580505, + "learning_rate": 4.0481853944774e-05, + "loss": 0.7985, + "step": 19420 + }, + { + "epoch": 2.34, + "grad_norm": 0.29721811413764954, + "learning_rate": 4.0411563527880856e-05, + "loss": 0.878, + "step": 19425 + }, + { + "epoch": 2.34, + "grad_norm": 0.27767476439476013, + "learning_rate": 4.0341324686945237e-05, + "loss": 0.8084, + "step": 19430 + }, + { + "epoch": 2.34, + "grad_norm": 0.3142256736755371, + "learning_rate": 4.027113745502388e-05, + "loss": 0.7558, + "step": 19435 + }, + { + "epoch": 2.34, + "grad_norm": 0.28294190764427185, + "learning_rate": 4.0201001865149206e-05, + "loss": 0.8359, + "step": 19440 + }, + { + "epoch": 2.34, + "grad_norm": 0.31937623023986816, + "learning_rate": 4.013091795032937e-05, + "loss": 0.769, + "step": 19445 + }, + { + "epoch": 2.34, + "grad_norm": 0.26449480652809143, + "learning_rate": 4.006088574354819e-05, + "loss": 0.7822, + "step": 19450 + }, + { + "epoch": 2.34, + "grad_norm": 0.3096768856048584, + "learning_rate": 3.999090527776509e-05, + "loss": 0.7575, + "step": 19455 + }, + { + "epoch": 2.34, + "grad_norm": 0.2941647171974182, + "learning_rate": 3.9920976585915296e-05, + "loss": 0.8384, + "step": 19460 + }, + { + "epoch": 2.35, + "grad_norm": 0.3138192594051361, + "learning_rate": 3.985109970090956e-05, + "loss": 0.7938, + "step": 19465 + }, + { + "epoch": 2.35, + "grad_norm": 0.24960915744304657, + "learning_rate": 3.9781274655634205e-05, + "loss": 0.7544, + "step": 19470 + }, + { + "epoch": 2.35, + "grad_norm": 0.2531121075153351, + "learning_rate": 3.971150148295123e-05, + "loss": 0.9053, + "step": 19475 + }, + { + "epoch": 2.35, + "grad_norm": 0.25435906648635864, + "learning_rate": 3.964178021569825e-05, + "loss": 0.7899, + "step": 19480 + }, + { + "epoch": 2.35, + "grad_norm": 0.27135327458381653, + "learning_rate": 3.957211088668838e-05, + "loss": 0.7274, + "step": 19485 + }, + { + "epoch": 2.35, + "grad_norm": 0.2866491675376892, + "learning_rate": 3.950249352871034e-05, + "loss": 0.7441, + "step": 19490 + }, + { + "epoch": 2.35, + "grad_norm": 0.2988530397415161, + "learning_rate": 3.94329281745283e-05, + "loss": 0.7978, + "step": 19495 + }, + { + "epoch": 2.35, + "grad_norm": 0.27263930439949036, + "learning_rate": 3.9363414856882126e-05, + "loss": 0.7804, + "step": 19500 + }, + { + "epoch": 2.35, + "grad_norm": 0.2896611988544464, + "learning_rate": 3.929395360848704e-05, + "loss": 0.7589, + "step": 19505 + }, + { + "epoch": 2.35, + "grad_norm": 0.28115859627723694, + "learning_rate": 3.9224544462033844e-05, + "loss": 0.7887, + "step": 19510 + }, + { + "epoch": 2.35, + "grad_norm": 0.31314992904663086, + "learning_rate": 3.915518745018873e-05, + "loss": 0.7634, + "step": 19515 + }, + { + "epoch": 2.35, + "grad_norm": 0.28734245896339417, + "learning_rate": 3.9085882605593485e-05, + "loss": 0.8887, + "step": 19520 + }, + { + "epoch": 2.35, + "grad_norm": 0.26335301995277405, + "learning_rate": 3.901662996086519e-05, + "loss": 0.8303, + "step": 19525 + }, + { + "epoch": 2.35, + "grad_norm": 0.2777862846851349, + "learning_rate": 3.8947429548596506e-05, + "loss": 0.7661, + "step": 19530 + }, + { + "epoch": 2.35, + "grad_norm": 0.2924749553203583, + "learning_rate": 3.8878281401355366e-05, + "loss": 0.7203, + "step": 19535 + }, + { + "epoch": 2.35, + "grad_norm": 0.265706866979599, + "learning_rate": 3.880918555168528e-05, + "loss": 0.8163, + "step": 19540 + }, + { + "epoch": 2.35, + "grad_norm": 0.29131948947906494, + "learning_rate": 3.8740142032105e-05, + "loss": 0.8113, + "step": 19545 + }, + { + "epoch": 2.36, + "grad_norm": 0.26033177971839905, + "learning_rate": 3.8671150875108715e-05, + "loss": 0.8122, + "step": 19550 + }, + { + "epoch": 2.36, + "grad_norm": 0.2742202579975128, + "learning_rate": 3.8602212113165886e-05, + "loss": 0.8273, + "step": 19555 + }, + { + "epoch": 2.36, + "grad_norm": 0.2769484221935272, + "learning_rate": 3.85333257787215e-05, + "loss": 0.7594, + "step": 19560 + }, + { + "epoch": 2.36, + "grad_norm": 0.31669124960899353, + "learning_rate": 3.846449190419569e-05, + "loss": 0.8816, + "step": 19565 + }, + { + "epoch": 2.36, + "grad_norm": 0.28259631991386414, + "learning_rate": 3.839571052198398e-05, + "loss": 0.7435, + "step": 19570 + }, + { + "epoch": 2.36, + "grad_norm": 0.2604540288448334, + "learning_rate": 3.832698166445715e-05, + "loss": 0.7484, + "step": 19575 + }, + { + "epoch": 2.36, + "grad_norm": 0.3139670491218567, + "learning_rate": 3.82583053639613e-05, + "loss": 0.7701, + "step": 19580 + }, + { + "epoch": 2.36, + "grad_norm": 0.30153560638427734, + "learning_rate": 3.818968165281777e-05, + "loss": 0.8798, + "step": 19585 + }, + { + "epoch": 2.36, + "grad_norm": 0.27177488803863525, + "learning_rate": 3.8121110563323115e-05, + "loss": 0.7226, + "step": 19590 + }, + { + "epoch": 2.36, + "grad_norm": 0.27674952149391174, + "learning_rate": 3.8052592127749265e-05, + "loss": 0.8741, + "step": 19595 + }, + { + "epoch": 2.36, + "grad_norm": 0.2683180570602417, + "learning_rate": 3.798412637834321e-05, + "loss": 0.767, + "step": 19600 + }, + { + "epoch": 2.36, + "grad_norm": 0.25967836380004883, + "learning_rate": 3.7915713347327225e-05, + "loss": 0.7556, + "step": 19605 + }, + { + "epoch": 2.36, + "grad_norm": 0.2514658570289612, + "learning_rate": 3.784735306689869e-05, + "loss": 0.7164, + "step": 19610 + }, + { + "epoch": 2.36, + "grad_norm": 0.2914251983165741, + "learning_rate": 3.777904556923031e-05, + "loss": 0.9218, + "step": 19615 + }, + { + "epoch": 2.36, + "grad_norm": 0.24623388051986694, + "learning_rate": 3.7710790886469837e-05, + "loss": 0.8447, + "step": 19620 + }, + { + "epoch": 2.36, + "grad_norm": 0.295613169670105, + "learning_rate": 3.764258905074018e-05, + "loss": 0.735, + "step": 19625 + }, + { + "epoch": 2.37, + "grad_norm": 0.3049282133579254, + "learning_rate": 3.7574440094139364e-05, + "loss": 0.7163, + "step": 19630 + }, + { + "epoch": 2.37, + "grad_norm": 0.3349364101886749, + "learning_rate": 3.75063440487406e-05, + "loss": 0.8382, + "step": 19635 + }, + { + "epoch": 2.37, + "grad_norm": 0.2821791172027588, + "learning_rate": 3.743830094659212e-05, + "loss": 0.7424, + "step": 19640 + }, + { + "epoch": 2.37, + "grad_norm": 0.2976232171058655, + "learning_rate": 3.737031081971729e-05, + "loss": 0.7901, + "step": 19645 + }, + { + "epoch": 2.37, + "grad_norm": 0.2489924281835556, + "learning_rate": 3.73023737001145e-05, + "loss": 0.7586, + "step": 19650 + }, + { + "epoch": 2.37, + "grad_norm": 0.2726670503616333, + "learning_rate": 3.723448961975722e-05, + "loss": 0.8153, + "step": 19655 + }, + { + "epoch": 2.37, + "grad_norm": 0.29014068841934204, + "learning_rate": 3.7166658610593955e-05, + "loss": 0.8317, + "step": 19660 + }, + { + "epoch": 2.37, + "grad_norm": 0.28912895917892456, + "learning_rate": 3.709888070454822e-05, + "loss": 0.8345, + "step": 19665 + }, + { + "epoch": 2.37, + "grad_norm": 0.2740095257759094, + "learning_rate": 3.70311559335185e-05, + "loss": 0.7405, + "step": 19670 + }, + { + "epoch": 2.37, + "grad_norm": 0.3410658836364746, + "learning_rate": 3.696348432937845e-05, + "loss": 0.7129, + "step": 19675 + }, + { + "epoch": 2.37, + "grad_norm": 0.2865118682384491, + "learning_rate": 3.689586592397647e-05, + "loss": 0.7907, + "step": 19680 + }, + { + "epoch": 2.37, + "grad_norm": 0.26647239923477173, + "learning_rate": 3.6828300749136085e-05, + "loss": 0.8186, + "step": 19685 + }, + { + "epoch": 2.37, + "grad_norm": 0.30425769090652466, + "learning_rate": 3.6760788836655624e-05, + "loss": 0.7776, + "step": 19690 + }, + { + "epoch": 2.37, + "grad_norm": 0.28291577100753784, + "learning_rate": 3.669333021830854e-05, + "loss": 0.7774, + "step": 19695 + }, + { + "epoch": 2.37, + "grad_norm": 0.294132262468338, + "learning_rate": 3.662592492584306e-05, + "loss": 0.7293, + "step": 19700 + }, + { + "epoch": 2.37, + "grad_norm": 0.28489431738853455, + "learning_rate": 3.655857299098233e-05, + "loss": 0.7946, + "step": 19705 + }, + { + "epoch": 2.37, + "grad_norm": 0.2683461308479309, + "learning_rate": 3.64912744454244e-05, + "loss": 0.7626, + "step": 19710 + }, + { + "epoch": 2.38, + "grad_norm": 0.24661622941493988, + "learning_rate": 3.6424029320842265e-05, + "loss": 0.7492, + "step": 19715 + }, + { + "epoch": 2.38, + "grad_norm": 0.266984760761261, + "learning_rate": 3.635683764888367e-05, + "loss": 0.7598, + "step": 19720 + }, + { + "epoch": 2.38, + "grad_norm": 0.2817406952381134, + "learning_rate": 3.628969946117129e-05, + "loss": 0.852, + "step": 19725 + }, + { + "epoch": 2.38, + "grad_norm": 0.2606806755065918, + "learning_rate": 3.6222614789302454e-05, + "loss": 0.863, + "step": 19730 + }, + { + "epoch": 2.38, + "grad_norm": 0.2557438015937805, + "learning_rate": 3.615558366484958e-05, + "loss": 0.7349, + "step": 19735 + }, + { + "epoch": 2.38, + "grad_norm": 0.27822768688201904, + "learning_rate": 3.608860611935969e-05, + "loss": 0.8482, + "step": 19740 + }, + { + "epoch": 2.38, + "grad_norm": 0.2577543556690216, + "learning_rate": 3.602168218435464e-05, + "loss": 0.8088, + "step": 19745 + }, + { + "epoch": 2.38, + "grad_norm": 0.3152078092098236, + "learning_rate": 3.595481189133101e-05, + "loss": 0.7895, + "step": 19750 + }, + { + "epoch": 2.38, + "grad_norm": 0.2776154577732086, + "learning_rate": 3.588799527176026e-05, + "loss": 0.8115, + "step": 19755 + }, + { + "epoch": 2.38, + "grad_norm": 0.2907949984073639, + "learning_rate": 3.5821232357088493e-05, + "loss": 0.8447, + "step": 19760 + }, + { + "epoch": 2.38, + "grad_norm": 0.27817782759666443, + "learning_rate": 3.575452317873653e-05, + "loss": 0.7258, + "step": 19765 + }, + { + "epoch": 2.38, + "grad_norm": 0.2644236087799072, + "learning_rate": 3.56878677680999e-05, + "loss": 0.7573, + "step": 19770 + }, + { + "epoch": 2.38, + "grad_norm": 0.28008079528808594, + "learning_rate": 3.5621266156548925e-05, + "loss": 0.8521, + "step": 19775 + }, + { + "epoch": 2.38, + "grad_norm": 0.2851678133010864, + "learning_rate": 3.55547183754285e-05, + "loss": 0.8946, + "step": 19780 + }, + { + "epoch": 2.38, + "grad_norm": 0.29239290952682495, + "learning_rate": 3.5488224456058215e-05, + "loss": 0.7888, + "step": 19785 + }, + { + "epoch": 2.38, + "grad_norm": 0.2597346305847168, + "learning_rate": 3.5421784429732316e-05, + "loss": 0.7592, + "step": 19790 + }, + { + "epoch": 2.39, + "grad_norm": 0.2777891159057617, + "learning_rate": 3.535539832771971e-05, + "loss": 0.7478, + "step": 19795 + }, + { + "epoch": 2.39, + "grad_norm": 0.2665190100669861, + "learning_rate": 3.528906618126385e-05, + "loss": 0.8196, + "step": 19800 + }, + { + "epoch": 2.39, + "grad_norm": 0.2569584548473358, + "learning_rate": 3.52227880215829e-05, + "loss": 0.8045, + "step": 19805 + }, + { + "epoch": 2.39, + "grad_norm": 0.2824585437774658, + "learning_rate": 3.5156563879869494e-05, + "loss": 0.7438, + "step": 19810 + }, + { + "epoch": 2.39, + "grad_norm": 0.2764226496219635, + "learning_rate": 3.509039378729099e-05, + "loss": 0.7557, + "step": 19815 + }, + { + "epoch": 2.39, + "grad_norm": 0.2752508521080017, + "learning_rate": 3.5024277774989203e-05, + "loss": 0.9046, + "step": 19820 + }, + { + "epoch": 2.39, + "grad_norm": 0.28766295313835144, + "learning_rate": 3.4958215874080526e-05, + "loss": 0.7422, + "step": 19825 + }, + { + "epoch": 2.39, + "grad_norm": 0.25225573778152466, + "learning_rate": 3.4892208115655837e-05, + "loss": 0.8565, + "step": 19830 + }, + { + "epoch": 2.39, + "grad_norm": 0.3149223327636719, + "learning_rate": 3.482625453078065e-05, + "loss": 0.7516, + "step": 19835 + }, + { + "epoch": 2.39, + "grad_norm": 0.30531787872314453, + "learning_rate": 3.4760355150494895e-05, + "loss": 0.7884, + "step": 19840 + }, + { + "epoch": 2.39, + "grad_norm": 0.2835243046283722, + "learning_rate": 3.4694510005812996e-05, + "loss": 0.7924, + "step": 19845 + }, + { + "epoch": 2.39, + "grad_norm": 0.2607024013996124, + "learning_rate": 3.462871912772382e-05, + "loss": 0.7911, + "step": 19850 + }, + { + "epoch": 2.39, + "grad_norm": 0.29384374618530273, + "learning_rate": 3.4562982547190824e-05, + "loss": 0.8039, + "step": 19855 + }, + { + "epoch": 2.39, + "grad_norm": 0.28320446610450745, + "learning_rate": 3.449730029515179e-05, + "loss": 0.7786, + "step": 19860 + }, + { + "epoch": 2.39, + "grad_norm": 0.281463623046875, + "learning_rate": 3.4431672402518955e-05, + "loss": 0.7672, + "step": 19865 + }, + { + "epoch": 2.39, + "grad_norm": 0.2654038369655609, + "learning_rate": 3.436609890017901e-05, + "loss": 0.8256, + "step": 19870 + }, + { + "epoch": 2.39, + "grad_norm": 0.2744678556919098, + "learning_rate": 3.430057981899298e-05, + "loss": 0.7221, + "step": 19875 + }, + { + "epoch": 2.4, + "grad_norm": 0.28920984268188477, + "learning_rate": 3.4235115189796375e-05, + "loss": 0.7532, + "step": 19880 + }, + { + "epoch": 2.4, + "grad_norm": 0.3083685040473938, + "learning_rate": 3.416970504339897e-05, + "loss": 0.7365, + "step": 19885 + }, + { + "epoch": 2.4, + "grad_norm": 0.25423839688301086, + "learning_rate": 3.410434941058495e-05, + "loss": 0.8097, + "step": 19890 + }, + { + "epoch": 2.4, + "grad_norm": 0.25034499168395996, + "learning_rate": 3.4039048322112917e-05, + "loss": 0.7832, + "step": 19895 + }, + { + "epoch": 2.4, + "grad_norm": 0.2709154188632965, + "learning_rate": 3.3973801808715676e-05, + "loss": 0.7133, + "step": 19900 + }, + { + "epoch": 2.4, + "grad_norm": 0.2830277383327484, + "learning_rate": 3.390860990110042e-05, + "loss": 0.8309, + "step": 19905 + }, + { + "epoch": 2.4, + "grad_norm": 0.26153650879859924, + "learning_rate": 3.384347262994858e-05, + "loss": 0.7983, + "step": 19910 + }, + { + "epoch": 2.4, + "grad_norm": 0.23845182359218597, + "learning_rate": 3.377839002591599e-05, + "loss": 0.84, + "step": 19915 + }, + { + "epoch": 2.4, + "grad_norm": 0.3157477080821991, + "learning_rate": 3.371336211963268e-05, + "loss": 0.7391, + "step": 19920 + }, + { + "epoch": 2.4, + "grad_norm": 0.25639525055885315, + "learning_rate": 3.364838894170289e-05, + "loss": 0.7765, + "step": 19925 + }, + { + "epoch": 2.4, + "grad_norm": 0.27470090985298157, + "learning_rate": 3.358347052270515e-05, + "loss": 0.7211, + "step": 19930 + }, + { + "epoch": 2.4, + "grad_norm": 0.3165142238140106, + "learning_rate": 3.351860689319234e-05, + "loss": 0.7079, + "step": 19935 + }, + { + "epoch": 2.4, + "grad_norm": 0.2669844329357147, + "learning_rate": 3.345379808369132e-05, + "loss": 0.7023, + "step": 19940 + }, + { + "epoch": 2.4, + "grad_norm": 0.29056012630462646, + "learning_rate": 3.338904412470328e-05, + "loss": 0.8274, + "step": 19945 + }, + { + "epoch": 2.4, + "grad_norm": 0.304496169090271, + "learning_rate": 3.332434504670358e-05, + "loss": 0.821, + "step": 19950 + }, + { + "epoch": 2.4, + "grad_norm": 0.29382607340812683, + "learning_rate": 3.3259700880141815e-05, + "loss": 0.7403, + "step": 19955 + }, + { + "epoch": 2.4, + "grad_norm": 0.27669087052345276, + "learning_rate": 3.319511165544165e-05, + "loss": 0.6459, + "step": 19960 + }, + { + "epoch": 2.41, + "grad_norm": 0.2903481125831604, + "learning_rate": 3.313057740300089e-05, + "loss": 0.8414, + "step": 19965 + }, + { + "epoch": 2.41, + "grad_norm": 0.2781134247779846, + "learning_rate": 3.30660981531915e-05, + "loss": 0.7587, + "step": 19970 + }, + { + "epoch": 2.41, + "grad_norm": 0.24643659591674805, + "learning_rate": 3.3001673936359604e-05, + "loss": 0.7535, + "step": 19975 + }, + { + "epoch": 2.41, + "grad_norm": 0.306083083152771, + "learning_rate": 3.293730478282534e-05, + "loss": 0.7106, + "step": 19980 + }, + { + "epoch": 2.41, + "grad_norm": 0.26486045122146606, + "learning_rate": 3.2872990722882984e-05, + "loss": 0.8252, + "step": 19985 + }, + { + "epoch": 2.41, + "grad_norm": 0.2603921890258789, + "learning_rate": 3.280873178680082e-05, + "loss": 0.7914, + "step": 19990 + }, + { + "epoch": 2.41, + "grad_norm": 0.31112536787986755, + "learning_rate": 3.274452800482133e-05, + "loss": 0.7106, + "step": 19995 + }, + { + "epoch": 2.41, + "grad_norm": 0.26250556111335754, + "learning_rate": 3.2680379407160886e-05, + "loss": 0.7859, + "step": 20000 + }, + { + "epoch": 2.41, + "grad_norm": 0.28084659576416016, + "learning_rate": 3.261628602400995e-05, + "loss": 0.7564, + "step": 20005 + }, + { + "epoch": 2.41, + "grad_norm": 0.2960032820701599, + "learning_rate": 3.2552247885533005e-05, + "loss": 0.8016, + "step": 20010 + }, + { + "epoch": 2.41, + "grad_norm": 0.3024326264858246, + "learning_rate": 3.248826502186854e-05, + "loss": 0.8589, + "step": 20015 + }, + { + "epoch": 2.41, + "grad_norm": 0.28151461482048035, + "learning_rate": 3.242433746312899e-05, + "loss": 0.8336, + "step": 20020 + }, + { + "epoch": 2.41, + "grad_norm": 0.2718256711959839, + "learning_rate": 3.23604652394008e-05, + "loss": 0.6869, + "step": 20025 + }, + { + "epoch": 2.41, + "grad_norm": 0.31024929881095886, + "learning_rate": 3.229664838074431e-05, + "loss": 0.9043, + "step": 20030 + }, + { + "epoch": 2.41, + "grad_norm": 0.26419252157211304, + "learning_rate": 3.223288691719394e-05, + "loss": 0.7494, + "step": 20035 + }, + { + "epoch": 2.41, + "grad_norm": 0.27617117762565613, + "learning_rate": 3.216918087875792e-05, + "loss": 0.7913, + "step": 20040 + }, + { + "epoch": 2.42, + "grad_norm": 0.2638927102088928, + "learning_rate": 3.21055302954184e-05, + "loss": 0.7151, + "step": 20045 + }, + { + "epoch": 2.42, + "grad_norm": 0.2976774573326111, + "learning_rate": 3.204193519713146e-05, + "loss": 0.7635, + "step": 20050 + }, + { + "epoch": 2.42, + "grad_norm": 0.256143182516098, + "learning_rate": 3.197839561382711e-05, + "loss": 0.8376, + "step": 20055 + }, + { + "epoch": 2.42, + "grad_norm": 0.2871249318122864, + "learning_rate": 3.1914911575409165e-05, + "loss": 0.707, + "step": 20060 + }, + { + "epoch": 2.42, + "grad_norm": 0.269939661026001, + "learning_rate": 3.1851483111755276e-05, + "loss": 0.8975, + "step": 20065 + }, + { + "epoch": 2.42, + "grad_norm": 0.2935774326324463, + "learning_rate": 3.178811025271705e-05, + "loss": 0.787, + "step": 20070 + }, + { + "epoch": 2.42, + "grad_norm": 0.2772122621536255, + "learning_rate": 3.1724793028119846e-05, + "loss": 0.6818, + "step": 20075 + }, + { + "epoch": 2.42, + "grad_norm": 0.3376406133174896, + "learning_rate": 3.1661531467762866e-05, + "loss": 0.6406, + "step": 20080 + }, + { + "epoch": 2.42, + "grad_norm": 0.24912644922733307, + "learning_rate": 3.159832560141904e-05, + "loss": 0.8038, + "step": 20085 + }, + { + "epoch": 2.42, + "grad_norm": 0.26904556155204773, + "learning_rate": 3.15351754588352e-05, + "loss": 0.8043, + "step": 20090 + }, + { + "epoch": 2.42, + "grad_norm": 0.27094265818595886, + "learning_rate": 3.147208106973189e-05, + "loss": 0.8588, + "step": 20095 + }, + { + "epoch": 2.42, + "grad_norm": 0.26303452253341675, + "learning_rate": 3.1409042463803406e-05, + "loss": 0.7485, + "step": 20100 + }, + { + "epoch": 2.42, + "grad_norm": 0.29558247327804565, + "learning_rate": 3.1346059670717783e-05, + "loss": 0.8514, + "step": 20105 + }, + { + "epoch": 2.42, + "grad_norm": 0.2617008090019226, + "learning_rate": 3.12831327201169e-05, + "loss": 0.7401, + "step": 20110 + }, + { + "epoch": 2.42, + "grad_norm": 0.29769590497016907, + "learning_rate": 3.1220261641616226e-05, + "loss": 0.6958, + "step": 20115 + }, + { + "epoch": 2.42, + "grad_norm": 0.2647980749607086, + "learning_rate": 3.1157446464804966e-05, + "loss": 0.824, + "step": 20120 + }, + { + "epoch": 2.42, + "grad_norm": 0.2724788784980774, + "learning_rate": 3.1094687219246025e-05, + "loss": 0.8486, + "step": 20125 + }, + { + "epoch": 2.43, + "grad_norm": 0.2676187753677368, + "learning_rate": 3.103198393447601e-05, + "loss": 0.8293, + "step": 20130 + }, + { + "epoch": 2.43, + "grad_norm": 0.254800945520401, + "learning_rate": 3.096933664000518e-05, + "loss": 0.8403, + "step": 20135 + }, + { + "epoch": 2.43, + "grad_norm": 0.2803385853767395, + "learning_rate": 3.090674536531741e-05, + "loss": 0.7535, + "step": 20140 + }, + { + "epoch": 2.43, + "grad_norm": 0.3000488877296448, + "learning_rate": 3.084421013987023e-05, + "loss": 0.7952, + "step": 20145 + }, + { + "epoch": 2.43, + "grad_norm": 0.24971972405910492, + "learning_rate": 3.07817309930948e-05, + "loss": 0.7537, + "step": 20150 + }, + { + "epoch": 2.43, + "grad_norm": 0.29271334409713745, + "learning_rate": 3.0719307954395886e-05, + "loss": 0.8457, + "step": 20155 + }, + { + "epoch": 2.43, + "grad_norm": 0.27046826481819153, + "learning_rate": 3.0656941053151846e-05, + "loss": 0.7313, + "step": 20160 + }, + { + "epoch": 2.43, + "grad_norm": 0.3202589154243469, + "learning_rate": 3.059463031871456e-05, + "loss": 0.7813, + "step": 20165 + }, + { + "epoch": 2.43, + "grad_norm": 0.2895718514919281, + "learning_rate": 3.05323757804096e-05, + "loss": 0.768, + "step": 20170 + }, + { + "epoch": 2.43, + "grad_norm": 0.28640422224998474, + "learning_rate": 3.0470177467536006e-05, + "loss": 0.7901, + "step": 20175 + }, + { + "epoch": 2.43, + "grad_norm": 0.2655580937862396, + "learning_rate": 3.040803540936637e-05, + "loss": 0.8789, + "step": 20180 + }, + { + "epoch": 2.43, + "grad_norm": 0.26338696479797363, + "learning_rate": 3.034594963514674e-05, + "loss": 0.8176, + "step": 20185 + }, + { + "epoch": 2.43, + "grad_norm": 0.27758362889289856, + "learning_rate": 3.028392017409685e-05, + "loss": 0.7612, + "step": 20190 + }, + { + "epoch": 2.43, + "grad_norm": 0.31428468227386475, + "learning_rate": 3.0221947055409766e-05, + "loss": 0.7512, + "step": 20195 + }, + { + "epoch": 2.43, + "grad_norm": 0.3056136965751648, + "learning_rate": 3.0160030308252113e-05, + "loss": 0.8479, + "step": 20200 + }, + { + "epoch": 2.43, + "grad_norm": 0.2786805331707001, + "learning_rate": 3.0098169961763913e-05, + "loss": 0.8396, + "step": 20205 + }, + { + "epoch": 2.44, + "grad_norm": 0.2735007405281067, + "learning_rate": 3.003636604505879e-05, + "loss": 0.7215, + "step": 20210 + }, + { + "epoch": 2.44, + "grad_norm": 0.2933249771595001, + "learning_rate": 2.9974618587223665e-05, + "loss": 0.8251, + "step": 20215 + }, + { + "epoch": 2.44, + "grad_norm": 0.24182629585266113, + "learning_rate": 2.991292761731897e-05, + "loss": 0.7686, + "step": 20220 + }, + { + "epoch": 2.44, + "grad_norm": 0.31849405169487, + "learning_rate": 2.9851293164378488e-05, + "loss": 0.8349, + "step": 20225 + }, + { + "epoch": 2.44, + "grad_norm": 0.2779251039028168, + "learning_rate": 2.9789715257409475e-05, + "loss": 0.7816, + "step": 20230 + }, + { + "epoch": 2.44, + "grad_norm": 0.2377801239490509, + "learning_rate": 2.9728193925392502e-05, + "loss": 0.8013, + "step": 20235 + }, + { + "epoch": 2.44, + "grad_norm": 0.29114991426467896, + "learning_rate": 2.966672919728159e-05, + "loss": 0.7782, + "step": 20240 + }, + { + "epoch": 2.44, + "grad_norm": 0.2626573443412781, + "learning_rate": 2.960532110200402e-05, + "loss": 0.7895, + "step": 20245 + }, + { + "epoch": 2.44, + "grad_norm": 0.26689302921295166, + "learning_rate": 2.9543969668460556e-05, + "loss": 0.7706, + "step": 20250 + }, + { + "epoch": 2.44, + "grad_norm": 0.29479920864105225, + "learning_rate": 2.948267492552518e-05, + "loss": 0.8125, + "step": 20255 + }, + { + "epoch": 2.44, + "grad_norm": 0.2591875195503235, + "learning_rate": 2.9421436902045255e-05, + "loss": 0.8254, + "step": 20260 + }, + { + "epoch": 2.44, + "grad_norm": 0.23983192443847656, + "learning_rate": 2.9360255626841374e-05, + "loss": 0.731, + "step": 20265 + }, + { + "epoch": 2.44, + "grad_norm": 0.27949729561805725, + "learning_rate": 2.9299131128707537e-05, + "loss": 0.7235, + "step": 20270 + }, + { + "epoch": 2.44, + "grad_norm": 0.29695627093315125, + "learning_rate": 2.923806343641097e-05, + "loss": 0.8012, + "step": 20275 + }, + { + "epoch": 2.44, + "grad_norm": 0.26276320219039917, + "learning_rate": 2.917705257869211e-05, + "loss": 0.7519, + "step": 20280 + }, + { + "epoch": 2.44, + "grad_norm": 0.2703682482242584, + "learning_rate": 2.911609858426468e-05, + "loss": 0.8297, + "step": 20285 + }, + { + "epoch": 2.44, + "grad_norm": 0.28486260771751404, + "learning_rate": 2.9055201481815727e-05, + "loss": 0.8341, + "step": 20290 + }, + { + "epoch": 2.45, + "grad_norm": 0.266369104385376, + "learning_rate": 2.8994361300005407e-05, + "loss": 0.6976, + "step": 20295 + }, + { + "epoch": 2.45, + "grad_norm": 0.2816215753555298, + "learning_rate": 2.8933578067467123e-05, + "loss": 0.846, + "step": 20300 + }, + { + "epoch": 2.45, + "grad_norm": 0.273666650056839, + "learning_rate": 2.8872851812807502e-05, + "loss": 0.7309, + "step": 20305 + }, + { + "epoch": 2.45, + "grad_norm": 0.30615535378456116, + "learning_rate": 2.8812182564606334e-05, + "loss": 0.7782, + "step": 20310 + }, + { + "epoch": 2.45, + "grad_norm": 0.2994655966758728, + "learning_rate": 2.8751570351416564e-05, + "loss": 0.6678, + "step": 20315 + }, + { + "epoch": 2.45, + "grad_norm": 0.2747027277946472, + "learning_rate": 2.8691015201764323e-05, + "loss": 0.7504, + "step": 20320 + }, + { + "epoch": 2.45, + "grad_norm": 0.28770124912261963, + "learning_rate": 2.863051714414883e-05, + "loss": 0.7577, + "step": 20325 + }, + { + "epoch": 2.45, + "grad_norm": 0.27923446893692017, + "learning_rate": 2.8570076207042574e-05, + "loss": 0.7865, + "step": 20330 + }, + { + "epoch": 2.45, + "grad_norm": 0.26224157214164734, + "learning_rate": 2.8509692418891024e-05, + "loss": 0.7961, + "step": 20335 + }, + { + "epoch": 2.45, + "grad_norm": 0.2644881308078766, + "learning_rate": 2.8449365808112773e-05, + "loss": 0.7063, + "step": 20340 + }, + { + "epoch": 2.45, + "grad_norm": 0.2863686680793762, + "learning_rate": 2.8389096403099515e-05, + "loss": 0.8939, + "step": 20345 + }, + { + "epoch": 2.45, + "grad_norm": 0.27196264266967773, + "learning_rate": 2.832888423221611e-05, + "loss": 0.803, + "step": 20350 + }, + { + "epoch": 2.45, + "grad_norm": 0.2805080711841583, + "learning_rate": 2.826872932380034e-05, + "loss": 0.8347, + "step": 20355 + }, + { + "epoch": 2.45, + "grad_norm": 0.27300190925598145, + "learning_rate": 2.820863170616312e-05, + "loss": 0.7662, + "step": 20360 + }, + { + "epoch": 2.45, + "grad_norm": 0.2929331660270691, + "learning_rate": 2.81485914075884e-05, + "loss": 0.8332, + "step": 20365 + }, + { + "epoch": 2.45, + "grad_norm": 0.3110920190811157, + "learning_rate": 2.808860845633312e-05, + "loss": 0.7865, + "step": 20370 + }, + { + "epoch": 2.45, + "grad_norm": 0.2958769202232361, + "learning_rate": 2.8028682880627255e-05, + "loss": 0.7955, + "step": 20375 + }, + { + "epoch": 2.46, + "grad_norm": 0.2582022547721863, + "learning_rate": 2.7968814708673758e-05, + "loss": 0.8322, + "step": 20380 + }, + { + "epoch": 2.46, + "grad_norm": 0.27582964301109314, + "learning_rate": 2.7909003968648557e-05, + "loss": 0.8729, + "step": 20385 + }, + { + "epoch": 2.46, + "grad_norm": 0.29320433735847473, + "learning_rate": 2.7849250688700637e-05, + "loss": 0.7828, + "step": 20390 + }, + { + "epoch": 2.46, + "grad_norm": 0.2722160220146179, + "learning_rate": 2.7789554896951865e-05, + "loss": 0.7481, + "step": 20395 + }, + { + "epoch": 2.46, + "grad_norm": 0.3223639130592346, + "learning_rate": 2.7729916621497022e-05, + "loss": 0.9109, + "step": 20400 + }, + { + "epoch": 2.46, + "grad_norm": 0.2686358690261841, + "learning_rate": 2.7670335890403837e-05, + "loss": 0.7711, + "step": 20405 + }, + { + "epoch": 2.46, + "grad_norm": 0.2783448398113251, + "learning_rate": 2.7610812731713084e-05, + "loss": 0.8384, + "step": 20410 + }, + { + "epoch": 2.46, + "grad_norm": 0.3094173073768616, + "learning_rate": 2.7551347173438267e-05, + "loss": 0.7424, + "step": 20415 + }, + { + "epoch": 2.46, + "grad_norm": 0.29645413160324097, + "learning_rate": 2.7491939243565862e-05, + "loss": 0.813, + "step": 20420 + }, + { + "epoch": 2.46, + "grad_norm": 0.31017208099365234, + "learning_rate": 2.7432588970055182e-05, + "loss": 0.8065, + "step": 20425 + }, + { + "epoch": 2.46, + "grad_norm": 0.29746440052986145, + "learning_rate": 2.7373296380838505e-05, + "loss": 0.7532, + "step": 20430 + }, + { + "epoch": 2.46, + "grad_norm": 0.2688598930835724, + "learning_rate": 2.7314061503820865e-05, + "loss": 0.7811, + "step": 20435 + }, + { + "epoch": 2.46, + "grad_norm": 0.30386409163475037, + "learning_rate": 2.7254884366880146e-05, + "loss": 0.7961, + "step": 20440 + }, + { + "epoch": 2.46, + "grad_norm": 0.28410372138023376, + "learning_rate": 2.7195764997867085e-05, + "loss": 0.9115, + "step": 20445 + }, + { + "epoch": 2.46, + "grad_norm": 0.2933676242828369, + "learning_rate": 2.7136703424605228e-05, + "loss": 0.8635, + "step": 20450 + }, + { + "epoch": 2.46, + "grad_norm": 0.3018134832382202, + "learning_rate": 2.7077699674890903e-05, + "loss": 0.7459, + "step": 20455 + }, + { + "epoch": 2.47, + "grad_norm": 0.2924741208553314, + "learning_rate": 2.7018753776493213e-05, + "loss": 0.8325, + "step": 20460 + }, + { + "epoch": 2.47, + "grad_norm": 0.2926197946071625, + "learning_rate": 2.6959865757154054e-05, + "loss": 0.722, + "step": 20465 + }, + { + "epoch": 2.47, + "grad_norm": 0.285312294960022, + "learning_rate": 2.6901035644588154e-05, + "loss": 0.8077, + "step": 20470 + }, + { + "epoch": 2.47, + "grad_norm": 0.2824037969112396, + "learning_rate": 2.684226346648285e-05, + "loss": 0.6966, + "step": 20475 + }, + { + "epoch": 2.47, + "grad_norm": 0.29435649514198303, + "learning_rate": 2.6783549250498315e-05, + "loss": 0.6975, + "step": 20480 + }, + { + "epoch": 2.47, + "grad_norm": 0.26435035467147827, + "learning_rate": 2.6724893024267348e-05, + "loss": 0.8694, + "step": 20485 + }, + { + "epoch": 2.47, + "grad_norm": 0.29340675473213196, + "learning_rate": 2.6666294815395612e-05, + "loss": 0.7117, + "step": 20490 + }, + { + "epoch": 2.47, + "grad_norm": 0.30014726519584656, + "learning_rate": 2.6607754651461317e-05, + "loss": 0.7437, + "step": 20495 + }, + { + "epoch": 2.47, + "grad_norm": 0.31490999460220337, + "learning_rate": 2.6549272560015406e-05, + "loss": 0.7764, + "step": 20500 + }, + { + "epoch": 2.47, + "grad_norm": 0.26503077149391174, + "learning_rate": 2.6490848568581462e-05, + "loss": 0.7501, + "step": 20505 + }, + { + "epoch": 2.47, + "grad_norm": 0.29323646426200867, + "learning_rate": 2.6432482704655845e-05, + "loss": 0.8158, + "step": 20510 + }, + { + "epoch": 2.47, + "grad_norm": 0.2801787555217743, + "learning_rate": 2.637417499570741e-05, + "loss": 0.7489, + "step": 20515 + }, + { + "epoch": 2.47, + "grad_norm": 0.2741035223007202, + "learning_rate": 2.631592546917774e-05, + "loss": 0.8378, + "step": 20520 + }, + { + "epoch": 2.47, + "grad_norm": 0.300493448972702, + "learning_rate": 2.6257734152480893e-05, + "loss": 0.7904, + "step": 20525 + }, + { + "epoch": 2.47, + "grad_norm": 0.2564448118209839, + "learning_rate": 2.619960107300374e-05, + "loss": 0.7423, + "step": 20530 + }, + { + "epoch": 2.47, + "grad_norm": 0.26825183629989624, + "learning_rate": 2.6141526258105615e-05, + "loss": 0.7046, + "step": 20535 + }, + { + "epoch": 2.47, + "grad_norm": 0.2867426872253418, + "learning_rate": 2.608350973511844e-05, + "loss": 0.8206, + "step": 20540 + }, + { + "epoch": 2.48, + "grad_norm": 0.2587963938713074, + "learning_rate": 2.6025551531346688e-05, + "loss": 0.7434, + "step": 20545 + }, + { + "epoch": 2.48, + "grad_norm": 0.3124673664569855, + "learning_rate": 2.5967651674067493e-05, + "loss": 0.7841, + "step": 20550 + }, + { + "epoch": 2.48, + "grad_norm": 0.28737810254096985, + "learning_rate": 2.59098101905304e-05, + "loss": 0.7924, + "step": 20555 + }, + { + "epoch": 2.48, + "grad_norm": 0.2854047417640686, + "learning_rate": 2.585202710795754e-05, + "loss": 0.7533, + "step": 20560 + }, + { + "epoch": 2.48, + "grad_norm": 0.24524690210819244, + "learning_rate": 2.5794302453543524e-05, + "loss": 0.8201, + "step": 20565 + }, + { + "epoch": 2.48, + "grad_norm": 0.30324557423591614, + "learning_rate": 2.573663625445554e-05, + "loss": 0.8067, + "step": 20570 + }, + { + "epoch": 2.48, + "grad_norm": 0.29187995195388794, + "learning_rate": 2.567902853783319e-05, + "loss": 0.8313, + "step": 20575 + }, + { + "epoch": 2.48, + "grad_norm": 0.26502034068107605, + "learning_rate": 2.562147933078859e-05, + "loss": 0.8097, + "step": 20580 + }, + { + "epoch": 2.48, + "grad_norm": 0.32918286323547363, + "learning_rate": 2.556398866040631e-05, + "loss": 0.8703, + "step": 20585 + }, + { + "epoch": 2.48, + "grad_norm": 0.3019461929798126, + "learning_rate": 2.5506556553743334e-05, + "loss": 0.8317, + "step": 20590 + }, + { + "epoch": 2.48, + "grad_norm": 0.28017765283584595, + "learning_rate": 2.5449183037829163e-05, + "loss": 0.8315, + "step": 20595 + }, + { + "epoch": 2.48, + "grad_norm": 0.31728842854499817, + "learning_rate": 2.5391868139665612e-05, + "loss": 0.8539, + "step": 20600 + }, + { + "epoch": 2.48, + "grad_norm": 0.26865053176879883, + "learning_rate": 2.533461188622707e-05, + "loss": 0.7489, + "step": 20605 + }, + { + "epoch": 2.48, + "grad_norm": 0.24447381496429443, + "learning_rate": 2.527741430446018e-05, + "loss": 0.7674, + "step": 20610 + }, + { + "epoch": 2.48, + "grad_norm": 0.3278535306453705, + "learning_rate": 2.5220275421284015e-05, + "loss": 0.7342, + "step": 20615 + }, + { + "epoch": 2.48, + "grad_norm": 0.30096837878227234, + "learning_rate": 2.5163195263590026e-05, + "loss": 0.7506, + "step": 20620 + }, + { + "epoch": 2.49, + "grad_norm": 0.28471359610557556, + "learning_rate": 2.5106173858242073e-05, + "loss": 0.7999, + "step": 20625 + }, + { + "epoch": 2.49, + "grad_norm": 0.2582080364227295, + "learning_rate": 2.5049211232076295e-05, + "loss": 0.8253, + "step": 20630 + }, + { + "epoch": 2.49, + "grad_norm": 0.2530338168144226, + "learning_rate": 2.499230741190121e-05, + "loss": 0.8581, + "step": 20635 + }, + { + "epoch": 2.49, + "grad_norm": 0.25447461009025574, + "learning_rate": 2.4935462424497576e-05, + "loss": 0.7793, + "step": 20640 + }, + { + "epoch": 2.49, + "grad_norm": 0.24925333261489868, + "learning_rate": 2.487867629661865e-05, + "loss": 0.7587, + "step": 20645 + }, + { + "epoch": 2.49, + "grad_norm": 0.2856590151786804, + "learning_rate": 2.4821949054989797e-05, + "loss": 0.7551, + "step": 20650 + }, + { + "epoch": 2.49, + "grad_norm": 0.2937513589859009, + "learning_rate": 2.476528072630875e-05, + "loss": 0.7448, + "step": 20655 + }, + { + "epoch": 2.49, + "grad_norm": 0.2708790600299835, + "learning_rate": 2.470867133724552e-05, + "loss": 0.7374, + "step": 20660 + }, + { + "epoch": 2.49, + "grad_norm": 0.2769574820995331, + "learning_rate": 2.4652120914442352e-05, + "loss": 0.8323, + "step": 20665 + }, + { + "epoch": 2.49, + "grad_norm": 0.2852955162525177, + "learning_rate": 2.459562948451375e-05, + "loss": 0.7135, + "step": 20670 + }, + { + "epoch": 2.49, + "grad_norm": 0.2564711272716522, + "learning_rate": 2.4539197074046467e-05, + "loss": 0.7752, + "step": 20675 + }, + { + "epoch": 2.49, + "grad_norm": 0.2757887840270996, + "learning_rate": 2.448282370959944e-05, + "loss": 0.7853, + "step": 20680 + }, + { + "epoch": 2.49, + "grad_norm": 0.2913254201412201, + "learning_rate": 2.44265094177039e-05, + "loss": 0.8027, + "step": 20685 + }, + { + "epoch": 2.49, + "grad_norm": 0.26618868112564087, + "learning_rate": 2.437025422486319e-05, + "loss": 0.7059, + "step": 20690 + }, + { + "epoch": 2.49, + "grad_norm": 0.24177007377147675, + "learning_rate": 2.4314058157552885e-05, + "loss": 0.7975, + "step": 20695 + }, + { + "epoch": 2.49, + "grad_norm": 0.26704952120780945, + "learning_rate": 2.4257921242220663e-05, + "loss": 0.751, + "step": 20700 + }, + { + "epoch": 2.49, + "grad_norm": 0.28091442584991455, + "learning_rate": 2.4201843505286507e-05, + "loss": 0.8622, + "step": 20705 + }, + { + "epoch": 2.5, + "grad_norm": 0.2535339891910553, + "learning_rate": 2.4145824973142435e-05, + "loss": 0.6663, + "step": 20710 + }, + { + "epoch": 2.5, + "grad_norm": 0.26714247465133667, + "learning_rate": 2.4089865672152613e-05, + "loss": 0.759, + "step": 20715 + }, + { + "epoch": 2.5, + "grad_norm": 0.2989547550678253, + "learning_rate": 2.4033965628653323e-05, + "loss": 0.8327, + "step": 20720 + }, + { + "epoch": 2.5, + "grad_norm": 0.270155131816864, + "learning_rate": 2.3978124868953037e-05, + "loss": 0.7935, + "step": 20725 + }, + { + "epoch": 2.5, + "grad_norm": 0.29163941740989685, + "learning_rate": 2.3922343419332247e-05, + "loss": 0.7799, + "step": 20730 + }, + { + "epoch": 2.5, + "grad_norm": 0.27277231216430664, + "learning_rate": 2.386662130604359e-05, + "loss": 0.7378, + "step": 20735 + }, + { + "epoch": 2.5, + "grad_norm": 0.308982253074646, + "learning_rate": 2.3810958555311647e-05, + "loss": 0.7067, + "step": 20740 + }, + { + "epoch": 2.5, + "grad_norm": 0.24645307660102844, + "learning_rate": 2.375535519333324e-05, + "loss": 0.6404, + "step": 20745 + }, + { + "epoch": 2.5, + "grad_norm": 0.30211979150772095, + "learning_rate": 2.3699811246277133e-05, + "loss": 0.7035, + "step": 20750 + }, + { + "epoch": 2.5, + "grad_norm": 0.27554258704185486, + "learning_rate": 2.3644326740284147e-05, + "loss": 0.8621, + "step": 20755 + }, + { + "epoch": 2.5, + "grad_norm": 0.24029062688350677, + "learning_rate": 2.358890170146711e-05, + "loss": 0.7073, + "step": 20760 + }, + { + "epoch": 2.5, + "grad_norm": 0.2912255525588989, + "learning_rate": 2.353353615591096e-05, + "loss": 0.7371, + "step": 20765 + }, + { + "epoch": 2.5, + "grad_norm": 0.29591822624206543, + "learning_rate": 2.3478230129672498e-05, + "loss": 0.6795, + "step": 20770 + }, + { + "epoch": 2.5, + "grad_norm": 0.274454802274704, + "learning_rate": 2.3422983648780606e-05, + "loss": 0.8204, + "step": 20775 + }, + { + "epoch": 2.5, + "grad_norm": 0.26619449257850647, + "learning_rate": 2.336779673923607e-05, + "loss": 0.8263, + "step": 20780 + }, + { + "epoch": 2.5, + "grad_norm": 0.2655404806137085, + "learning_rate": 2.3312669427011748e-05, + "loss": 0.8028, + "step": 20785 + }, + { + "epoch": 2.5, + "grad_norm": 0.2552018165588379, + "learning_rate": 2.3257601738052352e-05, + "loss": 0.8012, + "step": 20790 + }, + { + "epoch": 2.51, + "grad_norm": 0.2989104390144348, + "learning_rate": 2.3202593698274546e-05, + "loss": 0.8466, + "step": 20795 + }, + { + "epoch": 2.51, + "grad_norm": 0.25715914368629456, + "learning_rate": 2.3147645333566977e-05, + "loss": 0.7009, + "step": 20800 + }, + { + "epoch": 2.51, + "grad_norm": 0.29836973547935486, + "learning_rate": 2.309275666979014e-05, + "loss": 0.7897, + "step": 20805 + }, + { + "epoch": 2.51, + "grad_norm": 0.28271016478538513, + "learning_rate": 2.3037927732776472e-05, + "loss": 0.7738, + "step": 20810 + }, + { + "epoch": 2.51, + "grad_norm": 0.31232964992523193, + "learning_rate": 2.29831585483303e-05, + "loss": 0.7341, + "step": 20815 + }, + { + "epoch": 2.51, + "grad_norm": 0.29633277654647827, + "learning_rate": 2.292844914222777e-05, + "loss": 0.8116, + "step": 20820 + }, + { + "epoch": 2.51, + "grad_norm": 0.2850092649459839, + "learning_rate": 2.287379954021703e-05, + "loss": 0.7608, + "step": 20825 + }, + { + "epoch": 2.51, + "grad_norm": 0.2637297809123993, + "learning_rate": 2.2819209768017965e-05, + "loss": 0.7451, + "step": 20830 + }, + { + "epoch": 2.51, + "grad_norm": 0.239261195063591, + "learning_rate": 2.276467985132232e-05, + "loss": 0.8302, + "step": 20835 + }, + { + "epoch": 2.51, + "grad_norm": 0.3218871057033539, + "learning_rate": 2.271020981579365e-05, + "loss": 0.7533, + "step": 20840 + }, + { + "epoch": 2.51, + "grad_norm": 0.26522281765937805, + "learning_rate": 2.265579968706745e-05, + "loss": 0.6176, + "step": 20845 + }, + { + "epoch": 2.51, + "grad_norm": 0.3309444189071655, + "learning_rate": 2.2601449490750867e-05, + "loss": 0.68, + "step": 20850 + }, + { + "epoch": 2.51, + "grad_norm": 0.295600026845932, + "learning_rate": 2.254715925242294e-05, + "loss": 0.7778, + "step": 20855 + }, + { + "epoch": 2.51, + "grad_norm": 0.27975109219551086, + "learning_rate": 2.249292899763442e-05, + "loss": 0.8367, + "step": 20860 + }, + { + "epoch": 2.51, + "grad_norm": 0.301899790763855, + "learning_rate": 2.2438758751907933e-05, + "loss": 0.7383, + "step": 20865 + }, + { + "epoch": 2.51, + "grad_norm": 0.2855704724788666, + "learning_rate": 2.2384648540737736e-05, + "loss": 0.8029, + "step": 20870 + }, + { + "epoch": 2.52, + "grad_norm": 0.31557753682136536, + "learning_rate": 2.2330598389589915e-05, + "loss": 0.7673, + "step": 20875 + }, + { + "epoch": 2.52, + "grad_norm": 0.2771788239479065, + "learning_rate": 2.2276608323902266e-05, + "loss": 0.7175, + "step": 20880 + }, + { + "epoch": 2.52, + "grad_norm": 0.305113285779953, + "learning_rate": 2.22226783690843e-05, + "loss": 0.7742, + "step": 20885 + }, + { + "epoch": 2.52, + "grad_norm": 0.24871449172496796, + "learning_rate": 2.216880855051725e-05, + "loss": 0.678, + "step": 20890 + }, + { + "epoch": 2.52, + "grad_norm": 0.2921499013900757, + "learning_rate": 2.2114998893554042e-05, + "loss": 0.7413, + "step": 20895 + }, + { + "epoch": 2.52, + "grad_norm": 0.3036426603794098, + "learning_rate": 2.2061249423519244e-05, + "loss": 0.7398, + "step": 20900 + }, + { + "epoch": 2.52, + "grad_norm": 0.27135762572288513, + "learning_rate": 2.200756016570922e-05, + "loss": 0.7812, + "step": 20905 + }, + { + "epoch": 2.52, + "grad_norm": 0.2812870442867279, + "learning_rate": 2.1953931145391872e-05, + "loss": 0.7961, + "step": 20910 + }, + { + "epoch": 2.52, + "grad_norm": 0.3006354570388794, + "learning_rate": 2.1900362387806812e-05, + "loss": 0.7816, + "step": 20915 + }, + { + "epoch": 2.52, + "grad_norm": 0.27853453159332275, + "learning_rate": 2.1846853918165223e-05, + "loss": 0.7039, + "step": 20920 + }, + { + "epoch": 2.52, + "grad_norm": 0.29205799102783203, + "learning_rate": 2.1793405761650062e-05, + "loss": 0.8397, + "step": 20925 + }, + { + "epoch": 2.52, + "grad_norm": 0.27953484654426575, + "learning_rate": 2.1740017943415754e-05, + "loss": 0.8072, + "step": 20930 + }, + { + "epoch": 2.52, + "grad_norm": 0.3323310315608978, + "learning_rate": 2.1686690488588387e-05, + "loss": 0.7661, + "step": 20935 + }, + { + "epoch": 2.52, + "grad_norm": 0.33003172278404236, + "learning_rate": 2.16334234222656e-05, + "loss": 0.7778, + "step": 20940 + }, + { + "epoch": 2.52, + "grad_norm": 0.24858631193637848, + "learning_rate": 2.1580216769516738e-05, + "loss": 0.914, + "step": 20945 + }, + { + "epoch": 2.52, + "grad_norm": 0.2743765115737915, + "learning_rate": 2.152707055538251e-05, + "loss": 0.9042, + "step": 20950 + }, + { + "epoch": 2.52, + "grad_norm": 0.283892959356308, + "learning_rate": 2.1473984804875332e-05, + "loss": 0.8793, + "step": 20955 + }, + { + "epoch": 2.53, + "grad_norm": 0.28925827145576477, + "learning_rate": 2.1420959542979095e-05, + "loss": 0.7409, + "step": 20960 + }, + { + "epoch": 2.53, + "grad_norm": 0.24598172307014465, + "learning_rate": 2.1367994794649312e-05, + "loss": 0.7843, + "step": 20965 + }, + { + "epoch": 2.53, + "grad_norm": 0.2719762623310089, + "learning_rate": 2.1315090584812917e-05, + "loss": 0.9156, + "step": 20970 + }, + { + "epoch": 2.53, + "grad_norm": 0.29581862688064575, + "learning_rate": 2.126224693836837e-05, + "loss": 0.7598, + "step": 20975 + }, + { + "epoch": 2.53, + "grad_norm": 0.27417975664138794, + "learning_rate": 2.120946388018565e-05, + "loss": 0.8218, + "step": 20980 + }, + { + "epoch": 2.53, + "grad_norm": 0.27194756269454956, + "learning_rate": 2.115674143510625e-05, + "loss": 0.8978, + "step": 20985 + }, + { + "epoch": 2.53, + "grad_norm": 0.27011337876319885, + "learning_rate": 2.1104079627943087e-05, + "loss": 0.8051, + "step": 20990 + }, + { + "epoch": 2.53, + "grad_norm": 0.26955413818359375, + "learning_rate": 2.1051478483480555e-05, + "loss": 0.7435, + "step": 20995 + }, + { + "epoch": 2.53, + "grad_norm": 0.26142939925193787, + "learning_rate": 2.099893802647445e-05, + "loss": 0.7034, + "step": 21000 + }, + { + "epoch": 2.53, + "grad_norm": 0.27008065581321716, + "learning_rate": 2.0946458281652155e-05, + "loss": 0.7948, + "step": 21005 + }, + { + "epoch": 2.53, + "grad_norm": 0.2838139235973358, + "learning_rate": 2.089403927371232e-05, + "loss": 0.7694, + "step": 21010 + }, + { + "epoch": 2.53, + "grad_norm": 0.2932431399822235, + "learning_rate": 2.0841681027325075e-05, + "loss": 0.8528, + "step": 21015 + }, + { + "epoch": 2.53, + "grad_norm": 0.28293853998184204, + "learning_rate": 2.0789383567131955e-05, + "loss": 0.7757, + "step": 21020 + }, + { + "epoch": 2.53, + "grad_norm": 0.27947354316711426, + "learning_rate": 2.0737146917745878e-05, + "loss": 0.7972, + "step": 21025 + }, + { + "epoch": 2.53, + "grad_norm": 0.2797410190105438, + "learning_rate": 2.0684971103751158e-05, + "loss": 0.8079, + "step": 21030 + }, + { + "epoch": 2.53, + "grad_norm": 0.2733669877052307, + "learning_rate": 2.0632856149703454e-05, + "loss": 0.7684, + "step": 21035 + }, + { + "epoch": 2.54, + "grad_norm": 0.2936958074569702, + "learning_rate": 2.0580802080129744e-05, + "loss": 0.7623, + "step": 21040 + }, + { + "epoch": 2.54, + "grad_norm": 0.2874435782432556, + "learning_rate": 2.05288089195285e-05, + "loss": 0.752, + "step": 21045 + }, + { + "epoch": 2.54, + "grad_norm": 0.2876211404800415, + "learning_rate": 2.0476876692369398e-05, + "loss": 0.7191, + "step": 21050 + }, + { + "epoch": 2.54, + "grad_norm": 0.3623277544975281, + "learning_rate": 2.0425005423093437e-05, + "loss": 0.7588, + "step": 21055 + }, + { + "epoch": 2.54, + "grad_norm": 0.30501341819763184, + "learning_rate": 2.037319513611294e-05, + "loss": 0.723, + "step": 21060 + }, + { + "epoch": 2.54, + "grad_norm": 0.29336845874786377, + "learning_rate": 2.0321445855811635e-05, + "loss": 0.8807, + "step": 21065 + }, + { + "epoch": 2.54, + "grad_norm": 0.285491019487381, + "learning_rate": 2.026975760654439e-05, + "loss": 0.8104, + "step": 21070 + }, + { + "epoch": 2.54, + "grad_norm": 0.27054548263549805, + "learning_rate": 2.0218130412637397e-05, + "loss": 0.7187, + "step": 21075 + }, + { + "epoch": 2.54, + "grad_norm": 0.28753402829170227, + "learning_rate": 2.0166564298388182e-05, + "loss": 0.8717, + "step": 21080 + }, + { + "epoch": 2.54, + "grad_norm": 0.25017327070236206, + "learning_rate": 2.0115059288065437e-05, + "loss": 0.7553, + "step": 21085 + }, + { + "epoch": 2.54, + "grad_norm": 0.2623300850391388, + "learning_rate": 2.006361540590914e-05, + "loss": 0.8769, + "step": 21090 + }, + { + "epoch": 2.54, + "grad_norm": 0.2736659348011017, + "learning_rate": 2.0012232676130503e-05, + "loss": 0.8399, + "step": 21095 + }, + { + "epoch": 2.54, + "grad_norm": 0.3095528483390808, + "learning_rate": 1.9960911122911915e-05, + "loss": 0.6941, + "step": 21100 + }, + { + "epoch": 2.54, + "grad_norm": 0.2801484763622284, + "learning_rate": 1.990965077040701e-05, + "loss": 0.8403, + "step": 21105 + }, + { + "epoch": 2.54, + "grad_norm": 0.3449302017688751, + "learning_rate": 1.9858451642740637e-05, + "loss": 0.7571, + "step": 21110 + }, + { + "epoch": 2.54, + "grad_norm": 0.27275073528289795, + "learning_rate": 1.9807313764008738e-05, + "loss": 0.8058, + "step": 21115 + }, + { + "epoch": 2.54, + "grad_norm": 0.3031749129295349, + "learning_rate": 1.9756237158278593e-05, + "loss": 0.7587, + "step": 21120 + }, + { + "epoch": 2.55, + "grad_norm": 0.27661368250846863, + "learning_rate": 1.9705221849588493e-05, + "loss": 0.8447, + "step": 21125 + }, + { + "epoch": 2.55, + "grad_norm": 0.3269336223602295, + "learning_rate": 1.9654267861947946e-05, + "loss": 0.8247, + "step": 21130 + }, + { + "epoch": 2.55, + "grad_norm": 0.2593838572502136, + "learning_rate": 1.9603375219337573e-05, + "loss": 0.8053, + "step": 21135 + }, + { + "epoch": 2.55, + "grad_norm": 0.2844938039779663, + "learning_rate": 1.9552543945709183e-05, + "loss": 0.8455, + "step": 21140 + }, + { + "epoch": 2.55, + "grad_norm": 0.28493261337280273, + "learning_rate": 1.950177406498563e-05, + "loss": 0.8471, + "step": 21145 + }, + { + "epoch": 2.55, + "grad_norm": 0.2901061773300171, + "learning_rate": 1.9451065601060927e-05, + "loss": 0.6899, + "step": 21150 + }, + { + "epoch": 2.55, + "grad_norm": 0.27742087841033936, + "learning_rate": 1.9400418577800148e-05, + "loss": 0.7318, + "step": 21155 + }, + { + "epoch": 2.55, + "grad_norm": 0.2615971565246582, + "learning_rate": 1.9349833019039457e-05, + "loss": 0.8078, + "step": 21160 + }, + { + "epoch": 2.55, + "grad_norm": 0.3021378219127655, + "learning_rate": 1.929930894858611e-05, + "loss": 0.812, + "step": 21165 + }, + { + "epoch": 2.55, + "grad_norm": 0.29150012135505676, + "learning_rate": 1.924884639021841e-05, + "loss": 0.9014, + "step": 21170 + }, + { + "epoch": 2.55, + "grad_norm": 0.2782303988933563, + "learning_rate": 1.9198445367685668e-05, + "loss": 0.7307, + "step": 21175 + }, + { + "epoch": 2.55, + "grad_norm": 0.26657816767692566, + "learning_rate": 1.914810590470836e-05, + "loss": 0.8146, + "step": 21180 + }, + { + "epoch": 2.55, + "grad_norm": 0.27062734961509705, + "learning_rate": 1.909782802497786e-05, + "loss": 0.8376, + "step": 21185 + }, + { + "epoch": 2.55, + "grad_norm": 0.3097752630710602, + "learning_rate": 1.9047611752156628e-05, + "loss": 0.7168, + "step": 21190 + }, + { + "epoch": 2.55, + "grad_norm": 0.2954844534397125, + "learning_rate": 1.8997457109878067e-05, + "loss": 0.6942, + "step": 21195 + }, + { + "epoch": 2.55, + "grad_norm": 0.29877805709838867, + "learning_rate": 1.8947364121746677e-05, + "loss": 0.7444, + "step": 21200 + }, + { + "epoch": 2.55, + "grad_norm": 0.303915798664093, + "learning_rate": 1.8897332811337868e-05, + "loss": 0.7517, + "step": 21205 + }, + { + "epoch": 2.56, + "grad_norm": 0.2891033887863159, + "learning_rate": 1.8847363202198017e-05, + "loss": 0.7324, + "step": 21210 + }, + { + "epoch": 2.56, + "grad_norm": 0.27175503969192505, + "learning_rate": 1.8797455317844452e-05, + "loss": 0.7523, + "step": 21215 + }, + { + "epoch": 2.56, + "grad_norm": 0.32238414883613586, + "learning_rate": 1.8747609181765543e-05, + "loss": 0.7668, + "step": 21220 + }, + { + "epoch": 2.56, + "grad_norm": 0.28903838992118835, + "learning_rate": 1.8697824817420514e-05, + "loss": 0.796, + "step": 21225 + }, + { + "epoch": 2.56, + "grad_norm": 0.25564032793045044, + "learning_rate": 1.8648102248239537e-05, + "loss": 0.8034, + "step": 21230 + }, + { + "epoch": 2.56, + "grad_norm": 0.30182021856307983, + "learning_rate": 1.859844149762371e-05, + "loss": 0.7384, + "step": 21235 + }, + { + "epoch": 2.56, + "grad_norm": 0.33114859461784363, + "learning_rate": 1.8548842588945023e-05, + "loss": 0.8908, + "step": 21240 + }, + { + "epoch": 2.56, + "grad_norm": 0.2683655917644501, + "learning_rate": 1.8499305545546367e-05, + "loss": 0.7694, + "step": 21245 + }, + { + "epoch": 2.56, + "grad_norm": 0.30291497707366943, + "learning_rate": 1.8449830390741526e-05, + "loss": 0.8817, + "step": 21250 + }, + { + "epoch": 2.56, + "grad_norm": 0.2633529007434845, + "learning_rate": 1.840041714781511e-05, + "loss": 0.737, + "step": 21255 + }, + { + "epoch": 2.56, + "grad_norm": 0.27262598276138306, + "learning_rate": 1.8351065840022715e-05, + "loss": 0.7905, + "step": 21260 + }, + { + "epoch": 2.56, + "grad_norm": 0.28199708461761475, + "learning_rate": 1.8301776490590664e-05, + "loss": 0.7542, + "step": 21265 + }, + { + "epoch": 2.56, + "grad_norm": 0.3084707260131836, + "learning_rate": 1.8252549122716147e-05, + "loss": 0.6572, + "step": 21270 + }, + { + "epoch": 2.56, + "grad_norm": 0.27886369824409485, + "learning_rate": 1.8203383759567186e-05, + "loss": 0.7748, + "step": 21275 + }, + { + "epoch": 2.56, + "grad_norm": 0.2864547669887543, + "learning_rate": 1.8154280424282685e-05, + "loss": 0.8352, + "step": 21280 + }, + { + "epoch": 2.56, + "grad_norm": 0.26778092980384827, + "learning_rate": 1.8105239139972277e-05, + "loss": 0.7049, + "step": 21285 + }, + { + "epoch": 2.57, + "grad_norm": 0.28581252694129944, + "learning_rate": 1.8056259929716417e-05, + "loss": 0.7679, + "step": 21290 + }, + { + "epoch": 2.57, + "grad_norm": 0.30422863364219666, + "learning_rate": 1.800734281656633e-05, + "loss": 0.7548, + "step": 21295 + }, + { + "epoch": 2.57, + "grad_norm": 0.2808743715286255, + "learning_rate": 1.7958487823544082e-05, + "loss": 0.7986, + "step": 21300 + }, + { + "epoch": 2.57, + "grad_norm": 0.3032390773296356, + "learning_rate": 1.790969497364242e-05, + "loss": 0.8185, + "step": 21305 + }, + { + "epoch": 2.57, + "grad_norm": 0.2701072096824646, + "learning_rate": 1.7860964289824885e-05, + "loss": 0.7967, + "step": 21310 + }, + { + "epoch": 2.57, + "grad_norm": 0.28802967071533203, + "learning_rate": 1.7812295795025773e-05, + "loss": 0.7764, + "step": 21315 + }, + { + "epoch": 2.57, + "grad_norm": 0.2924157381057739, + "learning_rate": 1.776368951215007e-05, + "loss": 0.7171, + "step": 21320 + }, + { + "epoch": 2.57, + "grad_norm": 0.2745770812034607, + "learning_rate": 1.7715145464073498e-05, + "loss": 0.806, + "step": 21325 + }, + { + "epoch": 2.57, + "grad_norm": 0.2635575830936432, + "learning_rate": 1.7666663673642535e-05, + "loss": 0.7911, + "step": 21330 + }, + { + "epoch": 2.57, + "grad_norm": 0.2602815330028534, + "learning_rate": 1.761824416367427e-05, + "loss": 0.8123, + "step": 21335 + }, + { + "epoch": 2.57, + "grad_norm": 0.30612412095069885, + "learning_rate": 1.7569886956956585e-05, + "loss": 0.8126, + "step": 21340 + }, + { + "epoch": 2.57, + "grad_norm": 0.2764699161052704, + "learning_rate": 1.752159207624797e-05, + "loss": 0.8279, + "step": 21345 + }, + { + "epoch": 2.57, + "grad_norm": 0.2635042667388916, + "learning_rate": 1.7473359544277594e-05, + "loss": 0.6854, + "step": 21350 + }, + { + "epoch": 2.57, + "grad_norm": 0.25210273265838623, + "learning_rate": 1.7425189383745265e-05, + "loss": 0.8533, + "step": 21355 + }, + { + "epoch": 2.57, + "grad_norm": 0.26929739117622375, + "learning_rate": 1.7377081617321513e-05, + "loss": 0.7162, + "step": 21360 + }, + { + "epoch": 2.57, + "grad_norm": 0.27640727162361145, + "learning_rate": 1.7329036267647417e-05, + "loss": 0.7429, + "step": 21365 + }, + { + "epoch": 2.57, + "grad_norm": 0.2647760510444641, + "learning_rate": 1.7281053357334717e-05, + "loss": 0.8501, + "step": 21370 + }, + { + "epoch": 2.58, + "grad_norm": 0.28919151425361633, + "learning_rate": 1.7233132908965784e-05, + "loss": 0.7655, + "step": 21375 + }, + { + "epoch": 2.58, + "grad_norm": 0.295827180147171, + "learning_rate": 1.7185274945093547e-05, + "loss": 0.8074, + "step": 21380 + }, + { + "epoch": 2.58, + "grad_norm": 0.2644157409667969, + "learning_rate": 1.713747948824155e-05, + "loss": 0.8458, + "step": 21385 + }, + { + "epoch": 2.58, + "grad_norm": 0.2902376055717468, + "learning_rate": 1.7089746560903956e-05, + "loss": 0.8744, + "step": 21390 + }, + { + "epoch": 2.58, + "grad_norm": 0.25895097851753235, + "learning_rate": 1.7042076185545388e-05, + "loss": 0.7331, + "step": 21395 + }, + { + "epoch": 2.58, + "grad_norm": 0.2989761233329773, + "learning_rate": 1.6994468384601206e-05, + "loss": 0.7485, + "step": 21400 + }, + { + "epoch": 2.58, + "grad_norm": 0.26505047082901, + "learning_rate": 1.6946923180477183e-05, + "loss": 0.7844, + "step": 21405 + }, + { + "epoch": 2.58, + "grad_norm": 0.26280462741851807, + "learning_rate": 1.6899440595549674e-05, + "loss": 0.7854, + "step": 21410 + }, + { + "epoch": 2.58, + "grad_norm": 0.2874717712402344, + "learning_rate": 1.6852020652165515e-05, + "loss": 0.7819, + "step": 21415 + }, + { + "epoch": 2.58, + "grad_norm": 0.28164345026016235, + "learning_rate": 1.6804663372642175e-05, + "loss": 0.7315, + "step": 21420 + }, + { + "epoch": 2.58, + "grad_norm": 0.2647458016872406, + "learning_rate": 1.6757368779267522e-05, + "loss": 0.8445, + "step": 21425 + }, + { + "epoch": 2.58, + "grad_norm": 0.26932641863822937, + "learning_rate": 1.6710136894299987e-05, + "loss": 0.763, + "step": 21430 + }, + { + "epoch": 2.58, + "grad_norm": 0.2817295491695404, + "learning_rate": 1.6662967739968402e-05, + "loss": 0.8031, + "step": 21435 + }, + { + "epoch": 2.58, + "grad_norm": 0.303753137588501, + "learning_rate": 1.661586133847223e-05, + "loss": 0.6969, + "step": 21440 + }, + { + "epoch": 2.58, + "grad_norm": 0.27712610363960266, + "learning_rate": 1.656881771198125e-05, + "loss": 0.8331, + "step": 21445 + }, + { + "epoch": 2.58, + "grad_norm": 0.28993546962738037, + "learning_rate": 1.6521836882635775e-05, + "loss": 0.7937, + "step": 21450 + }, + { + "epoch": 2.59, + "grad_norm": 0.2543039619922638, + "learning_rate": 1.6474918872546545e-05, + "loss": 0.7681, + "step": 21455 + }, + { + "epoch": 2.59, + "grad_norm": 0.3048137426376343, + "learning_rate": 1.6428063703794714e-05, + "loss": 0.8169, + "step": 21460 + }, + { + "epoch": 2.59, + "grad_norm": 0.30479833483695984, + "learning_rate": 1.6381271398431894e-05, + "loss": 0.7523, + "step": 21465 + }, + { + "epoch": 2.59, + "grad_norm": 0.311480849981308, + "learning_rate": 1.6334541978480087e-05, + "loss": 0.7312, + "step": 21470 + }, + { + "epoch": 2.59, + "grad_norm": 0.2890738844871521, + "learning_rate": 1.6287875465931698e-05, + "loss": 0.746, + "step": 21475 + }, + { + "epoch": 2.59, + "grad_norm": 0.2721187472343445, + "learning_rate": 1.624127188274958e-05, + "loss": 0.827, + "step": 21480 + }, + { + "epoch": 2.59, + "grad_norm": 0.2833477556705475, + "learning_rate": 1.61947312508669e-05, + "loss": 0.7932, + "step": 21485 + }, + { + "epoch": 2.59, + "grad_norm": 0.28032922744750977, + "learning_rate": 1.614825359218724e-05, + "loss": 0.7861, + "step": 21490 + }, + { + "epoch": 2.59, + "grad_norm": 0.2863568067550659, + "learning_rate": 1.6101838928584483e-05, + "loss": 0.6808, + "step": 21495 + }, + { + "epoch": 2.59, + "grad_norm": 0.31328871846199036, + "learning_rate": 1.6055487281902973e-05, + "loss": 0.8056, + "step": 21500 + }, + { + "epoch": 2.59, + "grad_norm": 0.28002050518989563, + "learning_rate": 1.6009198673957313e-05, + "loss": 0.84, + "step": 21505 + }, + { + "epoch": 2.59, + "grad_norm": 0.28880271315574646, + "learning_rate": 1.5962973126532457e-05, + "loss": 0.8734, + "step": 21510 + }, + { + "epoch": 2.59, + "grad_norm": 0.2959395945072174, + "learning_rate": 1.5916810661383638e-05, + "loss": 0.7527, + "step": 21515 + }, + { + "epoch": 2.59, + "grad_norm": 0.28988808393478394, + "learning_rate": 1.5870711300236527e-05, + "loss": 0.6993, + "step": 21520 + }, + { + "epoch": 2.59, + "grad_norm": 0.3125680983066559, + "learning_rate": 1.5824675064786968e-05, + "loss": 0.7884, + "step": 21525 + }, + { + "epoch": 2.59, + "grad_norm": 0.31437787413597107, + "learning_rate": 1.577870197670118e-05, + "loss": 0.6855, + "step": 21530 + }, + { + "epoch": 2.59, + "grad_norm": 0.2919071316719055, + "learning_rate": 1.573279205761554e-05, + "loss": 0.8805, + "step": 21535 + }, + { + "epoch": 2.6, + "grad_norm": 0.288947194814682, + "learning_rate": 1.5686945329136865e-05, + "loss": 0.7189, + "step": 21540 + }, + { + "epoch": 2.6, + "grad_norm": 0.28742465376853943, + "learning_rate": 1.5641161812842105e-05, + "loss": 0.8336, + "step": 21545 + }, + { + "epoch": 2.6, + "grad_norm": 0.2684331238269806, + "learning_rate": 1.5595441530278517e-05, + "loss": 0.7574, + "step": 21550 + }, + { + "epoch": 2.6, + "grad_norm": 0.29591086506843567, + "learning_rate": 1.5549784502963554e-05, + "loss": 0.7995, + "step": 21555 + }, + { + "epoch": 2.6, + "grad_norm": 0.25457319617271423, + "learning_rate": 1.5504190752384987e-05, + "loss": 0.7078, + "step": 21560 + }, + { + "epoch": 2.6, + "grad_norm": 0.28611642122268677, + "learning_rate": 1.5458660300000725e-05, + "loss": 0.7801, + "step": 21565 + }, + { + "epoch": 2.6, + "grad_norm": 0.2762448489665985, + "learning_rate": 1.5413193167238908e-05, + "loss": 0.7721, + "step": 21570 + }, + { + "epoch": 2.6, + "grad_norm": 0.29619264602661133, + "learning_rate": 1.5367789375497836e-05, + "loss": 0.8263, + "step": 21575 + }, + { + "epoch": 2.6, + "grad_norm": 0.27710941433906555, + "learning_rate": 1.5322448946146114e-05, + "loss": 0.9136, + "step": 21580 + }, + { + "epoch": 2.6, + "grad_norm": 0.2905004322528839, + "learning_rate": 1.5277171900522428e-05, + "loss": 0.7735, + "step": 21585 + }, + { + "epoch": 2.6, + "grad_norm": 0.26848629117012024, + "learning_rate": 1.5231958259935646e-05, + "loss": 0.7816, + "step": 21590 + }, + { + "epoch": 2.6, + "grad_norm": 0.28664660453796387, + "learning_rate": 1.5186808045664812e-05, + "loss": 0.7792, + "step": 21595 + }, + { + "epoch": 2.6, + "grad_norm": 0.3380061984062195, + "learning_rate": 1.5141721278959112e-05, + "loss": 0.7895, + "step": 21600 + }, + { + "epoch": 2.6, + "grad_norm": 0.26987069845199585, + "learning_rate": 1.5096697981037909e-05, + "loss": 0.7762, + "step": 21605 + }, + { + "epoch": 2.6, + "grad_norm": 0.2945181131362915, + "learning_rate": 1.5051738173090582e-05, + "loss": 0.7818, + "step": 21610 + }, + { + "epoch": 2.6, + "grad_norm": 0.30163273215293884, + "learning_rate": 1.500684187627681e-05, + "loss": 0.7005, + "step": 21615 + }, + { + "epoch": 2.6, + "grad_norm": 0.28872594237327576, + "learning_rate": 1.496200911172622e-05, + "loss": 0.7485, + "step": 21620 + }, + { + "epoch": 2.61, + "grad_norm": 0.2600472569465637, + "learning_rate": 1.491723990053862e-05, + "loss": 0.7653, + "step": 21625 + }, + { + "epoch": 2.61, + "grad_norm": 0.2789260745048523, + "learning_rate": 1.487253426378387e-05, + "loss": 0.6609, + "step": 21630 + }, + { + "epoch": 2.61, + "grad_norm": 0.23983633518218994, + "learning_rate": 1.482789222250194e-05, + "loss": 0.7225, + "step": 21635 + }, + { + "epoch": 2.61, + "grad_norm": 0.27560335397720337, + "learning_rate": 1.4783313797702878e-05, + "loss": 0.8625, + "step": 21640 + }, + { + "epoch": 2.61, + "grad_norm": 0.2681236267089844, + "learning_rate": 1.4738799010366747e-05, + "loss": 0.7163, + "step": 21645 + }, + { + "epoch": 2.61, + "grad_norm": 0.2975192070007324, + "learning_rate": 1.4694347881443659e-05, + "loss": 0.7637, + "step": 21650 + }, + { + "epoch": 2.61, + "grad_norm": 0.3074178397655487, + "learning_rate": 1.4649960431853842e-05, + "loss": 0.7499, + "step": 21655 + }, + { + "epoch": 2.61, + "grad_norm": 0.28553348779678345, + "learning_rate": 1.4605636682487486e-05, + "loss": 0.8284, + "step": 21660 + }, + { + "epoch": 2.61, + "grad_norm": 0.29183608293533325, + "learning_rate": 1.456137665420481e-05, + "loss": 0.7162, + "step": 21665 + }, + { + "epoch": 2.61, + "grad_norm": 0.2632047235965729, + "learning_rate": 1.4517180367836062e-05, + "loss": 0.8519, + "step": 21670 + }, + { + "epoch": 2.61, + "grad_norm": 0.28933554887771606, + "learning_rate": 1.4473047844181474e-05, + "loss": 0.7592, + "step": 21675 + }, + { + "epoch": 2.61, + "grad_norm": 0.30451327562332153, + "learning_rate": 1.4428979104011295e-05, + "loss": 0.7692, + "step": 21680 + }, + { + "epoch": 2.61, + "grad_norm": 0.27428263425827026, + "learning_rate": 1.4384974168065705e-05, + "loss": 0.7196, + "step": 21685 + }, + { + "epoch": 2.61, + "grad_norm": 0.2668101191520691, + "learning_rate": 1.4341033057054885e-05, + "loss": 0.7442, + "step": 21690 + }, + { + "epoch": 2.61, + "grad_norm": 0.27039745450019836, + "learning_rate": 1.4297155791659044e-05, + "loss": 0.7293, + "step": 21695 + }, + { + "epoch": 2.61, + "grad_norm": 0.2836028039455414, + "learning_rate": 1.4253342392528227e-05, + "loss": 0.8122, + "step": 21700 + }, + { + "epoch": 2.62, + "grad_norm": 0.26885733008384705, + "learning_rate": 1.4209592880282494e-05, + "loss": 0.6921, + "step": 21705 + }, + { + "epoch": 2.62, + "grad_norm": 0.274857759475708, + "learning_rate": 1.4165907275511773e-05, + "loss": 0.7754, + "step": 21710 + }, + { + "epoch": 2.62, + "grad_norm": 0.29404956102371216, + "learning_rate": 1.4122285598776035e-05, + "loss": 0.7997, + "step": 21715 + }, + { + "epoch": 2.62, + "grad_norm": 0.29239895939826965, + "learning_rate": 1.4078727870605056e-05, + "loss": 0.6922, + "step": 21720 + }, + { + "epoch": 2.62, + "grad_norm": 0.2807917892932892, + "learning_rate": 1.4035234111498539e-05, + "loss": 0.8521, + "step": 21725 + }, + { + "epoch": 2.62, + "grad_norm": 0.26643988490104675, + "learning_rate": 1.3991804341926077e-05, + "loss": 0.8503, + "step": 21730 + }, + { + "epoch": 2.62, + "grad_norm": 0.2644132375717163, + "learning_rate": 1.394843858232722e-05, + "loss": 0.7678, + "step": 21735 + }, + { + "epoch": 2.62, + "grad_norm": 0.2644047141075134, + "learning_rate": 1.390513685311131e-05, + "loss": 0.8423, + "step": 21740 + }, + { + "epoch": 2.62, + "grad_norm": 0.2851656675338745, + "learning_rate": 1.3861899174657542e-05, + "loss": 0.7707, + "step": 21745 + }, + { + "epoch": 2.62, + "grad_norm": 0.2632659673690796, + "learning_rate": 1.381872556731501e-05, + "loss": 0.8024, + "step": 21750 + }, + { + "epoch": 2.62, + "grad_norm": 0.2766389846801758, + "learning_rate": 1.3775616051402689e-05, + "loss": 0.7274, + "step": 21755 + }, + { + "epoch": 2.62, + "grad_norm": 0.3009210228919983, + "learning_rate": 1.3732570647209334e-05, + "loss": 0.6989, + "step": 21760 + }, + { + "epoch": 2.62, + "grad_norm": 0.2986798584461212, + "learning_rate": 1.3689589374993526e-05, + "loss": 0.7043, + "step": 21765 + }, + { + "epoch": 2.62, + "grad_norm": 0.2332366406917572, + "learning_rate": 1.3646672254983649e-05, + "loss": 0.7085, + "step": 21770 + }, + { + "epoch": 2.62, + "grad_norm": 0.2506479322910309, + "learning_rate": 1.3603819307378011e-05, + "loss": 0.8323, + "step": 21775 + }, + { + "epoch": 2.62, + "grad_norm": 0.2667887806892395, + "learning_rate": 1.3561030552344566e-05, + "loss": 0.8099, + "step": 21780 + }, + { + "epoch": 2.62, + "grad_norm": 0.26609328389167786, + "learning_rate": 1.3518306010021152e-05, + "loss": 0.805, + "step": 21785 + }, + { + "epoch": 2.63, + "grad_norm": 0.2911878824234009, + "learning_rate": 1.3475645700515319e-05, + "loss": 0.8367, + "step": 21790 + }, + { + "epoch": 2.63, + "grad_norm": 0.2831841707229614, + "learning_rate": 1.3433049643904476e-05, + "loss": 0.8377, + "step": 21795 + }, + { + "epoch": 2.63, + "grad_norm": 0.2804235517978668, + "learning_rate": 1.3390517860235717e-05, + "loss": 0.8026, + "step": 21800 + }, + { + "epoch": 2.63, + "grad_norm": 0.28793367743492126, + "learning_rate": 1.3348050369525931e-05, + "loss": 0.7157, + "step": 21805 + }, + { + "epoch": 2.63, + "grad_norm": 0.28661012649536133, + "learning_rate": 1.3305647191761699e-05, + "loss": 0.729, + "step": 21810 + }, + { + "epoch": 2.63, + "grad_norm": 0.2898370027542114, + "learning_rate": 1.3263308346899371e-05, + "loss": 0.7045, + "step": 21815 + }, + { + "epoch": 2.63, + "grad_norm": 0.26269808411598206, + "learning_rate": 1.3221033854865027e-05, + "loss": 0.7042, + "step": 21820 + }, + { + "epoch": 2.63, + "grad_norm": 0.29441672563552856, + "learning_rate": 1.3178823735554434e-05, + "loss": 0.7476, + "step": 21825 + }, + { + "epoch": 2.63, + "grad_norm": 0.25631454586982727, + "learning_rate": 1.3136678008833069e-05, + "loss": 0.8577, + "step": 21830 + }, + { + "epoch": 2.63, + "grad_norm": 0.32068613171577454, + "learning_rate": 1.309459669453613e-05, + "loss": 0.8105, + "step": 21835 + }, + { + "epoch": 2.63, + "grad_norm": 0.23884128034114838, + "learning_rate": 1.3052579812468494e-05, + "loss": 0.7782, + "step": 21840 + }, + { + "epoch": 2.63, + "grad_norm": 0.3259485363960266, + "learning_rate": 1.3010627382404675e-05, + "loss": 0.8539, + "step": 21845 + }, + { + "epoch": 2.63, + "grad_norm": 0.30076032876968384, + "learning_rate": 1.2968739424088848e-05, + "loss": 0.818, + "step": 21850 + }, + { + "epoch": 2.63, + "grad_norm": 0.26317545771598816, + "learning_rate": 1.2926915957234957e-05, + "loss": 0.766, + "step": 21855 + }, + { + "epoch": 2.63, + "grad_norm": 0.23999077081680298, + "learning_rate": 1.2885157001526459e-05, + "loss": 0.7286, + "step": 21860 + }, + { + "epoch": 2.63, + "grad_norm": 0.23871669173240662, + "learning_rate": 1.2843462576616531e-05, + "loss": 0.7398, + "step": 21865 + }, + { + "epoch": 2.64, + "grad_norm": 0.25987398624420166, + "learning_rate": 1.2801832702127912e-05, + "loss": 0.7984, + "step": 21870 + }, + { + "epoch": 2.64, + "grad_norm": 0.25374579429626465, + "learning_rate": 1.2760267397653063e-05, + "loss": 0.7778, + "step": 21875 + }, + { + "epoch": 2.64, + "grad_norm": 0.3066677749156952, + "learning_rate": 1.2718766682753966e-05, + "loss": 0.8205, + "step": 21880 + }, + { + "epoch": 2.64, + "grad_norm": 0.2812236547470093, + "learning_rate": 1.2677330576962235e-05, + "loss": 0.8151, + "step": 21885 + }, + { + "epoch": 2.64, + "grad_norm": 0.2993893027305603, + "learning_rate": 1.263595909977907e-05, + "loss": 0.7551, + "step": 21890 + }, + { + "epoch": 2.64, + "grad_norm": 0.2752331495285034, + "learning_rate": 1.2594652270675293e-05, + "loss": 0.8551, + "step": 21895 + }, + { + "epoch": 2.64, + "grad_norm": 0.29263609647750854, + "learning_rate": 1.2553410109091221e-05, + "loss": 0.846, + "step": 21900 + }, + { + "epoch": 2.64, + "grad_norm": 0.28177276253700256, + "learning_rate": 1.251223263443683e-05, + "loss": 0.7788, + "step": 21905 + }, + { + "epoch": 2.64, + "grad_norm": 0.3226982355117798, + "learning_rate": 1.2479337242326714e-05, + "loss": 0.7536, + "step": 21910 + }, + { + "epoch": 2.64, + "grad_norm": 0.29417189955711365, + "learning_rate": 1.2438276252961555e-05, + "loss": 0.7601, + "step": 21915 + }, + { + "epoch": 2.64, + "grad_norm": 0.26856791973114014, + "learning_rate": 1.2397280004711845e-05, + "loss": 0.719, + "step": 21920 + }, + { + "epoch": 2.64, + "grad_norm": 0.2941911518573761, + "learning_rate": 1.2356348516871839e-05, + "loss": 0.8129, + "step": 21925 + }, + { + "epoch": 2.64, + "grad_norm": 0.3045237064361572, + "learning_rate": 1.2315481808705224e-05, + "loss": 0.8177, + "step": 21930 + }, + { + "epoch": 2.64, + "grad_norm": 0.26458317041397095, + "learning_rate": 1.2274679899445234e-05, + "loss": 0.8406, + "step": 21935 + }, + { + "epoch": 2.64, + "grad_norm": 0.3065710663795471, + "learning_rate": 1.2233942808294573e-05, + "loss": 0.7369, + "step": 21940 + }, + { + "epoch": 2.64, + "grad_norm": 0.28398415446281433, + "learning_rate": 1.2193270554425521e-05, + "loss": 0.8903, + "step": 21945 + }, + { + "epoch": 2.64, + "grad_norm": 0.29608169198036194, + "learning_rate": 1.215266315697977e-05, + "loss": 0.7931, + "step": 21950 + }, + { + "epoch": 2.65, + "grad_norm": 0.2909848093986511, + "learning_rate": 1.2112120635068495e-05, + "loss": 0.8049, + "step": 21955 + }, + { + "epoch": 2.65, + "grad_norm": 0.28937968611717224, + "learning_rate": 1.2071643007772353e-05, + "loss": 0.7373, + "step": 21960 + }, + { + "epoch": 2.65, + "grad_norm": 0.2677673399448395, + "learning_rate": 1.2031230294141486e-05, + "loss": 0.7057, + "step": 21965 + }, + { + "epoch": 2.65, + "grad_norm": 0.27190762758255005, + "learning_rate": 1.199088251319545e-05, + "loss": 0.7522, + "step": 21970 + }, + { + "epoch": 2.65, + "grad_norm": 0.2613365650177002, + "learning_rate": 1.1950599683923234e-05, + "loss": 0.8177, + "step": 21975 + }, + { + "epoch": 2.65, + "grad_norm": 0.27314963936805725, + "learning_rate": 1.1910381825283294e-05, + "loss": 0.7326, + "step": 21980 + }, + { + "epoch": 2.65, + "grad_norm": 0.23859882354736328, + "learning_rate": 1.1870228956203487e-05, + "loss": 0.7208, + "step": 21985 + }, + { + "epoch": 2.65, + "grad_norm": 0.2868862450122833, + "learning_rate": 1.183014109558108e-05, + "loss": 0.7957, + "step": 21990 + }, + { + "epoch": 2.65, + "grad_norm": 0.2559562027454376, + "learning_rate": 1.1790118262282715e-05, + "loss": 0.7123, + "step": 21995 + }, + { + "epoch": 2.65, + "grad_norm": 0.30129274725914, + "learning_rate": 1.1750160475144543e-05, + "loss": 0.7947, + "step": 22000 + }, + { + "epoch": 2.65, + "grad_norm": 0.26090654730796814, + "learning_rate": 1.171026775297197e-05, + "loss": 0.6892, + "step": 22005 + }, + { + "epoch": 2.65, + "grad_norm": 0.28930899500846863, + "learning_rate": 1.1670440114539858e-05, + "loss": 0.6845, + "step": 22010 + }, + { + "epoch": 2.65, + "grad_norm": 0.3468032777309418, + "learning_rate": 1.1630677578592401e-05, + "loss": 0.7648, + "step": 22015 + }, + { + "epoch": 2.65, + "grad_norm": 0.2959340810775757, + "learning_rate": 1.1590980163843194e-05, + "loss": 0.8748, + "step": 22020 + }, + { + "epoch": 2.65, + "grad_norm": 0.28836220502853394, + "learning_rate": 1.1551347888975126e-05, + "loss": 0.7935, + "step": 22025 + }, + { + "epoch": 2.65, + "grad_norm": 0.29400843381881714, + "learning_rate": 1.1511780772640494e-05, + "loss": 0.8172, + "step": 22030 + }, + { + "epoch": 2.65, + "grad_norm": 0.28731659054756165, + "learning_rate": 1.1472278833460886e-05, + "loss": 0.7621, + "step": 22035 + }, + { + "epoch": 2.66, + "grad_norm": 0.26251596212387085, + "learning_rate": 1.1432842090027227e-05, + "loss": 0.7288, + "step": 22040 + }, + { + "epoch": 2.66, + "grad_norm": 0.2831023037433624, + "learning_rate": 1.1393470560899742e-05, + "loss": 0.799, + "step": 22045 + }, + { + "epoch": 2.66, + "grad_norm": 0.28475290536880493, + "learning_rate": 1.135416426460799e-05, + "loss": 0.8506, + "step": 22050 + }, + { + "epoch": 2.66, + "grad_norm": 0.31778624653816223, + "learning_rate": 1.1314923219650807e-05, + "loss": 0.7712, + "step": 22055 + }, + { + "epoch": 2.66, + "grad_norm": 0.28575217723846436, + "learning_rate": 1.1275747444496353e-05, + "loss": 0.783, + "step": 22060 + }, + { + "epoch": 2.66, + "grad_norm": 0.3009990155696869, + "learning_rate": 1.1236636957582062e-05, + "loss": 0.7318, + "step": 22065 + }, + { + "epoch": 2.66, + "grad_norm": 0.27844470739364624, + "learning_rate": 1.1197591777314597e-05, + "loss": 0.8126, + "step": 22070 + }, + { + "epoch": 2.66, + "grad_norm": 0.2680925130844116, + "learning_rate": 1.1158611922069904e-05, + "loss": 0.7173, + "step": 22075 + }, + { + "epoch": 2.66, + "grad_norm": 0.2805362343788147, + "learning_rate": 1.1119697410193246e-05, + "loss": 0.8169, + "step": 22080 + }, + { + "epoch": 2.66, + "grad_norm": 0.2496698796749115, + "learning_rate": 1.1080848259999054e-05, + "loss": 0.7369, + "step": 22085 + }, + { + "epoch": 2.66, + "grad_norm": 0.2835550606250763, + "learning_rate": 1.1042064489771035e-05, + "loss": 0.7064, + "step": 22090 + }, + { + "epoch": 2.66, + "grad_norm": 0.29828065633773804, + "learning_rate": 1.100334611776209e-05, + "loss": 0.7352, + "step": 22095 + }, + { + "epoch": 2.66, + "grad_norm": 0.28538069128990173, + "learning_rate": 1.0964693162194427e-05, + "loss": 0.6496, + "step": 22100 + }, + { + "epoch": 2.66, + "grad_norm": 0.2641838490962982, + "learning_rate": 1.0926105641259392e-05, + "loss": 0.7463, + "step": 22105 + }, + { + "epoch": 2.66, + "grad_norm": 0.2766837775707245, + "learning_rate": 1.0887583573117526e-05, + "loss": 0.806, + "step": 22110 + }, + { + "epoch": 2.66, + "grad_norm": 0.28771060705184937, + "learning_rate": 1.0849126975898626e-05, + "loss": 0.9518, + "step": 22115 + }, + { + "epoch": 2.67, + "grad_norm": 0.2707420587539673, + "learning_rate": 1.0810735867701614e-05, + "loss": 0.7166, + "step": 22120 + }, + { + "epoch": 2.67, + "grad_norm": 0.2899439036846161, + "learning_rate": 1.077241026659464e-05, + "loss": 0.7041, + "step": 22125 + }, + { + "epoch": 2.67, + "grad_norm": 0.25429069995880127, + "learning_rate": 1.0734150190615005e-05, + "loss": 0.7543, + "step": 22130 + }, + { + "epoch": 2.67, + "grad_norm": 0.29701635241508484, + "learning_rate": 1.069595565776914e-05, + "loss": 0.7908, + "step": 22135 + }, + { + "epoch": 2.67, + "grad_norm": 0.27902600169181824, + "learning_rate": 1.06578266860327e-05, + "loss": 0.8859, + "step": 22140 + }, + { + "epoch": 2.67, + "grad_norm": 0.29597392678260803, + "learning_rate": 1.0619763293350447e-05, + "loss": 0.865, + "step": 22145 + }, + { + "epoch": 2.67, + "grad_norm": 0.24774394929409027, + "learning_rate": 1.0581765497636253e-05, + "loss": 0.767, + "step": 22150 + }, + { + "epoch": 2.67, + "grad_norm": 0.2774536907672882, + "learning_rate": 1.0543833316773127e-05, + "loss": 0.8292, + "step": 22155 + }, + { + "epoch": 2.67, + "grad_norm": 0.2544390857219696, + "learning_rate": 1.0505966768613273e-05, + "loss": 0.8584, + "step": 22160 + }, + { + "epoch": 2.67, + "grad_norm": 0.2902020215988159, + "learning_rate": 1.0468165870977901e-05, + "loss": 0.6859, + "step": 22165 + }, + { + "epoch": 2.67, + "grad_norm": 0.292767196893692, + "learning_rate": 1.0430430641657383e-05, + "loss": 0.8134, + "step": 22170 + }, + { + "epoch": 2.67, + "grad_norm": 0.2644735872745514, + "learning_rate": 1.0392761098411146e-05, + "loss": 0.6928, + "step": 22175 + }, + { + "epoch": 2.67, + "grad_norm": 0.26982855796813965, + "learning_rate": 1.0355157258967772e-05, + "loss": 0.7652, + "step": 22180 + }, + { + "epoch": 2.67, + "grad_norm": 0.2815554141998291, + "learning_rate": 1.0317619141024858e-05, + "loss": 0.8082, + "step": 22185 + }, + { + "epoch": 2.67, + "grad_norm": 0.25484582781791687, + "learning_rate": 1.028014676224907e-05, + "loss": 0.7836, + "step": 22190 + }, + { + "epoch": 2.67, + "grad_norm": 0.26840999722480774, + "learning_rate": 1.0242740140276185e-05, + "loss": 0.7332, + "step": 22195 + }, + { + "epoch": 2.67, + "grad_norm": 0.27440059185028076, + "learning_rate": 1.0205399292710969e-05, + "loss": 0.7333, + "step": 22200 + }, + { + "epoch": 2.68, + "grad_norm": 0.28358691930770874, + "learning_rate": 1.0168124237127301e-05, + "loss": 0.7612, + "step": 22205 + }, + { + "epoch": 2.68, + "grad_norm": 0.2975500226020813, + "learning_rate": 1.0130914991068028e-05, + "loss": 0.7863, + "step": 22210 + }, + { + "epoch": 2.68, + "grad_norm": 0.3028092682361603, + "learning_rate": 1.0093771572045045e-05, + "loss": 0.6993, + "step": 22215 + }, + { + "epoch": 2.68, + "grad_norm": 0.28230762481689453, + "learning_rate": 1.0056693997539317e-05, + "loss": 0.7698, + "step": 22220 + }, + { + "epoch": 2.68, + "grad_norm": 0.2743557393550873, + "learning_rate": 1.0019682285000785e-05, + "loss": 0.7347, + "step": 22225 + }, + { + "epoch": 2.68, + "grad_norm": 0.25398826599121094, + "learning_rate": 9.982736451848367e-06, + "loss": 0.7027, + "step": 22230 + }, + { + "epoch": 2.68, + "grad_norm": 0.2597667872905731, + "learning_rate": 9.94585651546997e-06, + "loss": 0.7767, + "step": 22235 + }, + { + "epoch": 2.68, + "grad_norm": 0.2996768355369568, + "learning_rate": 9.90904249322259e-06, + "loss": 0.8047, + "step": 22240 + }, + { + "epoch": 2.68, + "grad_norm": 0.3190852105617523, + "learning_rate": 9.872294402432074e-06, + "loss": 0.7218, + "step": 22245 + }, + { + "epoch": 2.68, + "grad_norm": 0.27263718843460083, + "learning_rate": 9.835612260393316e-06, + "loss": 0.8415, + "step": 22250 + }, + { + "epoch": 2.68, + "grad_norm": 0.29362040758132935, + "learning_rate": 9.798996084370143e-06, + "loss": 0.8255, + "step": 22255 + }, + { + "epoch": 2.68, + "grad_norm": 0.30126357078552246, + "learning_rate": 9.76244589159535e-06, + "loss": 0.726, + "step": 22260 + }, + { + "epoch": 2.68, + "grad_norm": 0.28834068775177, + "learning_rate": 9.725961699270662e-06, + "loss": 0.8232, + "step": 22265 + }, + { + "epoch": 2.68, + "grad_norm": 0.2588998079299927, + "learning_rate": 9.689543524566746e-06, + "loss": 0.8129, + "step": 22270 + }, + { + "epoch": 2.68, + "grad_norm": 0.24092987179756165, + "learning_rate": 9.653191384623204e-06, + "loss": 0.7258, + "step": 22275 + }, + { + "epoch": 2.68, + "grad_norm": 0.2920580506324768, + "learning_rate": 9.616905296548588e-06, + "loss": 0.863, + "step": 22280 + }, + { + "epoch": 2.69, + "grad_norm": 0.2646152079105377, + "learning_rate": 9.58068527742032e-06, + "loss": 0.7227, + "step": 22285 + }, + { + "epoch": 2.69, + "grad_norm": 0.26156240701675415, + "learning_rate": 9.544531344284745e-06, + "loss": 0.7387, + "step": 22290 + }, + { + "epoch": 2.69, + "grad_norm": 0.259880930185318, + "learning_rate": 9.50844351415707e-06, + "loss": 0.8297, + "step": 22295 + }, + { + "epoch": 2.69, + "grad_norm": 0.2808079421520233, + "learning_rate": 9.47242180402148e-06, + "loss": 0.8567, + "step": 22300 + }, + { + "epoch": 2.69, + "grad_norm": 0.2962208688259125, + "learning_rate": 9.436466230830958e-06, + "loss": 0.6969, + "step": 22305 + }, + { + "epoch": 2.69, + "grad_norm": 0.30381715297698975, + "learning_rate": 9.40057681150742e-06, + "loss": 0.6078, + "step": 22310 + }, + { + "epoch": 2.69, + "grad_norm": 0.29196277260780334, + "learning_rate": 9.364753562941556e-06, + "loss": 0.7343, + "step": 22315 + }, + { + "epoch": 2.69, + "grad_norm": 0.2667737603187561, + "learning_rate": 9.328996501993047e-06, + "loss": 0.8018, + "step": 22320 + }, + { + "epoch": 2.69, + "grad_norm": 0.27238935232162476, + "learning_rate": 9.29330564549033e-06, + "loss": 0.7812, + "step": 22325 + }, + { + "epoch": 2.69, + "grad_norm": 0.28509876132011414, + "learning_rate": 9.257681010230683e-06, + "loss": 0.8695, + "step": 22330 + }, + { + "epoch": 2.69, + "grad_norm": 0.2799687683582306, + "learning_rate": 9.222122612980281e-06, + "loss": 0.8006, + "step": 22335 + }, + { + "epoch": 2.69, + "grad_norm": 0.2903033494949341, + "learning_rate": 9.186630470474054e-06, + "loss": 0.8644, + "step": 22340 + }, + { + "epoch": 2.69, + "grad_norm": 0.2779753506183624, + "learning_rate": 9.15120459941579e-06, + "loss": 0.7324, + "step": 22345 + }, + { + "epoch": 2.69, + "grad_norm": 0.29276126623153687, + "learning_rate": 9.1158450164781e-06, + "loss": 0.7755, + "step": 22350 + }, + { + "epoch": 2.69, + "grad_norm": 0.2905157804489136, + "learning_rate": 9.080551738302328e-06, + "loss": 0.7065, + "step": 22355 + }, + { + "epoch": 2.69, + "grad_norm": 0.2716805636882782, + "learning_rate": 9.04532478149873e-06, + "loss": 0.8302, + "step": 22360 + }, + { + "epoch": 2.69, + "grad_norm": 0.2544827461242676, + "learning_rate": 9.010164162646249e-06, + "loss": 0.8856, + "step": 22365 + }, + { + "epoch": 2.7, + "grad_norm": 0.30418241024017334, + "learning_rate": 8.975069898292647e-06, + "loss": 0.8106, + "step": 22370 + }, + { + "epoch": 2.7, + "grad_norm": 0.2704138457775116, + "learning_rate": 8.940042004954412e-06, + "loss": 0.8389, + "step": 22375 + }, + { + "epoch": 2.7, + "grad_norm": 0.28807222843170166, + "learning_rate": 8.90508049911689e-06, + "loss": 0.7696, + "step": 22380 + }, + { + "epoch": 2.7, + "grad_norm": 0.28206029534339905, + "learning_rate": 8.870185397234086e-06, + "loss": 0.7857, + "step": 22385 + }, + { + "epoch": 2.7, + "grad_norm": 0.26948267221450806, + "learning_rate": 8.83535671572883e-06, + "loss": 0.8041, + "step": 22390 + }, + { + "epoch": 2.7, + "grad_norm": 0.2844690680503845, + "learning_rate": 8.800594470992611e-06, + "loss": 0.8143, + "step": 22395 + }, + { + "epoch": 2.7, + "grad_norm": 0.28157171607017517, + "learning_rate": 8.765898679385742e-06, + "loss": 0.7778, + "step": 22400 + }, + { + "epoch": 2.7, + "grad_norm": 0.28640446066856384, + "learning_rate": 8.731269357237192e-06, + "loss": 0.8495, + "step": 22405 + }, + { + "epoch": 2.7, + "grad_norm": 0.28528541326522827, + "learning_rate": 8.696706520844693e-06, + "loss": 0.8586, + "step": 22410 + }, + { + "epoch": 2.7, + "grad_norm": 0.2665455937385559, + "learning_rate": 8.6622101864746e-06, + "loss": 0.759, + "step": 22415 + }, + { + "epoch": 2.7, + "grad_norm": 0.2542749047279358, + "learning_rate": 8.627780370362108e-06, + "loss": 0.7185, + "step": 22420 + }, + { + "epoch": 2.7, + "grad_norm": 0.2605261504650116, + "learning_rate": 8.593417088710992e-06, + "loss": 0.8301, + "step": 22425 + }, + { + "epoch": 2.7, + "grad_norm": 0.27719512581825256, + "learning_rate": 8.55912035769376e-06, + "loss": 0.8109, + "step": 22430 + }, + { + "epoch": 2.7, + "grad_norm": 0.27308791875839233, + "learning_rate": 8.524890193451573e-06, + "loss": 0.722, + "step": 22435 + }, + { + "epoch": 2.7, + "grad_norm": 0.26642659306526184, + "learning_rate": 8.490726612094323e-06, + "loss": 0.7022, + "step": 22440 + }, + { + "epoch": 2.7, + "grad_norm": 0.2815400958061218, + "learning_rate": 8.456629629700518e-06, + "loss": 0.887, + "step": 22445 + }, + { + "epoch": 2.7, + "grad_norm": 0.27426496148109436, + "learning_rate": 8.422599262317303e-06, + "loss": 0.7021, + "step": 22450 + }, + { + "epoch": 2.71, + "grad_norm": 0.27988290786743164, + "learning_rate": 8.388635525960503e-06, + "loss": 0.7661, + "step": 22455 + }, + { + "epoch": 2.71, + "grad_norm": 0.31152406334877014, + "learning_rate": 8.35473843661461e-06, + "loss": 0.7652, + "step": 22460 + }, + { + "epoch": 2.71, + "grad_norm": 0.2869008481502533, + "learning_rate": 8.320908010232702e-06, + "loss": 0.7633, + "step": 22465 + }, + { + "epoch": 2.71, + "grad_norm": 0.28940802812576294, + "learning_rate": 8.287144262736506e-06, + "loss": 0.749, + "step": 22470 + }, + { + "epoch": 2.71, + "grad_norm": 0.27460524439811707, + "learning_rate": 8.253447210016363e-06, + "loss": 0.7667, + "step": 22475 + }, + { + "epoch": 2.71, + "grad_norm": 0.2848700284957886, + "learning_rate": 8.219816867931218e-06, + "loss": 0.7629, + "step": 22480 + }, + { + "epoch": 2.71, + "grad_norm": 0.2719096541404724, + "learning_rate": 8.18625325230861e-06, + "loss": 0.7007, + "step": 22485 + }, + { + "epoch": 2.71, + "grad_norm": 0.2773694396018982, + "learning_rate": 8.152756378944708e-06, + "loss": 0.7708, + "step": 22490 + }, + { + "epoch": 2.71, + "grad_norm": 0.2669026851654053, + "learning_rate": 8.119326263604281e-06, + "loss": 0.8003, + "step": 22495 + }, + { + "epoch": 2.71, + "grad_norm": 0.27180686593055725, + "learning_rate": 8.085962922020611e-06, + "loss": 0.8165, + "step": 22500 + }, + { + "epoch": 2.71, + "grad_norm": 0.2707700729370117, + "learning_rate": 8.052666369895622e-06, + "loss": 0.7817, + "step": 22505 + }, + { + "epoch": 2.71, + "grad_norm": 0.2820728123188019, + "learning_rate": 8.019436622899727e-06, + "loss": 0.7616, + "step": 22510 + }, + { + "epoch": 2.71, + "grad_norm": 0.27186763286590576, + "learning_rate": 7.986273696672019e-06, + "loss": 0.7928, + "step": 22515 + }, + { + "epoch": 2.71, + "grad_norm": 0.29205748438835144, + "learning_rate": 7.953177606820044e-06, + "loss": 0.8722, + "step": 22520 + }, + { + "epoch": 2.71, + "grad_norm": 0.2753778398036957, + "learning_rate": 7.92014836891991e-06, + "loss": 0.6978, + "step": 22525 + }, + { + "epoch": 2.71, + "grad_norm": 0.2816515564918518, + "learning_rate": 7.887185998516266e-06, + "loss": 0.7766, + "step": 22530 + }, + { + "epoch": 2.72, + "grad_norm": 0.2690742313861847, + "learning_rate": 7.85429051112232e-06, + "loss": 0.7389, + "step": 22535 + }, + { + "epoch": 2.72, + "grad_norm": 0.26442471146583557, + "learning_rate": 7.821461922219769e-06, + "loss": 0.8625, + "step": 22540 + }, + { + "epoch": 2.72, + "grad_norm": 0.2712923288345337, + "learning_rate": 7.788700247258855e-06, + "loss": 0.734, + "step": 22545 + }, + { + "epoch": 2.72, + "grad_norm": 0.2940865159034729, + "learning_rate": 7.756005501658297e-06, + "loss": 0.7485, + "step": 22550 + }, + { + "epoch": 2.72, + "grad_norm": 0.25925499200820923, + "learning_rate": 7.723377700805316e-06, + "loss": 0.7913, + "step": 22555 + }, + { + "epoch": 2.72, + "grad_norm": 0.23880963027477264, + "learning_rate": 7.690816860055648e-06, + "loss": 0.7763, + "step": 22560 + }, + { + "epoch": 2.72, + "grad_norm": 0.2328793704509735, + "learning_rate": 7.658322994733517e-06, + "loss": 0.7291, + "step": 22565 + }, + { + "epoch": 2.72, + "grad_norm": 0.23459170758724213, + "learning_rate": 7.625896120131575e-06, + "loss": 0.8022, + "step": 22570 + }, + { + "epoch": 2.72, + "grad_norm": 0.26686781644821167, + "learning_rate": 7.593536251511045e-06, + "loss": 0.751, + "step": 22575 + }, + { + "epoch": 2.72, + "grad_norm": 0.2697281837463379, + "learning_rate": 7.5612434041015305e-06, + "loss": 0.737, + "step": 22580 + }, + { + "epoch": 2.72, + "grad_norm": 0.30781927704811096, + "learning_rate": 7.529017593101105e-06, + "loss": 0.8416, + "step": 22585 + }, + { + "epoch": 2.72, + "grad_norm": 0.2810058891773224, + "learning_rate": 7.496858833676306e-06, + "loss": 0.7729, + "step": 22590 + }, + { + "epoch": 2.72, + "grad_norm": 0.24234548211097717, + "learning_rate": 7.464767140962124e-06, + "loss": 0.7394, + "step": 22595 + }, + { + "epoch": 2.72, + "grad_norm": 0.263785719871521, + "learning_rate": 7.432742530061997e-06, + "loss": 0.8007, + "step": 22600 + }, + { + "epoch": 2.72, + "grad_norm": 0.27168524265289307, + "learning_rate": 7.400785016047733e-06, + "loss": 0.8045, + "step": 22605 + }, + { + "epoch": 2.72, + "grad_norm": 0.2800976634025574, + "learning_rate": 7.3688946139596025e-06, + "loss": 0.8114, + "step": 22610 + }, + { + "epoch": 2.72, + "grad_norm": 0.29521191120147705, + "learning_rate": 7.3370713388063165e-06, + "loss": 0.7672, + "step": 22615 + }, + { + "epoch": 2.73, + "grad_norm": 0.27929726243019104, + "learning_rate": 7.30531520556496e-06, + "loss": 0.7258, + "step": 22620 + }, + { + "epoch": 2.73, + "grad_norm": 0.25764504075050354, + "learning_rate": 7.273626229180995e-06, + "loss": 0.8177, + "step": 22625 + }, + { + "epoch": 2.73, + "grad_norm": 0.26366397738456726, + "learning_rate": 7.242004424568309e-06, + "loss": 0.7124, + "step": 22630 + }, + { + "epoch": 2.73, + "grad_norm": 0.30640193819999695, + "learning_rate": 7.210449806609197e-06, + "loss": 0.8062, + "step": 22635 + }, + { + "epoch": 2.73, + "grad_norm": 0.2728465795516968, + "learning_rate": 7.178962390154314e-06, + "loss": 0.8824, + "step": 22640 + }, + { + "epoch": 2.73, + "grad_norm": 0.3235665559768677, + "learning_rate": 7.1475421900226705e-06, + "loss": 0.6664, + "step": 22645 + }, + { + "epoch": 2.73, + "grad_norm": 0.3118913471698761, + "learning_rate": 7.116189221001622e-06, + "loss": 0.7975, + "step": 22650 + }, + { + "epoch": 2.73, + "grad_norm": 0.2510865032672882, + "learning_rate": 7.084903497846983e-06, + "loss": 0.7969, + "step": 22655 + }, + { + "epoch": 2.73, + "grad_norm": 0.28625842928886414, + "learning_rate": 7.053685035282808e-06, + "loss": 0.7862, + "step": 22660 + }, + { + "epoch": 2.73, + "grad_norm": 0.30692657828330994, + "learning_rate": 7.02253384800156e-06, + "loss": 0.6927, + "step": 22665 + }, + { + "epoch": 2.73, + "grad_norm": 0.30167925357818604, + "learning_rate": 6.9914499506640135e-06, + "loss": 0.7441, + "step": 22670 + }, + { + "epoch": 2.73, + "grad_norm": 0.30318981409072876, + "learning_rate": 6.960433357899281e-06, + "loss": 0.8208, + "step": 22675 + }, + { + "epoch": 2.73, + "grad_norm": 0.268376886844635, + "learning_rate": 6.929484084304837e-06, + "loss": 0.732, + "step": 22680 + }, + { + "epoch": 2.73, + "grad_norm": 0.26787877082824707, + "learning_rate": 6.89860214444638e-06, + "loss": 0.7454, + "step": 22685 + }, + { + "epoch": 2.73, + "grad_norm": 0.29671844840049744, + "learning_rate": 6.8677875528580176e-06, + "loss": 0.767, + "step": 22690 + }, + { + "epoch": 2.73, + "grad_norm": 0.27219846844673157, + "learning_rate": 6.8370403240421146e-06, + "loss": 0.7894, + "step": 22695 + }, + { + "epoch": 2.74, + "grad_norm": 0.3147854804992676, + "learning_rate": 6.806360472469313e-06, + "loss": 0.8088, + "step": 22700 + }, + { + "epoch": 2.74, + "grad_norm": 0.27097561955451965, + "learning_rate": 6.775748012578597e-06, + "loss": 0.756, + "step": 22705 + }, + { + "epoch": 2.74, + "grad_norm": 0.25546738505363464, + "learning_rate": 6.745202958777174e-06, + "loss": 0.8471, + "step": 22710 + }, + { + "epoch": 2.74, + "grad_norm": 0.27763062715530396, + "learning_rate": 6.714725325440595e-06, + "loss": 0.7465, + "step": 22715 + }, + { + "epoch": 2.74, + "grad_norm": 0.265898197889328, + "learning_rate": 6.684315126912654e-06, + "loss": 0.7873, + "step": 22720 + }, + { + "epoch": 2.74, + "grad_norm": 0.292896032333374, + "learning_rate": 6.653972377505368e-06, + "loss": 0.7363, + "step": 22725 + }, + { + "epoch": 2.74, + "grad_norm": 0.2958504557609558, + "learning_rate": 6.623697091499031e-06, + "loss": 0.7821, + "step": 22730 + }, + { + "epoch": 2.74, + "grad_norm": 0.28079283237457275, + "learning_rate": 6.5934892831422616e-06, + "loss": 0.7395, + "step": 22735 + }, + { + "epoch": 2.74, + "grad_norm": 0.26290813088417053, + "learning_rate": 6.563348966651805e-06, + "loss": 0.7934, + "step": 22740 + }, + { + "epoch": 2.74, + "grad_norm": 0.3071194291114807, + "learning_rate": 6.533276156212697e-06, + "loss": 0.8421, + "step": 22745 + }, + { + "epoch": 2.74, + "grad_norm": 0.2782292366027832, + "learning_rate": 6.503270865978216e-06, + "loss": 0.749, + "step": 22750 + }, + { + "epoch": 2.74, + "grad_norm": 0.2784491181373596, + "learning_rate": 6.47333311006985e-06, + "loss": 0.7861, + "step": 22755 + }, + { + "epoch": 2.74, + "grad_norm": 0.2715974450111389, + "learning_rate": 6.443462902577296e-06, + "loss": 0.8529, + "step": 22760 + }, + { + "epoch": 2.74, + "grad_norm": 0.2865915894508362, + "learning_rate": 6.413660257558473e-06, + "loss": 0.8501, + "step": 22765 + }, + { + "epoch": 2.74, + "grad_norm": 0.30358344316482544, + "learning_rate": 6.383925189039479e-06, + "loss": 0.7397, + "step": 22770 + }, + { + "epoch": 2.74, + "grad_norm": 0.2950074076652527, + "learning_rate": 6.3542577110146375e-06, + "loss": 0.781, + "step": 22775 + }, + { + "epoch": 2.74, + "grad_norm": 0.2862589955329895, + "learning_rate": 6.324657837446445e-06, + "loss": 0.7687, + "step": 22780 + }, + { + "epoch": 2.75, + "grad_norm": 0.2698042392730713, + "learning_rate": 6.295125582265575e-06, + "loss": 0.8329, + "step": 22785 + }, + { + "epoch": 2.75, + "grad_norm": 0.2783750295639038, + "learning_rate": 6.265660959370894e-06, + "loss": 0.7668, + "step": 22790 + }, + { + "epoch": 2.75, + "grad_norm": 0.2662694752216339, + "learning_rate": 6.236263982629441e-06, + "loss": 0.7561, + "step": 22795 + }, + { + "epoch": 2.75, + "grad_norm": 0.35889652371406555, + "learning_rate": 6.206934665876417e-06, + "loss": 0.8031, + "step": 22800 + }, + { + "epoch": 2.75, + "grad_norm": 0.2936451733112335, + "learning_rate": 6.177673022915147e-06, + "loss": 0.7609, + "step": 22805 + }, + { + "epoch": 2.75, + "grad_norm": 0.299941748380661, + "learning_rate": 6.1484790675171315e-06, + "loss": 0.6878, + "step": 22810 + }, + { + "epoch": 2.75, + "grad_norm": 0.30326518416404724, + "learning_rate": 6.119352813422046e-06, + "loss": 0.8334, + "step": 22815 + }, + { + "epoch": 2.75, + "grad_norm": 0.2659267783164978, + "learning_rate": 6.09029427433766e-06, + "loss": 0.7561, + "step": 22820 + }, + { + "epoch": 2.75, + "grad_norm": 0.2844538986682892, + "learning_rate": 6.061303463939882e-06, + "loss": 0.8201, + "step": 22825 + }, + { + "epoch": 2.75, + "grad_norm": 0.274046391248703, + "learning_rate": 6.032380395872732e-06, + "loss": 0.7778, + "step": 22830 + }, + { + "epoch": 2.75, + "grad_norm": 0.25698041915893555, + "learning_rate": 6.003525083748406e-06, + "loss": 0.7376, + "step": 22835 + }, + { + "epoch": 2.75, + "grad_norm": 0.2826564908027649, + "learning_rate": 5.97473754114714e-06, + "loss": 0.8031, + "step": 22840 + }, + { + "epoch": 2.75, + "grad_norm": 0.2923043370246887, + "learning_rate": 5.946017781617329e-06, + "loss": 0.7715, + "step": 22845 + }, + { + "epoch": 2.75, + "grad_norm": 0.2728569507598877, + "learning_rate": 5.917365818675396e-06, + "loss": 0.8772, + "step": 22850 + }, + { + "epoch": 2.75, + "grad_norm": 0.2695605754852295, + "learning_rate": 5.888781665805986e-06, + "loss": 0.9072, + "step": 22855 + }, + { + "epoch": 2.75, + "grad_norm": 0.25165680050849915, + "learning_rate": 5.860265336461689e-06, + "loss": 0.6898, + "step": 22860 + }, + { + "epoch": 2.75, + "grad_norm": 0.2680535614490509, + "learning_rate": 5.8318168440632695e-06, + "loss": 0.8437, + "step": 22865 + }, + { + "epoch": 2.76, + "grad_norm": 0.28424060344696045, + "learning_rate": 5.8034362019995e-06, + "loss": 0.7131, + "step": 22870 + }, + { + "epoch": 2.76, + "grad_norm": 0.25301888585090637, + "learning_rate": 5.775123423627298e-06, + "loss": 0.7976, + "step": 22875 + }, + { + "epoch": 2.76, + "grad_norm": 0.27347350120544434, + "learning_rate": 5.746878522271553e-06, + "loss": 0.711, + "step": 22880 + }, + { + "epoch": 2.76, + "grad_norm": 0.26733747124671936, + "learning_rate": 5.718701511225299e-06, + "loss": 0.6827, + "step": 22885 + }, + { + "epoch": 2.76, + "grad_norm": 0.2814013361930847, + "learning_rate": 5.690592403749511e-06, + "loss": 0.7927, + "step": 22890 + }, + { + "epoch": 2.76, + "grad_norm": 0.264848530292511, + "learning_rate": 5.662551213073324e-06, + "loss": 0.9032, + "step": 22895 + }, + { + "epoch": 2.76, + "grad_norm": 0.28370270133018494, + "learning_rate": 5.634577952393848e-06, + "loss": 0.7843, + "step": 22900 + }, + { + "epoch": 2.76, + "grad_norm": 0.2731534242630005, + "learning_rate": 5.606672634876203e-06, + "loss": 0.8433, + "step": 22905 + }, + { + "epoch": 2.76, + "grad_norm": 0.24351301789283752, + "learning_rate": 5.578835273653581e-06, + "loss": 0.7116, + "step": 22910 + }, + { + "epoch": 2.76, + "grad_norm": 0.29447489976882935, + "learning_rate": 5.551065881827138e-06, + "loss": 0.7504, + "step": 22915 + }, + { + "epoch": 2.76, + "grad_norm": 0.2870446443557739, + "learning_rate": 5.523364472466118e-06, + "loss": 0.7319, + "step": 22920 + }, + { + "epoch": 2.76, + "grad_norm": 0.2608989179134369, + "learning_rate": 5.495731058607677e-06, + "loss": 0.8248, + "step": 22925 + }, + { + "epoch": 2.76, + "grad_norm": 0.2600563168525696, + "learning_rate": 5.468165653257028e-06, + "loss": 0.8477, + "step": 22930 + }, + { + "epoch": 2.76, + "grad_norm": 0.2777158319950104, + "learning_rate": 5.440668269387394e-06, + "loss": 0.7868, + "step": 22935 + }, + { + "epoch": 2.76, + "grad_norm": 0.2984352707862854, + "learning_rate": 5.4132389199399384e-06, + "loss": 0.7796, + "step": 22940 + }, + { + "epoch": 2.76, + "grad_norm": 0.3199765682220459, + "learning_rate": 5.385877617823819e-06, + "loss": 0.8571, + "step": 22945 + }, + { + "epoch": 2.77, + "grad_norm": 0.2830100953578949, + "learning_rate": 5.35858437591617e-06, + "loss": 0.775, + "step": 22950 + }, + { + "epoch": 2.77, + "grad_norm": 0.23226149380207062, + "learning_rate": 5.33135920706213e-06, + "loss": 0.706, + "step": 22955 + }, + { + "epoch": 2.77, + "grad_norm": 0.28286078572273254, + "learning_rate": 5.304202124074736e-06, + "loss": 0.7191, + "step": 22960 + }, + { + "epoch": 2.77, + "grad_norm": 0.3073217570781708, + "learning_rate": 5.277113139735012e-06, + "loss": 0.7106, + "step": 22965 + }, + { + "epoch": 2.77, + "grad_norm": 0.2731364667415619, + "learning_rate": 5.250092266791944e-06, + "loss": 0.7427, + "step": 22970 + }, + { + "epoch": 2.77, + "grad_norm": 0.2613232135772705, + "learning_rate": 5.22313951796246e-06, + "loss": 0.7188, + "step": 22975 + }, + { + "epoch": 2.77, + "grad_norm": 0.25765460729599, + "learning_rate": 5.196254905931413e-06, + "loss": 0.9263, + "step": 22980 + }, + { + "epoch": 2.77, + "grad_norm": 0.29852619767189026, + "learning_rate": 5.169438443351581e-06, + "loss": 0.6889, + "step": 22985 + }, + { + "epoch": 2.77, + "grad_norm": 0.2636108100414276, + "learning_rate": 5.142690142843703e-06, + "loss": 0.6934, + "step": 22990 + }, + { + "epoch": 2.77, + "grad_norm": 0.2629612684249878, + "learning_rate": 5.116010016996392e-06, + "loss": 0.8041, + "step": 22995 + }, + { + "epoch": 2.77, + "grad_norm": 0.2794385552406311, + "learning_rate": 5.089398078366219e-06, + "loss": 0.8116, + "step": 23000 + }, + { + "epoch": 2.77, + "grad_norm": 0.2664497494697571, + "learning_rate": 5.062854339477634e-06, + "loss": 0.8767, + "step": 23005 + }, + { + "epoch": 2.77, + "grad_norm": 0.30518075823783875, + "learning_rate": 5.036378812823028e-06, + "loss": 0.7819, + "step": 23010 + }, + { + "epoch": 2.77, + "grad_norm": 0.2723018229007721, + "learning_rate": 5.0099715108626485e-06, + "loss": 0.82, + "step": 23015 + }, + { + "epoch": 2.77, + "grad_norm": 0.27372339367866516, + "learning_rate": 4.983632446024638e-06, + "loss": 0.8301, + "step": 23020 + }, + { + "epoch": 2.77, + "grad_norm": 0.29022863507270813, + "learning_rate": 4.957361630705031e-06, + "loss": 0.7167, + "step": 23025 + }, + { + "epoch": 2.77, + "grad_norm": 0.2727348804473877, + "learning_rate": 4.931159077267771e-06, + "loss": 0.8002, + "step": 23030 + }, + { + "epoch": 2.78, + "grad_norm": 0.2919345200061798, + "learning_rate": 4.905024798044627e-06, + "loss": 0.7175, + "step": 23035 + }, + { + "epoch": 2.78, + "grad_norm": 0.25965866446495056, + "learning_rate": 4.878958805335276e-06, + "loss": 0.7755, + "step": 23040 + }, + { + "epoch": 2.78, + "grad_norm": 0.4287550151348114, + "learning_rate": 4.852961111407239e-06, + "loss": 0.787, + "step": 23045 + }, + { + "epoch": 2.78, + "grad_norm": 0.30621278285980225, + "learning_rate": 4.827031728495878e-06, + "loss": 0.8296, + "step": 23050 + }, + { + "epoch": 2.78, + "grad_norm": 0.29937830567359924, + "learning_rate": 4.801170668804433e-06, + "loss": 0.7147, + "step": 23055 + }, + { + "epoch": 2.78, + "grad_norm": 0.2686162292957306, + "learning_rate": 4.775377944503983e-06, + "loss": 0.8084, + "step": 23060 + }, + { + "epoch": 2.78, + "grad_norm": 0.2906171381473541, + "learning_rate": 4.749653567733402e-06, + "loss": 0.7937, + "step": 23065 + }, + { + "epoch": 2.78, + "grad_norm": 0.3064448833465576, + "learning_rate": 4.72399755059949e-06, + "loss": 0.758, + "step": 23070 + }, + { + "epoch": 2.78, + "grad_norm": 0.3103048801422119, + "learning_rate": 4.698409905176803e-06, + "loss": 0.7798, + "step": 23075 + }, + { + "epoch": 2.78, + "grad_norm": 0.3002449870109558, + "learning_rate": 4.672890643507727e-06, + "loss": 0.8077, + "step": 23080 + }, + { + "epoch": 2.78, + "grad_norm": 0.26185083389282227, + "learning_rate": 4.647439777602469e-06, + "loss": 0.6943, + "step": 23085 + }, + { + "epoch": 2.78, + "grad_norm": 0.2598731517791748, + "learning_rate": 4.6220573194390655e-06, + "loss": 0.7912, + "step": 23090 + }, + { + "epoch": 2.78, + "grad_norm": 0.283272385597229, + "learning_rate": 4.596743280963344e-06, + "loss": 0.7841, + "step": 23095 + }, + { + "epoch": 2.78, + "grad_norm": 0.29078394174575806, + "learning_rate": 4.571497674088925e-06, + "loss": 0.8184, + "step": 23100 + }, + { + "epoch": 2.78, + "grad_norm": 0.26592543721199036, + "learning_rate": 4.546320510697221e-06, + "loss": 0.7237, + "step": 23105 + }, + { + "epoch": 2.78, + "grad_norm": 0.2742617130279541, + "learning_rate": 4.52121180263747e-06, + "loss": 0.7937, + "step": 23110 + }, + { + "epoch": 2.79, + "grad_norm": 0.2708317041397095, + "learning_rate": 4.496171561726636e-06, + "loss": 0.8146, + "step": 23115 + }, + { + "epoch": 2.79, + "grad_norm": 0.2601563036441803, + "learning_rate": 4.471199799749508e-06, + "loss": 0.719, + "step": 23120 + }, + { + "epoch": 2.79, + "grad_norm": 0.2519415318965912, + "learning_rate": 4.446296528458604e-06, + "loss": 0.8397, + "step": 23125 + }, + { + "epoch": 2.79, + "grad_norm": 0.2795381247997284, + "learning_rate": 4.421461759574247e-06, + "loss": 0.708, + "step": 23130 + }, + { + "epoch": 2.79, + "grad_norm": 0.2789604067802429, + "learning_rate": 4.396695504784503e-06, + "loss": 0.7077, + "step": 23135 + }, + { + "epoch": 2.79, + "grad_norm": 0.30929040908813477, + "learning_rate": 4.371997775745184e-06, + "loss": 0.6999, + "step": 23140 + }, + { + "epoch": 2.79, + "grad_norm": 0.2771584689617157, + "learning_rate": 4.347368584079858e-06, + "loss": 0.8967, + "step": 23145 + }, + { + "epoch": 2.79, + "grad_norm": 0.290021687746048, + "learning_rate": 4.322807941379869e-06, + "loss": 0.7981, + "step": 23150 + }, + { + "epoch": 2.79, + "grad_norm": 0.2715446352958679, + "learning_rate": 4.298315859204254e-06, + "loss": 0.7353, + "step": 23155 + }, + { + "epoch": 2.79, + "grad_norm": 0.27552729845046997, + "learning_rate": 4.273892349079794e-06, + "loss": 0.8716, + "step": 23160 + }, + { + "epoch": 2.79, + "grad_norm": 0.2995644509792328, + "learning_rate": 4.249537422500992e-06, + "loss": 0.7564, + "step": 23165 + }, + { + "epoch": 2.79, + "grad_norm": 0.3228721022605896, + "learning_rate": 4.225251090930132e-06, + "loss": 0.7422, + "step": 23170 + }, + { + "epoch": 2.79, + "grad_norm": 0.2784271538257599, + "learning_rate": 4.201033365797119e-06, + "loss": 0.7439, + "step": 23175 + }, + { + "epoch": 2.79, + "grad_norm": 0.2869255840778351, + "learning_rate": 4.176884258499652e-06, + "loss": 0.8764, + "step": 23180 + }, + { + "epoch": 2.79, + "grad_norm": 0.2846572697162628, + "learning_rate": 4.152803780403058e-06, + "loss": 0.8168, + "step": 23185 + }, + { + "epoch": 2.79, + "grad_norm": 0.2854968011379242, + "learning_rate": 4.12879194284047e-06, + "loss": 0.84, + "step": 23190 + }, + { + "epoch": 2.79, + "grad_norm": 0.2596077620983124, + "learning_rate": 4.104848757112616e-06, + "loss": 0.7741, + "step": 23195 + }, + { + "epoch": 2.8, + "grad_norm": 0.2705884575843811, + "learning_rate": 4.080974234487966e-06, + "loss": 0.6468, + "step": 23200 + }, + { + "epoch": 2.8, + "grad_norm": 0.2820831835269928, + "learning_rate": 4.057168386202681e-06, + "loss": 0.7343, + "step": 23205 + }, + { + "epoch": 2.8, + "grad_norm": 0.2777920663356781, + "learning_rate": 4.033431223460548e-06, + "loss": 0.825, + "step": 23210 + }, + { + "epoch": 2.8, + "grad_norm": 0.25113433599472046, + "learning_rate": 4.0097627574330825e-06, + "loss": 0.8486, + "step": 23215 + }, + { + "epoch": 2.8, + "grad_norm": 0.26045823097229004, + "learning_rate": 3.9861629992594405e-06, + "loss": 0.8892, + "step": 23220 + }, + { + "epoch": 2.8, + "grad_norm": 0.2702331840991974, + "learning_rate": 3.962631960046453e-06, + "loss": 0.8179, + "step": 23225 + }, + { + "epoch": 2.8, + "grad_norm": 0.2872564494609833, + "learning_rate": 3.939169650868645e-06, + "loss": 0.7651, + "step": 23230 + }, + { + "epoch": 2.8, + "grad_norm": 0.28898364305496216, + "learning_rate": 3.915776082768118e-06, + "loss": 0.7968, + "step": 23235 + }, + { + "epoch": 2.8, + "grad_norm": 0.2918056845664978, + "learning_rate": 3.8924512667546645e-06, + "loss": 0.7513, + "step": 23240 + }, + { + "epoch": 2.8, + "grad_norm": 0.28438860177993774, + "learning_rate": 3.86919521380572e-06, + "loss": 0.8384, + "step": 23245 + }, + { + "epoch": 2.8, + "grad_norm": 0.29474344849586487, + "learning_rate": 3.8460079348663795e-06, + "loss": 0.8156, + "step": 23250 + }, + { + "epoch": 2.8, + "grad_norm": 0.28143933415412903, + "learning_rate": 3.82288944084933e-06, + "loss": 0.7344, + "step": 23255 + }, + { + "epoch": 2.8, + "grad_norm": 0.28403791785240173, + "learning_rate": 3.7998397426349024e-06, + "loss": 0.7604, + "step": 23260 + }, + { + "epoch": 2.8, + "grad_norm": 0.29978764057159424, + "learning_rate": 3.7768588510710353e-06, + "loss": 0.7076, + "step": 23265 + }, + { + "epoch": 2.8, + "grad_norm": 0.2770501375198364, + "learning_rate": 3.753946776973327e-06, + "loss": 0.8365, + "step": 23270 + }, + { + "epoch": 2.8, + "grad_norm": 0.27618712186813354, + "learning_rate": 3.7311035311249348e-06, + "loss": 0.7706, + "step": 23275 + }, + { + "epoch": 2.8, + "grad_norm": 0.2640831768512726, + "learning_rate": 3.7083291242766764e-06, + "loss": 0.7948, + "step": 23280 + }, + { + "epoch": 2.81, + "grad_norm": 0.2883791923522949, + "learning_rate": 3.685623567146895e-06, + "loss": 0.8043, + "step": 23285 + }, + { + "epoch": 2.81, + "grad_norm": 0.280841201543808, + "learning_rate": 3.6629868704216415e-06, + "loss": 0.9078, + "step": 23290 + }, + { + "epoch": 2.81, + "grad_norm": 0.25826311111450195, + "learning_rate": 3.640419044754461e-06, + "loss": 0.8755, + "step": 23295 + }, + { + "epoch": 2.81, + "grad_norm": 0.2830309271812439, + "learning_rate": 3.6179201007665413e-06, + "loss": 0.8145, + "step": 23300 + }, + { + "epoch": 2.81, + "grad_norm": 0.290726900100708, + "learning_rate": 3.5954900490465956e-06, + "loss": 0.8466, + "step": 23305 + }, + { + "epoch": 2.81, + "grad_norm": 0.28319695591926575, + "learning_rate": 3.5731289001509954e-06, + "loss": 0.7302, + "step": 23310 + }, + { + "epoch": 2.81, + "grad_norm": 0.27678796648979187, + "learning_rate": 3.5508366646036236e-06, + "loss": 0.8587, + "step": 23315 + }, + { + "epoch": 2.81, + "grad_norm": 0.26364395022392273, + "learning_rate": 3.528613352895937e-06, + "loss": 0.858, + "step": 23320 + }, + { + "epoch": 2.81, + "grad_norm": 0.2955693006515503, + "learning_rate": 3.506458975486953e-06, + "loss": 0.6728, + "step": 23325 + }, + { + "epoch": 2.81, + "grad_norm": 0.272214412689209, + "learning_rate": 3.484373542803298e-06, + "loss": 0.794, + "step": 23330 + }, + { + "epoch": 2.81, + "grad_norm": 0.2752835154533386, + "learning_rate": 3.4623570652390743e-06, + "loss": 0.6453, + "step": 23335 + }, + { + "epoch": 2.81, + "grad_norm": 0.28152868151664734, + "learning_rate": 3.440409553155993e-06, + "loss": 0.7392, + "step": 23340 + }, + { + "epoch": 2.81, + "grad_norm": 0.25148865580558777, + "learning_rate": 3.4185310168832748e-06, + "loss": 0.777, + "step": 23345 + }, + { + "epoch": 2.81, + "grad_norm": 0.2435861974954605, + "learning_rate": 3.3967214667176666e-06, + "loss": 0.7149, + "step": 23350 + }, + { + "epoch": 2.81, + "grad_norm": 0.33531370759010315, + "learning_rate": 3.3749809129234752e-06, + "loss": 0.7284, + "step": 23355 + }, + { + "epoch": 2.81, + "grad_norm": 0.30802983045578003, + "learning_rate": 3.353309365732548e-06, + "loss": 0.714, + "step": 23360 + }, + { + "epoch": 2.82, + "grad_norm": 0.2919432818889618, + "learning_rate": 3.3317068353441945e-06, + "loss": 0.8152, + "step": 23365 + }, + { + "epoch": 2.82, + "grad_norm": 0.2593708634376526, + "learning_rate": 3.310173331925331e-06, + "loss": 0.6821, + "step": 23370 + }, + { + "epoch": 2.82, + "grad_norm": 0.27010101079940796, + "learning_rate": 3.288708865610318e-06, + "loss": 0.753, + "step": 23375 + }, + { + "epoch": 2.82, + "grad_norm": 0.28530481457710266, + "learning_rate": 3.267313446501041e-06, + "loss": 0.7806, + "step": 23380 + }, + { + "epoch": 2.82, + "grad_norm": 0.27056190371513367, + "learning_rate": 3.245987084666879e-06, + "loss": 0.6671, + "step": 23385 + }, + { + "epoch": 2.82, + "grad_norm": 0.33442938327789307, + "learning_rate": 3.2247297901447534e-06, + "loss": 0.7138, + "step": 23390 + }, + { + "epoch": 2.82, + "grad_norm": 0.2769376039505005, + "learning_rate": 3.2035415729390613e-06, + "loss": 0.838, + "step": 23395 + }, + { + "epoch": 2.82, + "grad_norm": 0.29188603162765503, + "learning_rate": 3.1824224430216425e-06, + "loss": 0.7758, + "step": 23400 + }, + { + "epoch": 2.82, + "grad_norm": 0.27181747555732727, + "learning_rate": 3.161372410331897e-06, + "loss": 0.8393, + "step": 23405 + }, + { + "epoch": 2.82, + "grad_norm": 0.2749415338039398, + "learning_rate": 3.1403914847766497e-06, + "loss": 0.8015, + "step": 23410 + }, + { + "epoch": 2.82, + "grad_norm": 0.28852447867393494, + "learning_rate": 3.1194796762302353e-06, + "loss": 0.8061, + "step": 23415 + }, + { + "epoch": 2.82, + "grad_norm": 0.2778397798538208, + "learning_rate": 3.0986369945344312e-06, + "loss": 0.829, + "step": 23420 + }, + { + "epoch": 2.82, + "grad_norm": 0.3023031949996948, + "learning_rate": 3.0778634494984912e-06, + "loss": 0.7389, + "step": 23425 + }, + { + "epoch": 2.82, + "grad_norm": 0.3010731339454651, + "learning_rate": 3.0571590508991607e-06, + "loss": 0.8619, + "step": 23430 + }, + { + "epoch": 2.82, + "grad_norm": 0.28016921877861023, + "learning_rate": 3.0365238084805955e-06, + "loss": 0.7831, + "step": 23435 + }, + { + "epoch": 2.82, + "grad_norm": 0.2581021189689636, + "learning_rate": 3.015957731954427e-06, + "loss": 0.7122, + "step": 23440 + }, + { + "epoch": 2.82, + "grad_norm": 0.2601596713066101, + "learning_rate": 2.9954608309997296e-06, + "loss": 0.784, + "step": 23445 + }, + { + "epoch": 2.83, + "grad_norm": 0.2719314992427826, + "learning_rate": 2.9750331152630535e-06, + "loss": 0.7701, + "step": 23450 + }, + { + "epoch": 2.83, + "grad_norm": 0.25035297870635986, + "learning_rate": 2.9546745943583418e-06, + "loss": 0.785, + "step": 23455 + }, + { + "epoch": 2.83, + "grad_norm": 0.2835271656513214, + "learning_rate": 2.93438527786698e-06, + "loss": 0.7819, + "step": 23460 + }, + { + "epoch": 2.83, + "grad_norm": 0.2592954635620117, + "learning_rate": 2.914165175337796e-06, + "loss": 0.7281, + "step": 23465 + }, + { + "epoch": 2.83, + "grad_norm": 0.29438310861587524, + "learning_rate": 2.8940142962870784e-06, + "loss": 0.7564, + "step": 23470 + }, + { + "epoch": 2.83, + "grad_norm": 0.2590184807777405, + "learning_rate": 2.873932650198457e-06, + "loss": 0.862, + "step": 23475 + }, + { + "epoch": 2.83, + "grad_norm": 0.2566207945346832, + "learning_rate": 2.853920246523023e-06, + "loss": 0.7873, + "step": 23480 + }, + { + "epoch": 2.83, + "grad_norm": 0.2697911560535431, + "learning_rate": 2.8339770946793073e-06, + "loss": 0.8254, + "step": 23485 + }, + { + "epoch": 2.83, + "grad_norm": 0.2645513117313385, + "learning_rate": 2.814103204053203e-06, + "loss": 0.7198, + "step": 23490 + }, + { + "epoch": 2.83, + "grad_norm": 0.294327974319458, + "learning_rate": 2.794298583998028e-06, + "loss": 0.8079, + "step": 23495 + }, + { + "epoch": 2.83, + "grad_norm": 0.26449063420295715, + "learning_rate": 2.77456324383446e-06, + "loss": 0.6838, + "step": 23500 + }, + { + "epoch": 2.83, + "grad_norm": 0.27684205770492554, + "learning_rate": 2.7548971928506693e-06, + "loss": 0.7777, + "step": 23505 + }, + { + "epoch": 2.83, + "grad_norm": 0.29637137055397034, + "learning_rate": 2.7353004403021017e-06, + "loss": 0.8073, + "step": 23510 + }, + { + "epoch": 2.83, + "grad_norm": 0.297203004360199, + "learning_rate": 2.7157729954116634e-06, + "loss": 0.671, + "step": 23515 + }, + { + "epoch": 2.83, + "grad_norm": 0.27864888310432434, + "learning_rate": 2.6963148673696034e-06, + "loss": 0.7304, + "step": 23520 + }, + { + "epoch": 2.83, + "grad_norm": 0.30311015248298645, + "learning_rate": 2.676926065333562e-06, + "loss": 0.7151, + "step": 23525 + }, + { + "epoch": 2.84, + "grad_norm": 0.2594775855541229, + "learning_rate": 2.657606598428591e-06, + "loss": 0.8637, + "step": 23530 + }, + { + "epoch": 2.84, + "grad_norm": 0.2853897213935852, + "learning_rate": 2.6383564757470168e-06, + "loss": 0.7382, + "step": 23535 + }, + { + "epoch": 2.84, + "grad_norm": 0.27156689763069153, + "learning_rate": 2.6191757063486252e-06, + "loss": 0.7312, + "step": 23540 + }, + { + "epoch": 2.84, + "grad_norm": 0.26867052912712097, + "learning_rate": 2.6000642992605127e-06, + "loss": 0.7705, + "step": 23545 + }, + { + "epoch": 2.84, + "grad_norm": 0.27880826592445374, + "learning_rate": 2.581022263477134e-06, + "loss": 0.703, + "step": 23550 + }, + { + "epoch": 2.84, + "grad_norm": 0.26525866985321045, + "learning_rate": 2.5620496079603205e-06, + "loss": 0.7129, + "step": 23555 + }, + { + "epoch": 2.84, + "grad_norm": 0.36034145951271057, + "learning_rate": 2.5431463416392296e-06, + "loss": 0.7513, + "step": 23560 + }, + { + "epoch": 2.84, + "grad_norm": 0.28856807947158813, + "learning_rate": 2.5243124734103616e-06, + "loss": 0.7044, + "step": 23565 + }, + { + "epoch": 2.84, + "grad_norm": 0.27370092272758484, + "learning_rate": 2.5055480121375426e-06, + "loss": 0.7817, + "step": 23570 + }, + { + "epoch": 2.84, + "grad_norm": 0.28846797347068787, + "learning_rate": 2.486852966651992e-06, + "loss": 0.7372, + "step": 23575 + }, + { + "epoch": 2.84, + "grad_norm": 0.26241225004196167, + "learning_rate": 2.468227345752155e-06, + "loss": 0.7602, + "step": 23580 + }, + { + "epoch": 2.84, + "grad_norm": 0.30139365792274475, + "learning_rate": 2.4496711582039365e-06, + "loss": 0.7023, + "step": 23585 + }, + { + "epoch": 2.84, + "grad_norm": 0.29070574045181274, + "learning_rate": 2.4311844127404668e-06, + "loss": 0.7737, + "step": 23590 + }, + { + "epoch": 2.84, + "grad_norm": 0.3289770781993866, + "learning_rate": 2.41276711806222e-06, + "loss": 0.8992, + "step": 23595 + }, + { + "epoch": 2.84, + "grad_norm": 0.2710113525390625, + "learning_rate": 2.394419282836979e-06, + "loss": 0.7763, + "step": 23600 + }, + { + "epoch": 2.84, + "grad_norm": 0.31186643242836, + "learning_rate": 2.3761409156998532e-06, + "loss": 0.7533, + "step": 23605 + }, + { + "epoch": 2.84, + "grad_norm": 0.2570800185203552, + "learning_rate": 2.357932025253262e-06, + "loss": 0.7039, + "step": 23610 + }, + { + "epoch": 2.85, + "grad_norm": 0.2939087450504303, + "learning_rate": 2.3397926200668994e-06, + "loss": 0.8794, + "step": 23615 + }, + { + "epoch": 2.85, + "grad_norm": 0.26302385330200195, + "learning_rate": 2.3217227086777533e-06, + "loss": 0.6978, + "step": 23620 + }, + { + "epoch": 2.85, + "grad_norm": 0.27785399556159973, + "learning_rate": 2.3037222995901716e-06, + "loss": 0.7772, + "step": 23625 + }, + { + "epoch": 2.85, + "grad_norm": 0.2980126738548279, + "learning_rate": 2.2857914012757107e-06, + "loss": 0.7843, + "step": 23630 + }, + { + "epoch": 2.85, + "grad_norm": 0.30063748359680176, + "learning_rate": 2.267930022173253e-06, + "loss": 0.7381, + "step": 23635 + }, + { + "epoch": 2.85, + "grad_norm": 0.2854771316051483, + "learning_rate": 2.250138170688942e-06, + "loss": 0.7182, + "step": 23640 + }, + { + "epoch": 2.85, + "grad_norm": 0.2724713683128357, + "learning_rate": 2.2324158551962457e-06, + "loss": 0.8062, + "step": 23645 + }, + { + "epoch": 2.85, + "grad_norm": 0.2653485834598541, + "learning_rate": 2.214763084035842e-06, + "loss": 0.6749, + "step": 23650 + }, + { + "epoch": 2.85, + "grad_norm": 0.27311810851097107, + "learning_rate": 2.197179865515736e-06, + "loss": 0.6745, + "step": 23655 + }, + { + "epoch": 2.85, + "grad_norm": 0.3039811849594116, + "learning_rate": 2.179666207911157e-06, + "loss": 0.7771, + "step": 23660 + }, + { + "epoch": 2.85, + "grad_norm": 0.30827796459198, + "learning_rate": 2.1622221194646294e-06, + "loss": 0.7983, + "step": 23665 + }, + { + "epoch": 2.85, + "grad_norm": 0.27435705065727234, + "learning_rate": 2.144847608385919e-06, + "loss": 0.9009, + "step": 23670 + }, + { + "epoch": 2.85, + "grad_norm": 0.2926265299320221, + "learning_rate": 2.1275426828520347e-06, + "loss": 0.7032, + "step": 23675 + }, + { + "epoch": 2.85, + "grad_norm": 0.30964577198028564, + "learning_rate": 2.110307351007262e-06, + "loss": 0.7401, + "step": 23680 + }, + { + "epoch": 2.85, + "grad_norm": 0.2811949849128723, + "learning_rate": 2.0931416209631126e-06, + "loss": 0.6413, + "step": 23685 + }, + { + "epoch": 2.85, + "grad_norm": 0.2701903283596039, + "learning_rate": 2.07604550079834e-06, + "loss": 0.8361, + "step": 23690 + }, + { + "epoch": 2.85, + "grad_norm": 0.26354488730430603, + "learning_rate": 2.0590189985589755e-06, + "loss": 0.8271, + "step": 23695 + }, + { + "epoch": 2.86, + "grad_norm": 0.2724680006504059, + "learning_rate": 2.0420621222582255e-06, + "loss": 0.7632, + "step": 23700 + }, + { + "epoch": 2.86, + "grad_norm": 0.28979775309562683, + "learning_rate": 2.02517487987659e-06, + "loss": 0.7434, + "step": 23705 + }, + { + "epoch": 2.86, + "grad_norm": 0.2960839867591858, + "learning_rate": 2.0083572793617274e-06, + "loss": 0.717, + "step": 23710 + }, + { + "epoch": 2.86, + "grad_norm": 0.25892457365989685, + "learning_rate": 1.9916093286285904e-06, + "loss": 0.8393, + "step": 23715 + }, + { + "epoch": 2.86, + "grad_norm": 0.2716827690601349, + "learning_rate": 1.9749310355592907e-06, + "loss": 0.8344, + "step": 23720 + }, + { + "epoch": 2.86, + "grad_norm": 0.25879597663879395, + "learning_rate": 1.958322408003232e-06, + "loss": 0.7689, + "step": 23725 + }, + { + "epoch": 2.86, + "grad_norm": 0.2955610752105713, + "learning_rate": 1.9417834537769463e-06, + "loss": 0.7773, + "step": 23730 + }, + { + "epoch": 2.86, + "grad_norm": 0.2859145402908325, + "learning_rate": 1.925314180664239e-06, + "loss": 0.7632, + "step": 23735 + }, + { + "epoch": 2.86, + "grad_norm": 0.28816118836402893, + "learning_rate": 1.9089145964160614e-06, + "loss": 0.8108, + "step": 23740 + }, + { + "epoch": 2.86, + "grad_norm": 0.2830142080783844, + "learning_rate": 1.892584708750655e-06, + "loss": 0.7739, + "step": 23745 + }, + { + "epoch": 2.86, + "grad_norm": 0.28230804204940796, + "learning_rate": 1.876324525353373e-06, + "loss": 0.8458, + "step": 23750 + }, + { + "epoch": 2.86, + "grad_norm": 0.28245803713798523, + "learning_rate": 1.8601340538767938e-06, + "loss": 0.7592, + "step": 23755 + }, + { + "epoch": 2.86, + "grad_norm": 0.29182082414627075, + "learning_rate": 1.8440133019407056e-06, + "loss": 0.701, + "step": 23760 + }, + { + "epoch": 2.86, + "grad_norm": 0.2928227186203003, + "learning_rate": 1.8279622771320723e-06, + "loss": 0.7389, + "step": 23765 + }, + { + "epoch": 2.86, + "grad_norm": 0.2605980932712555, + "learning_rate": 1.8119809870050351e-06, + "loss": 0.8737, + "step": 23770 + }, + { + "epoch": 2.86, + "grad_norm": 0.27855974435806274, + "learning_rate": 1.7960694390809105e-06, + "loss": 0.8722, + "step": 23775 + }, + { + "epoch": 2.87, + "grad_norm": 0.2521946430206299, + "learning_rate": 1.7802276408482086e-06, + "loss": 0.8727, + "step": 23780 + }, + { + "epoch": 2.87, + "grad_norm": 0.2708735167980194, + "learning_rate": 1.7644555997626153e-06, + "loss": 0.844, + "step": 23785 + }, + { + "epoch": 2.87, + "grad_norm": 0.26479655504226685, + "learning_rate": 1.7487533232469597e-06, + "loss": 0.801, + "step": 23790 + }, + { + "epoch": 2.87, + "grad_norm": 0.32824069261550903, + "learning_rate": 1.7331208186912472e-06, + "loss": 0.7282, + "step": 23795 + }, + { + "epoch": 2.87, + "grad_norm": 0.2727472186088562, + "learning_rate": 1.7175580934526756e-06, + "loss": 0.7332, + "step": 23800 + }, + { + "epoch": 2.87, + "grad_norm": 0.2697487771511078, + "learning_rate": 1.7020651548555863e-06, + "loss": 0.8685, + "step": 23805 + }, + { + "epoch": 2.87, + "grad_norm": 0.3158542513847351, + "learning_rate": 1.6866420101914468e-06, + "loss": 0.7077, + "step": 23810 + }, + { + "epoch": 2.87, + "grad_norm": 0.2666032612323761, + "learning_rate": 1.6712886667189173e-06, + "loss": 0.784, + "step": 23815 + }, + { + "epoch": 2.87, + "grad_norm": 0.28167471289634705, + "learning_rate": 1.6560051316637678e-06, + "loss": 0.6687, + "step": 23820 + }, + { + "epoch": 2.87, + "grad_norm": 0.2855972945690155, + "learning_rate": 1.640791412218978e-06, + "loss": 0.8986, + "step": 23825 + }, + { + "epoch": 2.87, + "grad_norm": 0.28779542446136475, + "learning_rate": 1.6256475155446037e-06, + "loss": 0.7325, + "step": 23830 + }, + { + "epoch": 2.87, + "grad_norm": 0.27318522334098816, + "learning_rate": 1.61057344876786e-06, + "loss": 0.783, + "step": 23835 + }, + { + "epoch": 2.87, + "grad_norm": 0.29986461997032166, + "learning_rate": 1.595569218983106e-06, + "loss": 0.8758, + "step": 23840 + }, + { + "epoch": 2.87, + "grad_norm": 0.2557889223098755, + "learning_rate": 1.5806348332518593e-06, + "loss": 0.78, + "step": 23845 + }, + { + "epoch": 2.87, + "grad_norm": 0.26995155215263367, + "learning_rate": 1.5657702986026976e-06, + "loss": 0.8988, + "step": 23850 + }, + { + "epoch": 2.87, + "grad_norm": 0.2958948612213135, + "learning_rate": 1.550975622031375e-06, + "loss": 0.7705, + "step": 23855 + }, + { + "epoch": 2.87, + "grad_norm": 0.27343350648880005, + "learning_rate": 1.5362508105007721e-06, + "loss": 0.8335, + "step": 23860 + }, + { + "epoch": 2.88, + "grad_norm": 0.34859418869018555, + "learning_rate": 1.5215958709408783e-06, + "loss": 0.8144, + "step": 23865 + }, + { + "epoch": 2.88, + "grad_norm": 0.26644057035446167, + "learning_rate": 1.50701081024876e-06, + "loss": 0.7872, + "step": 23870 + }, + { + "epoch": 2.88, + "grad_norm": 0.26625731587409973, + "learning_rate": 1.4924956352886763e-06, + "loss": 0.769, + "step": 23875 + }, + { + "epoch": 2.88, + "grad_norm": 0.28103673458099365, + "learning_rate": 1.478050352891913e-06, + "loss": 0.8232, + "step": 23880 + }, + { + "epoch": 2.88, + "grad_norm": 0.3763957619667053, + "learning_rate": 1.463674969856915e-06, + "loss": 0.7883, + "step": 23885 + }, + { + "epoch": 2.88, + "grad_norm": 0.28736910223960876, + "learning_rate": 1.4493694929492206e-06, + "loss": 0.722, + "step": 23890 + }, + { + "epoch": 2.88, + "grad_norm": 0.2688366770744324, + "learning_rate": 1.4351339289014608e-06, + "loss": 0.9024, + "step": 23895 + }, + { + "epoch": 2.88, + "grad_norm": 0.27338284254074097, + "learning_rate": 1.420968284413343e-06, + "loss": 0.8657, + "step": 23900 + }, + { + "epoch": 2.88, + "grad_norm": 0.2942239046096802, + "learning_rate": 1.4068725661517343e-06, + "loss": 0.7344, + "step": 23905 + }, + { + "epoch": 2.88, + "grad_norm": 0.26615390181541443, + "learning_rate": 1.392846780750495e-06, + "loss": 0.7641, + "step": 23910 + }, + { + "epoch": 2.88, + "grad_norm": 0.27061372995376587, + "learning_rate": 1.378890934810678e-06, + "loss": 0.7982, + "step": 23915 + }, + { + "epoch": 2.88, + "grad_norm": 0.2560684084892273, + "learning_rate": 1.3650050349003294e-06, + "loss": 0.7434, + "step": 23920 + }, + { + "epoch": 2.88, + "grad_norm": 0.2993808388710022, + "learning_rate": 1.3511890875546217e-06, + "loss": 0.7741, + "step": 23925 + }, + { + "epoch": 2.88, + "grad_norm": 0.2594783902168274, + "learning_rate": 1.3374430992758033e-06, + "loss": 0.827, + "step": 23930 + }, + { + "epoch": 2.88, + "grad_norm": 0.31000006198883057, + "learning_rate": 1.3237670765331998e-06, + "loss": 0.7506, + "step": 23935 + }, + { + "epoch": 2.88, + "grad_norm": 0.3279038071632385, + "learning_rate": 1.310161025763179e-06, + "loss": 0.7283, + "step": 23940 + }, + { + "epoch": 2.89, + "grad_norm": 0.2829456925392151, + "learning_rate": 1.2966249533692352e-06, + "loss": 0.7266, + "step": 23945 + }, + { + "epoch": 2.89, + "grad_norm": 0.2958797812461853, + "learning_rate": 1.2831588657218728e-06, + "loss": 0.6592, + "step": 23950 + }, + { + "epoch": 2.89, + "grad_norm": 0.2799309194087982, + "learning_rate": 1.2697627691586887e-06, + "loss": 0.7966, + "step": 23955 + }, + { + "epoch": 2.89, + "grad_norm": 0.3030032217502594, + "learning_rate": 1.2564366699842899e-06, + "loss": 0.8914, + "step": 23960 + }, + { + "epoch": 2.89, + "grad_norm": 0.32187244296073914, + "learning_rate": 1.2431805744704426e-06, + "loss": 0.8033, + "step": 23965 + }, + { + "epoch": 2.89, + "grad_norm": 0.2817818820476532, + "learning_rate": 1.2299944888558732e-06, + "loss": 0.781, + "step": 23970 + }, + { + "epoch": 2.89, + "grad_norm": 0.27116790413856506, + "learning_rate": 1.2194960316631197e-06, + "loss": 0.7533, + "step": 23975 + }, + { + "epoch": 2.89, + "grad_norm": 0.3104703426361084, + "learning_rate": 1.2064359794837553e-06, + "loss": 0.8843, + "step": 23980 + }, + { + "epoch": 2.89, + "grad_norm": 0.27435311675071716, + "learning_rate": 1.1934459544969023e-06, + "loss": 0.8259, + "step": 23985 + }, + { + "epoch": 2.89, + "grad_norm": 0.29726967215538025, + "learning_rate": 1.1805259628160867e-06, + "loss": 0.7628, + "step": 23990 + }, + { + "epoch": 2.89, + "grad_norm": 0.2689824104309082, + "learning_rate": 1.167676010521912e-06, + "loss": 0.8803, + "step": 23995 + }, + { + "epoch": 2.89, + "grad_norm": 0.27188640832901, + "learning_rate": 1.154896103661973e-06, + "loss": 0.8005, + "step": 24000 + }, + { + "epoch": 2.89, + "grad_norm": 0.2570241689682007, + "learning_rate": 1.1421862482509093e-06, + "loss": 0.7015, + "step": 24005 + }, + { + "epoch": 2.89, + "grad_norm": 0.27499955892562866, + "learning_rate": 1.1295464502704187e-06, + "loss": 0.8028, + "step": 24010 + }, + { + "epoch": 2.89, + "grad_norm": 0.30428576469421387, + "learning_rate": 1.11697671566921e-06, + "loss": 0.8024, + "step": 24015 + }, + { + "epoch": 2.89, + "grad_norm": 0.3098774254322052, + "learning_rate": 1.1044770503630008e-06, + "loss": 0.7861, + "step": 24020 + }, + { + "epoch": 2.89, + "grad_norm": 0.2888442575931549, + "learning_rate": 1.092047460234552e-06, + "loss": 0.7146, + "step": 24025 + }, + { + "epoch": 2.9, + "grad_norm": 0.2686418294906616, + "learning_rate": 1.0796879511336676e-06, + "loss": 0.6743, + "step": 24030 + }, + { + "epoch": 2.9, + "grad_norm": 0.26199212670326233, + "learning_rate": 1.0673985288771114e-06, + "loss": 0.7769, + "step": 24035 + }, + { + "epoch": 2.9, + "grad_norm": 0.27240923047065735, + "learning_rate": 1.0551791992487068e-06, + "loss": 0.8166, + "step": 24040 + }, + { + "epoch": 2.9, + "grad_norm": 0.2600533366203308, + "learning_rate": 1.0430299679992704e-06, + "loss": 0.7604, + "step": 24045 + }, + { + "epoch": 2.9, + "grad_norm": 0.2539452612400055, + "learning_rate": 1.0309508408466617e-06, + "loss": 0.8341, + "step": 24050 + }, + { + "epoch": 2.9, + "grad_norm": 0.23374611139297485, + "learning_rate": 1.018941823475683e-06, + "loss": 0.8019, + "step": 24055 + }, + { + "epoch": 2.9, + "grad_norm": 0.2858569025993347, + "learning_rate": 1.0070029215381803e-06, + "loss": 0.8227, + "step": 24060 + }, + { + "epoch": 2.9, + "grad_norm": 0.3056763708591461, + "learning_rate": 9.951341406530088e-07, + "loss": 0.7465, + "step": 24065 + }, + { + "epoch": 2.9, + "grad_norm": 0.2973862290382385, + "learning_rate": 9.833354864060171e-07, + "loss": 0.6763, + "step": 24070 + }, + { + "epoch": 2.9, + "grad_norm": 0.30157533288002014, + "learning_rate": 9.716069643500467e-07, + "loss": 0.8366, + "step": 24075 + }, + { + "epoch": 2.9, + "grad_norm": 0.2873646318912506, + "learning_rate": 9.59948580004899e-07, + "loss": 0.6718, + "step": 24080 + }, + { + "epoch": 2.9, + "grad_norm": 0.2701966464519501, + "learning_rate": 9.483603388574345e-07, + "loss": 0.8791, + "step": 24085 + }, + { + "epoch": 2.9, + "grad_norm": 0.283039927482605, + "learning_rate": 9.368422463614411e-07, + "loss": 0.7677, + "step": 24090 + }, + { + "epoch": 2.9, + "grad_norm": 0.3029240071773529, + "learning_rate": 9.253943079377157e-07, + "loss": 0.8087, + "step": 24095 + }, + { + "epoch": 2.9, + "grad_norm": 0.2594771683216095, + "learning_rate": 9.14016528974032e-07, + "loss": 0.7987, + "step": 24100 + }, + { + "epoch": 2.9, + "grad_norm": 0.2657313048839569, + "learning_rate": 9.027089148251731e-07, + "loss": 0.7931, + "step": 24105 + }, + { + "epoch": 2.9, + "grad_norm": 0.26724809408187866, + "learning_rate": 8.914714708128657e-07, + "loss": 0.7604, + "step": 24110 + }, + { + "epoch": 2.91, + "grad_norm": 0.2993227541446686, + "learning_rate": 8.803042022258289e-07, + "loss": 0.769, + "step": 24115 + }, + { + "epoch": 2.91, + "grad_norm": 0.27348792552948, + "learning_rate": 8.692071143197588e-07, + "loss": 0.7721, + "step": 24120 + }, + { + "epoch": 2.91, + "grad_norm": 0.2768844664096832, + "learning_rate": 8.581802123172776e-07, + "loss": 0.7656, + "step": 24125 + }, + { + "epoch": 2.91, + "grad_norm": 0.2716030478477478, + "learning_rate": 8.472235014080508e-07, + "loss": 0.8586, + "step": 24130 + }, + { + "epoch": 2.91, + "grad_norm": 0.2838304340839386, + "learning_rate": 8.363369867486369e-07, + "loss": 0.6991, + "step": 24135 + }, + { + "epoch": 2.91, + "grad_norm": 0.2540549039840698, + "learning_rate": 8.255206734626207e-07, + "loss": 0.8098, + "step": 24140 + }, + { + "epoch": 2.91, + "grad_norm": 0.30254021286964417, + "learning_rate": 8.147745666405137e-07, + "loss": 0.8074, + "step": 24145 + }, + { + "epoch": 2.91, + "grad_norm": 0.33351510763168335, + "learning_rate": 8.040986713397867e-07, + "loss": 0.756, + "step": 24150 + }, + { + "epoch": 2.91, + "grad_norm": 0.28228092193603516, + "learning_rate": 7.934929925848543e-07, + "loss": 0.724, + "step": 24155 + }, + { + "epoch": 2.91, + "grad_norm": 0.290088951587677, + "learning_rate": 7.829575353671235e-07, + "loss": 0.7698, + "step": 24160 + }, + { + "epoch": 2.91, + "grad_norm": 0.29970574378967285, + "learning_rate": 7.724923046449117e-07, + "loss": 0.7149, + "step": 24165 + }, + { + "epoch": 2.91, + "grad_norm": 0.2955015301704407, + "learning_rate": 7.62097305343512e-07, + "loss": 0.8158, + "step": 24170 + }, + { + "epoch": 2.91, + "grad_norm": 0.31175607442855835, + "learning_rate": 7.517725423551613e-07, + "loss": 0.6608, + "step": 24175 + }, + { + "epoch": 2.91, + "grad_norm": 0.3113124966621399, + "learning_rate": 7.415180205390392e-07, + "loss": 0.7003, + "step": 24180 + }, + { + "epoch": 2.91, + "grad_norm": 0.25491559505462646, + "learning_rate": 7.313337447212519e-07, + "loss": 0.777, + "step": 24185 + }, + { + "epoch": 2.91, + "grad_norm": 0.3029264509677887, + "learning_rate": 7.212197196948655e-07, + "loss": 0.7743, + "step": 24190 + }, + { + "epoch": 2.92, + "grad_norm": 0.25185316801071167, + "learning_rate": 7.111759502198721e-07, + "loss": 0.7702, + "step": 24195 + }, + { + "epoch": 2.92, + "grad_norm": 0.29424992203712463, + "learning_rate": 7.012024410232076e-07, + "loss": 0.7636, + "step": 24200 + }, + { + "epoch": 2.92, + "grad_norm": 0.25971463322639465, + "learning_rate": 6.912991967987169e-07, + "loss": 0.792, + "step": 24205 + }, + { + "epoch": 2.92, + "grad_norm": 0.2720402479171753, + "learning_rate": 6.814662222072387e-07, + "loss": 0.7822, + "step": 24210 + }, + { + "epoch": 2.92, + "grad_norm": 0.2848743498325348, + "learning_rate": 6.717035218764543e-07, + "loss": 0.7481, + "step": 24215 + }, + { + "epoch": 2.92, + "grad_norm": 0.31447309255599976, + "learning_rate": 6.620111004010387e-07, + "loss": 0.7175, + "step": 24220 + }, + { + "epoch": 2.92, + "grad_norm": 0.2868044376373291, + "learning_rate": 6.52388962342576e-07, + "loss": 0.7447, + "step": 24225 + }, + { + "epoch": 2.92, + "grad_norm": 0.281565397977829, + "learning_rate": 6.428371122295273e-07, + "loss": 0.739, + "step": 24230 + }, + { + "epoch": 2.92, + "grad_norm": 0.27926504611968994, + "learning_rate": 6.333555545573299e-07, + "loss": 0.7863, + "step": 24235 + }, + { + "epoch": 2.92, + "grad_norm": 0.2796558439731598, + "learning_rate": 6.23944293788331e-07, + "loss": 0.7584, + "step": 24240 + }, + { + "epoch": 2.92, + "grad_norm": 0.27483847737312317, + "learning_rate": 6.146033343517709e-07, + "loss": 0.777, + "step": 24245 + }, + { + "epoch": 2.92, + "grad_norm": 0.27021974325180054, + "learning_rate": 6.053326806438163e-07, + "loss": 0.8146, + "step": 24250 + }, + { + "epoch": 2.92, + "grad_norm": 0.28549179434776306, + "learning_rate": 5.961323370275439e-07, + "loss": 0.8211, + "step": 24255 + }, + { + "epoch": 2.92, + "grad_norm": 0.2960338592529297, + "learning_rate": 5.870023078329567e-07, + "loss": 0.73, + "step": 24260 + }, + { + "epoch": 2.92, + "grad_norm": 0.2839416563510895, + "learning_rate": 5.779425973569174e-07, + "loss": 0.791, + "step": 24265 + }, + { + "epoch": 2.92, + "grad_norm": 0.28548988699913025, + "learning_rate": 5.689532098632487e-07, + "loss": 0.7879, + "step": 24270 + }, + { + "epoch": 2.92, + "grad_norm": 0.289668470621109, + "learning_rate": 5.6003414958265e-07, + "loss": 0.7007, + "step": 24275 + }, + { + "epoch": 2.93, + "grad_norm": 0.2637322247028351, + "learning_rate": 5.511854207127298e-07, + "loss": 0.8707, + "step": 24280 + }, + { + "epoch": 2.93, + "grad_norm": 0.27071821689605713, + "learning_rate": 5.424070274179904e-07, + "loss": 0.7963, + "step": 24285 + }, + { + "epoch": 2.93, + "grad_norm": 0.2783946692943573, + "learning_rate": 5.336989738298436e-07, + "loss": 0.7763, + "step": 24290 + }, + { + "epoch": 2.93, + "grad_norm": 0.24435043334960938, + "learning_rate": 5.250612640465779e-07, + "loss": 0.7297, + "step": 24295 + }, + { + "epoch": 2.93, + "grad_norm": 0.26858675479888916, + "learning_rate": 5.164939021334081e-07, + "loss": 0.7109, + "step": 24300 + }, + { + "epoch": 2.93, + "grad_norm": 0.28042492270469666, + "learning_rate": 5.079968921223754e-07, + "loss": 0.7229, + "step": 24305 + }, + { + "epoch": 2.93, + "grad_norm": 0.26023703813552856, + "learning_rate": 4.995702380124977e-07, + "loss": 0.695, + "step": 24310 + }, + { + "epoch": 2.93, + "grad_norm": 0.27599677443504333, + "learning_rate": 4.912139437696195e-07, + "loss": 0.7659, + "step": 24315 + }, + { + "epoch": 2.93, + "grad_norm": 0.2630544602870941, + "learning_rate": 4.829280133264779e-07, + "loss": 0.7636, + "step": 24320 + }, + { + "epoch": 2.93, + "grad_norm": 0.30303630232810974, + "learning_rate": 4.747124505827371e-07, + "loss": 0.6761, + "step": 24325 + }, + { + "epoch": 2.93, + "grad_norm": 0.26069948077201843, + "learning_rate": 4.665672594048875e-07, + "loss": 0.8025, + "step": 24330 + }, + { + "epoch": 2.93, + "grad_norm": 0.2887049615383148, + "learning_rate": 4.5849244362634597e-07, + "loss": 0.8476, + "step": 24335 + }, + { + "epoch": 2.93, + "grad_norm": 0.2869950532913208, + "learning_rate": 4.504880070473726e-07, + "loss": 0.8227, + "step": 24340 + }, + { + "epoch": 2.93, + "grad_norm": 0.2907213866710663, + "learning_rate": 4.4255395343513744e-07, + "loss": 0.8843, + "step": 24345 + }, + { + "epoch": 2.93, + "grad_norm": 0.27622485160827637, + "learning_rate": 4.3469028652365346e-07, + "loss": 0.7809, + "step": 24350 + }, + { + "epoch": 2.93, + "grad_norm": 0.3061695098876953, + "learning_rate": 4.268970100138269e-07, + "loss": 0.6801, + "step": 24355 + }, + { + "epoch": 2.94, + "grad_norm": 0.26443079113960266, + "learning_rate": 4.1917412757345724e-07, + "loss": 0.7058, + "step": 24360 + }, + { + "epoch": 2.94, + "grad_norm": 0.30546635389328003, + "learning_rate": 4.1152164283715373e-07, + "loss": 0.8242, + "step": 24365 + }, + { + "epoch": 2.94, + "grad_norm": 0.265575647354126, + "learning_rate": 4.039395594064521e-07, + "loss": 0.7344, + "step": 24370 + }, + { + "epoch": 2.94, + "grad_norm": 0.29234451055526733, + "learning_rate": 3.9642788084971454e-07, + "loss": 0.7067, + "step": 24375 + }, + { + "epoch": 2.94, + "grad_norm": 0.25120893120765686, + "learning_rate": 3.889866107021966e-07, + "loss": 0.7673, + "step": 24380 + }, + { + "epoch": 2.94, + "grad_norm": 0.32437393069267273, + "learning_rate": 3.816157524660135e-07, + "loss": 0.6869, + "step": 24385 + }, + { + "epoch": 2.94, + "grad_norm": 0.2962343096733093, + "learning_rate": 3.7431530961014034e-07, + "loss": 0.7815, + "step": 24390 + }, + { + "epoch": 2.94, + "grad_norm": 0.2750551700592041, + "learning_rate": 3.6708528557037873e-07, + "loss": 0.8175, + "step": 24395 + }, + { + "epoch": 2.94, + "grad_norm": 0.2620263993740082, + "learning_rate": 3.5992568374945685e-07, + "loss": 0.7162, + "step": 24400 + }, + { + "epoch": 2.94, + "grad_norm": 0.26457253098487854, + "learning_rate": 3.528365075168793e-07, + "loss": 0.6064, + "step": 24405 + }, + { + "epoch": 2.94, + "grad_norm": 0.29997846484184265, + "learning_rate": 3.4581776020907725e-07, + "loss": 0.8372, + "step": 24410 + }, + { + "epoch": 2.94, + "grad_norm": 0.2950671315193176, + "learning_rate": 3.388694451293084e-07, + "loss": 0.7008, + "step": 24415 + }, + { + "epoch": 2.94, + "grad_norm": 0.28241240978240967, + "learning_rate": 3.3199156554765684e-07, + "loss": 0.7895, + "step": 24420 + }, + { + "epoch": 2.94, + "grad_norm": 0.2898365557193756, + "learning_rate": 3.251841247011167e-07, + "loss": 0.6871, + "step": 24425 + }, + { + "epoch": 2.94, + "grad_norm": 0.2402508705854416, + "learning_rate": 3.1844712579345843e-07, + "loss": 0.8315, + "step": 24430 + }, + { + "epoch": 2.94, + "grad_norm": 0.2538807988166809, + "learning_rate": 3.1178057199536256e-07, + "loss": 0.7951, + "step": 24435 + }, + { + "epoch": 2.94, + "grad_norm": 0.24774649739265442, + "learning_rate": 3.051844664443026e-07, + "loss": 0.7521, + "step": 24440 + }, + { + "epoch": 2.95, + "grad_norm": 0.29270678758621216, + "learning_rate": 2.9865881224466206e-07, + "loss": 0.744, + "step": 24445 + }, + { + "epoch": 2.95, + "grad_norm": 0.2553872764110565, + "learning_rate": 2.9220361246761746e-07, + "loss": 0.8013, + "step": 24450 + }, + { + "epoch": 2.95, + "grad_norm": 0.24886707961559296, + "learning_rate": 2.858188701511721e-07, + "loss": 0.7174, + "step": 24455 + }, + { + "epoch": 2.95, + "grad_norm": 0.23313623666763306, + "learning_rate": 2.7950458830023893e-07, + "loss": 0.7931, + "step": 24460 + }, + { + "epoch": 2.95, + "grad_norm": 0.25668811798095703, + "learning_rate": 2.7326076988650747e-07, + "loss": 0.7746, + "step": 24465 + }, + { + "epoch": 2.95, + "grad_norm": 0.2648046016693115, + "learning_rate": 2.6708741784852716e-07, + "loss": 0.8661, + "step": 24470 + }, + { + "epoch": 2.95, + "grad_norm": 0.2649308741092682, + "learning_rate": 2.6098453509167394e-07, + "loss": 0.8434, + "step": 24475 + }, + { + "epoch": 2.95, + "grad_norm": 0.29732733964920044, + "learning_rate": 2.549521244881836e-07, + "loss": 0.8796, + "step": 24480 + }, + { + "epoch": 2.95, + "grad_norm": 0.3012838065624237, + "learning_rate": 2.489901888771184e-07, + "loss": 0.7311, + "step": 24485 + }, + { + "epoch": 2.95, + "grad_norm": 0.3150104880332947, + "learning_rate": 2.4309873106433395e-07, + "loss": 0.75, + "step": 24490 + }, + { + "epoch": 2.95, + "grad_norm": 0.260047048330307, + "learning_rate": 2.3727775382256232e-07, + "loss": 0.7938, + "step": 24495 + }, + { + "epoch": 2.95, + "grad_norm": 0.26424214243888855, + "learning_rate": 2.3152725989132869e-07, + "loss": 0.8173, + "step": 24500 + }, + { + "epoch": 2.95, + "grad_norm": 0.29962679743766785, + "learning_rate": 2.2584725197705156e-07, + "loss": 0.7686, + "step": 24505 + }, + { + "epoch": 2.95, + "grad_norm": 0.2528585195541382, + "learning_rate": 2.202377327528926e-07, + "loss": 0.7052, + "step": 24510 + }, + { + "epoch": 2.95, + "grad_norm": 0.31817159056663513, + "learning_rate": 2.1469870485888995e-07, + "loss": 0.8064, + "step": 24515 + }, + { + "epoch": 2.95, + "grad_norm": 0.26599112153053284, + "learning_rate": 2.092301709018751e-07, + "loss": 0.8348, + "step": 24520 + }, + { + "epoch": 2.95, + "grad_norm": 0.26814529299736023, + "learning_rate": 2.0383213345555595e-07, + "loss": 0.8049, + "step": 24525 + }, + { + "epoch": 2.96, + "grad_norm": 0.2638113796710968, + "learning_rate": 1.9850459506041693e-07, + "loss": 0.8862, + "step": 24530 + }, + { + "epoch": 2.96, + "grad_norm": 0.2369757890701294, + "learning_rate": 1.9324755822375248e-07, + "loss": 0.8741, + "step": 24535 + }, + { + "epoch": 2.96, + "grad_norm": 0.28597578406333923, + "learning_rate": 1.8806102541973346e-07, + "loss": 0.6238, + "step": 24540 + }, + { + "epoch": 2.96, + "grad_norm": 0.30425626039505005, + "learning_rate": 1.829449990892906e-07, + "loss": 0.788, + "step": 24545 + }, + { + "epoch": 2.96, + "grad_norm": 0.29849773645401, + "learning_rate": 1.7789948164019795e-07, + "loss": 0.769, + "step": 24550 + }, + { + "epoch": 2.96, + "grad_norm": 0.27328890562057495, + "learning_rate": 1.7292447544703937e-07, + "loss": 0.7794, + "step": 24555 + }, + { + "epoch": 2.96, + "grad_norm": 0.26762855052948, + "learning_rate": 1.6801998285124186e-07, + "loss": 0.7316, + "step": 24560 + }, + { + "epoch": 2.96, + "grad_norm": 0.2702507972717285, + "learning_rate": 1.6318600616099241e-07, + "loss": 0.8332, + "step": 24565 + }, + { + "epoch": 2.96, + "grad_norm": 0.2567251920700073, + "learning_rate": 1.5842254765135453e-07, + "loss": 0.7298, + "step": 24570 + }, + { + "epoch": 2.96, + "grad_norm": 0.2939104437828064, + "learning_rate": 1.5372960956413493e-07, + "loss": 0.688, + "step": 24575 + }, + { + "epoch": 2.96, + "grad_norm": 0.28854793310165405, + "learning_rate": 1.491071941080002e-07, + "loss": 0.7518, + "step": 24580 + }, + { + "epoch": 2.96, + "grad_norm": 0.2733674645423889, + "learning_rate": 1.445553034584268e-07, + "loss": 0.7832, + "step": 24585 + }, + { + "epoch": 2.96, + "grad_norm": 0.27166128158569336, + "learning_rate": 1.4007393975766777e-07, + "loss": 0.7416, + "step": 24590 + }, + { + "epoch": 2.96, + "grad_norm": 0.260300874710083, + "learning_rate": 1.3566310511480272e-07, + "loss": 0.7861, + "step": 24595 + }, + { + "epoch": 2.96, + "grad_norm": 0.22208818793296814, + "learning_rate": 1.3132280160572105e-07, + "loss": 0.7546, + "step": 24600 + }, + { + "epoch": 2.96, + "grad_norm": 0.2763347029685974, + "learning_rate": 1.270530312731055e-07, + "loss": 0.7988, + "step": 24605 + }, + { + "epoch": 2.97, + "grad_norm": 0.2827538847923279, + "learning_rate": 1.2285379612648195e-07, + "loss": 0.672, + "step": 24610 + }, + { + "epoch": 2.97, + "grad_norm": 0.27377375960350037, + "learning_rate": 1.187250981421195e-07, + "loss": 0.7166, + "step": 24615 + }, + { + "epoch": 2.97, + "grad_norm": 0.26959091424942017, + "learning_rate": 1.1466693926311388e-07, + "loss": 0.8364, + "step": 24620 + }, + { + "epoch": 2.97, + "grad_norm": 0.29500630497932434, + "learning_rate": 1.1067932139940394e-07, + "loss": 0.7982, + "step": 24625 + }, + { + "epoch": 2.97, + "grad_norm": 0.24503514170646667, + "learning_rate": 1.0676224642767184e-07, + "loss": 0.884, + "step": 24630 + }, + { + "epoch": 2.97, + "grad_norm": 0.30973246693611145, + "learning_rate": 1.0291571619142624e-07, + "loss": 0.7196, + "step": 24635 + }, + { + "epoch": 2.97, + "grad_norm": 0.30464237928390503, + "learning_rate": 9.913973250096907e-08, + "loss": 0.7846, + "step": 24640 + }, + { + "epoch": 2.97, + "grad_norm": 0.2849758565425873, + "learning_rate": 9.543429713339546e-08, + "loss": 0.8509, + "step": 24645 + }, + { + "epoch": 2.97, + "grad_norm": 0.2636762261390686, + "learning_rate": 9.179941183264372e-08, + "loss": 0.7391, + "step": 24650 + }, + { + "epoch": 2.97, + "grad_norm": 0.29409074783325195, + "learning_rate": 8.823507830936216e-08, + "loss": 0.8165, + "step": 24655 + }, + { + "epoch": 2.97, + "grad_norm": 0.2603693902492523, + "learning_rate": 8.474129824109222e-08, + "loss": 0.8494, + "step": 24660 + }, + { + "epoch": 2.97, + "grad_norm": 0.27702707052230835, + "learning_rate": 8.131807327208529e-08, + "loss": 0.7467, + "step": 24665 + }, + { + "epoch": 2.97, + "grad_norm": 0.2720186710357666, + "learning_rate": 7.796540501343595e-08, + "loss": 0.8084, + "step": 24670 + }, + { + "epoch": 2.97, + "grad_norm": 0.27940112352371216, + "learning_rate": 7.468329504303206e-08, + "loss": 0.8123, + "step": 24675 + }, + { + "epoch": 2.97, + "grad_norm": 0.2809736132621765, + "learning_rate": 7.147174490552132e-08, + "loss": 0.7652, + "step": 24680 + }, + { + "epoch": 2.97, + "grad_norm": 0.29643842577934265, + "learning_rate": 6.833075611239469e-08, + "loss": 0.8162, + "step": 24685 + }, + { + "epoch": 2.97, + "grad_norm": 0.329057514667511, + "learning_rate": 6.526033014188637e-08, + "loss": 0.883, + "step": 24690 + }, + { + "epoch": 2.98, + "grad_norm": 0.2862626612186432, + "learning_rate": 6.226046843904042e-08, + "loss": 0.7266, + "step": 24695 + }, + { + "epoch": 2.98, + "grad_norm": 0.28131476044654846, + "learning_rate": 5.933117241571084e-08, + "loss": 0.8039, + "step": 24700 + }, + { + "epoch": 2.98, + "grad_norm": 0.2740655839443207, + "learning_rate": 5.647244345049484e-08, + "loss": 0.7991, + "step": 24705 + }, + { + "epoch": 2.98, + "grad_norm": 0.2963801324367523, + "learning_rate": 5.368428288881621e-08, + "loss": 0.7506, + "step": 24710 + }, + { + "epoch": 2.98, + "grad_norm": 0.2843016982078552, + "learning_rate": 5.096669204287529e-08, + "loss": 0.7785, + "step": 24715 + }, + { + "epoch": 2.98, + "grad_norm": 0.2702232897281647, + "learning_rate": 4.831967219168231e-08, + "loss": 0.7964, + "step": 24720 + }, + { + "epoch": 2.98, + "grad_norm": 0.3016199767589569, + "learning_rate": 4.574322458097413e-08, + "loss": 0.7077, + "step": 24725 + }, + { + "epoch": 2.98, + "grad_norm": 0.31466734409332275, + "learning_rate": 4.3237350423330766e-08, + "loss": 0.775, + "step": 24730 + }, + { + "epoch": 2.98, + "grad_norm": 0.256837397813797, + "learning_rate": 4.080205089810884e-08, + "loss": 0.7682, + "step": 24735 + }, + { + "epoch": 2.98, + "grad_norm": 0.28616052865982056, + "learning_rate": 3.843732715142489e-08, + "loss": 0.7311, + "step": 24740 + }, + { + "epoch": 2.98, + "grad_norm": 0.24874311685562134, + "learning_rate": 3.614318029620533e-08, + "loss": 0.7582, + "step": 24745 + }, + { + "epoch": 2.98, + "grad_norm": 0.27316519618034363, + "learning_rate": 3.3919611412153156e-08, + "loss": 0.8188, + "step": 24750 + }, + { + "epoch": 2.98, + "grad_norm": 0.2963423728942871, + "learning_rate": 3.1766621545764594e-08, + "loss": 0.8122, + "step": 24755 + }, + { + "epoch": 2.98, + "grad_norm": 0.27651023864746094, + "learning_rate": 2.9684211710279128e-08, + "loss": 0.774, + "step": 24760 + }, + { + "epoch": 2.98, + "grad_norm": 0.28626778721809387, + "learning_rate": 2.7672382885762788e-08, + "loss": 0.8288, + "step": 24765 + }, + { + "epoch": 2.98, + "grad_norm": 0.2841629683971405, + "learning_rate": 2.5731136019058186e-08, + "loss": 0.7956, + "step": 24770 + }, + { + "epoch": 2.99, + "grad_norm": 0.27702802419662476, + "learning_rate": 2.3860472023767842e-08, + "loss": 0.7156, + "step": 24775 + }, + { + "epoch": 2.99, + "grad_norm": 0.3124939799308777, + "learning_rate": 2.206039178028751e-08, + "loss": 0.7434, + "step": 24780 + }, + { + "epoch": 2.99, + "grad_norm": 0.2639085650444031, + "learning_rate": 2.0330896135806184e-08, + "loss": 0.7925, + "step": 24785 + }, + { + "epoch": 2.99, + "grad_norm": 0.28985705971717834, + "learning_rate": 1.8671985904289422e-08, + "loss": 0.798, + "step": 24790 + }, + { + "epoch": 2.99, + "grad_norm": 0.29289746284484863, + "learning_rate": 1.7083661866446053e-08, + "loss": 0.8071, + "step": 24795 + }, + { + "epoch": 2.99, + "grad_norm": 0.25807228684425354, + "learning_rate": 1.5565924769811446e-08, + "loss": 0.8328, + "step": 24800 + }, + { + "epoch": 2.99, + "grad_norm": 0.2961677610874176, + "learning_rate": 1.4118775328680886e-08, + "loss": 0.7459, + "step": 24805 + }, + { + "epoch": 2.99, + "grad_norm": 0.28014975786209106, + "learning_rate": 1.2742214224126246e-08, + "loss": 0.8192, + "step": 24810 + }, + { + "epoch": 2.99, + "grad_norm": 0.26734206080436707, + "learning_rate": 1.143624210402927e-08, + "loss": 0.8212, + "step": 24815 + }, + { + "epoch": 2.99, + "grad_norm": 0.2634017765522003, + "learning_rate": 1.0200859582981669e-08, + "loss": 0.7237, + "step": 24820 + }, + { + "epoch": 2.99, + "grad_norm": 0.27349719405174255, + "learning_rate": 9.036067242418343e-09, + "loss": 0.8578, + "step": 24825 + }, + { + "epoch": 2.99, + "grad_norm": 0.2613513171672821, + "learning_rate": 7.941865630534116e-09, + "loss": 0.7642, + "step": 24830 + }, + { + "epoch": 2.99, + "grad_norm": 0.2503609359264374, + "learning_rate": 6.918255262300387e-09, + "loss": 0.7935, + "step": 24835 + }, + { + "epoch": 2.99, + "grad_norm": 0.27115345001220703, + "learning_rate": 5.9652366194318235e-09, + "loss": 0.7417, + "step": 24840 + }, + { + "epoch": 2.99, + "grad_norm": 0.29795727133750916, + "learning_rate": 5.082810150486283e-09, + "loss": 0.7581, + "step": 24845 + }, + { + "epoch": 2.99, + "grad_norm": 0.2834166884422302, + "learning_rate": 4.270976270731585e-09, + "loss": 0.685, + "step": 24850 + }, + { + "epoch": 2.99, + "grad_norm": 0.2606295347213745, + "learning_rate": 3.5297353622787406e-09, + "loss": 0.821, + "step": 24855 + }, + { + "epoch": 3.0, + "grad_norm": 0.27347010374069214, + "learning_rate": 2.8590877739487205e-09, + "loss": 0.7438, + "step": 24860 + }, + { + "epoch": 3.0, + "grad_norm": 0.2770770788192749, + "learning_rate": 2.2590338213890334e-09, + "loss": 0.8011, + "step": 24865 + }, + { + "epoch": 3.0, + "grad_norm": 0.27099478244781494, + "learning_rate": 1.729573786990457e-09, + "loss": 0.8102, + "step": 24870 + }, + { + "epoch": 3.0, + "grad_norm": 0.31159690022468567, + "learning_rate": 1.2707079199536507e-09, + "loss": 0.8513, + "step": 24875 + }, + { + "epoch": 3.0, + "grad_norm": 0.2697480618953705, + "learning_rate": 8.824364362225445e-10, + "loss": 0.6736, + "step": 24880 + }, + { + "epoch": 3.0, + "grad_norm": 0.25522977113723755, + "learning_rate": 5.647595185176435e-10, + "loss": 0.7454, + "step": 24885 + }, + { + "epoch": 3.0, + "grad_norm": 0.26916685700416565, + "learning_rate": 3.1767731638598916e-10, + "loss": 0.8128, + "step": 24890 + }, + { + "epoch": 3.0, + "grad_norm": 0.27489572763442993, + "learning_rate": 1.4118994608458556e-10, + "loss": 0.8325, + "step": 24895 + }, + { + "epoch": 3.0, + "step": 24897, + "total_flos": 3.2538134341012685e+19, + "train_loss": 0.8796718681596458, + "train_runtime": 1082714.0985, + "train_samples_per_second": 0.368, + "train_steps_per_second": 0.023 + } + ], + "logging_steps": 5, + "max_steps": 24897, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 3.2538134341012685e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}