{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9181523500810372, "eval_steps": 500, "global_step": 4532, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 10.73312117404433, "learning_rate": 4.4048582995951427e-07, "loss": 0.5001, "step": 10 }, { "epoch": 0.0, "grad_norm": 10.320516533751473, "learning_rate": 7.076923076923077e-07, "loss": 0.4459, "step": 20 }, { "epoch": 0.01, "grad_norm": 10.604032764201449, "learning_rate": 9.748987854251014e-07, "loss": 0.5337, "step": 30 }, { "epoch": 0.01, "grad_norm": 9.501814819458504, "learning_rate": 1.2421052631578948e-06, "loss": 0.4867, "step": 40 }, { "epoch": 0.01, "grad_norm": 7.2055395221098815, "learning_rate": 1.5093117408906883e-06, "loss": 0.4851, "step": 50 }, { "epoch": 0.01, "grad_norm": 5.16903562483095, "learning_rate": 1.776518218623482e-06, "loss": 0.4368, "step": 60 }, { "epoch": 0.01, "grad_norm": 5.746924199634763, "learning_rate": 2.0437246963562754e-06, "loss": 0.4289, "step": 70 }, { "epoch": 0.02, "grad_norm": 5.827693974405843, "learning_rate": 2.3109311740890693e-06, "loss": 0.3924, "step": 80 }, { "epoch": 0.02, "grad_norm": 7.020982629413032, "learning_rate": 2.5781376518218628e-06, "loss": 0.4108, "step": 90 }, { "epoch": 0.02, "grad_norm": 6.01679137180817, "learning_rate": 2.8453441295546562e-06, "loss": 0.4146, "step": 100 }, { "epoch": 0.02, "grad_norm": 5.448653259468705, "learning_rate": 3.11255060728745e-06, "loss": 0.3843, "step": 110 }, { "epoch": 0.02, "grad_norm": 4.431377216444548, "learning_rate": 3.379757085020243e-06, "loss": 0.3752, "step": 120 }, { "epoch": 0.03, "grad_norm": 4.706847685087678, "learning_rate": 3.646963562753037e-06, "loss": 0.373, "step": 130 }, { "epoch": 0.03, "grad_norm": 4.800217647386447, "learning_rate": 3.9141700404858305e-06, "loss": 0.3189, "step": 140 }, { "epoch": 0.03, "grad_norm": 4.55540918616468, "learning_rate": 4.1813765182186235e-06, "loss": 0.3337, "step": 150 }, { "epoch": 0.03, "grad_norm": 5.728866550792321, "learning_rate": 4.448582995951417e-06, "loss": 0.2853, "step": 160 }, { "epoch": 0.03, "grad_norm": 4.866232005178406, "learning_rate": 4.71578947368421e-06, "loss": 0.335, "step": 170 }, { "epoch": 0.04, "grad_norm": 5.153227503990654, "learning_rate": 4.982995951417004e-06, "loss": 0.3563, "step": 180 }, { "epoch": 0.04, "grad_norm": 4.833149588589071, "learning_rate": 5.250202429149799e-06, "loss": 0.3363, "step": 190 }, { "epoch": 0.04, "grad_norm": 5.057205257076979, "learning_rate": 5.517408906882591e-06, "loss": 0.3718, "step": 200 }, { "epoch": 0.04, "grad_norm": 5.623057559852214, "learning_rate": 5.784615384615385e-06, "loss": 0.3425, "step": 210 }, { "epoch": 0.04, "grad_norm": 5.285732195613963, "learning_rate": 6.05182186234818e-06, "loss": 0.3323, "step": 220 }, { "epoch": 0.05, "grad_norm": 5.287687993596724, "learning_rate": 6.319028340080971e-06, "loss": 0.3509, "step": 230 }, { "epoch": 0.05, "grad_norm": 5.422212177972575, "learning_rate": 6.586234817813766e-06, "loss": 0.3967, "step": 240 }, { "epoch": 0.05, "grad_norm": 4.898901033752465, "learning_rate": 6.853441295546559e-06, "loss": 0.304, "step": 250 }, { "epoch": 0.05, "grad_norm": 5.807565871352443, "learning_rate": 7.120647773279354e-06, "loss": 0.3138, "step": 260 }, { "epoch": 0.05, "grad_norm": 4.706658034348249, "learning_rate": 7.387854251012147e-06, "loss": 0.2711, "step": 270 }, { "epoch": 0.06, "grad_norm": 4.330996445669558, "learning_rate": 7.65506072874494e-06, "loss": 0.3013, "step": 280 }, { "epoch": 0.06, "grad_norm": 5.040245023856746, "learning_rate": 7.922267206477734e-06, "loss": 0.2845, "step": 290 }, { "epoch": 0.06, "grad_norm": 5.377102462850796, "learning_rate": 8.189473684210527e-06, "loss": 0.2274, "step": 300 }, { "epoch": 0.06, "grad_norm": 5.49394013615042, "learning_rate": 8.45668016194332e-06, "loss": 0.3002, "step": 310 }, { "epoch": 0.06, "grad_norm": 7.592214672466295, "learning_rate": 8.723886639676115e-06, "loss": 0.2683, "step": 320 }, { "epoch": 0.07, "grad_norm": 5.640000073588486, "learning_rate": 8.991093117408907e-06, "loss": 0.2753, "step": 330 }, { "epoch": 0.07, "grad_norm": 4.527692569068709, "learning_rate": 9.258299595141701e-06, "loss": 0.2799, "step": 340 }, { "epoch": 0.07, "grad_norm": 4.146002735657809, "learning_rate": 9.525506072874495e-06, "loss": 0.2735, "step": 350 }, { "epoch": 0.07, "grad_norm": 7.711163580304015, "learning_rate": 9.792712550607289e-06, "loss": 0.2735, "step": 360 }, { "epoch": 0.07, "grad_norm": 6.7114456570561005, "learning_rate": 1.0059919028340081e-05, "loss": 0.299, "step": 370 }, { "epoch": 0.08, "grad_norm": 9.166917647191482, "learning_rate": 1.0327125506072877e-05, "loss": 0.2633, "step": 380 }, { "epoch": 0.08, "grad_norm": 4.804784956773296, "learning_rate": 1.0594331983805667e-05, "loss": 0.3049, "step": 390 }, { "epoch": 0.08, "grad_norm": 5.407897698128889, "learning_rate": 1.0861538461538461e-05, "loss": 0.2665, "step": 400 }, { "epoch": 0.08, "grad_norm": 9.363037208047256, "learning_rate": 1.1128744939271257e-05, "loss": 0.2891, "step": 410 }, { "epoch": 0.09, "grad_norm": 7.40785296512954, "learning_rate": 1.1395951417004049e-05, "loss": 0.305, "step": 420 }, { "epoch": 0.09, "grad_norm": 4.832043881349699, "learning_rate": 1.1663157894736843e-05, "loss": 0.2275, "step": 430 }, { "epoch": 0.09, "grad_norm": 5.819742592169462, "learning_rate": 1.1930364372469638e-05, "loss": 0.2919, "step": 440 }, { "epoch": 0.09, "grad_norm": 4.641758430345508, "learning_rate": 1.2197570850202429e-05, "loss": 0.2943, "step": 450 }, { "epoch": 0.09, "grad_norm": 6.619813436316814, "learning_rate": 1.2464777327935223e-05, "loss": 0.2852, "step": 460 }, { "epoch": 0.1, "grad_norm": 4.1511414761990375, "learning_rate": 1.2731983805668018e-05, "loss": 0.2886, "step": 470 }, { "epoch": 0.1, "grad_norm": 13.864446251944102, "learning_rate": 1.299919028340081e-05, "loss": 0.2418, "step": 480 }, { "epoch": 0.1, "grad_norm": 4.2334846039024985, "learning_rate": 1.3266396761133604e-05, "loss": 0.3023, "step": 490 }, { "epoch": 0.1, "grad_norm": 8.049684733976, "learning_rate": 1.3533603238866397e-05, "loss": 0.2475, "step": 500 }, { "epoch": 0.1, "grad_norm": 10.458696782052067, "learning_rate": 1.3800809716599192e-05, "loss": 0.25, "step": 510 }, { "epoch": 0.11, "grad_norm": 4.63729749376232, "learning_rate": 1.4068016194331984e-05, "loss": 0.2871, "step": 520 }, { "epoch": 0.11, "grad_norm": 7.406867021700095, "learning_rate": 1.4335222672064776e-05, "loss": 0.2932, "step": 530 }, { "epoch": 0.11, "grad_norm": 5.89092417799605, "learning_rate": 1.4602429149797572e-05, "loss": 0.2841, "step": 540 }, { "epoch": 0.11, "grad_norm": 4.784763615906999, "learning_rate": 1.4869635627530366e-05, "loss": 0.2583, "step": 550 }, { "epoch": 0.11, "grad_norm": 5.425616026327031, "learning_rate": 1.5136842105263158e-05, "loss": 0.2633, "step": 560 }, { "epoch": 0.12, "grad_norm": 3.972773037284401, "learning_rate": 1.5404048582995954e-05, "loss": 0.2528, "step": 570 }, { "epoch": 0.12, "grad_norm": 4.221584018079482, "learning_rate": 1.5671255060728746e-05, "loss": 0.2633, "step": 580 }, { "epoch": 0.12, "grad_norm": 7.132176048070046, "learning_rate": 1.5938461538461538e-05, "loss": 0.2136, "step": 590 }, { "epoch": 0.12, "grad_norm": 5.736662281189121, "learning_rate": 1.6205668016194334e-05, "loss": 0.2639, "step": 600 }, { "epoch": 0.12, "grad_norm": 4.001622347847798, "learning_rate": 1.6472874493927126e-05, "loss": 0.315, "step": 610 }, { "epoch": 0.13, "grad_norm": 4.526524467163638, "learning_rate": 1.674008097165992e-05, "loss": 0.2424, "step": 620 }, { "epoch": 0.13, "grad_norm": 9.626596293663294, "learning_rate": 1.7007287449392717e-05, "loss": 0.2417, "step": 630 }, { "epoch": 0.13, "grad_norm": 4.2292844007418555, "learning_rate": 1.727449392712551e-05, "loss": 0.2534, "step": 640 }, { "epoch": 0.13, "grad_norm": 4.808396608849778, "learning_rate": 1.75417004048583e-05, "loss": 0.2501, "step": 650 }, { "epoch": 0.13, "grad_norm": 3.79257297513555, "learning_rate": 1.7808906882591094e-05, "loss": 0.2583, "step": 660 }, { "epoch": 0.14, "grad_norm": 2.8750772880448703, "learning_rate": 1.807611336032389e-05, "loss": 0.2296, "step": 670 }, { "epoch": 0.14, "grad_norm": 4.439196839762044, "learning_rate": 1.834331983805668e-05, "loss": 0.2439, "step": 680 }, { "epoch": 0.14, "grad_norm": 4.3898290815513965, "learning_rate": 1.8610526315789473e-05, "loss": 0.2899, "step": 690 }, { "epoch": 0.14, "grad_norm": 4.42001400664859, "learning_rate": 1.887773279352227e-05, "loss": 0.2616, "step": 700 }, { "epoch": 0.14, "grad_norm": 5.324629277322316, "learning_rate": 1.9144939271255065e-05, "loss": 0.2547, "step": 710 }, { "epoch": 0.15, "grad_norm": 5.181807370455961, "learning_rate": 1.9412145748987857e-05, "loss": 0.2746, "step": 720 }, { "epoch": 0.15, "grad_norm": 6.195963917912233, "learning_rate": 1.967935222672065e-05, "loss": 0.2269, "step": 730 }, { "epoch": 0.15, "grad_norm": 21.9010566073324, "learning_rate": 1.994655870445344e-05, "loss": 0.2214, "step": 740 }, { "epoch": 0.15, "grad_norm": 8.220773890093973, "learning_rate": 1.9999979802007072e-05, "loss": 0.3237, "step": 750 }, { "epoch": 0.15, "grad_norm": 7.431578258197571, "learning_rate": 1.9999909981889357e-05, "loss": 0.2585, "step": 760 }, { "epoch": 0.16, "grad_norm": 5.277053700798239, "learning_rate": 1.999979029063708e-05, "loss": 0.2865, "step": 770 }, { "epoch": 0.16, "grad_norm": 4.434866755224634, "learning_rate": 1.9999620728847215e-05, "loss": 0.2355, "step": 780 }, { "epoch": 0.16, "grad_norm": 5.028960471152081, "learning_rate": 1.9999401297365485e-05, "loss": 0.2896, "step": 790 }, { "epoch": 0.16, "grad_norm": 4.533763530364985, "learning_rate": 1.999913199728633e-05, "loss": 0.3033, "step": 800 }, { "epoch": 0.16, "grad_norm": 3.554515792556175, "learning_rate": 1.9998812829952933e-05, "loss": 0.2617, "step": 810 }, { "epoch": 0.17, "grad_norm": 4.026912751246585, "learning_rate": 1.999844379695719e-05, "loss": 0.2924, "step": 820 }, { "epoch": 0.17, "grad_norm": 8.551197472310484, "learning_rate": 1.999802490013971e-05, "loss": 0.2658, "step": 830 }, { "epoch": 0.17, "grad_norm": 4.390661707823087, "learning_rate": 1.9997556141589807e-05, "loss": 0.2386, "step": 840 }, { "epoch": 0.17, "grad_norm": 3.697035277012858, "learning_rate": 1.9997037523645485e-05, "loss": 0.2787, "step": 850 }, { "epoch": 0.17, "grad_norm": 4.624963502482154, "learning_rate": 1.9996469048893438e-05, "loss": 0.2885, "step": 860 }, { "epoch": 0.18, "grad_norm": 4.519073120214446, "learning_rate": 1.999585072016902e-05, "loss": 0.2184, "step": 870 }, { "epoch": 0.18, "grad_norm": 4.45710292962464, "learning_rate": 1.9995182540556242e-05, "loss": 0.2465, "step": 880 }, { "epoch": 0.18, "grad_norm": 4.907767674612729, "learning_rate": 1.9994464513387758e-05, "loss": 0.2579, "step": 890 }, { "epoch": 0.18, "grad_norm": 4.569838800905319, "learning_rate": 1.999369664224484e-05, "loss": 0.3058, "step": 900 }, { "epoch": 0.18, "grad_norm": 10.180778298594696, "learning_rate": 1.9992878930957364e-05, "loss": 0.2722, "step": 910 }, { "epoch": 0.19, "grad_norm": 3.9044365907967737, "learning_rate": 1.9992011383603794e-05, "loss": 0.2905, "step": 920 }, { "epoch": 0.19, "grad_norm": 4.204036344405577, "learning_rate": 1.999109400451116e-05, "loss": 0.2597, "step": 930 }, { "epoch": 0.19, "grad_norm": 4.483086909407923, "learning_rate": 1.9990126798255032e-05, "loss": 0.2527, "step": 940 }, { "epoch": 0.19, "grad_norm": 5.650984362708689, "learning_rate": 1.9989109769659506e-05, "loss": 0.2924, "step": 950 }, { "epoch": 0.19, "grad_norm": 4.289005355593254, "learning_rate": 1.9988042923797176e-05, "loss": 0.2248, "step": 960 }, { "epoch": 0.2, "grad_norm": 5.879170337714607, "learning_rate": 1.9986926265989092e-05, "loss": 0.2313, "step": 970 }, { "epoch": 0.2, "grad_norm": 2.7369398894003716, "learning_rate": 1.9985759801804768e-05, "loss": 0.2655, "step": 980 }, { "epoch": 0.2, "grad_norm": 3.9309313039975713, "learning_rate": 1.998454353706213e-05, "loss": 0.2413, "step": 990 }, { "epoch": 0.2, "grad_norm": 3.853949683327624, "learning_rate": 1.998327747782748e-05, "loss": 0.2626, "step": 1000 }, { "epoch": 0.2, "grad_norm": 4.233603769710417, "learning_rate": 1.9981961630415495e-05, "loss": 0.2813, "step": 1010 }, { "epoch": 0.21, "grad_norm": 3.4712931614995775, "learning_rate": 1.9980596001389173e-05, "loss": 0.2804, "step": 1020 }, { "epoch": 0.21, "grad_norm": 3.5161721036970564, "learning_rate": 1.9979180597559795e-05, "loss": 0.2498, "step": 1030 }, { "epoch": 0.21, "grad_norm": 3.914986811551112, "learning_rate": 1.997771542598691e-05, "loss": 0.2638, "step": 1040 }, { "epoch": 0.21, "grad_norm": 3.2174038340462405, "learning_rate": 1.9976200493978302e-05, "loss": 0.2188, "step": 1050 }, { "epoch": 0.21, "grad_norm": 5.034094723003907, "learning_rate": 1.9974635809089923e-05, "loss": 0.2574, "step": 1060 }, { "epoch": 0.22, "grad_norm": 3.9061787102869756, "learning_rate": 1.9973021379125887e-05, "loss": 0.2823, "step": 1070 }, { "epoch": 0.22, "grad_norm": 3.749214471559017, "learning_rate": 1.9971357212138418e-05, "loss": 0.2158, "step": 1080 }, { "epoch": 0.22, "grad_norm": 4.438077647261819, "learning_rate": 1.9969643316427806e-05, "loss": 0.2373, "step": 1090 }, { "epoch": 0.22, "grad_norm": 3.7946665428127644, "learning_rate": 1.9967879700542382e-05, "loss": 0.266, "step": 1100 }, { "epoch": 0.22, "grad_norm": 4.020417555953455, "learning_rate": 1.996606637327846e-05, "loss": 0.2692, "step": 1110 }, { "epoch": 0.23, "grad_norm": 5.291843241151047, "learning_rate": 1.9964203343680284e-05, "loss": 0.2477, "step": 1120 }, { "epoch": 0.23, "grad_norm": 3.3437519964764775, "learning_rate": 1.996229062104001e-05, "loss": 0.2507, "step": 1130 }, { "epoch": 0.23, "grad_norm": 3.3879034836678033, "learning_rate": 1.996032821489765e-05, "loss": 0.2476, "step": 1140 }, { "epoch": 0.23, "grad_norm": 3.7050532861499375, "learning_rate": 1.9958316135041e-05, "loss": 0.2224, "step": 1150 }, { "epoch": 0.24, "grad_norm": 3.8739888502442037, "learning_rate": 1.995625439150564e-05, "loss": 0.2419, "step": 1160 }, { "epoch": 0.24, "grad_norm": 10.982270952246383, "learning_rate": 1.9954142994574825e-05, "loss": 0.2242, "step": 1170 }, { "epoch": 0.24, "grad_norm": 9.832505746272833, "learning_rate": 1.9951981954779488e-05, "loss": 0.2591, "step": 1180 }, { "epoch": 0.24, "grad_norm": 4.369158741790222, "learning_rate": 1.9949771282898153e-05, "loss": 0.2374, "step": 1190 }, { "epoch": 0.24, "grad_norm": 5.630104122791308, "learning_rate": 1.994751098995689e-05, "loss": 0.2442, "step": 1200 }, { "epoch": 0.25, "grad_norm": 4.220933137387155, "learning_rate": 1.9945201087229272e-05, "loss": 0.2431, "step": 1210 }, { "epoch": 0.25, "grad_norm": 5.346564014873171, "learning_rate": 1.9942841586236297e-05, "loss": 0.2787, "step": 1220 }, { "epoch": 0.25, "grad_norm": 3.884615013459559, "learning_rate": 1.9940432498746342e-05, "loss": 0.1948, "step": 1230 }, { "epoch": 0.25, "grad_norm": 4.199936121500256, "learning_rate": 1.993797383677512e-05, "loss": 0.2605, "step": 1240 }, { "epoch": 0.25, "grad_norm": 3.7958971813092854, "learning_rate": 1.9935465612585588e-05, "loss": 0.2077, "step": 1250 }, { "epoch": 0.26, "grad_norm": 4.513168141785257, "learning_rate": 1.993290783868791e-05, "loss": 0.2584, "step": 1260 }, { "epoch": 0.26, "grad_norm": 4.7824378439198805, "learning_rate": 1.993030052783938e-05, "loss": 0.3022, "step": 1270 }, { "epoch": 0.26, "grad_norm": 4.207953986768479, "learning_rate": 1.992764369304438e-05, "loss": 0.2308, "step": 1280 }, { "epoch": 0.26, "grad_norm": 17.62536360994611, "learning_rate": 1.9924937347554282e-05, "loss": 0.2761, "step": 1290 }, { "epoch": 0.26, "grad_norm": 4.270009899619283, "learning_rate": 1.9922181504867414e-05, "loss": 0.2427, "step": 1300 }, { "epoch": 0.27, "grad_norm": 4.922976222292152, "learning_rate": 1.9919376178728975e-05, "loss": 0.2306, "step": 1310 }, { "epoch": 0.27, "grad_norm": 4.215458242682613, "learning_rate": 1.9916521383130965e-05, "loss": 0.2339, "step": 1320 }, { "epoch": 0.27, "grad_norm": 1.559875114528341, "learning_rate": 1.9913617132312132e-05, "loss": 0.1979, "step": 1330 }, { "epoch": 0.27, "grad_norm": 3.911384564892276, "learning_rate": 1.9910663440757878e-05, "loss": 0.2474, "step": 1340 }, { "epoch": 0.27, "grad_norm": 3.5797528557741036, "learning_rate": 1.9907660323200207e-05, "loss": 0.223, "step": 1350 }, { "epoch": 0.28, "grad_norm": 7.031043833783013, "learning_rate": 1.9904607794617635e-05, "loss": 0.2141, "step": 1360 }, { "epoch": 0.28, "grad_norm": 3.721340014798071, "learning_rate": 1.9901505870235137e-05, "loss": 0.2039, "step": 1370 }, { "epoch": 0.28, "grad_norm": 3.889394419978848, "learning_rate": 1.989835456552404e-05, "loss": 0.3015, "step": 1380 }, { "epoch": 0.28, "grad_norm": 4.368266346017717, "learning_rate": 1.9895153896201977e-05, "loss": 0.2278, "step": 1390 }, { "epoch": 0.28, "grad_norm": 4.736620682008376, "learning_rate": 1.9891903878232782e-05, "loss": 0.246, "step": 1400 }, { "epoch": 0.29, "grad_norm": 3.8257038790244744, "learning_rate": 1.9888604527826435e-05, "loss": 0.2758, "step": 1410 }, { "epoch": 0.29, "grad_norm": 3.1454697624115093, "learning_rate": 1.9885255861438966e-05, "loss": 0.2253, "step": 1420 }, { "epoch": 0.29, "grad_norm": 4.241144266291823, "learning_rate": 1.988185789577237e-05, "loss": 0.2545, "step": 1430 }, { "epoch": 0.29, "grad_norm": 4.303491443553284, "learning_rate": 1.987841064777454e-05, "loss": 0.2547, "step": 1440 }, { "epoch": 0.29, "grad_norm": 7.233119649704639, "learning_rate": 1.9874914134639163e-05, "loss": 0.2697, "step": 1450 }, { "epoch": 0.3, "grad_norm": 3.1627622593633133, "learning_rate": 1.987136837380565e-05, "loss": 0.2072, "step": 1460 }, { "epoch": 0.3, "grad_norm": 4.803567895467617, "learning_rate": 1.986777338295904e-05, "loss": 0.2504, "step": 1470 }, { "epoch": 0.3, "grad_norm": 4.645148028590643, "learning_rate": 1.9864129180029915e-05, "loss": 0.2957, "step": 1480 }, { "epoch": 0.3, "grad_norm": 4.372735663683347, "learning_rate": 1.9860435783194306e-05, "loss": 0.2342, "step": 1490 }, { "epoch": 0.3, "grad_norm": 4.244572708317798, "learning_rate": 1.9856693210873616e-05, "loss": 0.1971, "step": 1500 }, { "epoch": 0.31, "grad_norm": 3.150358082839134, "learning_rate": 1.9852901481734505e-05, "loss": 0.2601, "step": 1510 }, { "epoch": 0.31, "grad_norm": 3.8063005165293173, "learning_rate": 1.9849060614688825e-05, "loss": 0.2419, "step": 1520 }, { "epoch": 0.31, "grad_norm": 7.0543295261475105, "learning_rate": 1.98451706288935e-05, "loss": 0.2637, "step": 1530 }, { "epoch": 0.31, "grad_norm": 3.795035839087491, "learning_rate": 1.9841231543750446e-05, "loss": 0.2632, "step": 1540 }, { "epoch": 0.31, "grad_norm": 4.609631133641002, "learning_rate": 1.983724337890647e-05, "loss": 0.234, "step": 1550 }, { "epoch": 0.32, "grad_norm": 4.678952778756691, "learning_rate": 1.9833206154253165e-05, "loss": 0.2096, "step": 1560 }, { "epoch": 0.32, "grad_norm": 3.695528116851361, "learning_rate": 1.9829119889926836e-05, "loss": 0.2585, "step": 1570 }, { "epoch": 0.32, "grad_norm": 3.787900010945368, "learning_rate": 1.9824984606308356e-05, "loss": 0.2201, "step": 1580 }, { "epoch": 0.32, "grad_norm": 4.851549133184082, "learning_rate": 1.982080032402311e-05, "loss": 0.2625, "step": 1590 }, { "epoch": 0.32, "grad_norm": 4.285022938688077, "learning_rate": 1.9816567063940856e-05, "loss": 0.1898, "step": 1600 }, { "epoch": 0.33, "grad_norm": 4.576105603313343, "learning_rate": 1.981228484717565e-05, "loss": 0.3178, "step": 1610 }, { "epoch": 0.33, "grad_norm": 2.614430082371516, "learning_rate": 1.980795369508572e-05, "loss": 0.2022, "step": 1620 }, { "epoch": 0.33, "grad_norm": 4.348898710957631, "learning_rate": 1.9803573629273364e-05, "loss": 0.2426, "step": 1630 }, { "epoch": 0.33, "grad_norm": 3.7753032234127812, "learning_rate": 1.9799144671584853e-05, "loss": 0.2547, "step": 1640 }, { "epoch": 0.33, "grad_norm": 5.096970178030036, "learning_rate": 1.9794666844110303e-05, "loss": 0.209, "step": 1650 }, { "epoch": 0.34, "grad_norm": 3.8017117810837084, "learning_rate": 1.979014016918359e-05, "loss": 0.2149, "step": 1660 }, { "epoch": 0.34, "grad_norm": 4.186597128936523, "learning_rate": 1.978556466938221e-05, "loss": 0.2464, "step": 1670 }, { "epoch": 0.34, "grad_norm": 3.696513394158522, "learning_rate": 1.978094036752719e-05, "loss": 0.2568, "step": 1680 }, { "epoch": 0.34, "grad_norm": 4.675325784194562, "learning_rate": 1.9776267286682965e-05, "loss": 0.2323, "step": 1690 }, { "epoch": 0.34, "grad_norm": 5.427689218899514, "learning_rate": 1.9771545450157254e-05, "loss": 0.2631, "step": 1700 }, { "epoch": 0.35, "grad_norm": 3.1730771347081395, "learning_rate": 1.9766774881500958e-05, "loss": 0.216, "step": 1710 }, { "epoch": 0.35, "grad_norm": 3.581631337579299, "learning_rate": 1.9761955604508043e-05, "loss": 0.2327, "step": 1720 }, { "epoch": 0.35, "grad_norm": 4.216013154865277, "learning_rate": 1.975708764321541e-05, "loss": 0.2737, "step": 1730 }, { "epoch": 0.35, "grad_norm": 3.616181664241495, "learning_rate": 1.975217102190278e-05, "loss": 0.2531, "step": 1740 }, { "epoch": 0.35, "grad_norm": 4.944731426602014, "learning_rate": 1.974720576509257e-05, "loss": 0.2329, "step": 1750 }, { "epoch": 0.36, "grad_norm": 3.4547474359483075, "learning_rate": 1.9742191897549783e-05, "loss": 0.2082, "step": 1760 }, { "epoch": 0.36, "grad_norm": 16.606773079360572, "learning_rate": 1.973712944428187e-05, "loss": 0.2476, "step": 1770 }, { "epoch": 0.36, "grad_norm": 3.224499798212032, "learning_rate": 1.9732018430538613e-05, "loss": 0.2574, "step": 1780 }, { "epoch": 0.36, "grad_norm": 3.5823954456349805, "learning_rate": 1.9726858881811992e-05, "loss": 0.2242, "step": 1790 }, { "epoch": 0.36, "grad_norm": 4.249584626487707, "learning_rate": 1.9721650823836074e-05, "loss": 0.1894, "step": 1800 }, { "epoch": 0.37, "grad_norm": 2.805651655743975, "learning_rate": 1.971639428258686e-05, "loss": 0.2224, "step": 1810 }, { "epoch": 0.37, "grad_norm": 6.825631461083238, "learning_rate": 1.971108928428218e-05, "loss": 0.206, "step": 1820 }, { "epoch": 0.37, "grad_norm": 3.668175314712389, "learning_rate": 1.9705735855381544e-05, "loss": 0.228, "step": 1830 }, { "epoch": 0.37, "grad_norm": 2.8084463789135126, "learning_rate": 1.9700334022586016e-05, "loss": 0.2313, "step": 1840 }, { "epoch": 0.37, "grad_norm": 3.566250082150367, "learning_rate": 1.9694883812838095e-05, "loss": 0.2291, "step": 1850 }, { "epoch": 0.38, "grad_norm": 3.9864026464050437, "learning_rate": 1.9689385253321548e-05, "loss": 0.2312, "step": 1860 }, { "epoch": 0.38, "grad_norm": 4.0540053541677885, "learning_rate": 1.9683838371461315e-05, "loss": 0.2687, "step": 1870 }, { "epoch": 0.38, "grad_norm": 3.6121079968078234, "learning_rate": 1.9678243194923333e-05, "loss": 0.2403, "step": 1880 }, { "epoch": 0.38, "grad_norm": 4.062791315050937, "learning_rate": 1.9672599751614427e-05, "loss": 0.2225, "step": 1890 }, { "epoch": 0.38, "grad_norm": 11.916356720706089, "learning_rate": 1.966690806968216e-05, "loss": 0.2517, "step": 1900 }, { "epoch": 0.39, "grad_norm": 9.070397034086051, "learning_rate": 1.9661168177514683e-05, "loss": 0.2386, "step": 1910 }, { "epoch": 0.39, "grad_norm": 4.223253798926425, "learning_rate": 1.9655380103740618e-05, "loss": 0.2409, "step": 1920 }, { "epoch": 0.39, "grad_norm": 3.338130757023872, "learning_rate": 1.9649543877228886e-05, "loss": 0.2368, "step": 1930 }, { "epoch": 0.39, "grad_norm": 3.545188334725578, "learning_rate": 1.9643659527088587e-05, "loss": 0.1738, "step": 1940 }, { "epoch": 0.4, "grad_norm": 3.408623244835359, "learning_rate": 1.963772708266884e-05, "loss": 0.198, "step": 1950 }, { "epoch": 0.4, "grad_norm": 3.900722651898139, "learning_rate": 1.9631746573558646e-05, "loss": 0.1959, "step": 1960 }, { "epoch": 0.4, "grad_norm": 3.692501620096666, "learning_rate": 1.9625718029586732e-05, "loss": 0.2356, "step": 1970 }, { "epoch": 0.4, "grad_norm": 3.655015133723331, "learning_rate": 1.9619641480821407e-05, "loss": 0.242, "step": 1980 }, { "epoch": 0.4, "grad_norm": 13.072999667731198, "learning_rate": 1.9613516957570416e-05, "loss": 0.2206, "step": 1990 }, { "epoch": 0.41, "grad_norm": 3.6663532701131696, "learning_rate": 1.9607344490380778e-05, "loss": 0.2006, "step": 2000 }, { "epoch": 0.41, "grad_norm": 1.9175793161114592, "learning_rate": 1.9601124110038647e-05, "loss": 0.2138, "step": 2010 }, { "epoch": 0.41, "grad_norm": 2.244205112045336, "learning_rate": 1.9594855847569144e-05, "loss": 0.2025, "step": 2020 }, { "epoch": 0.41, "grad_norm": 1.896116682683314, "learning_rate": 1.9588539734236213e-05, "loss": 0.1844, "step": 2030 }, { "epoch": 0.41, "grad_norm": 2.0057399160737157, "learning_rate": 1.958217580154246e-05, "loss": 0.1576, "step": 2040 }, { "epoch": 0.42, "grad_norm": 2.1453450088649535, "learning_rate": 1.9575764081229004e-05, "loss": 0.2168, "step": 2050 }, { "epoch": 0.42, "grad_norm": 2.030575801761766, "learning_rate": 1.9569304605275304e-05, "loss": 0.1728, "step": 2060 }, { "epoch": 0.42, "grad_norm": 2.5496294537760775, "learning_rate": 1.9562797405899012e-05, "loss": 0.2134, "step": 2070 }, { "epoch": 0.42, "grad_norm": 2.242162724268332, "learning_rate": 1.955624251555581e-05, "loss": 0.201, "step": 2080 }, { "epoch": 0.42, "grad_norm": 1.9933952709556402, "learning_rate": 1.954963996693924e-05, "loss": 0.2179, "step": 2090 }, { "epoch": 0.43, "grad_norm": 2.240735158513416, "learning_rate": 1.954298979298055e-05, "loss": 0.2494, "step": 2100 }, { "epoch": 0.43, "grad_norm": 2.397210518680832, "learning_rate": 1.953629202684853e-05, "loss": 0.2077, "step": 2110 }, { "epoch": 0.43, "grad_norm": 4.51567146250967, "learning_rate": 1.9529546701949338e-05, "loss": 0.1859, "step": 2120 }, { "epoch": 0.43, "grad_norm": 1.6048501115252622, "learning_rate": 1.952275385192635e-05, "loss": 0.1858, "step": 2130 }, { "epoch": 0.43, "grad_norm": 3.266632654700684, "learning_rate": 1.951591351065996e-05, "loss": 0.2065, "step": 2140 }, { "epoch": 0.44, "grad_norm": 2.9963705041975857, "learning_rate": 1.950902571226745e-05, "loss": 0.2395, "step": 2150 }, { "epoch": 0.44, "grad_norm": 2.47269279368607, "learning_rate": 1.9502090491102805e-05, "loss": 0.2234, "step": 2160 }, { "epoch": 0.44, "grad_norm": 2.4172687657411283, "learning_rate": 1.949510788175652e-05, "loss": 0.2054, "step": 2170 }, { "epoch": 0.44, "grad_norm": 2.24131454859904, "learning_rate": 1.948807791905546e-05, "loss": 0.2036, "step": 2180 }, { "epoch": 0.44, "grad_norm": 2.7824059130997916, "learning_rate": 1.9481000638062667e-05, "loss": 0.211, "step": 2190 }, { "epoch": 0.45, "grad_norm": 2.5084669738687966, "learning_rate": 1.9473876074077193e-05, "loss": 0.1987, "step": 2200 }, { "epoch": 0.45, "grad_norm": 2.374685869021834, "learning_rate": 1.946670426263392e-05, "loss": 0.18, "step": 2210 }, { "epoch": 0.45, "grad_norm": 2.1982980430417665, "learning_rate": 1.9459485239503385e-05, "loss": 0.1883, "step": 2220 }, { "epoch": 0.45, "grad_norm": 2.2072115394074947, "learning_rate": 1.9452219040691604e-05, "loss": 0.1962, "step": 2230 }, { "epoch": 0.45, "grad_norm": 2.616971043987373, "learning_rate": 1.9444905702439874e-05, "loss": 0.2126, "step": 2240 }, { "epoch": 0.46, "grad_norm": 2.1674222240111525, "learning_rate": 1.943754526122463e-05, "loss": 0.1622, "step": 2250 }, { "epoch": 0.46, "grad_norm": 2.201552458849196, "learning_rate": 1.9430137753757222e-05, "loss": 0.2293, "step": 2260 }, { "epoch": 0.46, "grad_norm": 2.5879310691120314, "learning_rate": 1.9422683216983766e-05, "loss": 0.2008, "step": 2270 }, { "epoch": 0.46, "grad_norm": 2.113474838782104, "learning_rate": 1.9415181688084922e-05, "loss": 0.1622, "step": 2280 }, { "epoch": 0.46, "grad_norm": 2.05705061518402, "learning_rate": 1.9407633204475756e-05, "loss": 0.1674, "step": 2290 }, { "epoch": 0.47, "grad_norm": 3.3556666998600018, "learning_rate": 1.940003780380551e-05, "loss": 0.1765, "step": 2300 }, { "epoch": 0.47, "grad_norm": 2.332083901173591, "learning_rate": 1.9392395523957438e-05, "loss": 0.1656, "step": 2310 }, { "epoch": 0.47, "grad_norm": 2.3736488112606757, "learning_rate": 1.9384706403048618e-05, "loss": 0.2206, "step": 2320 }, { "epoch": 0.47, "grad_norm": 2.6382812391557318, "learning_rate": 1.937697047942974e-05, "loss": 0.1645, "step": 2330 }, { "epoch": 0.47, "grad_norm": 2.6013892863578194, "learning_rate": 1.9369187791684943e-05, "loss": 0.1705, "step": 2340 }, { "epoch": 0.48, "grad_norm": 2.1145392602796855, "learning_rate": 1.9361358378631604e-05, "loss": 0.206, "step": 2350 }, { "epoch": 0.48, "grad_norm": 2.200460676164746, "learning_rate": 1.9353482279320154e-05, "loss": 0.2172, "step": 2360 }, { "epoch": 0.48, "grad_norm": 2.3806064086412455, "learning_rate": 1.9345559533033867e-05, "loss": 0.1837, "step": 2370 }, { "epoch": 0.48, "grad_norm": 2.53378489217648, "learning_rate": 1.9337590179288694e-05, "loss": 0.1962, "step": 2380 }, { "epoch": 0.48, "grad_norm": 13.445648419685131, "learning_rate": 1.9329574257833035e-05, "loss": 0.1332, "step": 2390 }, { "epoch": 0.49, "grad_norm": 1.7839934654329992, "learning_rate": 1.932151180864756e-05, "loss": 0.1713, "step": 2400 }, { "epoch": 0.49, "grad_norm": 1.9367486050463985, "learning_rate": 1.9313402871945e-05, "loss": 0.207, "step": 2410 }, { "epoch": 0.49, "grad_norm": 3.9679375965918084, "learning_rate": 1.930524748816995e-05, "loss": 0.1766, "step": 2420 }, { "epoch": 0.49, "grad_norm": 2.12675067368969, "learning_rate": 1.9297045697998667e-05, "loss": 0.2119, "step": 2430 }, { "epoch": 0.49, "grad_norm": 2.330245927266684, "learning_rate": 1.9288797542338875e-05, "loss": 0.192, "step": 2440 }, { "epoch": 0.5, "grad_norm": 2.2758592638707986, "learning_rate": 1.9280503062329537e-05, "loss": 0.2218, "step": 2450 }, { "epoch": 0.5, "grad_norm": 3.420743432917777, "learning_rate": 1.9272162299340675e-05, "loss": 0.1517, "step": 2460 }, { "epoch": 0.5, "grad_norm": 1.4543338792444171, "learning_rate": 1.9263775294973168e-05, "loss": 0.1854, "step": 2470 }, { "epoch": 0.5, "grad_norm": 2.1078454017192123, "learning_rate": 1.92553420910585e-05, "loss": 0.2071, "step": 2480 }, { "epoch": 0.5, "grad_norm": 2.24945739985224, "learning_rate": 1.9246862729658616e-05, "loss": 0.1434, "step": 2490 }, { "epoch": 0.51, "grad_norm": 2.313736662246618, "learning_rate": 1.9238337253065655e-05, "loss": 0.2095, "step": 2500 }, { "epoch": 0.51, "grad_norm": 2.7303675467709847, "learning_rate": 1.922976570380177e-05, "loss": 0.2015, "step": 2510 }, { "epoch": 0.51, "grad_norm": 3.1027147281182703, "learning_rate": 1.9221148124618915e-05, "loss": 0.1902, "step": 2520 }, { "epoch": 0.51, "grad_norm": 1.9571047100967691, "learning_rate": 1.921248455849862e-05, "loss": 0.163, "step": 2530 }, { "epoch": 0.51, "grad_norm": 2.258815028020344, "learning_rate": 1.9203775048651776e-05, "loss": 0.159, "step": 2540 }, { "epoch": 0.52, "grad_norm": 3.119730206342172, "learning_rate": 1.9195019638518437e-05, "loss": 0.1766, "step": 2550 }, { "epoch": 0.52, "grad_norm": 4.155231092799811, "learning_rate": 1.9186218371767587e-05, "loss": 0.1999, "step": 2560 }, { "epoch": 0.52, "grad_norm": 3.938178594333825, "learning_rate": 1.9177371292296926e-05, "loss": 0.1967, "step": 2570 }, { "epoch": 0.52, "grad_norm": 2.264714656649641, "learning_rate": 1.916847844423265e-05, "loss": 0.1873, "step": 2580 }, { "epoch": 0.52, "grad_norm": 2.3278837142950835, "learning_rate": 1.915953987192924e-05, "loss": 0.198, "step": 2590 }, { "epoch": 0.53, "grad_norm": 2.3796897562558557, "learning_rate": 1.9150555619969228e-05, "loss": 0.1591, "step": 2600 }, { "epoch": 0.53, "grad_norm": 1.2146800627013359, "learning_rate": 1.914152573316298e-05, "loss": 0.1772, "step": 2610 }, { "epoch": 0.53, "grad_norm": 2.284220185885104, "learning_rate": 1.9132450256548482e-05, "loss": 0.1924, "step": 2620 }, { "epoch": 0.53, "grad_norm": 2.3132986803051954, "learning_rate": 1.912332923539109e-05, "loss": 0.1575, "step": 2630 }, { "epoch": 0.53, "grad_norm": 3.3347587557674214, "learning_rate": 1.9114162715183338e-05, "loss": 0.2016, "step": 2640 }, { "epoch": 0.54, "grad_norm": 4.1416219410825565, "learning_rate": 1.9104950741644682e-05, "loss": 0.1841, "step": 2650 }, { "epoch": 0.54, "grad_norm": 1.9784717611438265, "learning_rate": 1.9095693360721288e-05, "loss": 0.1988, "step": 2660 }, { "epoch": 0.54, "grad_norm": 1.9851485419245707, "learning_rate": 1.90863906185858e-05, "loss": 0.2306, "step": 2670 }, { "epoch": 0.54, "grad_norm": 1.7507079599778372, "learning_rate": 1.90770425616371e-05, "loss": 0.1812, "step": 2680 }, { "epoch": 0.54, "grad_norm": 1.4418225191285714, "learning_rate": 1.90676492365001e-05, "loss": 0.1724, "step": 2690 }, { "epoch": 0.55, "grad_norm": 6.358944590106823, "learning_rate": 1.905821069002548e-05, "loss": 0.2033, "step": 2700 }, { "epoch": 0.55, "grad_norm": 2.569265877186431, "learning_rate": 1.9048726969289472e-05, "loss": 0.1962, "step": 2710 }, { "epoch": 0.55, "grad_norm": 2.143027189864614, "learning_rate": 1.9039198121593623e-05, "loss": 0.2134, "step": 2720 }, { "epoch": 0.55, "grad_norm": 2.3661795002812593, "learning_rate": 1.9029624194464562e-05, "loss": 0.1594, "step": 2730 }, { "epoch": 0.56, "grad_norm": 2.2244765834008855, "learning_rate": 1.9020005235653752e-05, "loss": 0.1797, "step": 2740 }, { "epoch": 0.56, "grad_norm": 2.48042520800243, "learning_rate": 1.9010341293137265e-05, "loss": 0.1992, "step": 2750 }, { "epoch": 0.56, "grad_norm": 2.4141715312149854, "learning_rate": 1.9000632415115526e-05, "loss": 0.194, "step": 2760 }, { "epoch": 0.56, "grad_norm": 2.1311219610420453, "learning_rate": 1.8990878650013095e-05, "loss": 0.2152, "step": 2770 }, { "epoch": 0.56, "grad_norm": 2.2386424254232806, "learning_rate": 1.8981080046478408e-05, "loss": 0.1678, "step": 2780 }, { "epoch": 0.57, "grad_norm": 2.1290366036640167, "learning_rate": 1.8971236653383534e-05, "loss": 0.1815, "step": 2790 }, { "epoch": 0.57, "grad_norm": 2.332990235126333, "learning_rate": 1.896134851982395e-05, "loss": 0.1601, "step": 2800 }, { "epoch": 0.57, "grad_norm": 1.9313701752737014, "learning_rate": 1.895141569511827e-05, "loss": 0.1913, "step": 2810 }, { "epoch": 0.57, "grad_norm": 2.0079851484471387, "learning_rate": 1.8941438228808023e-05, "loss": 0.2147, "step": 2820 }, { "epoch": 0.57, "grad_norm": 3.300791940163902, "learning_rate": 1.8931416170657383e-05, "loss": 0.198, "step": 2830 }, { "epoch": 0.58, "grad_norm": 2.005476843621061, "learning_rate": 1.892134957065295e-05, "loss": 0.1835, "step": 2840 }, { "epoch": 0.58, "grad_norm": 2.6085877287350487, "learning_rate": 1.8911238479003464e-05, "loss": 0.1898, "step": 2850 }, { "epoch": 0.58, "grad_norm": 2.5393547946239683, "learning_rate": 1.8901082946139585e-05, "loss": 0.2094, "step": 2860 }, { "epoch": 0.58, "grad_norm": 2.1428525152610582, "learning_rate": 1.8890883022713635e-05, "loss": 0.1754, "step": 2870 }, { "epoch": 0.58, "grad_norm": 1.9149274467688195, "learning_rate": 1.8880638759599327e-05, "loss": 0.2039, "step": 2880 }, { "epoch": 0.59, "grad_norm": 3.51632507135593, "learning_rate": 1.8870350207891536e-05, "loss": 0.1857, "step": 2890 }, { "epoch": 0.59, "grad_norm": 2.4186516754116196, "learning_rate": 1.8860017418906028e-05, "loss": 0.1374, "step": 2900 }, { "epoch": 0.59, "grad_norm": 2.0258647249782333, "learning_rate": 1.884964044417921e-05, "loss": 0.2084, "step": 2910 }, { "epoch": 0.59, "grad_norm": 2.0985094320845574, "learning_rate": 1.8839219335467886e-05, "loss": 0.1484, "step": 2920 }, { "epoch": 0.59, "grad_norm": 2.19199552449932, "learning_rate": 1.8828754144748958e-05, "loss": 0.2029, "step": 2930 }, { "epoch": 0.6, "grad_norm": 1.9095904364534577, "learning_rate": 1.8818244924219217e-05, "loss": 0.1997, "step": 2940 }, { "epoch": 0.6, "grad_norm": 2.052834839544693, "learning_rate": 1.8807691726295053e-05, "loss": 0.1536, "step": 2950 }, { "epoch": 0.6, "grad_norm": 2.179267323237696, "learning_rate": 1.8797094603612192e-05, "loss": 0.2086, "step": 2960 }, { "epoch": 0.6, "grad_norm": 2.5098689789030355, "learning_rate": 1.878645360902546e-05, "loss": 0.1994, "step": 2970 }, { "epoch": 0.6, "grad_norm": 4.400137699746838, "learning_rate": 1.8775768795608472e-05, "loss": 0.1606, "step": 2980 }, { "epoch": 0.61, "grad_norm": 2.590590977646675, "learning_rate": 1.8765040216653427e-05, "loss": 0.1897, "step": 2990 }, { "epoch": 0.61, "grad_norm": 2.1321993774458954, "learning_rate": 1.8754267925670796e-05, "loss": 0.1531, "step": 3000 }, { "epoch": 0.61, "grad_norm": 2.5568642398269334, "learning_rate": 1.8743451976389068e-05, "loss": 0.178, "step": 3010 }, { "epoch": 0.61, "grad_norm": 1.9214830076333376, "learning_rate": 1.8732592422754495e-05, "loss": 0.1897, "step": 3020 }, { "epoch": 0.61, "grad_norm": 1.969751254803088, "learning_rate": 1.8721689318930806e-05, "loss": 0.1502, "step": 3030 }, { "epoch": 0.62, "grad_norm": 5.017433292114636, "learning_rate": 1.871074271929894e-05, "loss": 0.1982, "step": 3040 }, { "epoch": 0.62, "grad_norm": 1.9369614375603101, "learning_rate": 1.8699752678456788e-05, "loss": 0.1719, "step": 3050 }, { "epoch": 0.62, "grad_norm": 2.006594277192507, "learning_rate": 1.86887192512189e-05, "loss": 0.1932, "step": 3060 }, { "epoch": 0.62, "grad_norm": 1.715514310451893, "learning_rate": 1.8677642492616236e-05, "loss": 0.1801, "step": 3070 }, { "epoch": 0.62, "grad_norm": 1.9267545952361633, "learning_rate": 1.8666522457895862e-05, "loss": 0.1893, "step": 3080 }, { "epoch": 0.63, "grad_norm": 2.1874472475503106, "learning_rate": 1.86553592025207e-05, "loss": 0.1767, "step": 3090 }, { "epoch": 0.63, "grad_norm": 1.8843605357258664, "learning_rate": 1.8644152782169247e-05, "loss": 0.1802, "step": 3100 }, { "epoch": 0.63, "grad_norm": 2.3298848304608684, "learning_rate": 1.8632903252735276e-05, "loss": 0.1667, "step": 3110 }, { "epoch": 0.63, "grad_norm": 2.0245193476149312, "learning_rate": 1.862161067032759e-05, "loss": 0.1834, "step": 3120 }, { "epoch": 0.63, "grad_norm": 2.239252595358892, "learning_rate": 1.861027509126971e-05, "loss": 0.2083, "step": 3130 }, { "epoch": 0.64, "grad_norm": 2.000691294407405, "learning_rate": 1.8598896572099624e-05, "loss": 0.1739, "step": 3140 }, { "epoch": 0.64, "grad_norm": 1.656761882061858, "learning_rate": 1.8587475169569483e-05, "loss": 0.1815, "step": 3150 }, { "epoch": 0.64, "grad_norm": 2.1245215382192346, "learning_rate": 1.8576010940645325e-05, "loss": 0.1606, "step": 3160 }, { "epoch": 0.64, "grad_norm": 1.7762853803876042, "learning_rate": 1.856450394250679e-05, "loss": 0.1969, "step": 3170 }, { "epoch": 0.64, "grad_norm": 2.8830827622510697, "learning_rate": 1.855295423254685e-05, "loss": 0.1555, "step": 3180 }, { "epoch": 0.65, "grad_norm": 2.1861197340441234, "learning_rate": 1.854136186837149e-05, "loss": 0.1889, "step": 3190 }, { "epoch": 0.65, "grad_norm": 2.3198392751530705, "learning_rate": 1.8529726907799444e-05, "loss": 0.1943, "step": 3200 }, { "epoch": 0.65, "grad_norm": 1.9214231378576725, "learning_rate": 1.8518049408861915e-05, "loss": 0.1831, "step": 3210 }, { "epoch": 0.65, "grad_norm": 2.398048593011276, "learning_rate": 1.850632942980226e-05, "loss": 0.2029, "step": 3220 }, { "epoch": 0.65, "grad_norm": 2.2143080398207573, "learning_rate": 1.8494567029075714e-05, "loss": 0.1718, "step": 3230 }, { "epoch": 0.66, "grad_norm": 2.126646974386725, "learning_rate": 1.84827622653491e-05, "loss": 0.1302, "step": 3240 }, { "epoch": 0.66, "grad_norm": 11.599444777256583, "learning_rate": 1.847091519750053e-05, "loss": 0.1809, "step": 3250 }, { "epoch": 0.66, "grad_norm": 2.3991674250656403, "learning_rate": 1.8459025884619125e-05, "loss": 0.1865, "step": 3260 }, { "epoch": 0.66, "grad_norm": 6.4503963141704155, "learning_rate": 1.844709438600469e-05, "loss": 0.2251, "step": 3270 }, { "epoch": 0.66, "grad_norm": 2.4044007961395626, "learning_rate": 1.8435120761167453e-05, "loss": 0.1708, "step": 3280 }, { "epoch": 0.67, "grad_norm": 1.5993439730175965, "learning_rate": 1.8423105069827753e-05, "loss": 0.1651, "step": 3290 }, { "epoch": 0.67, "grad_norm": 1.9740878207799195, "learning_rate": 1.8411047371915737e-05, "loss": 0.1955, "step": 3300 }, { "epoch": 0.67, "grad_norm": 2.244083998109266, "learning_rate": 1.839894772757106e-05, "loss": 0.2106, "step": 3310 }, { "epoch": 0.67, "grad_norm": 2.2654343097283434, "learning_rate": 1.8386806197142607e-05, "loss": 0.1331, "step": 3320 }, { "epoch": 0.67, "grad_norm": 1.9373155321474749, "learning_rate": 1.837462284118817e-05, "loss": 0.143, "step": 3330 }, { "epoch": 0.68, "grad_norm": 2.041586413720046, "learning_rate": 1.8362397720474144e-05, "loss": 0.1805, "step": 3340 }, { "epoch": 0.68, "grad_norm": 2.311869767372086, "learning_rate": 1.8350130895975247e-05, "loss": 0.1748, "step": 3350 }, { "epoch": 0.68, "grad_norm": 2.0358596142782224, "learning_rate": 1.8337822428874187e-05, "loss": 0.197, "step": 3360 }, { "epoch": 0.68, "grad_norm": 12.765829395166001, "learning_rate": 1.8325472380561382e-05, "loss": 0.2043, "step": 3370 }, { "epoch": 0.68, "grad_norm": 2.283832499199408, "learning_rate": 1.831308081263464e-05, "loss": 0.1696, "step": 3380 }, { "epoch": 0.69, "grad_norm": 2.099264143128957, "learning_rate": 1.8300647786898843e-05, "loss": 0.1772, "step": 3390 }, { "epoch": 0.69, "grad_norm": 3.0418348048172117, "learning_rate": 1.8288173365365675e-05, "loss": 0.1679, "step": 3400 }, { "epoch": 0.69, "grad_norm": 2.928400270597656, "learning_rate": 1.827565761025326e-05, "loss": 0.1839, "step": 3410 }, { "epoch": 0.69, "grad_norm": 1.9489192388525756, "learning_rate": 1.82631005839859e-05, "loss": 0.1702, "step": 3420 }, { "epoch": 0.69, "grad_norm": 1.866673546449154, "learning_rate": 1.825050234919374e-05, "loss": 0.1913, "step": 3430 }, { "epoch": 0.7, "grad_norm": 1.9782480885566223, "learning_rate": 1.8237862968712442e-05, "loss": 0.189, "step": 3440 }, { "epoch": 0.7, "grad_norm": 2.072252048803903, "learning_rate": 1.8225182505582918e-05, "loss": 0.1908, "step": 3450 }, { "epoch": 0.7, "grad_norm": 2.1333806325715523, "learning_rate": 1.821246102305096e-05, "loss": 0.204, "step": 3460 }, { "epoch": 0.7, "grad_norm": 2.0590278992877113, "learning_rate": 1.8199698584566967e-05, "loss": 0.1833, "step": 3470 }, { "epoch": 0.71, "grad_norm": 2.2552945044942474, "learning_rate": 1.8186895253785603e-05, "loss": 0.2076, "step": 3480 }, { "epoch": 0.71, "grad_norm": 1.9634969960872868, "learning_rate": 1.8174051094565484e-05, "loss": 0.2097, "step": 3490 }, { "epoch": 0.71, "grad_norm": 2.2410044456550184, "learning_rate": 1.816116617096889e-05, "loss": 0.2217, "step": 3500 }, { "epoch": 0.71, "grad_norm": 2.1336586639575237, "learning_rate": 1.8148240547261387e-05, "loss": 0.18, "step": 3510 }, { "epoch": 0.71, "grad_norm": 2.173336388150898, "learning_rate": 1.813527428791156e-05, "loss": 0.1756, "step": 3520 }, { "epoch": 0.72, "grad_norm": 2.671364540282752, "learning_rate": 1.812226745759066e-05, "loss": 0.1863, "step": 3530 }, { "epoch": 0.72, "grad_norm": 1.9057012299641733, "learning_rate": 1.8109220121172306e-05, "loss": 0.2206, "step": 3540 }, { "epoch": 0.72, "grad_norm": 0.1813973982571034, "learning_rate": 1.8096132343732135e-05, "loss": 0.1462, "step": 3550 }, { "epoch": 0.72, "grad_norm": 1.8892866709987723, "learning_rate": 1.808300419054749e-05, "loss": 0.1803, "step": 3560 }, { "epoch": 0.72, "grad_norm": 2.003086926459966, "learning_rate": 1.80698357270971e-05, "loss": 0.2032, "step": 3570 }, { "epoch": 0.73, "grad_norm": 3.2221389262807874, "learning_rate": 1.8056627019060738e-05, "loss": 0.1631, "step": 3580 }, { "epoch": 0.73, "grad_norm": 1.2538102728217932, "learning_rate": 1.8043378132318927e-05, "loss": 0.1692, "step": 3590 }, { "epoch": 0.73, "grad_norm": 4.563350232475948, "learning_rate": 1.8030089132952557e-05, "loss": 0.1727, "step": 3600 }, { "epoch": 0.73, "grad_norm": 2.000374282147408, "learning_rate": 1.8016760087242605e-05, "loss": 0.1733, "step": 3610 }, { "epoch": 0.73, "grad_norm": 1.9278853800801559, "learning_rate": 1.800339106166978e-05, "loss": 0.1852, "step": 3620 }, { "epoch": 0.74, "grad_norm": 2.2241283295196514, "learning_rate": 1.79899821229142e-05, "loss": 0.143, "step": 3630 }, { "epoch": 0.74, "grad_norm": 2.4864913275556133, "learning_rate": 1.7976533337855053e-05, "loss": 0.1884, "step": 3640 }, { "epoch": 0.74, "grad_norm": 3.860042375322093, "learning_rate": 1.7963044773570265e-05, "loss": 0.1641, "step": 3650 }, { "epoch": 0.74, "grad_norm": 2.0454176745318677, "learning_rate": 1.7949516497336176e-05, "loss": 0.1864, "step": 3660 }, { "epoch": 0.74, "grad_norm": 2.46822670327846, "learning_rate": 1.793594857662718e-05, "loss": 0.1924, "step": 3670 }, { "epoch": 0.75, "grad_norm": 2.4872265651073158, "learning_rate": 1.792234107911542e-05, "loss": 0.1546, "step": 3680 }, { "epoch": 0.75, "grad_norm": 2.0379945728128663, "learning_rate": 1.7908694072670426e-05, "loss": 0.1711, "step": 3690 }, { "epoch": 0.75, "grad_norm": 2.407085620398544, "learning_rate": 1.7895007625358783e-05, "loss": 0.1701, "step": 3700 }, { "epoch": 0.75, "grad_norm": 4.042625051438552, "learning_rate": 1.7881281805443805e-05, "loss": 0.195, "step": 3710 }, { "epoch": 0.75, "grad_norm": 1.9781223218758097, "learning_rate": 1.786751668138517e-05, "loss": 0.1621, "step": 3720 }, { "epoch": 0.76, "grad_norm": 4.657693319193961, "learning_rate": 1.7853712321838602e-05, "loss": 0.1968, "step": 3730 }, { "epoch": 0.76, "grad_norm": 17.520636897246522, "learning_rate": 1.7839868795655507e-05, "loss": 0.1475, "step": 3740 }, { "epoch": 0.76, "grad_norm": 2.1710924893238626, "learning_rate": 1.782598617188265e-05, "loss": 0.1686, "step": 3750 }, { "epoch": 0.76, "grad_norm": 2.205220842093223, "learning_rate": 1.78120645197618e-05, "loss": 0.1587, "step": 3760 }, { "epoch": 0.76, "grad_norm": 1.95114988022513, "learning_rate": 1.7798103908729377e-05, "loss": 0.1743, "step": 3770 }, { "epoch": 0.77, "grad_norm": 2.7456615210938744, "learning_rate": 1.778410440841613e-05, "loss": 0.1468, "step": 3780 }, { "epoch": 0.77, "grad_norm": 2.8973758532017895, "learning_rate": 1.7770066088646767e-05, "loss": 0.1883, "step": 3790 }, { "epoch": 0.77, "grad_norm": 2.5445827161522616, "learning_rate": 1.7755989019439607e-05, "loss": 0.1895, "step": 3800 }, { "epoch": 0.77, "grad_norm": 2.169743981267003, "learning_rate": 1.774187327100625e-05, "loss": 0.1716, "step": 3810 }, { "epoch": 0.77, "grad_norm": 2.1135486144554387, "learning_rate": 1.7727718913751207e-05, "loss": 0.1164, "step": 3820 }, { "epoch": 0.78, "grad_norm": 2.0715545958907535, "learning_rate": 1.7713526018271558e-05, "loss": 0.1717, "step": 3830 }, { "epoch": 0.78, "grad_norm": 8.108185256384777, "learning_rate": 1.76992946553566e-05, "loss": 0.1662, "step": 3840 }, { "epoch": 0.78, "grad_norm": 2.4906125249035713, "learning_rate": 1.7685024895987494e-05, "loss": 0.184, "step": 3850 }, { "epoch": 0.78, "grad_norm": 3.009141821458063, "learning_rate": 1.7670716811336902e-05, "loss": 0.2086, "step": 3860 }, { "epoch": 0.78, "grad_norm": 1.935774373100715, "learning_rate": 1.7656370472768648e-05, "loss": 0.1566, "step": 3870 }, { "epoch": 0.79, "grad_norm": 6.368484758089139, "learning_rate": 1.7641985951837347e-05, "loss": 0.2127, "step": 3880 }, { "epoch": 0.79, "grad_norm": 2.2821667018184533, "learning_rate": 1.7627563320288056e-05, "loss": 0.1822, "step": 3890 }, { "epoch": 0.79, "grad_norm": 1.7481761010849746, "learning_rate": 1.7613102650055925e-05, "loss": 0.2264, "step": 3900 }, { "epoch": 0.79, "grad_norm": 2.111098426089184, "learning_rate": 1.759860401326581e-05, "loss": 0.1838, "step": 3910 }, { "epoch": 0.79, "grad_norm": 2.3865926806030204, "learning_rate": 1.758406748223194e-05, "loss": 0.1779, "step": 3920 }, { "epoch": 0.8, "grad_norm": 2.1732390806125, "learning_rate": 1.7569493129457554e-05, "loss": 0.1713, "step": 3930 }, { "epoch": 0.8, "grad_norm": 1.8814065202760815, "learning_rate": 1.7554881027634516e-05, "loss": 0.178, "step": 3940 }, { "epoch": 0.8, "grad_norm": 2.16916361006078, "learning_rate": 1.754023124964299e-05, "loss": 0.1475, "step": 3950 }, { "epoch": 0.8, "grad_norm": 2.27943306248829, "learning_rate": 1.7525543868551045e-05, "loss": 0.1997, "step": 3960 }, { "epoch": 0.8, "grad_norm": 3.117493119710199, "learning_rate": 1.7510818957614292e-05, "loss": 0.1475, "step": 3970 }, { "epoch": 0.81, "grad_norm": 2.3033259103584567, "learning_rate": 1.7496056590275546e-05, "loss": 0.1853, "step": 3980 }, { "epoch": 0.81, "grad_norm": 4.045509608812605, "learning_rate": 1.7481256840164436e-05, "loss": 0.171, "step": 3990 }, { "epoch": 0.81, "grad_norm": 1.7320172203917021, "learning_rate": 1.7466419781097038e-05, "loss": 0.1619, "step": 4000 }, { "epoch": 0.81, "grad_norm": 2.1721310910520772, "learning_rate": 1.745154548707551e-05, "loss": 0.1614, "step": 4010 }, { "epoch": 0.81, "grad_norm": 3.4362498003979374, "learning_rate": 1.7436634032287735e-05, "loss": 0.1885, "step": 4020 }, { "epoch": 0.82, "grad_norm": 2.203441191364378, "learning_rate": 1.7421685491106933e-05, "loss": 0.1746, "step": 4030 }, { "epoch": 0.82, "grad_norm": 2.2235513235782136, "learning_rate": 1.740669993809131e-05, "loss": 0.1958, "step": 4040 }, { "epoch": 0.82, "grad_norm": 1.662744062478203, "learning_rate": 1.7391677447983663e-05, "loss": 0.168, "step": 4050 }, { "epoch": 0.82, "grad_norm": 1.9923877091876279, "learning_rate": 1.7376618095711018e-05, "loss": 0.1718, "step": 4060 }, { "epoch": 0.82, "grad_norm": 2.426593736593661, "learning_rate": 1.7361521956384264e-05, "loss": 0.1741, "step": 4070 }, { "epoch": 0.83, "grad_norm": 1.7464227217148067, "learning_rate": 1.7346389105297766e-05, "loss": 0.1726, "step": 4080 }, { "epoch": 0.83, "grad_norm": 2.3749413734026383, "learning_rate": 1.7331219617928997e-05, "loss": 0.1583, "step": 4090 }, { "epoch": 0.83, "grad_norm": 2.114701165986187, "learning_rate": 1.7316013569938154e-05, "loss": 0.2066, "step": 4100 }, { "epoch": 0.83, "grad_norm": 1.8180037497973824, "learning_rate": 1.73007710371678e-05, "loss": 0.188, "step": 4110 }, { "epoch": 0.83, "grad_norm": 2.2556879416055726, "learning_rate": 1.7285492095642455e-05, "loss": 0.1824, "step": 4120 }, { "epoch": 0.84, "grad_norm": 1.9441561651729724, "learning_rate": 1.7270176821568244e-05, "loss": 0.1828, "step": 4130 }, { "epoch": 0.84, "grad_norm": 3.863082155472389, "learning_rate": 1.72548252913325e-05, "loss": 0.1929, "step": 4140 }, { "epoch": 0.84, "grad_norm": 2.1777659050408067, "learning_rate": 1.72394375815034e-05, "loss": 0.1872, "step": 4150 }, { "epoch": 0.84, "grad_norm": 2.289054063384357, "learning_rate": 1.722401376882955e-05, "loss": 0.1619, "step": 4160 }, { "epoch": 0.84, "grad_norm": 7.1273553535612, "learning_rate": 1.7208553930239655e-05, "loss": 0.1752, "step": 4170 }, { "epoch": 0.85, "grad_norm": 2.4202257016442523, "learning_rate": 1.7193058142842076e-05, "loss": 0.1966, "step": 4180 }, { "epoch": 0.85, "grad_norm": 2.599358985816695, "learning_rate": 1.7177526483924492e-05, "loss": 0.1739, "step": 4190 }, { "epoch": 0.85, "grad_norm": 2.145207088406254, "learning_rate": 1.7161959030953498e-05, "loss": 0.1606, "step": 4200 }, { "epoch": 0.85, "grad_norm": 2.4989297483292643, "learning_rate": 1.71463558615742e-05, "loss": 0.1441, "step": 4210 }, { "epoch": 0.85, "grad_norm": 2.333564701416749, "learning_rate": 1.713071705360987e-05, "loss": 0.1697, "step": 4220 }, { "epoch": 0.86, "grad_norm": 1.6007875381874954, "learning_rate": 1.7115042685061507e-05, "loss": 0.1801, "step": 4230 }, { "epoch": 0.86, "grad_norm": 2.0001777331534516, "learning_rate": 1.7099332834107497e-05, "loss": 0.1236, "step": 4240 }, { "epoch": 0.86, "grad_norm": 2.6336492006976187, "learning_rate": 1.7083587579103187e-05, "loss": 0.166, "step": 4250 }, { "epoch": 0.86, "grad_norm": 2.181960694578793, "learning_rate": 1.7067806998580507e-05, "loss": 0.1997, "step": 4260 }, { "epoch": 0.87, "grad_norm": 3.2024986585127366, "learning_rate": 1.7051991171247582e-05, "loss": 0.1508, "step": 4270 }, { "epoch": 0.87, "grad_norm": 2.0206614417941258, "learning_rate": 1.7036140175988344e-05, "loss": 0.1471, "step": 4280 }, { "epoch": 0.87, "grad_norm": 2.047393965997251, "learning_rate": 1.702025409186211e-05, "loss": 0.1777, "step": 4290 }, { "epoch": 0.87, "grad_norm": 2.6683142062789713, "learning_rate": 1.7004332998103232e-05, "loss": 0.1769, "step": 4300 }, { "epoch": 0.87, "grad_norm": 1.6732439515126907, "learning_rate": 1.698837697412066e-05, "loss": 0.1268, "step": 4310 }, { "epoch": 0.88, "grad_norm": 3.001202183493077, "learning_rate": 1.697238609949757e-05, "loss": 0.1489, "step": 4320 }, { "epoch": 0.88, "grad_norm": 1.9957859824768167, "learning_rate": 1.6956360453990964e-05, "loss": 0.1536, "step": 4330 }, { "epoch": 0.88, "grad_norm": 3.2825840735727154, "learning_rate": 1.694030011753127e-05, "loss": 0.2101, "step": 4340 }, { "epoch": 0.88, "grad_norm": 2.0765138274517088, "learning_rate": 1.6924205170221933e-05, "loss": 0.1811, "step": 4350 }, { "epoch": 0.88, "grad_norm": 2.1133828976030595, "learning_rate": 1.6908075692339035e-05, "loss": 0.1728, "step": 4360 }, { "epoch": 0.89, "grad_norm": 1.1354696745558366, "learning_rate": 1.6891911764330887e-05, "loss": 0.1663, "step": 4370 }, { "epoch": 0.89, "grad_norm": 2.131963092613327, "learning_rate": 1.6875713466817608e-05, "loss": 0.1971, "step": 4380 }, { "epoch": 0.89, "grad_norm": 6.080414407059132, "learning_rate": 1.6859480880590755e-05, "loss": 0.1518, "step": 4390 }, { "epoch": 0.89, "grad_norm": 1.9313104885467733, "learning_rate": 1.684321408661291e-05, "loss": 0.1726, "step": 4400 }, { "epoch": 0.89, "grad_norm": 1.891911281896888, "learning_rate": 1.6826913166017257e-05, "loss": 0.2049, "step": 4410 }, { "epoch": 0.9, "grad_norm": 1.8833660404212225, "learning_rate": 1.68105782001072e-05, "loss": 0.1628, "step": 4420 }, { "epoch": 0.9, "grad_norm": 2.043852343315347, "learning_rate": 1.6794209270355946e-05, "loss": 0.1975, "step": 4430 }, { "epoch": 0.9, "grad_norm": 1.4598525807667742, "learning_rate": 1.677780645840611e-05, "loss": 0.159, "step": 4440 }, { "epoch": 0.9, "grad_norm": 1.6015094283079794, "learning_rate": 1.6761369846069292e-05, "loss": 0.157, "step": 4450 }, { "epoch": 0.9, "grad_norm": 2.0628558921093125, "learning_rate": 1.6744899515325674e-05, "loss": 0.1748, "step": 4460 }, { "epoch": 0.91, "grad_norm": 2.076881323364807, "learning_rate": 1.672839554832362e-05, "loss": 0.1966, "step": 4470 }, { "epoch": 0.91, "grad_norm": 2.0325935028828135, "learning_rate": 1.671185802737926e-05, "loss": 0.1885, "step": 4480 }, { "epoch": 0.91, "grad_norm": 2.280549242220261, "learning_rate": 1.6695287034976078e-05, "loss": 0.1624, "step": 4490 }, { "epoch": 0.91, "grad_norm": 2.08609705396923, "learning_rate": 1.6678682653764502e-05, "loss": 0.1631, "step": 4500 }, { "epoch": 0.91, "grad_norm": 2.2526237237040903, "learning_rate": 1.666204496656149e-05, "loss": 0.145, "step": 4510 }, { "epoch": 0.92, "grad_norm": 1.9007711000245981, "learning_rate": 1.6645374056350128e-05, "loss": 0.173, "step": 4520 }, { "epoch": 0.92, "grad_norm": 1.9372171995443488, "learning_rate": 1.6628670006279194e-05, "loss": 0.142, "step": 4530 } ], "logging_steps": 10, "max_steps": 14808, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 4532, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }