bgem3-sft-msmarco-squadv2-e1 / trainer_state.json
nntoan209's picture
Upload folder using huggingface_hub
989fa49 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9181523500810372,
"eval_steps": 500,
"global_step": 4532,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 10.73312117404433,
"learning_rate": 4.4048582995951427e-07,
"loss": 0.5001,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 10.320516533751473,
"learning_rate": 7.076923076923077e-07,
"loss": 0.4459,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 10.604032764201449,
"learning_rate": 9.748987854251014e-07,
"loss": 0.5337,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 9.501814819458504,
"learning_rate": 1.2421052631578948e-06,
"loss": 0.4867,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 7.2055395221098815,
"learning_rate": 1.5093117408906883e-06,
"loss": 0.4851,
"step": 50
},
{
"epoch": 0.01,
"grad_norm": 5.16903562483095,
"learning_rate": 1.776518218623482e-06,
"loss": 0.4368,
"step": 60
},
{
"epoch": 0.01,
"grad_norm": 5.746924199634763,
"learning_rate": 2.0437246963562754e-06,
"loss": 0.4289,
"step": 70
},
{
"epoch": 0.02,
"grad_norm": 5.827693974405843,
"learning_rate": 2.3109311740890693e-06,
"loss": 0.3924,
"step": 80
},
{
"epoch": 0.02,
"grad_norm": 7.020982629413032,
"learning_rate": 2.5781376518218628e-06,
"loss": 0.4108,
"step": 90
},
{
"epoch": 0.02,
"grad_norm": 6.01679137180817,
"learning_rate": 2.8453441295546562e-06,
"loss": 0.4146,
"step": 100
},
{
"epoch": 0.02,
"grad_norm": 5.448653259468705,
"learning_rate": 3.11255060728745e-06,
"loss": 0.3843,
"step": 110
},
{
"epoch": 0.02,
"grad_norm": 4.431377216444548,
"learning_rate": 3.379757085020243e-06,
"loss": 0.3752,
"step": 120
},
{
"epoch": 0.03,
"grad_norm": 4.706847685087678,
"learning_rate": 3.646963562753037e-06,
"loss": 0.373,
"step": 130
},
{
"epoch": 0.03,
"grad_norm": 4.800217647386447,
"learning_rate": 3.9141700404858305e-06,
"loss": 0.3189,
"step": 140
},
{
"epoch": 0.03,
"grad_norm": 4.55540918616468,
"learning_rate": 4.1813765182186235e-06,
"loss": 0.3337,
"step": 150
},
{
"epoch": 0.03,
"grad_norm": 5.728866550792321,
"learning_rate": 4.448582995951417e-06,
"loss": 0.2853,
"step": 160
},
{
"epoch": 0.03,
"grad_norm": 4.866232005178406,
"learning_rate": 4.71578947368421e-06,
"loss": 0.335,
"step": 170
},
{
"epoch": 0.04,
"grad_norm": 5.153227503990654,
"learning_rate": 4.982995951417004e-06,
"loss": 0.3563,
"step": 180
},
{
"epoch": 0.04,
"grad_norm": 4.833149588589071,
"learning_rate": 5.250202429149799e-06,
"loss": 0.3363,
"step": 190
},
{
"epoch": 0.04,
"grad_norm": 5.057205257076979,
"learning_rate": 5.517408906882591e-06,
"loss": 0.3718,
"step": 200
},
{
"epoch": 0.04,
"grad_norm": 5.623057559852214,
"learning_rate": 5.784615384615385e-06,
"loss": 0.3425,
"step": 210
},
{
"epoch": 0.04,
"grad_norm": 5.285732195613963,
"learning_rate": 6.05182186234818e-06,
"loss": 0.3323,
"step": 220
},
{
"epoch": 0.05,
"grad_norm": 5.287687993596724,
"learning_rate": 6.319028340080971e-06,
"loss": 0.3509,
"step": 230
},
{
"epoch": 0.05,
"grad_norm": 5.422212177972575,
"learning_rate": 6.586234817813766e-06,
"loss": 0.3967,
"step": 240
},
{
"epoch": 0.05,
"grad_norm": 4.898901033752465,
"learning_rate": 6.853441295546559e-06,
"loss": 0.304,
"step": 250
},
{
"epoch": 0.05,
"grad_norm": 5.807565871352443,
"learning_rate": 7.120647773279354e-06,
"loss": 0.3138,
"step": 260
},
{
"epoch": 0.05,
"grad_norm": 4.706658034348249,
"learning_rate": 7.387854251012147e-06,
"loss": 0.2711,
"step": 270
},
{
"epoch": 0.06,
"grad_norm": 4.330996445669558,
"learning_rate": 7.65506072874494e-06,
"loss": 0.3013,
"step": 280
},
{
"epoch": 0.06,
"grad_norm": 5.040245023856746,
"learning_rate": 7.922267206477734e-06,
"loss": 0.2845,
"step": 290
},
{
"epoch": 0.06,
"grad_norm": 5.377102462850796,
"learning_rate": 8.189473684210527e-06,
"loss": 0.2274,
"step": 300
},
{
"epoch": 0.06,
"grad_norm": 5.49394013615042,
"learning_rate": 8.45668016194332e-06,
"loss": 0.3002,
"step": 310
},
{
"epoch": 0.06,
"grad_norm": 7.592214672466295,
"learning_rate": 8.723886639676115e-06,
"loss": 0.2683,
"step": 320
},
{
"epoch": 0.07,
"grad_norm": 5.640000073588486,
"learning_rate": 8.991093117408907e-06,
"loss": 0.2753,
"step": 330
},
{
"epoch": 0.07,
"grad_norm": 4.527692569068709,
"learning_rate": 9.258299595141701e-06,
"loss": 0.2799,
"step": 340
},
{
"epoch": 0.07,
"grad_norm": 4.146002735657809,
"learning_rate": 9.525506072874495e-06,
"loss": 0.2735,
"step": 350
},
{
"epoch": 0.07,
"grad_norm": 7.711163580304015,
"learning_rate": 9.792712550607289e-06,
"loss": 0.2735,
"step": 360
},
{
"epoch": 0.07,
"grad_norm": 6.7114456570561005,
"learning_rate": 1.0059919028340081e-05,
"loss": 0.299,
"step": 370
},
{
"epoch": 0.08,
"grad_norm": 9.166917647191482,
"learning_rate": 1.0327125506072877e-05,
"loss": 0.2633,
"step": 380
},
{
"epoch": 0.08,
"grad_norm": 4.804784956773296,
"learning_rate": 1.0594331983805667e-05,
"loss": 0.3049,
"step": 390
},
{
"epoch": 0.08,
"grad_norm": 5.407897698128889,
"learning_rate": 1.0861538461538461e-05,
"loss": 0.2665,
"step": 400
},
{
"epoch": 0.08,
"grad_norm": 9.363037208047256,
"learning_rate": 1.1128744939271257e-05,
"loss": 0.2891,
"step": 410
},
{
"epoch": 0.09,
"grad_norm": 7.40785296512954,
"learning_rate": 1.1395951417004049e-05,
"loss": 0.305,
"step": 420
},
{
"epoch": 0.09,
"grad_norm": 4.832043881349699,
"learning_rate": 1.1663157894736843e-05,
"loss": 0.2275,
"step": 430
},
{
"epoch": 0.09,
"grad_norm": 5.819742592169462,
"learning_rate": 1.1930364372469638e-05,
"loss": 0.2919,
"step": 440
},
{
"epoch": 0.09,
"grad_norm": 4.641758430345508,
"learning_rate": 1.2197570850202429e-05,
"loss": 0.2943,
"step": 450
},
{
"epoch": 0.09,
"grad_norm": 6.619813436316814,
"learning_rate": 1.2464777327935223e-05,
"loss": 0.2852,
"step": 460
},
{
"epoch": 0.1,
"grad_norm": 4.1511414761990375,
"learning_rate": 1.2731983805668018e-05,
"loss": 0.2886,
"step": 470
},
{
"epoch": 0.1,
"grad_norm": 13.864446251944102,
"learning_rate": 1.299919028340081e-05,
"loss": 0.2418,
"step": 480
},
{
"epoch": 0.1,
"grad_norm": 4.2334846039024985,
"learning_rate": 1.3266396761133604e-05,
"loss": 0.3023,
"step": 490
},
{
"epoch": 0.1,
"grad_norm": 8.049684733976,
"learning_rate": 1.3533603238866397e-05,
"loss": 0.2475,
"step": 500
},
{
"epoch": 0.1,
"grad_norm": 10.458696782052067,
"learning_rate": 1.3800809716599192e-05,
"loss": 0.25,
"step": 510
},
{
"epoch": 0.11,
"grad_norm": 4.63729749376232,
"learning_rate": 1.4068016194331984e-05,
"loss": 0.2871,
"step": 520
},
{
"epoch": 0.11,
"grad_norm": 7.406867021700095,
"learning_rate": 1.4335222672064776e-05,
"loss": 0.2932,
"step": 530
},
{
"epoch": 0.11,
"grad_norm": 5.89092417799605,
"learning_rate": 1.4602429149797572e-05,
"loss": 0.2841,
"step": 540
},
{
"epoch": 0.11,
"grad_norm": 4.784763615906999,
"learning_rate": 1.4869635627530366e-05,
"loss": 0.2583,
"step": 550
},
{
"epoch": 0.11,
"grad_norm": 5.425616026327031,
"learning_rate": 1.5136842105263158e-05,
"loss": 0.2633,
"step": 560
},
{
"epoch": 0.12,
"grad_norm": 3.972773037284401,
"learning_rate": 1.5404048582995954e-05,
"loss": 0.2528,
"step": 570
},
{
"epoch": 0.12,
"grad_norm": 4.221584018079482,
"learning_rate": 1.5671255060728746e-05,
"loss": 0.2633,
"step": 580
},
{
"epoch": 0.12,
"grad_norm": 7.132176048070046,
"learning_rate": 1.5938461538461538e-05,
"loss": 0.2136,
"step": 590
},
{
"epoch": 0.12,
"grad_norm": 5.736662281189121,
"learning_rate": 1.6205668016194334e-05,
"loss": 0.2639,
"step": 600
},
{
"epoch": 0.12,
"grad_norm": 4.001622347847798,
"learning_rate": 1.6472874493927126e-05,
"loss": 0.315,
"step": 610
},
{
"epoch": 0.13,
"grad_norm": 4.526524467163638,
"learning_rate": 1.674008097165992e-05,
"loss": 0.2424,
"step": 620
},
{
"epoch": 0.13,
"grad_norm": 9.626596293663294,
"learning_rate": 1.7007287449392717e-05,
"loss": 0.2417,
"step": 630
},
{
"epoch": 0.13,
"grad_norm": 4.2292844007418555,
"learning_rate": 1.727449392712551e-05,
"loss": 0.2534,
"step": 640
},
{
"epoch": 0.13,
"grad_norm": 4.808396608849778,
"learning_rate": 1.75417004048583e-05,
"loss": 0.2501,
"step": 650
},
{
"epoch": 0.13,
"grad_norm": 3.79257297513555,
"learning_rate": 1.7808906882591094e-05,
"loss": 0.2583,
"step": 660
},
{
"epoch": 0.14,
"grad_norm": 2.8750772880448703,
"learning_rate": 1.807611336032389e-05,
"loss": 0.2296,
"step": 670
},
{
"epoch": 0.14,
"grad_norm": 4.439196839762044,
"learning_rate": 1.834331983805668e-05,
"loss": 0.2439,
"step": 680
},
{
"epoch": 0.14,
"grad_norm": 4.3898290815513965,
"learning_rate": 1.8610526315789473e-05,
"loss": 0.2899,
"step": 690
},
{
"epoch": 0.14,
"grad_norm": 4.42001400664859,
"learning_rate": 1.887773279352227e-05,
"loss": 0.2616,
"step": 700
},
{
"epoch": 0.14,
"grad_norm": 5.324629277322316,
"learning_rate": 1.9144939271255065e-05,
"loss": 0.2547,
"step": 710
},
{
"epoch": 0.15,
"grad_norm": 5.181807370455961,
"learning_rate": 1.9412145748987857e-05,
"loss": 0.2746,
"step": 720
},
{
"epoch": 0.15,
"grad_norm": 6.195963917912233,
"learning_rate": 1.967935222672065e-05,
"loss": 0.2269,
"step": 730
},
{
"epoch": 0.15,
"grad_norm": 21.9010566073324,
"learning_rate": 1.994655870445344e-05,
"loss": 0.2214,
"step": 740
},
{
"epoch": 0.15,
"grad_norm": 8.220773890093973,
"learning_rate": 1.9999979802007072e-05,
"loss": 0.3237,
"step": 750
},
{
"epoch": 0.15,
"grad_norm": 7.431578258197571,
"learning_rate": 1.9999909981889357e-05,
"loss": 0.2585,
"step": 760
},
{
"epoch": 0.16,
"grad_norm": 5.277053700798239,
"learning_rate": 1.999979029063708e-05,
"loss": 0.2865,
"step": 770
},
{
"epoch": 0.16,
"grad_norm": 4.434866755224634,
"learning_rate": 1.9999620728847215e-05,
"loss": 0.2355,
"step": 780
},
{
"epoch": 0.16,
"grad_norm": 5.028960471152081,
"learning_rate": 1.9999401297365485e-05,
"loss": 0.2896,
"step": 790
},
{
"epoch": 0.16,
"grad_norm": 4.533763530364985,
"learning_rate": 1.999913199728633e-05,
"loss": 0.3033,
"step": 800
},
{
"epoch": 0.16,
"grad_norm": 3.554515792556175,
"learning_rate": 1.9998812829952933e-05,
"loss": 0.2617,
"step": 810
},
{
"epoch": 0.17,
"grad_norm": 4.026912751246585,
"learning_rate": 1.999844379695719e-05,
"loss": 0.2924,
"step": 820
},
{
"epoch": 0.17,
"grad_norm": 8.551197472310484,
"learning_rate": 1.999802490013971e-05,
"loss": 0.2658,
"step": 830
},
{
"epoch": 0.17,
"grad_norm": 4.390661707823087,
"learning_rate": 1.9997556141589807e-05,
"loss": 0.2386,
"step": 840
},
{
"epoch": 0.17,
"grad_norm": 3.697035277012858,
"learning_rate": 1.9997037523645485e-05,
"loss": 0.2787,
"step": 850
},
{
"epoch": 0.17,
"grad_norm": 4.624963502482154,
"learning_rate": 1.9996469048893438e-05,
"loss": 0.2885,
"step": 860
},
{
"epoch": 0.18,
"grad_norm": 4.519073120214446,
"learning_rate": 1.999585072016902e-05,
"loss": 0.2184,
"step": 870
},
{
"epoch": 0.18,
"grad_norm": 4.45710292962464,
"learning_rate": 1.9995182540556242e-05,
"loss": 0.2465,
"step": 880
},
{
"epoch": 0.18,
"grad_norm": 4.907767674612729,
"learning_rate": 1.9994464513387758e-05,
"loss": 0.2579,
"step": 890
},
{
"epoch": 0.18,
"grad_norm": 4.569838800905319,
"learning_rate": 1.999369664224484e-05,
"loss": 0.3058,
"step": 900
},
{
"epoch": 0.18,
"grad_norm": 10.180778298594696,
"learning_rate": 1.9992878930957364e-05,
"loss": 0.2722,
"step": 910
},
{
"epoch": 0.19,
"grad_norm": 3.9044365907967737,
"learning_rate": 1.9992011383603794e-05,
"loss": 0.2905,
"step": 920
},
{
"epoch": 0.19,
"grad_norm": 4.204036344405577,
"learning_rate": 1.999109400451116e-05,
"loss": 0.2597,
"step": 930
},
{
"epoch": 0.19,
"grad_norm": 4.483086909407923,
"learning_rate": 1.9990126798255032e-05,
"loss": 0.2527,
"step": 940
},
{
"epoch": 0.19,
"grad_norm": 5.650984362708689,
"learning_rate": 1.9989109769659506e-05,
"loss": 0.2924,
"step": 950
},
{
"epoch": 0.19,
"grad_norm": 4.289005355593254,
"learning_rate": 1.9988042923797176e-05,
"loss": 0.2248,
"step": 960
},
{
"epoch": 0.2,
"grad_norm": 5.879170337714607,
"learning_rate": 1.9986926265989092e-05,
"loss": 0.2313,
"step": 970
},
{
"epoch": 0.2,
"grad_norm": 2.7369398894003716,
"learning_rate": 1.9985759801804768e-05,
"loss": 0.2655,
"step": 980
},
{
"epoch": 0.2,
"grad_norm": 3.9309313039975713,
"learning_rate": 1.998454353706213e-05,
"loss": 0.2413,
"step": 990
},
{
"epoch": 0.2,
"grad_norm": 3.853949683327624,
"learning_rate": 1.998327747782748e-05,
"loss": 0.2626,
"step": 1000
},
{
"epoch": 0.2,
"grad_norm": 4.233603769710417,
"learning_rate": 1.9981961630415495e-05,
"loss": 0.2813,
"step": 1010
},
{
"epoch": 0.21,
"grad_norm": 3.4712931614995775,
"learning_rate": 1.9980596001389173e-05,
"loss": 0.2804,
"step": 1020
},
{
"epoch": 0.21,
"grad_norm": 3.5161721036970564,
"learning_rate": 1.9979180597559795e-05,
"loss": 0.2498,
"step": 1030
},
{
"epoch": 0.21,
"grad_norm": 3.914986811551112,
"learning_rate": 1.997771542598691e-05,
"loss": 0.2638,
"step": 1040
},
{
"epoch": 0.21,
"grad_norm": 3.2174038340462405,
"learning_rate": 1.9976200493978302e-05,
"loss": 0.2188,
"step": 1050
},
{
"epoch": 0.21,
"grad_norm": 5.034094723003907,
"learning_rate": 1.9974635809089923e-05,
"loss": 0.2574,
"step": 1060
},
{
"epoch": 0.22,
"grad_norm": 3.9061787102869756,
"learning_rate": 1.9973021379125887e-05,
"loss": 0.2823,
"step": 1070
},
{
"epoch": 0.22,
"grad_norm": 3.749214471559017,
"learning_rate": 1.9971357212138418e-05,
"loss": 0.2158,
"step": 1080
},
{
"epoch": 0.22,
"grad_norm": 4.438077647261819,
"learning_rate": 1.9969643316427806e-05,
"loss": 0.2373,
"step": 1090
},
{
"epoch": 0.22,
"grad_norm": 3.7946665428127644,
"learning_rate": 1.9967879700542382e-05,
"loss": 0.266,
"step": 1100
},
{
"epoch": 0.22,
"grad_norm": 4.020417555953455,
"learning_rate": 1.996606637327846e-05,
"loss": 0.2692,
"step": 1110
},
{
"epoch": 0.23,
"grad_norm": 5.291843241151047,
"learning_rate": 1.9964203343680284e-05,
"loss": 0.2477,
"step": 1120
},
{
"epoch": 0.23,
"grad_norm": 3.3437519964764775,
"learning_rate": 1.996229062104001e-05,
"loss": 0.2507,
"step": 1130
},
{
"epoch": 0.23,
"grad_norm": 3.3879034836678033,
"learning_rate": 1.996032821489765e-05,
"loss": 0.2476,
"step": 1140
},
{
"epoch": 0.23,
"grad_norm": 3.7050532861499375,
"learning_rate": 1.9958316135041e-05,
"loss": 0.2224,
"step": 1150
},
{
"epoch": 0.24,
"grad_norm": 3.8739888502442037,
"learning_rate": 1.995625439150564e-05,
"loss": 0.2419,
"step": 1160
},
{
"epoch": 0.24,
"grad_norm": 10.982270952246383,
"learning_rate": 1.9954142994574825e-05,
"loss": 0.2242,
"step": 1170
},
{
"epoch": 0.24,
"grad_norm": 9.832505746272833,
"learning_rate": 1.9951981954779488e-05,
"loss": 0.2591,
"step": 1180
},
{
"epoch": 0.24,
"grad_norm": 4.369158741790222,
"learning_rate": 1.9949771282898153e-05,
"loss": 0.2374,
"step": 1190
},
{
"epoch": 0.24,
"grad_norm": 5.630104122791308,
"learning_rate": 1.994751098995689e-05,
"loss": 0.2442,
"step": 1200
},
{
"epoch": 0.25,
"grad_norm": 4.220933137387155,
"learning_rate": 1.9945201087229272e-05,
"loss": 0.2431,
"step": 1210
},
{
"epoch": 0.25,
"grad_norm": 5.346564014873171,
"learning_rate": 1.9942841586236297e-05,
"loss": 0.2787,
"step": 1220
},
{
"epoch": 0.25,
"grad_norm": 3.884615013459559,
"learning_rate": 1.9940432498746342e-05,
"loss": 0.1948,
"step": 1230
},
{
"epoch": 0.25,
"grad_norm": 4.199936121500256,
"learning_rate": 1.993797383677512e-05,
"loss": 0.2605,
"step": 1240
},
{
"epoch": 0.25,
"grad_norm": 3.7958971813092854,
"learning_rate": 1.9935465612585588e-05,
"loss": 0.2077,
"step": 1250
},
{
"epoch": 0.26,
"grad_norm": 4.513168141785257,
"learning_rate": 1.993290783868791e-05,
"loss": 0.2584,
"step": 1260
},
{
"epoch": 0.26,
"grad_norm": 4.7824378439198805,
"learning_rate": 1.993030052783938e-05,
"loss": 0.3022,
"step": 1270
},
{
"epoch": 0.26,
"grad_norm": 4.207953986768479,
"learning_rate": 1.992764369304438e-05,
"loss": 0.2308,
"step": 1280
},
{
"epoch": 0.26,
"grad_norm": 17.62536360994611,
"learning_rate": 1.9924937347554282e-05,
"loss": 0.2761,
"step": 1290
},
{
"epoch": 0.26,
"grad_norm": 4.270009899619283,
"learning_rate": 1.9922181504867414e-05,
"loss": 0.2427,
"step": 1300
},
{
"epoch": 0.27,
"grad_norm": 4.922976222292152,
"learning_rate": 1.9919376178728975e-05,
"loss": 0.2306,
"step": 1310
},
{
"epoch": 0.27,
"grad_norm": 4.215458242682613,
"learning_rate": 1.9916521383130965e-05,
"loss": 0.2339,
"step": 1320
},
{
"epoch": 0.27,
"grad_norm": 1.559875114528341,
"learning_rate": 1.9913617132312132e-05,
"loss": 0.1979,
"step": 1330
},
{
"epoch": 0.27,
"grad_norm": 3.911384564892276,
"learning_rate": 1.9910663440757878e-05,
"loss": 0.2474,
"step": 1340
},
{
"epoch": 0.27,
"grad_norm": 3.5797528557741036,
"learning_rate": 1.9907660323200207e-05,
"loss": 0.223,
"step": 1350
},
{
"epoch": 0.28,
"grad_norm": 7.031043833783013,
"learning_rate": 1.9904607794617635e-05,
"loss": 0.2141,
"step": 1360
},
{
"epoch": 0.28,
"grad_norm": 3.721340014798071,
"learning_rate": 1.9901505870235137e-05,
"loss": 0.2039,
"step": 1370
},
{
"epoch": 0.28,
"grad_norm": 3.889394419978848,
"learning_rate": 1.989835456552404e-05,
"loss": 0.3015,
"step": 1380
},
{
"epoch": 0.28,
"grad_norm": 4.368266346017717,
"learning_rate": 1.9895153896201977e-05,
"loss": 0.2278,
"step": 1390
},
{
"epoch": 0.28,
"grad_norm": 4.736620682008376,
"learning_rate": 1.9891903878232782e-05,
"loss": 0.246,
"step": 1400
},
{
"epoch": 0.29,
"grad_norm": 3.8257038790244744,
"learning_rate": 1.9888604527826435e-05,
"loss": 0.2758,
"step": 1410
},
{
"epoch": 0.29,
"grad_norm": 3.1454697624115093,
"learning_rate": 1.9885255861438966e-05,
"loss": 0.2253,
"step": 1420
},
{
"epoch": 0.29,
"grad_norm": 4.241144266291823,
"learning_rate": 1.988185789577237e-05,
"loss": 0.2545,
"step": 1430
},
{
"epoch": 0.29,
"grad_norm": 4.303491443553284,
"learning_rate": 1.987841064777454e-05,
"loss": 0.2547,
"step": 1440
},
{
"epoch": 0.29,
"grad_norm": 7.233119649704639,
"learning_rate": 1.9874914134639163e-05,
"loss": 0.2697,
"step": 1450
},
{
"epoch": 0.3,
"grad_norm": 3.1627622593633133,
"learning_rate": 1.987136837380565e-05,
"loss": 0.2072,
"step": 1460
},
{
"epoch": 0.3,
"grad_norm": 4.803567895467617,
"learning_rate": 1.986777338295904e-05,
"loss": 0.2504,
"step": 1470
},
{
"epoch": 0.3,
"grad_norm": 4.645148028590643,
"learning_rate": 1.9864129180029915e-05,
"loss": 0.2957,
"step": 1480
},
{
"epoch": 0.3,
"grad_norm": 4.372735663683347,
"learning_rate": 1.9860435783194306e-05,
"loss": 0.2342,
"step": 1490
},
{
"epoch": 0.3,
"grad_norm": 4.244572708317798,
"learning_rate": 1.9856693210873616e-05,
"loss": 0.1971,
"step": 1500
},
{
"epoch": 0.31,
"grad_norm": 3.150358082839134,
"learning_rate": 1.9852901481734505e-05,
"loss": 0.2601,
"step": 1510
},
{
"epoch": 0.31,
"grad_norm": 3.8063005165293173,
"learning_rate": 1.9849060614688825e-05,
"loss": 0.2419,
"step": 1520
},
{
"epoch": 0.31,
"grad_norm": 7.0543295261475105,
"learning_rate": 1.98451706288935e-05,
"loss": 0.2637,
"step": 1530
},
{
"epoch": 0.31,
"grad_norm": 3.795035839087491,
"learning_rate": 1.9841231543750446e-05,
"loss": 0.2632,
"step": 1540
},
{
"epoch": 0.31,
"grad_norm": 4.609631133641002,
"learning_rate": 1.983724337890647e-05,
"loss": 0.234,
"step": 1550
},
{
"epoch": 0.32,
"grad_norm": 4.678952778756691,
"learning_rate": 1.9833206154253165e-05,
"loss": 0.2096,
"step": 1560
},
{
"epoch": 0.32,
"grad_norm": 3.695528116851361,
"learning_rate": 1.9829119889926836e-05,
"loss": 0.2585,
"step": 1570
},
{
"epoch": 0.32,
"grad_norm": 3.787900010945368,
"learning_rate": 1.9824984606308356e-05,
"loss": 0.2201,
"step": 1580
},
{
"epoch": 0.32,
"grad_norm": 4.851549133184082,
"learning_rate": 1.982080032402311e-05,
"loss": 0.2625,
"step": 1590
},
{
"epoch": 0.32,
"grad_norm": 4.285022938688077,
"learning_rate": 1.9816567063940856e-05,
"loss": 0.1898,
"step": 1600
},
{
"epoch": 0.33,
"grad_norm": 4.576105603313343,
"learning_rate": 1.981228484717565e-05,
"loss": 0.3178,
"step": 1610
},
{
"epoch": 0.33,
"grad_norm": 2.614430082371516,
"learning_rate": 1.980795369508572e-05,
"loss": 0.2022,
"step": 1620
},
{
"epoch": 0.33,
"grad_norm": 4.348898710957631,
"learning_rate": 1.9803573629273364e-05,
"loss": 0.2426,
"step": 1630
},
{
"epoch": 0.33,
"grad_norm": 3.7753032234127812,
"learning_rate": 1.9799144671584853e-05,
"loss": 0.2547,
"step": 1640
},
{
"epoch": 0.33,
"grad_norm": 5.096970178030036,
"learning_rate": 1.9794666844110303e-05,
"loss": 0.209,
"step": 1650
},
{
"epoch": 0.34,
"grad_norm": 3.8017117810837084,
"learning_rate": 1.979014016918359e-05,
"loss": 0.2149,
"step": 1660
},
{
"epoch": 0.34,
"grad_norm": 4.186597128936523,
"learning_rate": 1.978556466938221e-05,
"loss": 0.2464,
"step": 1670
},
{
"epoch": 0.34,
"grad_norm": 3.696513394158522,
"learning_rate": 1.978094036752719e-05,
"loss": 0.2568,
"step": 1680
},
{
"epoch": 0.34,
"grad_norm": 4.675325784194562,
"learning_rate": 1.9776267286682965e-05,
"loss": 0.2323,
"step": 1690
},
{
"epoch": 0.34,
"grad_norm": 5.427689218899514,
"learning_rate": 1.9771545450157254e-05,
"loss": 0.2631,
"step": 1700
},
{
"epoch": 0.35,
"grad_norm": 3.1730771347081395,
"learning_rate": 1.9766774881500958e-05,
"loss": 0.216,
"step": 1710
},
{
"epoch": 0.35,
"grad_norm": 3.581631337579299,
"learning_rate": 1.9761955604508043e-05,
"loss": 0.2327,
"step": 1720
},
{
"epoch": 0.35,
"grad_norm": 4.216013154865277,
"learning_rate": 1.975708764321541e-05,
"loss": 0.2737,
"step": 1730
},
{
"epoch": 0.35,
"grad_norm": 3.616181664241495,
"learning_rate": 1.975217102190278e-05,
"loss": 0.2531,
"step": 1740
},
{
"epoch": 0.35,
"grad_norm": 4.944731426602014,
"learning_rate": 1.974720576509257e-05,
"loss": 0.2329,
"step": 1750
},
{
"epoch": 0.36,
"grad_norm": 3.4547474359483075,
"learning_rate": 1.9742191897549783e-05,
"loss": 0.2082,
"step": 1760
},
{
"epoch": 0.36,
"grad_norm": 16.606773079360572,
"learning_rate": 1.973712944428187e-05,
"loss": 0.2476,
"step": 1770
},
{
"epoch": 0.36,
"grad_norm": 3.224499798212032,
"learning_rate": 1.9732018430538613e-05,
"loss": 0.2574,
"step": 1780
},
{
"epoch": 0.36,
"grad_norm": 3.5823954456349805,
"learning_rate": 1.9726858881811992e-05,
"loss": 0.2242,
"step": 1790
},
{
"epoch": 0.36,
"grad_norm": 4.249584626487707,
"learning_rate": 1.9721650823836074e-05,
"loss": 0.1894,
"step": 1800
},
{
"epoch": 0.37,
"grad_norm": 2.805651655743975,
"learning_rate": 1.971639428258686e-05,
"loss": 0.2224,
"step": 1810
},
{
"epoch": 0.37,
"grad_norm": 6.825631461083238,
"learning_rate": 1.971108928428218e-05,
"loss": 0.206,
"step": 1820
},
{
"epoch": 0.37,
"grad_norm": 3.668175314712389,
"learning_rate": 1.9705735855381544e-05,
"loss": 0.228,
"step": 1830
},
{
"epoch": 0.37,
"grad_norm": 2.8084463789135126,
"learning_rate": 1.9700334022586016e-05,
"loss": 0.2313,
"step": 1840
},
{
"epoch": 0.37,
"grad_norm": 3.566250082150367,
"learning_rate": 1.9694883812838095e-05,
"loss": 0.2291,
"step": 1850
},
{
"epoch": 0.38,
"grad_norm": 3.9864026464050437,
"learning_rate": 1.9689385253321548e-05,
"loss": 0.2312,
"step": 1860
},
{
"epoch": 0.38,
"grad_norm": 4.0540053541677885,
"learning_rate": 1.9683838371461315e-05,
"loss": 0.2687,
"step": 1870
},
{
"epoch": 0.38,
"grad_norm": 3.6121079968078234,
"learning_rate": 1.9678243194923333e-05,
"loss": 0.2403,
"step": 1880
},
{
"epoch": 0.38,
"grad_norm": 4.062791315050937,
"learning_rate": 1.9672599751614427e-05,
"loss": 0.2225,
"step": 1890
},
{
"epoch": 0.38,
"grad_norm": 11.916356720706089,
"learning_rate": 1.966690806968216e-05,
"loss": 0.2517,
"step": 1900
},
{
"epoch": 0.39,
"grad_norm": 9.070397034086051,
"learning_rate": 1.9661168177514683e-05,
"loss": 0.2386,
"step": 1910
},
{
"epoch": 0.39,
"grad_norm": 4.223253798926425,
"learning_rate": 1.9655380103740618e-05,
"loss": 0.2409,
"step": 1920
},
{
"epoch": 0.39,
"grad_norm": 3.338130757023872,
"learning_rate": 1.9649543877228886e-05,
"loss": 0.2368,
"step": 1930
},
{
"epoch": 0.39,
"grad_norm": 3.545188334725578,
"learning_rate": 1.9643659527088587e-05,
"loss": 0.1738,
"step": 1940
},
{
"epoch": 0.4,
"grad_norm": 3.408623244835359,
"learning_rate": 1.963772708266884e-05,
"loss": 0.198,
"step": 1950
},
{
"epoch": 0.4,
"grad_norm": 3.900722651898139,
"learning_rate": 1.9631746573558646e-05,
"loss": 0.1959,
"step": 1960
},
{
"epoch": 0.4,
"grad_norm": 3.692501620096666,
"learning_rate": 1.9625718029586732e-05,
"loss": 0.2356,
"step": 1970
},
{
"epoch": 0.4,
"grad_norm": 3.655015133723331,
"learning_rate": 1.9619641480821407e-05,
"loss": 0.242,
"step": 1980
},
{
"epoch": 0.4,
"grad_norm": 13.072999667731198,
"learning_rate": 1.9613516957570416e-05,
"loss": 0.2206,
"step": 1990
},
{
"epoch": 0.41,
"grad_norm": 3.6663532701131696,
"learning_rate": 1.9607344490380778e-05,
"loss": 0.2006,
"step": 2000
},
{
"epoch": 0.41,
"grad_norm": 1.9175793161114592,
"learning_rate": 1.9601124110038647e-05,
"loss": 0.2138,
"step": 2010
},
{
"epoch": 0.41,
"grad_norm": 2.244205112045336,
"learning_rate": 1.9594855847569144e-05,
"loss": 0.2025,
"step": 2020
},
{
"epoch": 0.41,
"grad_norm": 1.896116682683314,
"learning_rate": 1.9588539734236213e-05,
"loss": 0.1844,
"step": 2030
},
{
"epoch": 0.41,
"grad_norm": 2.0057399160737157,
"learning_rate": 1.958217580154246e-05,
"loss": 0.1576,
"step": 2040
},
{
"epoch": 0.42,
"grad_norm": 2.1453450088649535,
"learning_rate": 1.9575764081229004e-05,
"loss": 0.2168,
"step": 2050
},
{
"epoch": 0.42,
"grad_norm": 2.030575801761766,
"learning_rate": 1.9569304605275304e-05,
"loss": 0.1728,
"step": 2060
},
{
"epoch": 0.42,
"grad_norm": 2.5496294537760775,
"learning_rate": 1.9562797405899012e-05,
"loss": 0.2134,
"step": 2070
},
{
"epoch": 0.42,
"grad_norm": 2.242162724268332,
"learning_rate": 1.955624251555581e-05,
"loss": 0.201,
"step": 2080
},
{
"epoch": 0.42,
"grad_norm": 1.9933952709556402,
"learning_rate": 1.954963996693924e-05,
"loss": 0.2179,
"step": 2090
},
{
"epoch": 0.43,
"grad_norm": 2.240735158513416,
"learning_rate": 1.954298979298055e-05,
"loss": 0.2494,
"step": 2100
},
{
"epoch": 0.43,
"grad_norm": 2.397210518680832,
"learning_rate": 1.953629202684853e-05,
"loss": 0.2077,
"step": 2110
},
{
"epoch": 0.43,
"grad_norm": 4.51567146250967,
"learning_rate": 1.9529546701949338e-05,
"loss": 0.1859,
"step": 2120
},
{
"epoch": 0.43,
"grad_norm": 1.6048501115252622,
"learning_rate": 1.952275385192635e-05,
"loss": 0.1858,
"step": 2130
},
{
"epoch": 0.43,
"grad_norm": 3.266632654700684,
"learning_rate": 1.951591351065996e-05,
"loss": 0.2065,
"step": 2140
},
{
"epoch": 0.44,
"grad_norm": 2.9963705041975857,
"learning_rate": 1.950902571226745e-05,
"loss": 0.2395,
"step": 2150
},
{
"epoch": 0.44,
"grad_norm": 2.47269279368607,
"learning_rate": 1.9502090491102805e-05,
"loss": 0.2234,
"step": 2160
},
{
"epoch": 0.44,
"grad_norm": 2.4172687657411283,
"learning_rate": 1.949510788175652e-05,
"loss": 0.2054,
"step": 2170
},
{
"epoch": 0.44,
"grad_norm": 2.24131454859904,
"learning_rate": 1.948807791905546e-05,
"loss": 0.2036,
"step": 2180
},
{
"epoch": 0.44,
"grad_norm": 2.7824059130997916,
"learning_rate": 1.9481000638062667e-05,
"loss": 0.211,
"step": 2190
},
{
"epoch": 0.45,
"grad_norm": 2.5084669738687966,
"learning_rate": 1.9473876074077193e-05,
"loss": 0.1987,
"step": 2200
},
{
"epoch": 0.45,
"grad_norm": 2.374685869021834,
"learning_rate": 1.946670426263392e-05,
"loss": 0.18,
"step": 2210
},
{
"epoch": 0.45,
"grad_norm": 2.1982980430417665,
"learning_rate": 1.9459485239503385e-05,
"loss": 0.1883,
"step": 2220
},
{
"epoch": 0.45,
"grad_norm": 2.2072115394074947,
"learning_rate": 1.9452219040691604e-05,
"loss": 0.1962,
"step": 2230
},
{
"epoch": 0.45,
"grad_norm": 2.616971043987373,
"learning_rate": 1.9444905702439874e-05,
"loss": 0.2126,
"step": 2240
},
{
"epoch": 0.46,
"grad_norm": 2.1674222240111525,
"learning_rate": 1.943754526122463e-05,
"loss": 0.1622,
"step": 2250
},
{
"epoch": 0.46,
"grad_norm": 2.201552458849196,
"learning_rate": 1.9430137753757222e-05,
"loss": 0.2293,
"step": 2260
},
{
"epoch": 0.46,
"grad_norm": 2.5879310691120314,
"learning_rate": 1.9422683216983766e-05,
"loss": 0.2008,
"step": 2270
},
{
"epoch": 0.46,
"grad_norm": 2.113474838782104,
"learning_rate": 1.9415181688084922e-05,
"loss": 0.1622,
"step": 2280
},
{
"epoch": 0.46,
"grad_norm": 2.05705061518402,
"learning_rate": 1.9407633204475756e-05,
"loss": 0.1674,
"step": 2290
},
{
"epoch": 0.47,
"grad_norm": 3.3556666998600018,
"learning_rate": 1.940003780380551e-05,
"loss": 0.1765,
"step": 2300
},
{
"epoch": 0.47,
"grad_norm": 2.332083901173591,
"learning_rate": 1.9392395523957438e-05,
"loss": 0.1656,
"step": 2310
},
{
"epoch": 0.47,
"grad_norm": 2.3736488112606757,
"learning_rate": 1.9384706403048618e-05,
"loss": 0.2206,
"step": 2320
},
{
"epoch": 0.47,
"grad_norm": 2.6382812391557318,
"learning_rate": 1.937697047942974e-05,
"loss": 0.1645,
"step": 2330
},
{
"epoch": 0.47,
"grad_norm": 2.6013892863578194,
"learning_rate": 1.9369187791684943e-05,
"loss": 0.1705,
"step": 2340
},
{
"epoch": 0.48,
"grad_norm": 2.1145392602796855,
"learning_rate": 1.9361358378631604e-05,
"loss": 0.206,
"step": 2350
},
{
"epoch": 0.48,
"grad_norm": 2.200460676164746,
"learning_rate": 1.9353482279320154e-05,
"loss": 0.2172,
"step": 2360
},
{
"epoch": 0.48,
"grad_norm": 2.3806064086412455,
"learning_rate": 1.9345559533033867e-05,
"loss": 0.1837,
"step": 2370
},
{
"epoch": 0.48,
"grad_norm": 2.53378489217648,
"learning_rate": 1.9337590179288694e-05,
"loss": 0.1962,
"step": 2380
},
{
"epoch": 0.48,
"grad_norm": 13.445648419685131,
"learning_rate": 1.9329574257833035e-05,
"loss": 0.1332,
"step": 2390
},
{
"epoch": 0.49,
"grad_norm": 1.7839934654329992,
"learning_rate": 1.932151180864756e-05,
"loss": 0.1713,
"step": 2400
},
{
"epoch": 0.49,
"grad_norm": 1.9367486050463985,
"learning_rate": 1.9313402871945e-05,
"loss": 0.207,
"step": 2410
},
{
"epoch": 0.49,
"grad_norm": 3.9679375965918084,
"learning_rate": 1.930524748816995e-05,
"loss": 0.1766,
"step": 2420
},
{
"epoch": 0.49,
"grad_norm": 2.12675067368969,
"learning_rate": 1.9297045697998667e-05,
"loss": 0.2119,
"step": 2430
},
{
"epoch": 0.49,
"grad_norm": 2.330245927266684,
"learning_rate": 1.9288797542338875e-05,
"loss": 0.192,
"step": 2440
},
{
"epoch": 0.5,
"grad_norm": 2.2758592638707986,
"learning_rate": 1.9280503062329537e-05,
"loss": 0.2218,
"step": 2450
},
{
"epoch": 0.5,
"grad_norm": 3.420743432917777,
"learning_rate": 1.9272162299340675e-05,
"loss": 0.1517,
"step": 2460
},
{
"epoch": 0.5,
"grad_norm": 1.4543338792444171,
"learning_rate": 1.9263775294973168e-05,
"loss": 0.1854,
"step": 2470
},
{
"epoch": 0.5,
"grad_norm": 2.1078454017192123,
"learning_rate": 1.92553420910585e-05,
"loss": 0.2071,
"step": 2480
},
{
"epoch": 0.5,
"grad_norm": 2.24945739985224,
"learning_rate": 1.9246862729658616e-05,
"loss": 0.1434,
"step": 2490
},
{
"epoch": 0.51,
"grad_norm": 2.313736662246618,
"learning_rate": 1.9238337253065655e-05,
"loss": 0.2095,
"step": 2500
},
{
"epoch": 0.51,
"grad_norm": 2.7303675467709847,
"learning_rate": 1.922976570380177e-05,
"loss": 0.2015,
"step": 2510
},
{
"epoch": 0.51,
"grad_norm": 3.1027147281182703,
"learning_rate": 1.9221148124618915e-05,
"loss": 0.1902,
"step": 2520
},
{
"epoch": 0.51,
"grad_norm": 1.9571047100967691,
"learning_rate": 1.921248455849862e-05,
"loss": 0.163,
"step": 2530
},
{
"epoch": 0.51,
"grad_norm": 2.258815028020344,
"learning_rate": 1.9203775048651776e-05,
"loss": 0.159,
"step": 2540
},
{
"epoch": 0.52,
"grad_norm": 3.119730206342172,
"learning_rate": 1.9195019638518437e-05,
"loss": 0.1766,
"step": 2550
},
{
"epoch": 0.52,
"grad_norm": 4.155231092799811,
"learning_rate": 1.9186218371767587e-05,
"loss": 0.1999,
"step": 2560
},
{
"epoch": 0.52,
"grad_norm": 3.938178594333825,
"learning_rate": 1.9177371292296926e-05,
"loss": 0.1967,
"step": 2570
},
{
"epoch": 0.52,
"grad_norm": 2.264714656649641,
"learning_rate": 1.916847844423265e-05,
"loss": 0.1873,
"step": 2580
},
{
"epoch": 0.52,
"grad_norm": 2.3278837142950835,
"learning_rate": 1.915953987192924e-05,
"loss": 0.198,
"step": 2590
},
{
"epoch": 0.53,
"grad_norm": 2.3796897562558557,
"learning_rate": 1.9150555619969228e-05,
"loss": 0.1591,
"step": 2600
},
{
"epoch": 0.53,
"grad_norm": 1.2146800627013359,
"learning_rate": 1.914152573316298e-05,
"loss": 0.1772,
"step": 2610
},
{
"epoch": 0.53,
"grad_norm": 2.284220185885104,
"learning_rate": 1.9132450256548482e-05,
"loss": 0.1924,
"step": 2620
},
{
"epoch": 0.53,
"grad_norm": 2.3132986803051954,
"learning_rate": 1.912332923539109e-05,
"loss": 0.1575,
"step": 2630
},
{
"epoch": 0.53,
"grad_norm": 3.3347587557674214,
"learning_rate": 1.9114162715183338e-05,
"loss": 0.2016,
"step": 2640
},
{
"epoch": 0.54,
"grad_norm": 4.1416219410825565,
"learning_rate": 1.9104950741644682e-05,
"loss": 0.1841,
"step": 2650
},
{
"epoch": 0.54,
"grad_norm": 1.9784717611438265,
"learning_rate": 1.9095693360721288e-05,
"loss": 0.1988,
"step": 2660
},
{
"epoch": 0.54,
"grad_norm": 1.9851485419245707,
"learning_rate": 1.90863906185858e-05,
"loss": 0.2306,
"step": 2670
},
{
"epoch": 0.54,
"grad_norm": 1.7507079599778372,
"learning_rate": 1.90770425616371e-05,
"loss": 0.1812,
"step": 2680
},
{
"epoch": 0.54,
"grad_norm": 1.4418225191285714,
"learning_rate": 1.90676492365001e-05,
"loss": 0.1724,
"step": 2690
},
{
"epoch": 0.55,
"grad_norm": 6.358944590106823,
"learning_rate": 1.905821069002548e-05,
"loss": 0.2033,
"step": 2700
},
{
"epoch": 0.55,
"grad_norm": 2.569265877186431,
"learning_rate": 1.9048726969289472e-05,
"loss": 0.1962,
"step": 2710
},
{
"epoch": 0.55,
"grad_norm": 2.143027189864614,
"learning_rate": 1.9039198121593623e-05,
"loss": 0.2134,
"step": 2720
},
{
"epoch": 0.55,
"grad_norm": 2.3661795002812593,
"learning_rate": 1.9029624194464562e-05,
"loss": 0.1594,
"step": 2730
},
{
"epoch": 0.56,
"grad_norm": 2.2244765834008855,
"learning_rate": 1.9020005235653752e-05,
"loss": 0.1797,
"step": 2740
},
{
"epoch": 0.56,
"grad_norm": 2.48042520800243,
"learning_rate": 1.9010341293137265e-05,
"loss": 0.1992,
"step": 2750
},
{
"epoch": 0.56,
"grad_norm": 2.4141715312149854,
"learning_rate": 1.9000632415115526e-05,
"loss": 0.194,
"step": 2760
},
{
"epoch": 0.56,
"grad_norm": 2.1311219610420453,
"learning_rate": 1.8990878650013095e-05,
"loss": 0.2152,
"step": 2770
},
{
"epoch": 0.56,
"grad_norm": 2.2386424254232806,
"learning_rate": 1.8981080046478408e-05,
"loss": 0.1678,
"step": 2780
},
{
"epoch": 0.57,
"grad_norm": 2.1290366036640167,
"learning_rate": 1.8971236653383534e-05,
"loss": 0.1815,
"step": 2790
},
{
"epoch": 0.57,
"grad_norm": 2.332990235126333,
"learning_rate": 1.896134851982395e-05,
"loss": 0.1601,
"step": 2800
},
{
"epoch": 0.57,
"grad_norm": 1.9313701752737014,
"learning_rate": 1.895141569511827e-05,
"loss": 0.1913,
"step": 2810
},
{
"epoch": 0.57,
"grad_norm": 2.0079851484471387,
"learning_rate": 1.8941438228808023e-05,
"loss": 0.2147,
"step": 2820
},
{
"epoch": 0.57,
"grad_norm": 3.300791940163902,
"learning_rate": 1.8931416170657383e-05,
"loss": 0.198,
"step": 2830
},
{
"epoch": 0.58,
"grad_norm": 2.005476843621061,
"learning_rate": 1.892134957065295e-05,
"loss": 0.1835,
"step": 2840
},
{
"epoch": 0.58,
"grad_norm": 2.6085877287350487,
"learning_rate": 1.8911238479003464e-05,
"loss": 0.1898,
"step": 2850
},
{
"epoch": 0.58,
"grad_norm": 2.5393547946239683,
"learning_rate": 1.8901082946139585e-05,
"loss": 0.2094,
"step": 2860
},
{
"epoch": 0.58,
"grad_norm": 2.1428525152610582,
"learning_rate": 1.8890883022713635e-05,
"loss": 0.1754,
"step": 2870
},
{
"epoch": 0.58,
"grad_norm": 1.9149274467688195,
"learning_rate": 1.8880638759599327e-05,
"loss": 0.2039,
"step": 2880
},
{
"epoch": 0.59,
"grad_norm": 3.51632507135593,
"learning_rate": 1.8870350207891536e-05,
"loss": 0.1857,
"step": 2890
},
{
"epoch": 0.59,
"grad_norm": 2.4186516754116196,
"learning_rate": 1.8860017418906028e-05,
"loss": 0.1374,
"step": 2900
},
{
"epoch": 0.59,
"grad_norm": 2.0258647249782333,
"learning_rate": 1.884964044417921e-05,
"loss": 0.2084,
"step": 2910
},
{
"epoch": 0.59,
"grad_norm": 2.0985094320845574,
"learning_rate": 1.8839219335467886e-05,
"loss": 0.1484,
"step": 2920
},
{
"epoch": 0.59,
"grad_norm": 2.19199552449932,
"learning_rate": 1.8828754144748958e-05,
"loss": 0.2029,
"step": 2930
},
{
"epoch": 0.6,
"grad_norm": 1.9095904364534577,
"learning_rate": 1.8818244924219217e-05,
"loss": 0.1997,
"step": 2940
},
{
"epoch": 0.6,
"grad_norm": 2.052834839544693,
"learning_rate": 1.8807691726295053e-05,
"loss": 0.1536,
"step": 2950
},
{
"epoch": 0.6,
"grad_norm": 2.179267323237696,
"learning_rate": 1.8797094603612192e-05,
"loss": 0.2086,
"step": 2960
},
{
"epoch": 0.6,
"grad_norm": 2.5098689789030355,
"learning_rate": 1.878645360902546e-05,
"loss": 0.1994,
"step": 2970
},
{
"epoch": 0.6,
"grad_norm": 4.400137699746838,
"learning_rate": 1.8775768795608472e-05,
"loss": 0.1606,
"step": 2980
},
{
"epoch": 0.61,
"grad_norm": 2.590590977646675,
"learning_rate": 1.8765040216653427e-05,
"loss": 0.1897,
"step": 2990
},
{
"epoch": 0.61,
"grad_norm": 2.1321993774458954,
"learning_rate": 1.8754267925670796e-05,
"loss": 0.1531,
"step": 3000
},
{
"epoch": 0.61,
"grad_norm": 2.5568642398269334,
"learning_rate": 1.8743451976389068e-05,
"loss": 0.178,
"step": 3010
},
{
"epoch": 0.61,
"grad_norm": 1.9214830076333376,
"learning_rate": 1.8732592422754495e-05,
"loss": 0.1897,
"step": 3020
},
{
"epoch": 0.61,
"grad_norm": 1.969751254803088,
"learning_rate": 1.8721689318930806e-05,
"loss": 0.1502,
"step": 3030
},
{
"epoch": 0.62,
"grad_norm": 5.017433292114636,
"learning_rate": 1.871074271929894e-05,
"loss": 0.1982,
"step": 3040
},
{
"epoch": 0.62,
"grad_norm": 1.9369614375603101,
"learning_rate": 1.8699752678456788e-05,
"loss": 0.1719,
"step": 3050
},
{
"epoch": 0.62,
"grad_norm": 2.006594277192507,
"learning_rate": 1.86887192512189e-05,
"loss": 0.1932,
"step": 3060
},
{
"epoch": 0.62,
"grad_norm": 1.715514310451893,
"learning_rate": 1.8677642492616236e-05,
"loss": 0.1801,
"step": 3070
},
{
"epoch": 0.62,
"grad_norm": 1.9267545952361633,
"learning_rate": 1.8666522457895862e-05,
"loss": 0.1893,
"step": 3080
},
{
"epoch": 0.63,
"grad_norm": 2.1874472475503106,
"learning_rate": 1.86553592025207e-05,
"loss": 0.1767,
"step": 3090
},
{
"epoch": 0.63,
"grad_norm": 1.8843605357258664,
"learning_rate": 1.8644152782169247e-05,
"loss": 0.1802,
"step": 3100
},
{
"epoch": 0.63,
"grad_norm": 2.3298848304608684,
"learning_rate": 1.8632903252735276e-05,
"loss": 0.1667,
"step": 3110
},
{
"epoch": 0.63,
"grad_norm": 2.0245193476149312,
"learning_rate": 1.862161067032759e-05,
"loss": 0.1834,
"step": 3120
},
{
"epoch": 0.63,
"grad_norm": 2.239252595358892,
"learning_rate": 1.861027509126971e-05,
"loss": 0.2083,
"step": 3130
},
{
"epoch": 0.64,
"grad_norm": 2.000691294407405,
"learning_rate": 1.8598896572099624e-05,
"loss": 0.1739,
"step": 3140
},
{
"epoch": 0.64,
"grad_norm": 1.656761882061858,
"learning_rate": 1.8587475169569483e-05,
"loss": 0.1815,
"step": 3150
},
{
"epoch": 0.64,
"grad_norm": 2.1245215382192346,
"learning_rate": 1.8576010940645325e-05,
"loss": 0.1606,
"step": 3160
},
{
"epoch": 0.64,
"grad_norm": 1.7762853803876042,
"learning_rate": 1.856450394250679e-05,
"loss": 0.1969,
"step": 3170
},
{
"epoch": 0.64,
"grad_norm": 2.8830827622510697,
"learning_rate": 1.855295423254685e-05,
"loss": 0.1555,
"step": 3180
},
{
"epoch": 0.65,
"grad_norm": 2.1861197340441234,
"learning_rate": 1.854136186837149e-05,
"loss": 0.1889,
"step": 3190
},
{
"epoch": 0.65,
"grad_norm": 2.3198392751530705,
"learning_rate": 1.8529726907799444e-05,
"loss": 0.1943,
"step": 3200
},
{
"epoch": 0.65,
"grad_norm": 1.9214231378576725,
"learning_rate": 1.8518049408861915e-05,
"loss": 0.1831,
"step": 3210
},
{
"epoch": 0.65,
"grad_norm": 2.398048593011276,
"learning_rate": 1.850632942980226e-05,
"loss": 0.2029,
"step": 3220
},
{
"epoch": 0.65,
"grad_norm": 2.2143080398207573,
"learning_rate": 1.8494567029075714e-05,
"loss": 0.1718,
"step": 3230
},
{
"epoch": 0.66,
"grad_norm": 2.126646974386725,
"learning_rate": 1.84827622653491e-05,
"loss": 0.1302,
"step": 3240
},
{
"epoch": 0.66,
"grad_norm": 11.599444777256583,
"learning_rate": 1.847091519750053e-05,
"loss": 0.1809,
"step": 3250
},
{
"epoch": 0.66,
"grad_norm": 2.3991674250656403,
"learning_rate": 1.8459025884619125e-05,
"loss": 0.1865,
"step": 3260
},
{
"epoch": 0.66,
"grad_norm": 6.4503963141704155,
"learning_rate": 1.844709438600469e-05,
"loss": 0.2251,
"step": 3270
},
{
"epoch": 0.66,
"grad_norm": 2.4044007961395626,
"learning_rate": 1.8435120761167453e-05,
"loss": 0.1708,
"step": 3280
},
{
"epoch": 0.67,
"grad_norm": 1.5993439730175965,
"learning_rate": 1.8423105069827753e-05,
"loss": 0.1651,
"step": 3290
},
{
"epoch": 0.67,
"grad_norm": 1.9740878207799195,
"learning_rate": 1.8411047371915737e-05,
"loss": 0.1955,
"step": 3300
},
{
"epoch": 0.67,
"grad_norm": 2.244083998109266,
"learning_rate": 1.839894772757106e-05,
"loss": 0.2106,
"step": 3310
},
{
"epoch": 0.67,
"grad_norm": 2.2654343097283434,
"learning_rate": 1.8386806197142607e-05,
"loss": 0.1331,
"step": 3320
},
{
"epoch": 0.67,
"grad_norm": 1.9373155321474749,
"learning_rate": 1.837462284118817e-05,
"loss": 0.143,
"step": 3330
},
{
"epoch": 0.68,
"grad_norm": 2.041586413720046,
"learning_rate": 1.8362397720474144e-05,
"loss": 0.1805,
"step": 3340
},
{
"epoch": 0.68,
"grad_norm": 2.311869767372086,
"learning_rate": 1.8350130895975247e-05,
"loss": 0.1748,
"step": 3350
},
{
"epoch": 0.68,
"grad_norm": 2.0358596142782224,
"learning_rate": 1.8337822428874187e-05,
"loss": 0.197,
"step": 3360
},
{
"epoch": 0.68,
"grad_norm": 12.765829395166001,
"learning_rate": 1.8325472380561382e-05,
"loss": 0.2043,
"step": 3370
},
{
"epoch": 0.68,
"grad_norm": 2.283832499199408,
"learning_rate": 1.831308081263464e-05,
"loss": 0.1696,
"step": 3380
},
{
"epoch": 0.69,
"grad_norm": 2.099264143128957,
"learning_rate": 1.8300647786898843e-05,
"loss": 0.1772,
"step": 3390
},
{
"epoch": 0.69,
"grad_norm": 3.0418348048172117,
"learning_rate": 1.8288173365365675e-05,
"loss": 0.1679,
"step": 3400
},
{
"epoch": 0.69,
"grad_norm": 2.928400270597656,
"learning_rate": 1.827565761025326e-05,
"loss": 0.1839,
"step": 3410
},
{
"epoch": 0.69,
"grad_norm": 1.9489192388525756,
"learning_rate": 1.82631005839859e-05,
"loss": 0.1702,
"step": 3420
},
{
"epoch": 0.69,
"grad_norm": 1.866673546449154,
"learning_rate": 1.825050234919374e-05,
"loss": 0.1913,
"step": 3430
},
{
"epoch": 0.7,
"grad_norm": 1.9782480885566223,
"learning_rate": 1.8237862968712442e-05,
"loss": 0.189,
"step": 3440
},
{
"epoch": 0.7,
"grad_norm": 2.072252048803903,
"learning_rate": 1.8225182505582918e-05,
"loss": 0.1908,
"step": 3450
},
{
"epoch": 0.7,
"grad_norm": 2.1333806325715523,
"learning_rate": 1.821246102305096e-05,
"loss": 0.204,
"step": 3460
},
{
"epoch": 0.7,
"grad_norm": 2.0590278992877113,
"learning_rate": 1.8199698584566967e-05,
"loss": 0.1833,
"step": 3470
},
{
"epoch": 0.71,
"grad_norm": 2.2552945044942474,
"learning_rate": 1.8186895253785603e-05,
"loss": 0.2076,
"step": 3480
},
{
"epoch": 0.71,
"grad_norm": 1.9634969960872868,
"learning_rate": 1.8174051094565484e-05,
"loss": 0.2097,
"step": 3490
},
{
"epoch": 0.71,
"grad_norm": 2.2410044456550184,
"learning_rate": 1.816116617096889e-05,
"loss": 0.2217,
"step": 3500
},
{
"epoch": 0.71,
"grad_norm": 2.1336586639575237,
"learning_rate": 1.8148240547261387e-05,
"loss": 0.18,
"step": 3510
},
{
"epoch": 0.71,
"grad_norm": 2.173336388150898,
"learning_rate": 1.813527428791156e-05,
"loss": 0.1756,
"step": 3520
},
{
"epoch": 0.72,
"grad_norm": 2.671364540282752,
"learning_rate": 1.812226745759066e-05,
"loss": 0.1863,
"step": 3530
},
{
"epoch": 0.72,
"grad_norm": 1.9057012299641733,
"learning_rate": 1.8109220121172306e-05,
"loss": 0.2206,
"step": 3540
},
{
"epoch": 0.72,
"grad_norm": 0.1813973982571034,
"learning_rate": 1.8096132343732135e-05,
"loss": 0.1462,
"step": 3550
},
{
"epoch": 0.72,
"grad_norm": 1.8892866709987723,
"learning_rate": 1.808300419054749e-05,
"loss": 0.1803,
"step": 3560
},
{
"epoch": 0.72,
"grad_norm": 2.003086926459966,
"learning_rate": 1.80698357270971e-05,
"loss": 0.2032,
"step": 3570
},
{
"epoch": 0.73,
"grad_norm": 3.2221389262807874,
"learning_rate": 1.8056627019060738e-05,
"loss": 0.1631,
"step": 3580
},
{
"epoch": 0.73,
"grad_norm": 1.2538102728217932,
"learning_rate": 1.8043378132318927e-05,
"loss": 0.1692,
"step": 3590
},
{
"epoch": 0.73,
"grad_norm": 4.563350232475948,
"learning_rate": 1.8030089132952557e-05,
"loss": 0.1727,
"step": 3600
},
{
"epoch": 0.73,
"grad_norm": 2.000374282147408,
"learning_rate": 1.8016760087242605e-05,
"loss": 0.1733,
"step": 3610
},
{
"epoch": 0.73,
"grad_norm": 1.9278853800801559,
"learning_rate": 1.800339106166978e-05,
"loss": 0.1852,
"step": 3620
},
{
"epoch": 0.74,
"grad_norm": 2.2241283295196514,
"learning_rate": 1.79899821229142e-05,
"loss": 0.143,
"step": 3630
},
{
"epoch": 0.74,
"grad_norm": 2.4864913275556133,
"learning_rate": 1.7976533337855053e-05,
"loss": 0.1884,
"step": 3640
},
{
"epoch": 0.74,
"grad_norm": 3.860042375322093,
"learning_rate": 1.7963044773570265e-05,
"loss": 0.1641,
"step": 3650
},
{
"epoch": 0.74,
"grad_norm": 2.0454176745318677,
"learning_rate": 1.7949516497336176e-05,
"loss": 0.1864,
"step": 3660
},
{
"epoch": 0.74,
"grad_norm": 2.46822670327846,
"learning_rate": 1.793594857662718e-05,
"loss": 0.1924,
"step": 3670
},
{
"epoch": 0.75,
"grad_norm": 2.4872265651073158,
"learning_rate": 1.792234107911542e-05,
"loss": 0.1546,
"step": 3680
},
{
"epoch": 0.75,
"grad_norm": 2.0379945728128663,
"learning_rate": 1.7908694072670426e-05,
"loss": 0.1711,
"step": 3690
},
{
"epoch": 0.75,
"grad_norm": 2.407085620398544,
"learning_rate": 1.7895007625358783e-05,
"loss": 0.1701,
"step": 3700
},
{
"epoch": 0.75,
"grad_norm": 4.042625051438552,
"learning_rate": 1.7881281805443805e-05,
"loss": 0.195,
"step": 3710
},
{
"epoch": 0.75,
"grad_norm": 1.9781223218758097,
"learning_rate": 1.786751668138517e-05,
"loss": 0.1621,
"step": 3720
},
{
"epoch": 0.76,
"grad_norm": 4.657693319193961,
"learning_rate": 1.7853712321838602e-05,
"loss": 0.1968,
"step": 3730
},
{
"epoch": 0.76,
"grad_norm": 17.520636897246522,
"learning_rate": 1.7839868795655507e-05,
"loss": 0.1475,
"step": 3740
},
{
"epoch": 0.76,
"grad_norm": 2.1710924893238626,
"learning_rate": 1.782598617188265e-05,
"loss": 0.1686,
"step": 3750
},
{
"epoch": 0.76,
"grad_norm": 2.205220842093223,
"learning_rate": 1.78120645197618e-05,
"loss": 0.1587,
"step": 3760
},
{
"epoch": 0.76,
"grad_norm": 1.95114988022513,
"learning_rate": 1.7798103908729377e-05,
"loss": 0.1743,
"step": 3770
},
{
"epoch": 0.77,
"grad_norm": 2.7456615210938744,
"learning_rate": 1.778410440841613e-05,
"loss": 0.1468,
"step": 3780
},
{
"epoch": 0.77,
"grad_norm": 2.8973758532017895,
"learning_rate": 1.7770066088646767e-05,
"loss": 0.1883,
"step": 3790
},
{
"epoch": 0.77,
"grad_norm": 2.5445827161522616,
"learning_rate": 1.7755989019439607e-05,
"loss": 0.1895,
"step": 3800
},
{
"epoch": 0.77,
"grad_norm": 2.169743981267003,
"learning_rate": 1.774187327100625e-05,
"loss": 0.1716,
"step": 3810
},
{
"epoch": 0.77,
"grad_norm": 2.1135486144554387,
"learning_rate": 1.7727718913751207e-05,
"loss": 0.1164,
"step": 3820
},
{
"epoch": 0.78,
"grad_norm": 2.0715545958907535,
"learning_rate": 1.7713526018271558e-05,
"loss": 0.1717,
"step": 3830
},
{
"epoch": 0.78,
"grad_norm": 8.108185256384777,
"learning_rate": 1.76992946553566e-05,
"loss": 0.1662,
"step": 3840
},
{
"epoch": 0.78,
"grad_norm": 2.4906125249035713,
"learning_rate": 1.7685024895987494e-05,
"loss": 0.184,
"step": 3850
},
{
"epoch": 0.78,
"grad_norm": 3.009141821458063,
"learning_rate": 1.7670716811336902e-05,
"loss": 0.2086,
"step": 3860
},
{
"epoch": 0.78,
"grad_norm": 1.935774373100715,
"learning_rate": 1.7656370472768648e-05,
"loss": 0.1566,
"step": 3870
},
{
"epoch": 0.79,
"grad_norm": 6.368484758089139,
"learning_rate": 1.7641985951837347e-05,
"loss": 0.2127,
"step": 3880
},
{
"epoch": 0.79,
"grad_norm": 2.2821667018184533,
"learning_rate": 1.7627563320288056e-05,
"loss": 0.1822,
"step": 3890
},
{
"epoch": 0.79,
"grad_norm": 1.7481761010849746,
"learning_rate": 1.7613102650055925e-05,
"loss": 0.2264,
"step": 3900
},
{
"epoch": 0.79,
"grad_norm": 2.111098426089184,
"learning_rate": 1.759860401326581e-05,
"loss": 0.1838,
"step": 3910
},
{
"epoch": 0.79,
"grad_norm": 2.3865926806030204,
"learning_rate": 1.758406748223194e-05,
"loss": 0.1779,
"step": 3920
},
{
"epoch": 0.8,
"grad_norm": 2.1732390806125,
"learning_rate": 1.7569493129457554e-05,
"loss": 0.1713,
"step": 3930
},
{
"epoch": 0.8,
"grad_norm": 1.8814065202760815,
"learning_rate": 1.7554881027634516e-05,
"loss": 0.178,
"step": 3940
},
{
"epoch": 0.8,
"grad_norm": 2.16916361006078,
"learning_rate": 1.754023124964299e-05,
"loss": 0.1475,
"step": 3950
},
{
"epoch": 0.8,
"grad_norm": 2.27943306248829,
"learning_rate": 1.7525543868551045e-05,
"loss": 0.1997,
"step": 3960
},
{
"epoch": 0.8,
"grad_norm": 3.117493119710199,
"learning_rate": 1.7510818957614292e-05,
"loss": 0.1475,
"step": 3970
},
{
"epoch": 0.81,
"grad_norm": 2.3033259103584567,
"learning_rate": 1.7496056590275546e-05,
"loss": 0.1853,
"step": 3980
},
{
"epoch": 0.81,
"grad_norm": 4.045509608812605,
"learning_rate": 1.7481256840164436e-05,
"loss": 0.171,
"step": 3990
},
{
"epoch": 0.81,
"grad_norm": 1.7320172203917021,
"learning_rate": 1.7466419781097038e-05,
"loss": 0.1619,
"step": 4000
},
{
"epoch": 0.81,
"grad_norm": 2.1721310910520772,
"learning_rate": 1.745154548707551e-05,
"loss": 0.1614,
"step": 4010
},
{
"epoch": 0.81,
"grad_norm": 3.4362498003979374,
"learning_rate": 1.7436634032287735e-05,
"loss": 0.1885,
"step": 4020
},
{
"epoch": 0.82,
"grad_norm": 2.203441191364378,
"learning_rate": 1.7421685491106933e-05,
"loss": 0.1746,
"step": 4030
},
{
"epoch": 0.82,
"grad_norm": 2.2235513235782136,
"learning_rate": 1.740669993809131e-05,
"loss": 0.1958,
"step": 4040
},
{
"epoch": 0.82,
"grad_norm": 1.662744062478203,
"learning_rate": 1.7391677447983663e-05,
"loss": 0.168,
"step": 4050
},
{
"epoch": 0.82,
"grad_norm": 1.9923877091876279,
"learning_rate": 1.7376618095711018e-05,
"loss": 0.1718,
"step": 4060
},
{
"epoch": 0.82,
"grad_norm": 2.426593736593661,
"learning_rate": 1.7361521956384264e-05,
"loss": 0.1741,
"step": 4070
},
{
"epoch": 0.83,
"grad_norm": 1.7464227217148067,
"learning_rate": 1.7346389105297766e-05,
"loss": 0.1726,
"step": 4080
},
{
"epoch": 0.83,
"grad_norm": 2.3749413734026383,
"learning_rate": 1.7331219617928997e-05,
"loss": 0.1583,
"step": 4090
},
{
"epoch": 0.83,
"grad_norm": 2.114701165986187,
"learning_rate": 1.7316013569938154e-05,
"loss": 0.2066,
"step": 4100
},
{
"epoch": 0.83,
"grad_norm": 1.8180037497973824,
"learning_rate": 1.73007710371678e-05,
"loss": 0.188,
"step": 4110
},
{
"epoch": 0.83,
"grad_norm": 2.2556879416055726,
"learning_rate": 1.7285492095642455e-05,
"loss": 0.1824,
"step": 4120
},
{
"epoch": 0.84,
"grad_norm": 1.9441561651729724,
"learning_rate": 1.7270176821568244e-05,
"loss": 0.1828,
"step": 4130
},
{
"epoch": 0.84,
"grad_norm": 3.863082155472389,
"learning_rate": 1.72548252913325e-05,
"loss": 0.1929,
"step": 4140
},
{
"epoch": 0.84,
"grad_norm": 2.1777659050408067,
"learning_rate": 1.72394375815034e-05,
"loss": 0.1872,
"step": 4150
},
{
"epoch": 0.84,
"grad_norm": 2.289054063384357,
"learning_rate": 1.722401376882955e-05,
"loss": 0.1619,
"step": 4160
},
{
"epoch": 0.84,
"grad_norm": 7.1273553535612,
"learning_rate": 1.7208553930239655e-05,
"loss": 0.1752,
"step": 4170
},
{
"epoch": 0.85,
"grad_norm": 2.4202257016442523,
"learning_rate": 1.7193058142842076e-05,
"loss": 0.1966,
"step": 4180
},
{
"epoch": 0.85,
"grad_norm": 2.599358985816695,
"learning_rate": 1.7177526483924492e-05,
"loss": 0.1739,
"step": 4190
},
{
"epoch": 0.85,
"grad_norm": 2.145207088406254,
"learning_rate": 1.7161959030953498e-05,
"loss": 0.1606,
"step": 4200
},
{
"epoch": 0.85,
"grad_norm": 2.4989297483292643,
"learning_rate": 1.71463558615742e-05,
"loss": 0.1441,
"step": 4210
},
{
"epoch": 0.85,
"grad_norm": 2.333564701416749,
"learning_rate": 1.713071705360987e-05,
"loss": 0.1697,
"step": 4220
},
{
"epoch": 0.86,
"grad_norm": 1.6007875381874954,
"learning_rate": 1.7115042685061507e-05,
"loss": 0.1801,
"step": 4230
},
{
"epoch": 0.86,
"grad_norm": 2.0001777331534516,
"learning_rate": 1.7099332834107497e-05,
"loss": 0.1236,
"step": 4240
},
{
"epoch": 0.86,
"grad_norm": 2.6336492006976187,
"learning_rate": 1.7083587579103187e-05,
"loss": 0.166,
"step": 4250
},
{
"epoch": 0.86,
"grad_norm": 2.181960694578793,
"learning_rate": 1.7067806998580507e-05,
"loss": 0.1997,
"step": 4260
},
{
"epoch": 0.87,
"grad_norm": 3.2024986585127366,
"learning_rate": 1.7051991171247582e-05,
"loss": 0.1508,
"step": 4270
},
{
"epoch": 0.87,
"grad_norm": 2.0206614417941258,
"learning_rate": 1.7036140175988344e-05,
"loss": 0.1471,
"step": 4280
},
{
"epoch": 0.87,
"grad_norm": 2.047393965997251,
"learning_rate": 1.702025409186211e-05,
"loss": 0.1777,
"step": 4290
},
{
"epoch": 0.87,
"grad_norm": 2.6683142062789713,
"learning_rate": 1.7004332998103232e-05,
"loss": 0.1769,
"step": 4300
},
{
"epoch": 0.87,
"grad_norm": 1.6732439515126907,
"learning_rate": 1.698837697412066e-05,
"loss": 0.1268,
"step": 4310
},
{
"epoch": 0.88,
"grad_norm": 3.001202183493077,
"learning_rate": 1.697238609949757e-05,
"loss": 0.1489,
"step": 4320
},
{
"epoch": 0.88,
"grad_norm": 1.9957859824768167,
"learning_rate": 1.6956360453990964e-05,
"loss": 0.1536,
"step": 4330
},
{
"epoch": 0.88,
"grad_norm": 3.2825840735727154,
"learning_rate": 1.694030011753127e-05,
"loss": 0.2101,
"step": 4340
},
{
"epoch": 0.88,
"grad_norm": 2.0765138274517088,
"learning_rate": 1.6924205170221933e-05,
"loss": 0.1811,
"step": 4350
},
{
"epoch": 0.88,
"grad_norm": 2.1133828976030595,
"learning_rate": 1.6908075692339035e-05,
"loss": 0.1728,
"step": 4360
},
{
"epoch": 0.89,
"grad_norm": 1.1354696745558366,
"learning_rate": 1.6891911764330887e-05,
"loss": 0.1663,
"step": 4370
},
{
"epoch": 0.89,
"grad_norm": 2.131963092613327,
"learning_rate": 1.6875713466817608e-05,
"loss": 0.1971,
"step": 4380
},
{
"epoch": 0.89,
"grad_norm": 6.080414407059132,
"learning_rate": 1.6859480880590755e-05,
"loss": 0.1518,
"step": 4390
},
{
"epoch": 0.89,
"grad_norm": 1.9313104885467733,
"learning_rate": 1.684321408661291e-05,
"loss": 0.1726,
"step": 4400
},
{
"epoch": 0.89,
"grad_norm": 1.891911281896888,
"learning_rate": 1.6826913166017257e-05,
"loss": 0.2049,
"step": 4410
},
{
"epoch": 0.9,
"grad_norm": 1.8833660404212225,
"learning_rate": 1.68105782001072e-05,
"loss": 0.1628,
"step": 4420
},
{
"epoch": 0.9,
"grad_norm": 2.043852343315347,
"learning_rate": 1.6794209270355946e-05,
"loss": 0.1975,
"step": 4430
},
{
"epoch": 0.9,
"grad_norm": 1.4598525807667742,
"learning_rate": 1.677780645840611e-05,
"loss": 0.159,
"step": 4440
},
{
"epoch": 0.9,
"grad_norm": 1.6015094283079794,
"learning_rate": 1.6761369846069292e-05,
"loss": 0.157,
"step": 4450
},
{
"epoch": 0.9,
"grad_norm": 2.0628558921093125,
"learning_rate": 1.6744899515325674e-05,
"loss": 0.1748,
"step": 4460
},
{
"epoch": 0.91,
"grad_norm": 2.076881323364807,
"learning_rate": 1.672839554832362e-05,
"loss": 0.1966,
"step": 4470
},
{
"epoch": 0.91,
"grad_norm": 2.0325935028828135,
"learning_rate": 1.671185802737926e-05,
"loss": 0.1885,
"step": 4480
},
{
"epoch": 0.91,
"grad_norm": 2.280549242220261,
"learning_rate": 1.6695287034976078e-05,
"loss": 0.1624,
"step": 4490
},
{
"epoch": 0.91,
"grad_norm": 2.08609705396923,
"learning_rate": 1.6678682653764502e-05,
"loss": 0.1631,
"step": 4500
},
{
"epoch": 0.91,
"grad_norm": 2.2526237237040903,
"learning_rate": 1.666204496656149e-05,
"loss": 0.145,
"step": 4510
},
{
"epoch": 0.92,
"grad_norm": 1.9007711000245981,
"learning_rate": 1.6645374056350128e-05,
"loss": 0.173,
"step": 4520
},
{
"epoch": 0.92,
"grad_norm": 1.9372171995443488,
"learning_rate": 1.6628670006279194e-05,
"loss": 0.142,
"step": 4530
}
],
"logging_steps": 10,
"max_steps": 14808,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 4532,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}