bgem3-synthetic-v2-e4 / trainer_state.json
nntoan209's picture
Upload folder using huggingface_hub
c5cd147 verified
raw
history blame contribute delete
No virus
203 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 25304,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 13.192614390744945,
"learning_rate": 4.3780025284450067e-07,
"loss": 0.6539,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 10.26668883317148,
"learning_rate": 6.881163084702909e-07,
"loss": 0.6351,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 8.093364927344371,
"learning_rate": 9.38432364096081e-07,
"loss": 0.621,
"step": 60
},
{
"epoch": 0.01,
"grad_norm": 6.435663053769238,
"learning_rate": 1.188748419721871e-06,
"loss": 0.5702,
"step": 80
},
{
"epoch": 0.02,
"grad_norm": 7.962395022671904,
"learning_rate": 1.4390644753476612e-06,
"loss": 0.5317,
"step": 100
},
{
"epoch": 0.02,
"grad_norm": 6.601932601034603,
"learning_rate": 1.6893805309734515e-06,
"loss": 0.5138,
"step": 120
},
{
"epoch": 0.02,
"grad_norm": 5.565400471086017,
"learning_rate": 1.9396965865992414e-06,
"loss": 0.4772,
"step": 140
},
{
"epoch": 0.03,
"grad_norm": 10.719063890493263,
"learning_rate": 2.1900126422250318e-06,
"loss": 0.4828,
"step": 160
},
{
"epoch": 0.03,
"grad_norm": 5.456905246118198,
"learning_rate": 2.4403286978508217e-06,
"loss": 0.4612,
"step": 180
},
{
"epoch": 0.03,
"grad_norm": 5.074136053502421,
"learning_rate": 2.6906447534766116e-06,
"loss": 0.4614,
"step": 200
},
{
"epoch": 0.03,
"grad_norm": 5.739808379905823,
"learning_rate": 2.940960809102403e-06,
"loss": 0.4455,
"step": 220
},
{
"epoch": 0.04,
"grad_norm": 8.929223129466571,
"learning_rate": 3.1912768647281927e-06,
"loss": 0.4215,
"step": 240
},
{
"epoch": 0.04,
"grad_norm": 5.727490359048043,
"learning_rate": 3.4415929203539826e-06,
"loss": 0.4544,
"step": 260
},
{
"epoch": 0.04,
"grad_norm": 4.692889967552893,
"learning_rate": 3.6919089759797726e-06,
"loss": 0.4362,
"step": 280
},
{
"epoch": 0.05,
"grad_norm": 5.492004651950497,
"learning_rate": 3.942225031605563e-06,
"loss": 0.4102,
"step": 300
},
{
"epoch": 0.05,
"grad_norm": 4.905463477586154,
"learning_rate": 4.192541087231353e-06,
"loss": 0.4163,
"step": 320
},
{
"epoch": 0.05,
"grad_norm": 6.042815351357798,
"learning_rate": 4.442857142857143e-06,
"loss": 0.4043,
"step": 340
},
{
"epoch": 0.06,
"grad_norm": 3.966783411781683,
"learning_rate": 4.693173198482934e-06,
"loss": 0.4134,
"step": 360
},
{
"epoch": 0.06,
"grad_norm": 6.15182506494119,
"learning_rate": 4.943489254108724e-06,
"loss": 0.4095,
"step": 380
},
{
"epoch": 0.06,
"grad_norm": 5.556343872513135,
"learning_rate": 5.193805309734513e-06,
"loss": 0.3952,
"step": 400
},
{
"epoch": 0.07,
"grad_norm": 6.168472163196086,
"learning_rate": 5.4441213653603045e-06,
"loss": 0.4,
"step": 420
},
{
"epoch": 0.07,
"grad_norm": 7.710878876344402,
"learning_rate": 5.6944374209860944e-06,
"loss": 0.4158,
"step": 440
},
{
"epoch": 0.07,
"grad_norm": 10.149729727919098,
"learning_rate": 5.944753476611884e-06,
"loss": 0.4008,
"step": 460
},
{
"epoch": 0.08,
"grad_norm": 5.276529583928554,
"learning_rate": 6.195069532237674e-06,
"loss": 0.3693,
"step": 480
},
{
"epoch": 0.08,
"grad_norm": 5.4922316048860305,
"learning_rate": 6.445385587863464e-06,
"loss": 0.3655,
"step": 500
},
{
"epoch": 0.08,
"grad_norm": 4.834486637416974,
"learning_rate": 6.695701643489254e-06,
"loss": 0.4071,
"step": 520
},
{
"epoch": 0.09,
"grad_norm": 5.109668221648755,
"learning_rate": 6.946017699115044e-06,
"loss": 0.3611,
"step": 540
},
{
"epoch": 0.09,
"grad_norm": 4.962197451653283,
"learning_rate": 7.196333754740835e-06,
"loss": 0.3617,
"step": 560
},
{
"epoch": 0.09,
"grad_norm": 5.725555872687781,
"learning_rate": 7.4466498103666256e-06,
"loss": 0.3938,
"step": 580
},
{
"epoch": 0.09,
"grad_norm": 21.199977745188423,
"learning_rate": 7.696965865992416e-06,
"loss": 0.3753,
"step": 600
},
{
"epoch": 0.1,
"grad_norm": 6.822065444923147,
"learning_rate": 7.947281921618205e-06,
"loss": 0.3732,
"step": 620
},
{
"epoch": 0.1,
"grad_norm": 5.562639041834479,
"learning_rate": 8.197597977243996e-06,
"loss": 0.3641,
"step": 640
},
{
"epoch": 0.1,
"grad_norm": 6.746641948539073,
"learning_rate": 8.447914032869787e-06,
"loss": 0.3608,
"step": 660
},
{
"epoch": 0.11,
"grad_norm": 5.083732630988663,
"learning_rate": 8.698230088495576e-06,
"loss": 0.3489,
"step": 680
},
{
"epoch": 0.11,
"grad_norm": 5.36345764003458,
"learning_rate": 8.948546144121367e-06,
"loss": 0.3495,
"step": 700
},
{
"epoch": 0.11,
"grad_norm": 5.94599708094623,
"learning_rate": 9.198862199747156e-06,
"loss": 0.3568,
"step": 720
},
{
"epoch": 0.12,
"grad_norm": 5.59596830741628,
"learning_rate": 9.449178255372947e-06,
"loss": 0.3424,
"step": 740
},
{
"epoch": 0.12,
"grad_norm": 4.687091250082153,
"learning_rate": 9.699494310998736e-06,
"loss": 0.3373,
"step": 760
},
{
"epoch": 0.12,
"grad_norm": 4.305364261329056,
"learning_rate": 9.949810366624526e-06,
"loss": 0.3863,
"step": 780
},
{
"epoch": 0.13,
"grad_norm": 5.439101060195295,
"learning_rate": 1.0200126422250315e-05,
"loss": 0.3417,
"step": 800
},
{
"epoch": 0.13,
"grad_norm": 5.395373389869628,
"learning_rate": 1.0450442477876108e-05,
"loss": 0.3375,
"step": 820
},
{
"epoch": 0.13,
"grad_norm": 4.781731874205922,
"learning_rate": 1.0700758533501895e-05,
"loss": 0.3296,
"step": 840
},
{
"epoch": 0.14,
"grad_norm": 5.456546617203758,
"learning_rate": 1.0951074589127688e-05,
"loss": 0.3595,
"step": 860
},
{
"epoch": 0.14,
"grad_norm": 5.3912349632260455,
"learning_rate": 1.1201390644753475e-05,
"loss": 0.3597,
"step": 880
},
{
"epoch": 0.14,
"grad_norm": 4.404539817994699,
"learning_rate": 1.1451706700379268e-05,
"loss": 0.3314,
"step": 900
},
{
"epoch": 0.15,
"grad_norm": 7.289102012469942,
"learning_rate": 1.1702022756005057e-05,
"loss": 0.3406,
"step": 920
},
{
"epoch": 0.15,
"grad_norm": 5.786352926324861,
"learning_rate": 1.1952338811630847e-05,
"loss": 0.3509,
"step": 940
},
{
"epoch": 0.15,
"grad_norm": 4.131405105795528,
"learning_rate": 1.220265486725664e-05,
"loss": 0.3459,
"step": 960
},
{
"epoch": 0.15,
"grad_norm": 5.036101661631649,
"learning_rate": 1.2452970922882427e-05,
"loss": 0.3327,
"step": 980
},
{
"epoch": 0.16,
"grad_norm": 4.9031568362075895,
"learning_rate": 1.270328697850822e-05,
"loss": 0.359,
"step": 1000
},
{
"epoch": 0.16,
"grad_norm": 6.680058126596432,
"learning_rate": 1.2953603034134009e-05,
"loss": 0.3421,
"step": 1020
},
{
"epoch": 0.16,
"grad_norm": 6.016889325334172,
"learning_rate": 1.32039190897598e-05,
"loss": 0.3374,
"step": 1040
},
{
"epoch": 0.17,
"grad_norm": 6.844742376412928,
"learning_rate": 1.3454235145385589e-05,
"loss": 0.3237,
"step": 1060
},
{
"epoch": 0.17,
"grad_norm": 5.969902845962769,
"learning_rate": 1.370455120101138e-05,
"loss": 0.3237,
"step": 1080
},
{
"epoch": 0.17,
"grad_norm": 6.2950164096609305,
"learning_rate": 1.3954867256637168e-05,
"loss": 0.3168,
"step": 1100
},
{
"epoch": 0.18,
"grad_norm": 5.2335271803475445,
"learning_rate": 1.4205183312262961e-05,
"loss": 0.3396,
"step": 1120
},
{
"epoch": 0.18,
"grad_norm": 6.225398358080941,
"learning_rate": 1.4455499367888748e-05,
"loss": 0.333,
"step": 1140
},
{
"epoch": 0.18,
"grad_norm": 5.3208556045070114,
"learning_rate": 1.470581542351454e-05,
"loss": 0.3284,
"step": 1160
},
{
"epoch": 0.19,
"grad_norm": 4.7849250581905665,
"learning_rate": 1.495613147914033e-05,
"loss": 0.3304,
"step": 1180
},
{
"epoch": 0.19,
"grad_norm": 5.184459116726432,
"learning_rate": 1.520644753476612e-05,
"loss": 0.3462,
"step": 1200
},
{
"epoch": 0.19,
"grad_norm": 4.19686707405711,
"learning_rate": 1.545676359039191e-05,
"loss": 0.3325,
"step": 1220
},
{
"epoch": 0.2,
"grad_norm": 4.450896337868275,
"learning_rate": 1.57070796460177e-05,
"loss": 0.3057,
"step": 1240
},
{
"epoch": 0.2,
"grad_norm": 4.50018119447218,
"learning_rate": 1.5957395701643493e-05,
"loss": 0.3288,
"step": 1260
},
{
"epoch": 0.2,
"grad_norm": 5.166275173654794,
"learning_rate": 1.620771175726928e-05,
"loss": 0.3167,
"step": 1280
},
{
"epoch": 0.21,
"grad_norm": 5.056697958587676,
"learning_rate": 1.645802781289507e-05,
"loss": 0.3146,
"step": 1300
},
{
"epoch": 0.21,
"grad_norm": 4.3805271657086795,
"learning_rate": 1.670834386852086e-05,
"loss": 0.3441,
"step": 1320
},
{
"epoch": 0.21,
"grad_norm": 4.627520106605279,
"learning_rate": 1.6958659924146653e-05,
"loss": 0.3166,
"step": 1340
},
{
"epoch": 0.21,
"grad_norm": 4.706945749353132,
"learning_rate": 1.7208975979772438e-05,
"loss": 0.3371,
"step": 1360
},
{
"epoch": 0.22,
"grad_norm": 5.9372079877822665,
"learning_rate": 1.745929203539823e-05,
"loss": 0.3343,
"step": 1380
},
{
"epoch": 0.22,
"grad_norm": 4.745434424046423,
"learning_rate": 1.7709608091024023e-05,
"loss": 0.3075,
"step": 1400
},
{
"epoch": 0.22,
"grad_norm": 4.670681702157214,
"learning_rate": 1.7959924146649812e-05,
"loss": 0.3278,
"step": 1420
},
{
"epoch": 0.23,
"grad_norm": 4.609368793030779,
"learning_rate": 1.82102402022756e-05,
"loss": 0.3193,
"step": 1440
},
{
"epoch": 0.23,
"grad_norm": 3.8953514957120987,
"learning_rate": 1.846055625790139e-05,
"loss": 0.3181,
"step": 1460
},
{
"epoch": 0.23,
"grad_norm": 4.048645810385149,
"learning_rate": 1.8710872313527183e-05,
"loss": 0.3178,
"step": 1480
},
{
"epoch": 0.24,
"grad_norm": 4.039461038727622,
"learning_rate": 1.8961188369152972e-05,
"loss": 0.3101,
"step": 1500
},
{
"epoch": 0.24,
"grad_norm": 4.865092063730574,
"learning_rate": 1.9211504424778764e-05,
"loss": 0.3145,
"step": 1520
},
{
"epoch": 0.24,
"grad_norm": 4.320223981560967,
"learning_rate": 1.9461820480404553e-05,
"loss": 0.3188,
"step": 1540
},
{
"epoch": 0.25,
"grad_norm": 4.24607218830772,
"learning_rate": 1.9712136536030343e-05,
"loss": 0.3316,
"step": 1560
},
{
"epoch": 0.25,
"grad_norm": 6.659098557418422,
"learning_rate": 1.996245259165613e-05,
"loss": 0.3103,
"step": 1580
},
{
"epoch": 0.25,
"grad_norm": 4.4003401278033545,
"learning_rate": 1.9999982293200938e-05,
"loss": 0.3316,
"step": 1600
},
{
"epoch": 0.26,
"grad_norm": 4.466923266959081,
"learning_rate": 1.999992108459333e-05,
"loss": 0.3149,
"step": 1620
},
{
"epoch": 0.26,
"grad_norm": 4.3370197217130935,
"learning_rate": 1.9999816155842287e-05,
"loss": 0.3267,
"step": 1640
},
{
"epoch": 0.26,
"grad_norm": 4.928522761323794,
"learning_rate": 1.9999667507406614e-05,
"loss": 0.3334,
"step": 1660
},
{
"epoch": 0.27,
"grad_norm": 21.27678616799708,
"learning_rate": 1.9999475139936266e-05,
"loss": 0.31,
"step": 1680
},
{
"epoch": 0.27,
"grad_norm": 9.092160010538011,
"learning_rate": 1.9999239054272376e-05,
"loss": 0.3219,
"step": 1700
},
{
"epoch": 0.27,
"grad_norm": 4.287462168712924,
"learning_rate": 1.9998959251447223e-05,
"loss": 0.2765,
"step": 1720
},
{
"epoch": 0.28,
"grad_norm": 4.200006739293095,
"learning_rate": 1.9998635732684236e-05,
"loss": 0.3187,
"step": 1740
},
{
"epoch": 0.28,
"grad_norm": 4.182543197928632,
"learning_rate": 1.9998268499398e-05,
"loss": 0.3038,
"step": 1760
},
{
"epoch": 0.28,
"grad_norm": 4.860519421610365,
"learning_rate": 1.999785755319424e-05,
"loss": 0.3195,
"step": 1780
},
{
"epoch": 0.28,
"grad_norm": 4.6816897176843,
"learning_rate": 1.9997402895869806e-05,
"loss": 0.3122,
"step": 1800
},
{
"epoch": 0.29,
"grad_norm": 4.765263778254334,
"learning_rate": 1.9996904529412684e-05,
"loss": 0.3077,
"step": 1820
},
{
"epoch": 0.29,
"grad_norm": 3.8426131141111215,
"learning_rate": 1.999636245600198e-05,
"loss": 0.3042,
"step": 1840
},
{
"epoch": 0.29,
"grad_norm": 7.697235889469334,
"learning_rate": 1.9995776678007892e-05,
"loss": 0.3219,
"step": 1860
},
{
"epoch": 0.3,
"grad_norm": 5.3692702870620455,
"learning_rate": 1.9995147197991732e-05,
"loss": 0.3319,
"step": 1880
},
{
"epoch": 0.3,
"grad_norm": 6.216633345418337,
"learning_rate": 1.9994474018705895e-05,
"loss": 0.3059,
"step": 1900
},
{
"epoch": 0.3,
"grad_norm": 5.783165856537296,
"learning_rate": 1.9993757143093847e-05,
"loss": 0.3011,
"step": 1920
},
{
"epoch": 0.31,
"grad_norm": 4.96412779585292,
"learning_rate": 1.999299657429011e-05,
"loss": 0.3267,
"step": 1940
},
{
"epoch": 0.31,
"grad_norm": 4.584783821625622,
"learning_rate": 1.9992192315620268e-05,
"loss": 0.3186,
"step": 1960
},
{
"epoch": 0.31,
"grad_norm": 3.966800361004438,
"learning_rate": 1.9991344370600926e-05,
"loss": 0.3037,
"step": 1980
},
{
"epoch": 0.32,
"grad_norm": 5.549652693595451,
"learning_rate": 1.9990452742939716e-05,
"loss": 0.3031,
"step": 2000
},
{
"epoch": 0.32,
"grad_norm": 4.092592964025307,
"learning_rate": 1.9989517436535264e-05,
"loss": 0.299,
"step": 2020
},
{
"epoch": 0.32,
"grad_norm": 7.4143606594255385,
"learning_rate": 1.9988538455477186e-05,
"loss": 0.3212,
"step": 2040
},
{
"epoch": 0.33,
"grad_norm": 4.429279288399913,
"learning_rate": 1.9987515804046065e-05,
"loss": 0.2983,
"step": 2060
},
{
"epoch": 0.33,
"grad_norm": 4.168171471024823,
"learning_rate": 1.9986449486713425e-05,
"loss": 0.2905,
"step": 2080
},
{
"epoch": 0.33,
"grad_norm": 4.023053730559626,
"learning_rate": 1.998533950814173e-05,
"loss": 0.3202,
"step": 2100
},
{
"epoch": 0.34,
"grad_norm": 4.589556797878434,
"learning_rate": 1.998418587318434e-05,
"loss": 0.2929,
"step": 2120
},
{
"epoch": 0.34,
"grad_norm": 4.736297519231011,
"learning_rate": 1.9982988586885513e-05,
"loss": 0.3192,
"step": 2140
},
{
"epoch": 0.34,
"grad_norm": 3.2489559257150242,
"learning_rate": 1.9981747654480363e-05,
"loss": 0.32,
"step": 2160
},
{
"epoch": 0.34,
"grad_norm": 4.901187690508077,
"learning_rate": 1.9980463081394853e-05,
"loss": 0.2987,
"step": 2180
},
{
"epoch": 0.35,
"grad_norm": 5.700830285351671,
"learning_rate": 1.9979134873245754e-05,
"loss": 0.2866,
"step": 2200
},
{
"epoch": 0.35,
"grad_norm": 5.280958720644886,
"learning_rate": 1.9977763035840647e-05,
"loss": 0.308,
"step": 2220
},
{
"epoch": 0.35,
"grad_norm": 3.658172362891215,
"learning_rate": 1.9976347575177864e-05,
"loss": 0.3134,
"step": 2240
},
{
"epoch": 0.36,
"grad_norm": 3.649278564224255,
"learning_rate": 1.9974888497446493e-05,
"loss": 0.2728,
"step": 2260
},
{
"epoch": 0.36,
"grad_norm": 4.213688731917936,
"learning_rate": 1.9973385809026328e-05,
"loss": 0.2954,
"step": 2280
},
{
"epoch": 0.36,
"grad_norm": 4.290870493890129,
"learning_rate": 1.997183951648785e-05,
"loss": 0.3041,
"step": 2300
},
{
"epoch": 0.37,
"grad_norm": 4.497146973593326,
"learning_rate": 1.9970249626592207e-05,
"loss": 0.3039,
"step": 2320
},
{
"epoch": 0.37,
"grad_norm": 3.980598125657535,
"learning_rate": 1.9968616146291173e-05,
"loss": 0.2876,
"step": 2340
},
{
"epoch": 0.37,
"grad_norm": 4.845633060759088,
"learning_rate": 1.9966939082727113e-05,
"loss": 0.2859,
"step": 2360
},
{
"epoch": 0.38,
"grad_norm": 5.0017279500150655,
"learning_rate": 1.9965218443232964e-05,
"loss": 0.2949,
"step": 2380
},
{
"epoch": 0.38,
"grad_norm": 4.812496829341488,
"learning_rate": 1.9963454235332197e-05,
"loss": 0.3043,
"step": 2400
},
{
"epoch": 0.38,
"grad_norm": 4.196149632421477,
"learning_rate": 1.996164646673879e-05,
"loss": 0.2917,
"step": 2420
},
{
"epoch": 0.39,
"grad_norm": 3.8759357184009477,
"learning_rate": 1.9959795145357187e-05,
"loss": 0.2788,
"step": 2440
},
{
"epoch": 0.39,
"grad_norm": 3.840684986809998,
"learning_rate": 1.995790027928226e-05,
"loss": 0.2877,
"step": 2460
},
{
"epoch": 0.39,
"grad_norm": 5.2172036078456605,
"learning_rate": 1.9955961876799288e-05,
"loss": 0.283,
"step": 2480
},
{
"epoch": 0.4,
"grad_norm": 5.656431416505367,
"learning_rate": 1.995397994638391e-05,
"loss": 0.2867,
"step": 2500
},
{
"epoch": 0.4,
"grad_norm": 3.9781055866495945,
"learning_rate": 1.9951954496702084e-05,
"loss": 0.2842,
"step": 2520
},
{
"epoch": 0.4,
"grad_norm": 3.6294521746502917,
"learning_rate": 1.994988553661007e-05,
"loss": 0.2672,
"step": 2540
},
{
"epoch": 0.4,
"grad_norm": 4.51151201839679,
"learning_rate": 1.9947773075154352e-05,
"loss": 0.2904,
"step": 2560
},
{
"epoch": 0.41,
"grad_norm": 4.985296947393056,
"learning_rate": 1.9945617121571655e-05,
"loss": 0.2999,
"step": 2580
},
{
"epoch": 0.41,
"grad_norm": 3.674752134930168,
"learning_rate": 1.9943417685288848e-05,
"loss": 0.2785,
"step": 2600
},
{
"epoch": 0.41,
"grad_norm": 4.797788651028868,
"learning_rate": 1.9941174775922932e-05,
"loss": 0.2983,
"step": 2620
},
{
"epoch": 0.42,
"grad_norm": 4.079696177221718,
"learning_rate": 1.9938888403281006e-05,
"loss": 0.2777,
"step": 2640
},
{
"epoch": 0.42,
"grad_norm": 3.3610527639517755,
"learning_rate": 1.9936558577360198e-05,
"loss": 0.2956,
"step": 2660
},
{
"epoch": 0.42,
"grad_norm": 7.188852464890547,
"learning_rate": 1.993418530834764e-05,
"loss": 0.2829,
"step": 2680
},
{
"epoch": 0.43,
"grad_norm": 3.360659331617603,
"learning_rate": 1.993176860662041e-05,
"loss": 0.292,
"step": 2700
},
{
"epoch": 0.43,
"grad_norm": 4.107038754867596,
"learning_rate": 1.9929308482745514e-05,
"loss": 0.2694,
"step": 2720
},
{
"epoch": 0.43,
"grad_norm": 3.9890648859407647,
"learning_rate": 1.9926804947479808e-05,
"loss": 0.2962,
"step": 2740
},
{
"epoch": 0.44,
"grad_norm": 4.103890708256313,
"learning_rate": 1.9924258011769957e-05,
"loss": 0.305,
"step": 2760
},
{
"epoch": 0.44,
"grad_norm": 3.6344296373375182,
"learning_rate": 1.9921667686752412e-05,
"loss": 0.2868,
"step": 2780
},
{
"epoch": 0.44,
"grad_norm": 4.132484812446097,
"learning_rate": 1.9919033983753325e-05,
"loss": 0.3088,
"step": 2800
},
{
"epoch": 0.45,
"grad_norm": 4.602803346333572,
"learning_rate": 1.991635691428853e-05,
"loss": 0.2908,
"step": 2820
},
{
"epoch": 0.45,
"grad_norm": 3.553835454417207,
"learning_rate": 1.9913636490063475e-05,
"loss": 0.2959,
"step": 2840
},
{
"epoch": 0.45,
"grad_norm": 3.3951346647945857,
"learning_rate": 1.991087272297318e-05,
"loss": 0.2857,
"step": 2860
},
{
"epoch": 0.46,
"grad_norm": 5.5117950542313014,
"learning_rate": 1.9908065625102174e-05,
"loss": 0.3072,
"step": 2880
},
{
"epoch": 0.46,
"grad_norm": 4.238784802315611,
"learning_rate": 1.9905215208724454e-05,
"loss": 0.2781,
"step": 2900
},
{
"epoch": 0.46,
"grad_norm": 3.5164479288590287,
"learning_rate": 1.990232148630343e-05,
"loss": 0.2794,
"step": 2920
},
{
"epoch": 0.46,
"grad_norm": 4.085534858193557,
"learning_rate": 1.9899384470491854e-05,
"loss": 0.2858,
"step": 2940
},
{
"epoch": 0.47,
"grad_norm": 3.5369416675799563,
"learning_rate": 1.98964041741318e-05,
"loss": 0.2672,
"step": 2960
},
{
"epoch": 0.47,
"grad_norm": 4.60995552157379,
"learning_rate": 1.989338061025456e-05,
"loss": 0.2868,
"step": 2980
},
{
"epoch": 0.47,
"grad_norm": 3.3514909358709444,
"learning_rate": 1.989031379208063e-05,
"loss": 0.2723,
"step": 3000
},
{
"epoch": 0.48,
"grad_norm": 3.923203323163403,
"learning_rate": 1.9887203733019632e-05,
"loss": 0.28,
"step": 3020
},
{
"epoch": 0.48,
"grad_norm": 3.429027033234547,
"learning_rate": 1.9884050446670256e-05,
"loss": 0.2952,
"step": 3040
},
{
"epoch": 0.48,
"grad_norm": 3.5801457858104397,
"learning_rate": 1.9880853946820197e-05,
"loss": 0.2804,
"step": 3060
},
{
"epoch": 0.49,
"grad_norm": 4.712617033863597,
"learning_rate": 1.9877614247446116e-05,
"loss": 0.2892,
"step": 3080
},
{
"epoch": 0.49,
"grad_norm": 4.933700451637125,
"learning_rate": 1.987433136271354e-05,
"loss": 0.2717,
"step": 3100
},
{
"epoch": 0.49,
"grad_norm": 4.790425988609337,
"learning_rate": 1.9871005306976846e-05,
"loss": 0.2695,
"step": 3120
},
{
"epoch": 0.5,
"grad_norm": 3.7466637075374045,
"learning_rate": 1.9867636094779166e-05,
"loss": 0.2624,
"step": 3140
},
{
"epoch": 0.5,
"grad_norm": 3.50365965112591,
"learning_rate": 1.9864223740852334e-05,
"loss": 0.2844,
"step": 3160
},
{
"epoch": 0.5,
"grad_norm": 4.569034860813186,
"learning_rate": 1.9860768260116815e-05,
"loss": 0.2905,
"step": 3180
},
{
"epoch": 0.51,
"grad_norm": 4.170649316306094,
"learning_rate": 1.9857269667681655e-05,
"loss": 0.2674,
"step": 3200
},
{
"epoch": 0.51,
"grad_norm": 4.434905247240572,
"learning_rate": 1.98537279788444e-05,
"loss": 0.2794,
"step": 3220
},
{
"epoch": 0.51,
"grad_norm": 3.9321442768019623,
"learning_rate": 1.9850143209091034e-05,
"loss": 0.2881,
"step": 3240
},
{
"epoch": 0.52,
"grad_norm": 5.363885077307175,
"learning_rate": 1.9846515374095914e-05,
"loss": 0.2858,
"step": 3260
},
{
"epoch": 0.52,
"grad_norm": 4.191427182787896,
"learning_rate": 1.98428444897217e-05,
"loss": 0.2816,
"step": 3280
},
{
"epoch": 0.52,
"grad_norm": 4.006135526500501,
"learning_rate": 1.983913057201928e-05,
"loss": 0.2755,
"step": 3300
},
{
"epoch": 0.52,
"grad_norm": 3.8000033328393075,
"learning_rate": 1.9835373637227703e-05,
"loss": 0.2733,
"step": 3320
},
{
"epoch": 0.53,
"grad_norm": 5.333992420638842,
"learning_rate": 1.9831573701774123e-05,
"loss": 0.2779,
"step": 3340
},
{
"epoch": 0.53,
"grad_norm": 4.252307489826801,
"learning_rate": 1.9827730782273703e-05,
"loss": 0.2592,
"step": 3360
},
{
"epoch": 0.53,
"grad_norm": 3.525549279021786,
"learning_rate": 1.982384489552955e-05,
"loss": 0.2739,
"step": 3380
},
{
"epoch": 0.54,
"grad_norm": 7.6522667312835475,
"learning_rate": 1.9819916058532657e-05,
"loss": 0.2816,
"step": 3400
},
{
"epoch": 0.54,
"grad_norm": 11.523203122616478,
"learning_rate": 1.98159442884618e-05,
"loss": 0.2715,
"step": 3420
},
{
"epoch": 0.54,
"grad_norm": 4.787543449503492,
"learning_rate": 1.9811929602683497e-05,
"loss": 0.2618,
"step": 3440
},
{
"epoch": 0.55,
"grad_norm": 5.207304138221566,
"learning_rate": 1.9807872018751904e-05,
"loss": 0.254,
"step": 3460
},
{
"epoch": 0.55,
"grad_norm": 3.4061625058197453,
"learning_rate": 1.9803771554408745e-05,
"loss": 0.2526,
"step": 3480
},
{
"epoch": 0.55,
"grad_norm": 4.380956899549788,
"learning_rate": 1.9799628227583248e-05,
"loss": 0.2797,
"step": 3500
},
{
"epoch": 0.56,
"grad_norm": 4.127558290565574,
"learning_rate": 1.9795442056392054e-05,
"loss": 0.2712,
"step": 3520
},
{
"epoch": 0.56,
"grad_norm": 4.10767355386799,
"learning_rate": 1.9791213059139132e-05,
"loss": 0.2832,
"step": 3540
},
{
"epoch": 0.56,
"grad_norm": 3.662712320155948,
"learning_rate": 1.978694125431572e-05,
"loss": 0.2663,
"step": 3560
},
{
"epoch": 0.57,
"grad_norm": 3.8370942587808607,
"learning_rate": 1.978262666060022e-05,
"loss": 0.2796,
"step": 3580
},
{
"epoch": 0.57,
"grad_norm": 3.3559744723580143,
"learning_rate": 1.9778269296858138e-05,
"loss": 0.2758,
"step": 3600
},
{
"epoch": 0.57,
"grad_norm": 4.912706342639339,
"learning_rate": 1.977386918214198e-05,
"loss": 0.2691,
"step": 3620
},
{
"epoch": 0.58,
"grad_norm": 3.827769733993231,
"learning_rate": 1.9769426335691194e-05,
"loss": 0.2888,
"step": 3640
},
{
"epoch": 0.58,
"grad_norm": 8.974780882814423,
"learning_rate": 1.9764940776932057e-05,
"loss": 0.279,
"step": 3660
},
{
"epoch": 0.58,
"grad_norm": 7.288399652923296,
"learning_rate": 1.9760412525477615e-05,
"loss": 0.27,
"step": 3680
},
{
"epoch": 0.58,
"grad_norm": 5.013042224656741,
"learning_rate": 1.9755841601127587e-05,
"loss": 0.2635,
"step": 3700
},
{
"epoch": 0.59,
"grad_norm": 4.113083011382156,
"learning_rate": 1.9751228023868275e-05,
"loss": 0.2742,
"step": 3720
},
{
"epoch": 0.59,
"grad_norm": 3.8810313877329516,
"learning_rate": 1.974657181387248e-05,
"loss": 0.2608,
"step": 3740
},
{
"epoch": 0.59,
"grad_norm": 4.321527293044816,
"learning_rate": 1.974187299149942e-05,
"loss": 0.2642,
"step": 3760
},
{
"epoch": 0.6,
"grad_norm": 4.507987668552812,
"learning_rate": 1.973713157729462e-05,
"loss": 0.2839,
"step": 3780
},
{
"epoch": 0.6,
"grad_norm": 4.112442322300466,
"learning_rate": 1.9732347591989863e-05,
"loss": 0.2767,
"step": 3800
},
{
"epoch": 0.6,
"grad_norm": 3.9228516840537075,
"learning_rate": 1.972752105650304e-05,
"loss": 0.2516,
"step": 3820
},
{
"epoch": 0.61,
"grad_norm": 3.9874693579378344,
"learning_rate": 1.972265199193813e-05,
"loss": 0.2875,
"step": 3840
},
{
"epoch": 0.61,
"grad_norm": 5.032079024162868,
"learning_rate": 1.9717740419585033e-05,
"loss": 0.266,
"step": 3860
},
{
"epoch": 0.61,
"grad_norm": 3.5996549440957164,
"learning_rate": 1.9712786360919543e-05,
"loss": 0.2548,
"step": 3880
},
{
"epoch": 0.62,
"grad_norm": 4.683794515218512,
"learning_rate": 1.9707789837603205e-05,
"loss": 0.2701,
"step": 3900
},
{
"epoch": 0.62,
"grad_norm": 4.52037247566233,
"learning_rate": 1.9702750871483248e-05,
"loss": 0.261,
"step": 3920
},
{
"epoch": 0.62,
"grad_norm": 4.5693573304091535,
"learning_rate": 1.9697669484592487e-05,
"loss": 0.2568,
"step": 3940
},
{
"epoch": 0.63,
"grad_norm": 3.48060438103784,
"learning_rate": 1.9692545699149212e-05,
"loss": 0.2545,
"step": 3960
},
{
"epoch": 0.63,
"grad_norm": 4.719616538813032,
"learning_rate": 1.9687379537557107e-05,
"loss": 0.2676,
"step": 3980
},
{
"epoch": 0.63,
"grad_norm": 4.593894281521921,
"learning_rate": 1.9682171022405133e-05,
"loss": 0.2803,
"step": 4000
},
{
"epoch": 0.64,
"grad_norm": 3.2767058377982203,
"learning_rate": 1.967692017646746e-05,
"loss": 0.258,
"step": 4020
},
{
"epoch": 0.64,
"grad_norm": 4.102789530001537,
"learning_rate": 1.9671627022703333e-05,
"loss": 0.2804,
"step": 4040
},
{
"epoch": 0.64,
"grad_norm": 3.747557671264137,
"learning_rate": 1.9666291584256995e-05,
"loss": 0.275,
"step": 4060
},
{
"epoch": 0.64,
"grad_norm": 4.527128041668808,
"learning_rate": 1.9660913884457572e-05,
"loss": 0.2637,
"step": 4080
},
{
"epoch": 0.65,
"grad_norm": 6.26616500848599,
"learning_rate": 1.965549394681899e-05,
"loss": 0.2769,
"step": 4100
},
{
"epoch": 0.65,
"grad_norm": 4.208333585128525,
"learning_rate": 1.9650031795039847e-05,
"loss": 0.2506,
"step": 4120
},
{
"epoch": 0.65,
"grad_norm": 3.770191763293591,
"learning_rate": 1.9644527453003326e-05,
"loss": 0.2716,
"step": 4140
},
{
"epoch": 0.66,
"grad_norm": 8.625265366162292,
"learning_rate": 1.9638980944777085e-05,
"loss": 0.2831,
"step": 4160
},
{
"epoch": 0.66,
"grad_norm": 3.9442544557070285,
"learning_rate": 1.9633392294613155e-05,
"loss": 0.2777,
"step": 4180
},
{
"epoch": 0.66,
"grad_norm": 3.273256865914117,
"learning_rate": 1.962776152694783e-05,
"loss": 0.2507,
"step": 4200
},
{
"epoch": 0.67,
"grad_norm": 3.7930612759054445,
"learning_rate": 1.9622088666401566e-05,
"loss": 0.2545,
"step": 4220
},
{
"epoch": 0.67,
"grad_norm": 3.605998255572159,
"learning_rate": 1.9616373737778864e-05,
"loss": 0.2718,
"step": 4240
},
{
"epoch": 0.67,
"grad_norm": 3.3060364912064966,
"learning_rate": 1.961061676606817e-05,
"loss": 0.2615,
"step": 4260
},
{
"epoch": 0.68,
"grad_norm": 3.180581756735597,
"learning_rate": 1.9604817776441762e-05,
"loss": 0.2475,
"step": 4280
},
{
"epoch": 0.68,
"grad_norm": 4.109627023373317,
"learning_rate": 1.9598976794255647e-05,
"loss": 0.2781,
"step": 4300
},
{
"epoch": 0.68,
"grad_norm": 3.8246626162863007,
"learning_rate": 1.9593093845049435e-05,
"loss": 0.2797,
"step": 4320
},
{
"epoch": 0.69,
"grad_norm": 3.880680350794998,
"learning_rate": 1.9587168954546233e-05,
"loss": 0.2728,
"step": 4340
},
{
"epoch": 0.69,
"grad_norm": 3.504579545295932,
"learning_rate": 1.9581202148652555e-05,
"loss": 0.2517,
"step": 4360
},
{
"epoch": 0.69,
"grad_norm": 3.251503083301662,
"learning_rate": 1.957519345345817e-05,
"loss": 0.234,
"step": 4380
},
{
"epoch": 0.7,
"grad_norm": 3.8245770887559467,
"learning_rate": 1.9569142895236014e-05,
"loss": 0.2691,
"step": 4400
},
{
"epoch": 0.7,
"grad_norm": 3.7480270281848163,
"learning_rate": 1.9563050500442067e-05,
"loss": 0.2563,
"step": 4420
},
{
"epoch": 0.7,
"grad_norm": 3.6499757321739805,
"learning_rate": 1.9556916295715248e-05,
"loss": 0.2599,
"step": 4440
},
{
"epoch": 0.71,
"grad_norm": 4.289401570798809,
"learning_rate": 1.955074030787727e-05,
"loss": 0.2494,
"step": 4460
},
{
"epoch": 0.71,
"grad_norm": 4.463444688322885,
"learning_rate": 1.9544522563932567e-05,
"loss": 0.2503,
"step": 4480
},
{
"epoch": 0.71,
"grad_norm": 3.4818768633740076,
"learning_rate": 1.953826309106813e-05,
"loss": 0.2569,
"step": 4500
},
{
"epoch": 0.71,
"grad_norm": 4.094470710503364,
"learning_rate": 1.9531961916653416e-05,
"loss": 0.2722,
"step": 4520
},
{
"epoch": 0.72,
"grad_norm": 4.500248160407271,
"learning_rate": 1.9525619068240227e-05,
"loss": 0.2525,
"step": 4540
},
{
"epoch": 0.72,
"grad_norm": 3.482370390112968,
"learning_rate": 1.951923457356258e-05,
"loss": 0.2534,
"step": 4560
},
{
"epoch": 0.72,
"grad_norm": 5.274501845715762,
"learning_rate": 1.9512808460536586e-05,
"loss": 0.2454,
"step": 4580
},
{
"epoch": 0.73,
"grad_norm": 4.177646944576608,
"learning_rate": 1.9506340757260332e-05,
"loss": 0.2474,
"step": 4600
},
{
"epoch": 0.73,
"grad_norm": 3.56021633905292,
"learning_rate": 1.9499831492013772e-05,
"loss": 0.2744,
"step": 4620
},
{
"epoch": 0.73,
"grad_norm": 3.485877700901466,
"learning_rate": 1.9493280693258565e-05,
"loss": 0.2741,
"step": 4640
},
{
"epoch": 0.74,
"grad_norm": 4.495079741486709,
"learning_rate": 1.9486688389637993e-05,
"loss": 0.2399,
"step": 4660
},
{
"epoch": 0.74,
"grad_norm": 3.879830579696546,
"learning_rate": 1.9480054609976815e-05,
"loss": 0.2719,
"step": 4680
},
{
"epoch": 0.74,
"grad_norm": 3.683250176597202,
"learning_rate": 1.9473379383281136e-05,
"loss": 0.2647,
"step": 4700
},
{
"epoch": 0.75,
"grad_norm": 3.4564441591881585,
"learning_rate": 1.9466662738738295e-05,
"loss": 0.2614,
"step": 4720
},
{
"epoch": 0.75,
"grad_norm": 3.5430190851406516,
"learning_rate": 1.945990470571672e-05,
"loss": 0.2509,
"step": 4740
},
{
"epoch": 0.75,
"grad_norm": 3.594438702478659,
"learning_rate": 1.945310531376582e-05,
"loss": 0.249,
"step": 4760
},
{
"epoch": 0.76,
"grad_norm": 3.765747448208518,
"learning_rate": 1.944626459261585e-05,
"loss": 0.2503,
"step": 4780
},
{
"epoch": 0.76,
"grad_norm": 4.2901200899686405,
"learning_rate": 1.9439382572177755e-05,
"loss": 0.2424,
"step": 4800
},
{
"epoch": 0.76,
"grad_norm": 3.476240164568998,
"learning_rate": 1.9432459282543085e-05,
"loss": 0.264,
"step": 4820
},
{
"epoch": 0.77,
"grad_norm": 3.670630970694611,
"learning_rate": 1.942549475398382e-05,
"loss": 0.2663,
"step": 4840
},
{
"epoch": 0.77,
"grad_norm": 3.7957980665515034,
"learning_rate": 1.941848901695227e-05,
"loss": 0.2621,
"step": 4860
},
{
"epoch": 0.77,
"grad_norm": 4.793895081633612,
"learning_rate": 1.9411442102080914e-05,
"loss": 0.2498,
"step": 4880
},
{
"epoch": 0.77,
"grad_norm": 3.464785397972565,
"learning_rate": 1.9404354040182302e-05,
"loss": 0.242,
"step": 4900
},
{
"epoch": 0.78,
"grad_norm": 3.486787653538589,
"learning_rate": 1.9397224862248875e-05,
"loss": 0.2623,
"step": 4920
},
{
"epoch": 0.78,
"grad_norm": 4.089389487964841,
"learning_rate": 1.939005459945287e-05,
"loss": 0.2543,
"step": 4940
},
{
"epoch": 0.78,
"grad_norm": 3.1762519290562414,
"learning_rate": 1.9382843283146158e-05,
"loss": 0.2654,
"step": 4960
},
{
"epoch": 0.79,
"grad_norm": 3.835699443072975,
"learning_rate": 1.937559094486012e-05,
"loss": 0.2743,
"step": 4980
},
{
"epoch": 0.79,
"grad_norm": 3.8852244202589334,
"learning_rate": 1.936829761630551e-05,
"loss": 0.2629,
"step": 5000
},
{
"epoch": 0.79,
"grad_norm": 3.1906813658574835,
"learning_rate": 1.93609633293723e-05,
"loss": 0.2417,
"step": 5020
},
{
"epoch": 0.8,
"grad_norm": 2.3269103292999986,
"learning_rate": 1.9353588116129556e-05,
"loss": 0.218,
"step": 5040
},
{
"epoch": 0.8,
"grad_norm": 3.2694392219817927,
"learning_rate": 1.9346172008825302e-05,
"loss": 0.2321,
"step": 5060
},
{
"epoch": 0.8,
"grad_norm": 2.7636752394800634,
"learning_rate": 1.9338715039886357e-05,
"loss": 0.2112,
"step": 5080
},
{
"epoch": 0.81,
"grad_norm": 1.7461733533221422,
"learning_rate": 1.9331217241918223e-05,
"loss": 0.2065,
"step": 5100
},
{
"epoch": 0.81,
"grad_norm": 2.1112769700310166,
"learning_rate": 1.9323678647704908e-05,
"loss": 0.2178,
"step": 5120
},
{
"epoch": 0.81,
"grad_norm": 1.928506961128503,
"learning_rate": 1.9316099290208812e-05,
"loss": 0.2222,
"step": 5140
},
{
"epoch": 0.82,
"grad_norm": 2.289733371787946,
"learning_rate": 1.9308479202570575e-05,
"loss": 0.2331,
"step": 5160
},
{
"epoch": 0.82,
"grad_norm": 1.997736424764338,
"learning_rate": 1.9300818418108923e-05,
"loss": 0.2133,
"step": 5180
},
{
"epoch": 0.82,
"grad_norm": 2.260897208845383,
"learning_rate": 1.9293116970320528e-05,
"loss": 0.2092,
"step": 5200
},
{
"epoch": 0.83,
"grad_norm": 2.01842381348837,
"learning_rate": 1.928537489287986e-05,
"loss": 0.2189,
"step": 5220
},
{
"epoch": 0.83,
"grad_norm": 2.377313290393669,
"learning_rate": 1.927759221963905e-05,
"loss": 0.2374,
"step": 5240
},
{
"epoch": 0.83,
"grad_norm": 2.431465602229034,
"learning_rate": 1.926976898462773e-05,
"loss": 0.2219,
"step": 5260
},
{
"epoch": 0.83,
"grad_norm": 2.272111328930043,
"learning_rate": 1.9261905222052882e-05,
"loss": 0.2182,
"step": 5280
},
{
"epoch": 0.84,
"grad_norm": 2.1252795764788788,
"learning_rate": 1.9254000966298702e-05,
"loss": 0.2219,
"step": 5300
},
{
"epoch": 0.84,
"grad_norm": 2.1286718396965143,
"learning_rate": 1.924605625192643e-05,
"loss": 0.2181,
"step": 5320
},
{
"epoch": 0.84,
"grad_norm": 2.263405074029898,
"learning_rate": 1.923807111367423e-05,
"loss": 0.2082,
"step": 5340
},
{
"epoch": 0.85,
"grad_norm": 2.349777405930586,
"learning_rate": 1.9230045586457004e-05,
"loss": 0.2148,
"step": 5360
},
{
"epoch": 0.85,
"grad_norm": 1.951720991944945,
"learning_rate": 1.9221979705366256e-05,
"loss": 0.2177,
"step": 5380
},
{
"epoch": 0.85,
"grad_norm": 2.6396096890364915,
"learning_rate": 1.9213873505669944e-05,
"loss": 0.224,
"step": 5400
},
{
"epoch": 0.86,
"grad_norm": 2.340017372784151,
"learning_rate": 1.9205727022812307e-05,
"loss": 0.2149,
"step": 5420
},
{
"epoch": 0.86,
"grad_norm": 2.489578364113247,
"learning_rate": 1.9197540292413734e-05,
"loss": 0.2229,
"step": 5440
},
{
"epoch": 0.86,
"grad_norm": 1.9693131625322577,
"learning_rate": 1.9189313350270585e-05,
"loss": 0.2114,
"step": 5460
},
{
"epoch": 0.87,
"grad_norm": 2.3178735109826505,
"learning_rate": 1.9181046232355053e-05,
"loss": 0.212,
"step": 5480
},
{
"epoch": 0.87,
"grad_norm": 2.1928905009683133,
"learning_rate": 1.9172738974814993e-05,
"loss": 0.2176,
"step": 5500
},
{
"epoch": 0.87,
"grad_norm": 2.188333080240406,
"learning_rate": 1.9164391613973778e-05,
"loss": 0.2077,
"step": 5520
},
{
"epoch": 0.88,
"grad_norm": 2.3037514030133517,
"learning_rate": 1.9156004186330123e-05,
"loss": 0.2121,
"step": 5540
},
{
"epoch": 0.88,
"grad_norm": 5.37450956724229,
"learning_rate": 1.914757672855794e-05,
"loss": 0.2095,
"step": 5560
},
{
"epoch": 0.88,
"grad_norm": 2.040655690264704,
"learning_rate": 1.9139109277506173e-05,
"loss": 0.1993,
"step": 5580
},
{
"epoch": 0.89,
"grad_norm": 2.0548550497572076,
"learning_rate": 1.9130601870198633e-05,
"loss": 0.2203,
"step": 5600
},
{
"epoch": 0.89,
"grad_norm": 2.8075751525385924,
"learning_rate": 1.912205454383384e-05,
"loss": 0.2255,
"step": 5620
},
{
"epoch": 0.89,
"grad_norm": 2.213380252712336,
"learning_rate": 1.9113467335784855e-05,
"loss": 0.2192,
"step": 5640
},
{
"epoch": 0.89,
"grad_norm": 2.7606591076219633,
"learning_rate": 1.9104840283599136e-05,
"loss": 0.2245,
"step": 5660
},
{
"epoch": 0.9,
"grad_norm": 2.1216745722948787,
"learning_rate": 1.909617342499834e-05,
"loss": 0.2071,
"step": 5680
},
{
"epoch": 0.9,
"grad_norm": 2.2785095057805065,
"learning_rate": 1.908746679787819e-05,
"loss": 0.2172,
"step": 5700
},
{
"epoch": 0.9,
"grad_norm": 2.2981748471055576,
"learning_rate": 1.907872044030829e-05,
"loss": 0.2099,
"step": 5720
},
{
"epoch": 0.91,
"grad_norm": 1.960812811667782,
"learning_rate": 1.9069934390531962e-05,
"loss": 0.2146,
"step": 5740
},
{
"epoch": 0.91,
"grad_norm": 3.794717686019522,
"learning_rate": 1.90611086869661e-05,
"loss": 0.208,
"step": 5760
},
{
"epoch": 0.91,
"grad_norm": 2.6008278665808033,
"learning_rate": 1.9052243368200958e-05,
"loss": 0.2134,
"step": 5780
},
{
"epoch": 0.92,
"grad_norm": 2.4705009528511277,
"learning_rate": 1.9043338473000025e-05,
"loss": 0.2253,
"step": 5800
},
{
"epoch": 0.92,
"grad_norm": 2.401514124088343,
"learning_rate": 1.9034394040299827e-05,
"loss": 0.2173,
"step": 5820
},
{
"epoch": 0.92,
"grad_norm": 3.1101268070286685,
"learning_rate": 1.9025410109209777e-05,
"loss": 0.2133,
"step": 5840
},
{
"epoch": 0.93,
"grad_norm": 2.7159957966217187,
"learning_rate": 1.9016386719011982e-05,
"loss": 0.2022,
"step": 5860
},
{
"epoch": 0.93,
"grad_norm": 2.385988014350966,
"learning_rate": 1.90073239091611e-05,
"loss": 0.2164,
"step": 5880
},
{
"epoch": 0.93,
"grad_norm": 2.1587806998998325,
"learning_rate": 1.899822171928413e-05,
"loss": 0.203,
"step": 5900
},
{
"epoch": 0.94,
"grad_norm": 2.2767020052526497,
"learning_rate": 1.8989080189180278e-05,
"loss": 0.2216,
"step": 5920
},
{
"epoch": 0.94,
"grad_norm": 3.090507426402487,
"learning_rate": 1.8979899358820756e-05,
"loss": 0.2045,
"step": 5940
},
{
"epoch": 0.94,
"grad_norm": 1.777654599684145,
"learning_rate": 1.8970679268348617e-05,
"loss": 0.2017,
"step": 5960
},
{
"epoch": 0.95,
"grad_norm": 2.188623412353316,
"learning_rate": 1.8961419958078577e-05,
"loss": 0.2122,
"step": 5980
},
{
"epoch": 0.95,
"grad_norm": 2.262035920826766,
"learning_rate": 1.8952121468496842e-05,
"loss": 0.2149,
"step": 6000
},
{
"epoch": 0.95,
"grad_norm": 2.1025895229154403,
"learning_rate": 1.894278384026093e-05,
"loss": 0.2075,
"step": 6020
},
{
"epoch": 0.95,
"grad_norm": 2.350054387214,
"learning_rate": 1.893340711419949e-05,
"loss": 0.2026,
"step": 6040
},
{
"epoch": 0.96,
"grad_norm": 2.5056109405602704,
"learning_rate": 1.8923991331312125e-05,
"loss": 0.2003,
"step": 6060
},
{
"epoch": 0.96,
"grad_norm": 2.202062147526405,
"learning_rate": 1.891453653276921e-05,
"loss": 0.2078,
"step": 6080
},
{
"epoch": 0.96,
"grad_norm": 2.2246885746094445,
"learning_rate": 1.8905042759911734e-05,
"loss": 0.2065,
"step": 6100
},
{
"epoch": 0.97,
"grad_norm": 2.3053321066228554,
"learning_rate": 1.8895510054251074e-05,
"loss": 0.2266,
"step": 6120
},
{
"epoch": 0.97,
"grad_norm": 2.988781297520643,
"learning_rate": 1.888593845746886e-05,
"loss": 0.2168,
"step": 6140
},
{
"epoch": 0.97,
"grad_norm": 2.4575329679216167,
"learning_rate": 1.887632801141676e-05,
"loss": 0.1994,
"step": 6160
},
{
"epoch": 0.98,
"grad_norm": 1.974119765257406,
"learning_rate": 1.886667875811632e-05,
"loss": 0.2009,
"step": 6180
},
{
"epoch": 0.98,
"grad_norm": 2.250505072807692,
"learning_rate": 1.885699073975877e-05,
"loss": 0.2011,
"step": 6200
},
{
"epoch": 0.98,
"grad_norm": 3.4317109345564774,
"learning_rate": 1.8847263998704822e-05,
"loss": 0.214,
"step": 6220
},
{
"epoch": 0.99,
"grad_norm": 1.9321307572015396,
"learning_rate": 1.883749857748453e-05,
"loss": 0.2121,
"step": 6240
},
{
"epoch": 0.99,
"grad_norm": 2.44399716037142,
"learning_rate": 1.8827694518797058e-05,
"loss": 0.2121,
"step": 6260
},
{
"epoch": 0.99,
"grad_norm": 3.70267918474469,
"learning_rate": 1.881785186551051e-05,
"loss": 0.218,
"step": 6280
},
{
"epoch": 1.0,
"grad_norm": 2.2164340760047763,
"learning_rate": 1.880797066066176e-05,
"loss": 0.2018,
"step": 6300
},
{
"epoch": 1.0,
"grad_norm": 2.0350434314114065,
"learning_rate": 1.8798050947456237e-05,
"loss": 0.2146,
"step": 6320
},
{
"epoch": 1.0,
"grad_norm": 2.2542825844152614,
"learning_rate": 1.8788092769267742e-05,
"loss": 0.1966,
"step": 6340
},
{
"epoch": 1.01,
"grad_norm": 2.540485628816784,
"learning_rate": 1.877809616963828e-05,
"loss": 0.181,
"step": 6360
},
{
"epoch": 1.01,
"grad_norm": 2.0965630152825794,
"learning_rate": 1.8768061192277835e-05,
"loss": 0.1768,
"step": 6380
},
{
"epoch": 1.01,
"grad_norm": 1.832162295211555,
"learning_rate": 1.8757987881064214e-05,
"loss": 0.1841,
"step": 6400
},
{
"epoch": 1.01,
"grad_norm": 2.3417686734318877,
"learning_rate": 1.8747876280042826e-05,
"loss": 0.1748,
"step": 6420
},
{
"epoch": 1.02,
"grad_norm": 2.3993985614578657,
"learning_rate": 1.8737726433426505e-05,
"loss": 0.1806,
"step": 6440
},
{
"epoch": 1.02,
"grad_norm": 2.0282614216820862,
"learning_rate": 1.872753838559532e-05,
"loss": 0.1948,
"step": 6460
},
{
"epoch": 1.02,
"grad_norm": 2.8165988300739553,
"learning_rate": 1.8717312181096363e-05,
"loss": 0.175,
"step": 6480
},
{
"epoch": 1.03,
"grad_norm": 2.2451015768881493,
"learning_rate": 1.870704786464357e-05,
"loss": 0.1798,
"step": 6500
},
{
"epoch": 1.03,
"grad_norm": 2.1815849986963296,
"learning_rate": 1.869674548111753e-05,
"loss": 0.1831,
"step": 6520
},
{
"epoch": 1.03,
"grad_norm": 2.1640382620765846,
"learning_rate": 1.8686405075565258e-05,
"loss": 0.1829,
"step": 6540
},
{
"epoch": 1.04,
"grad_norm": 2.9145132743989945,
"learning_rate": 1.8676026693200048e-05,
"loss": 0.1747,
"step": 6560
},
{
"epoch": 1.04,
"grad_norm": 2.045497051914788,
"learning_rate": 1.8665610379401226e-05,
"loss": 0.1753,
"step": 6580
},
{
"epoch": 1.04,
"grad_norm": 1.7634716575081997,
"learning_rate": 1.8655156179713974e-05,
"loss": 0.1814,
"step": 6600
},
{
"epoch": 1.05,
"grad_norm": 1.7599637579437575,
"learning_rate": 1.8644664139849144e-05,
"loss": 0.1911,
"step": 6620
},
{
"epoch": 1.05,
"grad_norm": 2.120306666344395,
"learning_rate": 1.863413430568303e-05,
"loss": 0.1875,
"step": 6640
},
{
"epoch": 1.05,
"grad_norm": 1.639791731170639,
"learning_rate": 1.8623566723257188e-05,
"loss": 0.1784,
"step": 6660
},
{
"epoch": 1.06,
"grad_norm": 2.0062873480341943,
"learning_rate": 1.8612961438778225e-05,
"loss": 0.1764,
"step": 6680
},
{
"epoch": 1.06,
"grad_norm": 2.4050323576148234,
"learning_rate": 1.8602318498617592e-05,
"loss": 0.1782,
"step": 6700
},
{
"epoch": 1.06,
"grad_norm": 1.897036816703736,
"learning_rate": 1.8591637949311408e-05,
"loss": 0.1789,
"step": 6720
},
{
"epoch": 1.07,
"grad_norm": 3.1448228404692387,
"learning_rate": 1.8580919837560224e-05,
"loss": 0.1657,
"step": 6740
},
{
"epoch": 1.07,
"grad_norm": 1.979007580704552,
"learning_rate": 1.8570164210228826e-05,
"loss": 0.1773,
"step": 6760
},
{
"epoch": 1.07,
"grad_norm": 1.8928740814253107,
"learning_rate": 1.8559371114346058e-05,
"loss": 0.182,
"step": 6780
},
{
"epoch": 1.07,
"grad_norm": 2.1321312791081413,
"learning_rate": 1.854854059710457e-05,
"loss": 0.182,
"step": 6800
},
{
"epoch": 1.08,
"grad_norm": 2.488188785290195,
"learning_rate": 1.8537672705860653e-05,
"loss": 0.1871,
"step": 6820
},
{
"epoch": 1.08,
"grad_norm": 2.512962685869269,
"learning_rate": 1.8526767488134015e-05,
"loss": 0.1796,
"step": 6840
},
{
"epoch": 1.08,
"grad_norm": 2.457219103013576,
"learning_rate": 1.8515824991607562e-05,
"loss": 0.1768,
"step": 6860
},
{
"epoch": 1.09,
"grad_norm": 2.0058477265819272,
"learning_rate": 1.850484526412721e-05,
"loss": 0.1878,
"step": 6880
},
{
"epoch": 1.09,
"grad_norm": 1.968329732936287,
"learning_rate": 1.8493828353701666e-05,
"loss": 0.175,
"step": 6900
},
{
"epoch": 1.09,
"grad_norm": 2.2137462446632177,
"learning_rate": 1.8482774308502218e-05,
"loss": 0.1814,
"step": 6920
},
{
"epoch": 1.1,
"grad_norm": 2.1543171277309003,
"learning_rate": 1.8471683176862517e-05,
"loss": 0.1886,
"step": 6940
},
{
"epoch": 1.1,
"grad_norm": 4.139111281755878,
"learning_rate": 1.8460555007278392e-05,
"loss": 0.175,
"step": 6960
},
{
"epoch": 1.1,
"grad_norm": 2.054321604592727,
"learning_rate": 1.844938984840761e-05,
"loss": 0.1768,
"step": 6980
},
{
"epoch": 1.11,
"grad_norm": 2.256285682459467,
"learning_rate": 1.843818774906967e-05,
"loss": 0.1786,
"step": 7000
},
{
"epoch": 1.11,
"grad_norm": 2.5864624228248627,
"learning_rate": 1.8426948758245588e-05,
"loss": 0.1789,
"step": 7020
},
{
"epoch": 1.11,
"grad_norm": 2.2452602266240786,
"learning_rate": 1.8415672925077706e-05,
"loss": 0.1743,
"step": 7040
},
{
"epoch": 1.12,
"grad_norm": 1.905290268237584,
"learning_rate": 1.8404360298869443e-05,
"loss": 0.1789,
"step": 7060
},
{
"epoch": 1.12,
"grad_norm": 2.1523717750344824,
"learning_rate": 1.8393010929085106e-05,
"loss": 0.1716,
"step": 7080
},
{
"epoch": 1.12,
"grad_norm": 1.8556344369896958,
"learning_rate": 1.8381624865349644e-05,
"loss": 0.1814,
"step": 7100
},
{
"epoch": 1.13,
"grad_norm": 2.078547958125406,
"learning_rate": 1.837020215744847e-05,
"loss": 0.1774,
"step": 7120
},
{
"epoch": 1.13,
"grad_norm": 2.029658706613511,
"learning_rate": 1.8358742855327222e-05,
"loss": 0.1699,
"step": 7140
},
{
"epoch": 1.13,
"grad_norm": 2.2998633634367316,
"learning_rate": 1.8347247009091528e-05,
"loss": 0.1762,
"step": 7160
},
{
"epoch": 1.13,
"grad_norm": 2.2901708634113227,
"learning_rate": 1.8335714669006818e-05,
"loss": 0.1682,
"step": 7180
},
{
"epoch": 1.14,
"grad_norm": 1.6706682645288262,
"learning_rate": 1.8324145885498092e-05,
"loss": 0.1797,
"step": 7200
},
{
"epoch": 1.14,
"grad_norm": 1.9131372823635615,
"learning_rate": 1.8312540709149696e-05,
"loss": 0.1777,
"step": 7220
},
{
"epoch": 1.14,
"grad_norm": 2.07909984239155,
"learning_rate": 1.8300899190705098e-05,
"loss": 0.1752,
"step": 7240
},
{
"epoch": 1.15,
"grad_norm": 2.1760276778157857,
"learning_rate": 1.828922138106668e-05,
"loss": 0.1847,
"step": 7260
},
{
"epoch": 1.15,
"grad_norm": 2.247820116102049,
"learning_rate": 1.8277507331295495e-05,
"loss": 0.1899,
"step": 7280
},
{
"epoch": 1.15,
"grad_norm": 1.8558783470602038,
"learning_rate": 1.8265757092611075e-05,
"loss": 0.176,
"step": 7300
},
{
"epoch": 1.16,
"grad_norm": 2.198791896123805,
"learning_rate": 1.8253970716391166e-05,
"loss": 0.1781,
"step": 7320
},
{
"epoch": 1.16,
"grad_norm": 2.0878033811578196,
"learning_rate": 1.8242148254171532e-05,
"loss": 0.186,
"step": 7340
},
{
"epoch": 1.16,
"grad_norm": 2.0609054181173714,
"learning_rate": 1.8230289757645737e-05,
"loss": 0.1835,
"step": 7360
},
{
"epoch": 1.17,
"grad_norm": 2.458294711914135,
"learning_rate": 1.8218395278664876e-05,
"loss": 0.1746,
"step": 7380
},
{
"epoch": 1.17,
"grad_norm": 2.657085063045043,
"learning_rate": 1.8206464869237405e-05,
"loss": 0.2005,
"step": 7400
},
{
"epoch": 1.17,
"grad_norm": 2.4744370061009566,
"learning_rate": 1.819449858152887e-05,
"loss": 0.1793,
"step": 7420
},
{
"epoch": 1.18,
"grad_norm": 2.149043105341473,
"learning_rate": 1.8182496467861694e-05,
"loss": 0.1652,
"step": 7440
},
{
"epoch": 1.18,
"grad_norm": 1.9846578681762732,
"learning_rate": 1.8170458580714957e-05,
"loss": 0.1877,
"step": 7460
},
{
"epoch": 1.18,
"grad_norm": 1.8277088571957387,
"learning_rate": 1.815838497272415e-05,
"loss": 0.183,
"step": 7480
},
{
"epoch": 1.19,
"grad_norm": 1.9586834966474451,
"learning_rate": 1.814627569668096e-05,
"loss": 0.1766,
"step": 7500
},
{
"epoch": 1.19,
"grad_norm": 2.2511207120669097,
"learning_rate": 1.8134130805533027e-05,
"loss": 0.1767,
"step": 7520
},
{
"epoch": 1.19,
"grad_norm": 1.5549651238311348,
"learning_rate": 1.8121950352383714e-05,
"loss": 0.1955,
"step": 7540
},
{
"epoch": 1.2,
"grad_norm": 2.5865251040638744,
"learning_rate": 1.810973439049189e-05,
"loss": 0.1819,
"step": 7560
},
{
"epoch": 1.2,
"grad_norm": 2.399656899087326,
"learning_rate": 1.809748297327167e-05,
"loss": 0.1851,
"step": 7580
},
{
"epoch": 1.2,
"grad_norm": 2.5607074887948773,
"learning_rate": 1.8085196154292215e-05,
"loss": 0.1793,
"step": 7600
},
{
"epoch": 1.2,
"grad_norm": 2.251395428592012,
"learning_rate": 1.8072873987277463e-05,
"loss": 0.167,
"step": 7620
},
{
"epoch": 1.21,
"grad_norm": 1.9489827901942562,
"learning_rate": 1.8060516526105924e-05,
"loss": 0.1884,
"step": 7640
},
{
"epoch": 1.21,
"grad_norm": 2.0637425234570808,
"learning_rate": 1.804812382481042e-05,
"loss": 0.1876,
"step": 7660
},
{
"epoch": 1.21,
"grad_norm": 1.9347382212160897,
"learning_rate": 1.8035695937577863e-05,
"loss": 0.1726,
"step": 7680
},
{
"epoch": 1.22,
"grad_norm": 2.4247952482951596,
"learning_rate": 1.8023232918749026e-05,
"loss": 0.1779,
"step": 7700
},
{
"epoch": 1.22,
"grad_norm": 2.3176200478415843,
"learning_rate": 1.8010734822818278e-05,
"loss": 0.1837,
"step": 7720
},
{
"epoch": 1.22,
"grad_norm": 2.680156700051065,
"learning_rate": 1.7998201704433374e-05,
"loss": 0.1815,
"step": 7740
},
{
"epoch": 1.23,
"grad_norm": 1.7853843493153718,
"learning_rate": 1.7985633618395197e-05,
"loss": 0.1793,
"step": 7760
},
{
"epoch": 1.23,
"grad_norm": 2.064374130022476,
"learning_rate": 1.7973030619657535e-05,
"loss": 0.18,
"step": 7780
},
{
"epoch": 1.23,
"grad_norm": 2.694008332010934,
"learning_rate": 1.7960392763326813e-05,
"loss": 0.1787,
"step": 7800
},
{
"epoch": 1.24,
"grad_norm": 1.8953685050923013,
"learning_rate": 1.794772010466189e-05,
"loss": 0.1811,
"step": 7820
},
{
"epoch": 1.24,
"grad_norm": 2.3442305008755753,
"learning_rate": 1.7935012699073787e-05,
"loss": 0.1924,
"step": 7840
},
{
"epoch": 1.24,
"grad_norm": 2.4097855590450936,
"learning_rate": 1.7922270602125464e-05,
"loss": 0.171,
"step": 7860
},
{
"epoch": 1.25,
"grad_norm": 1.8537246377229875,
"learning_rate": 1.7909493869531555e-05,
"loss": 0.1838,
"step": 7880
},
{
"epoch": 1.25,
"grad_norm": 2.0039152922249697,
"learning_rate": 1.789668255715815e-05,
"loss": 0.1847,
"step": 7900
},
{
"epoch": 1.25,
"grad_norm": 1.529179165379362,
"learning_rate": 1.7883836721022534e-05,
"loss": 0.1728,
"step": 7920
},
{
"epoch": 1.26,
"grad_norm": 1.9763886145297185,
"learning_rate": 1.7870956417292945e-05,
"loss": 0.1688,
"step": 7940
},
{
"epoch": 1.26,
"grad_norm": 1.9066521892575954,
"learning_rate": 1.7858041702288335e-05,
"loss": 0.153,
"step": 7960
},
{
"epoch": 1.26,
"grad_norm": 1.7450878138703785,
"learning_rate": 1.784509263247811e-05,
"loss": 0.1893,
"step": 7980
},
{
"epoch": 1.26,
"grad_norm": 1.9620586700844385,
"learning_rate": 1.7832109264481904e-05,
"loss": 0.185,
"step": 8000
},
{
"epoch": 1.27,
"grad_norm": 1.8923898451681622,
"learning_rate": 1.7819091655069314e-05,
"loss": 0.1754,
"step": 8020
},
{
"epoch": 1.27,
"grad_norm": 2.0884487166296872,
"learning_rate": 1.7806039861159653e-05,
"loss": 0.1765,
"step": 8040
},
{
"epoch": 1.27,
"grad_norm": 2.222016415072701,
"learning_rate": 1.7792953939821702e-05,
"loss": 0.1788,
"step": 8060
},
{
"epoch": 1.28,
"grad_norm": 2.009439484553775,
"learning_rate": 1.7779833948273482e-05,
"loss": 0.1811,
"step": 8080
},
{
"epoch": 1.28,
"grad_norm": 2.785344314111028,
"learning_rate": 1.7766679943881966e-05,
"loss": 0.1701,
"step": 8100
},
{
"epoch": 1.28,
"grad_norm": 2.4016974843695453,
"learning_rate": 1.775349198416286e-05,
"loss": 0.1809,
"step": 8120
},
{
"epoch": 1.29,
"grad_norm": 5.230470573305321,
"learning_rate": 1.774027012678033e-05,
"loss": 0.1701,
"step": 8140
},
{
"epoch": 1.29,
"grad_norm": 2.373672415463304,
"learning_rate": 1.7727014429546762e-05,
"loss": 0.1832,
"step": 8160
},
{
"epoch": 1.29,
"grad_norm": 3.3302260379998776,
"learning_rate": 1.7713724950422516e-05,
"loss": 0.1756,
"step": 8180
},
{
"epoch": 1.3,
"grad_norm": 2.1278369142809925,
"learning_rate": 1.770040174751565e-05,
"loss": 0.1751,
"step": 8200
},
{
"epoch": 1.3,
"grad_norm": 2.1056194640732526,
"learning_rate": 1.7687044879081685e-05,
"loss": 0.1733,
"step": 8220
},
{
"epoch": 1.3,
"grad_norm": 1.9218053417452738,
"learning_rate": 1.7673654403523336e-05,
"loss": 0.1787,
"step": 8240
},
{
"epoch": 1.31,
"grad_norm": 2.015545274207546,
"learning_rate": 1.766023037939028e-05,
"loss": 0.175,
"step": 8260
},
{
"epoch": 1.31,
"grad_norm": 2.6233040008294544,
"learning_rate": 1.7646772865378873e-05,
"loss": 0.1662,
"step": 8280
},
{
"epoch": 1.31,
"grad_norm": 1.8332410774716108,
"learning_rate": 1.7633281920331906e-05,
"loss": 0.1838,
"step": 8300
},
{
"epoch": 1.32,
"grad_norm": 1.9451280989055022,
"learning_rate": 1.761975760323835e-05,
"loss": 0.1736,
"step": 8320
},
{
"epoch": 1.32,
"grad_norm": 2.3255879960964476,
"learning_rate": 1.76061999732331e-05,
"loss": 0.1656,
"step": 8340
},
{
"epoch": 1.32,
"grad_norm": 2.2343648630192363,
"learning_rate": 1.7592609089596685e-05,
"loss": 0.1779,
"step": 8360
},
{
"epoch": 1.32,
"grad_norm": 3.1163417653612955,
"learning_rate": 1.7578985011755077e-05,
"loss": 0.1715,
"step": 8380
},
{
"epoch": 1.33,
"grad_norm": 2.225480415684875,
"learning_rate": 1.7565327799279354e-05,
"loss": 0.1716,
"step": 8400
},
{
"epoch": 1.33,
"grad_norm": 2.626122370963583,
"learning_rate": 1.7551637511885494e-05,
"loss": 0.163,
"step": 8420
},
{
"epoch": 1.33,
"grad_norm": 2.338476937693113,
"learning_rate": 1.7537914209434085e-05,
"loss": 0.1827,
"step": 8440
},
{
"epoch": 1.34,
"grad_norm": 2.932171635630059,
"learning_rate": 1.752415795193008e-05,
"loss": 0.1637,
"step": 8460
},
{
"epoch": 1.34,
"grad_norm": 2.168193633641859,
"learning_rate": 1.7510368799522514e-05,
"loss": 0.1843,
"step": 8480
},
{
"epoch": 1.34,
"grad_norm": 2.0903535310262886,
"learning_rate": 1.7496546812504273e-05,
"loss": 0.179,
"step": 8500
},
{
"epoch": 1.35,
"grad_norm": 1.873644784081685,
"learning_rate": 1.7482692051311805e-05,
"loss": 0.1814,
"step": 8520
},
{
"epoch": 1.35,
"grad_norm": 1.9178052782056696,
"learning_rate": 1.7468804576524853e-05,
"loss": 0.1722,
"step": 8540
},
{
"epoch": 1.35,
"grad_norm": 2.2334060168419163,
"learning_rate": 1.7454884448866212e-05,
"loss": 0.1878,
"step": 8560
},
{
"epoch": 1.36,
"grad_norm": 2.1292565293829493,
"learning_rate": 1.7440931729201448e-05,
"loss": 0.1795,
"step": 8580
},
{
"epoch": 1.36,
"grad_norm": 2.423860978511733,
"learning_rate": 1.7426946478538626e-05,
"loss": 0.1675,
"step": 8600
},
{
"epoch": 1.36,
"grad_norm": 2.1047220409931877,
"learning_rate": 1.741292875802807e-05,
"loss": 0.1734,
"step": 8620
},
{
"epoch": 1.37,
"grad_norm": 2.1785335425805776,
"learning_rate": 1.7398878628962062e-05,
"loss": 0.1808,
"step": 8640
},
{
"epoch": 1.37,
"grad_norm": 1.854826356210005,
"learning_rate": 1.7384796152774602e-05,
"loss": 0.1833,
"step": 8660
},
{
"epoch": 1.37,
"grad_norm": 2.428632402856712,
"learning_rate": 1.737068139104111e-05,
"loss": 0.1736,
"step": 8680
},
{
"epoch": 1.38,
"grad_norm": 2.5679580432934412,
"learning_rate": 1.7356534405478197e-05,
"loss": 0.1775,
"step": 8700
},
{
"epoch": 1.38,
"grad_norm": 2.2642899361494124,
"learning_rate": 1.7342355257943354e-05,
"loss": 0.1696,
"step": 8720
},
{
"epoch": 1.38,
"grad_norm": 2.5767432931886947,
"learning_rate": 1.732814401043471e-05,
"loss": 0.186,
"step": 8740
},
{
"epoch": 1.38,
"grad_norm": 1.893321045994624,
"learning_rate": 1.7313900725090744e-05,
"loss": 0.1652,
"step": 8760
},
{
"epoch": 1.39,
"grad_norm": 1.7371063719634066,
"learning_rate": 1.7299625464190025e-05,
"loss": 0.1743,
"step": 8780
},
{
"epoch": 1.39,
"grad_norm": 2.3410836884450994,
"learning_rate": 1.7285318290150934e-05,
"loss": 0.1749,
"step": 8800
},
{
"epoch": 1.39,
"grad_norm": 2.0076324738418667,
"learning_rate": 1.727097926553139e-05,
"loss": 0.171,
"step": 8820
},
{
"epoch": 1.4,
"grad_norm": 2.114082790821531,
"learning_rate": 1.7256608453028577e-05,
"loss": 0.1826,
"step": 8840
},
{
"epoch": 1.4,
"grad_norm": 1.8298354013347164,
"learning_rate": 1.7242205915478677e-05,
"loss": 0.1694,
"step": 8860
},
{
"epoch": 1.4,
"grad_norm": 2.074394146818531,
"learning_rate": 1.722777171585658e-05,
"loss": 0.1709,
"step": 8880
},
{
"epoch": 1.41,
"grad_norm": 2.1343987124286183,
"learning_rate": 1.721330591727562e-05,
"loss": 0.174,
"step": 8900
},
{
"epoch": 1.41,
"grad_norm": 2.113311146861362,
"learning_rate": 1.7198808582987313e-05,
"loss": 0.1725,
"step": 8920
},
{
"epoch": 1.41,
"grad_norm": 2.201652738347589,
"learning_rate": 1.718427977638104e-05,
"loss": 0.1761,
"step": 8940
},
{
"epoch": 1.42,
"grad_norm": 2.0551908043411773,
"learning_rate": 1.7169719560983817e-05,
"loss": 0.1771,
"step": 8960
},
{
"epoch": 1.42,
"grad_norm": 1.4177831102654264,
"learning_rate": 1.7155128000459967e-05,
"loss": 0.1611,
"step": 8980
},
{
"epoch": 1.42,
"grad_norm": 1.974977060981519,
"learning_rate": 1.71405051586109e-05,
"loss": 0.1701,
"step": 9000
},
{
"epoch": 1.43,
"grad_norm": 2.394531449135901,
"learning_rate": 1.7125851099374784e-05,
"loss": 0.1841,
"step": 9020
},
{
"epoch": 1.43,
"grad_norm": 2.017565126183319,
"learning_rate": 1.7111165886826288e-05,
"loss": 0.1659,
"step": 9040
},
{
"epoch": 1.43,
"grad_norm": 1.9783212428806802,
"learning_rate": 1.70964495851763e-05,
"loss": 0.1807,
"step": 9060
},
{
"epoch": 1.44,
"grad_norm": 2.0181529434741,
"learning_rate": 1.708170225877165e-05,
"loss": 0.195,
"step": 9080
},
{
"epoch": 1.44,
"grad_norm": 2.2209977061102237,
"learning_rate": 1.7066923972094804e-05,
"loss": 0.1798,
"step": 9100
},
{
"epoch": 1.44,
"grad_norm": 2.132481028750825,
"learning_rate": 1.705211478976363e-05,
"loss": 0.171,
"step": 9120
},
{
"epoch": 1.44,
"grad_norm": 1.6746882362029545,
"learning_rate": 1.7037274776531064e-05,
"loss": 0.1917,
"step": 9140
},
{
"epoch": 1.45,
"grad_norm": 2.185315703710572,
"learning_rate": 1.702240399728486e-05,
"loss": 0.1655,
"step": 9160
},
{
"epoch": 1.45,
"grad_norm": 2.4321099376111084,
"learning_rate": 1.7007502517047293e-05,
"loss": 0.1778,
"step": 9180
},
{
"epoch": 1.45,
"grad_norm": 2.0789938809670385,
"learning_rate": 1.6992570400974876e-05,
"loss": 0.1795,
"step": 9200
},
{
"epoch": 1.46,
"grad_norm": 2.0820901085870127,
"learning_rate": 1.6977607714358085e-05,
"loss": 0.1691,
"step": 9220
},
{
"epoch": 1.46,
"grad_norm": 2.0783905777418834,
"learning_rate": 1.6962614522621047e-05,
"loss": 0.1758,
"step": 9240
},
{
"epoch": 1.46,
"grad_norm": 2.0755554604762887,
"learning_rate": 1.69475908913213e-05,
"loss": 0.1745,
"step": 9260
},
{
"epoch": 1.47,
"grad_norm": 3.1531529103037226,
"learning_rate": 1.693253688614945e-05,
"loss": 0.1754,
"step": 9280
},
{
"epoch": 1.47,
"grad_norm": 2.2993858263736064,
"learning_rate": 1.6917452572928936e-05,
"loss": 0.166,
"step": 9300
},
{
"epoch": 1.47,
"grad_norm": 1.75447681551487,
"learning_rate": 1.69023380176157e-05,
"loss": 0.178,
"step": 9320
},
{
"epoch": 1.48,
"grad_norm": 1.973995547249979,
"learning_rate": 1.688719328629793e-05,
"loss": 0.1799,
"step": 9340
},
{
"epoch": 1.48,
"grad_norm": 2.4126605558582397,
"learning_rate": 1.687201844519575e-05,
"loss": 0.1711,
"step": 9360
},
{
"epoch": 1.48,
"grad_norm": 1.9841098157654595,
"learning_rate": 1.685681356066094e-05,
"loss": 0.1767,
"step": 9380
},
{
"epoch": 1.49,
"grad_norm": 2.1398055464198746,
"learning_rate": 1.684157869917665e-05,
"loss": 0.1727,
"step": 9400
},
{
"epoch": 1.49,
"grad_norm": 1.9307719901389893,
"learning_rate": 1.6826313927357096e-05,
"loss": 0.1772,
"step": 9420
},
{
"epoch": 1.49,
"grad_norm": 1.8311039706909333,
"learning_rate": 1.681101931194729e-05,
"loss": 0.172,
"step": 9440
},
{
"epoch": 1.5,
"grad_norm": 2.1030690065273228,
"learning_rate": 1.6795694919822713e-05,
"loss": 0.1681,
"step": 9460
},
{
"epoch": 1.5,
"grad_norm": 2.295817125082063,
"learning_rate": 1.6780340817989067e-05,
"loss": 0.1756,
"step": 9480
},
{
"epoch": 1.5,
"grad_norm": 2.126975655764791,
"learning_rate": 1.6764957073581937e-05,
"loss": 0.1705,
"step": 9500
},
{
"epoch": 1.5,
"grad_norm": 2.1165992510815443,
"learning_rate": 1.6749543753866544e-05,
"loss": 0.1769,
"step": 9520
},
{
"epoch": 1.51,
"grad_norm": 2.2911343620991875,
"learning_rate": 1.6734100926237405e-05,
"loss": 0.1786,
"step": 9540
},
{
"epoch": 1.51,
"grad_norm": 1.9750133369092064,
"learning_rate": 1.6718628658218078e-05,
"loss": 0.1787,
"step": 9560
},
{
"epoch": 1.51,
"grad_norm": 2.1096785715440003,
"learning_rate": 1.670312701746083e-05,
"loss": 0.1746,
"step": 9580
},
{
"epoch": 1.52,
"grad_norm": 2.0647784277246126,
"learning_rate": 1.6687596071746376e-05,
"loss": 0.173,
"step": 9600
},
{
"epoch": 1.52,
"grad_norm": 1.7141270975686904,
"learning_rate": 1.667203588898356e-05,
"loss": 0.17,
"step": 9620
},
{
"epoch": 1.52,
"grad_norm": 2.1624948490500433,
"learning_rate": 1.665644653720906e-05,
"loss": 0.1746,
"step": 9640
},
{
"epoch": 1.53,
"grad_norm": 2.2815311206414797,
"learning_rate": 1.6640828084587104e-05,
"loss": 0.1811,
"step": 9660
},
{
"epoch": 1.53,
"grad_norm": 1.9683000565100015,
"learning_rate": 1.662518059940916e-05,
"loss": 0.1883,
"step": 9680
},
{
"epoch": 1.53,
"grad_norm": 2.099153179909315,
"learning_rate": 1.6609504150093634e-05,
"loss": 0.1563,
"step": 9700
},
{
"epoch": 1.54,
"grad_norm": 2.3648488796091542,
"learning_rate": 1.659379880518559e-05,
"loss": 0.1711,
"step": 9720
},
{
"epoch": 1.54,
"grad_norm": 1.9477130219057668,
"learning_rate": 1.6578064633356426e-05,
"loss": 0.1807,
"step": 9740
},
{
"epoch": 1.54,
"grad_norm": 1.8763997892785833,
"learning_rate": 1.6562301703403588e-05,
"loss": 0.17,
"step": 9760
},
{
"epoch": 1.55,
"grad_norm": 1.9473897115797032,
"learning_rate": 1.654651008425027e-05,
"loss": 0.1695,
"step": 9780
},
{
"epoch": 1.55,
"grad_norm": 1.94275021614424,
"learning_rate": 1.653068984494511e-05,
"loss": 0.1786,
"step": 9800
},
{
"epoch": 1.55,
"grad_norm": 2.591445266572519,
"learning_rate": 1.6514841054661884e-05,
"loss": 0.185,
"step": 9820
},
{
"epoch": 1.56,
"grad_norm": 2.2087296424190948,
"learning_rate": 1.64989637826992e-05,
"loss": 0.1913,
"step": 9840
},
{
"epoch": 1.56,
"grad_norm": 2.0227388921292886,
"learning_rate": 1.6483058098480214e-05,
"loss": 0.1771,
"step": 9860
},
{
"epoch": 1.56,
"grad_norm": 1.941226451998698,
"learning_rate": 1.646712407155231e-05,
"loss": 0.1604,
"step": 9880
},
{
"epoch": 1.56,
"grad_norm": 1.799624851657898,
"learning_rate": 1.64511617715868e-05,
"loss": 0.1703,
"step": 9900
},
{
"epoch": 1.57,
"grad_norm": 2.1026662884608873,
"learning_rate": 1.6435171268378617e-05,
"loss": 0.1699,
"step": 9920
},
{
"epoch": 1.57,
"grad_norm": 2.037064549070868,
"learning_rate": 1.641915263184601e-05,
"loss": 0.1749,
"step": 9940
},
{
"epoch": 1.57,
"grad_norm": 2.289972742743045,
"learning_rate": 1.6403105932030253e-05,
"loss": 0.171,
"step": 9960
},
{
"epoch": 1.58,
"grad_norm": 1.7397923448674204,
"learning_rate": 1.638703123909531e-05,
"loss": 0.1722,
"step": 9980
},
{
"epoch": 1.58,
"grad_norm": 2.2809191947314327,
"learning_rate": 1.6370928623327557e-05,
"loss": 0.1789,
"step": 10000
},
{
"epoch": 1.58,
"grad_norm": 5.837716808009581,
"learning_rate": 1.635479815513546e-05,
"loss": 0.1688,
"step": 10020
},
{
"epoch": 1.59,
"grad_norm": 1.9385163502489462,
"learning_rate": 1.6338639905049256e-05,
"loss": 0.1684,
"step": 10040
},
{
"epoch": 1.59,
"grad_norm": 2.4083879559228265,
"learning_rate": 1.6322453943720677e-05,
"loss": 0.1834,
"step": 10060
},
{
"epoch": 1.59,
"grad_norm": 1.8784457651634563,
"learning_rate": 1.6306240341922616e-05,
"loss": 0.1726,
"step": 10080
},
{
"epoch": 1.6,
"grad_norm": 2.5015152154155134,
"learning_rate": 1.628999917054882e-05,
"loss": 0.175,
"step": 10100
},
{
"epoch": 1.6,
"grad_norm": 2.3462998824591317,
"learning_rate": 1.627373050061358e-05,
"loss": 0.1836,
"step": 10120
},
{
"epoch": 1.6,
"grad_norm": 6.491573153186662,
"learning_rate": 1.625743440325143e-05,
"loss": 0.1708,
"step": 10140
},
{
"epoch": 1.61,
"grad_norm": 1.8735636613758684,
"learning_rate": 1.6241110949716837e-05,
"loss": 0.1644,
"step": 10160
},
{
"epoch": 1.61,
"grad_norm": 2.1199720026362776,
"learning_rate": 1.6224760211383867e-05,
"loss": 0.1745,
"step": 10180
},
{
"epoch": 1.61,
"grad_norm": 2.26663511712426,
"learning_rate": 1.6208382259745902e-05,
"loss": 0.1722,
"step": 10200
},
{
"epoch": 1.62,
"grad_norm": 1.6274143037047173,
"learning_rate": 1.6191977166415303e-05,
"loss": 0.1689,
"step": 10220
},
{
"epoch": 1.62,
"grad_norm": 1.7970519932361564,
"learning_rate": 1.617554500312311e-05,
"loss": 0.186,
"step": 10240
},
{
"epoch": 1.62,
"grad_norm": 2.241357629833042,
"learning_rate": 1.6159085841718732e-05,
"loss": 0.1691,
"step": 10260
},
{
"epoch": 1.63,
"grad_norm": 1.6347966162730332,
"learning_rate": 1.614259975416963e-05,
"loss": 0.1689,
"step": 10280
},
{
"epoch": 1.63,
"grad_norm": 1.9003545555776973,
"learning_rate": 1.612608681256098e-05,
"loss": 0.1831,
"step": 10300
},
{
"epoch": 1.63,
"grad_norm": 2.224301766553686,
"learning_rate": 1.61095470890954e-05,
"loss": 0.1809,
"step": 10320
},
{
"epoch": 1.63,
"grad_norm": 2.0418622318986057,
"learning_rate": 1.609298065609259e-05,
"loss": 0.1732,
"step": 10340
},
{
"epoch": 1.64,
"grad_norm": 1.8847041460989982,
"learning_rate": 1.607638758598906e-05,
"loss": 0.18,
"step": 10360
},
{
"epoch": 1.64,
"grad_norm": 2.6034376013955765,
"learning_rate": 1.6059767951337775e-05,
"loss": 0.1659,
"step": 10380
},
{
"epoch": 1.64,
"grad_norm": 2.717869407722093,
"learning_rate": 1.6043121824807853e-05,
"loss": 0.1778,
"step": 10400
},
{
"epoch": 1.65,
"grad_norm": 1.9310171511622796,
"learning_rate": 1.6026449279184252e-05,
"loss": 0.1742,
"step": 10420
},
{
"epoch": 1.65,
"grad_norm": 2.1672348842377542,
"learning_rate": 1.6009750387367446e-05,
"loss": 0.1751,
"step": 10440
},
{
"epoch": 1.65,
"grad_norm": 2.2155157882826932,
"learning_rate": 1.5993025222373107e-05,
"loss": 0.1716,
"step": 10460
},
{
"epoch": 1.66,
"grad_norm": 2.12252169432859,
"learning_rate": 1.5976273857331788e-05,
"loss": 0.1743,
"step": 10480
},
{
"epoch": 1.66,
"grad_norm": 1.8928313189284733,
"learning_rate": 1.59594963654886e-05,
"loss": 0.1666,
"step": 10500
},
{
"epoch": 1.66,
"grad_norm": 2.1209231144675917,
"learning_rate": 1.594269282020289e-05,
"loss": 0.1654,
"step": 10520
},
{
"epoch": 1.67,
"grad_norm": 1.7234563029310586,
"learning_rate": 1.592586329494793e-05,
"loss": 0.1732,
"step": 10540
},
{
"epoch": 1.67,
"grad_norm": 1.8992491192039225,
"learning_rate": 1.590900786331058e-05,
"loss": 0.1843,
"step": 10560
},
{
"epoch": 1.67,
"grad_norm": 2.255292073046707,
"learning_rate": 1.5892126598990988e-05,
"loss": 0.1663,
"step": 10580
},
{
"epoch": 1.68,
"grad_norm": 2.0226484844460937,
"learning_rate": 1.587521957580224e-05,
"loss": 0.1742,
"step": 10600
},
{
"epoch": 1.68,
"grad_norm": 1.9769425460270078,
"learning_rate": 1.5858286867670067e-05,
"loss": 0.1699,
"step": 10620
},
{
"epoch": 1.68,
"grad_norm": 2.0941096893627047,
"learning_rate": 1.584132854863249e-05,
"loss": 0.1689,
"step": 10640
},
{
"epoch": 1.69,
"grad_norm": 3.594450243853988,
"learning_rate": 1.5824344692839528e-05,
"loss": 0.1592,
"step": 10660
},
{
"epoch": 1.69,
"grad_norm": 2.7600105322761266,
"learning_rate": 1.5807335374552863e-05,
"loss": 0.1696,
"step": 10680
},
{
"epoch": 1.69,
"grad_norm": 1.9277553383287402,
"learning_rate": 1.5790300668145488e-05,
"loss": 0.1567,
"step": 10700
},
{
"epoch": 1.69,
"grad_norm": 1.8061002015824392,
"learning_rate": 1.577324064810143e-05,
"loss": 0.1714,
"step": 10720
},
{
"epoch": 1.7,
"grad_norm": 2.075080879197821,
"learning_rate": 1.575615538901539e-05,
"loss": 0.1651,
"step": 10740
},
{
"epoch": 1.7,
"grad_norm": 1.9389319511933993,
"learning_rate": 1.573904496559242e-05,
"loss": 0.1664,
"step": 10760
},
{
"epoch": 1.7,
"grad_norm": 1.989109668636887,
"learning_rate": 1.5721909452647604e-05,
"loss": 0.1644,
"step": 10780
},
{
"epoch": 1.71,
"grad_norm": 1.802713809407511,
"learning_rate": 1.570474892510575e-05,
"loss": 0.158,
"step": 10800
},
{
"epoch": 1.71,
"grad_norm": 2.1683529625057183,
"learning_rate": 1.5687563458001015e-05,
"loss": 0.1731,
"step": 10820
},
{
"epoch": 1.71,
"grad_norm": 1.7930535855304202,
"learning_rate": 1.5670353126476615e-05,
"loss": 0.1721,
"step": 10840
},
{
"epoch": 1.72,
"grad_norm": 2.8089113864956756,
"learning_rate": 1.565311800578449e-05,
"loss": 0.1581,
"step": 10860
},
{
"epoch": 1.72,
"grad_norm": 1.698115339142883,
"learning_rate": 1.5635858171284962e-05,
"loss": 0.1604,
"step": 10880
},
{
"epoch": 1.72,
"grad_norm": 2.003861038263119,
"learning_rate": 1.561857369844642e-05,
"loss": 0.1651,
"step": 10900
},
{
"epoch": 1.73,
"grad_norm": 1.9604631435332058,
"learning_rate": 1.5601264662844976e-05,
"loss": 0.166,
"step": 10920
},
{
"epoch": 1.73,
"grad_norm": 1.8966802933890046,
"learning_rate": 1.5583931140164156e-05,
"loss": 0.1504,
"step": 10940
},
{
"epoch": 1.73,
"grad_norm": 1.8798424813043781,
"learning_rate": 1.556657320619454e-05,
"loss": 0.1712,
"step": 10960
},
{
"epoch": 1.74,
"grad_norm": 1.8480005568507623,
"learning_rate": 1.5549190936833452e-05,
"loss": 0.166,
"step": 10980
},
{
"epoch": 1.74,
"grad_norm": 1.9730818789739197,
"learning_rate": 1.553178440808463e-05,
"loss": 0.1766,
"step": 11000
},
{
"epoch": 1.74,
"grad_norm": 2.055586585535215,
"learning_rate": 1.5514353696057872e-05,
"loss": 0.1684,
"step": 11020
},
{
"epoch": 1.75,
"grad_norm": 2.375617097692436,
"learning_rate": 1.5496898876968733e-05,
"loss": 0.1774,
"step": 11040
},
{
"epoch": 1.75,
"grad_norm": 1.8908606768542418,
"learning_rate": 1.5479420027138157e-05,
"loss": 0.1727,
"step": 11060
},
{
"epoch": 1.75,
"grad_norm": 1.584681873586119,
"learning_rate": 1.5461917222992176e-05,
"loss": 0.1554,
"step": 11080
},
{
"epoch": 1.75,
"grad_norm": 1.9010909888431742,
"learning_rate": 1.5444390541061557e-05,
"loss": 0.172,
"step": 11100
},
{
"epoch": 1.76,
"grad_norm": 1.6694865055184347,
"learning_rate": 1.5426840057981474e-05,
"loss": 0.1679,
"step": 11120
},
{
"epoch": 1.76,
"grad_norm": 2.168091256744236,
"learning_rate": 1.5409265850491172e-05,
"loss": 0.1581,
"step": 11140
},
{
"epoch": 1.76,
"grad_norm": 1.6447757348933234,
"learning_rate": 1.539166799543363e-05,
"loss": 0.1566,
"step": 11160
},
{
"epoch": 1.77,
"grad_norm": 1.8720093081886822,
"learning_rate": 1.5374046569755216e-05,
"loss": 0.1755,
"step": 11180
},
{
"epoch": 1.77,
"grad_norm": 1.8424853497444225,
"learning_rate": 1.5356401650505376e-05,
"loss": 0.1672,
"step": 11200
},
{
"epoch": 1.77,
"grad_norm": 1.7513883396920575,
"learning_rate": 1.533873331483627e-05,
"loss": 0.1612,
"step": 11220
},
{
"epoch": 1.78,
"grad_norm": 2.8850905765482815,
"learning_rate": 1.5321041640002455e-05,
"loss": 0.17,
"step": 11240
},
{
"epoch": 1.78,
"grad_norm": 1.5977457110067312,
"learning_rate": 1.5303326703360534e-05,
"loss": 0.168,
"step": 11260
},
{
"epoch": 1.78,
"grad_norm": 1.839271796672507,
"learning_rate": 1.5285588582368814e-05,
"loss": 0.1622,
"step": 11280
},
{
"epoch": 1.79,
"grad_norm": 2.0851069593552998,
"learning_rate": 1.526782735458699e-05,
"loss": 0.1616,
"step": 11300
},
{
"epoch": 1.79,
"grad_norm": 1.8161832282859776,
"learning_rate": 1.5250043097675773e-05,
"loss": 0.16,
"step": 11320
},
{
"epoch": 1.79,
"grad_norm": 2.1227652074250916,
"learning_rate": 1.5232235889396589e-05,
"loss": 0.1734,
"step": 11340
},
{
"epoch": 1.8,
"grad_norm": 2.054825114584178,
"learning_rate": 1.5214405807611212e-05,
"loss": 0.1693,
"step": 11360
},
{
"epoch": 1.8,
"grad_norm": 1.9053808638546215,
"learning_rate": 1.5196552930281414e-05,
"loss": 0.1555,
"step": 11380
},
{
"epoch": 1.8,
"grad_norm": 2.2568013920238466,
"learning_rate": 1.517867733546866e-05,
"loss": 0.1777,
"step": 11400
},
{
"epoch": 1.81,
"grad_norm": 1.755591383734745,
"learning_rate": 1.516077910133374e-05,
"loss": 0.1646,
"step": 11420
},
{
"epoch": 1.81,
"grad_norm": 2.040664452818391,
"learning_rate": 1.5142858306136432e-05,
"loss": 0.1562,
"step": 11440
},
{
"epoch": 1.81,
"grad_norm": 2.9163542625602634,
"learning_rate": 1.5124915028235168e-05,
"loss": 0.1671,
"step": 11460
},
{
"epoch": 1.81,
"grad_norm": 1.745198816269922,
"learning_rate": 1.5106949346086675e-05,
"loss": 0.1645,
"step": 11480
},
{
"epoch": 1.82,
"grad_norm": 3.185139698456198,
"learning_rate": 1.5088961338245656e-05,
"loss": 0.1645,
"step": 11500
},
{
"epoch": 1.82,
"grad_norm": 1.8737538329143142,
"learning_rate": 1.5070951083364413e-05,
"loss": 0.1701,
"step": 11520
},
{
"epoch": 1.82,
"grad_norm": 2.28486597548739,
"learning_rate": 1.5052918660192548e-05,
"loss": 0.1551,
"step": 11540
},
{
"epoch": 1.83,
"grad_norm": 1.9377161951346134,
"learning_rate": 1.5034864147576574e-05,
"loss": 0.1708,
"step": 11560
},
{
"epoch": 1.83,
"grad_norm": 2.0573176386356713,
"learning_rate": 1.5016787624459602e-05,
"loss": 0.1605,
"step": 11580
},
{
"epoch": 1.83,
"grad_norm": 1.8884308117753286,
"learning_rate": 1.499868916988097e-05,
"loss": 0.1635,
"step": 11600
},
{
"epoch": 1.84,
"grad_norm": 2.1599042583369026,
"learning_rate": 1.4980568862975921e-05,
"loss": 0.1624,
"step": 11620
},
{
"epoch": 1.84,
"grad_norm": 1.9050485545701388,
"learning_rate": 1.4962426782975251e-05,
"loss": 0.1686,
"step": 11640
},
{
"epoch": 1.84,
"grad_norm": 2.0206229765693404,
"learning_rate": 1.4944263009204945e-05,
"loss": 0.1609,
"step": 11660
},
{
"epoch": 1.85,
"grad_norm": 2.1626323284609077,
"learning_rate": 1.4926077621085858e-05,
"loss": 0.1663,
"step": 11680
},
{
"epoch": 1.85,
"grad_norm": 1.7733761949067968,
"learning_rate": 1.4907870698133342e-05,
"loss": 0.1647,
"step": 11700
},
{
"epoch": 1.85,
"grad_norm": 1.7803734747589461,
"learning_rate": 1.4889642319956916e-05,
"loss": 0.1704,
"step": 11720
},
{
"epoch": 1.86,
"grad_norm": 2.1492586543303975,
"learning_rate": 1.4871392566259912e-05,
"loss": 0.1607,
"step": 11740
},
{
"epoch": 1.86,
"grad_norm": 2.00354928746823,
"learning_rate": 1.4853121516839119e-05,
"loss": 0.1649,
"step": 11760
},
{
"epoch": 1.86,
"grad_norm": 1.6161133520624342,
"learning_rate": 1.4834829251584452e-05,
"loss": 0.161,
"step": 11780
},
{
"epoch": 1.87,
"grad_norm": 3.956943883806942,
"learning_rate": 1.4816515850478586e-05,
"loss": 0.1652,
"step": 11800
},
{
"epoch": 1.87,
"grad_norm": 2.2931018381308457,
"learning_rate": 1.4798181393596612e-05,
"loss": 0.1662,
"step": 11820
},
{
"epoch": 1.87,
"grad_norm": 2.174850511346493,
"learning_rate": 1.4779825961105685e-05,
"loss": 0.164,
"step": 11840
},
{
"epoch": 1.87,
"grad_norm": 1.5470178711542586,
"learning_rate": 1.4761449633264679e-05,
"loss": 0.1702,
"step": 11860
},
{
"epoch": 1.88,
"grad_norm": 3.3308654391418204,
"learning_rate": 1.4743052490423835e-05,
"loss": 0.1618,
"step": 11880
},
{
"epoch": 1.88,
"grad_norm": 1.9720383082261177,
"learning_rate": 1.4724634613024404e-05,
"loss": 0.1586,
"step": 11900
},
{
"epoch": 1.88,
"grad_norm": 1.883967887969211,
"learning_rate": 1.4706196081598298e-05,
"loss": 0.1696,
"step": 11920
},
{
"epoch": 1.89,
"grad_norm": 1.6976412060172419,
"learning_rate": 1.4687736976767737e-05,
"loss": 0.1594,
"step": 11940
},
{
"epoch": 1.89,
"grad_norm": 1.529332108170923,
"learning_rate": 1.4669257379244905e-05,
"loss": 0.1619,
"step": 11960
},
{
"epoch": 1.89,
"grad_norm": 1.7970574991179633,
"learning_rate": 1.465075736983158e-05,
"loss": 0.1584,
"step": 11980
},
{
"epoch": 1.9,
"grad_norm": 1.8625966219833416,
"learning_rate": 1.46322370294188e-05,
"loss": 0.1642,
"step": 12000
},
{
"epoch": 1.9,
"grad_norm": 1.99642977340393,
"learning_rate": 1.4613696438986493e-05,
"loss": 0.1576,
"step": 12020
},
{
"epoch": 1.9,
"grad_norm": 2.5428182671668944,
"learning_rate": 1.4595135679603135e-05,
"loss": 0.1741,
"step": 12040
},
{
"epoch": 1.91,
"grad_norm": 3.8472437913445336,
"learning_rate": 1.457655483242539e-05,
"loss": 0.1713,
"step": 12060
},
{
"epoch": 1.91,
"grad_norm": 2.0144352913407935,
"learning_rate": 1.4557953978697748e-05,
"loss": 0.1688,
"step": 12080
},
{
"epoch": 1.91,
"grad_norm": 1.792402123900783,
"learning_rate": 1.4539333199752189e-05,
"loss": 0.166,
"step": 12100
},
{
"epoch": 1.92,
"grad_norm": 2.141224129916374,
"learning_rate": 1.4520692577007808e-05,
"loss": 0.1626,
"step": 12120
},
{
"epoch": 1.92,
"grad_norm": 1.73495865357573,
"learning_rate": 1.4502032191970468e-05,
"loss": 0.1599,
"step": 12140
},
{
"epoch": 1.92,
"grad_norm": 10.641224758817746,
"learning_rate": 1.4483352126232446e-05,
"loss": 0.178,
"step": 12160
},
{
"epoch": 1.93,
"grad_norm": 1.8420610612571067,
"learning_rate": 1.4464652461472068e-05,
"loss": 0.1648,
"step": 12180
},
{
"epoch": 1.93,
"grad_norm": 1.8800775441958055,
"learning_rate": 1.4445933279453358e-05,
"loss": 0.1696,
"step": 12200
},
{
"epoch": 1.93,
"grad_norm": 2.151331613378997,
"learning_rate": 1.4427194662025678e-05,
"loss": 0.1803,
"step": 12220
},
{
"epoch": 1.93,
"grad_norm": 1.9226187530668073,
"learning_rate": 1.4408436691123373e-05,
"loss": 0.1635,
"step": 12240
},
{
"epoch": 1.94,
"grad_norm": 1.869667544978119,
"learning_rate": 1.4389659448765408e-05,
"loss": 0.1662,
"step": 12260
},
{
"epoch": 1.94,
"grad_norm": 2.123839678653108,
"learning_rate": 1.437086301705502e-05,
"loss": 0.1607,
"step": 12280
},
{
"epoch": 1.94,
"grad_norm": 1.8970609470082447,
"learning_rate": 1.4352047478179341e-05,
"loss": 0.1682,
"step": 12300
},
{
"epoch": 1.95,
"grad_norm": 1.8846339362592277,
"learning_rate": 1.4333212914409055e-05,
"loss": 0.1681,
"step": 12320
},
{
"epoch": 1.95,
"grad_norm": 2.596536724951098,
"learning_rate": 1.4314359408098029e-05,
"loss": 0.164,
"step": 12340
},
{
"epoch": 1.95,
"grad_norm": 1.983728679020799,
"learning_rate": 1.4295487041682956e-05,
"loss": 0.1657,
"step": 12360
},
{
"epoch": 1.96,
"grad_norm": 2.0578280622839555,
"learning_rate": 1.4276595897682996e-05,
"loss": 0.1788,
"step": 12380
},
{
"epoch": 1.96,
"grad_norm": 1.8225653464241232,
"learning_rate": 1.425768605869942e-05,
"loss": 0.1864,
"step": 12400
},
{
"epoch": 1.96,
"grad_norm": 1.8018042581666063,
"learning_rate": 1.4238757607415225e-05,
"loss": 0.1708,
"step": 12420
},
{
"epoch": 1.97,
"grad_norm": 1.6471439004511217,
"learning_rate": 1.421981062659481e-05,
"loss": 0.1608,
"step": 12440
},
{
"epoch": 1.97,
"grad_norm": 1.785407584963669,
"learning_rate": 1.420084519908358e-05,
"loss": 0.1659,
"step": 12460
},
{
"epoch": 1.97,
"grad_norm": 2.4554236266808225,
"learning_rate": 1.4181861407807606e-05,
"loss": 0.1675,
"step": 12480
},
{
"epoch": 1.98,
"grad_norm": 1.872120044188974,
"learning_rate": 1.4162859335773253e-05,
"loss": 0.1814,
"step": 12500
},
{
"epoch": 1.98,
"grad_norm": 1.8127393893298422,
"learning_rate": 1.4143839066066813e-05,
"loss": 0.1773,
"step": 12520
},
{
"epoch": 1.98,
"grad_norm": 2.011729890366605,
"learning_rate": 1.4124800681854152e-05,
"loss": 0.156,
"step": 12540
},
{
"epoch": 1.99,
"grad_norm": 1.6619766845812105,
"learning_rate": 1.410574426638034e-05,
"loss": 0.1576,
"step": 12560
},
{
"epoch": 1.99,
"grad_norm": 2.1915362586421234,
"learning_rate": 1.4086669902969292e-05,
"loss": 0.1736,
"step": 12580
},
{
"epoch": 1.99,
"grad_norm": 2.067978259195529,
"learning_rate": 1.4067577675023391e-05,
"loss": 0.163,
"step": 12600
},
{
"epoch": 1.99,
"grad_norm": 1.7953546477600242,
"learning_rate": 1.4048467666023144e-05,
"loss": 0.1686,
"step": 12620
},
{
"epoch": 2.0,
"grad_norm": 2.297945149563059,
"learning_rate": 1.4029339959526795e-05,
"loss": 0.1594,
"step": 12640
},
{
"epoch": 2.0,
"grad_norm": 1.5806995956780445,
"learning_rate": 1.4010194639169978e-05,
"loss": 0.1347,
"step": 12660
},
{
"epoch": 2.0,
"grad_norm": 1.8389841975393477,
"learning_rate": 1.3991031788665339e-05,
"loss": 0.1413,
"step": 12680
},
{
"epoch": 2.01,
"grad_norm": 1.6128247518061136,
"learning_rate": 1.3971851491802173e-05,
"loss": 0.1334,
"step": 12700
},
{
"epoch": 2.01,
"grad_norm": 1.6616632779596046,
"learning_rate": 1.3952653832446063e-05,
"loss": 0.1342,
"step": 12720
},
{
"epoch": 2.01,
"grad_norm": 1.7424736985310945,
"learning_rate": 1.3933438894538514e-05,
"loss": 0.1327,
"step": 12740
},
{
"epoch": 2.02,
"grad_norm": 2.034554595754729,
"learning_rate": 1.3914206762096567e-05,
"loss": 0.1322,
"step": 12760
},
{
"epoch": 2.02,
"grad_norm": 1.83859891144965,
"learning_rate": 1.3894957519212456e-05,
"loss": 0.1282,
"step": 12780
},
{
"epoch": 2.02,
"grad_norm": 1.8581995093338732,
"learning_rate": 1.3875691250053227e-05,
"loss": 0.1304,
"step": 12800
},
{
"epoch": 2.03,
"grad_norm": 2.044578720200551,
"learning_rate": 1.3856408038860376e-05,
"loss": 0.1398,
"step": 12820
},
{
"epoch": 2.03,
"grad_norm": 1.804843912323601,
"learning_rate": 1.3837107969949475e-05,
"loss": 0.1346,
"step": 12840
},
{
"epoch": 2.03,
"grad_norm": 1.7050072304580726,
"learning_rate": 1.3817791127709806e-05,
"loss": 0.1256,
"step": 12860
},
{
"epoch": 2.04,
"grad_norm": 1.4140523836395729,
"learning_rate": 1.3798457596603991e-05,
"loss": 0.12,
"step": 12880
},
{
"epoch": 2.04,
"grad_norm": 1.8202933772969543,
"learning_rate": 1.377910746116763e-05,
"loss": 0.1366,
"step": 12900
},
{
"epoch": 2.04,
"grad_norm": 1.5325733616555526,
"learning_rate": 1.3759740806008922e-05,
"loss": 0.1448,
"step": 12920
},
{
"epoch": 2.05,
"grad_norm": 1.5694030861295416,
"learning_rate": 1.3740357715808295e-05,
"loss": 0.14,
"step": 12940
},
{
"epoch": 2.05,
"grad_norm": 1.8477465101458657,
"learning_rate": 1.3720958275318042e-05,
"loss": 0.145,
"step": 12960
},
{
"epoch": 2.05,
"grad_norm": 2.0163021872211693,
"learning_rate": 1.3701542569361948e-05,
"loss": 0.1399,
"step": 12980
},
{
"epoch": 2.06,
"grad_norm": 2.1241848448011327,
"learning_rate": 1.3682110682834916e-05,
"loss": 0.1345,
"step": 13000
},
{
"epoch": 2.06,
"grad_norm": 1.8098005384061824,
"learning_rate": 1.3662662700702598e-05,
"loss": 0.1423,
"step": 13020
},
{
"epoch": 2.06,
"grad_norm": 1.9511750520025437,
"learning_rate": 1.3643198708001027e-05,
"loss": 0.1371,
"step": 13040
},
{
"epoch": 2.06,
"grad_norm": 1.9670874827003901,
"learning_rate": 1.3623718789836245e-05,
"loss": 0.1296,
"step": 13060
},
{
"epoch": 2.07,
"grad_norm": 1.9819357590758682,
"learning_rate": 1.3604223031383911e-05,
"loss": 0.1332,
"step": 13080
},
{
"epoch": 2.07,
"grad_norm": 1.582028763380569,
"learning_rate": 1.358471151788897e-05,
"loss": 0.1372,
"step": 13100
},
{
"epoch": 2.07,
"grad_norm": 1.852133361117855,
"learning_rate": 1.3565184334665245e-05,
"loss": 0.1325,
"step": 13120
},
{
"epoch": 2.08,
"grad_norm": 1.7718613800655645,
"learning_rate": 1.354564156709506e-05,
"loss": 0.1319,
"step": 13140
},
{
"epoch": 2.08,
"grad_norm": 1.91058950054918,
"learning_rate": 1.3526083300628904e-05,
"loss": 0.1449,
"step": 13160
},
{
"epoch": 2.08,
"grad_norm": 1.7718414653316004,
"learning_rate": 1.3506509620785026e-05,
"loss": 0.1356,
"step": 13180
},
{
"epoch": 2.09,
"grad_norm": 1.5712498386719833,
"learning_rate": 1.3486920613149066e-05,
"loss": 0.1516,
"step": 13200
},
{
"epoch": 2.09,
"grad_norm": 1.6727768791706181,
"learning_rate": 1.3467316363373686e-05,
"loss": 0.1504,
"step": 13220
},
{
"epoch": 2.09,
"grad_norm": 1.7611360453228757,
"learning_rate": 1.34476969571782e-05,
"loss": 0.1438,
"step": 13240
},
{
"epoch": 2.1,
"grad_norm": 1.9621135331143416,
"learning_rate": 1.3428062480348184e-05,
"loss": 0.1302,
"step": 13260
},
{
"epoch": 2.1,
"grad_norm": 1.9412137402392295,
"learning_rate": 1.3408413018735121e-05,
"loss": 0.1346,
"step": 13280
},
{
"epoch": 2.1,
"grad_norm": 3.163658285812431,
"learning_rate": 1.3388748658256e-05,
"loss": 0.1332,
"step": 13300
},
{
"epoch": 2.11,
"grad_norm": 1.5083134268752023,
"learning_rate": 1.336906948489297e-05,
"loss": 0.1411,
"step": 13320
},
{
"epoch": 2.11,
"grad_norm": 1.9734578232009425,
"learning_rate": 1.3349375584692943e-05,
"loss": 0.1323,
"step": 13340
},
{
"epoch": 2.11,
"grad_norm": 1.4186545991121415,
"learning_rate": 1.3329667043767223e-05,
"loss": 0.1301,
"step": 13360
},
{
"epoch": 2.12,
"grad_norm": 1.7445298259368838,
"learning_rate": 1.3309943948291129e-05,
"loss": 0.1401,
"step": 13380
},
{
"epoch": 2.12,
"grad_norm": 2.3939709155579467,
"learning_rate": 1.3290206384503625e-05,
"loss": 0.1373,
"step": 13400
},
{
"epoch": 2.12,
"grad_norm": 1.9898608215582223,
"learning_rate": 1.3270454438706932e-05,
"loss": 0.1377,
"step": 13420
},
{
"epoch": 2.12,
"grad_norm": 1.9890633414507193,
"learning_rate": 1.3250688197266161e-05,
"loss": 0.1408,
"step": 13440
},
{
"epoch": 2.13,
"grad_norm": 1.504362913874789,
"learning_rate": 1.3230907746608926e-05,
"loss": 0.1373,
"step": 13460
},
{
"epoch": 2.13,
"grad_norm": 1.3734057460755258,
"learning_rate": 1.3211113173224972e-05,
"loss": 0.1352,
"step": 13480
},
{
"epoch": 2.13,
"grad_norm": 1.7279895391368354,
"learning_rate": 1.3191304563665796e-05,
"loss": 0.1335,
"step": 13500
},
{
"epoch": 2.14,
"grad_norm": 4.004768866680961,
"learning_rate": 1.3171482004544269e-05,
"loss": 0.1322,
"step": 13520
},
{
"epoch": 2.14,
"grad_norm": 1.7624293076426827,
"learning_rate": 1.3151645582534254e-05,
"loss": 0.1382,
"step": 13540
},
{
"epoch": 2.14,
"grad_norm": 2.0572487999679288,
"learning_rate": 1.3131795384370231e-05,
"loss": 0.1395,
"step": 13560
},
{
"epoch": 2.15,
"grad_norm": 1.991347251667635,
"learning_rate": 1.3111931496846917e-05,
"loss": 0.13,
"step": 13580
},
{
"epoch": 2.15,
"grad_norm": 2.0307080059281235,
"learning_rate": 1.309205400681888e-05,
"loss": 0.1396,
"step": 13600
},
{
"epoch": 2.15,
"grad_norm": 1.7143150679027979,
"learning_rate": 1.3072163001200175e-05,
"loss": 0.137,
"step": 13620
},
{
"epoch": 2.16,
"grad_norm": 2.7163190826043446,
"learning_rate": 1.3052258566963941e-05,
"loss": 0.1323,
"step": 13640
},
{
"epoch": 2.16,
"grad_norm": 2.7959334850612887,
"learning_rate": 1.3032340791142044e-05,
"loss": 0.1394,
"step": 13660
},
{
"epoch": 2.16,
"grad_norm": 1.6331057490525531,
"learning_rate": 1.3012409760824684e-05,
"loss": 0.1242,
"step": 13680
},
{
"epoch": 2.17,
"grad_norm": 1.6062647570217896,
"learning_rate": 1.2992465563160008e-05,
"loss": 0.1328,
"step": 13700
},
{
"epoch": 2.17,
"grad_norm": 1.4609262394598619,
"learning_rate": 1.2972508285353747e-05,
"loss": 0.1396,
"step": 13720
},
{
"epoch": 2.17,
"grad_norm": 1.7380534162049759,
"learning_rate": 1.2952538014668819e-05,
"loss": 0.1363,
"step": 13740
},
{
"epoch": 2.18,
"grad_norm": 1.6248182782020781,
"learning_rate": 1.2932554838424953e-05,
"loss": 0.1402,
"step": 13760
},
{
"epoch": 2.18,
"grad_norm": 1.8429130982408928,
"learning_rate": 1.291255884399831e-05,
"loss": 0.1315,
"step": 13780
},
{
"epoch": 2.18,
"grad_norm": 1.5319197610753537,
"learning_rate": 1.2892550118821105e-05,
"loss": 0.1326,
"step": 13800
},
{
"epoch": 2.18,
"grad_norm": 1.9783678216132117,
"learning_rate": 1.2872528750381203e-05,
"loss": 0.1444,
"step": 13820
},
{
"epoch": 2.19,
"grad_norm": 2.4285831210712447,
"learning_rate": 1.2852494826221764e-05,
"loss": 0.1396,
"step": 13840
},
{
"epoch": 2.19,
"grad_norm": 1.9873361795915407,
"learning_rate": 1.2832448433940836e-05,
"loss": 0.1322,
"step": 13860
},
{
"epoch": 2.19,
"grad_norm": 2.1262590380997395,
"learning_rate": 1.2812389661190997e-05,
"loss": 0.1425,
"step": 13880
},
{
"epoch": 2.2,
"grad_norm": 2.017572098286316,
"learning_rate": 1.2792318595678951e-05,
"loss": 0.1433,
"step": 13900
},
{
"epoch": 2.2,
"grad_norm": 1.805715359030865,
"learning_rate": 1.277223532516515e-05,
"loss": 0.1346,
"step": 13920
},
{
"epoch": 2.2,
"grad_norm": 1.9331084904426687,
"learning_rate": 1.2752139937463417e-05,
"loss": 0.1266,
"step": 13940
},
{
"epoch": 2.21,
"grad_norm": 2.2606185846340536,
"learning_rate": 1.2732032520440557e-05,
"loss": 0.1319,
"step": 13960
},
{
"epoch": 2.21,
"grad_norm": 2.0229739096255983,
"learning_rate": 1.2711913162015966e-05,
"loss": 0.1357,
"step": 13980
},
{
"epoch": 2.21,
"grad_norm": 1.7025438069492658,
"learning_rate": 1.269178195016126e-05,
"loss": 0.1299,
"step": 14000
},
{
"epoch": 2.22,
"grad_norm": 2.174484661042054,
"learning_rate": 1.2671638972899893e-05,
"loss": 0.1511,
"step": 14020
},
{
"epoch": 2.22,
"grad_norm": 1.760985634255071,
"learning_rate": 1.2651484318306739e-05,
"loss": 0.1317,
"step": 14040
},
{
"epoch": 2.22,
"grad_norm": 2.1026027898348487,
"learning_rate": 1.2631318074507757e-05,
"loss": 0.1344,
"step": 14060
},
{
"epoch": 2.23,
"grad_norm": 2.148705261190793,
"learning_rate": 1.2611140329679558e-05,
"loss": 0.1371,
"step": 14080
},
{
"epoch": 2.23,
"grad_norm": 2.114181355069622,
"learning_rate": 1.2590951172049057e-05,
"loss": 0.1332,
"step": 14100
},
{
"epoch": 2.23,
"grad_norm": 1.9916952085910535,
"learning_rate": 1.2570750689893063e-05,
"loss": 0.1374,
"step": 14120
},
{
"epoch": 2.24,
"grad_norm": 1.7588571792620418,
"learning_rate": 1.2550538971537902e-05,
"loss": 0.1423,
"step": 14140
},
{
"epoch": 2.24,
"grad_norm": 2.393616942051735,
"learning_rate": 1.2530316105359033e-05,
"loss": 0.1374,
"step": 14160
},
{
"epoch": 2.24,
"grad_norm": 2.6372755126876255,
"learning_rate": 1.2510082179780657e-05,
"loss": 0.1294,
"step": 14180
},
{
"epoch": 2.24,
"grad_norm": 1.8302836918464744,
"learning_rate": 1.2489837283275332e-05,
"loss": 0.1363,
"step": 14200
},
{
"epoch": 2.25,
"grad_norm": 1.6516803103953872,
"learning_rate": 1.2469581504363586e-05,
"loss": 0.1343,
"step": 14220
},
{
"epoch": 2.25,
"grad_norm": 2.747252739287995,
"learning_rate": 1.2449314931613534e-05,
"loss": 0.1445,
"step": 14240
},
{
"epoch": 2.25,
"grad_norm": 1.6386430667350018,
"learning_rate": 1.242903765364048e-05,
"loss": 0.1323,
"step": 14260
},
{
"epoch": 2.26,
"grad_norm": 1.7974297372086694,
"learning_rate": 1.2408749759106539e-05,
"loss": 0.1341,
"step": 14280
},
{
"epoch": 2.26,
"grad_norm": 2.3070936153881565,
"learning_rate": 1.2388451336720258e-05,
"loss": 0.1365,
"step": 14300
},
{
"epoch": 2.26,
"grad_norm": 1.7718372939738367,
"learning_rate": 1.2368142475236191e-05,
"loss": 0.1341,
"step": 14320
},
{
"epoch": 2.27,
"grad_norm": 2.0624237046435403,
"learning_rate": 1.2347823263454565e-05,
"loss": 0.1465,
"step": 14340
},
{
"epoch": 2.27,
"grad_norm": 2.0885799972048282,
"learning_rate": 1.232749379022084e-05,
"loss": 0.1373,
"step": 14360
},
{
"epoch": 2.27,
"grad_norm": 4.0185548061588685,
"learning_rate": 1.2307154144425364e-05,
"loss": 0.1449,
"step": 14380
},
{
"epoch": 2.28,
"grad_norm": 1.8164512054458681,
"learning_rate": 1.2286804415002957e-05,
"loss": 0.1315,
"step": 14400
},
{
"epoch": 2.28,
"grad_norm": 1.4805329240966263,
"learning_rate": 1.2266444690932517e-05,
"loss": 0.1324,
"step": 14420
},
{
"epoch": 2.28,
"grad_norm": 1.7490191435641387,
"learning_rate": 1.2246075061236665e-05,
"loss": 0.1328,
"step": 14440
},
{
"epoch": 2.29,
"grad_norm": 2.018763501384926,
"learning_rate": 1.2225695614981319e-05,
"loss": 0.1421,
"step": 14460
},
{
"epoch": 2.29,
"grad_norm": 1.6303468915497366,
"learning_rate": 1.2205306441275327e-05,
"loss": 0.1306,
"step": 14480
},
{
"epoch": 2.29,
"grad_norm": 2.080062765861722,
"learning_rate": 1.2184907629270062e-05,
"loss": 0.1451,
"step": 14500
},
{
"epoch": 2.3,
"grad_norm": 2.0528869862543626,
"learning_rate": 1.2164499268159053e-05,
"loss": 0.136,
"step": 14520
},
{
"epoch": 2.3,
"grad_norm": 1.783779222533731,
"learning_rate": 1.214408144717757e-05,
"loss": 0.1378,
"step": 14540
},
{
"epoch": 2.3,
"grad_norm": 1.9296837316314104,
"learning_rate": 1.2123654255602256e-05,
"loss": 0.136,
"step": 14560
},
{
"epoch": 2.3,
"grad_norm": 2.1250306295543577,
"learning_rate": 1.2103217782750718e-05,
"loss": 0.1389,
"step": 14580
},
{
"epoch": 2.31,
"grad_norm": 2.0918921583399137,
"learning_rate": 1.208277211798115e-05,
"loss": 0.1332,
"step": 14600
},
{
"epoch": 2.31,
"grad_norm": 1.6072917157822324,
"learning_rate": 1.2062317350691941e-05,
"loss": 0.1307,
"step": 14620
},
{
"epoch": 2.31,
"grad_norm": 1.64681978078845,
"learning_rate": 1.2041853570321267e-05,
"loss": 0.1315,
"step": 14640
},
{
"epoch": 2.32,
"grad_norm": 1.808848880050173,
"learning_rate": 1.2021380866346735e-05,
"loss": 0.1331,
"step": 14660
},
{
"epoch": 2.32,
"grad_norm": 1.8144490516050213,
"learning_rate": 1.2000899328284954e-05,
"loss": 0.1341,
"step": 14680
},
{
"epoch": 2.32,
"grad_norm": 1.619887010849053,
"learning_rate": 1.1980409045691162e-05,
"loss": 0.1325,
"step": 14700
},
{
"epoch": 2.33,
"grad_norm": 1.8386396934881213,
"learning_rate": 1.1959910108158833e-05,
"loss": 0.1374,
"step": 14720
},
{
"epoch": 2.33,
"grad_norm": 1.8990810229798099,
"learning_rate": 1.1939402605319297e-05,
"loss": 0.1316,
"step": 14740
},
{
"epoch": 2.33,
"grad_norm": 2.1868339614528076,
"learning_rate": 1.191888662684131e-05,
"loss": 0.1389,
"step": 14760
},
{
"epoch": 2.34,
"grad_norm": 2.000021934388998,
"learning_rate": 1.1898362262430716e-05,
"loss": 0.1354,
"step": 14780
},
{
"epoch": 2.34,
"grad_norm": 1.8770094594132787,
"learning_rate": 1.1877829601830007e-05,
"loss": 0.1431,
"step": 14800
},
{
"epoch": 2.34,
"grad_norm": 2.0106237301571968,
"learning_rate": 1.1857288734817953e-05,
"loss": 0.1305,
"step": 14820
},
{
"epoch": 2.35,
"grad_norm": 1.7097061812743855,
"learning_rate": 1.1836739751209216e-05,
"loss": 0.1422,
"step": 14840
},
{
"epoch": 2.35,
"grad_norm": 1.799792102887704,
"learning_rate": 1.1816182740853936e-05,
"loss": 0.1314,
"step": 14860
},
{
"epoch": 2.35,
"grad_norm": 7.873131666623532,
"learning_rate": 1.179561779363736e-05,
"loss": 0.1454,
"step": 14880
},
{
"epoch": 2.36,
"grad_norm": 1.8114884941247387,
"learning_rate": 1.177504499947943e-05,
"loss": 0.137,
"step": 14900
},
{
"epoch": 2.36,
"grad_norm": 1.938115176090166,
"learning_rate": 1.1754464448334402e-05,
"loss": 0.1199,
"step": 14920
},
{
"epoch": 2.36,
"grad_norm": 1.8330913803081887,
"learning_rate": 1.173387623019045e-05,
"loss": 0.1405,
"step": 14940
},
{
"epoch": 2.36,
"grad_norm": 2.039513899725636,
"learning_rate": 1.1713280435069277e-05,
"loss": 0.1473,
"step": 14960
},
{
"epoch": 2.37,
"grad_norm": 1.9901571062266918,
"learning_rate": 1.1692677153025702e-05,
"loss": 0.1395,
"step": 14980
},
{
"epoch": 2.37,
"grad_norm": 2.0717592186891287,
"learning_rate": 1.1672066474147297e-05,
"loss": 0.1413,
"step": 15000
},
{
"epoch": 2.37,
"grad_norm": 1.5752509356218813,
"learning_rate": 1.1651448488553967e-05,
"loss": 0.1326,
"step": 15020
},
{
"epoch": 2.38,
"grad_norm": 1.87741537286892,
"learning_rate": 1.1630823286397564e-05,
"loss": 0.1336,
"step": 15040
},
{
"epoch": 2.38,
"grad_norm": 1.7865539688299008,
"learning_rate": 1.16101909578615e-05,
"loss": 0.1417,
"step": 15060
},
{
"epoch": 2.38,
"grad_norm": 1.9759715406824037,
"learning_rate": 1.1589551593160342e-05,
"loss": 0.1356,
"step": 15080
},
{
"epoch": 2.39,
"grad_norm": 2.612004308810923,
"learning_rate": 1.1568905282539431e-05,
"loss": 0.1341,
"step": 15100
},
{
"epoch": 2.39,
"grad_norm": 1.5450709448269162,
"learning_rate": 1.154825211627447e-05,
"loss": 0.1342,
"step": 15120
},
{
"epoch": 2.39,
"grad_norm": 1.782993801570437,
"learning_rate": 1.1527592184671139e-05,
"loss": 0.1342,
"step": 15140
},
{
"epoch": 2.4,
"grad_norm": 1.9775104887981971,
"learning_rate": 1.1506925578064704e-05,
"loss": 0.1392,
"step": 15160
},
{
"epoch": 2.4,
"grad_norm": 2.3788326859648916,
"learning_rate": 1.1486252386819619e-05,
"loss": 0.1426,
"step": 15180
},
{
"epoch": 2.4,
"grad_norm": 2.325376693399337,
"learning_rate": 1.1465572701329119e-05,
"loss": 0.132,
"step": 15200
},
{
"epoch": 2.41,
"grad_norm": 2.280855641179884,
"learning_rate": 1.1444886612014841e-05,
"loss": 0.1438,
"step": 15220
},
{
"epoch": 2.41,
"grad_norm": 2.1214779107253983,
"learning_rate": 1.1424194209326429e-05,
"loss": 0.1415,
"step": 15240
},
{
"epoch": 2.41,
"grad_norm": 1.9250968661026318,
"learning_rate": 1.1403495583741119e-05,
"loss": 0.1347,
"step": 15260
},
{
"epoch": 2.42,
"grad_norm": 2.359910411589033,
"learning_rate": 1.138279082576337e-05,
"loss": 0.1352,
"step": 15280
},
{
"epoch": 2.42,
"grad_norm": 2.062463239862396,
"learning_rate": 1.1362080025924447e-05,
"loss": 0.1465,
"step": 15300
},
{
"epoch": 2.42,
"grad_norm": 2.1304268822020003,
"learning_rate": 1.134136327478203e-05,
"loss": 0.1434,
"step": 15320
},
{
"epoch": 2.42,
"grad_norm": 2.2426507451349935,
"learning_rate": 1.132064066291983e-05,
"loss": 0.1339,
"step": 15340
},
{
"epoch": 2.43,
"grad_norm": 2.0520093746380144,
"learning_rate": 1.1299912280947176e-05,
"loss": 0.1272,
"step": 15360
},
{
"epoch": 2.43,
"grad_norm": 2.056495712116278,
"learning_rate": 1.1279178219498632e-05,
"loss": 0.1451,
"step": 15380
},
{
"epoch": 2.43,
"grad_norm": 1.8602023808295813,
"learning_rate": 1.1258438569233596e-05,
"loss": 0.1363,
"step": 15400
},
{
"epoch": 2.44,
"grad_norm": 1.54052734375,
"learning_rate": 1.1237693420835894e-05,
"loss": 0.1308,
"step": 15420
},
{
"epoch": 2.44,
"grad_norm": 1.9773005739680685,
"learning_rate": 1.1216942865013404e-05,
"loss": 0.1295,
"step": 15440
},
{
"epoch": 2.44,
"grad_norm": 10.435782833812334,
"learning_rate": 1.1196186992497642e-05,
"loss": 0.1362,
"step": 15460
},
{
"epoch": 2.45,
"grad_norm": 1.8592261887954638,
"learning_rate": 1.117542589404337e-05,
"loss": 0.1358,
"step": 15480
},
{
"epoch": 2.45,
"grad_norm": 1.4884719526206744,
"learning_rate": 1.1154659660428204e-05,
"loss": 0.1381,
"step": 15500
},
{
"epoch": 2.45,
"grad_norm": 1.970383526507307,
"learning_rate": 1.1133888382452212e-05,
"loss": 0.1517,
"step": 15520
},
{
"epoch": 2.46,
"grad_norm": 2.1884060754280634,
"learning_rate": 1.1113112150937517e-05,
"loss": 0.1493,
"step": 15540
},
{
"epoch": 2.46,
"grad_norm": 1.9350142992462236,
"learning_rate": 1.1092331056727903e-05,
"loss": 0.1248,
"step": 15560
},
{
"epoch": 2.46,
"grad_norm": 1.7927839063385917,
"learning_rate": 1.1071545190688418e-05,
"loss": 0.1284,
"step": 15580
},
{
"epoch": 2.47,
"grad_norm": 2.0140182833556697,
"learning_rate": 1.1050754643704967e-05,
"loss": 0.1402,
"step": 15600
},
{
"epoch": 2.47,
"grad_norm": 1.861256841233223,
"learning_rate": 1.1029959506683934e-05,
"loss": 0.1372,
"step": 15620
},
{
"epoch": 2.47,
"grad_norm": 2.1392353308485563,
"learning_rate": 1.100915987055176e-05,
"loss": 0.1286,
"step": 15640
},
{
"epoch": 2.48,
"grad_norm": 2.1435059700471863,
"learning_rate": 1.0988355826254568e-05,
"loss": 0.1373,
"step": 15660
},
{
"epoch": 2.48,
"grad_norm": 1.8265564539069363,
"learning_rate": 1.0967547464757756e-05,
"loss": 0.1374,
"step": 15680
},
{
"epoch": 2.48,
"grad_norm": 2.297540477441779,
"learning_rate": 1.0946734877045586e-05,
"loss": 0.137,
"step": 15700
},
{
"epoch": 2.48,
"grad_norm": 1.9372594130254506,
"learning_rate": 1.0925918154120819e-05,
"loss": 0.1289,
"step": 15720
},
{
"epoch": 2.49,
"grad_norm": 1.4502445508002624,
"learning_rate": 1.0905097387004278e-05,
"loss": 0.1292,
"step": 15740
},
{
"epoch": 2.49,
"grad_norm": 1.6801161108310412,
"learning_rate": 1.0884272666734483e-05,
"loss": 0.1356,
"step": 15760
},
{
"epoch": 2.49,
"grad_norm": 2.5951657395548513,
"learning_rate": 1.0863444084367231e-05,
"loss": 0.1326,
"step": 15780
},
{
"epoch": 2.5,
"grad_norm": 2.19833426704223,
"learning_rate": 1.0842611730975217e-05,
"loss": 0.1369,
"step": 15800
},
{
"epoch": 2.5,
"grad_norm": 2.2346543357542292,
"learning_rate": 1.0821775697647601e-05,
"loss": 0.1308,
"step": 15820
},
{
"epoch": 2.5,
"grad_norm": 4.195722790764242,
"learning_rate": 1.0800936075489665e-05,
"loss": 0.1329,
"step": 15840
},
{
"epoch": 2.51,
"grad_norm": 2.403148088480703,
"learning_rate": 1.0780092955622354e-05,
"loss": 0.139,
"step": 15860
},
{
"epoch": 2.51,
"grad_norm": 1.6643895647168747,
"learning_rate": 1.0759246429181929e-05,
"loss": 0.1216,
"step": 15880
},
{
"epoch": 2.51,
"grad_norm": 2.1063317846159815,
"learning_rate": 1.0738396587319535e-05,
"loss": 0.1392,
"step": 15900
},
{
"epoch": 2.52,
"grad_norm": 3.026897018450204,
"learning_rate": 1.0717543521200812e-05,
"loss": 0.1402,
"step": 15920
},
{
"epoch": 2.52,
"grad_norm": 1.6038581584515634,
"learning_rate": 1.0696687322005504e-05,
"loss": 0.1344,
"step": 15940
},
{
"epoch": 2.52,
"grad_norm": 3.5502922636264085,
"learning_rate": 1.0675828080927056e-05,
"loss": 0.1519,
"step": 15960
},
{
"epoch": 2.53,
"grad_norm": 1.8581208559484683,
"learning_rate": 1.0654965889172203e-05,
"loss": 0.136,
"step": 15980
},
{
"epoch": 2.53,
"grad_norm": 1.714358806471046,
"learning_rate": 1.063410083796059e-05,
"loss": 0.1388,
"step": 16000
},
{
"epoch": 2.53,
"grad_norm": 1.9621668757633475,
"learning_rate": 1.0613233018524367e-05,
"loss": 0.1318,
"step": 16020
},
{
"epoch": 2.54,
"grad_norm": 1.7122215434430692,
"learning_rate": 1.0592362522107781e-05,
"loss": 0.1321,
"step": 16040
},
{
"epoch": 2.54,
"grad_norm": 13.128357085546728,
"learning_rate": 1.0571489439966787e-05,
"loss": 0.1334,
"step": 16060
},
{
"epoch": 2.54,
"grad_norm": 2.073155132232276,
"learning_rate": 1.0550613863368653e-05,
"loss": 0.1396,
"step": 16080
},
{
"epoch": 2.55,
"grad_norm": 1.9614139905047652,
"learning_rate": 1.0529735883591539e-05,
"loss": 0.1366,
"step": 16100
},
{
"epoch": 2.55,
"grad_norm": 1.929701399656576,
"learning_rate": 1.0508855591924127e-05,
"loss": 0.1318,
"step": 16120
},
{
"epoch": 2.55,
"grad_norm": 3.214554209318679,
"learning_rate": 1.0487973079665197e-05,
"loss": 0.1365,
"step": 16140
},
{
"epoch": 2.55,
"grad_norm": 2.314110272601495,
"learning_rate": 1.0467088438123246e-05,
"loss": 0.1381,
"step": 16160
},
{
"epoch": 2.56,
"grad_norm": 1.7496749712329736,
"learning_rate": 1.0446201758616082e-05,
"loss": 0.132,
"step": 16180
},
{
"epoch": 2.56,
"grad_norm": 1.7498085053439836,
"learning_rate": 1.0425313132470414e-05,
"loss": 0.1276,
"step": 16200
},
{
"epoch": 2.56,
"grad_norm": 2.2003956655684656,
"learning_rate": 1.0404422651021475e-05,
"loss": 0.1417,
"step": 16220
},
{
"epoch": 2.57,
"grad_norm": 1.853898068235969,
"learning_rate": 1.0383530405612605e-05,
"loss": 0.1412,
"step": 16240
},
{
"epoch": 2.57,
"grad_norm": 1.4655309260582507,
"learning_rate": 1.036263648759485e-05,
"loss": 0.1397,
"step": 16260
},
{
"epoch": 2.57,
"grad_norm": 1.8230938925196318,
"learning_rate": 1.0341740988326585e-05,
"loss": 0.1319,
"step": 16280
},
{
"epoch": 2.58,
"grad_norm": 1.7897936397283039,
"learning_rate": 1.0320843999173087e-05,
"loss": 0.1306,
"step": 16300
},
{
"epoch": 2.58,
"grad_norm": 1.9568978452613028,
"learning_rate": 1.0299945611506143e-05,
"loss": 0.1386,
"step": 16320
},
{
"epoch": 2.58,
"grad_norm": 1.8465548371746545,
"learning_rate": 1.0279045916703676e-05,
"loss": 0.1345,
"step": 16340
},
{
"epoch": 2.59,
"grad_norm": 2.090653478317281,
"learning_rate": 1.0258145006149299e-05,
"loss": 0.1312,
"step": 16360
},
{
"epoch": 2.59,
"grad_norm": 1.811623854822939,
"learning_rate": 1.0237242971231958e-05,
"loss": 0.1299,
"step": 16380
},
{
"epoch": 2.59,
"grad_norm": 1.8774527084347712,
"learning_rate": 1.0216339903345511e-05,
"loss": 0.1428,
"step": 16400
},
{
"epoch": 2.6,
"grad_norm": 1.4699109947091842,
"learning_rate": 1.0195435893888328e-05,
"loss": 0.1286,
"step": 16420
},
{
"epoch": 2.6,
"grad_norm": 1.8019730271406957,
"learning_rate": 1.0174531034262902e-05,
"loss": 0.1365,
"step": 16440
},
{
"epoch": 2.6,
"grad_norm": 2.171613718686746,
"learning_rate": 1.0153625415875446e-05,
"loss": 0.1385,
"step": 16460
},
{
"epoch": 2.61,
"grad_norm": 1.962390984101079,
"learning_rate": 1.0132719130135476e-05,
"loss": 0.1364,
"step": 16480
},
{
"epoch": 2.61,
"grad_norm": 1.9481438556991033,
"learning_rate": 1.0111812268455442e-05,
"loss": 0.1365,
"step": 16500
},
{
"epoch": 2.61,
"grad_norm": 2.059872080155169,
"learning_rate": 1.0090904922250306e-05,
"loss": 0.1431,
"step": 16520
},
{
"epoch": 2.61,
"grad_norm": 1.593696967345868,
"learning_rate": 1.0069997182937148e-05,
"loss": 0.1353,
"step": 16540
},
{
"epoch": 2.62,
"grad_norm": 2.253326182702059,
"learning_rate": 1.0049089141934765e-05,
"loss": 0.1351,
"step": 16560
},
{
"epoch": 2.62,
"grad_norm": 2.526501192329287,
"learning_rate": 1.002818089066328e-05,
"loss": 0.1419,
"step": 16580
},
{
"epoch": 2.62,
"grad_norm": 1.840025356263137,
"learning_rate": 1.0007272520543723e-05,
"loss": 0.1369,
"step": 16600
},
{
"epoch": 2.63,
"grad_norm": 2.398808652509612,
"learning_rate": 9.98636412299766e-06,
"loss": 0.1328,
"step": 16620
},
{
"epoch": 2.63,
"grad_norm": 2.059131530354911,
"learning_rate": 9.965455789446767e-06,
"loss": 0.1363,
"step": 16640
},
{
"epoch": 2.63,
"grad_norm": 1.8134807038144354,
"learning_rate": 9.94454761131244e-06,
"loss": 0.1346,
"step": 16660
},
{
"epoch": 2.64,
"grad_norm": 1.9551373910066787,
"learning_rate": 9.923639680015398e-06,
"loss": 0.1371,
"step": 16680
},
{
"epoch": 2.64,
"grad_norm": 2.013756173975056,
"learning_rate": 9.90273208697528e-06,
"loss": 0.1389,
"step": 16700
},
{
"epoch": 2.64,
"grad_norm": 2.113142590724566,
"learning_rate": 9.881824923610243e-06,
"loss": 0.1164,
"step": 16720
},
{
"epoch": 2.65,
"grad_norm": 1.6901316432355393,
"learning_rate": 9.860918281336574e-06,
"loss": 0.1323,
"step": 16740
},
{
"epoch": 2.65,
"grad_norm": 1.915271984137991,
"learning_rate": 9.84001225156827e-06,
"loss": 0.1263,
"step": 16760
},
{
"epoch": 2.65,
"grad_norm": 1.4785365880490435,
"learning_rate": 9.819106925716659e-06,
"loss": 0.143,
"step": 16780
},
{
"epoch": 2.66,
"grad_norm": 1.6906972901406798,
"learning_rate": 9.798202395189987e-06,
"loss": 0.1315,
"step": 16800
},
{
"epoch": 2.66,
"grad_norm": 1.9431071208770567,
"learning_rate": 9.777298751393019e-06,
"loss": 0.1248,
"step": 16820
},
{
"epoch": 2.66,
"grad_norm": 2.347318856722391,
"learning_rate": 9.756396085726652e-06,
"loss": 0.1302,
"step": 16840
},
{
"epoch": 2.67,
"grad_norm": 1.9566723157464299,
"learning_rate": 9.735494489587502e-06,
"loss": 0.1331,
"step": 16860
},
{
"epoch": 2.67,
"grad_norm": 1.8332827589977703,
"learning_rate": 9.714594054367505e-06,
"loss": 0.135,
"step": 16880
},
{
"epoch": 2.67,
"grad_norm": 2.1464434993833224,
"learning_rate": 9.693694871453524e-06,
"loss": 0.1267,
"step": 16900
},
{
"epoch": 2.67,
"grad_norm": 5.809875552080618,
"learning_rate": 9.672797032226942e-06,
"loss": 0.136,
"step": 16920
},
{
"epoch": 2.68,
"grad_norm": 1.9479587434790262,
"learning_rate": 9.651900628063275e-06,
"loss": 0.1396,
"step": 16940
},
{
"epoch": 2.68,
"grad_norm": 1.8496158200950275,
"learning_rate": 9.631005750331761e-06,
"loss": 0.1243,
"step": 16960
},
{
"epoch": 2.68,
"grad_norm": 1.6922897603845557,
"learning_rate": 9.610112490394959e-06,
"loss": 0.1297,
"step": 16980
},
{
"epoch": 2.69,
"grad_norm": 2.9150794660621226,
"learning_rate": 9.589220939608355e-06,
"loss": 0.1299,
"step": 17000
},
{
"epoch": 2.69,
"grad_norm": 1.9679979977132334,
"learning_rate": 9.56833118931997e-06,
"loss": 0.1339,
"step": 17020
},
{
"epoch": 2.69,
"grad_norm": 1.9724363903237263,
"learning_rate": 9.547443330869945e-06,
"loss": 0.1315,
"step": 17040
},
{
"epoch": 2.7,
"grad_norm": 14.831981354439382,
"learning_rate": 9.526557455590147e-06,
"loss": 0.1285,
"step": 17060
},
{
"epoch": 2.7,
"grad_norm": 1.8573548057152016,
"learning_rate": 9.505673654803782e-06,
"loss": 0.1293,
"step": 17080
},
{
"epoch": 2.7,
"grad_norm": 2.0349795801136414,
"learning_rate": 9.484792019824975e-06,
"loss": 0.1385,
"step": 17100
},
{
"epoch": 2.71,
"grad_norm": 1.6923544958951748,
"learning_rate": 9.463912641958384e-06,
"loss": 0.1304,
"step": 17120
},
{
"epoch": 2.71,
"grad_norm": 1.9187898140917445,
"learning_rate": 9.4430356124988e-06,
"loss": 0.1299,
"step": 17140
},
{
"epoch": 2.71,
"grad_norm": 1.908401400866661,
"learning_rate": 9.422161022730742e-06,
"loss": 0.1267,
"step": 17160
},
{
"epoch": 2.72,
"grad_norm": 2.4055926923794972,
"learning_rate": 9.40128896392807e-06,
"loss": 0.1265,
"step": 17180
},
{
"epoch": 2.72,
"grad_norm": 1.7006638969619152,
"learning_rate": 9.38041952735356e-06,
"loss": 0.1317,
"step": 17200
},
{
"epoch": 2.72,
"grad_norm": 2.5105881586237278,
"learning_rate": 9.359552804258539e-06,
"loss": 0.1312,
"step": 17220
},
{
"epoch": 2.73,
"grad_norm": 1.8343519141127091,
"learning_rate": 9.338688885882467e-06,
"loss": 0.135,
"step": 17240
},
{
"epoch": 2.73,
"grad_norm": 1.8163979807019428,
"learning_rate": 9.317827863452528e-06,
"loss": 0.1276,
"step": 17260
},
{
"epoch": 2.73,
"grad_norm": 1.8097965204082977,
"learning_rate": 9.29696982818326e-06,
"loss": 0.1322,
"step": 17280
},
{
"epoch": 2.73,
"grad_norm": 1.8659526620279845,
"learning_rate": 9.276114871276136e-06,
"loss": 0.136,
"step": 17300
},
{
"epoch": 2.74,
"grad_norm": 1.7840111730678996,
"learning_rate": 9.255263083919155e-06,
"loss": 0.1364,
"step": 17320
},
{
"epoch": 2.74,
"grad_norm": 1.8681808446996795,
"learning_rate": 9.234414557286472e-06,
"loss": 0.1278,
"step": 17340
},
{
"epoch": 2.74,
"grad_norm": 2.7115158651381246,
"learning_rate": 9.213569382537983e-06,
"loss": 0.1303,
"step": 17360
},
{
"epoch": 2.75,
"grad_norm": 2.0046178435476008,
"learning_rate": 9.192727650818918e-06,
"loss": 0.1271,
"step": 17380
},
{
"epoch": 2.75,
"grad_norm": 2.1831456033306154,
"learning_rate": 9.171889453259467e-06,
"loss": 0.1438,
"step": 17400
},
{
"epoch": 2.75,
"grad_norm": 1.7441754140086236,
"learning_rate": 9.151054880974352e-06,
"loss": 0.1381,
"step": 17420
},
{
"epoch": 2.76,
"grad_norm": 1.9206576950413643,
"learning_rate": 9.130224025062453e-06,
"loss": 0.1303,
"step": 17440
},
{
"epoch": 2.76,
"grad_norm": 2.036397076113896,
"learning_rate": 9.109396976606401e-06,
"loss": 0.1353,
"step": 17460
},
{
"epoch": 2.76,
"grad_norm": 1.9229464141870196,
"learning_rate": 9.088573826672168e-06,
"loss": 0.1438,
"step": 17480
},
{
"epoch": 2.77,
"grad_norm": 1.915122598852103,
"learning_rate": 9.067754666308696e-06,
"loss": 0.1323,
"step": 17500
},
{
"epoch": 2.77,
"grad_norm": 1.9374958776614726,
"learning_rate": 9.046939586547475e-06,
"loss": 0.1444,
"step": 17520
},
{
"epoch": 2.77,
"grad_norm": 2.218516377789009,
"learning_rate": 9.026128678402143e-06,
"loss": 0.1416,
"step": 17540
},
{
"epoch": 2.78,
"grad_norm": 1.6079530823158987,
"learning_rate": 9.005322032868112e-06,
"loss": 0.145,
"step": 17560
},
{
"epoch": 2.78,
"grad_norm": 1.7536946532256747,
"learning_rate": 8.984519740922152e-06,
"loss": 0.134,
"step": 17580
},
{
"epoch": 2.78,
"grad_norm": 1.7613777456529505,
"learning_rate": 8.963721893521992e-06,
"loss": 0.127,
"step": 17600
},
{
"epoch": 2.79,
"grad_norm": 1.6849154707537126,
"learning_rate": 8.942928581605932e-06,
"loss": 0.1357,
"step": 17620
},
{
"epoch": 2.79,
"grad_norm": 1.6942533357476794,
"learning_rate": 8.922139896092442e-06,
"loss": 0.133,
"step": 17640
},
{
"epoch": 2.79,
"grad_norm": 1.9089851768121973,
"learning_rate": 8.901355927879755e-06,
"loss": 0.1264,
"step": 17660
},
{
"epoch": 2.79,
"grad_norm": 2.1641622031480043,
"learning_rate": 8.880576767845486e-06,
"loss": 0.1345,
"step": 17680
},
{
"epoch": 2.8,
"grad_norm": 1.933042012112841,
"learning_rate": 8.85980250684622e-06,
"loss": 0.1298,
"step": 17700
},
{
"epoch": 2.8,
"grad_norm": 2.3097975121136813,
"learning_rate": 8.83903323571713e-06,
"loss": 0.1262,
"step": 17720
},
{
"epoch": 2.8,
"grad_norm": 2.125740034607399,
"learning_rate": 8.818269045271563e-06,
"loss": 0.1316,
"step": 17740
},
{
"epoch": 2.81,
"grad_norm": 1.8092205370393344,
"learning_rate": 8.797510026300652e-06,
"loss": 0.1289,
"step": 17760
},
{
"epoch": 2.81,
"grad_norm": 1.7313726746651832,
"learning_rate": 8.776756269572918e-06,
"loss": 0.1375,
"step": 17780
},
{
"epoch": 2.81,
"grad_norm": 1.755162662021332,
"learning_rate": 8.756007865833878e-06,
"loss": 0.132,
"step": 17800
},
{
"epoch": 2.82,
"grad_norm": 1.7656821182599431,
"learning_rate": 8.735264905805637e-06,
"loss": 0.1384,
"step": 17820
},
{
"epoch": 2.82,
"grad_norm": 1.9033913988096032,
"learning_rate": 8.714527480186495e-06,
"loss": 0.1446,
"step": 17840
},
{
"epoch": 2.82,
"grad_norm": 52.08050408982233,
"learning_rate": 8.693795679650563e-06,
"loss": 0.1387,
"step": 17860
},
{
"epoch": 2.83,
"grad_norm": 1.758278000971912,
"learning_rate": 8.673069594847348e-06,
"loss": 0.1314,
"step": 17880
},
{
"epoch": 2.83,
"grad_norm": 1.6658991317896061,
"learning_rate": 8.652349316401369e-06,
"loss": 0.1316,
"step": 17900
},
{
"epoch": 2.83,
"grad_norm": 1.72349233946441,
"learning_rate": 8.63163493491175e-06,
"loss": 0.1423,
"step": 17920
},
{
"epoch": 2.84,
"grad_norm": 1.8699043332072607,
"learning_rate": 8.610926540951844e-06,
"loss": 0.1429,
"step": 17940
},
{
"epoch": 2.84,
"grad_norm": 2.6231756682884275,
"learning_rate": 8.590224225068813e-06,
"loss": 0.1447,
"step": 17960
},
{
"epoch": 2.84,
"grad_norm": 1.7294321507480823,
"learning_rate": 8.569528077783242e-06,
"loss": 0.133,
"step": 17980
},
{
"epoch": 2.85,
"grad_norm": 2.0916501803210683,
"learning_rate": 8.548838189588751e-06,
"loss": 0.1314,
"step": 18000
},
{
"epoch": 2.85,
"grad_norm": 1.8824702541437954,
"learning_rate": 8.52815465095159e-06,
"loss": 0.1292,
"step": 18020
},
{
"epoch": 2.85,
"grad_norm": 1.9015555188090831,
"learning_rate": 8.507477552310239e-06,
"loss": 0.136,
"step": 18040
},
{
"epoch": 2.85,
"grad_norm": 1.8041709391812781,
"learning_rate": 8.486806984075027e-06,
"loss": 0.1357,
"step": 18060
},
{
"epoch": 2.86,
"grad_norm": 2.61644085154715,
"learning_rate": 8.466143036627727e-06,
"loss": 0.1361,
"step": 18080
},
{
"epoch": 2.86,
"grad_norm": 1.9201489272572818,
"learning_rate": 8.445485800321158e-06,
"loss": 0.1303,
"step": 18100
},
{
"epoch": 2.86,
"grad_norm": 1.8516592652350052,
"learning_rate": 8.424835365478803e-06,
"loss": 0.1428,
"step": 18120
},
{
"epoch": 2.87,
"grad_norm": 1.7535403407329295,
"learning_rate": 8.404191822394402e-06,
"loss": 0.1339,
"step": 18140
},
{
"epoch": 2.87,
"grad_norm": 1.709358915431138,
"learning_rate": 8.383555261331556e-06,
"loss": 0.1316,
"step": 18160
},
{
"epoch": 2.87,
"grad_norm": 1.5598938378866314,
"learning_rate": 8.362925772523344e-06,
"loss": 0.1284,
"step": 18180
},
{
"epoch": 2.88,
"grad_norm": 2.01852515890654,
"learning_rate": 8.34230344617192e-06,
"loss": 0.1317,
"step": 18200
},
{
"epoch": 2.88,
"grad_norm": 2.0982966235855542,
"learning_rate": 8.321688372448116e-06,
"loss": 0.1323,
"step": 18220
},
{
"epoch": 2.88,
"grad_norm": 1.942235144155849,
"learning_rate": 8.301080641491058e-06,
"loss": 0.1296,
"step": 18240
},
{
"epoch": 2.89,
"grad_norm": 2.070442512316597,
"learning_rate": 8.28048034340776e-06,
"loss": 0.1289,
"step": 18260
},
{
"epoch": 2.89,
"grad_norm": 1.8979163062733526,
"learning_rate": 8.259887568272742e-06,
"loss": 0.129,
"step": 18280
},
{
"epoch": 2.89,
"grad_norm": 1.9629619832139733,
"learning_rate": 8.239302406127627e-06,
"loss": 0.1258,
"step": 18300
},
{
"epoch": 2.9,
"grad_norm": 1.6511160255157618,
"learning_rate": 8.218724946980748e-06,
"loss": 0.1344,
"step": 18320
},
{
"epoch": 2.9,
"grad_norm": 2.0793960240926226,
"learning_rate": 8.198155280806757e-06,
"loss": 0.1441,
"step": 18340
},
{
"epoch": 2.9,
"grad_norm": 4.340700684691589,
"learning_rate": 8.177593497546244e-06,
"loss": 0.138,
"step": 18360
},
{
"epoch": 2.91,
"grad_norm": 1.9336701349140626,
"learning_rate": 8.157039687105309e-06,
"loss": 0.1342,
"step": 18380
},
{
"epoch": 2.91,
"grad_norm": 1.8946300992052065,
"learning_rate": 8.136493939355205e-06,
"loss": 0.1439,
"step": 18400
},
{
"epoch": 2.91,
"grad_norm": 2.0345357289377395,
"learning_rate": 8.115956344131934e-06,
"loss": 0.1328,
"step": 18420
},
{
"epoch": 2.91,
"grad_norm": 1.8466633554399094,
"learning_rate": 8.095426991235837e-06,
"loss": 0.1322,
"step": 18440
},
{
"epoch": 2.92,
"grad_norm": 1.8029615066955407,
"learning_rate": 8.074905970431234e-06,
"loss": 0.1294,
"step": 18460
},
{
"epoch": 2.92,
"grad_norm": 2.190576841774613,
"learning_rate": 8.054393371445993e-06,
"loss": 0.1297,
"step": 18480
},
{
"epoch": 2.92,
"grad_norm": 1.5434228156550367,
"learning_rate": 8.033889283971175e-06,
"loss": 0.1358,
"step": 18500
},
{
"epoch": 2.93,
"grad_norm": 1.8951158388509064,
"learning_rate": 8.013393797660616e-06,
"loss": 0.138,
"step": 18520
},
{
"epoch": 2.93,
"grad_norm": 24.433724582483453,
"learning_rate": 7.992907002130542e-06,
"loss": 0.1393,
"step": 18540
},
{
"epoch": 2.93,
"grad_norm": 1.741352924884179,
"learning_rate": 7.97242898695918e-06,
"loss": 0.1346,
"step": 18560
},
{
"epoch": 2.94,
"grad_norm": 2.0199062803452477,
"learning_rate": 7.95195984168638e-06,
"loss": 0.138,
"step": 18580
},
{
"epoch": 2.94,
"grad_norm": 1.8803011025097132,
"learning_rate": 7.931499655813178e-06,
"loss": 0.1316,
"step": 18600
},
{
"epoch": 2.94,
"grad_norm": 2.8191476883918027,
"learning_rate": 7.911048518801462e-06,
"loss": 0.1311,
"step": 18620
},
{
"epoch": 2.95,
"grad_norm": 1.605290015450998,
"learning_rate": 7.890606520073544e-06,
"loss": 0.1277,
"step": 18640
},
{
"epoch": 2.95,
"grad_norm": 1.7656043397487997,
"learning_rate": 7.870173749011775e-06,
"loss": 0.1301,
"step": 18660
},
{
"epoch": 2.95,
"grad_norm": 1.9944081215781464,
"learning_rate": 7.849750294958165e-06,
"loss": 0.1352,
"step": 18680
},
{
"epoch": 2.96,
"grad_norm": 1.719323426738143,
"learning_rate": 7.829336247213984e-06,
"loss": 0.1245,
"step": 18700
},
{
"epoch": 2.96,
"grad_norm": 2.2552863644204413,
"learning_rate": 7.808931695039369e-06,
"loss": 0.1361,
"step": 18720
},
{
"epoch": 2.96,
"grad_norm": 1.592688188078247,
"learning_rate": 7.788536727652944e-06,
"loss": 0.1357,
"step": 18740
},
{
"epoch": 2.97,
"grad_norm": 2.117453808241257,
"learning_rate": 7.768151434231416e-06,
"loss": 0.1307,
"step": 18760
},
{
"epoch": 2.97,
"grad_norm": 1.7253432996502318,
"learning_rate": 7.747775903909198e-06,
"loss": 0.1293,
"step": 18780
},
{
"epoch": 2.97,
"grad_norm": 3.3530177800508905,
"learning_rate": 7.727410225778018e-06,
"loss": 0.1392,
"step": 18800
},
{
"epoch": 2.98,
"grad_norm": 1.7467276768669977,
"learning_rate": 7.707054488886518e-06,
"loss": 0.1252,
"step": 18820
},
{
"epoch": 2.98,
"grad_norm": 2.0304833946272334,
"learning_rate": 7.68670878223987e-06,
"loss": 0.1349,
"step": 18840
},
{
"epoch": 2.98,
"grad_norm": 2.190700152056297,
"learning_rate": 7.666373194799403e-06,
"loss": 0.1344,
"step": 18860
},
{
"epoch": 2.98,
"grad_norm": 1.6201572884626112,
"learning_rate": 7.64604781548218e-06,
"loss": 0.133,
"step": 18880
},
{
"epoch": 2.99,
"grad_norm": 1.7195137407797079,
"learning_rate": 7.625732733160647e-06,
"loss": 0.1362,
"step": 18900
},
{
"epoch": 2.99,
"grad_norm": 1.913768761341664,
"learning_rate": 7.605428036662216e-06,
"loss": 0.1317,
"step": 18920
},
{
"epoch": 2.99,
"grad_norm": 1.739753288449023,
"learning_rate": 7.58513381476889e-06,
"loss": 0.1329,
"step": 18940
},
{
"epoch": 3.0,
"grad_norm": 6.727991747566451,
"learning_rate": 7.564850156216873e-06,
"loss": 0.1298,
"step": 18960
},
{
"epoch": 3.0,
"grad_norm": 1.8885158407300895,
"learning_rate": 7.544577149696179e-06,
"loss": 0.1383,
"step": 18980
},
{
"epoch": 3.0,
"grad_norm": 1.567824409048811,
"learning_rate": 7.524314883850242e-06,
"loss": 0.1095,
"step": 19000
},
{
"epoch": 3.01,
"grad_norm": 1.659947640271154,
"learning_rate": 7.504063447275552e-06,
"loss": 0.1144,
"step": 19020
},
{
"epoch": 3.01,
"grad_norm": 1.4700832403061246,
"learning_rate": 7.4838229285212204e-06,
"loss": 0.1057,
"step": 19040
},
{
"epoch": 3.01,
"grad_norm": 3.2695879823053615,
"learning_rate": 7.4635934160886384e-06,
"loss": 0.1105,
"step": 19060
},
{
"epoch": 3.02,
"grad_norm": 2.099065609092392,
"learning_rate": 7.443374998431071e-06,
"loss": 0.1171,
"step": 19080
},
{
"epoch": 3.02,
"grad_norm": 1.7447734167558466,
"learning_rate": 7.423167763953264e-06,
"loss": 0.1109,
"step": 19100
},
{
"epoch": 3.02,
"grad_norm": 1.5346978743258248,
"learning_rate": 7.4029718010110705e-06,
"loss": 0.1048,
"step": 19120
},
{
"epoch": 3.03,
"grad_norm": 1.945612014937947,
"learning_rate": 7.382787197911061e-06,
"loss": 0.1317,
"step": 19140
},
{
"epoch": 3.03,
"grad_norm": 1.9886953825676763,
"learning_rate": 7.362614042910127e-06,
"loss": 0.1087,
"step": 19160
},
{
"epoch": 3.03,
"grad_norm": 1.8488584966077084,
"learning_rate": 7.3424524242151114e-06,
"loss": 0.1074,
"step": 19180
},
{
"epoch": 3.04,
"grad_norm": 1.5531954098865746,
"learning_rate": 7.322302429982408e-06,
"loss": 0.1077,
"step": 19200
},
{
"epoch": 3.04,
"grad_norm": 1.4380011306991693,
"learning_rate": 7.302164148317588e-06,
"loss": 0.1116,
"step": 19220
},
{
"epoch": 3.04,
"grad_norm": 1.9224244239546222,
"learning_rate": 7.282037667275014e-06,
"loss": 0.1084,
"step": 19240
},
{
"epoch": 3.04,
"grad_norm": 1.9495306893002633,
"learning_rate": 7.261923074857434e-06,
"loss": 0.1067,
"step": 19260
},
{
"epoch": 3.05,
"grad_norm": 1.6135892781801189,
"learning_rate": 7.2418204590156325e-06,
"loss": 0.1113,
"step": 19280
},
{
"epoch": 3.05,
"grad_norm": 1.8619337027452345,
"learning_rate": 7.221729907648017e-06,
"loss": 0.111,
"step": 19300
},
{
"epoch": 3.05,
"grad_norm": 1.58435249989802,
"learning_rate": 7.201651508600237e-06,
"loss": 0.1048,
"step": 19320
},
{
"epoch": 3.06,
"grad_norm": 1.8644650455511773,
"learning_rate": 7.181585349664819e-06,
"loss": 0.1122,
"step": 19340
},
{
"epoch": 3.06,
"grad_norm": 3.1063237386812452,
"learning_rate": 7.161531518580764e-06,
"loss": 0.1163,
"step": 19360
},
{
"epoch": 3.06,
"grad_norm": 1.9307098151706066,
"learning_rate": 7.1414901030331685e-06,
"loss": 0.1033,
"step": 19380
},
{
"epoch": 3.07,
"grad_norm": 1.9119676609536556,
"learning_rate": 7.121461190652841e-06,
"loss": 0.1163,
"step": 19400
},
{
"epoch": 3.07,
"grad_norm": 1.2850029572775687,
"learning_rate": 7.101444869015928e-06,
"loss": 0.1084,
"step": 19420
},
{
"epoch": 3.07,
"grad_norm": 1.9555603713038903,
"learning_rate": 7.081441225643509e-06,
"loss": 0.1129,
"step": 19440
},
{
"epoch": 3.08,
"grad_norm": 2.031672037736627,
"learning_rate": 7.061450348001245e-06,
"loss": 0.1084,
"step": 19460
},
{
"epoch": 3.08,
"grad_norm": 1.8516105291519218,
"learning_rate": 7.041472323498969e-06,
"loss": 0.1169,
"step": 19480
},
{
"epoch": 3.08,
"grad_norm": 1.891605847875244,
"learning_rate": 7.021507239490312e-06,
"loss": 0.1082,
"step": 19500
},
{
"epoch": 3.09,
"grad_norm": 1.6862693820571864,
"learning_rate": 7.00155518327233e-06,
"loss": 0.1048,
"step": 19520
},
{
"epoch": 3.09,
"grad_norm": 1.7775705716788248,
"learning_rate": 6.981616242085108e-06,
"loss": 0.1179,
"step": 19540
},
{
"epoch": 3.09,
"grad_norm": 2.0546306413681097,
"learning_rate": 6.961690503111388e-06,
"loss": 0.1212,
"step": 19560
},
{
"epoch": 3.1,
"grad_norm": 1.8848244455280723,
"learning_rate": 6.9417780534761935e-06,
"loss": 0.1126,
"step": 19580
},
{
"epoch": 3.1,
"grad_norm": 2.0521893417422166,
"learning_rate": 6.921878980246426e-06,
"loss": 0.1183,
"step": 19600
},
{
"epoch": 3.1,
"grad_norm": 2.249708474664614,
"learning_rate": 6.901993370430512e-06,
"loss": 0.106,
"step": 19620
},
{
"epoch": 3.1,
"grad_norm": 2.5021516124123657,
"learning_rate": 6.882121310978001e-06,
"loss": 0.1015,
"step": 19640
},
{
"epoch": 3.11,
"grad_norm": 2.065160595835243,
"learning_rate": 6.862262888779196e-06,
"loss": 0.1151,
"step": 19660
},
{
"epoch": 3.11,
"grad_norm": 1.7232017431925082,
"learning_rate": 6.842418190664777e-06,
"loss": 0.1173,
"step": 19680
},
{
"epoch": 3.11,
"grad_norm": 1.5298123520114955,
"learning_rate": 6.822587303405406e-06,
"loss": 0.1161,
"step": 19700
},
{
"epoch": 3.12,
"grad_norm": 2.0069257507274916,
"learning_rate": 6.8027703137113625e-06,
"loss": 0.1093,
"step": 19720
},
{
"epoch": 3.12,
"grad_norm": 1.5747655482102199,
"learning_rate": 6.782967308232161e-06,
"loss": 0.1103,
"step": 19740
},
{
"epoch": 3.12,
"grad_norm": 1.9611243048822349,
"learning_rate": 6.763178373556163e-06,
"loss": 0.1121,
"step": 19760
},
{
"epoch": 3.13,
"grad_norm": 1.85425587414514,
"learning_rate": 6.743403596210214e-06,
"loss": 0.1092,
"step": 19780
},
{
"epoch": 3.13,
"grad_norm": 1.974717798472976,
"learning_rate": 6.723643062659254e-06,
"loss": 0.1306,
"step": 19800
},
{
"epoch": 3.13,
"grad_norm": 1.6066008425850584,
"learning_rate": 6.7038968593059365e-06,
"loss": 0.1105,
"step": 19820
},
{
"epoch": 3.14,
"grad_norm": 2.287660345075543,
"learning_rate": 6.684165072490264e-06,
"loss": 0.1133,
"step": 19840
},
{
"epoch": 3.14,
"grad_norm": 2.0214442742097423,
"learning_rate": 6.6644477884892e-06,
"loss": 0.107,
"step": 19860
},
{
"epoch": 3.14,
"grad_norm": 1.9292552015890163,
"learning_rate": 6.644745093516293e-06,
"loss": 0.1154,
"step": 19880
},
{
"epoch": 3.15,
"grad_norm": 2.570746944548466,
"learning_rate": 6.625057073721306e-06,
"loss": 0.1123,
"step": 19900
},
{
"epoch": 3.15,
"grad_norm": 2.055682387462859,
"learning_rate": 6.605383815189831e-06,
"loss": 0.1169,
"step": 19920
},
{
"epoch": 3.15,
"grad_norm": 2.341089391540735,
"learning_rate": 6.5857254039429165e-06,
"loss": 0.1093,
"step": 19940
},
{
"epoch": 3.16,
"grad_norm": 1.5801615651736383,
"learning_rate": 6.5660819259366915e-06,
"loss": 0.1121,
"step": 19960
},
{
"epoch": 3.16,
"grad_norm": 1.664256268175884,
"learning_rate": 6.5464534670619924e-06,
"loss": 0.1136,
"step": 19980
},
{
"epoch": 3.16,
"grad_norm": 1.704686699997306,
"learning_rate": 6.52684011314398e-06,
"loss": 0.1123,
"step": 20000
},
{
"epoch": 3.16,
"grad_norm": 1.6533683323744832,
"learning_rate": 6.5072419499417776e-06,
"loss": 0.1145,
"step": 20020
},
{
"epoch": 3.17,
"grad_norm": 1.4953676060136076,
"learning_rate": 6.487659063148073e-06,
"loss": 0.1059,
"step": 20040
},
{
"epoch": 3.17,
"grad_norm": 1.7977671523121384,
"learning_rate": 6.468091538388772e-06,
"loss": 0.1079,
"step": 20060
},
{
"epoch": 3.17,
"grad_norm": 2.69923509076661,
"learning_rate": 6.448539461222603e-06,
"loss": 0.1184,
"step": 20080
},
{
"epoch": 3.18,
"grad_norm": 1.5974929838071241,
"learning_rate": 6.429002917140752e-06,
"loss": 0.1151,
"step": 20100
},
{
"epoch": 3.18,
"grad_norm": 1.4917368582863793,
"learning_rate": 6.409481991566485e-06,
"loss": 0.1216,
"step": 20120
},
{
"epoch": 3.18,
"grad_norm": 1.9287268838343954,
"learning_rate": 6.389976769854782e-06,
"loss": 0.1116,
"step": 20140
},
{
"epoch": 3.19,
"grad_norm": 1.8722729242454341,
"learning_rate": 6.370487337291948e-06,
"loss": 0.1039,
"step": 20160
},
{
"epoch": 3.19,
"grad_norm": 2.262740515470199,
"learning_rate": 6.35101377909526e-06,
"loss": 0.1056,
"step": 20180
},
{
"epoch": 3.19,
"grad_norm": 1.867728055334631,
"learning_rate": 6.331556180412579e-06,
"loss": 0.1146,
"step": 20200
},
{
"epoch": 3.2,
"grad_norm": 2.3080028416700404,
"learning_rate": 6.312114626321985e-06,
"loss": 0.1118,
"step": 20220
},
{
"epoch": 3.2,
"grad_norm": 1.9272440525427814,
"learning_rate": 6.2926892018314035e-06,
"loss": 0.1103,
"step": 20240
},
{
"epoch": 3.2,
"grad_norm": 1.9673586576265236,
"learning_rate": 6.2732799918782305e-06,
"loss": 0.1205,
"step": 20260
},
{
"epoch": 3.21,
"grad_norm": 1.692902711320511,
"learning_rate": 6.253887081328968e-06,
"loss": 0.1252,
"step": 20280
},
{
"epoch": 3.21,
"grad_norm": 2.506000660505763,
"learning_rate": 6.23451055497885e-06,
"loss": 0.1143,
"step": 20300
},
{
"epoch": 3.21,
"grad_norm": 2.2175449806304686,
"learning_rate": 6.215150497551464e-06,
"loss": 0.1121,
"step": 20320
},
{
"epoch": 3.22,
"grad_norm": 1.772260367654961,
"learning_rate": 6.195806993698397e-06,
"loss": 0.11,
"step": 20340
},
{
"epoch": 3.22,
"grad_norm": 1.6014992306005063,
"learning_rate": 6.17648012799885e-06,
"loss": 0.1093,
"step": 20360
},
{
"epoch": 3.22,
"grad_norm": 1.5558192288876895,
"learning_rate": 6.15716998495927e-06,
"loss": 0.1083,
"step": 20380
},
{
"epoch": 3.22,
"grad_norm": 1.739500770969993,
"learning_rate": 6.137876649012992e-06,
"loss": 0.1189,
"step": 20400
},
{
"epoch": 3.23,
"grad_norm": 2.3835755174027664,
"learning_rate": 6.118600204519862e-06,
"loss": 0.1094,
"step": 20420
},
{
"epoch": 3.23,
"grad_norm": 1.9612272739977963,
"learning_rate": 6.099340735765863e-06,
"loss": 0.1164,
"step": 20440
},
{
"epoch": 3.23,
"grad_norm": 6.337640535801565,
"learning_rate": 6.0800983269627526e-06,
"loss": 0.0984,
"step": 20460
},
{
"epoch": 3.24,
"grad_norm": 1.7125882063376165,
"learning_rate": 6.060873062247698e-06,
"loss": 0.1109,
"step": 20480
},
{
"epoch": 3.24,
"grad_norm": 2.279935275045859,
"learning_rate": 6.0416650256829e-06,
"loss": 0.1175,
"step": 20500
},
{
"epoch": 3.24,
"grad_norm": 2.7257613177239035,
"learning_rate": 6.022474301255231e-06,
"loss": 0.1124,
"step": 20520
},
{
"epoch": 3.25,
"grad_norm": 1.4344343584531631,
"learning_rate": 6.003300972875871e-06,
"loss": 0.1175,
"step": 20540
},
{
"epoch": 3.25,
"grad_norm": 5.126754646455792,
"learning_rate": 5.98414512437993e-06,
"loss": 0.1076,
"step": 20560
},
{
"epoch": 3.25,
"grad_norm": 1.6866115067751604,
"learning_rate": 5.965006839526088e-06,
"loss": 0.1211,
"step": 20580
},
{
"epoch": 3.26,
"grad_norm": 1.5961937214393198,
"learning_rate": 5.94588620199623e-06,
"loss": 0.1093,
"step": 20600
},
{
"epoch": 3.26,
"grad_norm": 1.7996107422293754,
"learning_rate": 5.926783295395075e-06,
"loss": 0.1141,
"step": 20620
},
{
"epoch": 3.26,
"grad_norm": 1.9135327910202036,
"learning_rate": 5.907698203249822e-06,
"loss": 0.1158,
"step": 20640
},
{
"epoch": 3.27,
"grad_norm": 3.0880465869255374,
"learning_rate": 5.888631009009768e-06,
"loss": 0.1082,
"step": 20660
},
{
"epoch": 3.27,
"grad_norm": 1.6306328347757366,
"learning_rate": 5.8695817960459525e-06,
"loss": 0.114,
"step": 20680
},
{
"epoch": 3.27,
"grad_norm": 2.2651566646196892,
"learning_rate": 5.850550647650801e-06,
"loss": 0.1183,
"step": 20700
},
{
"epoch": 3.28,
"grad_norm": 2.0262569388306666,
"learning_rate": 5.83153764703774e-06,
"loss": 0.1158,
"step": 20720
},
{
"epoch": 3.28,
"grad_norm": 2.099903231616476,
"learning_rate": 5.812542877340848e-06,
"loss": 0.1201,
"step": 20740
},
{
"epoch": 3.28,
"grad_norm": 1.6278711776424895,
"learning_rate": 5.793566421614495e-06,
"loss": 0.112,
"step": 20760
},
{
"epoch": 3.28,
"grad_norm": 1.6424288324646836,
"learning_rate": 5.774608362832974e-06,
"loss": 0.1126,
"step": 20780
},
{
"epoch": 3.29,
"grad_norm": 1.6482874363576925,
"learning_rate": 5.755668783890128e-06,
"loss": 0.1174,
"step": 20800
},
{
"epoch": 3.29,
"grad_norm": 1.6977354393723512,
"learning_rate": 5.736747767599001e-06,
"loss": 0.1116,
"step": 20820
},
{
"epoch": 3.29,
"grad_norm": 1.4209184887706778,
"learning_rate": 5.71784539669148e-06,
"loss": 0.1152,
"step": 20840
},
{
"epoch": 3.3,
"grad_norm": 2.364407559442157,
"learning_rate": 5.698961753817915e-06,
"loss": 0.1078,
"step": 20860
},
{
"epoch": 3.3,
"grad_norm": 1.467463640220729,
"learning_rate": 5.6800969215467735e-06,
"loss": 0.1091,
"step": 20880
},
{
"epoch": 3.3,
"grad_norm": 2.1606079512263645,
"learning_rate": 5.661250982364267e-06,
"loss": 0.1187,
"step": 20900
},
{
"epoch": 3.31,
"grad_norm": 1.8643408107915673,
"learning_rate": 5.642424018674011e-06,
"loss": 0.1198,
"step": 20920
},
{
"epoch": 3.31,
"grad_norm": 2.19062908351806,
"learning_rate": 5.6236161127966385e-06,
"loss": 0.1175,
"step": 20940
},
{
"epoch": 3.31,
"grad_norm": 1.4145514554386553,
"learning_rate": 5.604827346969453e-06,
"loss": 0.1064,
"step": 20960
},
{
"epoch": 3.32,
"grad_norm": 2.0613286709274194,
"learning_rate": 5.586057803346073e-06,
"loss": 0.115,
"step": 20980
},
{
"epoch": 3.32,
"grad_norm": 2.006127151103046,
"learning_rate": 5.567307563996075e-06,
"loss": 0.1094,
"step": 21000
},
{
"epoch": 3.32,
"grad_norm": 1.597588572665947,
"learning_rate": 5.548576710904614e-06,
"loss": 0.1094,
"step": 21020
},
{
"epoch": 3.33,
"grad_norm": 2.1403728253037664,
"learning_rate": 5.52986532597208e-06,
"loss": 0.1175,
"step": 21040
},
{
"epoch": 3.33,
"grad_norm": 1.8733504987023009,
"learning_rate": 5.511173491013754e-06,
"loss": 0.1114,
"step": 21060
},
{
"epoch": 3.33,
"grad_norm": 1.8398287827192752,
"learning_rate": 5.492501287759417e-06,
"loss": 0.1192,
"step": 21080
},
{
"epoch": 3.34,
"grad_norm": 2.139379765658207,
"learning_rate": 5.473848797853017e-06,
"loss": 0.1133,
"step": 21100
},
{
"epoch": 3.34,
"grad_norm": 1.4261305289751878,
"learning_rate": 5.455216102852314e-06,
"loss": 0.1135,
"step": 21120
},
{
"epoch": 3.34,
"grad_norm": 1.6679405906855305,
"learning_rate": 5.4366032842285035e-06,
"loss": 0.1086,
"step": 21140
},
{
"epoch": 3.34,
"grad_norm": 2.622283619989835,
"learning_rate": 5.418010423365876e-06,
"loss": 0.117,
"step": 21160
},
{
"epoch": 3.35,
"grad_norm": 1.5718273239034348,
"learning_rate": 5.399437601561454e-06,
"loss": 0.1114,
"step": 21180
},
{
"epoch": 3.35,
"grad_norm": 1.8854467554659675,
"learning_rate": 5.380884900024645e-06,
"loss": 0.1088,
"step": 21200
},
{
"epoch": 3.35,
"grad_norm": 1.8148085263562117,
"learning_rate": 5.362352399876884e-06,
"loss": 0.1204,
"step": 21220
},
{
"epoch": 3.36,
"grad_norm": 1.8470292106057589,
"learning_rate": 5.343840182151266e-06,
"loss": 0.1215,
"step": 21240
},
{
"epoch": 3.36,
"grad_norm": 1.393588433929716,
"learning_rate": 5.325348327792201e-06,
"loss": 0.1161,
"step": 21260
},
{
"epoch": 3.36,
"grad_norm": 1.4963551584256642,
"learning_rate": 5.306876917655075e-06,
"loss": 0.1138,
"step": 21280
},
{
"epoch": 3.37,
"grad_norm": 1.8696653133901369,
"learning_rate": 5.2884260325058655e-06,
"loss": 0.1109,
"step": 21300
},
{
"epoch": 3.37,
"grad_norm": 2.008836655708987,
"learning_rate": 5.269995753020809e-06,
"loss": 0.1185,
"step": 21320
},
{
"epoch": 3.37,
"grad_norm": 1.8672455974143392,
"learning_rate": 5.251586159786054e-06,
"loss": 0.1157,
"step": 21340
},
{
"epoch": 3.38,
"grad_norm": 2.0707263460927985,
"learning_rate": 5.233197333297286e-06,
"loss": 0.1126,
"step": 21360
},
{
"epoch": 3.38,
"grad_norm": 6.667836595241356,
"learning_rate": 5.214829353959395e-06,
"loss": 0.1148,
"step": 21380
},
{
"epoch": 3.38,
"grad_norm": 2.024026674486881,
"learning_rate": 5.19648230208611e-06,
"loss": 0.1112,
"step": 21400
},
{
"epoch": 3.39,
"grad_norm": 1.9230645692868673,
"learning_rate": 5.178156257899663e-06,
"loss": 0.1265,
"step": 21420
},
{
"epoch": 3.39,
"grad_norm": 1.6974441557337658,
"learning_rate": 5.159851301530433e-06,
"loss": 0.1114,
"step": 21440
},
{
"epoch": 3.39,
"grad_norm": 1.329976440665595,
"learning_rate": 5.141567513016585e-06,
"loss": 0.1057,
"step": 21460
},
{
"epoch": 3.4,
"grad_norm": 1.6865527001930787,
"learning_rate": 5.1233049723037235e-06,
"loss": 0.1094,
"step": 21480
},
{
"epoch": 3.4,
"grad_norm": 1.5054558239210176,
"learning_rate": 5.105063759244562e-06,
"loss": 0.1064,
"step": 21500
},
{
"epoch": 3.4,
"grad_norm": 1.6249125530482282,
"learning_rate": 5.086843953598548e-06,
"loss": 0.1072,
"step": 21520
},
{
"epoch": 3.4,
"grad_norm": 1.5731361017149925,
"learning_rate": 5.068645635031524e-06,
"loss": 0.1051,
"step": 21540
},
{
"epoch": 3.41,
"grad_norm": 1.8560404007353901,
"learning_rate": 5.050468883115392e-06,
"loss": 0.1107,
"step": 21560
},
{
"epoch": 3.41,
"grad_norm": 1.3906877910710915,
"learning_rate": 5.032313777327746e-06,
"loss": 0.1141,
"step": 21580
},
{
"epoch": 3.41,
"grad_norm": 2.147609814821475,
"learning_rate": 5.014180397051526e-06,
"loss": 0.118,
"step": 21600
},
{
"epoch": 3.42,
"grad_norm": 1.7516449962452705,
"learning_rate": 4.996068821574695e-06,
"loss": 0.116,
"step": 21620
},
{
"epoch": 3.42,
"grad_norm": 1.7847853260618578,
"learning_rate": 4.9779791300898566e-06,
"loss": 0.1072,
"step": 21640
},
{
"epoch": 3.42,
"grad_norm": 1.7936207369174764,
"learning_rate": 4.959911401693944e-06,
"loss": 0.1103,
"step": 21660
},
{
"epoch": 3.43,
"grad_norm": 2.0430958345466435,
"learning_rate": 4.941865715387843e-06,
"loss": 0.1139,
"step": 21680
},
{
"epoch": 3.43,
"grad_norm": 2.334584638586252,
"learning_rate": 4.923842150076065e-06,
"loss": 0.1091,
"step": 21700
},
{
"epoch": 3.43,
"grad_norm": 1.5575401932871056,
"learning_rate": 4.905840784566403e-06,
"loss": 0.1076,
"step": 21720
},
{
"epoch": 3.44,
"grad_norm": 1.6662239758347268,
"learning_rate": 4.887861697569576e-06,
"loss": 0.1069,
"step": 21740
},
{
"epoch": 3.44,
"grad_norm": 3.6921974908448987,
"learning_rate": 4.8699049676988865e-06,
"loss": 0.1212,
"step": 21760
},
{
"epoch": 3.44,
"grad_norm": 1.8420210668496095,
"learning_rate": 4.851970673469894e-06,
"loss": 0.1181,
"step": 21780
},
{
"epoch": 3.45,
"grad_norm": 1.9468098588212996,
"learning_rate": 4.834058893300049e-06,
"loss": 0.1231,
"step": 21800
},
{
"epoch": 3.45,
"grad_norm": 1.6767993315331386,
"learning_rate": 4.8161697055083556e-06,
"loss": 0.1079,
"step": 21820
},
{
"epoch": 3.45,
"grad_norm": 1.4919550050491015,
"learning_rate": 4.798303188315047e-06,
"loss": 0.1056,
"step": 21840
},
{
"epoch": 3.46,
"grad_norm": 1.8485881882655537,
"learning_rate": 4.780459419841213e-06,
"loss": 0.1134,
"step": 21860
},
{
"epoch": 3.46,
"grad_norm": 1.8687690835954356,
"learning_rate": 4.762638478108491e-06,
"loss": 0.1125,
"step": 21880
},
{
"epoch": 3.46,
"grad_norm": 1.797525437431045,
"learning_rate": 4.744840441038697e-06,
"loss": 0.1177,
"step": 21900
},
{
"epoch": 3.47,
"grad_norm": 1.5116159172469752,
"learning_rate": 4.7270653864534985e-06,
"loss": 0.1102,
"step": 21920
},
{
"epoch": 3.47,
"grad_norm": 2.2464407213321316,
"learning_rate": 4.70931339207408e-06,
"loss": 0.1158,
"step": 21940
},
{
"epoch": 3.47,
"grad_norm": 1.9087182621543748,
"learning_rate": 4.691584535520786e-06,
"loss": 0.1033,
"step": 21960
},
{
"epoch": 3.47,
"grad_norm": 1.9564309179723407,
"learning_rate": 4.673878894312794e-06,
"loss": 0.1109,
"step": 21980
},
{
"epoch": 3.48,
"grad_norm": 1.8633572544934949,
"learning_rate": 4.65619654586778e-06,
"loss": 0.1119,
"step": 22000
},
{
"epoch": 3.48,
"grad_norm": 1.6004524664072357,
"learning_rate": 4.638537567501563e-06,
"loss": 0.1171,
"step": 22020
},
{
"epoch": 3.48,
"grad_norm": 1.948022754520626,
"learning_rate": 4.6209020364277765e-06,
"loss": 0.1165,
"step": 22040
},
{
"epoch": 3.49,
"grad_norm": 1.781930626115007,
"learning_rate": 4.603290029757544e-06,
"loss": 0.1244,
"step": 22060
},
{
"epoch": 3.49,
"grad_norm": 2.5054519333972394,
"learning_rate": 4.585701624499111e-06,
"loss": 0.119,
"step": 22080
},
{
"epoch": 3.49,
"grad_norm": 1.8162532618914073,
"learning_rate": 4.5681368975575415e-06,
"loss": 0.1122,
"step": 22100
},
{
"epoch": 3.5,
"grad_norm": 1.878752450338607,
"learning_rate": 4.550595925734358e-06,
"loss": 0.1092,
"step": 22120
},
{
"epoch": 3.5,
"grad_norm": 2.222874957488531,
"learning_rate": 4.533078785727211e-06,
"loss": 0.1185,
"step": 22140
},
{
"epoch": 3.5,
"grad_norm": 1.5316100183886057,
"learning_rate": 4.51558555412956e-06,
"loss": 0.1043,
"step": 22160
},
{
"epoch": 3.51,
"grad_norm": 2.009223649893901,
"learning_rate": 4.498116307430313e-06,
"loss": 0.1084,
"step": 22180
},
{
"epoch": 3.51,
"grad_norm": 1.720831841321149,
"learning_rate": 4.480671122013504e-06,
"loss": 0.1206,
"step": 22200
},
{
"epoch": 3.51,
"grad_norm": 2.0870808528926785,
"learning_rate": 4.46325007415797e-06,
"loss": 0.106,
"step": 22220
},
{
"epoch": 3.52,
"grad_norm": 2.381399967947957,
"learning_rate": 4.445853240037e-06,
"loss": 0.1053,
"step": 22240
},
{
"epoch": 3.52,
"grad_norm": 1.3646238432639344,
"learning_rate": 4.4284806957180036e-06,
"loss": 0.1068,
"step": 22260
},
{
"epoch": 3.52,
"grad_norm": 1.513783073032496,
"learning_rate": 4.411132517162198e-06,
"loss": 0.1107,
"step": 22280
},
{
"epoch": 3.53,
"grad_norm": 1.7493141737786164,
"learning_rate": 4.393808780224242e-06,
"loss": 0.1041,
"step": 22300
},
{
"epoch": 3.53,
"grad_norm": 1.7816743847091314,
"learning_rate": 4.376509560651946e-06,
"loss": 0.11,
"step": 22320
},
{
"epoch": 3.53,
"grad_norm": 2.0147666582118724,
"learning_rate": 4.359234934085902e-06,
"loss": 0.1102,
"step": 22340
},
{
"epoch": 3.53,
"grad_norm": 5.796394595575528,
"learning_rate": 4.341984976059171e-06,
"loss": 0.1064,
"step": 22360
},
{
"epoch": 3.54,
"grad_norm": 1.9445440773945069,
"learning_rate": 4.32475976199696e-06,
"loss": 0.1098,
"step": 22380
},
{
"epoch": 3.54,
"grad_norm": 1.8922501192197436,
"learning_rate": 4.307559367216276e-06,
"loss": 0.1125,
"step": 22400
},
{
"epoch": 3.54,
"grad_norm": 2.080419664595334,
"learning_rate": 4.290383866925604e-06,
"loss": 0.1134,
"step": 22420
},
{
"epoch": 3.55,
"grad_norm": 1.8042606658176292,
"learning_rate": 4.273233336224585e-06,
"loss": 0.1055,
"step": 22440
},
{
"epoch": 3.55,
"grad_norm": 1.8801621900762795,
"learning_rate": 4.256107850103673e-06,
"loss": 0.1095,
"step": 22460
},
{
"epoch": 3.55,
"grad_norm": 1.5322746429777043,
"learning_rate": 4.239007483443814e-06,
"loss": 0.1126,
"step": 22480
},
{
"epoch": 3.56,
"grad_norm": 1.6181831427803501,
"learning_rate": 4.221932311016133e-06,
"loss": 0.1029,
"step": 22500
},
{
"epoch": 3.56,
"grad_norm": 2.294195363726324,
"learning_rate": 4.204882407481577e-06,
"loss": 0.1105,
"step": 22520
},
{
"epoch": 3.56,
"grad_norm": 2.0863431043288907,
"learning_rate": 4.18785784739061e-06,
"loss": 0.1141,
"step": 22540
},
{
"epoch": 3.57,
"grad_norm": 1.9840648138357524,
"learning_rate": 4.1708587051828945e-06,
"loss": 0.1089,
"step": 22560
},
{
"epoch": 3.57,
"grad_norm": 2.1145849243558965,
"learning_rate": 4.153885055186935e-06,
"loss": 0.1152,
"step": 22580
},
{
"epoch": 3.57,
"grad_norm": 1.8533770204902122,
"learning_rate": 4.136936971619776e-06,
"loss": 0.1104,
"step": 22600
},
{
"epoch": 3.58,
"grad_norm": 1.6395678792933663,
"learning_rate": 4.120014528586691e-06,
"loss": 0.1009,
"step": 22620
},
{
"epoch": 3.58,
"grad_norm": 1.5704658395476276,
"learning_rate": 4.103117800080819e-06,
"loss": 0.1142,
"step": 22640
},
{
"epoch": 3.58,
"grad_norm": 1.7585945932665057,
"learning_rate": 4.086246859982867e-06,
"loss": 0.1133,
"step": 22660
},
{
"epoch": 3.59,
"grad_norm": 1.7782676223917826,
"learning_rate": 4.069401782060794e-06,
"loss": 0.1008,
"step": 22680
},
{
"epoch": 3.59,
"grad_norm": 1.620988663174276,
"learning_rate": 4.052582639969466e-06,
"loss": 0.1019,
"step": 22700
},
{
"epoch": 3.59,
"grad_norm": 1.928761433773305,
"learning_rate": 4.035789507250345e-06,
"loss": 0.1085,
"step": 22720
},
{
"epoch": 3.59,
"grad_norm": 1.7712479573455426,
"learning_rate": 4.019022457331172e-06,
"loss": 0.1094,
"step": 22740
},
{
"epoch": 3.6,
"grad_norm": 2.1910464285488986,
"learning_rate": 4.002281563525641e-06,
"loss": 0.1068,
"step": 22760
},
{
"epoch": 3.6,
"grad_norm": 1.4564086565681893,
"learning_rate": 3.985566899033077e-06,
"loss": 0.1139,
"step": 22780
},
{
"epoch": 3.6,
"grad_norm": 1.576080054878554,
"learning_rate": 3.968878536938114e-06,
"loss": 0.1052,
"step": 22800
},
{
"epoch": 3.61,
"grad_norm": 4.14148838470536,
"learning_rate": 3.952216550210391e-06,
"loss": 0.1098,
"step": 22820
},
{
"epoch": 3.61,
"grad_norm": 1.9605161662564978,
"learning_rate": 3.9355810117042095e-06,
"loss": 0.1128,
"step": 22840
},
{
"epoch": 3.61,
"grad_norm": 3.1602653418573614,
"learning_rate": 3.918971994158225e-06,
"loss": 0.1127,
"step": 22860
},
{
"epoch": 3.62,
"grad_norm": 2.0765817978581533,
"learning_rate": 3.9023895701951456e-06,
"loss": 0.113,
"step": 22880
},
{
"epoch": 3.62,
"grad_norm": 1.5711439218576724,
"learning_rate": 3.885833812321384e-06,
"loss": 0.1091,
"step": 22900
},
{
"epoch": 3.62,
"grad_norm": 1.9301946799764145,
"learning_rate": 3.869304792926758e-06,
"loss": 0.1182,
"step": 22920
},
{
"epoch": 3.63,
"grad_norm": 2.224380119641536,
"learning_rate": 3.8528025842841845e-06,
"loss": 0.1038,
"step": 22940
},
{
"epoch": 3.63,
"grad_norm": 1.7738055548862761,
"learning_rate": 3.836327258549335e-06,
"loss": 0.1123,
"step": 22960
},
{
"epoch": 3.63,
"grad_norm": 1.7624596773770125,
"learning_rate": 3.819878887760339e-06,
"loss": 0.107,
"step": 22980
},
{
"epoch": 3.64,
"grad_norm": 1.3869871927397515,
"learning_rate": 3.803457543837479e-06,
"loss": 0.1097,
"step": 23000
},
{
"epoch": 3.64,
"grad_norm": 1.5703344391006682,
"learning_rate": 3.7870632985828465e-06,
"loss": 0.1186,
"step": 23020
},
{
"epoch": 3.64,
"grad_norm": 1.5843117936447395,
"learning_rate": 3.7706962236800443e-06,
"loss": 0.1038,
"step": 23040
},
{
"epoch": 3.65,
"grad_norm": 1.8133782856759533,
"learning_rate": 3.7543563906938945e-06,
"loss": 0.1113,
"step": 23060
},
{
"epoch": 3.65,
"grad_norm": 2.209808276909339,
"learning_rate": 3.7380438710700827e-06,
"loss": 0.116,
"step": 23080
},
{
"epoch": 3.65,
"grad_norm": 1.5640332138261694,
"learning_rate": 3.7217587361348717e-06,
"loss": 0.1175,
"step": 23100
},
{
"epoch": 3.65,
"grad_norm": 1.9605196929464044,
"learning_rate": 3.7055010570947926e-06,
"loss": 0.1133,
"step": 23120
},
{
"epoch": 3.66,
"grad_norm": 1.5058121927392691,
"learning_rate": 3.6892709050363184e-06,
"loss": 0.1027,
"step": 23140
},
{
"epoch": 3.66,
"grad_norm": 1.834472071404119,
"learning_rate": 3.67306835092556e-06,
"loss": 0.1057,
"step": 23160
},
{
"epoch": 3.66,
"grad_norm": 4.783447359079533,
"learning_rate": 3.656893465607966e-06,
"loss": 0.1104,
"step": 23180
},
{
"epoch": 3.67,
"grad_norm": 1.9049505588793905,
"learning_rate": 3.6407463198079955e-06,
"loss": 0.1058,
"step": 23200
},
{
"epoch": 3.67,
"grad_norm": 1.1466847753111398,
"learning_rate": 3.6246269841288185e-06,
"loss": 0.1029,
"step": 23220
},
{
"epoch": 3.67,
"grad_norm": 1.9485277315613068,
"learning_rate": 3.6085355290520026e-06,
"loss": 0.1054,
"step": 23240
},
{
"epoch": 3.68,
"grad_norm": 2.075721440657812,
"learning_rate": 3.592472024937218e-06,
"loss": 0.1103,
"step": 23260
},
{
"epoch": 3.68,
"grad_norm": 2.215528917045134,
"learning_rate": 3.5764365420219106e-06,
"loss": 0.1035,
"step": 23280
},
{
"epoch": 3.68,
"grad_norm": 1.9333486690680328,
"learning_rate": 3.560429150421002e-06,
"loss": 0.1133,
"step": 23300
},
{
"epoch": 3.69,
"grad_norm": 3.178619130298641,
"learning_rate": 3.5444499201265964e-06,
"loss": 0.1113,
"step": 23320
},
{
"epoch": 3.69,
"grad_norm": 2.0658199293351376,
"learning_rate": 3.528498921007653e-06,
"loss": 0.105,
"step": 23340
},
{
"epoch": 3.69,
"grad_norm": 1.7821070968093615,
"learning_rate": 3.5125762228096906e-06,
"loss": 0.1063,
"step": 23360
},
{
"epoch": 3.7,
"grad_norm": 1.7928188818169188,
"learning_rate": 3.4966818951544933e-06,
"loss": 0.1184,
"step": 23380
},
{
"epoch": 3.7,
"grad_norm": 1.3092213506384822,
"learning_rate": 3.480816007539782e-06,
"loss": 0.1059,
"step": 23400
},
{
"epoch": 3.7,
"grad_norm": 2.038627373464944,
"learning_rate": 3.464978629338928e-06,
"loss": 0.1074,
"step": 23420
},
{
"epoch": 3.71,
"grad_norm": 1.731329090501485,
"learning_rate": 3.4491698298006537e-06,
"loss": 0.1179,
"step": 23440
},
{
"epoch": 3.71,
"grad_norm": 2.0442289765237827,
"learning_rate": 3.4333896780487087e-06,
"loss": 0.1115,
"step": 23460
},
{
"epoch": 3.71,
"grad_norm": 1.94777300163772,
"learning_rate": 3.417638243081591e-06,
"loss": 0.1092,
"step": 23480
},
{
"epoch": 3.71,
"grad_norm": 2.497113182347344,
"learning_rate": 3.4019155937722324e-06,
"loss": 0.1134,
"step": 23500
},
{
"epoch": 3.72,
"grad_norm": 1.8816310610695417,
"learning_rate": 3.3862217988676984e-06,
"loss": 0.1106,
"step": 23520
},
{
"epoch": 3.72,
"grad_norm": 2.068099656697655,
"learning_rate": 3.370556926988883e-06,
"loss": 0.1077,
"step": 23540
},
{
"epoch": 3.72,
"grad_norm": 1.6426697114682982,
"learning_rate": 3.3549210466302266e-06,
"loss": 0.1123,
"step": 23560
},
{
"epoch": 3.73,
"grad_norm": 2.1895517808532405,
"learning_rate": 3.3393142261593986e-06,
"loss": 0.107,
"step": 23580
},
{
"epoch": 3.73,
"grad_norm": 1.8676453990584363,
"learning_rate": 3.3237365338169988e-06,
"loss": 0.115,
"step": 23600
},
{
"epoch": 3.73,
"grad_norm": 1.9644120274839887,
"learning_rate": 3.308188037716278e-06,
"loss": 0.1053,
"step": 23620
},
{
"epoch": 3.74,
"grad_norm": 1.8282133553827034,
"learning_rate": 3.2926688058428165e-06,
"loss": 0.1137,
"step": 23640
},
{
"epoch": 3.74,
"grad_norm": 1.8427755722737038,
"learning_rate": 3.2771789060542335e-06,
"loss": 0.115,
"step": 23660
},
{
"epoch": 3.74,
"grad_norm": 1.749664001906615,
"learning_rate": 3.2617184060799117e-06,
"loss": 0.1108,
"step": 23680
},
{
"epoch": 3.75,
"grad_norm": 1.6480471614413341,
"learning_rate": 3.246287373520665e-06,
"loss": 0.1119,
"step": 23700
},
{
"epoch": 3.75,
"grad_norm": 2.0029699208995426,
"learning_rate": 3.2308858758484716e-06,
"loss": 0.1071,
"step": 23720
},
{
"epoch": 3.75,
"grad_norm": 1.7815002131819295,
"learning_rate": 3.2155139804061615e-06,
"loss": 0.11,
"step": 23740
},
{
"epoch": 3.76,
"grad_norm": 2.1270087228729775,
"learning_rate": 3.200171754407139e-06,
"loss": 0.1112,
"step": 23760
},
{
"epoch": 3.76,
"grad_norm": 1.4808186542826025,
"learning_rate": 3.1848592649350725e-06,
"loss": 0.1096,
"step": 23780
},
{
"epoch": 3.76,
"grad_norm": 1.4204592520895922,
"learning_rate": 3.1695765789436066e-06,
"loss": 0.1056,
"step": 23800
},
{
"epoch": 3.77,
"grad_norm": 1.503398462746958,
"learning_rate": 3.1543237632560775e-06,
"loss": 0.1056,
"step": 23820
},
{
"epoch": 3.77,
"grad_norm": 2.139391912894411,
"learning_rate": 3.139100884565209e-06,
"loss": 0.1092,
"step": 23840
},
{
"epoch": 3.77,
"grad_norm": 1.825030650573863,
"learning_rate": 3.123908009432821e-06,
"loss": 0.1068,
"step": 23860
},
{
"epoch": 3.77,
"grad_norm": 1.7801869884600456,
"learning_rate": 3.108745204289557e-06,
"loss": 0.1084,
"step": 23880
},
{
"epoch": 3.78,
"grad_norm": 1.988421541971586,
"learning_rate": 3.0936125354345658e-06,
"loss": 0.1226,
"step": 23900
},
{
"epoch": 3.78,
"grad_norm": 1.8162004251665067,
"learning_rate": 3.0785100690352326e-06,
"loss": 0.1042,
"step": 23920
},
{
"epoch": 3.78,
"grad_norm": 1.768883862501593,
"learning_rate": 3.063437871126885e-06,
"loss": 0.1081,
"step": 23940
},
{
"epoch": 3.79,
"grad_norm": 2.111012367259292,
"learning_rate": 3.0483960076124983e-06,
"loss": 0.1112,
"step": 23960
},
{
"epoch": 3.79,
"grad_norm": 1.4935435579408298,
"learning_rate": 3.033384544262406e-06,
"loss": 0.115,
"step": 23980
},
{
"epoch": 3.79,
"grad_norm": 2.3263165919771964,
"learning_rate": 3.01840354671403e-06,
"loss": 0.1046,
"step": 24000
},
{
"epoch": 3.8,
"grad_norm": 1.8098783277218462,
"learning_rate": 3.0034530804715705e-06,
"loss": 0.1081,
"step": 24020
},
{
"epoch": 3.8,
"grad_norm": 1.7597596637315789,
"learning_rate": 2.98853321090573e-06,
"loss": 0.1057,
"step": 24040
},
{
"epoch": 3.8,
"grad_norm": 1.666663773852063,
"learning_rate": 2.9736440032534385e-06,
"loss": 0.1108,
"step": 24060
},
{
"epoch": 3.81,
"grad_norm": 1.6665436778784075,
"learning_rate": 2.9587855226175448e-06,
"loss": 0.112,
"step": 24080
},
{
"epoch": 3.81,
"grad_norm": 2.0418451840991416,
"learning_rate": 2.943957833966546e-06,
"loss": 0.1077,
"step": 24100
},
{
"epoch": 3.81,
"grad_norm": 1.6205631720668663,
"learning_rate": 2.9291610021343115e-06,
"loss": 0.108,
"step": 24120
},
{
"epoch": 3.82,
"grad_norm": 1.9050108837707873,
"learning_rate": 2.9143950918197785e-06,
"loss": 0.1046,
"step": 24140
},
{
"epoch": 3.82,
"grad_norm": 1.7824903486292831,
"learning_rate": 2.8996601675866813e-06,
"loss": 0.1072,
"step": 24160
},
{
"epoch": 3.82,
"grad_norm": 2.027052783115099,
"learning_rate": 2.884956293863279e-06,
"loss": 0.1149,
"step": 24180
},
{
"epoch": 3.83,
"grad_norm": 1.855350273766843,
"learning_rate": 2.8702835349420523e-06,
"loss": 0.1098,
"step": 24200
},
{
"epoch": 3.83,
"grad_norm": 1.5740339677143422,
"learning_rate": 2.8556419549794333e-06,
"loss": 0.1034,
"step": 24220
},
{
"epoch": 3.83,
"grad_norm": 2.2453458121210463,
"learning_rate": 2.8410316179955244e-06,
"loss": 0.1094,
"step": 24240
},
{
"epoch": 3.83,
"grad_norm": 1.8228216600637548,
"learning_rate": 2.8264525878738293e-06,
"loss": 0.1088,
"step": 24260
},
{
"epoch": 3.84,
"grad_norm": 1.607138848678192,
"learning_rate": 2.811904928360948e-06,
"loss": 0.1053,
"step": 24280
},
{
"epoch": 3.84,
"grad_norm": 1.628362111930888,
"learning_rate": 2.797388703066319e-06,
"loss": 0.1095,
"step": 24300
},
{
"epoch": 3.84,
"grad_norm": 2.214708973384989,
"learning_rate": 2.7829039754619415e-06,
"loss": 0.1135,
"step": 24320
},
{
"epoch": 3.85,
"grad_norm": 1.6042709151574028,
"learning_rate": 2.768450808882078e-06,
"loss": 0.1111,
"step": 24340
},
{
"epoch": 3.85,
"grad_norm": 1.5984980596889142,
"learning_rate": 2.7540292665230025e-06,
"loss": 0.111,
"step": 24360
},
{
"epoch": 3.85,
"grad_norm": 1.6095158783815868,
"learning_rate": 2.7396394114427123e-06,
"loss": 0.1078,
"step": 24380
},
{
"epoch": 3.86,
"grad_norm": 1.454189443910847,
"learning_rate": 2.7252813065606436e-06,
"loss": 0.1062,
"step": 24400
},
{
"epoch": 3.86,
"grad_norm": 1.9801553632489952,
"learning_rate": 2.7109550146574086e-06,
"loss": 0.1088,
"step": 24420
},
{
"epoch": 3.86,
"grad_norm": 2.1777974346251225,
"learning_rate": 2.6966605983745253e-06,
"loss": 0.1173,
"step": 24440
},
{
"epoch": 3.87,
"grad_norm": 1.87606603199586,
"learning_rate": 2.6823981202141273e-06,
"loss": 0.1108,
"step": 24460
},
{
"epoch": 3.87,
"grad_norm": 1.5606771135470763,
"learning_rate": 2.6681676425386993e-06,
"loss": 0.1016,
"step": 24480
},
{
"epoch": 3.87,
"grad_norm": 1.7502281176524994,
"learning_rate": 2.6539692275708127e-06,
"loss": 0.1067,
"step": 24500
},
{
"epoch": 3.88,
"grad_norm": 2.008811138293258,
"learning_rate": 2.639802937392838e-06,
"loss": 0.109,
"step": 24520
},
{
"epoch": 3.88,
"grad_norm": 1.9520261190926087,
"learning_rate": 2.625668833946676e-06,
"loss": 0.1209,
"step": 24540
},
{
"epoch": 3.88,
"grad_norm": 1.4439121369975725,
"learning_rate": 2.611566979033505e-06,
"loss": 0.1064,
"step": 24560
},
{
"epoch": 3.89,
"grad_norm": 1.7599819782808264,
"learning_rate": 2.5974974343134885e-06,
"loss": 0.1089,
"step": 24580
},
{
"epoch": 3.89,
"grad_norm": 1.7500230242713197,
"learning_rate": 2.583460261305509e-06,
"loss": 0.1074,
"step": 24600
},
{
"epoch": 3.89,
"grad_norm": 3.1974584559768333,
"learning_rate": 2.5694555213869183e-06,
"loss": 0.1117,
"step": 24620
},
{
"epoch": 3.9,
"grad_norm": 1.7390586229197151,
"learning_rate": 2.5554832757932433e-06,
"loss": 0.1116,
"step": 24640
},
{
"epoch": 3.9,
"grad_norm": 1.6938332287373814,
"learning_rate": 2.541543585617931e-06,
"loss": 0.1052,
"step": 24660
},
{
"epoch": 3.9,
"grad_norm": 1.495309409972439,
"learning_rate": 2.527636511812089e-06,
"loss": 0.1134,
"step": 24680
},
{
"epoch": 3.9,
"grad_norm": 1.746692256206177,
"learning_rate": 2.5137621151842007e-06,
"loss": 0.108,
"step": 24700
},
{
"epoch": 3.91,
"grad_norm": 1.5082569949536246,
"learning_rate": 2.4999204563998725e-06,
"loss": 0.1101,
"step": 24720
},
{
"epoch": 3.91,
"grad_norm": 1.5181307185486612,
"learning_rate": 2.486111595981562e-06,
"loss": 0.1119,
"step": 24740
},
{
"epoch": 3.91,
"grad_norm": 1.7719768272891112,
"learning_rate": 2.472335594308326e-06,
"loss": 0.1071,
"step": 24760
},
{
"epoch": 3.92,
"grad_norm": 1.5913037244143435,
"learning_rate": 2.4585925116155305e-06,
"loss": 0.1106,
"step": 24780
},
{
"epoch": 3.92,
"grad_norm": 1.864769746920555,
"learning_rate": 2.4448824079946266e-06,
"loss": 0.1091,
"step": 24800
},
{
"epoch": 3.92,
"grad_norm": 2.0660320440726494,
"learning_rate": 2.4312053433928443e-06,
"loss": 0.1107,
"step": 24820
},
{
"epoch": 3.93,
"grad_norm": 1.6415362052910225,
"learning_rate": 2.4175613776129595e-06,
"loss": 0.1091,
"step": 24840
},
{
"epoch": 3.93,
"grad_norm": 1.5581192403320407,
"learning_rate": 2.403950570313019e-06,
"loss": 0.1017,
"step": 24860
},
{
"epoch": 3.93,
"grad_norm": 1.6730861376872064,
"learning_rate": 2.390372981006096e-06,
"loss": 0.1121,
"step": 24880
},
{
"epoch": 3.94,
"grad_norm": 1.934987931521925,
"learning_rate": 2.3768286690600063e-06,
"loss": 0.1119,
"step": 24900
},
{
"epoch": 3.94,
"grad_norm": 2.098112656935134,
"learning_rate": 2.3633176936970632e-06,
"loss": 0.104,
"step": 24920
},
{
"epoch": 3.94,
"grad_norm": 1.6154064008445448,
"learning_rate": 2.3498401139938247e-06,
"loss": 0.1065,
"step": 24940
},
{
"epoch": 3.95,
"grad_norm": 1.7727577822622957,
"learning_rate": 2.336395988880818e-06,
"loss": 0.1065,
"step": 24960
},
{
"epoch": 3.95,
"grad_norm": 1.6932767256088195,
"learning_rate": 2.3229853771422908e-06,
"loss": 0.1066,
"step": 24980
},
{
"epoch": 3.95,
"grad_norm": 2.153938602596105,
"learning_rate": 2.30960833741596e-06,
"loss": 0.1117,
"step": 25000
},
{
"epoch": 3.96,
"grad_norm": 1.746533707527963,
"learning_rate": 2.2962649281927455e-06,
"loss": 0.1099,
"step": 25020
},
{
"epoch": 3.96,
"grad_norm": 2.046943954587584,
"learning_rate": 2.282955207816515e-06,
"loss": 0.1097,
"step": 25040
},
{
"epoch": 3.96,
"grad_norm": 1.4502246583994192,
"learning_rate": 2.269679234483841e-06,
"loss": 0.1098,
"step": 25060
},
{
"epoch": 3.96,
"grad_norm": 1.643479978480647,
"learning_rate": 2.25643706624373e-06,
"loss": 0.1064,
"step": 25080
},
{
"epoch": 3.97,
"grad_norm": 2.0184560603097417,
"learning_rate": 2.243228760997374e-06,
"loss": 0.1121,
"step": 25100
},
{
"epoch": 3.97,
"grad_norm": 1.8261688721984002,
"learning_rate": 2.2300543764979124e-06,
"loss": 0.1049,
"step": 25120
},
{
"epoch": 3.97,
"grad_norm": 2.162498266847831,
"learning_rate": 2.216913970350154e-06,
"loss": 0.1147,
"step": 25140
},
{
"epoch": 3.98,
"grad_norm": 1.6488482067599635,
"learning_rate": 2.2038076000103406e-06,
"loss": 0.1089,
"step": 25160
},
{
"epoch": 3.98,
"grad_norm": 1.872902014324623,
"learning_rate": 2.1907353227858967e-06,
"loss": 0.1098,
"step": 25180
},
{
"epoch": 3.98,
"grad_norm": 1.8844854116910061,
"learning_rate": 2.1776971958351707e-06,
"loss": 0.11,
"step": 25200
},
{
"epoch": 3.99,
"grad_norm": 1.6151501634779615,
"learning_rate": 2.164693276167192e-06,
"loss": 0.1088,
"step": 25220
},
{
"epoch": 3.99,
"grad_norm": 1.861480866736796,
"learning_rate": 2.1517236206414223e-06,
"loss": 0.1142,
"step": 25240
},
{
"epoch": 3.99,
"grad_norm": 2.0671680683079052,
"learning_rate": 2.138788285967496e-06,
"loss": 0.1117,
"step": 25260
},
{
"epoch": 4.0,
"grad_norm": 2.1103107778552617,
"learning_rate": 2.125887328704983e-06,
"loss": 0.1152,
"step": 25280
},
{
"epoch": 4.0,
"grad_norm": 2.4651029665103334,
"learning_rate": 2.1130208052631447e-06,
"loss": 0.1156,
"step": 25300
}
],
"logging_steps": 20,
"max_steps": 31630,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}