bgem3-synthetic-v2-e2 / trainer_state.json
nntoan209's picture
Upload folder using huggingface_hub
b02ed92 verified
raw
history blame contribute delete
No virus
102 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 12652,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 13.192614390744945,
"learning_rate": 4.3780025284450067e-07,
"loss": 0.6539,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 10.26668883317148,
"learning_rate": 6.881163084702909e-07,
"loss": 0.6351,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 8.093364927344371,
"learning_rate": 9.38432364096081e-07,
"loss": 0.621,
"step": 60
},
{
"epoch": 0.01,
"grad_norm": 6.435663053769238,
"learning_rate": 1.188748419721871e-06,
"loss": 0.5702,
"step": 80
},
{
"epoch": 0.02,
"grad_norm": 7.962395022671904,
"learning_rate": 1.4390644753476612e-06,
"loss": 0.5317,
"step": 100
},
{
"epoch": 0.02,
"grad_norm": 6.601932601034603,
"learning_rate": 1.6893805309734515e-06,
"loss": 0.5138,
"step": 120
},
{
"epoch": 0.02,
"grad_norm": 5.565400471086017,
"learning_rate": 1.9396965865992414e-06,
"loss": 0.4772,
"step": 140
},
{
"epoch": 0.03,
"grad_norm": 10.719063890493263,
"learning_rate": 2.1900126422250318e-06,
"loss": 0.4828,
"step": 160
},
{
"epoch": 0.03,
"grad_norm": 5.456905246118198,
"learning_rate": 2.4403286978508217e-06,
"loss": 0.4612,
"step": 180
},
{
"epoch": 0.03,
"grad_norm": 5.074136053502421,
"learning_rate": 2.6906447534766116e-06,
"loss": 0.4614,
"step": 200
},
{
"epoch": 0.03,
"grad_norm": 5.739808379905823,
"learning_rate": 2.940960809102403e-06,
"loss": 0.4455,
"step": 220
},
{
"epoch": 0.04,
"grad_norm": 8.929223129466571,
"learning_rate": 3.1912768647281927e-06,
"loss": 0.4215,
"step": 240
},
{
"epoch": 0.04,
"grad_norm": 5.727490359048043,
"learning_rate": 3.4415929203539826e-06,
"loss": 0.4544,
"step": 260
},
{
"epoch": 0.04,
"grad_norm": 4.692889967552893,
"learning_rate": 3.6919089759797726e-06,
"loss": 0.4362,
"step": 280
},
{
"epoch": 0.05,
"grad_norm": 5.492004651950497,
"learning_rate": 3.942225031605563e-06,
"loss": 0.4102,
"step": 300
},
{
"epoch": 0.05,
"grad_norm": 4.905463477586154,
"learning_rate": 4.192541087231353e-06,
"loss": 0.4163,
"step": 320
},
{
"epoch": 0.05,
"grad_norm": 6.042815351357798,
"learning_rate": 4.442857142857143e-06,
"loss": 0.4043,
"step": 340
},
{
"epoch": 0.06,
"grad_norm": 3.966783411781683,
"learning_rate": 4.693173198482934e-06,
"loss": 0.4134,
"step": 360
},
{
"epoch": 0.06,
"grad_norm": 6.15182506494119,
"learning_rate": 4.943489254108724e-06,
"loss": 0.4095,
"step": 380
},
{
"epoch": 0.06,
"grad_norm": 5.556343872513135,
"learning_rate": 5.193805309734513e-06,
"loss": 0.3952,
"step": 400
},
{
"epoch": 0.07,
"grad_norm": 6.168472163196086,
"learning_rate": 5.4441213653603045e-06,
"loss": 0.4,
"step": 420
},
{
"epoch": 0.07,
"grad_norm": 7.710878876344402,
"learning_rate": 5.6944374209860944e-06,
"loss": 0.4158,
"step": 440
},
{
"epoch": 0.07,
"grad_norm": 10.149729727919098,
"learning_rate": 5.944753476611884e-06,
"loss": 0.4008,
"step": 460
},
{
"epoch": 0.08,
"grad_norm": 5.276529583928554,
"learning_rate": 6.195069532237674e-06,
"loss": 0.3693,
"step": 480
},
{
"epoch": 0.08,
"grad_norm": 5.4922316048860305,
"learning_rate": 6.445385587863464e-06,
"loss": 0.3655,
"step": 500
},
{
"epoch": 0.08,
"grad_norm": 4.834486637416974,
"learning_rate": 6.695701643489254e-06,
"loss": 0.4071,
"step": 520
},
{
"epoch": 0.09,
"grad_norm": 5.109668221648755,
"learning_rate": 6.946017699115044e-06,
"loss": 0.3611,
"step": 540
},
{
"epoch": 0.09,
"grad_norm": 4.962197451653283,
"learning_rate": 7.196333754740835e-06,
"loss": 0.3617,
"step": 560
},
{
"epoch": 0.09,
"grad_norm": 5.725555872687781,
"learning_rate": 7.4466498103666256e-06,
"loss": 0.3938,
"step": 580
},
{
"epoch": 0.09,
"grad_norm": 21.199977745188423,
"learning_rate": 7.696965865992416e-06,
"loss": 0.3753,
"step": 600
},
{
"epoch": 0.1,
"grad_norm": 6.822065444923147,
"learning_rate": 7.947281921618205e-06,
"loss": 0.3732,
"step": 620
},
{
"epoch": 0.1,
"grad_norm": 5.562639041834479,
"learning_rate": 8.197597977243996e-06,
"loss": 0.3641,
"step": 640
},
{
"epoch": 0.1,
"grad_norm": 6.746641948539073,
"learning_rate": 8.447914032869787e-06,
"loss": 0.3608,
"step": 660
},
{
"epoch": 0.11,
"grad_norm": 5.083732630988663,
"learning_rate": 8.698230088495576e-06,
"loss": 0.3489,
"step": 680
},
{
"epoch": 0.11,
"grad_norm": 5.36345764003458,
"learning_rate": 8.948546144121367e-06,
"loss": 0.3495,
"step": 700
},
{
"epoch": 0.11,
"grad_norm": 5.94599708094623,
"learning_rate": 9.198862199747156e-06,
"loss": 0.3568,
"step": 720
},
{
"epoch": 0.12,
"grad_norm": 5.59596830741628,
"learning_rate": 9.449178255372947e-06,
"loss": 0.3424,
"step": 740
},
{
"epoch": 0.12,
"grad_norm": 4.687091250082153,
"learning_rate": 9.699494310998736e-06,
"loss": 0.3373,
"step": 760
},
{
"epoch": 0.12,
"grad_norm": 4.305364261329056,
"learning_rate": 9.949810366624526e-06,
"loss": 0.3863,
"step": 780
},
{
"epoch": 0.13,
"grad_norm": 5.439101060195295,
"learning_rate": 1.0200126422250315e-05,
"loss": 0.3417,
"step": 800
},
{
"epoch": 0.13,
"grad_norm": 5.395373389869628,
"learning_rate": 1.0450442477876108e-05,
"loss": 0.3375,
"step": 820
},
{
"epoch": 0.13,
"grad_norm": 4.781731874205922,
"learning_rate": 1.0700758533501895e-05,
"loss": 0.3296,
"step": 840
},
{
"epoch": 0.14,
"grad_norm": 5.456546617203758,
"learning_rate": 1.0951074589127688e-05,
"loss": 0.3595,
"step": 860
},
{
"epoch": 0.14,
"grad_norm": 5.3912349632260455,
"learning_rate": 1.1201390644753475e-05,
"loss": 0.3597,
"step": 880
},
{
"epoch": 0.14,
"grad_norm": 4.404539817994699,
"learning_rate": 1.1451706700379268e-05,
"loss": 0.3314,
"step": 900
},
{
"epoch": 0.15,
"grad_norm": 7.289102012469942,
"learning_rate": 1.1702022756005057e-05,
"loss": 0.3406,
"step": 920
},
{
"epoch": 0.15,
"grad_norm": 5.786352926324861,
"learning_rate": 1.1952338811630847e-05,
"loss": 0.3509,
"step": 940
},
{
"epoch": 0.15,
"grad_norm": 4.131405105795528,
"learning_rate": 1.220265486725664e-05,
"loss": 0.3459,
"step": 960
},
{
"epoch": 0.15,
"grad_norm": 5.036101661631649,
"learning_rate": 1.2452970922882427e-05,
"loss": 0.3327,
"step": 980
},
{
"epoch": 0.16,
"grad_norm": 4.9031568362075895,
"learning_rate": 1.270328697850822e-05,
"loss": 0.359,
"step": 1000
},
{
"epoch": 0.16,
"grad_norm": 6.680058126596432,
"learning_rate": 1.2953603034134009e-05,
"loss": 0.3421,
"step": 1020
},
{
"epoch": 0.16,
"grad_norm": 6.016889325334172,
"learning_rate": 1.32039190897598e-05,
"loss": 0.3374,
"step": 1040
},
{
"epoch": 0.17,
"grad_norm": 6.844742376412928,
"learning_rate": 1.3454235145385589e-05,
"loss": 0.3237,
"step": 1060
},
{
"epoch": 0.17,
"grad_norm": 5.969902845962769,
"learning_rate": 1.370455120101138e-05,
"loss": 0.3237,
"step": 1080
},
{
"epoch": 0.17,
"grad_norm": 6.2950164096609305,
"learning_rate": 1.3954867256637168e-05,
"loss": 0.3168,
"step": 1100
},
{
"epoch": 0.18,
"grad_norm": 5.2335271803475445,
"learning_rate": 1.4205183312262961e-05,
"loss": 0.3396,
"step": 1120
},
{
"epoch": 0.18,
"grad_norm": 6.225398358080941,
"learning_rate": 1.4455499367888748e-05,
"loss": 0.333,
"step": 1140
},
{
"epoch": 0.18,
"grad_norm": 5.3208556045070114,
"learning_rate": 1.470581542351454e-05,
"loss": 0.3284,
"step": 1160
},
{
"epoch": 0.19,
"grad_norm": 4.7849250581905665,
"learning_rate": 1.495613147914033e-05,
"loss": 0.3304,
"step": 1180
},
{
"epoch": 0.19,
"grad_norm": 5.184459116726432,
"learning_rate": 1.520644753476612e-05,
"loss": 0.3462,
"step": 1200
},
{
"epoch": 0.19,
"grad_norm": 4.19686707405711,
"learning_rate": 1.545676359039191e-05,
"loss": 0.3325,
"step": 1220
},
{
"epoch": 0.2,
"grad_norm": 4.450896337868275,
"learning_rate": 1.57070796460177e-05,
"loss": 0.3057,
"step": 1240
},
{
"epoch": 0.2,
"grad_norm": 4.50018119447218,
"learning_rate": 1.5957395701643493e-05,
"loss": 0.3288,
"step": 1260
},
{
"epoch": 0.2,
"grad_norm": 5.166275173654794,
"learning_rate": 1.620771175726928e-05,
"loss": 0.3167,
"step": 1280
},
{
"epoch": 0.21,
"grad_norm": 5.056697958587676,
"learning_rate": 1.645802781289507e-05,
"loss": 0.3146,
"step": 1300
},
{
"epoch": 0.21,
"grad_norm": 4.3805271657086795,
"learning_rate": 1.670834386852086e-05,
"loss": 0.3441,
"step": 1320
},
{
"epoch": 0.21,
"grad_norm": 4.627520106605279,
"learning_rate": 1.6958659924146653e-05,
"loss": 0.3166,
"step": 1340
},
{
"epoch": 0.21,
"grad_norm": 4.706945749353132,
"learning_rate": 1.7208975979772438e-05,
"loss": 0.3371,
"step": 1360
},
{
"epoch": 0.22,
"grad_norm": 5.9372079877822665,
"learning_rate": 1.745929203539823e-05,
"loss": 0.3343,
"step": 1380
},
{
"epoch": 0.22,
"grad_norm": 4.745434424046423,
"learning_rate": 1.7709608091024023e-05,
"loss": 0.3075,
"step": 1400
},
{
"epoch": 0.22,
"grad_norm": 4.670681702157214,
"learning_rate": 1.7959924146649812e-05,
"loss": 0.3278,
"step": 1420
},
{
"epoch": 0.23,
"grad_norm": 4.609368793030779,
"learning_rate": 1.82102402022756e-05,
"loss": 0.3193,
"step": 1440
},
{
"epoch": 0.23,
"grad_norm": 3.8953514957120987,
"learning_rate": 1.846055625790139e-05,
"loss": 0.3181,
"step": 1460
},
{
"epoch": 0.23,
"grad_norm": 4.048645810385149,
"learning_rate": 1.8710872313527183e-05,
"loss": 0.3178,
"step": 1480
},
{
"epoch": 0.24,
"grad_norm": 4.039461038727622,
"learning_rate": 1.8961188369152972e-05,
"loss": 0.3101,
"step": 1500
},
{
"epoch": 0.24,
"grad_norm": 4.865092063730574,
"learning_rate": 1.9211504424778764e-05,
"loss": 0.3145,
"step": 1520
},
{
"epoch": 0.24,
"grad_norm": 4.320223981560967,
"learning_rate": 1.9461820480404553e-05,
"loss": 0.3188,
"step": 1540
},
{
"epoch": 0.25,
"grad_norm": 4.24607218830772,
"learning_rate": 1.9712136536030343e-05,
"loss": 0.3316,
"step": 1560
},
{
"epoch": 0.25,
"grad_norm": 6.659098557418422,
"learning_rate": 1.996245259165613e-05,
"loss": 0.3103,
"step": 1580
},
{
"epoch": 0.25,
"grad_norm": 4.4003401278033545,
"learning_rate": 1.9999982293200938e-05,
"loss": 0.3316,
"step": 1600
},
{
"epoch": 0.26,
"grad_norm": 4.466923266959081,
"learning_rate": 1.999992108459333e-05,
"loss": 0.3149,
"step": 1620
},
{
"epoch": 0.26,
"grad_norm": 4.3370197217130935,
"learning_rate": 1.9999816155842287e-05,
"loss": 0.3267,
"step": 1640
},
{
"epoch": 0.26,
"grad_norm": 4.928522761323794,
"learning_rate": 1.9999667507406614e-05,
"loss": 0.3334,
"step": 1660
},
{
"epoch": 0.27,
"grad_norm": 21.27678616799708,
"learning_rate": 1.9999475139936266e-05,
"loss": 0.31,
"step": 1680
},
{
"epoch": 0.27,
"grad_norm": 9.092160010538011,
"learning_rate": 1.9999239054272376e-05,
"loss": 0.3219,
"step": 1700
},
{
"epoch": 0.27,
"grad_norm": 4.287462168712924,
"learning_rate": 1.9998959251447223e-05,
"loss": 0.2765,
"step": 1720
},
{
"epoch": 0.28,
"grad_norm": 4.200006739293095,
"learning_rate": 1.9998635732684236e-05,
"loss": 0.3187,
"step": 1740
},
{
"epoch": 0.28,
"grad_norm": 4.182543197928632,
"learning_rate": 1.9998268499398e-05,
"loss": 0.3038,
"step": 1760
},
{
"epoch": 0.28,
"grad_norm": 4.860519421610365,
"learning_rate": 1.999785755319424e-05,
"loss": 0.3195,
"step": 1780
},
{
"epoch": 0.28,
"grad_norm": 4.6816897176843,
"learning_rate": 1.9997402895869806e-05,
"loss": 0.3122,
"step": 1800
},
{
"epoch": 0.29,
"grad_norm": 4.765263778254334,
"learning_rate": 1.9996904529412684e-05,
"loss": 0.3077,
"step": 1820
},
{
"epoch": 0.29,
"grad_norm": 3.8426131141111215,
"learning_rate": 1.999636245600198e-05,
"loss": 0.3042,
"step": 1840
},
{
"epoch": 0.29,
"grad_norm": 7.697235889469334,
"learning_rate": 1.9995776678007892e-05,
"loss": 0.3219,
"step": 1860
},
{
"epoch": 0.3,
"grad_norm": 5.3692702870620455,
"learning_rate": 1.9995147197991732e-05,
"loss": 0.3319,
"step": 1880
},
{
"epoch": 0.3,
"grad_norm": 6.216633345418337,
"learning_rate": 1.9994474018705895e-05,
"loss": 0.3059,
"step": 1900
},
{
"epoch": 0.3,
"grad_norm": 5.783165856537296,
"learning_rate": 1.9993757143093847e-05,
"loss": 0.3011,
"step": 1920
},
{
"epoch": 0.31,
"grad_norm": 4.96412779585292,
"learning_rate": 1.999299657429011e-05,
"loss": 0.3267,
"step": 1940
},
{
"epoch": 0.31,
"grad_norm": 4.584783821625622,
"learning_rate": 1.9992192315620268e-05,
"loss": 0.3186,
"step": 1960
},
{
"epoch": 0.31,
"grad_norm": 3.966800361004438,
"learning_rate": 1.9991344370600926e-05,
"loss": 0.3037,
"step": 1980
},
{
"epoch": 0.32,
"grad_norm": 5.549652693595451,
"learning_rate": 1.9990452742939716e-05,
"loss": 0.3031,
"step": 2000
},
{
"epoch": 0.32,
"grad_norm": 4.092592964025307,
"learning_rate": 1.9989517436535264e-05,
"loss": 0.299,
"step": 2020
},
{
"epoch": 0.32,
"grad_norm": 7.4143606594255385,
"learning_rate": 1.9988538455477186e-05,
"loss": 0.3212,
"step": 2040
},
{
"epoch": 0.33,
"grad_norm": 4.429279288399913,
"learning_rate": 1.9987515804046065e-05,
"loss": 0.2983,
"step": 2060
},
{
"epoch": 0.33,
"grad_norm": 4.168171471024823,
"learning_rate": 1.9986449486713425e-05,
"loss": 0.2905,
"step": 2080
},
{
"epoch": 0.33,
"grad_norm": 4.023053730559626,
"learning_rate": 1.998533950814173e-05,
"loss": 0.3202,
"step": 2100
},
{
"epoch": 0.34,
"grad_norm": 4.589556797878434,
"learning_rate": 1.998418587318434e-05,
"loss": 0.2929,
"step": 2120
},
{
"epoch": 0.34,
"grad_norm": 4.736297519231011,
"learning_rate": 1.9982988586885513e-05,
"loss": 0.3192,
"step": 2140
},
{
"epoch": 0.34,
"grad_norm": 3.2489559257150242,
"learning_rate": 1.9981747654480363e-05,
"loss": 0.32,
"step": 2160
},
{
"epoch": 0.34,
"grad_norm": 4.901187690508077,
"learning_rate": 1.9980463081394853e-05,
"loss": 0.2987,
"step": 2180
},
{
"epoch": 0.35,
"grad_norm": 5.700830285351671,
"learning_rate": 1.9979134873245754e-05,
"loss": 0.2866,
"step": 2200
},
{
"epoch": 0.35,
"grad_norm": 5.280958720644886,
"learning_rate": 1.9977763035840647e-05,
"loss": 0.308,
"step": 2220
},
{
"epoch": 0.35,
"grad_norm": 3.658172362891215,
"learning_rate": 1.9976347575177864e-05,
"loss": 0.3134,
"step": 2240
},
{
"epoch": 0.36,
"grad_norm": 3.649278564224255,
"learning_rate": 1.9974888497446493e-05,
"loss": 0.2728,
"step": 2260
},
{
"epoch": 0.36,
"grad_norm": 4.213688731917936,
"learning_rate": 1.9973385809026328e-05,
"loss": 0.2954,
"step": 2280
},
{
"epoch": 0.36,
"grad_norm": 4.290870493890129,
"learning_rate": 1.997183951648785e-05,
"loss": 0.3041,
"step": 2300
},
{
"epoch": 0.37,
"grad_norm": 4.497146973593326,
"learning_rate": 1.9970249626592207e-05,
"loss": 0.3039,
"step": 2320
},
{
"epoch": 0.37,
"grad_norm": 3.980598125657535,
"learning_rate": 1.9968616146291173e-05,
"loss": 0.2876,
"step": 2340
},
{
"epoch": 0.37,
"grad_norm": 4.845633060759088,
"learning_rate": 1.9966939082727113e-05,
"loss": 0.2859,
"step": 2360
},
{
"epoch": 0.38,
"grad_norm": 5.0017279500150655,
"learning_rate": 1.9965218443232964e-05,
"loss": 0.2949,
"step": 2380
},
{
"epoch": 0.38,
"grad_norm": 4.812496829341488,
"learning_rate": 1.9963454235332197e-05,
"loss": 0.3043,
"step": 2400
},
{
"epoch": 0.38,
"grad_norm": 4.196149632421477,
"learning_rate": 1.996164646673879e-05,
"loss": 0.2917,
"step": 2420
},
{
"epoch": 0.39,
"grad_norm": 3.8759357184009477,
"learning_rate": 1.9959795145357187e-05,
"loss": 0.2788,
"step": 2440
},
{
"epoch": 0.39,
"grad_norm": 3.840684986809998,
"learning_rate": 1.995790027928226e-05,
"loss": 0.2877,
"step": 2460
},
{
"epoch": 0.39,
"grad_norm": 5.2172036078456605,
"learning_rate": 1.9955961876799288e-05,
"loss": 0.283,
"step": 2480
},
{
"epoch": 0.4,
"grad_norm": 5.656431416505367,
"learning_rate": 1.995397994638391e-05,
"loss": 0.2867,
"step": 2500
},
{
"epoch": 0.4,
"grad_norm": 3.9781055866495945,
"learning_rate": 1.9951954496702084e-05,
"loss": 0.2842,
"step": 2520
},
{
"epoch": 0.4,
"grad_norm": 3.6294521746502917,
"learning_rate": 1.994988553661007e-05,
"loss": 0.2672,
"step": 2540
},
{
"epoch": 0.4,
"grad_norm": 4.51151201839679,
"learning_rate": 1.9947773075154352e-05,
"loss": 0.2904,
"step": 2560
},
{
"epoch": 0.41,
"grad_norm": 4.985296947393056,
"learning_rate": 1.9945617121571655e-05,
"loss": 0.2999,
"step": 2580
},
{
"epoch": 0.41,
"grad_norm": 3.674752134930168,
"learning_rate": 1.9943417685288848e-05,
"loss": 0.2785,
"step": 2600
},
{
"epoch": 0.41,
"grad_norm": 4.797788651028868,
"learning_rate": 1.9941174775922932e-05,
"loss": 0.2983,
"step": 2620
},
{
"epoch": 0.42,
"grad_norm": 4.079696177221718,
"learning_rate": 1.9938888403281006e-05,
"loss": 0.2777,
"step": 2640
},
{
"epoch": 0.42,
"grad_norm": 3.3610527639517755,
"learning_rate": 1.9936558577360198e-05,
"loss": 0.2956,
"step": 2660
},
{
"epoch": 0.42,
"grad_norm": 7.188852464890547,
"learning_rate": 1.993418530834764e-05,
"loss": 0.2829,
"step": 2680
},
{
"epoch": 0.43,
"grad_norm": 3.360659331617603,
"learning_rate": 1.993176860662041e-05,
"loss": 0.292,
"step": 2700
},
{
"epoch": 0.43,
"grad_norm": 4.107038754867596,
"learning_rate": 1.9929308482745514e-05,
"loss": 0.2694,
"step": 2720
},
{
"epoch": 0.43,
"grad_norm": 3.9890648859407647,
"learning_rate": 1.9926804947479808e-05,
"loss": 0.2962,
"step": 2740
},
{
"epoch": 0.44,
"grad_norm": 4.103890708256313,
"learning_rate": 1.9924258011769957e-05,
"loss": 0.305,
"step": 2760
},
{
"epoch": 0.44,
"grad_norm": 3.6344296373375182,
"learning_rate": 1.9921667686752412e-05,
"loss": 0.2868,
"step": 2780
},
{
"epoch": 0.44,
"grad_norm": 4.132484812446097,
"learning_rate": 1.9919033983753325e-05,
"loss": 0.3088,
"step": 2800
},
{
"epoch": 0.45,
"grad_norm": 4.602803346333572,
"learning_rate": 1.991635691428853e-05,
"loss": 0.2908,
"step": 2820
},
{
"epoch": 0.45,
"grad_norm": 3.553835454417207,
"learning_rate": 1.9913636490063475e-05,
"loss": 0.2959,
"step": 2840
},
{
"epoch": 0.45,
"grad_norm": 3.3951346647945857,
"learning_rate": 1.991087272297318e-05,
"loss": 0.2857,
"step": 2860
},
{
"epoch": 0.46,
"grad_norm": 5.5117950542313014,
"learning_rate": 1.9908065625102174e-05,
"loss": 0.3072,
"step": 2880
},
{
"epoch": 0.46,
"grad_norm": 4.238784802315611,
"learning_rate": 1.9905215208724454e-05,
"loss": 0.2781,
"step": 2900
},
{
"epoch": 0.46,
"grad_norm": 3.5164479288590287,
"learning_rate": 1.990232148630343e-05,
"loss": 0.2794,
"step": 2920
},
{
"epoch": 0.46,
"grad_norm": 4.085534858193557,
"learning_rate": 1.9899384470491854e-05,
"loss": 0.2858,
"step": 2940
},
{
"epoch": 0.47,
"grad_norm": 3.5369416675799563,
"learning_rate": 1.98964041741318e-05,
"loss": 0.2672,
"step": 2960
},
{
"epoch": 0.47,
"grad_norm": 4.60995552157379,
"learning_rate": 1.989338061025456e-05,
"loss": 0.2868,
"step": 2980
},
{
"epoch": 0.47,
"grad_norm": 3.3514909358709444,
"learning_rate": 1.989031379208063e-05,
"loss": 0.2723,
"step": 3000
},
{
"epoch": 0.48,
"grad_norm": 3.923203323163403,
"learning_rate": 1.9887203733019632e-05,
"loss": 0.28,
"step": 3020
},
{
"epoch": 0.48,
"grad_norm": 3.429027033234547,
"learning_rate": 1.9884050446670256e-05,
"loss": 0.2952,
"step": 3040
},
{
"epoch": 0.48,
"grad_norm": 3.5801457858104397,
"learning_rate": 1.9880853946820197e-05,
"loss": 0.2804,
"step": 3060
},
{
"epoch": 0.49,
"grad_norm": 4.712617033863597,
"learning_rate": 1.9877614247446116e-05,
"loss": 0.2892,
"step": 3080
},
{
"epoch": 0.49,
"grad_norm": 4.933700451637125,
"learning_rate": 1.987433136271354e-05,
"loss": 0.2717,
"step": 3100
},
{
"epoch": 0.49,
"grad_norm": 4.790425988609337,
"learning_rate": 1.9871005306976846e-05,
"loss": 0.2695,
"step": 3120
},
{
"epoch": 0.5,
"grad_norm": 3.7466637075374045,
"learning_rate": 1.9867636094779166e-05,
"loss": 0.2624,
"step": 3140
},
{
"epoch": 0.5,
"grad_norm": 3.50365965112591,
"learning_rate": 1.9864223740852334e-05,
"loss": 0.2844,
"step": 3160
},
{
"epoch": 0.5,
"grad_norm": 4.569034860813186,
"learning_rate": 1.9860768260116815e-05,
"loss": 0.2905,
"step": 3180
},
{
"epoch": 0.51,
"grad_norm": 4.170649316306094,
"learning_rate": 1.9857269667681655e-05,
"loss": 0.2674,
"step": 3200
},
{
"epoch": 0.51,
"grad_norm": 4.434905247240572,
"learning_rate": 1.98537279788444e-05,
"loss": 0.2794,
"step": 3220
},
{
"epoch": 0.51,
"grad_norm": 3.9321442768019623,
"learning_rate": 1.9850143209091034e-05,
"loss": 0.2881,
"step": 3240
},
{
"epoch": 0.52,
"grad_norm": 5.363885077307175,
"learning_rate": 1.9846515374095914e-05,
"loss": 0.2858,
"step": 3260
},
{
"epoch": 0.52,
"grad_norm": 4.191427182787896,
"learning_rate": 1.98428444897217e-05,
"loss": 0.2816,
"step": 3280
},
{
"epoch": 0.52,
"grad_norm": 4.006135526500501,
"learning_rate": 1.983913057201928e-05,
"loss": 0.2755,
"step": 3300
},
{
"epoch": 0.52,
"grad_norm": 3.8000033328393075,
"learning_rate": 1.9835373637227703e-05,
"loss": 0.2733,
"step": 3320
},
{
"epoch": 0.53,
"grad_norm": 5.333992420638842,
"learning_rate": 1.9831573701774123e-05,
"loss": 0.2779,
"step": 3340
},
{
"epoch": 0.53,
"grad_norm": 4.252307489826801,
"learning_rate": 1.9827730782273703e-05,
"loss": 0.2592,
"step": 3360
},
{
"epoch": 0.53,
"grad_norm": 3.525549279021786,
"learning_rate": 1.982384489552955e-05,
"loss": 0.2739,
"step": 3380
},
{
"epoch": 0.54,
"grad_norm": 7.6522667312835475,
"learning_rate": 1.9819916058532657e-05,
"loss": 0.2816,
"step": 3400
},
{
"epoch": 0.54,
"grad_norm": 11.523203122616478,
"learning_rate": 1.98159442884618e-05,
"loss": 0.2715,
"step": 3420
},
{
"epoch": 0.54,
"grad_norm": 4.787543449503492,
"learning_rate": 1.9811929602683497e-05,
"loss": 0.2618,
"step": 3440
},
{
"epoch": 0.55,
"grad_norm": 5.207304138221566,
"learning_rate": 1.9807872018751904e-05,
"loss": 0.254,
"step": 3460
},
{
"epoch": 0.55,
"grad_norm": 3.4061625058197453,
"learning_rate": 1.9803771554408745e-05,
"loss": 0.2526,
"step": 3480
},
{
"epoch": 0.55,
"grad_norm": 4.380956899549788,
"learning_rate": 1.9799628227583248e-05,
"loss": 0.2797,
"step": 3500
},
{
"epoch": 0.56,
"grad_norm": 4.127558290565574,
"learning_rate": 1.9795442056392054e-05,
"loss": 0.2712,
"step": 3520
},
{
"epoch": 0.56,
"grad_norm": 4.10767355386799,
"learning_rate": 1.9791213059139132e-05,
"loss": 0.2832,
"step": 3540
},
{
"epoch": 0.56,
"grad_norm": 3.662712320155948,
"learning_rate": 1.978694125431572e-05,
"loss": 0.2663,
"step": 3560
},
{
"epoch": 0.57,
"grad_norm": 3.8370942587808607,
"learning_rate": 1.978262666060022e-05,
"loss": 0.2796,
"step": 3580
},
{
"epoch": 0.57,
"grad_norm": 3.3559744723580143,
"learning_rate": 1.9778269296858138e-05,
"loss": 0.2758,
"step": 3600
},
{
"epoch": 0.57,
"grad_norm": 4.912706342639339,
"learning_rate": 1.977386918214198e-05,
"loss": 0.2691,
"step": 3620
},
{
"epoch": 0.58,
"grad_norm": 3.827769733993231,
"learning_rate": 1.9769426335691194e-05,
"loss": 0.2888,
"step": 3640
},
{
"epoch": 0.58,
"grad_norm": 8.974780882814423,
"learning_rate": 1.9764940776932057e-05,
"loss": 0.279,
"step": 3660
},
{
"epoch": 0.58,
"grad_norm": 7.288399652923296,
"learning_rate": 1.9760412525477615e-05,
"loss": 0.27,
"step": 3680
},
{
"epoch": 0.58,
"grad_norm": 5.013042224656741,
"learning_rate": 1.9755841601127587e-05,
"loss": 0.2635,
"step": 3700
},
{
"epoch": 0.59,
"grad_norm": 4.113083011382156,
"learning_rate": 1.9751228023868275e-05,
"loss": 0.2742,
"step": 3720
},
{
"epoch": 0.59,
"grad_norm": 3.8810313877329516,
"learning_rate": 1.974657181387248e-05,
"loss": 0.2608,
"step": 3740
},
{
"epoch": 0.59,
"grad_norm": 4.321527293044816,
"learning_rate": 1.974187299149942e-05,
"loss": 0.2642,
"step": 3760
},
{
"epoch": 0.6,
"grad_norm": 4.507987668552812,
"learning_rate": 1.973713157729462e-05,
"loss": 0.2839,
"step": 3780
},
{
"epoch": 0.6,
"grad_norm": 4.112442322300466,
"learning_rate": 1.9732347591989863e-05,
"loss": 0.2767,
"step": 3800
},
{
"epoch": 0.6,
"grad_norm": 3.9228516840537075,
"learning_rate": 1.972752105650304e-05,
"loss": 0.2516,
"step": 3820
},
{
"epoch": 0.61,
"grad_norm": 3.9874693579378344,
"learning_rate": 1.972265199193813e-05,
"loss": 0.2875,
"step": 3840
},
{
"epoch": 0.61,
"grad_norm": 5.032079024162868,
"learning_rate": 1.9717740419585033e-05,
"loss": 0.266,
"step": 3860
},
{
"epoch": 0.61,
"grad_norm": 3.5996549440957164,
"learning_rate": 1.9712786360919543e-05,
"loss": 0.2548,
"step": 3880
},
{
"epoch": 0.62,
"grad_norm": 4.683794515218512,
"learning_rate": 1.9707789837603205e-05,
"loss": 0.2701,
"step": 3900
},
{
"epoch": 0.62,
"grad_norm": 4.52037247566233,
"learning_rate": 1.9702750871483248e-05,
"loss": 0.261,
"step": 3920
},
{
"epoch": 0.62,
"grad_norm": 4.5693573304091535,
"learning_rate": 1.9697669484592487e-05,
"loss": 0.2568,
"step": 3940
},
{
"epoch": 0.63,
"grad_norm": 3.48060438103784,
"learning_rate": 1.9692545699149212e-05,
"loss": 0.2545,
"step": 3960
},
{
"epoch": 0.63,
"grad_norm": 4.719616538813032,
"learning_rate": 1.9687379537557107e-05,
"loss": 0.2676,
"step": 3980
},
{
"epoch": 0.63,
"grad_norm": 4.593894281521921,
"learning_rate": 1.9682171022405133e-05,
"loss": 0.2803,
"step": 4000
},
{
"epoch": 0.64,
"grad_norm": 3.2767058377982203,
"learning_rate": 1.967692017646746e-05,
"loss": 0.258,
"step": 4020
},
{
"epoch": 0.64,
"grad_norm": 4.102789530001537,
"learning_rate": 1.9671627022703333e-05,
"loss": 0.2804,
"step": 4040
},
{
"epoch": 0.64,
"grad_norm": 3.747557671264137,
"learning_rate": 1.9666291584256995e-05,
"loss": 0.275,
"step": 4060
},
{
"epoch": 0.64,
"grad_norm": 4.527128041668808,
"learning_rate": 1.9660913884457572e-05,
"loss": 0.2637,
"step": 4080
},
{
"epoch": 0.65,
"grad_norm": 6.26616500848599,
"learning_rate": 1.965549394681899e-05,
"loss": 0.2769,
"step": 4100
},
{
"epoch": 0.65,
"grad_norm": 4.208333585128525,
"learning_rate": 1.9650031795039847e-05,
"loss": 0.2506,
"step": 4120
},
{
"epoch": 0.65,
"grad_norm": 3.770191763293591,
"learning_rate": 1.9644527453003326e-05,
"loss": 0.2716,
"step": 4140
},
{
"epoch": 0.66,
"grad_norm": 8.625265366162292,
"learning_rate": 1.9638980944777085e-05,
"loss": 0.2831,
"step": 4160
},
{
"epoch": 0.66,
"grad_norm": 3.9442544557070285,
"learning_rate": 1.9633392294613155e-05,
"loss": 0.2777,
"step": 4180
},
{
"epoch": 0.66,
"grad_norm": 3.273256865914117,
"learning_rate": 1.962776152694783e-05,
"loss": 0.2507,
"step": 4200
},
{
"epoch": 0.67,
"grad_norm": 3.7930612759054445,
"learning_rate": 1.9622088666401566e-05,
"loss": 0.2545,
"step": 4220
},
{
"epoch": 0.67,
"grad_norm": 3.605998255572159,
"learning_rate": 1.9616373737778864e-05,
"loss": 0.2718,
"step": 4240
},
{
"epoch": 0.67,
"grad_norm": 3.3060364912064966,
"learning_rate": 1.961061676606817e-05,
"loss": 0.2615,
"step": 4260
},
{
"epoch": 0.68,
"grad_norm": 3.180581756735597,
"learning_rate": 1.9604817776441762e-05,
"loss": 0.2475,
"step": 4280
},
{
"epoch": 0.68,
"grad_norm": 4.109627023373317,
"learning_rate": 1.9598976794255647e-05,
"loss": 0.2781,
"step": 4300
},
{
"epoch": 0.68,
"grad_norm": 3.8246626162863007,
"learning_rate": 1.9593093845049435e-05,
"loss": 0.2797,
"step": 4320
},
{
"epoch": 0.69,
"grad_norm": 3.880680350794998,
"learning_rate": 1.9587168954546233e-05,
"loss": 0.2728,
"step": 4340
},
{
"epoch": 0.69,
"grad_norm": 3.504579545295932,
"learning_rate": 1.9581202148652555e-05,
"loss": 0.2517,
"step": 4360
},
{
"epoch": 0.69,
"grad_norm": 3.251503083301662,
"learning_rate": 1.957519345345817e-05,
"loss": 0.234,
"step": 4380
},
{
"epoch": 0.7,
"grad_norm": 3.8245770887559467,
"learning_rate": 1.9569142895236014e-05,
"loss": 0.2691,
"step": 4400
},
{
"epoch": 0.7,
"grad_norm": 3.7480270281848163,
"learning_rate": 1.9563050500442067e-05,
"loss": 0.2563,
"step": 4420
},
{
"epoch": 0.7,
"grad_norm": 3.6499757321739805,
"learning_rate": 1.9556916295715248e-05,
"loss": 0.2599,
"step": 4440
},
{
"epoch": 0.71,
"grad_norm": 4.289401570798809,
"learning_rate": 1.955074030787727e-05,
"loss": 0.2494,
"step": 4460
},
{
"epoch": 0.71,
"grad_norm": 4.463444688322885,
"learning_rate": 1.9544522563932567e-05,
"loss": 0.2503,
"step": 4480
},
{
"epoch": 0.71,
"grad_norm": 3.4818768633740076,
"learning_rate": 1.953826309106813e-05,
"loss": 0.2569,
"step": 4500
},
{
"epoch": 0.71,
"grad_norm": 4.094470710503364,
"learning_rate": 1.9531961916653416e-05,
"loss": 0.2722,
"step": 4520
},
{
"epoch": 0.72,
"grad_norm": 4.500248160407271,
"learning_rate": 1.9525619068240227e-05,
"loss": 0.2525,
"step": 4540
},
{
"epoch": 0.72,
"grad_norm": 3.482370390112968,
"learning_rate": 1.951923457356258e-05,
"loss": 0.2534,
"step": 4560
},
{
"epoch": 0.72,
"grad_norm": 5.274501845715762,
"learning_rate": 1.9512808460536586e-05,
"loss": 0.2454,
"step": 4580
},
{
"epoch": 0.73,
"grad_norm": 4.177646944576608,
"learning_rate": 1.9506340757260332e-05,
"loss": 0.2474,
"step": 4600
},
{
"epoch": 0.73,
"grad_norm": 3.56021633905292,
"learning_rate": 1.9499831492013772e-05,
"loss": 0.2744,
"step": 4620
},
{
"epoch": 0.73,
"grad_norm": 3.485877700901466,
"learning_rate": 1.9493280693258565e-05,
"loss": 0.2741,
"step": 4640
},
{
"epoch": 0.74,
"grad_norm": 4.495079741486709,
"learning_rate": 1.9486688389637993e-05,
"loss": 0.2399,
"step": 4660
},
{
"epoch": 0.74,
"grad_norm": 3.879830579696546,
"learning_rate": 1.9480054609976815e-05,
"loss": 0.2719,
"step": 4680
},
{
"epoch": 0.74,
"grad_norm": 3.683250176597202,
"learning_rate": 1.9473379383281136e-05,
"loss": 0.2647,
"step": 4700
},
{
"epoch": 0.75,
"grad_norm": 3.4564441591881585,
"learning_rate": 1.9466662738738295e-05,
"loss": 0.2614,
"step": 4720
},
{
"epoch": 0.75,
"grad_norm": 3.5430190851406516,
"learning_rate": 1.945990470571672e-05,
"loss": 0.2509,
"step": 4740
},
{
"epoch": 0.75,
"grad_norm": 3.594438702478659,
"learning_rate": 1.945310531376582e-05,
"loss": 0.249,
"step": 4760
},
{
"epoch": 0.76,
"grad_norm": 3.765747448208518,
"learning_rate": 1.944626459261585e-05,
"loss": 0.2503,
"step": 4780
},
{
"epoch": 0.76,
"grad_norm": 4.2901200899686405,
"learning_rate": 1.9439382572177755e-05,
"loss": 0.2424,
"step": 4800
},
{
"epoch": 0.76,
"grad_norm": 3.476240164568998,
"learning_rate": 1.9432459282543085e-05,
"loss": 0.264,
"step": 4820
},
{
"epoch": 0.77,
"grad_norm": 3.670630970694611,
"learning_rate": 1.942549475398382e-05,
"loss": 0.2663,
"step": 4840
},
{
"epoch": 0.77,
"grad_norm": 3.7957980665515034,
"learning_rate": 1.941848901695227e-05,
"loss": 0.2621,
"step": 4860
},
{
"epoch": 0.77,
"grad_norm": 4.793895081633612,
"learning_rate": 1.9411442102080914e-05,
"loss": 0.2498,
"step": 4880
},
{
"epoch": 0.77,
"grad_norm": 3.464785397972565,
"learning_rate": 1.9404354040182302e-05,
"loss": 0.242,
"step": 4900
},
{
"epoch": 0.78,
"grad_norm": 3.486787653538589,
"learning_rate": 1.9397224862248875e-05,
"loss": 0.2623,
"step": 4920
},
{
"epoch": 0.78,
"grad_norm": 4.089389487964841,
"learning_rate": 1.939005459945287e-05,
"loss": 0.2543,
"step": 4940
},
{
"epoch": 0.78,
"grad_norm": 3.1762519290562414,
"learning_rate": 1.9382843283146158e-05,
"loss": 0.2654,
"step": 4960
},
{
"epoch": 0.79,
"grad_norm": 3.835699443072975,
"learning_rate": 1.937559094486012e-05,
"loss": 0.2743,
"step": 4980
},
{
"epoch": 0.79,
"grad_norm": 3.8852244202589334,
"learning_rate": 1.936829761630551e-05,
"loss": 0.2629,
"step": 5000
},
{
"epoch": 0.79,
"grad_norm": 3.1906813658574835,
"learning_rate": 1.93609633293723e-05,
"loss": 0.2417,
"step": 5020
},
{
"epoch": 0.8,
"grad_norm": 2.3269103292999986,
"learning_rate": 1.9353588116129556e-05,
"loss": 0.218,
"step": 5040
},
{
"epoch": 0.8,
"grad_norm": 3.2694392219817927,
"learning_rate": 1.9346172008825302e-05,
"loss": 0.2321,
"step": 5060
},
{
"epoch": 0.8,
"grad_norm": 2.7636752394800634,
"learning_rate": 1.9338715039886357e-05,
"loss": 0.2112,
"step": 5080
},
{
"epoch": 0.81,
"grad_norm": 1.7461733533221422,
"learning_rate": 1.9331217241918223e-05,
"loss": 0.2065,
"step": 5100
},
{
"epoch": 0.81,
"grad_norm": 2.1112769700310166,
"learning_rate": 1.9323678647704908e-05,
"loss": 0.2178,
"step": 5120
},
{
"epoch": 0.81,
"grad_norm": 1.928506961128503,
"learning_rate": 1.9316099290208812e-05,
"loss": 0.2222,
"step": 5140
},
{
"epoch": 0.82,
"grad_norm": 2.289733371787946,
"learning_rate": 1.9308479202570575e-05,
"loss": 0.2331,
"step": 5160
},
{
"epoch": 0.82,
"grad_norm": 1.997736424764338,
"learning_rate": 1.9300818418108923e-05,
"loss": 0.2133,
"step": 5180
},
{
"epoch": 0.82,
"grad_norm": 2.260897208845383,
"learning_rate": 1.9293116970320528e-05,
"loss": 0.2092,
"step": 5200
},
{
"epoch": 0.83,
"grad_norm": 2.01842381348837,
"learning_rate": 1.928537489287986e-05,
"loss": 0.2189,
"step": 5220
},
{
"epoch": 0.83,
"grad_norm": 2.377313290393669,
"learning_rate": 1.927759221963905e-05,
"loss": 0.2374,
"step": 5240
},
{
"epoch": 0.83,
"grad_norm": 2.431465602229034,
"learning_rate": 1.926976898462773e-05,
"loss": 0.2219,
"step": 5260
},
{
"epoch": 0.83,
"grad_norm": 2.272111328930043,
"learning_rate": 1.9261905222052882e-05,
"loss": 0.2182,
"step": 5280
},
{
"epoch": 0.84,
"grad_norm": 2.1252795764788788,
"learning_rate": 1.9254000966298702e-05,
"loss": 0.2219,
"step": 5300
},
{
"epoch": 0.84,
"grad_norm": 2.1286718396965143,
"learning_rate": 1.924605625192643e-05,
"loss": 0.2181,
"step": 5320
},
{
"epoch": 0.84,
"grad_norm": 2.263405074029898,
"learning_rate": 1.923807111367423e-05,
"loss": 0.2082,
"step": 5340
},
{
"epoch": 0.85,
"grad_norm": 2.349777405930586,
"learning_rate": 1.9230045586457004e-05,
"loss": 0.2148,
"step": 5360
},
{
"epoch": 0.85,
"grad_norm": 1.951720991944945,
"learning_rate": 1.9221979705366256e-05,
"loss": 0.2177,
"step": 5380
},
{
"epoch": 0.85,
"grad_norm": 2.6396096890364915,
"learning_rate": 1.9213873505669944e-05,
"loss": 0.224,
"step": 5400
},
{
"epoch": 0.86,
"grad_norm": 2.340017372784151,
"learning_rate": 1.9205727022812307e-05,
"loss": 0.2149,
"step": 5420
},
{
"epoch": 0.86,
"grad_norm": 2.489578364113247,
"learning_rate": 1.9197540292413734e-05,
"loss": 0.2229,
"step": 5440
},
{
"epoch": 0.86,
"grad_norm": 1.9693131625322577,
"learning_rate": 1.9189313350270585e-05,
"loss": 0.2114,
"step": 5460
},
{
"epoch": 0.87,
"grad_norm": 2.3178735109826505,
"learning_rate": 1.9181046232355053e-05,
"loss": 0.212,
"step": 5480
},
{
"epoch": 0.87,
"grad_norm": 2.1928905009683133,
"learning_rate": 1.9172738974814993e-05,
"loss": 0.2176,
"step": 5500
},
{
"epoch": 0.87,
"grad_norm": 2.188333080240406,
"learning_rate": 1.9164391613973778e-05,
"loss": 0.2077,
"step": 5520
},
{
"epoch": 0.88,
"grad_norm": 2.3037514030133517,
"learning_rate": 1.9156004186330123e-05,
"loss": 0.2121,
"step": 5540
},
{
"epoch": 0.88,
"grad_norm": 5.37450956724229,
"learning_rate": 1.914757672855794e-05,
"loss": 0.2095,
"step": 5560
},
{
"epoch": 0.88,
"grad_norm": 2.040655690264704,
"learning_rate": 1.9139109277506173e-05,
"loss": 0.1993,
"step": 5580
},
{
"epoch": 0.89,
"grad_norm": 2.0548550497572076,
"learning_rate": 1.9130601870198633e-05,
"loss": 0.2203,
"step": 5600
},
{
"epoch": 0.89,
"grad_norm": 2.8075751525385924,
"learning_rate": 1.912205454383384e-05,
"loss": 0.2255,
"step": 5620
},
{
"epoch": 0.89,
"grad_norm": 2.213380252712336,
"learning_rate": 1.9113467335784855e-05,
"loss": 0.2192,
"step": 5640
},
{
"epoch": 0.89,
"grad_norm": 2.7606591076219633,
"learning_rate": 1.9104840283599136e-05,
"loss": 0.2245,
"step": 5660
},
{
"epoch": 0.9,
"grad_norm": 2.1216745722948787,
"learning_rate": 1.909617342499834e-05,
"loss": 0.2071,
"step": 5680
},
{
"epoch": 0.9,
"grad_norm": 2.2785095057805065,
"learning_rate": 1.908746679787819e-05,
"loss": 0.2172,
"step": 5700
},
{
"epoch": 0.9,
"grad_norm": 2.2981748471055576,
"learning_rate": 1.907872044030829e-05,
"loss": 0.2099,
"step": 5720
},
{
"epoch": 0.91,
"grad_norm": 1.960812811667782,
"learning_rate": 1.9069934390531962e-05,
"loss": 0.2146,
"step": 5740
},
{
"epoch": 0.91,
"grad_norm": 3.794717686019522,
"learning_rate": 1.90611086869661e-05,
"loss": 0.208,
"step": 5760
},
{
"epoch": 0.91,
"grad_norm": 2.6008278665808033,
"learning_rate": 1.9052243368200958e-05,
"loss": 0.2134,
"step": 5780
},
{
"epoch": 0.92,
"grad_norm": 2.4705009528511277,
"learning_rate": 1.9043338473000025e-05,
"loss": 0.2253,
"step": 5800
},
{
"epoch": 0.92,
"grad_norm": 2.401514124088343,
"learning_rate": 1.9034394040299827e-05,
"loss": 0.2173,
"step": 5820
},
{
"epoch": 0.92,
"grad_norm": 3.1101268070286685,
"learning_rate": 1.9025410109209777e-05,
"loss": 0.2133,
"step": 5840
},
{
"epoch": 0.93,
"grad_norm": 2.7159957966217187,
"learning_rate": 1.9016386719011982e-05,
"loss": 0.2022,
"step": 5860
},
{
"epoch": 0.93,
"grad_norm": 2.385988014350966,
"learning_rate": 1.90073239091611e-05,
"loss": 0.2164,
"step": 5880
},
{
"epoch": 0.93,
"grad_norm": 2.1587806998998325,
"learning_rate": 1.899822171928413e-05,
"loss": 0.203,
"step": 5900
},
{
"epoch": 0.94,
"grad_norm": 2.2767020052526497,
"learning_rate": 1.8989080189180278e-05,
"loss": 0.2216,
"step": 5920
},
{
"epoch": 0.94,
"grad_norm": 3.090507426402487,
"learning_rate": 1.8979899358820756e-05,
"loss": 0.2045,
"step": 5940
},
{
"epoch": 0.94,
"grad_norm": 1.777654599684145,
"learning_rate": 1.8970679268348617e-05,
"loss": 0.2017,
"step": 5960
},
{
"epoch": 0.95,
"grad_norm": 2.188623412353316,
"learning_rate": 1.8961419958078577e-05,
"loss": 0.2122,
"step": 5980
},
{
"epoch": 0.95,
"grad_norm": 2.262035920826766,
"learning_rate": 1.8952121468496842e-05,
"loss": 0.2149,
"step": 6000
},
{
"epoch": 0.95,
"grad_norm": 2.1025895229154403,
"learning_rate": 1.894278384026093e-05,
"loss": 0.2075,
"step": 6020
},
{
"epoch": 0.95,
"grad_norm": 2.350054387214,
"learning_rate": 1.893340711419949e-05,
"loss": 0.2026,
"step": 6040
},
{
"epoch": 0.96,
"grad_norm": 2.5056109405602704,
"learning_rate": 1.8923991331312125e-05,
"loss": 0.2003,
"step": 6060
},
{
"epoch": 0.96,
"grad_norm": 2.202062147526405,
"learning_rate": 1.891453653276921e-05,
"loss": 0.2078,
"step": 6080
},
{
"epoch": 0.96,
"grad_norm": 2.2246885746094445,
"learning_rate": 1.8905042759911734e-05,
"loss": 0.2065,
"step": 6100
},
{
"epoch": 0.97,
"grad_norm": 2.3053321066228554,
"learning_rate": 1.8895510054251074e-05,
"loss": 0.2266,
"step": 6120
},
{
"epoch": 0.97,
"grad_norm": 2.988781297520643,
"learning_rate": 1.888593845746886e-05,
"loss": 0.2168,
"step": 6140
},
{
"epoch": 0.97,
"grad_norm": 2.4575329679216167,
"learning_rate": 1.887632801141676e-05,
"loss": 0.1994,
"step": 6160
},
{
"epoch": 0.98,
"grad_norm": 1.974119765257406,
"learning_rate": 1.886667875811632e-05,
"loss": 0.2009,
"step": 6180
},
{
"epoch": 0.98,
"grad_norm": 2.250505072807692,
"learning_rate": 1.885699073975877e-05,
"loss": 0.2011,
"step": 6200
},
{
"epoch": 0.98,
"grad_norm": 3.4317109345564774,
"learning_rate": 1.8847263998704822e-05,
"loss": 0.214,
"step": 6220
},
{
"epoch": 0.99,
"grad_norm": 1.9321307572015396,
"learning_rate": 1.883749857748453e-05,
"loss": 0.2121,
"step": 6240
},
{
"epoch": 0.99,
"grad_norm": 2.44399716037142,
"learning_rate": 1.8827694518797058e-05,
"loss": 0.2121,
"step": 6260
},
{
"epoch": 0.99,
"grad_norm": 3.70267918474469,
"learning_rate": 1.881785186551051e-05,
"loss": 0.218,
"step": 6280
},
{
"epoch": 1.0,
"grad_norm": 2.2164340760047763,
"learning_rate": 1.880797066066176e-05,
"loss": 0.2018,
"step": 6300
},
{
"epoch": 1.0,
"grad_norm": 2.0350434314114065,
"learning_rate": 1.8798050947456237e-05,
"loss": 0.2146,
"step": 6320
},
{
"epoch": 1.0,
"grad_norm": 2.2542825844152614,
"learning_rate": 1.8788092769267742e-05,
"loss": 0.1966,
"step": 6340
},
{
"epoch": 1.01,
"grad_norm": 2.540485628816784,
"learning_rate": 1.877809616963828e-05,
"loss": 0.181,
"step": 6360
},
{
"epoch": 1.01,
"grad_norm": 2.0965630152825794,
"learning_rate": 1.8768061192277835e-05,
"loss": 0.1768,
"step": 6380
},
{
"epoch": 1.01,
"grad_norm": 1.832162295211555,
"learning_rate": 1.8757987881064214e-05,
"loss": 0.1841,
"step": 6400
},
{
"epoch": 1.01,
"grad_norm": 2.3417686734318877,
"learning_rate": 1.8747876280042826e-05,
"loss": 0.1748,
"step": 6420
},
{
"epoch": 1.02,
"grad_norm": 2.3993985614578657,
"learning_rate": 1.8737726433426505e-05,
"loss": 0.1806,
"step": 6440
},
{
"epoch": 1.02,
"grad_norm": 2.0282614216820862,
"learning_rate": 1.872753838559532e-05,
"loss": 0.1948,
"step": 6460
},
{
"epoch": 1.02,
"grad_norm": 2.8165988300739553,
"learning_rate": 1.8717312181096363e-05,
"loss": 0.175,
"step": 6480
},
{
"epoch": 1.03,
"grad_norm": 2.2451015768881493,
"learning_rate": 1.870704786464357e-05,
"loss": 0.1798,
"step": 6500
},
{
"epoch": 1.03,
"grad_norm": 2.1815849986963296,
"learning_rate": 1.869674548111753e-05,
"loss": 0.1831,
"step": 6520
},
{
"epoch": 1.03,
"grad_norm": 2.1640382620765846,
"learning_rate": 1.8686405075565258e-05,
"loss": 0.1829,
"step": 6540
},
{
"epoch": 1.04,
"grad_norm": 2.9145132743989945,
"learning_rate": 1.8676026693200048e-05,
"loss": 0.1747,
"step": 6560
},
{
"epoch": 1.04,
"grad_norm": 2.045497051914788,
"learning_rate": 1.8665610379401226e-05,
"loss": 0.1753,
"step": 6580
},
{
"epoch": 1.04,
"grad_norm": 1.7634716575081997,
"learning_rate": 1.8655156179713974e-05,
"loss": 0.1814,
"step": 6600
},
{
"epoch": 1.05,
"grad_norm": 1.7599637579437575,
"learning_rate": 1.8644664139849144e-05,
"loss": 0.1911,
"step": 6620
},
{
"epoch": 1.05,
"grad_norm": 2.120306666344395,
"learning_rate": 1.863413430568303e-05,
"loss": 0.1875,
"step": 6640
},
{
"epoch": 1.05,
"grad_norm": 1.639791731170639,
"learning_rate": 1.8623566723257188e-05,
"loss": 0.1784,
"step": 6660
},
{
"epoch": 1.06,
"grad_norm": 2.0062873480341943,
"learning_rate": 1.8612961438778225e-05,
"loss": 0.1764,
"step": 6680
},
{
"epoch": 1.06,
"grad_norm": 2.4050323576148234,
"learning_rate": 1.8602318498617592e-05,
"loss": 0.1782,
"step": 6700
},
{
"epoch": 1.06,
"grad_norm": 1.897036816703736,
"learning_rate": 1.8591637949311408e-05,
"loss": 0.1789,
"step": 6720
},
{
"epoch": 1.07,
"grad_norm": 3.1448228404692387,
"learning_rate": 1.8580919837560224e-05,
"loss": 0.1657,
"step": 6740
},
{
"epoch": 1.07,
"grad_norm": 1.979007580704552,
"learning_rate": 1.8570164210228826e-05,
"loss": 0.1773,
"step": 6760
},
{
"epoch": 1.07,
"grad_norm": 1.8928740814253107,
"learning_rate": 1.8559371114346058e-05,
"loss": 0.182,
"step": 6780
},
{
"epoch": 1.07,
"grad_norm": 2.1321312791081413,
"learning_rate": 1.854854059710457e-05,
"loss": 0.182,
"step": 6800
},
{
"epoch": 1.08,
"grad_norm": 2.488188785290195,
"learning_rate": 1.8537672705860653e-05,
"loss": 0.1871,
"step": 6820
},
{
"epoch": 1.08,
"grad_norm": 2.512962685869269,
"learning_rate": 1.8526767488134015e-05,
"loss": 0.1796,
"step": 6840
},
{
"epoch": 1.08,
"grad_norm": 2.457219103013576,
"learning_rate": 1.8515824991607562e-05,
"loss": 0.1768,
"step": 6860
},
{
"epoch": 1.09,
"grad_norm": 2.0058477265819272,
"learning_rate": 1.850484526412721e-05,
"loss": 0.1878,
"step": 6880
},
{
"epoch": 1.09,
"grad_norm": 1.968329732936287,
"learning_rate": 1.8493828353701666e-05,
"loss": 0.175,
"step": 6900
},
{
"epoch": 1.09,
"grad_norm": 2.2137462446632177,
"learning_rate": 1.8482774308502218e-05,
"loss": 0.1814,
"step": 6920
},
{
"epoch": 1.1,
"grad_norm": 2.1543171277309003,
"learning_rate": 1.8471683176862517e-05,
"loss": 0.1886,
"step": 6940
},
{
"epoch": 1.1,
"grad_norm": 4.139111281755878,
"learning_rate": 1.8460555007278392e-05,
"loss": 0.175,
"step": 6960
},
{
"epoch": 1.1,
"grad_norm": 2.054321604592727,
"learning_rate": 1.844938984840761e-05,
"loss": 0.1768,
"step": 6980
},
{
"epoch": 1.11,
"grad_norm": 2.256285682459467,
"learning_rate": 1.843818774906967e-05,
"loss": 0.1786,
"step": 7000
},
{
"epoch": 1.11,
"grad_norm": 2.5864624228248627,
"learning_rate": 1.8426948758245588e-05,
"loss": 0.1789,
"step": 7020
},
{
"epoch": 1.11,
"grad_norm": 2.2452602266240786,
"learning_rate": 1.8415672925077706e-05,
"loss": 0.1743,
"step": 7040
},
{
"epoch": 1.12,
"grad_norm": 1.905290268237584,
"learning_rate": 1.8404360298869443e-05,
"loss": 0.1789,
"step": 7060
},
{
"epoch": 1.12,
"grad_norm": 2.1523717750344824,
"learning_rate": 1.8393010929085106e-05,
"loss": 0.1716,
"step": 7080
},
{
"epoch": 1.12,
"grad_norm": 1.8556344369896958,
"learning_rate": 1.8381624865349644e-05,
"loss": 0.1814,
"step": 7100
},
{
"epoch": 1.13,
"grad_norm": 2.078547958125406,
"learning_rate": 1.837020215744847e-05,
"loss": 0.1774,
"step": 7120
},
{
"epoch": 1.13,
"grad_norm": 2.029658706613511,
"learning_rate": 1.8358742855327222e-05,
"loss": 0.1699,
"step": 7140
},
{
"epoch": 1.13,
"grad_norm": 2.2998633634367316,
"learning_rate": 1.8347247009091528e-05,
"loss": 0.1762,
"step": 7160
},
{
"epoch": 1.13,
"grad_norm": 2.2901708634113227,
"learning_rate": 1.8335714669006818e-05,
"loss": 0.1682,
"step": 7180
},
{
"epoch": 1.14,
"grad_norm": 1.6706682645288262,
"learning_rate": 1.8324145885498092e-05,
"loss": 0.1797,
"step": 7200
},
{
"epoch": 1.14,
"grad_norm": 1.9131372823635615,
"learning_rate": 1.8312540709149696e-05,
"loss": 0.1777,
"step": 7220
},
{
"epoch": 1.14,
"grad_norm": 2.07909984239155,
"learning_rate": 1.8300899190705098e-05,
"loss": 0.1752,
"step": 7240
},
{
"epoch": 1.15,
"grad_norm": 2.1760276778157857,
"learning_rate": 1.828922138106668e-05,
"loss": 0.1847,
"step": 7260
},
{
"epoch": 1.15,
"grad_norm": 2.247820116102049,
"learning_rate": 1.8277507331295495e-05,
"loss": 0.1899,
"step": 7280
},
{
"epoch": 1.15,
"grad_norm": 1.8558783470602038,
"learning_rate": 1.8265757092611075e-05,
"loss": 0.176,
"step": 7300
},
{
"epoch": 1.16,
"grad_norm": 2.198791896123805,
"learning_rate": 1.8253970716391166e-05,
"loss": 0.1781,
"step": 7320
},
{
"epoch": 1.16,
"grad_norm": 2.0878033811578196,
"learning_rate": 1.8242148254171532e-05,
"loss": 0.186,
"step": 7340
},
{
"epoch": 1.16,
"grad_norm": 2.0609054181173714,
"learning_rate": 1.8230289757645737e-05,
"loss": 0.1835,
"step": 7360
},
{
"epoch": 1.17,
"grad_norm": 2.458294711914135,
"learning_rate": 1.8218395278664876e-05,
"loss": 0.1746,
"step": 7380
},
{
"epoch": 1.17,
"grad_norm": 2.657085063045043,
"learning_rate": 1.8206464869237405e-05,
"loss": 0.2005,
"step": 7400
},
{
"epoch": 1.17,
"grad_norm": 2.4744370061009566,
"learning_rate": 1.819449858152887e-05,
"loss": 0.1793,
"step": 7420
},
{
"epoch": 1.18,
"grad_norm": 2.149043105341473,
"learning_rate": 1.8182496467861694e-05,
"loss": 0.1652,
"step": 7440
},
{
"epoch": 1.18,
"grad_norm": 1.9846578681762732,
"learning_rate": 1.8170458580714957e-05,
"loss": 0.1877,
"step": 7460
},
{
"epoch": 1.18,
"grad_norm": 1.8277088571957387,
"learning_rate": 1.815838497272415e-05,
"loss": 0.183,
"step": 7480
},
{
"epoch": 1.19,
"grad_norm": 1.9586834966474451,
"learning_rate": 1.814627569668096e-05,
"loss": 0.1766,
"step": 7500
},
{
"epoch": 1.19,
"grad_norm": 2.2511207120669097,
"learning_rate": 1.8134130805533027e-05,
"loss": 0.1767,
"step": 7520
},
{
"epoch": 1.19,
"grad_norm": 1.5549651238311348,
"learning_rate": 1.8121950352383714e-05,
"loss": 0.1955,
"step": 7540
},
{
"epoch": 1.2,
"grad_norm": 2.5865251040638744,
"learning_rate": 1.810973439049189e-05,
"loss": 0.1819,
"step": 7560
},
{
"epoch": 1.2,
"grad_norm": 2.399656899087326,
"learning_rate": 1.809748297327167e-05,
"loss": 0.1851,
"step": 7580
},
{
"epoch": 1.2,
"grad_norm": 2.5607074887948773,
"learning_rate": 1.8085196154292215e-05,
"loss": 0.1793,
"step": 7600
},
{
"epoch": 1.2,
"grad_norm": 2.251395428592012,
"learning_rate": 1.8072873987277463e-05,
"loss": 0.167,
"step": 7620
},
{
"epoch": 1.21,
"grad_norm": 1.9489827901942562,
"learning_rate": 1.8060516526105924e-05,
"loss": 0.1884,
"step": 7640
},
{
"epoch": 1.21,
"grad_norm": 2.0637425234570808,
"learning_rate": 1.804812382481042e-05,
"loss": 0.1876,
"step": 7660
},
{
"epoch": 1.21,
"grad_norm": 1.9347382212160897,
"learning_rate": 1.8035695937577863e-05,
"loss": 0.1726,
"step": 7680
},
{
"epoch": 1.22,
"grad_norm": 2.4247952482951596,
"learning_rate": 1.8023232918749026e-05,
"loss": 0.1779,
"step": 7700
},
{
"epoch": 1.22,
"grad_norm": 2.3176200478415843,
"learning_rate": 1.8010734822818278e-05,
"loss": 0.1837,
"step": 7720
},
{
"epoch": 1.22,
"grad_norm": 2.680156700051065,
"learning_rate": 1.7998201704433374e-05,
"loss": 0.1815,
"step": 7740
},
{
"epoch": 1.23,
"grad_norm": 1.7853843493153718,
"learning_rate": 1.7985633618395197e-05,
"loss": 0.1793,
"step": 7760
},
{
"epoch": 1.23,
"grad_norm": 2.064374130022476,
"learning_rate": 1.7973030619657535e-05,
"loss": 0.18,
"step": 7780
},
{
"epoch": 1.23,
"grad_norm": 2.694008332010934,
"learning_rate": 1.7960392763326813e-05,
"loss": 0.1787,
"step": 7800
},
{
"epoch": 1.24,
"grad_norm": 1.8953685050923013,
"learning_rate": 1.794772010466189e-05,
"loss": 0.1811,
"step": 7820
},
{
"epoch": 1.24,
"grad_norm": 2.3442305008755753,
"learning_rate": 1.7935012699073787e-05,
"loss": 0.1924,
"step": 7840
},
{
"epoch": 1.24,
"grad_norm": 2.4097855590450936,
"learning_rate": 1.7922270602125464e-05,
"loss": 0.171,
"step": 7860
},
{
"epoch": 1.25,
"grad_norm": 1.8537246377229875,
"learning_rate": 1.7909493869531555e-05,
"loss": 0.1838,
"step": 7880
},
{
"epoch": 1.25,
"grad_norm": 2.0039152922249697,
"learning_rate": 1.789668255715815e-05,
"loss": 0.1847,
"step": 7900
},
{
"epoch": 1.25,
"grad_norm": 1.529179165379362,
"learning_rate": 1.7883836721022534e-05,
"loss": 0.1728,
"step": 7920
},
{
"epoch": 1.26,
"grad_norm": 1.9763886145297185,
"learning_rate": 1.7870956417292945e-05,
"loss": 0.1688,
"step": 7940
},
{
"epoch": 1.26,
"grad_norm": 1.9066521892575954,
"learning_rate": 1.7858041702288335e-05,
"loss": 0.153,
"step": 7960
},
{
"epoch": 1.26,
"grad_norm": 1.7450878138703785,
"learning_rate": 1.784509263247811e-05,
"loss": 0.1893,
"step": 7980
},
{
"epoch": 1.26,
"grad_norm": 1.9620586700844385,
"learning_rate": 1.7832109264481904e-05,
"loss": 0.185,
"step": 8000
},
{
"epoch": 1.27,
"grad_norm": 1.8923898451681622,
"learning_rate": 1.7819091655069314e-05,
"loss": 0.1754,
"step": 8020
},
{
"epoch": 1.27,
"grad_norm": 2.0884487166296872,
"learning_rate": 1.7806039861159653e-05,
"loss": 0.1765,
"step": 8040
},
{
"epoch": 1.27,
"grad_norm": 2.222016415072701,
"learning_rate": 1.7792953939821702e-05,
"loss": 0.1788,
"step": 8060
},
{
"epoch": 1.28,
"grad_norm": 2.009439484553775,
"learning_rate": 1.7779833948273482e-05,
"loss": 0.1811,
"step": 8080
},
{
"epoch": 1.28,
"grad_norm": 2.785344314111028,
"learning_rate": 1.7766679943881966e-05,
"loss": 0.1701,
"step": 8100
},
{
"epoch": 1.28,
"grad_norm": 2.4016974843695453,
"learning_rate": 1.775349198416286e-05,
"loss": 0.1809,
"step": 8120
},
{
"epoch": 1.29,
"grad_norm": 5.230470573305321,
"learning_rate": 1.774027012678033e-05,
"loss": 0.1701,
"step": 8140
},
{
"epoch": 1.29,
"grad_norm": 2.373672415463304,
"learning_rate": 1.7727014429546762e-05,
"loss": 0.1832,
"step": 8160
},
{
"epoch": 1.29,
"grad_norm": 3.3302260379998776,
"learning_rate": 1.7713724950422516e-05,
"loss": 0.1756,
"step": 8180
},
{
"epoch": 1.3,
"grad_norm": 2.1278369142809925,
"learning_rate": 1.770040174751565e-05,
"loss": 0.1751,
"step": 8200
},
{
"epoch": 1.3,
"grad_norm": 2.1056194640732526,
"learning_rate": 1.7687044879081685e-05,
"loss": 0.1733,
"step": 8220
},
{
"epoch": 1.3,
"grad_norm": 1.9218053417452738,
"learning_rate": 1.7673654403523336e-05,
"loss": 0.1787,
"step": 8240
},
{
"epoch": 1.31,
"grad_norm": 2.015545274207546,
"learning_rate": 1.766023037939028e-05,
"loss": 0.175,
"step": 8260
},
{
"epoch": 1.31,
"grad_norm": 2.6233040008294544,
"learning_rate": 1.7646772865378873e-05,
"loss": 0.1662,
"step": 8280
},
{
"epoch": 1.31,
"grad_norm": 1.8332410774716108,
"learning_rate": 1.7633281920331906e-05,
"loss": 0.1838,
"step": 8300
},
{
"epoch": 1.32,
"grad_norm": 1.9451280989055022,
"learning_rate": 1.761975760323835e-05,
"loss": 0.1736,
"step": 8320
},
{
"epoch": 1.32,
"grad_norm": 2.3255879960964476,
"learning_rate": 1.76061999732331e-05,
"loss": 0.1656,
"step": 8340
},
{
"epoch": 1.32,
"grad_norm": 2.2343648630192363,
"learning_rate": 1.7592609089596685e-05,
"loss": 0.1779,
"step": 8360
},
{
"epoch": 1.32,
"grad_norm": 3.1163417653612955,
"learning_rate": 1.7578985011755077e-05,
"loss": 0.1715,
"step": 8380
},
{
"epoch": 1.33,
"grad_norm": 2.225480415684875,
"learning_rate": 1.7565327799279354e-05,
"loss": 0.1716,
"step": 8400
},
{
"epoch": 1.33,
"grad_norm": 2.626122370963583,
"learning_rate": 1.7551637511885494e-05,
"loss": 0.163,
"step": 8420
},
{
"epoch": 1.33,
"grad_norm": 2.338476937693113,
"learning_rate": 1.7537914209434085e-05,
"loss": 0.1827,
"step": 8440
},
{
"epoch": 1.34,
"grad_norm": 2.932171635630059,
"learning_rate": 1.752415795193008e-05,
"loss": 0.1637,
"step": 8460
},
{
"epoch": 1.34,
"grad_norm": 2.168193633641859,
"learning_rate": 1.7510368799522514e-05,
"loss": 0.1843,
"step": 8480
},
{
"epoch": 1.34,
"grad_norm": 2.0903535310262886,
"learning_rate": 1.7496546812504273e-05,
"loss": 0.179,
"step": 8500
},
{
"epoch": 1.35,
"grad_norm": 1.873644784081685,
"learning_rate": 1.7482692051311805e-05,
"loss": 0.1814,
"step": 8520
},
{
"epoch": 1.35,
"grad_norm": 1.9178052782056696,
"learning_rate": 1.7468804576524853e-05,
"loss": 0.1722,
"step": 8540
},
{
"epoch": 1.35,
"grad_norm": 2.2334060168419163,
"learning_rate": 1.7454884448866212e-05,
"loss": 0.1878,
"step": 8560
},
{
"epoch": 1.36,
"grad_norm": 2.1292565293829493,
"learning_rate": 1.7440931729201448e-05,
"loss": 0.1795,
"step": 8580
},
{
"epoch": 1.36,
"grad_norm": 2.423860978511733,
"learning_rate": 1.7426946478538626e-05,
"loss": 0.1675,
"step": 8600
},
{
"epoch": 1.36,
"grad_norm": 2.1047220409931877,
"learning_rate": 1.741292875802807e-05,
"loss": 0.1734,
"step": 8620
},
{
"epoch": 1.37,
"grad_norm": 2.1785335425805776,
"learning_rate": 1.7398878628962062e-05,
"loss": 0.1808,
"step": 8640
},
{
"epoch": 1.37,
"grad_norm": 1.854826356210005,
"learning_rate": 1.7384796152774602e-05,
"loss": 0.1833,
"step": 8660
},
{
"epoch": 1.37,
"grad_norm": 2.428632402856712,
"learning_rate": 1.737068139104111e-05,
"loss": 0.1736,
"step": 8680
},
{
"epoch": 1.38,
"grad_norm": 2.5679580432934412,
"learning_rate": 1.7356534405478197e-05,
"loss": 0.1775,
"step": 8700
},
{
"epoch": 1.38,
"grad_norm": 2.2642899361494124,
"learning_rate": 1.7342355257943354e-05,
"loss": 0.1696,
"step": 8720
},
{
"epoch": 1.38,
"grad_norm": 2.5767432931886947,
"learning_rate": 1.732814401043471e-05,
"loss": 0.186,
"step": 8740
},
{
"epoch": 1.38,
"grad_norm": 1.893321045994624,
"learning_rate": 1.7313900725090744e-05,
"loss": 0.1652,
"step": 8760
},
{
"epoch": 1.39,
"grad_norm": 1.7371063719634066,
"learning_rate": 1.7299625464190025e-05,
"loss": 0.1743,
"step": 8780
},
{
"epoch": 1.39,
"grad_norm": 2.3410836884450994,
"learning_rate": 1.7285318290150934e-05,
"loss": 0.1749,
"step": 8800
},
{
"epoch": 1.39,
"grad_norm": 2.0076324738418667,
"learning_rate": 1.727097926553139e-05,
"loss": 0.171,
"step": 8820
},
{
"epoch": 1.4,
"grad_norm": 2.114082790821531,
"learning_rate": 1.7256608453028577e-05,
"loss": 0.1826,
"step": 8840
},
{
"epoch": 1.4,
"grad_norm": 1.8298354013347164,
"learning_rate": 1.7242205915478677e-05,
"loss": 0.1694,
"step": 8860
},
{
"epoch": 1.4,
"grad_norm": 2.074394146818531,
"learning_rate": 1.722777171585658e-05,
"loss": 0.1709,
"step": 8880
},
{
"epoch": 1.41,
"grad_norm": 2.1343987124286183,
"learning_rate": 1.721330591727562e-05,
"loss": 0.174,
"step": 8900
},
{
"epoch": 1.41,
"grad_norm": 2.113311146861362,
"learning_rate": 1.7198808582987313e-05,
"loss": 0.1725,
"step": 8920
},
{
"epoch": 1.41,
"grad_norm": 2.201652738347589,
"learning_rate": 1.718427977638104e-05,
"loss": 0.1761,
"step": 8940
},
{
"epoch": 1.42,
"grad_norm": 2.0551908043411773,
"learning_rate": 1.7169719560983817e-05,
"loss": 0.1771,
"step": 8960
},
{
"epoch": 1.42,
"grad_norm": 1.4177831102654264,
"learning_rate": 1.7155128000459967e-05,
"loss": 0.1611,
"step": 8980
},
{
"epoch": 1.42,
"grad_norm": 1.974977060981519,
"learning_rate": 1.71405051586109e-05,
"loss": 0.1701,
"step": 9000
},
{
"epoch": 1.43,
"grad_norm": 2.394531449135901,
"learning_rate": 1.7125851099374784e-05,
"loss": 0.1841,
"step": 9020
},
{
"epoch": 1.43,
"grad_norm": 2.017565126183319,
"learning_rate": 1.7111165886826288e-05,
"loss": 0.1659,
"step": 9040
},
{
"epoch": 1.43,
"grad_norm": 1.9783212428806802,
"learning_rate": 1.70964495851763e-05,
"loss": 0.1807,
"step": 9060
},
{
"epoch": 1.44,
"grad_norm": 2.0181529434741,
"learning_rate": 1.708170225877165e-05,
"loss": 0.195,
"step": 9080
},
{
"epoch": 1.44,
"grad_norm": 2.2209977061102237,
"learning_rate": 1.7066923972094804e-05,
"loss": 0.1798,
"step": 9100
},
{
"epoch": 1.44,
"grad_norm": 2.132481028750825,
"learning_rate": 1.705211478976363e-05,
"loss": 0.171,
"step": 9120
},
{
"epoch": 1.44,
"grad_norm": 1.6746882362029545,
"learning_rate": 1.7037274776531064e-05,
"loss": 0.1917,
"step": 9140
},
{
"epoch": 1.45,
"grad_norm": 2.185315703710572,
"learning_rate": 1.702240399728486e-05,
"loss": 0.1655,
"step": 9160
},
{
"epoch": 1.45,
"grad_norm": 2.4321099376111084,
"learning_rate": 1.7007502517047293e-05,
"loss": 0.1778,
"step": 9180
},
{
"epoch": 1.45,
"grad_norm": 2.0789938809670385,
"learning_rate": 1.6992570400974876e-05,
"loss": 0.1795,
"step": 9200
},
{
"epoch": 1.46,
"grad_norm": 2.0820901085870127,
"learning_rate": 1.6977607714358085e-05,
"loss": 0.1691,
"step": 9220
},
{
"epoch": 1.46,
"grad_norm": 2.0783905777418834,
"learning_rate": 1.6962614522621047e-05,
"loss": 0.1758,
"step": 9240
},
{
"epoch": 1.46,
"grad_norm": 2.0755554604762887,
"learning_rate": 1.69475908913213e-05,
"loss": 0.1745,
"step": 9260
},
{
"epoch": 1.47,
"grad_norm": 3.1531529103037226,
"learning_rate": 1.693253688614945e-05,
"loss": 0.1754,
"step": 9280
},
{
"epoch": 1.47,
"grad_norm": 2.2993858263736064,
"learning_rate": 1.6917452572928936e-05,
"loss": 0.166,
"step": 9300
},
{
"epoch": 1.47,
"grad_norm": 1.75447681551487,
"learning_rate": 1.69023380176157e-05,
"loss": 0.178,
"step": 9320
},
{
"epoch": 1.48,
"grad_norm": 1.973995547249979,
"learning_rate": 1.688719328629793e-05,
"loss": 0.1799,
"step": 9340
},
{
"epoch": 1.48,
"grad_norm": 2.4126605558582397,
"learning_rate": 1.687201844519575e-05,
"loss": 0.1711,
"step": 9360
},
{
"epoch": 1.48,
"grad_norm": 1.9841098157654595,
"learning_rate": 1.685681356066094e-05,
"loss": 0.1767,
"step": 9380
},
{
"epoch": 1.49,
"grad_norm": 2.1398055464198746,
"learning_rate": 1.684157869917665e-05,
"loss": 0.1727,
"step": 9400
},
{
"epoch": 1.49,
"grad_norm": 1.9307719901389893,
"learning_rate": 1.6826313927357096e-05,
"loss": 0.1772,
"step": 9420
},
{
"epoch": 1.49,
"grad_norm": 1.8311039706909333,
"learning_rate": 1.681101931194729e-05,
"loss": 0.172,
"step": 9440
},
{
"epoch": 1.5,
"grad_norm": 2.1030690065273228,
"learning_rate": 1.6795694919822713e-05,
"loss": 0.1681,
"step": 9460
},
{
"epoch": 1.5,
"grad_norm": 2.295817125082063,
"learning_rate": 1.6780340817989067e-05,
"loss": 0.1756,
"step": 9480
},
{
"epoch": 1.5,
"grad_norm": 2.126975655764791,
"learning_rate": 1.6764957073581937e-05,
"loss": 0.1705,
"step": 9500
},
{
"epoch": 1.5,
"grad_norm": 2.1165992510815443,
"learning_rate": 1.6749543753866544e-05,
"loss": 0.1769,
"step": 9520
},
{
"epoch": 1.51,
"grad_norm": 2.2911343620991875,
"learning_rate": 1.6734100926237405e-05,
"loss": 0.1786,
"step": 9540
},
{
"epoch": 1.51,
"grad_norm": 1.9750133369092064,
"learning_rate": 1.6718628658218078e-05,
"loss": 0.1787,
"step": 9560
},
{
"epoch": 1.51,
"grad_norm": 2.1096785715440003,
"learning_rate": 1.670312701746083e-05,
"loss": 0.1746,
"step": 9580
},
{
"epoch": 1.52,
"grad_norm": 2.0647784277246126,
"learning_rate": 1.6687596071746376e-05,
"loss": 0.173,
"step": 9600
},
{
"epoch": 1.52,
"grad_norm": 1.7141270975686904,
"learning_rate": 1.667203588898356e-05,
"loss": 0.17,
"step": 9620
},
{
"epoch": 1.52,
"grad_norm": 2.1624948490500433,
"learning_rate": 1.665644653720906e-05,
"loss": 0.1746,
"step": 9640
},
{
"epoch": 1.53,
"grad_norm": 2.2815311206414797,
"learning_rate": 1.6640828084587104e-05,
"loss": 0.1811,
"step": 9660
},
{
"epoch": 1.53,
"grad_norm": 1.9683000565100015,
"learning_rate": 1.662518059940916e-05,
"loss": 0.1883,
"step": 9680
},
{
"epoch": 1.53,
"grad_norm": 2.099153179909315,
"learning_rate": 1.6609504150093634e-05,
"loss": 0.1563,
"step": 9700
},
{
"epoch": 1.54,
"grad_norm": 2.3648488796091542,
"learning_rate": 1.659379880518559e-05,
"loss": 0.1711,
"step": 9720
},
{
"epoch": 1.54,
"grad_norm": 1.9477130219057668,
"learning_rate": 1.6578064633356426e-05,
"loss": 0.1807,
"step": 9740
},
{
"epoch": 1.54,
"grad_norm": 1.8763997892785833,
"learning_rate": 1.6562301703403588e-05,
"loss": 0.17,
"step": 9760
},
{
"epoch": 1.55,
"grad_norm": 1.9473897115797032,
"learning_rate": 1.654651008425027e-05,
"loss": 0.1695,
"step": 9780
},
{
"epoch": 1.55,
"grad_norm": 1.94275021614424,
"learning_rate": 1.653068984494511e-05,
"loss": 0.1786,
"step": 9800
},
{
"epoch": 1.55,
"grad_norm": 2.591445266572519,
"learning_rate": 1.6514841054661884e-05,
"loss": 0.185,
"step": 9820
},
{
"epoch": 1.56,
"grad_norm": 2.2087296424190948,
"learning_rate": 1.64989637826992e-05,
"loss": 0.1913,
"step": 9840
},
{
"epoch": 1.56,
"grad_norm": 2.0227388921292886,
"learning_rate": 1.6483058098480214e-05,
"loss": 0.1771,
"step": 9860
},
{
"epoch": 1.56,
"grad_norm": 1.941226451998698,
"learning_rate": 1.646712407155231e-05,
"loss": 0.1604,
"step": 9880
},
{
"epoch": 1.56,
"grad_norm": 1.799624851657898,
"learning_rate": 1.64511617715868e-05,
"loss": 0.1703,
"step": 9900
},
{
"epoch": 1.57,
"grad_norm": 2.1026662884608873,
"learning_rate": 1.6435171268378617e-05,
"loss": 0.1699,
"step": 9920
},
{
"epoch": 1.57,
"grad_norm": 2.037064549070868,
"learning_rate": 1.641915263184601e-05,
"loss": 0.1749,
"step": 9940
},
{
"epoch": 1.57,
"grad_norm": 2.289972742743045,
"learning_rate": 1.6403105932030253e-05,
"loss": 0.171,
"step": 9960
},
{
"epoch": 1.58,
"grad_norm": 1.7397923448674204,
"learning_rate": 1.638703123909531e-05,
"loss": 0.1722,
"step": 9980
},
{
"epoch": 1.58,
"grad_norm": 2.2809191947314327,
"learning_rate": 1.6370928623327557e-05,
"loss": 0.1789,
"step": 10000
},
{
"epoch": 1.58,
"grad_norm": 5.837716808009581,
"learning_rate": 1.635479815513546e-05,
"loss": 0.1688,
"step": 10020
},
{
"epoch": 1.59,
"grad_norm": 1.9385163502489462,
"learning_rate": 1.6338639905049256e-05,
"loss": 0.1684,
"step": 10040
},
{
"epoch": 1.59,
"grad_norm": 2.4083879559228265,
"learning_rate": 1.6322453943720677e-05,
"loss": 0.1834,
"step": 10060
},
{
"epoch": 1.59,
"grad_norm": 1.8784457651634563,
"learning_rate": 1.6306240341922616e-05,
"loss": 0.1726,
"step": 10080
},
{
"epoch": 1.6,
"grad_norm": 2.5015152154155134,
"learning_rate": 1.628999917054882e-05,
"loss": 0.175,
"step": 10100
},
{
"epoch": 1.6,
"grad_norm": 2.3462998824591317,
"learning_rate": 1.627373050061358e-05,
"loss": 0.1836,
"step": 10120
},
{
"epoch": 1.6,
"grad_norm": 6.491573153186662,
"learning_rate": 1.625743440325143e-05,
"loss": 0.1708,
"step": 10140
},
{
"epoch": 1.61,
"grad_norm": 1.8735636613758684,
"learning_rate": 1.6241110949716837e-05,
"loss": 0.1644,
"step": 10160
},
{
"epoch": 1.61,
"grad_norm": 2.1199720026362776,
"learning_rate": 1.6224760211383867e-05,
"loss": 0.1745,
"step": 10180
},
{
"epoch": 1.61,
"grad_norm": 2.26663511712426,
"learning_rate": 1.6208382259745902e-05,
"loss": 0.1722,
"step": 10200
},
{
"epoch": 1.62,
"grad_norm": 1.6274143037047173,
"learning_rate": 1.6191977166415303e-05,
"loss": 0.1689,
"step": 10220
},
{
"epoch": 1.62,
"grad_norm": 1.7970519932361564,
"learning_rate": 1.617554500312311e-05,
"loss": 0.186,
"step": 10240
},
{
"epoch": 1.62,
"grad_norm": 2.241357629833042,
"learning_rate": 1.6159085841718732e-05,
"loss": 0.1691,
"step": 10260
},
{
"epoch": 1.63,
"grad_norm": 1.6347966162730332,
"learning_rate": 1.614259975416963e-05,
"loss": 0.1689,
"step": 10280
},
{
"epoch": 1.63,
"grad_norm": 1.9003545555776973,
"learning_rate": 1.612608681256098e-05,
"loss": 0.1831,
"step": 10300
},
{
"epoch": 1.63,
"grad_norm": 2.224301766553686,
"learning_rate": 1.61095470890954e-05,
"loss": 0.1809,
"step": 10320
},
{
"epoch": 1.63,
"grad_norm": 2.0418622318986057,
"learning_rate": 1.609298065609259e-05,
"loss": 0.1732,
"step": 10340
},
{
"epoch": 1.64,
"grad_norm": 1.8847041460989982,
"learning_rate": 1.607638758598906e-05,
"loss": 0.18,
"step": 10360
},
{
"epoch": 1.64,
"grad_norm": 2.6034376013955765,
"learning_rate": 1.6059767951337775e-05,
"loss": 0.1659,
"step": 10380
},
{
"epoch": 1.64,
"grad_norm": 2.717869407722093,
"learning_rate": 1.6043121824807853e-05,
"loss": 0.1778,
"step": 10400
},
{
"epoch": 1.65,
"grad_norm": 1.9310171511622796,
"learning_rate": 1.6026449279184252e-05,
"loss": 0.1742,
"step": 10420
},
{
"epoch": 1.65,
"grad_norm": 2.1672348842377542,
"learning_rate": 1.6009750387367446e-05,
"loss": 0.1751,
"step": 10440
},
{
"epoch": 1.65,
"grad_norm": 2.2155157882826932,
"learning_rate": 1.5993025222373107e-05,
"loss": 0.1716,
"step": 10460
},
{
"epoch": 1.66,
"grad_norm": 2.12252169432859,
"learning_rate": 1.5976273857331788e-05,
"loss": 0.1743,
"step": 10480
},
{
"epoch": 1.66,
"grad_norm": 1.8928313189284733,
"learning_rate": 1.59594963654886e-05,
"loss": 0.1666,
"step": 10500
},
{
"epoch": 1.66,
"grad_norm": 2.1209231144675917,
"learning_rate": 1.594269282020289e-05,
"loss": 0.1654,
"step": 10520
},
{
"epoch": 1.67,
"grad_norm": 1.7234563029310586,
"learning_rate": 1.592586329494793e-05,
"loss": 0.1732,
"step": 10540
},
{
"epoch": 1.67,
"grad_norm": 1.8992491192039225,
"learning_rate": 1.590900786331058e-05,
"loss": 0.1843,
"step": 10560
},
{
"epoch": 1.67,
"grad_norm": 2.255292073046707,
"learning_rate": 1.5892126598990988e-05,
"loss": 0.1663,
"step": 10580
},
{
"epoch": 1.68,
"grad_norm": 2.0226484844460937,
"learning_rate": 1.587521957580224e-05,
"loss": 0.1742,
"step": 10600
},
{
"epoch": 1.68,
"grad_norm": 1.9769425460270078,
"learning_rate": 1.5858286867670067e-05,
"loss": 0.1699,
"step": 10620
},
{
"epoch": 1.68,
"grad_norm": 2.0941096893627047,
"learning_rate": 1.584132854863249e-05,
"loss": 0.1689,
"step": 10640
},
{
"epoch": 1.69,
"grad_norm": 3.594450243853988,
"learning_rate": 1.5824344692839528e-05,
"loss": 0.1592,
"step": 10660
},
{
"epoch": 1.69,
"grad_norm": 2.7600105322761266,
"learning_rate": 1.5807335374552863e-05,
"loss": 0.1696,
"step": 10680
},
{
"epoch": 1.69,
"grad_norm": 1.9277553383287402,
"learning_rate": 1.5790300668145488e-05,
"loss": 0.1567,
"step": 10700
},
{
"epoch": 1.69,
"grad_norm": 1.8061002015824392,
"learning_rate": 1.577324064810143e-05,
"loss": 0.1714,
"step": 10720
},
{
"epoch": 1.7,
"grad_norm": 2.075080879197821,
"learning_rate": 1.575615538901539e-05,
"loss": 0.1651,
"step": 10740
},
{
"epoch": 1.7,
"grad_norm": 1.9389319511933993,
"learning_rate": 1.573904496559242e-05,
"loss": 0.1664,
"step": 10760
},
{
"epoch": 1.7,
"grad_norm": 1.989109668636887,
"learning_rate": 1.5721909452647604e-05,
"loss": 0.1644,
"step": 10780
},
{
"epoch": 1.71,
"grad_norm": 1.802713809407511,
"learning_rate": 1.570474892510575e-05,
"loss": 0.158,
"step": 10800
},
{
"epoch": 1.71,
"grad_norm": 2.1683529625057183,
"learning_rate": 1.5687563458001015e-05,
"loss": 0.1731,
"step": 10820
},
{
"epoch": 1.71,
"grad_norm": 1.7930535855304202,
"learning_rate": 1.5670353126476615e-05,
"loss": 0.1721,
"step": 10840
},
{
"epoch": 1.72,
"grad_norm": 2.8089113864956756,
"learning_rate": 1.565311800578449e-05,
"loss": 0.1581,
"step": 10860
},
{
"epoch": 1.72,
"grad_norm": 1.698115339142883,
"learning_rate": 1.5635858171284962e-05,
"loss": 0.1604,
"step": 10880
},
{
"epoch": 1.72,
"grad_norm": 2.003861038263119,
"learning_rate": 1.561857369844642e-05,
"loss": 0.1651,
"step": 10900
},
{
"epoch": 1.73,
"grad_norm": 1.9604631435332058,
"learning_rate": 1.5601264662844976e-05,
"loss": 0.166,
"step": 10920
},
{
"epoch": 1.73,
"grad_norm": 1.8966802933890046,
"learning_rate": 1.5583931140164156e-05,
"loss": 0.1504,
"step": 10940
},
{
"epoch": 1.73,
"grad_norm": 1.8798424813043781,
"learning_rate": 1.556657320619454e-05,
"loss": 0.1712,
"step": 10960
},
{
"epoch": 1.74,
"grad_norm": 1.8480005568507623,
"learning_rate": 1.5549190936833452e-05,
"loss": 0.166,
"step": 10980
},
{
"epoch": 1.74,
"grad_norm": 1.9730818789739197,
"learning_rate": 1.553178440808463e-05,
"loss": 0.1766,
"step": 11000
},
{
"epoch": 1.74,
"grad_norm": 2.055586585535215,
"learning_rate": 1.5514353696057872e-05,
"loss": 0.1684,
"step": 11020
},
{
"epoch": 1.75,
"grad_norm": 2.375617097692436,
"learning_rate": 1.5496898876968733e-05,
"loss": 0.1774,
"step": 11040
},
{
"epoch": 1.75,
"grad_norm": 1.8908606768542418,
"learning_rate": 1.5479420027138157e-05,
"loss": 0.1727,
"step": 11060
},
{
"epoch": 1.75,
"grad_norm": 1.584681873586119,
"learning_rate": 1.5461917222992176e-05,
"loss": 0.1554,
"step": 11080
},
{
"epoch": 1.75,
"grad_norm": 1.9010909888431742,
"learning_rate": 1.5444390541061557e-05,
"loss": 0.172,
"step": 11100
},
{
"epoch": 1.76,
"grad_norm": 1.6694865055184347,
"learning_rate": 1.5426840057981474e-05,
"loss": 0.1679,
"step": 11120
},
{
"epoch": 1.76,
"grad_norm": 2.168091256744236,
"learning_rate": 1.5409265850491172e-05,
"loss": 0.1581,
"step": 11140
},
{
"epoch": 1.76,
"grad_norm": 1.6447757348933234,
"learning_rate": 1.539166799543363e-05,
"loss": 0.1566,
"step": 11160
},
{
"epoch": 1.77,
"grad_norm": 1.8720093081886822,
"learning_rate": 1.5374046569755216e-05,
"loss": 0.1755,
"step": 11180
},
{
"epoch": 1.77,
"grad_norm": 1.8424853497444225,
"learning_rate": 1.5356401650505376e-05,
"loss": 0.1672,
"step": 11200
},
{
"epoch": 1.77,
"grad_norm": 1.7513883396920575,
"learning_rate": 1.533873331483627e-05,
"loss": 0.1612,
"step": 11220
},
{
"epoch": 1.78,
"grad_norm": 2.8850905765482815,
"learning_rate": 1.5321041640002455e-05,
"loss": 0.17,
"step": 11240
},
{
"epoch": 1.78,
"grad_norm": 1.5977457110067312,
"learning_rate": 1.5303326703360534e-05,
"loss": 0.168,
"step": 11260
},
{
"epoch": 1.78,
"grad_norm": 1.839271796672507,
"learning_rate": 1.5285588582368814e-05,
"loss": 0.1622,
"step": 11280
},
{
"epoch": 1.79,
"grad_norm": 2.0851069593552998,
"learning_rate": 1.526782735458699e-05,
"loss": 0.1616,
"step": 11300
},
{
"epoch": 1.79,
"grad_norm": 1.8161832282859776,
"learning_rate": 1.5250043097675773e-05,
"loss": 0.16,
"step": 11320
},
{
"epoch": 1.79,
"grad_norm": 2.1227652074250916,
"learning_rate": 1.5232235889396589e-05,
"loss": 0.1734,
"step": 11340
},
{
"epoch": 1.8,
"grad_norm": 2.054825114584178,
"learning_rate": 1.5214405807611212e-05,
"loss": 0.1693,
"step": 11360
},
{
"epoch": 1.8,
"grad_norm": 1.9053808638546215,
"learning_rate": 1.5196552930281414e-05,
"loss": 0.1555,
"step": 11380
},
{
"epoch": 1.8,
"grad_norm": 2.2568013920238466,
"learning_rate": 1.517867733546866e-05,
"loss": 0.1777,
"step": 11400
},
{
"epoch": 1.81,
"grad_norm": 1.755591383734745,
"learning_rate": 1.516077910133374e-05,
"loss": 0.1646,
"step": 11420
},
{
"epoch": 1.81,
"grad_norm": 2.040664452818391,
"learning_rate": 1.5142858306136432e-05,
"loss": 0.1562,
"step": 11440
},
{
"epoch": 1.81,
"grad_norm": 2.9163542625602634,
"learning_rate": 1.5124915028235168e-05,
"loss": 0.1671,
"step": 11460
},
{
"epoch": 1.81,
"grad_norm": 1.745198816269922,
"learning_rate": 1.5106949346086675e-05,
"loss": 0.1645,
"step": 11480
},
{
"epoch": 1.82,
"grad_norm": 3.185139698456198,
"learning_rate": 1.5088961338245656e-05,
"loss": 0.1645,
"step": 11500
},
{
"epoch": 1.82,
"grad_norm": 1.8737538329143142,
"learning_rate": 1.5070951083364413e-05,
"loss": 0.1701,
"step": 11520
},
{
"epoch": 1.82,
"grad_norm": 2.28486597548739,
"learning_rate": 1.5052918660192548e-05,
"loss": 0.1551,
"step": 11540
},
{
"epoch": 1.83,
"grad_norm": 1.9377161951346134,
"learning_rate": 1.5034864147576574e-05,
"loss": 0.1708,
"step": 11560
},
{
"epoch": 1.83,
"grad_norm": 2.0573176386356713,
"learning_rate": 1.5016787624459602e-05,
"loss": 0.1605,
"step": 11580
},
{
"epoch": 1.83,
"grad_norm": 1.8884308117753286,
"learning_rate": 1.499868916988097e-05,
"loss": 0.1635,
"step": 11600
},
{
"epoch": 1.84,
"grad_norm": 2.1599042583369026,
"learning_rate": 1.4980568862975921e-05,
"loss": 0.1624,
"step": 11620
},
{
"epoch": 1.84,
"grad_norm": 1.9050485545701388,
"learning_rate": 1.4962426782975251e-05,
"loss": 0.1686,
"step": 11640
},
{
"epoch": 1.84,
"grad_norm": 2.0206229765693404,
"learning_rate": 1.4944263009204945e-05,
"loss": 0.1609,
"step": 11660
},
{
"epoch": 1.85,
"grad_norm": 2.1626323284609077,
"learning_rate": 1.4926077621085858e-05,
"loss": 0.1663,
"step": 11680
},
{
"epoch": 1.85,
"grad_norm": 1.7733761949067968,
"learning_rate": 1.4907870698133342e-05,
"loss": 0.1647,
"step": 11700
},
{
"epoch": 1.85,
"grad_norm": 1.7803734747589461,
"learning_rate": 1.4889642319956916e-05,
"loss": 0.1704,
"step": 11720
},
{
"epoch": 1.86,
"grad_norm": 2.1492586543303975,
"learning_rate": 1.4871392566259912e-05,
"loss": 0.1607,
"step": 11740
},
{
"epoch": 1.86,
"grad_norm": 2.00354928746823,
"learning_rate": 1.4853121516839119e-05,
"loss": 0.1649,
"step": 11760
},
{
"epoch": 1.86,
"grad_norm": 1.6161133520624342,
"learning_rate": 1.4834829251584452e-05,
"loss": 0.161,
"step": 11780
},
{
"epoch": 1.87,
"grad_norm": 3.956943883806942,
"learning_rate": 1.4816515850478586e-05,
"loss": 0.1652,
"step": 11800
},
{
"epoch": 1.87,
"grad_norm": 2.2931018381308457,
"learning_rate": 1.4798181393596612e-05,
"loss": 0.1662,
"step": 11820
},
{
"epoch": 1.87,
"grad_norm": 2.174850511346493,
"learning_rate": 1.4779825961105685e-05,
"loss": 0.164,
"step": 11840
},
{
"epoch": 1.87,
"grad_norm": 1.5470178711542586,
"learning_rate": 1.4761449633264679e-05,
"loss": 0.1702,
"step": 11860
},
{
"epoch": 1.88,
"grad_norm": 3.3308654391418204,
"learning_rate": 1.4743052490423835e-05,
"loss": 0.1618,
"step": 11880
},
{
"epoch": 1.88,
"grad_norm": 1.9720383082261177,
"learning_rate": 1.4724634613024404e-05,
"loss": 0.1586,
"step": 11900
},
{
"epoch": 1.88,
"grad_norm": 1.883967887969211,
"learning_rate": 1.4706196081598298e-05,
"loss": 0.1696,
"step": 11920
},
{
"epoch": 1.89,
"grad_norm": 1.6976412060172419,
"learning_rate": 1.4687736976767737e-05,
"loss": 0.1594,
"step": 11940
},
{
"epoch": 1.89,
"grad_norm": 1.529332108170923,
"learning_rate": 1.4669257379244905e-05,
"loss": 0.1619,
"step": 11960
},
{
"epoch": 1.89,
"grad_norm": 1.7970574991179633,
"learning_rate": 1.465075736983158e-05,
"loss": 0.1584,
"step": 11980
},
{
"epoch": 1.9,
"grad_norm": 1.8625966219833416,
"learning_rate": 1.46322370294188e-05,
"loss": 0.1642,
"step": 12000
},
{
"epoch": 1.9,
"grad_norm": 1.99642977340393,
"learning_rate": 1.4613696438986493e-05,
"loss": 0.1576,
"step": 12020
},
{
"epoch": 1.9,
"grad_norm": 2.5428182671668944,
"learning_rate": 1.4595135679603135e-05,
"loss": 0.1741,
"step": 12040
},
{
"epoch": 1.91,
"grad_norm": 3.8472437913445336,
"learning_rate": 1.457655483242539e-05,
"loss": 0.1713,
"step": 12060
},
{
"epoch": 1.91,
"grad_norm": 2.0144352913407935,
"learning_rate": 1.4557953978697748e-05,
"loss": 0.1688,
"step": 12080
},
{
"epoch": 1.91,
"grad_norm": 1.792402123900783,
"learning_rate": 1.4539333199752189e-05,
"loss": 0.166,
"step": 12100
},
{
"epoch": 1.92,
"grad_norm": 2.141224129916374,
"learning_rate": 1.4520692577007808e-05,
"loss": 0.1626,
"step": 12120
},
{
"epoch": 1.92,
"grad_norm": 1.73495865357573,
"learning_rate": 1.4502032191970468e-05,
"loss": 0.1599,
"step": 12140
},
{
"epoch": 1.92,
"grad_norm": 10.641224758817746,
"learning_rate": 1.4483352126232446e-05,
"loss": 0.178,
"step": 12160
},
{
"epoch": 1.93,
"grad_norm": 1.8420610612571067,
"learning_rate": 1.4464652461472068e-05,
"loss": 0.1648,
"step": 12180
},
{
"epoch": 1.93,
"grad_norm": 1.8800775441958055,
"learning_rate": 1.4445933279453358e-05,
"loss": 0.1696,
"step": 12200
},
{
"epoch": 1.93,
"grad_norm": 2.151331613378997,
"learning_rate": 1.4427194662025678e-05,
"loss": 0.1803,
"step": 12220
},
{
"epoch": 1.93,
"grad_norm": 1.9226187530668073,
"learning_rate": 1.4408436691123373e-05,
"loss": 0.1635,
"step": 12240
},
{
"epoch": 1.94,
"grad_norm": 1.869667544978119,
"learning_rate": 1.4389659448765408e-05,
"loss": 0.1662,
"step": 12260
},
{
"epoch": 1.94,
"grad_norm": 2.123839678653108,
"learning_rate": 1.437086301705502e-05,
"loss": 0.1607,
"step": 12280
},
{
"epoch": 1.94,
"grad_norm": 1.8970609470082447,
"learning_rate": 1.4352047478179341e-05,
"loss": 0.1682,
"step": 12300
},
{
"epoch": 1.95,
"grad_norm": 1.8846339362592277,
"learning_rate": 1.4333212914409055e-05,
"loss": 0.1681,
"step": 12320
},
{
"epoch": 1.95,
"grad_norm": 2.596536724951098,
"learning_rate": 1.4314359408098029e-05,
"loss": 0.164,
"step": 12340
},
{
"epoch": 1.95,
"grad_norm": 1.983728679020799,
"learning_rate": 1.4295487041682956e-05,
"loss": 0.1657,
"step": 12360
},
{
"epoch": 1.96,
"grad_norm": 2.0578280622839555,
"learning_rate": 1.4276595897682996e-05,
"loss": 0.1788,
"step": 12380
},
{
"epoch": 1.96,
"grad_norm": 1.8225653464241232,
"learning_rate": 1.425768605869942e-05,
"loss": 0.1864,
"step": 12400
},
{
"epoch": 1.96,
"grad_norm": 1.8018042581666063,
"learning_rate": 1.4238757607415225e-05,
"loss": 0.1708,
"step": 12420
},
{
"epoch": 1.97,
"grad_norm": 1.6471439004511217,
"learning_rate": 1.421981062659481e-05,
"loss": 0.1608,
"step": 12440
},
{
"epoch": 1.97,
"grad_norm": 1.785407584963669,
"learning_rate": 1.420084519908358e-05,
"loss": 0.1659,
"step": 12460
},
{
"epoch": 1.97,
"grad_norm": 2.4554236266808225,
"learning_rate": 1.4181861407807606e-05,
"loss": 0.1675,
"step": 12480
},
{
"epoch": 1.98,
"grad_norm": 1.872120044188974,
"learning_rate": 1.4162859335773253e-05,
"loss": 0.1814,
"step": 12500
},
{
"epoch": 1.98,
"grad_norm": 1.8127393893298422,
"learning_rate": 1.4143839066066813e-05,
"loss": 0.1773,
"step": 12520
},
{
"epoch": 1.98,
"grad_norm": 2.011729890366605,
"learning_rate": 1.4124800681854152e-05,
"loss": 0.156,
"step": 12540
},
{
"epoch": 1.99,
"grad_norm": 1.6619766845812105,
"learning_rate": 1.410574426638034e-05,
"loss": 0.1576,
"step": 12560
},
{
"epoch": 1.99,
"grad_norm": 2.1915362586421234,
"learning_rate": 1.4086669902969292e-05,
"loss": 0.1736,
"step": 12580
},
{
"epoch": 1.99,
"grad_norm": 2.067978259195529,
"learning_rate": 1.4067577675023391e-05,
"loss": 0.163,
"step": 12600
},
{
"epoch": 1.99,
"grad_norm": 1.7953546477600242,
"learning_rate": 1.4048467666023144e-05,
"loss": 0.1686,
"step": 12620
},
{
"epoch": 2.0,
"grad_norm": 2.297945149563059,
"learning_rate": 1.4029339959526795e-05,
"loss": 0.1594,
"step": 12640
}
],
"logging_steps": 20,
"max_steps": 31630,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}