{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1626,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0024622960911049553,
"grad_norm": 1.5234375,
"learning_rate": 4.0816326530612243e-07,
"loss": 1.3865270614624023,
"step": 2
},
{
"epoch": 0.0049245921822099106,
"grad_norm": 36.75,
"learning_rate": 1.2244897959183673e-06,
"loss": 1.8756635189056396,
"step": 4
},
{
"epoch": 0.007386888273314866,
"grad_norm": 3.625,
"learning_rate": 2.0408163265306125e-06,
"loss": 1.1310276985168457,
"step": 6
},
{
"epoch": 0.009849184364419821,
"grad_norm": 6.09375,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.8238341808319092,
"step": 8
},
{
"epoch": 0.012311480455524777,
"grad_norm": 12.0,
"learning_rate": 3.6734693877551024e-06,
"loss": 2.2014291286468506,
"step": 10
},
{
"epoch": 0.014773776546629732,
"grad_norm": 19.5,
"learning_rate": 4.489795918367348e-06,
"loss": 2.4339303970336914,
"step": 12
},
{
"epoch": 0.017236072637734686,
"grad_norm": 9.1875,
"learning_rate": 5.306122448979593e-06,
"loss": 1.3835787773132324,
"step": 14
},
{
"epoch": 0.019698368728839642,
"grad_norm": 3.453125,
"learning_rate": 6.122448979591837e-06,
"loss": 1.1793060302734375,
"step": 16
},
{
"epoch": 0.0221606648199446,
"grad_norm": 2.21875,
"learning_rate": 6.938775510204082e-06,
"loss": 1.173147439956665,
"step": 18
},
{
"epoch": 0.024622960911049555,
"grad_norm": 12.875,
"learning_rate": 7.755102040816327e-06,
"loss": 2.2560791969299316,
"step": 20
},
{
"epoch": 0.02708525700215451,
"grad_norm": 3.859375,
"learning_rate": 8.571428571428571e-06,
"loss": 1.753507137298584,
"step": 22
},
{
"epoch": 0.029547553093259463,
"grad_norm": 11.125,
"learning_rate": 9.387755102040818e-06,
"loss": 2.109658718109131,
"step": 24
},
{
"epoch": 0.03200984918436442,
"grad_norm": 4.375,
"learning_rate": 1.0204081632653063e-05,
"loss": 1.7001088857650757,
"step": 26
},
{
"epoch": 0.03447214527546937,
"grad_norm": 7.53125,
"learning_rate": 1.1020408163265306e-05,
"loss": 2.2228636741638184,
"step": 28
},
{
"epoch": 0.03693444136657433,
"grad_norm": 2.09375,
"learning_rate": 1.1836734693877552e-05,
"loss": 1.233575463294983,
"step": 30
},
{
"epoch": 0.039396737457679284,
"grad_norm": 5.03125,
"learning_rate": 1.2653061224489798e-05,
"loss": 1.834639549255371,
"step": 32
},
{
"epoch": 0.041859033548784244,
"grad_norm": 3.796875,
"learning_rate": 1.3469387755102042e-05,
"loss": 1.8060579299926758,
"step": 34
},
{
"epoch": 0.0443213296398892,
"grad_norm": 2.625,
"learning_rate": 1.4285714285714287e-05,
"loss": 1.4287090301513672,
"step": 36
},
{
"epoch": 0.04678362573099415,
"grad_norm": 11.375,
"learning_rate": 1.510204081632653e-05,
"loss": 2.1703319549560547,
"step": 38
},
{
"epoch": 0.04924592182209911,
"grad_norm": 40.75,
"learning_rate": 1.5918367346938776e-05,
"loss": 2.1797375679016113,
"step": 40
},
{
"epoch": 0.05170821791320406,
"grad_norm": 15.8125,
"learning_rate": 1.673469387755102e-05,
"loss": 1.9881037473678589,
"step": 42
},
{
"epoch": 0.05417051400430902,
"grad_norm": 9.375,
"learning_rate": 1.7551020408163266e-05,
"loss": 1.735787034034729,
"step": 44
},
{
"epoch": 0.056632810095413974,
"grad_norm": 8.125,
"learning_rate": 1.836734693877551e-05,
"loss": 1.9953798055648804,
"step": 46
},
{
"epoch": 0.05909510618651893,
"grad_norm": 4.40625,
"learning_rate": 1.9183673469387756e-05,
"loss": 1.1727348566055298,
"step": 48
},
{
"epoch": 0.061557402277623886,
"grad_norm": 3.4375,
"learning_rate": 2e-05,
"loss": 1.6915946006774902,
"step": 50
},
{
"epoch": 0.06401969836872884,
"grad_norm": 5.96875,
"learning_rate": 1.9999936502625722e-05,
"loss": 2.3282856941223145,
"step": 52
},
{
"epoch": 0.0664819944598338,
"grad_norm": 5.90625,
"learning_rate": 1.9999746011510863e-05,
"loss": 1.9712034463882446,
"step": 54
},
{
"epoch": 0.06894429055093874,
"grad_norm": 18.375,
"learning_rate": 1.9999428529679345e-05,
"loss": 1.5145387649536133,
"step": 56
},
{
"epoch": 0.0714065866420437,
"grad_norm": 3.84375,
"learning_rate": 1.9998984062170987e-05,
"loss": 1.1939287185668945,
"step": 58
},
{
"epoch": 0.07386888273314866,
"grad_norm": 5.34375,
"learning_rate": 1.9998412616041416e-05,
"loss": 1.7602123022079468,
"step": 60
},
{
"epoch": 0.07633117882425362,
"grad_norm": 4.71875,
"learning_rate": 1.9997714200361962e-05,
"loss": 1.446789264678955,
"step": 62
},
{
"epoch": 0.07879347491535857,
"grad_norm": 3.296875,
"learning_rate": 1.999688882621952e-05,
"loss": 1.6264426708221436,
"step": 64
},
{
"epoch": 0.08125577100646353,
"grad_norm": 2.84375,
"learning_rate": 1.9995936506716357e-05,
"loss": 1.63454008102417,
"step": 66
},
{
"epoch": 0.08371806709756849,
"grad_norm": 4.0625,
"learning_rate": 1.9994857256969928e-05,
"loss": 1.8928616046905518,
"step": 68
},
{
"epoch": 0.08618036318867343,
"grad_norm": 4.15625,
"learning_rate": 1.999365109411261e-05,
"loss": 1.7232308387756348,
"step": 70
},
{
"epoch": 0.0886426592797784,
"grad_norm": 3.5,
"learning_rate": 1.9992318037291443e-05,
"loss": 1.5345882177352905,
"step": 72
},
{
"epoch": 0.09110495537088335,
"grad_norm": 2.390625,
"learning_rate": 1.9990858107667836e-05,
"loss": 1.5957210063934326,
"step": 74
},
{
"epoch": 0.0935672514619883,
"grad_norm": 4.9375,
"learning_rate": 1.9989271328417207e-05,
"loss": 1.5378596782684326,
"step": 76
},
{
"epoch": 0.09602954755309326,
"grad_norm": 4.625,
"learning_rate": 1.998755772472864e-05,
"loss": 1.7094926834106445,
"step": 78
},
{
"epoch": 0.09849184364419822,
"grad_norm": 4.78125,
"learning_rate": 1.9985717323804467e-05,
"loss": 1.6278411149978638,
"step": 80
},
{
"epoch": 0.10095413973530316,
"grad_norm": 2.859375,
"learning_rate": 1.998375015485984e-05,
"loss": 1.1961259841918945,
"step": 82
},
{
"epoch": 0.10341643582640812,
"grad_norm": 6.15625,
"learning_rate": 1.9981656249122285e-05,
"loss": 1.1318538188934326,
"step": 84
},
{
"epoch": 0.10587873191751308,
"grad_norm": 7.0625,
"learning_rate": 1.997943563983117e-05,
"loss": 1.7807825803756714,
"step": 86
},
{
"epoch": 0.10834102800861804,
"grad_norm": 5.1875,
"learning_rate": 1.9977088362237217e-05,
"loss": 1.4653401374816895,
"step": 88
},
{
"epoch": 0.11080332409972299,
"grad_norm": 1.8671875,
"learning_rate": 1.9974614453601913e-05,
"loss": 1.245106816291809,
"step": 90
},
{
"epoch": 0.11326562019082795,
"grad_norm": 13.625,
"learning_rate": 1.997201395319694e-05,
"loss": 1.646073818206787,
"step": 92
},
{
"epoch": 0.11572791628193291,
"grad_norm": 4.28125,
"learning_rate": 1.996928690230353e-05,
"loss": 1.5019184350967407,
"step": 94
},
{
"epoch": 0.11819021237303785,
"grad_norm": 4.375,
"learning_rate": 1.996643334421182e-05,
"loss": 1.4860734939575195,
"step": 96
},
{
"epoch": 0.12065250846414281,
"grad_norm": 2.8125,
"learning_rate": 1.9963453324220185e-05,
"loss": 1.1848664283752441,
"step": 98
},
{
"epoch": 0.12311480455524777,
"grad_norm": 3.015625,
"learning_rate": 1.9960346889634478e-05,
"loss": 1.2456748485565186,
"step": 100
},
{
"epoch": 0.12557710064635272,
"grad_norm": 1.765625,
"learning_rate": 1.9957114089767306e-05,
"loss": 1.163445234298706,
"step": 102
},
{
"epoch": 0.12803939673745768,
"grad_norm": 2.546875,
"learning_rate": 1.9953754975937246e-05,
"loss": 1.5070371627807617,
"step": 104
},
{
"epoch": 0.13050169282856264,
"grad_norm": 2.171875,
"learning_rate": 1.9950269601468033e-05,
"loss": 1.0462160110473633,
"step": 106
},
{
"epoch": 0.1329639889196676,
"grad_norm": 2.75,
"learning_rate": 1.9946658021687694e-05,
"loss": 1.46537184715271,
"step": 108
},
{
"epoch": 0.13542628501077256,
"grad_norm": 3.109375,
"learning_rate": 1.994292029392768e-05,
"loss": 1.5482763051986694,
"step": 110
},
{
"epoch": 0.1378885811018775,
"grad_norm": 1.4921875,
"learning_rate": 1.993905647752198e-05,
"loss": 1.0207593441009521,
"step": 112
},
{
"epoch": 0.14035087719298245,
"grad_norm": 6.46875,
"learning_rate": 1.9935066633806133e-05,
"loss": 1.77092444896698,
"step": 114
},
{
"epoch": 0.1428131732840874,
"grad_norm": 4.21875,
"learning_rate": 1.9930950826116288e-05,
"loss": 1.4896173477172852,
"step": 116
},
{
"epoch": 0.14527546937519237,
"grad_norm": 1.6796875,
"learning_rate": 1.9926709119788197e-05,
"loss": 1.1458995342254639,
"step": 118
},
{
"epoch": 0.14773776546629733,
"grad_norm": 1.8203125,
"learning_rate": 1.9922341582156156e-05,
"loss": 1.0295559167861938,
"step": 120
},
{
"epoch": 0.1502000615574023,
"grad_norm": 3.984375,
"learning_rate": 1.9917848282551965e-05,
"loss": 1.4944086074829102,
"step": 122
},
{
"epoch": 0.15266235764850725,
"grad_norm": 2.78125,
"learning_rate": 1.9913229292303806e-05,
"loss": 1.5551412105560303,
"step": 124
},
{
"epoch": 0.15512465373961218,
"grad_norm": 23.5,
"learning_rate": 1.990848468473511e-05,
"loss": 1.9140477180480957,
"step": 126
},
{
"epoch": 0.15758694983071714,
"grad_norm": 10.4375,
"learning_rate": 1.9903614535163417e-05,
"loss": 1.4774185419082642,
"step": 128
},
{
"epoch": 0.1600492459218221,
"grad_norm": 1.6484375,
"learning_rate": 1.989861892089914e-05,
"loss": 1.1932008266448975,
"step": 130
},
{
"epoch": 0.16251154201292706,
"grad_norm": 1.0625,
"learning_rate": 1.9893497921244394e-05,
"loss": 1.253312349319458,
"step": 132
},
{
"epoch": 0.16497383810403202,
"grad_norm": 2.8125,
"learning_rate": 1.9888251617491674e-05,
"loss": 1.0982537269592285,
"step": 134
},
{
"epoch": 0.16743613419513698,
"grad_norm": 3.5,
"learning_rate": 1.9882880092922612e-05,
"loss": 1.5139843225479126,
"step": 136
},
{
"epoch": 0.1698984302862419,
"grad_norm": 3.109375,
"learning_rate": 1.9877383432806633e-05,
"loss": 1.542289137840271,
"step": 138
},
{
"epoch": 0.17236072637734687,
"grad_norm": 3.234375,
"learning_rate": 1.9871761724399617e-05,
"loss": 1.432151436805725,
"step": 140
},
{
"epoch": 0.17482302246845183,
"grad_norm": 2.84375,
"learning_rate": 1.986601505694248e-05,
"loss": 1.500737190246582,
"step": 142
},
{
"epoch": 0.1772853185595568,
"grad_norm": 4.65625,
"learning_rate": 1.986014352165981e-05,
"loss": 1.461523413658142,
"step": 144
},
{
"epoch": 0.17974761465066175,
"grad_norm": 2.859375,
"learning_rate": 1.985414721175837e-05,
"loss": 1.5014877319335938,
"step": 146
},
{
"epoch": 0.1822099107417667,
"grad_norm": 3.3125,
"learning_rate": 1.9848026222425636e-05,
"loss": 1.4726862907409668,
"step": 148
},
{
"epoch": 0.18467220683287167,
"grad_norm": 4.625,
"learning_rate": 1.9841780650828308e-05,
"loss": 1.543365716934204,
"step": 150
},
{
"epoch": 0.1871345029239766,
"grad_norm": 16.5,
"learning_rate": 1.9835410596110723e-05,
"loss": 0.5347945094108582,
"step": 152
},
{
"epoch": 0.18959679901508156,
"grad_norm": 4.71875,
"learning_rate": 1.982891615939333e-05,
"loss": 1.6506417989730835,
"step": 154
},
{
"epoch": 0.19205909510618652,
"grad_norm": 20.875,
"learning_rate": 1.982229744377104e-05,
"loss": 0.903097927570343,
"step": 156
},
{
"epoch": 0.19452139119729148,
"grad_norm": 4.71875,
"learning_rate": 1.9815554554311623e-05,
"loss": 1.4461334943771362,
"step": 158
},
{
"epoch": 0.19698368728839644,
"grad_norm": 7.78125,
"learning_rate": 1.9808687598054023e-05,
"loss": 1.1890747547149658,
"step": 160
},
{
"epoch": 0.1994459833795014,
"grad_norm": 5.15625,
"learning_rate": 1.980169668400666e-05,
"loss": 1.4282548427581787,
"step": 162
},
{
"epoch": 0.20190827947060633,
"grad_norm": 3.359375,
"learning_rate": 1.9794581923145708e-05,
"loss": 1.2562037706375122,
"step": 164
},
{
"epoch": 0.2043705755617113,
"grad_norm": 3.296875,
"learning_rate": 1.9787343428413327e-05,
"loss": 1.4614920616149902,
"step": 166
},
{
"epoch": 0.20683287165281625,
"grad_norm": 3.484375,
"learning_rate": 1.9779981314715866e-05,
"loss": 1.3043287992477417,
"step": 168
},
{
"epoch": 0.2092951677439212,
"grad_norm": 3.203125,
"learning_rate": 1.9772495698922047e-05,
"loss": 1.17995285987854,
"step": 170
},
{
"epoch": 0.21175746383502617,
"grad_norm": 6.03125,
"learning_rate": 1.9764886699861104e-05,
"loss": 2.0112454891204834,
"step": 172
},
{
"epoch": 0.21421975992613113,
"grad_norm": 5.6875,
"learning_rate": 1.9757154438320914e-05,
"loss": 1.485538363456726,
"step": 174
},
{
"epoch": 0.21668205601723609,
"grad_norm": 3.78125,
"learning_rate": 1.974929903704604e-05,
"loss": 1.445993423461914,
"step": 176
},
{
"epoch": 0.21914435210834102,
"grad_norm": 3.59375,
"learning_rate": 1.9741320620735832e-05,
"loss": 1.4375782012939453,
"step": 178
},
{
"epoch": 0.22160664819944598,
"grad_norm": 5.875,
"learning_rate": 1.9733219316042404e-05,
"loss": 1.8119451999664307,
"step": 180
},
{
"epoch": 0.22406894429055094,
"grad_norm": 6.4375,
"learning_rate": 1.9724995251568648e-05,
"loss": 1.9366390705108643,
"step": 182
},
{
"epoch": 0.2265312403816559,
"grad_norm": 5.0,
"learning_rate": 1.97166485578662e-05,
"loss": 1.4353549480438232,
"step": 184
},
{
"epoch": 0.22899353647276086,
"grad_norm": 3.859375,
"learning_rate": 1.9708179367433333e-05,
"loss": 1.4814636707305908,
"step": 186
},
{
"epoch": 0.23145583256386582,
"grad_norm": 2.8125,
"learning_rate": 1.969958781471289e-05,
"loss": 1.3983485698699951,
"step": 188
},
{
"epoch": 0.23391812865497075,
"grad_norm": 6.4375,
"learning_rate": 1.9690874036090126e-05,
"loss": 1.8465726375579834,
"step": 190
},
{
"epoch": 0.2363804247460757,
"grad_norm": 2.875,
"learning_rate": 1.9682038169890563e-05,
"loss": 1.4366203546524048,
"step": 192
},
{
"epoch": 0.23884272083718067,
"grad_norm": 4.96875,
"learning_rate": 1.9673080356377778e-05,
"loss": 1.397793173789978,
"step": 194
},
{
"epoch": 0.24130501692828563,
"grad_norm": 9.1875,
"learning_rate": 1.9664000737751176e-05,
"loss": 0.40697720646858215,
"step": 196
},
{
"epoch": 0.24376731301939059,
"grad_norm": 9.9375,
"learning_rate": 1.9654799458143744e-05,
"loss": 0.7866343259811401,
"step": 198
},
{
"epoch": 0.24622960911049555,
"grad_norm": 3.34375,
"learning_rate": 1.9645476663619748e-05,
"loss": 1.4268109798431396,
"step": 200
},
{
"epoch": 0.2486919052016005,
"grad_norm": 9.3125,
"learning_rate": 1.9636032502172445e-05,
"loss": 1.2419297695159912,
"step": 202
},
{
"epoch": 0.25115420129270544,
"grad_norm": 5.65625,
"learning_rate": 1.962646712372169e-05,
"loss": 1.7364747524261475,
"step": 204
},
{
"epoch": 0.2536164973838104,
"grad_norm": 4.28125,
"learning_rate": 1.9616780680111587e-05,
"loss": 1.3980765342712402,
"step": 206
},
{
"epoch": 0.25607879347491536,
"grad_norm": 12.1875,
"learning_rate": 1.9606973325108077e-05,
"loss": 1.4629418849945068,
"step": 208
},
{
"epoch": 0.2585410895660203,
"grad_norm": 5.8125,
"learning_rate": 1.9597045214396472e-05,
"loss": 1.361374855041504,
"step": 210
},
{
"epoch": 0.2610033856571253,
"grad_norm": 3.90625,
"learning_rate": 1.958699650557902e-05,
"loss": 1.4552102088928223,
"step": 212
},
{
"epoch": 0.2634656817482302,
"grad_norm": 3.703125,
"learning_rate": 1.9576827358172377e-05,
"loss": 1.4295791387557983,
"step": 214
},
{
"epoch": 0.2659279778393352,
"grad_norm": 7.21875,
"learning_rate": 1.956653793360508e-05,
"loss": 1.4938560724258423,
"step": 216
},
{
"epoch": 0.2683902739304401,
"grad_norm": 10.875,
"learning_rate": 1.955612839521499e-05,
"loss": 1.405943512916565,
"step": 218
},
{
"epoch": 0.2708525700215451,
"grad_norm": 7.09375,
"learning_rate": 1.95455989082467e-05,
"loss": 1.8168143033981323,
"step": 220
},
{
"epoch": 0.27331486611265005,
"grad_norm": 4.8125,
"learning_rate": 1.9534949639848894e-05,
"loss": 1.880413293838501,
"step": 222
},
{
"epoch": 0.275777162203755,
"grad_norm": 6.3125,
"learning_rate": 1.9524180759071724e-05,
"loss": 1.4368586540222168,
"step": 224
},
{
"epoch": 0.27823945829485996,
"grad_norm": 7.59375,
"learning_rate": 1.9513292436864107e-05,
"loss": 1.4332315921783447,
"step": 226
},
{
"epoch": 0.2807017543859649,
"grad_norm": 4.375,
"learning_rate": 1.9502284846071003e-05,
"loss": 1.4779151678085327,
"step": 228
},
{
"epoch": 0.2831640504770699,
"grad_norm": 8.75,
"learning_rate": 1.9491158161430703e-05,
"loss": 0.5792175531387329,
"step": 230
},
{
"epoch": 0.2856263465681748,
"grad_norm": 1.3828125,
"learning_rate": 1.9479912559572e-05,
"loss": 1.0462322235107422,
"step": 232
},
{
"epoch": 0.2880886426592798,
"grad_norm": 7.375,
"learning_rate": 1.946854821901146e-05,
"loss": 1.3507080078125,
"step": 234
},
{
"epoch": 0.29055093875038474,
"grad_norm": 4.1875,
"learning_rate": 1.945706532015052e-05,
"loss": 1.4383283853530884,
"step": 236
},
{
"epoch": 0.29301323484148967,
"grad_norm": 5.96875,
"learning_rate": 1.9445464045272668e-05,
"loss": 0.7620460987091064,
"step": 238
},
{
"epoch": 0.29547553093259465,
"grad_norm": 5.03125,
"learning_rate": 1.9433744578540525e-05,
"loss": 1.3795279264450073,
"step": 240
},
{
"epoch": 0.2979378270236996,
"grad_norm": 3.1875,
"learning_rate": 1.942190710599293e-05,
"loss": 1.4460288286209106,
"step": 242
},
{
"epoch": 0.3004001231148046,
"grad_norm": 3.359375,
"learning_rate": 1.940995181554199e-05,
"loss": 1.4355218410491943,
"step": 244
},
{
"epoch": 0.3028624192059095,
"grad_norm": 5.34375,
"learning_rate": 1.93978788969701e-05,
"loss": 1.339043140411377,
"step": 246
},
{
"epoch": 0.3053247152970145,
"grad_norm": 3.03125,
"learning_rate": 1.9385688541926903e-05,
"loss": 1.4305763244628906,
"step": 248
},
{
"epoch": 0.3077870113881194,
"grad_norm": 3.6875,
"learning_rate": 1.9373380943926295e-05,
"loss": 1.7878942489624023,
"step": 250
},
{
"epoch": 0.31024930747922436,
"grad_norm": 5.96875,
"learning_rate": 1.9360956298343313e-05,
"loss": 1.680354356765747,
"step": 252
},
{
"epoch": 0.31271160357032934,
"grad_norm": 5.90625,
"learning_rate": 1.934841480241105e-05,
"loss": 1.5553169250488281,
"step": 254
},
{
"epoch": 0.3151738996614343,
"grad_norm": 3.640625,
"learning_rate": 1.9335756655217513e-05,
"loss": 1.4183763265609741,
"step": 256
},
{
"epoch": 0.31763619575253926,
"grad_norm": 2.890625,
"learning_rate": 1.9322982057702492e-05,
"loss": 1.391609787940979,
"step": 258
},
{
"epoch": 0.3200984918436442,
"grad_norm": 3.59375,
"learning_rate": 1.931009121265433e-05,
"loss": 1.4094479084014893,
"step": 260
},
{
"epoch": 0.3225607879347491,
"grad_norm": 3.21875,
"learning_rate": 1.9297084324706734e-05,
"loss": 1.4225077629089355,
"step": 262
},
{
"epoch": 0.3250230840258541,
"grad_norm": 6.40625,
"learning_rate": 1.9283961600335503e-05,
"loss": 1.468010663986206,
"step": 264
},
{
"epoch": 0.32748538011695905,
"grad_norm": 5.3125,
"learning_rate": 1.927072324785529e-05,
"loss": 1.7119166851043701,
"step": 266
},
{
"epoch": 0.32994767620806403,
"grad_norm": 1.609375,
"learning_rate": 1.9257369477416224e-05,
"loss": 1.0271199941635132,
"step": 268
},
{
"epoch": 0.33240997229916897,
"grad_norm": 8.5,
"learning_rate": 1.9243900501000666e-05,
"loss": 1.992653727531433,
"step": 270
},
{
"epoch": 0.33487226839027395,
"grad_norm": 2.46875,
"learning_rate": 1.9230316532419776e-05,
"loss": 1.1357910633087158,
"step": 272
},
{
"epoch": 0.3373345644813789,
"grad_norm": 4.875,
"learning_rate": 1.9216617787310126e-05,
"loss": 1.4825578927993774,
"step": 274
},
{
"epoch": 0.3397968605724838,
"grad_norm": 1.6328125,
"learning_rate": 1.920280448313031e-05,
"loss": 1.0347270965576172,
"step": 276
},
{
"epoch": 0.3422591566635888,
"grad_norm": 12.625,
"learning_rate": 1.918887683915746e-05,
"loss": 1.3586125373840332,
"step": 278
},
{
"epoch": 0.34472145275469374,
"grad_norm": 4.15625,
"learning_rate": 1.9174835076483786e-05,
"loss": 1.4484443664550781,
"step": 280
},
{
"epoch": 0.3471837488457987,
"grad_norm": 21.0,
"learning_rate": 1.916067941801305e-05,
"loss": 1.623072624206543,
"step": 282
},
{
"epoch": 0.34964604493690365,
"grad_norm": 2.25,
"learning_rate": 1.914641008845704e-05,
"loss": 1.2479501962661743,
"step": 284
},
{
"epoch": 0.35210834102800864,
"grad_norm": 2.1875,
"learning_rate": 1.9132027314331992e-05,
"loss": 1.23157799243927,
"step": 286
},
{
"epoch": 0.3545706371191136,
"grad_norm": 3.9375,
"learning_rate": 1.9117531323955004e-05,
"loss": 1.4075965881347656,
"step": 288
},
{
"epoch": 0.3570329332102185,
"grad_norm": 4.59375,
"learning_rate": 1.910292234744042e-05,
"loss": 1.6323527097702026,
"step": 290
},
{
"epoch": 0.3594952293013235,
"grad_norm": 6.15625,
"learning_rate": 1.9088200616696135e-05,
"loss": 1.7271039485931396,
"step": 292
},
{
"epoch": 0.3619575253924284,
"grad_norm": 9.625,
"learning_rate": 1.9073366365419974e-05,
"loss": 1.7908841371536255,
"step": 294
},
{
"epoch": 0.3644198214835334,
"grad_norm": 5.75,
"learning_rate": 1.9058419829095926e-05,
"loss": 1.6885616779327393,
"step": 296
},
{
"epoch": 0.36688211757463834,
"grad_norm": 4.34375,
"learning_rate": 1.9043361244990458e-05,
"loss": 1.6981712579727173,
"step": 298
},
{
"epoch": 0.36934441366574333,
"grad_norm": 9.1875,
"learning_rate": 1.9028190852148695e-05,
"loss": 1.8226585388183594,
"step": 300
},
{
"epoch": 0.37180670975684826,
"grad_norm": 4.5,
"learning_rate": 1.9012908891390674e-05,
"loss": 1.448561191558838,
"step": 302
},
{
"epoch": 0.3742690058479532,
"grad_norm": 1.8671875,
"learning_rate": 1.8997515605307484e-05,
"loss": 1.1009801626205444,
"step": 304
},
{
"epoch": 0.3767313019390582,
"grad_norm": 3.53125,
"learning_rate": 1.898201123825744e-05,
"loss": 1.4048492908477783,
"step": 306
},
{
"epoch": 0.3791935980301631,
"grad_norm": 3.0625,
"learning_rate": 1.8966396036362197e-05,
"loss": 1.317664384841919,
"step": 308
},
{
"epoch": 0.3816558941212681,
"grad_norm": 7.125,
"learning_rate": 1.8950670247502823e-05,
"loss": 1.1580454111099243,
"step": 310
},
{
"epoch": 0.38411819021237303,
"grad_norm": 2.71875,
"learning_rate": 1.8934834121315904e-05,
"loss": 1.486496925354004,
"step": 312
},
{
"epoch": 0.38658048630347797,
"grad_norm": 2.578125,
"learning_rate": 1.8918887909189537e-05,
"loss": 1.1772874593734741,
"step": 314
},
{
"epoch": 0.38904278239458295,
"grad_norm": 1.75,
"learning_rate": 1.8902831864259384e-05,
"loss": 1.039048671722412,
"step": 316
},
{
"epoch": 0.3915050784856879,
"grad_norm": 2.921875,
"learning_rate": 1.8886666241404614e-05,
"loss": 1.3585329055786133,
"step": 318
},
{
"epoch": 0.3939673745767929,
"grad_norm": 2.46875,
"learning_rate": 1.887039129724387e-05,
"loss": 1.1052215099334717,
"step": 320
},
{
"epoch": 0.3964296706678978,
"grad_norm": 3.296875,
"learning_rate": 1.8854007290131223e-05,
"loss": 1.4763174057006836,
"step": 322
},
{
"epoch": 0.3988919667590028,
"grad_norm": 6.34375,
"learning_rate": 1.8837514480152016e-05,
"loss": 1.395377278327942,
"step": 324
},
{
"epoch": 0.4013542628501077,
"grad_norm": 1.90625,
"learning_rate": 1.882091312911879e-05,
"loss": 1.043440580368042,
"step": 326
},
{
"epoch": 0.40381655894121266,
"grad_norm": 4.5625,
"learning_rate": 1.880420350056709e-05,
"loss": 1.8225022554397583,
"step": 328
},
{
"epoch": 0.40627885503231764,
"grad_norm": 1.3828125,
"learning_rate": 1.87873858597513e-05,
"loss": 1.035279393196106,
"step": 330
},
{
"epoch": 0.4087411511234226,
"grad_norm": 4.46875,
"learning_rate": 1.877046047364044e-05,
"loss": 1.4025003910064697,
"step": 332
},
{
"epoch": 0.41120344721452756,
"grad_norm": 5.3125,
"learning_rate": 1.875342761091389e-05,
"loss": 1.5152015686035156,
"step": 334
},
{
"epoch": 0.4136657433056325,
"grad_norm": 4.4375,
"learning_rate": 1.8736287541957172e-05,
"loss": 1.0334498882293701,
"step": 336
},
{
"epoch": 0.4161280393967375,
"grad_norm": 9.25,
"learning_rate": 1.8719040538857625e-05,
"loss": 1.5699793100357056,
"step": 338
},
{
"epoch": 0.4185903354878424,
"grad_norm": 1.1875,
"learning_rate": 1.8701686875400104e-05,
"loss": 1.0974748134613037,
"step": 340
},
{
"epoch": 0.42105263157894735,
"grad_norm": 4.25,
"learning_rate": 1.8684226827062632e-05,
"loss": 1.4441235065460205,
"step": 342
},
{
"epoch": 0.42351492767005233,
"grad_norm": 7.5,
"learning_rate": 1.8666660671012002e-05,
"loss": 1.0178951025009155,
"step": 344
},
{
"epoch": 0.42597722376115726,
"grad_norm": 6.96875,
"learning_rate": 1.8648988686099416e-05,
"loss": 1.7429275512695312,
"step": 346
},
{
"epoch": 0.42843951985226225,
"grad_norm": 5.5625,
"learning_rate": 1.863121115285604e-05,
"loss": 1.3890095949172974,
"step": 348
},
{
"epoch": 0.4309018159433672,
"grad_norm": 107.0,
"learning_rate": 1.8613328353488533e-05,
"loss": 1.671781301498413,
"step": 350
},
{
"epoch": 0.43336411203447217,
"grad_norm": 6.4375,
"learning_rate": 1.8595340571874607e-05,
"loss": 0.9639192223548889,
"step": 352
},
{
"epoch": 0.4358264081255771,
"grad_norm": 4.34375,
"learning_rate": 1.8577248093558486e-05,
"loss": 1.3523774147033691,
"step": 354
},
{
"epoch": 0.43828870421668203,
"grad_norm": 6.53125,
"learning_rate": 1.855905120574638e-05,
"loss": 1.4467836618423462,
"step": 356
},
{
"epoch": 0.440751000307787,
"grad_norm": 6.125,
"learning_rate": 1.854075019730194e-05,
"loss": 1.521872878074646,
"step": 358
},
{
"epoch": 0.44321329639889195,
"grad_norm": 3.890625,
"learning_rate": 1.8522345358741662e-05,
"loss": 0.7035669088363647,
"step": 360
},
{
"epoch": 0.44567559248999694,
"grad_norm": 6.1875,
"learning_rate": 1.8503836982230284e-05,
"loss": 1.9208122491836548,
"step": 362
},
{
"epoch": 0.4481378885811019,
"grad_norm": 3.328125,
"learning_rate": 1.848522536157612e-05,
"loss": 1.4902818202972412,
"step": 364
},
{
"epoch": 0.45060018467220686,
"grad_norm": 5.625,
"learning_rate": 1.8466510792226447e-05,
"loss": 1.7599055767059326,
"step": 366
},
{
"epoch": 0.4530624807633118,
"grad_norm": 9.875,
"learning_rate": 1.8447693571262757e-05,
"loss": 1.6332001686096191,
"step": 368
},
{
"epoch": 0.4555247768544167,
"grad_norm": 2.953125,
"learning_rate": 1.842877399739608e-05,
"loss": 1.3132367134094238,
"step": 370
},
{
"epoch": 0.4579870729455217,
"grad_norm": 3.09375,
"learning_rate": 1.840975237096224e-05,
"loss": 1.3803317546844482,
"step": 372
},
{
"epoch": 0.46044936903662664,
"grad_norm": 4.15625,
"learning_rate": 1.8390628993917062e-05,
"loss": 1.3456385135650635,
"step": 374
},
{
"epoch": 0.46291166512773163,
"grad_norm": 7.3125,
"learning_rate": 1.8371404169831613e-05,
"loss": 0.39371660351753235,
"step": 376
},
{
"epoch": 0.46537396121883656,
"grad_norm": 3.453125,
"learning_rate": 1.8352078203887346e-05,
"loss": 1.3137223720550537,
"step": 378
},
{
"epoch": 0.4678362573099415,
"grad_norm": 6.5,
"learning_rate": 1.8332651402871286e-05,
"loss": 0.324982613325119,
"step": 380
},
{
"epoch": 0.4702985534010465,
"grad_norm": 7.8125,
"learning_rate": 1.8313124075171153e-05,
"loss": 1.7339143753051758,
"step": 382
},
{
"epoch": 0.4727608494921514,
"grad_norm": 3.546875,
"learning_rate": 1.8293496530770448e-05,
"loss": 1.3264766931533813,
"step": 384
},
{
"epoch": 0.4752231455832564,
"grad_norm": 5.28125,
"learning_rate": 1.827376908124356e-05,
"loss": 1.732757568359375,
"step": 386
},
{
"epoch": 0.47768544167436133,
"grad_norm": 17.625,
"learning_rate": 1.8253942039750795e-05,
"loss": 1.7728583812713623,
"step": 388
},
{
"epoch": 0.4801477377654663,
"grad_norm": 2.046875,
"learning_rate": 1.8234015721033428e-05,
"loss": 1.1088775396347046,
"step": 390
},
{
"epoch": 0.48261003385657125,
"grad_norm": 7.75,
"learning_rate": 1.8213990441408687e-05,
"loss": 1.7161972522735596,
"step": 392
},
{
"epoch": 0.4850723299476762,
"grad_norm": 2.71875,
"learning_rate": 1.819386651876474e-05,
"loss": 1.3242639303207397,
"step": 394
},
{
"epoch": 0.48753462603878117,
"grad_norm": 3.4375,
"learning_rate": 1.8173644272555645e-05,
"loss": 1.387306571006775,
"step": 396
},
{
"epoch": 0.4899969221298861,
"grad_norm": 11.875,
"learning_rate": 1.815332402379629e-05,
"loss": 0.28826314210891724,
"step": 398
},
{
"epoch": 0.4924592182209911,
"grad_norm": 2.953125,
"learning_rate": 1.8132906095057287e-05,
"loss": 1.4168837070465088,
"step": 400
},
{
"epoch": 0.494921514312096,
"grad_norm": 7.15625,
"learning_rate": 1.8112390810459842e-05,
"loss": 1.8249226808547974,
"step": 402
},
{
"epoch": 0.497383810403201,
"grad_norm": 3.328125,
"learning_rate": 1.8091778495670645e-05,
"loss": 1.3672676086425781,
"step": 404
},
{
"epoch": 0.49984610649430594,
"grad_norm": 1.8359375,
"learning_rate": 1.8071069477896655e-05,
"loss": 1.1166040897369385,
"step": 406
},
{
"epoch": 0.5023084025854109,
"grad_norm": 3.609375,
"learning_rate": 1.805026408587994e-05,
"loss": 1.401571273803711,
"step": 408
},
{
"epoch": 0.5047706986765158,
"grad_norm": 1.03125,
"learning_rate": 1.8029362649892436e-05,
"loss": 1.0254689455032349,
"step": 410
},
{
"epoch": 0.5072329947676208,
"grad_norm": 8.3125,
"learning_rate": 1.8008365501730716e-05,
"loss": 1.4256839752197266,
"step": 412
},
{
"epoch": 0.5096952908587258,
"grad_norm": 2.234375,
"learning_rate": 1.7987272974710733e-05,
"loss": 1.2576653957366943,
"step": 414
},
{
"epoch": 0.5121575869498307,
"grad_norm": 6.65625,
"learning_rate": 1.7966085403662502e-05,
"loss": 1.847425937652588,
"step": 416
},
{
"epoch": 0.5146198830409356,
"grad_norm": 1.9609375,
"learning_rate": 1.79448031249248e-05,
"loss": 1.2791142463684082,
"step": 418
},
{
"epoch": 0.5170821791320406,
"grad_norm": 3.453125,
"learning_rate": 1.7923426476339843e-05,
"loss": 1.4304306507110596,
"step": 420
},
{
"epoch": 0.5195444752231456,
"grad_norm": 2.765625,
"learning_rate": 1.7901955797247894e-05,
"loss": 1.354073405265808,
"step": 422
},
{
"epoch": 0.5220067713142506,
"grad_norm": 5.46875,
"learning_rate": 1.7880391428481877e-05,
"loss": 1.1258585453033447,
"step": 424
},
{
"epoch": 0.5244690674053555,
"grad_norm": 7.96875,
"learning_rate": 1.7858733712362006e-05,
"loss": 1.2407653331756592,
"step": 426
},
{
"epoch": 0.5269313634964604,
"grad_norm": 3.953125,
"learning_rate": 1.7836982992690298e-05,
"loss": 1.3420263528823853,
"step": 428
},
{
"epoch": 0.5293936595875655,
"grad_norm": 1.6640625,
"learning_rate": 1.781513961474515e-05,
"loss": 1.070509672164917,
"step": 430
},
{
"epoch": 0.5318559556786704,
"grad_norm": 5.34375,
"learning_rate": 1.7793203925275857e-05,
"loss": 1.4249287843704224,
"step": 432
},
{
"epoch": 0.5343182517697753,
"grad_norm": 3.453125,
"learning_rate": 1.777117627249708e-05,
"loss": 1.3717284202575684,
"step": 434
},
{
"epoch": 0.5367805478608803,
"grad_norm": 3.28125,
"learning_rate": 1.774905700608335e-05,
"loss": 1.177480697631836,
"step": 436
},
{
"epoch": 0.5392428439519852,
"grad_norm": 1.6015625,
"learning_rate": 1.7726846477163506e-05,
"loss": 1.0270402431488037,
"step": 438
},
{
"epoch": 0.5417051400430902,
"grad_norm": 3.859375,
"learning_rate": 1.7704545038315108e-05,
"loss": 1.0033745765686035,
"step": 440
},
{
"epoch": 0.5441674361341952,
"grad_norm": 51.25,
"learning_rate": 1.7682153043558865e-05,
"loss": 1.7934285402297974,
"step": 442
},
{
"epoch": 0.5466297322253001,
"grad_norm": 4.6875,
"learning_rate": 1.765967084835299e-05,
"loss": 1.5169916152954102,
"step": 444
},
{
"epoch": 0.549092028316405,
"grad_norm": 2.15625,
"learning_rate": 1.763709880958758e-05,
"loss": 1.102067470550537,
"step": 446
},
{
"epoch": 0.55155432440751,
"grad_norm": 5.9375,
"learning_rate": 1.7614437285578927e-05,
"loss": 1.742466926574707,
"step": 448
},
{
"epoch": 0.554016620498615,
"grad_norm": 2.484375,
"learning_rate": 1.7591686636063855e-05,
"loss": 0.9622822403907776,
"step": 450
},
{
"epoch": 0.5564789165897199,
"grad_norm": 3.671875,
"learning_rate": 1.756884722219398e-05,
"loss": 1.3980923891067505,
"step": 452
},
{
"epoch": 0.5589412126808249,
"grad_norm": 3.4375,
"learning_rate": 1.754591940653002e-05,
"loss": 1.2967207431793213,
"step": 454
},
{
"epoch": 0.5614035087719298,
"grad_norm": 2.0625,
"learning_rate": 1.7522903553035983e-05,
"loss": 1.026415228843689,
"step": 456
},
{
"epoch": 0.5638658048630347,
"grad_norm": 8.4375,
"learning_rate": 1.749980002707344e-05,
"loss": 1.6526079177856445,
"step": 458
},
{
"epoch": 0.5663281009541398,
"grad_norm": 1.453125,
"learning_rate": 1.747660919539571e-05,
"loss": 1.0682464838027954,
"step": 460
},
{
"epoch": 0.5687903970452447,
"grad_norm": 1.4296875,
"learning_rate": 1.745333142614201e-05,
"loss": 1.2323286533355713,
"step": 462
},
{
"epoch": 0.5712526931363496,
"grad_norm": 8.6875,
"learning_rate": 1.742996708883165e-05,
"loss": 1.657741665840149,
"step": 464
},
{
"epoch": 0.5737149892274546,
"grad_norm": 5.6875,
"learning_rate": 1.740651655435815e-05,
"loss": 1.5120787620544434,
"step": 466
},
{
"epoch": 0.5761772853185596,
"grad_norm": 3.421875,
"learning_rate": 1.7382980194983354e-05,
"loss": 1.3939659595489502,
"step": 468
},
{
"epoch": 0.5786395814096645,
"grad_norm": 5.75,
"learning_rate": 1.735935838433151e-05,
"loss": 1.6433215141296387,
"step": 470
},
{
"epoch": 0.5811018775007695,
"grad_norm": 2.21875,
"learning_rate": 1.7335651497383357e-05,
"loss": 1.078176498413086,
"step": 472
},
{
"epoch": 0.5835641735918744,
"grad_norm": 3.03125,
"learning_rate": 1.731185991047017e-05,
"loss": 1.3398302793502808,
"step": 474
},
{
"epoch": 0.5860264696829793,
"grad_norm": 3.015625,
"learning_rate": 1.7287984001267765e-05,
"loss": 1.344508171081543,
"step": 476
},
{
"epoch": 0.5884887657740844,
"grad_norm": 3.921875,
"learning_rate": 1.7264024148790538e-05,
"loss": 1.453425407409668,
"step": 478
},
{
"epoch": 0.5909510618651893,
"grad_norm": 1.234375,
"learning_rate": 1.7239980733385408e-05,
"loss": 0.9735173583030701,
"step": 480
},
{
"epoch": 0.5934133579562942,
"grad_norm": 3.03125,
"learning_rate": 1.721585413672582e-05,
"loss": 1.3980371952056885,
"step": 482
},
{
"epoch": 0.5958756540473992,
"grad_norm": 3.6875,
"learning_rate": 1.7191644741805648e-05,
"loss": 1.3482059240341187,
"step": 484
},
{
"epoch": 0.5983379501385041,
"grad_norm": 3.203125,
"learning_rate": 1.716735293293316e-05,
"loss": 1.404923439025879,
"step": 486
},
{
"epoch": 0.6008002462296091,
"grad_norm": 5.59375,
"learning_rate": 1.7142979095724865e-05,
"loss": 1.5890945196151733,
"step": 488
},
{
"epoch": 0.6032625423207141,
"grad_norm": 10.25,
"learning_rate": 1.7118523617099435e-05,
"loss": 1.7281887531280518,
"step": 490
},
{
"epoch": 0.605724838411819,
"grad_norm": 3.3125,
"learning_rate": 1.7093986885271532e-05,
"loss": 1.4024686813354492,
"step": 492
},
{
"epoch": 0.6081871345029239,
"grad_norm": 1.171875,
"learning_rate": 1.7069369289745673e-05,
"loss": 1.1231578588485718,
"step": 494
},
{
"epoch": 0.610649430594029,
"grad_norm": 5.40625,
"learning_rate": 1.704467122131003e-05,
"loss": 1.6918822526931763,
"step": 496
},
{
"epoch": 0.6131117266851339,
"grad_norm": 5.6875,
"learning_rate": 1.7019893072030222e-05,
"loss": 1.7565666437149048,
"step": 498
},
{
"epoch": 0.6155740227762388,
"grad_norm": 6.0,
"learning_rate": 1.6995035235243098e-05,
"loss": 1.582336187362671,
"step": 500
},
{
"epoch": 0.6180363188673438,
"grad_norm": 2.515625,
"learning_rate": 1.6970098105550514e-05,
"loss": 1.2266004085540771,
"step": 502
},
{
"epoch": 0.6204986149584487,
"grad_norm": 3.140625,
"learning_rate": 1.694508207881302e-05,
"loss": 1.3281134366989136,
"step": 504
},
{
"epoch": 0.6229609110495538,
"grad_norm": 2.6875,
"learning_rate": 1.691998755214363e-05,
"loss": 1.2356681823730469,
"step": 506
},
{
"epoch": 0.6254232071406587,
"grad_norm": 3.40625,
"learning_rate": 1.689481492390148e-05,
"loss": 1.0685112476348877,
"step": 508
},
{
"epoch": 0.6278855032317636,
"grad_norm": 3.203125,
"learning_rate": 1.686956459368551e-05,
"loss": 1.0986112356185913,
"step": 510
},
{
"epoch": 0.6303477993228686,
"grad_norm": 1.8671875,
"learning_rate": 1.6844236962328154e-05,
"loss": 1.1448196172714233,
"step": 512
},
{
"epoch": 0.6328100954139735,
"grad_norm": 4.0625,
"learning_rate": 1.681883243188892e-05,
"loss": 1.5838472843170166,
"step": 514
},
{
"epoch": 0.6352723915050785,
"grad_norm": 5.0,
"learning_rate": 1.6793351405648053e-05,
"loss": 1.0939499139785767,
"step": 516
},
{
"epoch": 0.6377346875961835,
"grad_norm": 2.078125,
"learning_rate": 1.6767794288100123e-05,
"loss": 0.9746682643890381,
"step": 518
},
{
"epoch": 0.6401969836872884,
"grad_norm": 7.53125,
"learning_rate": 1.6742161484947596e-05,
"loss": 0.9929898977279663,
"step": 520
},
{
"epoch": 0.6426592797783933,
"grad_norm": 7.28125,
"learning_rate": 1.6716453403094394e-05,
"loss": 1.6372830867767334,
"step": 522
},
{
"epoch": 0.6451215758694983,
"grad_norm": 18.875,
"learning_rate": 1.6690670450639435e-05,
"loss": 0.2726695239543915,
"step": 524
},
{
"epoch": 0.6475838719606033,
"grad_norm": 4.1875,
"learning_rate": 1.6664813036870174e-05,
"loss": 1.3791524171829224,
"step": 526
},
{
"epoch": 0.6500461680517082,
"grad_norm": 20.5,
"learning_rate": 1.6638881572256078e-05,
"loss": 1.7047182321548462,
"step": 528
},
{
"epoch": 0.6525084641428132,
"grad_norm": 5.5,
"learning_rate": 1.6612876468442118e-05,
"loss": 1.8910508155822754,
"step": 530
},
{
"epoch": 0.6549707602339181,
"grad_norm": 6.0,
"learning_rate": 1.6586798138242258e-05,
"loss": 1.3536272048950195,
"step": 532
},
{
"epoch": 0.6574330563250231,
"grad_norm": 5.9375,
"learning_rate": 1.6560646995632865e-05,
"loss": 1.404782772064209,
"step": 534
},
{
"epoch": 0.6598953524161281,
"grad_norm": 3.296875,
"learning_rate": 1.6534423455746157e-05,
"loss": 1.3882639408111572,
"step": 536
},
{
"epoch": 0.662357648507233,
"grad_norm": 4.65625,
"learning_rate": 1.6508127934863633e-05,
"loss": 1.3433642387390137,
"step": 538
},
{
"epoch": 0.6648199445983379,
"grad_norm": 7.5625,
"learning_rate": 1.6481760850409406e-05,
"loss": 1.7808656692504883,
"step": 540
},
{
"epoch": 0.6672822406894429,
"grad_norm": 5.1875,
"learning_rate": 1.645532262094364e-05,
"loss": 1.405790090560913,
"step": 542
},
{
"epoch": 0.6697445367805479,
"grad_norm": 8.0625,
"learning_rate": 1.6428813666155878e-05,
"loss": 1.3506624698638916,
"step": 544
},
{
"epoch": 0.6722068328716528,
"grad_norm": 4.125,
"learning_rate": 1.6402234406858375e-05,
"loss": 1.3872720003128052,
"step": 546
},
{
"epoch": 0.6746691289627578,
"grad_norm": 2.234375,
"learning_rate": 1.6375585264979423e-05,
"loss": 1.1865075826644897,
"step": 548
},
{
"epoch": 0.6771314250538627,
"grad_norm": 4.625,
"learning_rate": 1.6348866663556645e-05,
"loss": 1.4540220499038696,
"step": 550
},
{
"epoch": 0.6795937211449676,
"grad_norm": 8.3125,
"learning_rate": 1.6322079026730317e-05,
"loss": 1.0791795253753662,
"step": 552
},
{
"epoch": 0.6820560172360727,
"grad_norm": 2.359375,
"learning_rate": 1.6295222779736586e-05,
"loss": 1.1618213653564453,
"step": 554
},
{
"epoch": 0.6845183133271776,
"grad_norm": 5.78125,
"learning_rate": 1.626829834890074e-05,
"loss": 1.6633763313293457,
"step": 556
},
{
"epoch": 0.6869806094182825,
"grad_norm": 8.375,
"learning_rate": 1.624130616163044e-05,
"loss": 1.7596007585525513,
"step": 558
},
{
"epoch": 0.6894429055093875,
"grad_norm": 1.5625,
"learning_rate": 1.6214246646408946e-05,
"loss": 1.0530022382736206,
"step": 560
},
{
"epoch": 0.6919052016004925,
"grad_norm": 3.375,
"learning_rate": 1.61871202327883e-05,
"loss": 1.3792953491210938,
"step": 562
},
{
"epoch": 0.6943674976915974,
"grad_norm": 3.640625,
"learning_rate": 1.6159927351382512e-05,
"loss": 1.3962174654006958,
"step": 564
},
{
"epoch": 0.6968297937827024,
"grad_norm": 3.59375,
"learning_rate": 1.6132668433860723e-05,
"loss": 1.3606011867523193,
"step": 566
},
{
"epoch": 0.6992920898738073,
"grad_norm": 3.859375,
"learning_rate": 1.6105343912940355e-05,
"loss": 1.3807508945465088,
"step": 568
},
{
"epoch": 0.7017543859649122,
"grad_norm": 2.78125,
"learning_rate": 1.6077954222380235e-05,
"loss": 1.3539392948150635,
"step": 570
},
{
"epoch": 0.7042166820560173,
"grad_norm": 5.125,
"learning_rate": 1.6050499796973733e-05,
"loss": 1.3989124298095703,
"step": 572
},
{
"epoch": 0.7066789781471222,
"grad_norm": 3.875,
"learning_rate": 1.6022981072541823e-05,
"loss": 1.3723649978637695,
"step": 574
},
{
"epoch": 0.7091412742382271,
"grad_norm": 7.65625,
"learning_rate": 1.599539848592619e-05,
"loss": 1.3160829544067383,
"step": 576
},
{
"epoch": 0.7116035703293321,
"grad_norm": 3.171875,
"learning_rate": 1.5967752474982296e-05,
"loss": 1.1242200136184692,
"step": 578
},
{
"epoch": 0.714065866420437,
"grad_norm": 2.4375,
"learning_rate": 1.5940043478572413e-05,
"loss": 1.0219156742095947,
"step": 580
},
{
"epoch": 0.716528162511542,
"grad_norm": 1.7578125,
"learning_rate": 1.591227193655867e-05,
"loss": 0.9959127902984619,
"step": 582
},
{
"epoch": 0.718990458602647,
"grad_norm": 5.15625,
"learning_rate": 1.5884438289796067e-05,
"loss": 1.828487753868103,
"step": 584
},
{
"epoch": 0.7214527546937519,
"grad_norm": 3.828125,
"learning_rate": 1.5856542980125477e-05,
"loss": 1.4034947156906128,
"step": 586
},
{
"epoch": 0.7239150507848569,
"grad_norm": 13.75,
"learning_rate": 1.5828586450366626e-05,
"loss": 1.3598823547363281,
"step": 588
},
{
"epoch": 0.7263773468759618,
"grad_norm": 3.453125,
"learning_rate": 1.5800569144311078e-05,
"loss": 1.3686673641204834,
"step": 590
},
{
"epoch": 0.7288396429670668,
"grad_norm": 4.21875,
"learning_rate": 1.5772491506715174e-05,
"loss": 1.3769757747650146,
"step": 592
},
{
"epoch": 0.7313019390581718,
"grad_norm": 9.5625,
"learning_rate": 1.5744353983292975e-05,
"loss": 0.6412605047225952,
"step": 594
},
{
"epoch": 0.7337642351492767,
"grad_norm": 3.921875,
"learning_rate": 1.5716157020709196e-05,
"loss": 1.3520535230636597,
"step": 596
},
{
"epoch": 0.7362265312403816,
"grad_norm": 3.03125,
"learning_rate": 1.5687901066572116e-05,
"loss": 1.0531518459320068,
"step": 598
},
{
"epoch": 0.7386888273314867,
"grad_norm": 3.375,
"learning_rate": 1.565958656942645e-05,
"loss": 1.3364739418029785,
"step": 600
},
{
"epoch": 0.7411511234225916,
"grad_norm": 1.4140625,
"learning_rate": 1.563121397874626e-05,
"loss": 1.0583405494689941,
"step": 602
},
{
"epoch": 0.7436134195136965,
"grad_norm": 6.875,
"learning_rate": 1.5602783744927794e-05,
"loss": 1.699558138847351,
"step": 604
},
{
"epoch": 0.7460757156048015,
"grad_norm": 5.5625,
"learning_rate": 1.557429631928235e-05,
"loss": 1.6496608257293701,
"step": 606
},
{
"epoch": 0.7485380116959064,
"grad_norm": 4.6875,
"learning_rate": 1.5545752154029118e-05,
"loss": 1.3926259279251099,
"step": 608
},
{
"epoch": 0.7510003077870114,
"grad_norm": 1.796875,
"learning_rate": 1.5517151702287977e-05,
"loss": 1.0908641815185547,
"step": 610
},
{
"epoch": 0.7534626038781164,
"grad_norm": 2.28125,
"learning_rate": 1.548849541807233e-05,
"loss": 1.1665232181549072,
"step": 612
},
{
"epoch": 0.7559248999692213,
"grad_norm": 4.375,
"learning_rate": 1.5459783756281872e-05,
"loss": 1.5498981475830078,
"step": 614
},
{
"epoch": 0.7583871960603262,
"grad_norm": 3.796875,
"learning_rate": 1.543101717269539e-05,
"loss": 1.3930026292800903,
"step": 616
},
{
"epoch": 0.7608494921514312,
"grad_norm": 18.75,
"learning_rate": 1.5402196123963514e-05,
"loss": 1.3921393156051636,
"step": 618
},
{
"epoch": 0.7633117882425362,
"grad_norm": 4.4375,
"learning_rate": 1.537332106760147e-05,
"loss": 1.5707228183746338,
"step": 620
},
{
"epoch": 0.7657740843336411,
"grad_norm": 5.125,
"learning_rate": 1.5344392461981835e-05,
"loss": 1.1423331499099731,
"step": 622
},
{
"epoch": 0.7682363804247461,
"grad_norm": 3.328125,
"learning_rate": 1.5315410766327224e-05,
"loss": 1.3473308086395264,
"step": 624
},
{
"epoch": 0.770698676515851,
"grad_norm": 6.65625,
"learning_rate": 1.5286376440703034e-05,
"loss": 1.5814166069030762,
"step": 626
},
{
"epoch": 0.7731609726069559,
"grad_norm": 6.625,
"learning_rate": 1.5257289946010123e-05,
"loss": 1.664976954460144,
"step": 628
},
{
"epoch": 0.775623268698061,
"grad_norm": 3.984375,
"learning_rate": 1.5228151743977502e-05,
"loss": 1.0675089359283447,
"step": 630
},
{
"epoch": 0.7780855647891659,
"grad_norm": 6.34375,
"learning_rate": 1.5198962297155002e-05,
"loss": 1.5473486185073853,
"step": 632
},
{
"epoch": 0.7805478608802708,
"grad_norm": 3.296875,
"learning_rate": 1.5169722068905927e-05,
"loss": 1.4237251281738281,
"step": 634
},
{
"epoch": 0.7830101569713758,
"grad_norm": 1.5859375,
"learning_rate": 1.514043152339971e-05,
"loss": 1.1319770812988281,
"step": 636
},
{
"epoch": 0.7854724530624808,
"grad_norm": 5.90625,
"learning_rate": 1.5111091125604538e-05,
"loss": 1.7654370069503784,
"step": 638
},
{
"epoch": 0.7879347491535857,
"grad_norm": 4.46875,
"learning_rate": 1.5081701341279957e-05,
"loss": 1.407934546470642,
"step": 640
},
{
"epoch": 0.7903970452446907,
"grad_norm": 4.09375,
"learning_rate": 1.5052262636969506e-05,
"loss": 1.3491337299346924,
"step": 642
},
{
"epoch": 0.7928593413357956,
"grad_norm": 1.796875,
"learning_rate": 1.502277547999329e-05,
"loss": 1.125083565711975,
"step": 644
},
{
"epoch": 0.7953216374269005,
"grad_norm": 6.0,
"learning_rate": 1.4993240338440571e-05,
"loss": 1.3817883729934692,
"step": 646
},
{
"epoch": 0.7977839335180056,
"grad_norm": 3.46875,
"learning_rate": 1.4963657681162328e-05,
"loss": 1.3694324493408203,
"step": 648
},
{
"epoch": 0.8002462296091105,
"grad_norm": 7.71875,
"learning_rate": 1.4934027977763838e-05,
"loss": 1.451867699623108,
"step": 650
},
{
"epoch": 0.8027085257002154,
"grad_norm": 3.59375,
"learning_rate": 1.4904351698597181e-05,
"loss": 1.386351466178894,
"step": 652
},
{
"epoch": 0.8051708217913204,
"grad_norm": 1.6796875,
"learning_rate": 1.4874629314753812e-05,
"loss": 1.0673191547393799,
"step": 654
},
{
"epoch": 0.8076331178824253,
"grad_norm": 4.28125,
"learning_rate": 1.4844861298057068e-05,
"loss": 1.4586551189422607,
"step": 656
},
{
"epoch": 0.8100954139735304,
"grad_norm": 1.6015625,
"learning_rate": 1.4815048121054667e-05,
"loss": 1.042107105255127,
"step": 658
},
{
"epoch": 0.8125577100646353,
"grad_norm": 4.1875,
"learning_rate": 1.4785190257011231e-05,
"loss": 1.6682562828063965,
"step": 660
},
{
"epoch": 0.8150200061557402,
"grad_norm": 7.21875,
"learning_rate": 1.4755288179900741e-05,
"loss": 1.720628261566162,
"step": 662
},
{
"epoch": 0.8174823022468451,
"grad_norm": 7.15625,
"learning_rate": 1.4725342364399055e-05,
"loss": 1.3896342515945435,
"step": 664
},
{
"epoch": 0.8199445983379502,
"grad_norm": 4.40625,
"learning_rate": 1.4695353285876328e-05,
"loss": 1.3969242572784424,
"step": 666
},
{
"epoch": 0.8224068944290551,
"grad_norm": 4.4375,
"learning_rate": 1.46653214203895e-05,
"loss": 1.686731219291687,
"step": 668
},
{
"epoch": 0.8248691905201601,
"grad_norm": 6.28125,
"learning_rate": 1.463524724467472e-05,
"loss": 1.7890194654464722,
"step": 670
},
{
"epoch": 0.827331486611265,
"grad_norm": 3.0625,
"learning_rate": 1.4605131236139789e-05,
"loss": 1.3969485759735107,
"step": 672
},
{
"epoch": 0.8297937827023699,
"grad_norm": 1.9765625,
"learning_rate": 1.4574973872856566e-05,
"loss": 1.009456992149353,
"step": 674
},
{
"epoch": 0.832256078793475,
"grad_norm": 5.75,
"learning_rate": 1.4544775633553409e-05,
"loss": 1.0795286893844604,
"step": 676
},
{
"epoch": 0.8347183748845799,
"grad_norm": 3.765625,
"learning_rate": 1.4514536997607533e-05,
"loss": 1.291078805923462,
"step": 678
},
{
"epoch": 0.8371806709756848,
"grad_norm": 3.640625,
"learning_rate": 1.4484258445037437e-05,
"loss": 1.2912898063659668,
"step": 680
},
{
"epoch": 0.8396429670667898,
"grad_norm": 5.3125,
"learning_rate": 1.4453940456495268e-05,
"loss": 1.5154544115066528,
"step": 682
},
{
"epoch": 0.8421052631578947,
"grad_norm": 11.6875,
"learning_rate": 1.4423583513259196e-05,
"loss": 1.7723913192749023,
"step": 684
},
{
"epoch": 0.8445675592489997,
"grad_norm": 3.890625,
"learning_rate": 1.4393188097225764e-05,
"loss": 1.4048473834991455,
"step": 686
},
{
"epoch": 0.8470298553401047,
"grad_norm": 4.65625,
"learning_rate": 1.4362754690902242e-05,
"loss": 1.736893653869629,
"step": 688
},
{
"epoch": 0.8494921514312096,
"grad_norm": 2.625,
"learning_rate": 1.4332283777398992e-05,
"loss": 1.4180538654327393,
"step": 690
},
{
"epoch": 0.8519544475223145,
"grad_norm": 4.875,
"learning_rate": 1.4301775840421756e-05,
"loss": 1.700308084487915,
"step": 692
},
{
"epoch": 0.8544167436134195,
"grad_norm": 2.859375,
"learning_rate": 1.4271231364264008e-05,
"loss": 1.2139472961425781,
"step": 694
},
{
"epoch": 0.8568790397045245,
"grad_norm": 8.25,
"learning_rate": 1.424065083379926e-05,
"loss": 1.690704584121704,
"step": 696
},
{
"epoch": 0.8593413357956294,
"grad_norm": 5.625,
"learning_rate": 1.421003473447335e-05,
"loss": 1.757250189781189,
"step": 698
},
{
"epoch": 0.8618036318867344,
"grad_norm": 9.25,
"learning_rate": 1.4179383552296768e-05,
"loss": 1.7566320896148682,
"step": 700
},
{
"epoch": 0.8642659279778393,
"grad_norm": 6.0625,
"learning_rate": 1.4148697773836908e-05,
"loss": 1.804456353187561,
"step": 702
},
{
"epoch": 0.8667282240689443,
"grad_norm": 3.609375,
"learning_rate": 1.4117977886210352e-05,
"loss": 1.6510390043258667,
"step": 704
},
{
"epoch": 0.8691905201600493,
"grad_norm": 5.40625,
"learning_rate": 1.4087224377075162e-05,
"loss": 1.194544792175293,
"step": 706
},
{
"epoch": 0.8716528162511542,
"grad_norm": 5.75,
"learning_rate": 1.4056437734623103e-05,
"loss": 1.3318874835968018,
"step": 708
},
{
"epoch": 0.8741151123422591,
"grad_norm": 4.78125,
"learning_rate": 1.4025618447571914e-05,
"loss": 1.4258933067321777,
"step": 710
},
{
"epoch": 0.8765774084333641,
"grad_norm": 1.8671875,
"learning_rate": 1.3994767005157543e-05,
"loss": 1.1039819717407227,
"step": 712
},
{
"epoch": 0.8790397045244691,
"grad_norm": 3.71875,
"learning_rate": 1.3963883897126395e-05,
"loss": 1.3149468898773193,
"step": 714
},
{
"epoch": 0.881502000615574,
"grad_norm": 7.15625,
"learning_rate": 1.393296961372753e-05,
"loss": 1.3563876152038574,
"step": 716
},
{
"epoch": 0.883964296706679,
"grad_norm": 2.578125,
"learning_rate": 1.390202464570491e-05,
"loss": 1.226351022720337,
"step": 718
},
{
"epoch": 0.8864265927977839,
"grad_norm": 4.40625,
"learning_rate": 1.3871049484289586e-05,
"loss": 1.6103639602661133,
"step": 720
},
{
"epoch": 0.8888888888888888,
"grad_norm": 3.671875,
"learning_rate": 1.3840044621191907e-05,
"loss": 1.40117347240448,
"step": 722
},
{
"epoch": 0.8913511849799939,
"grad_norm": 3.3125,
"learning_rate": 1.380901054859373e-05,
"loss": 1.0493632555007935,
"step": 724
},
{
"epoch": 0.8938134810710988,
"grad_norm": 4.78125,
"learning_rate": 1.3777947759140581e-05,
"loss": 1.497347354888916,
"step": 726
},
{
"epoch": 0.8962757771622037,
"grad_norm": 1.546875,
"learning_rate": 1.3746856745933861e-05,
"loss": 1.1111018657684326,
"step": 728
},
{
"epoch": 0.8987380732533087,
"grad_norm": 1.7734375,
"learning_rate": 1.3715738002522983e-05,
"loss": 1.1223242282867432,
"step": 730
},
{
"epoch": 0.9012003693444137,
"grad_norm": 7.78125,
"learning_rate": 1.3684592022897577e-05,
"loss": 1.526750087738037,
"step": 732
},
{
"epoch": 0.9036626654355187,
"grad_norm": 1.3203125,
"learning_rate": 1.3653419301479625e-05,
"loss": 1.1531429290771484,
"step": 734
},
{
"epoch": 0.9061249615266236,
"grad_norm": 6.84375,
"learning_rate": 1.3622220333115618e-05,
"loss": 1.627996563911438,
"step": 736
},
{
"epoch": 0.9085872576177285,
"grad_norm": 5.0,
"learning_rate": 1.3590995613068695e-05,
"loss": 1.3804816007614136,
"step": 738
},
{
"epoch": 0.9110495537088334,
"grad_norm": 3.890625,
"learning_rate": 1.3559745637010796e-05,
"loss": 1.3431119918823242,
"step": 740
},
{
"epoch": 0.9135118497999385,
"grad_norm": 7.21875,
"learning_rate": 1.3528470901014768e-05,
"loss": 1.7569446563720703,
"step": 742
},
{
"epoch": 0.9159741458910434,
"grad_norm": 3.453125,
"learning_rate": 1.3497171901546527e-05,
"loss": 1.4046237468719482,
"step": 744
},
{
"epoch": 0.9184364419821484,
"grad_norm": 2.65625,
"learning_rate": 1.3465849135457133e-05,
"loss": 1.1801738739013672,
"step": 746
},
{
"epoch": 0.9208987380732533,
"grad_norm": 3.625,
"learning_rate": 1.3434503099974943e-05,
"loss": 1.414689540863037,
"step": 748
},
{
"epoch": 0.9233610341643582,
"grad_norm": 3.40625,
"learning_rate": 1.3403134292697688e-05,
"loss": 1.3589739799499512,
"step": 750
},
{
"epoch": 0.9258233302554633,
"grad_norm": 3.390625,
"learning_rate": 1.3371743211584602e-05,
"loss": 1.2147026062011719,
"step": 752
},
{
"epoch": 0.9282856263465682,
"grad_norm": 14.0,
"learning_rate": 1.3340330354948483e-05,
"loss": 0.6764575242996216,
"step": 754
},
{
"epoch": 0.9307479224376731,
"grad_norm": 2.515625,
"learning_rate": 1.330889622144781e-05,
"loss": 1.1622259616851807,
"step": 756
},
{
"epoch": 0.9332102185287781,
"grad_norm": 3.53125,
"learning_rate": 1.3277441310078824e-05,
"loss": 1.3609400987625122,
"step": 758
},
{
"epoch": 0.935672514619883,
"grad_norm": 10.8125,
"learning_rate": 1.3245966120167592e-05,
"loss": 1.149078130722046,
"step": 760
},
{
"epoch": 0.938134810710988,
"grad_norm": 2.1875,
"learning_rate": 1.3214471151362092e-05,
"loss": 1.119340419769287,
"step": 762
},
{
"epoch": 0.940597106802093,
"grad_norm": 3.546875,
"learning_rate": 1.3182956903624278e-05,
"loss": 1.0370396375656128,
"step": 764
},
{
"epoch": 0.9430594028931979,
"grad_norm": 2.453125,
"learning_rate": 1.3151423877222147e-05,
"loss": 1.1257320642471313,
"step": 766
},
{
"epoch": 0.9455216989843028,
"grad_norm": 3.34375,
"learning_rate": 1.3119872572721794e-05,
"loss": 1.3441581726074219,
"step": 768
},
{
"epoch": 0.9479839950754079,
"grad_norm": 5.75,
"learning_rate": 1.3088303490979471e-05,
"loss": 1.3604907989501953,
"step": 770
},
{
"epoch": 0.9504462911665128,
"grad_norm": 4.6875,
"learning_rate": 1.3056717133133621e-05,
"loss": 1.6805719137191772,
"step": 772
},
{
"epoch": 0.9529085872576177,
"grad_norm": 5.625,
"learning_rate": 1.3025114000596943e-05,
"loss": 1.780057430267334,
"step": 774
},
{
"epoch": 0.9553708833487227,
"grad_norm": 3.28125,
"learning_rate": 1.2993494595048422e-05,
"loss": 1.401186466217041,
"step": 776
},
{
"epoch": 0.9578331794398276,
"grad_norm": 8.875,
"learning_rate": 1.2961859418425365e-05,
"loss": 1.7668989896774292,
"step": 778
},
{
"epoch": 0.9602954755309326,
"grad_norm": 5.6875,
"learning_rate": 1.2930208972915437e-05,
"loss": 1.4184396266937256,
"step": 780
},
{
"epoch": 0.9627577716220376,
"grad_norm": 3.75,
"learning_rate": 1.2898543760948673e-05,
"loss": 1.4058780670166016,
"step": 782
},
{
"epoch": 0.9652200677131425,
"grad_norm": 1.0859375,
"learning_rate": 1.2866864285189543e-05,
"loss": 1.0642720460891724,
"step": 784
},
{
"epoch": 0.9676823638042474,
"grad_norm": 5.65625,
"learning_rate": 1.2835171048528916e-05,
"loss": 1.7296231985092163,
"step": 786
},
{
"epoch": 0.9701446598953524,
"grad_norm": 4.96875,
"learning_rate": 1.2803464554076128e-05,
"loss": 1.4836857318878174,
"step": 788
},
{
"epoch": 0.9726069559864574,
"grad_norm": 5.1875,
"learning_rate": 1.2771745305150965e-05,
"loss": 1.7830345630645752,
"step": 790
},
{
"epoch": 0.9750692520775623,
"grad_norm": 3.421875,
"learning_rate": 1.2740013805275672e-05,
"loss": 1.3922364711761475,
"step": 792
},
{
"epoch": 0.9775315481686673,
"grad_norm": 7.34375,
"learning_rate": 1.2708270558166995e-05,
"loss": 1.0389618873596191,
"step": 794
},
{
"epoch": 0.9799938442597722,
"grad_norm": 5.1875,
"learning_rate": 1.2676516067728135e-05,
"loss": 1.5342938899993896,
"step": 796
},
{
"epoch": 0.9824561403508771,
"grad_norm": 5.03125,
"learning_rate": 1.264475083804078e-05,
"loss": 1.7565385103225708,
"step": 798
},
{
"epoch": 0.9849184364419822,
"grad_norm": 4.34375,
"learning_rate": 1.2612975373357113e-05,
"loss": 1.398611068725586,
"step": 800
},
{
"epoch": 0.9873807325330871,
"grad_norm": 5.78125,
"learning_rate": 1.2581190178091764e-05,
"loss": 1.4105567932128906,
"step": 802
},
{
"epoch": 0.989843028624192,
"grad_norm": 4.59375,
"learning_rate": 1.2549395756813852e-05,
"loss": 1.1484860181808472,
"step": 804
},
{
"epoch": 0.992305324715297,
"grad_norm": 6.875,
"learning_rate": 1.251759261423894e-05,
"loss": 0.9945257902145386,
"step": 806
},
{
"epoch": 0.994767620806402,
"grad_norm": 12.5625,
"learning_rate": 1.2485781255221037e-05,
"loss": 1.5860981941223145,
"step": 808
},
{
"epoch": 0.997229916897507,
"grad_norm": 13.4375,
"learning_rate": 1.2453962184744595e-05,
"loss": 1.3061414957046509,
"step": 810
},
{
"epoch": 0.9996922129886119,
"grad_norm": 2.109375,
"learning_rate": 1.2422135907916459e-05,
"loss": 1.0748600959777832,
"step": 812
},
{
"epoch": 1.0012311480455525,
"grad_norm": 1.0078125,
"learning_rate": 1.239030292995789e-05,
"loss": 1.1813337802886963,
"step": 814
},
{
"epoch": 1.0036934441366574,
"grad_norm": 3.109375,
"learning_rate": 1.2358463756196515e-05,
"loss": 1.3365702629089355,
"step": 816
},
{
"epoch": 1.0061557402277623,
"grad_norm": 3.484375,
"learning_rate": 1.2326618892058316e-05,
"loss": 1.269797444343567,
"step": 818
},
{
"epoch": 1.0086180363188673,
"grad_norm": 2.734375,
"learning_rate": 1.2294768843059611e-05,
"loss": 1.130170226097107,
"step": 820
},
{
"epoch": 1.0110803324099722,
"grad_norm": 4.1875,
"learning_rate": 1.2262914114799011e-05,
"loss": 1.5535081624984741,
"step": 822
},
{
"epoch": 1.0135426285010773,
"grad_norm": 9.0625,
"learning_rate": 1.2231055212949427e-05,
"loss": 1.7664412260055542,
"step": 824
},
{
"epoch": 1.0160049245921823,
"grad_norm": 2.109375,
"learning_rate": 1.219919264325001e-05,
"loss": 1.4970834255218506,
"step": 826
},
{
"epoch": 1.0184672206832872,
"grad_norm": 1.7734375,
"learning_rate": 1.2167326911498137e-05,
"loss": 1.1276826858520508,
"step": 828
},
{
"epoch": 1.0209295167743921,
"grad_norm": 2.328125,
"learning_rate": 1.2135458523541384e-05,
"loss": 1.0198701620101929,
"step": 830
},
{
"epoch": 1.023391812865497,
"grad_norm": 6.0,
"learning_rate": 1.2103587985269483e-05,
"loss": 1.1860932111740112,
"step": 832
},
{
"epoch": 1.025854108956602,
"grad_norm": 2.703125,
"learning_rate": 1.207171580260632e-05,
"loss": 1.4768877029418945,
"step": 834
},
{
"epoch": 1.028316405047707,
"grad_norm": 8.625,
"learning_rate": 1.2039842481501865e-05,
"loss": 1.481208086013794,
"step": 836
},
{
"epoch": 1.0307787011388119,
"grad_norm": 2.90625,
"learning_rate": 1.200796852792417e-05,
"loss": 1.473567008972168,
"step": 838
},
{
"epoch": 1.0332409972299168,
"grad_norm": 10.5625,
"learning_rate": 1.1976094447851323e-05,
"loss": 1.5777289867401123,
"step": 840
},
{
"epoch": 1.035703293321022,
"grad_norm": 4.03125,
"learning_rate": 1.1944220747263425e-05,
"loss": 1.3818743228912354,
"step": 842
},
{
"epoch": 1.0381655894121269,
"grad_norm": 3.625,
"learning_rate": 1.1912347932134552e-05,
"loss": 1.2724220752716064,
"step": 844
},
{
"epoch": 1.0406278855032318,
"grad_norm": 2.6875,
"learning_rate": 1.1880476508424717e-05,
"loss": 1.3566083908081055,
"step": 846
},
{
"epoch": 1.0430901815943368,
"grad_norm": 1.3515625,
"learning_rate": 1.1848606982071851e-05,
"loss": 1.2785669565200806,
"step": 848
},
{
"epoch": 1.0455524776854417,
"grad_norm": 5.375,
"learning_rate": 1.1816739858983767e-05,
"loss": 1.5428179502487183,
"step": 850
},
{
"epoch": 1.0480147737765466,
"grad_norm": 5.53125,
"learning_rate": 1.178487564503012e-05,
"loss": 1.7369728088378906,
"step": 852
},
{
"epoch": 1.0504770698676515,
"grad_norm": 5.15625,
"learning_rate": 1.1753014846034398e-05,
"loss": 1.6508008241653442,
"step": 854
},
{
"epoch": 1.0529393659587565,
"grad_norm": 5.125,
"learning_rate": 1.1721157967765869e-05,
"loss": 1.4951319694519043,
"step": 856
},
{
"epoch": 1.0554016620498614,
"grad_norm": 6.3125,
"learning_rate": 1.1689305515931556e-05,
"loss": 1.5579488277435303,
"step": 858
},
{
"epoch": 1.0578639581409663,
"grad_norm": 9.0625,
"learning_rate": 1.1657457996168233e-05,
"loss": 1.2465214729309082,
"step": 860
},
{
"epoch": 1.0603262542320715,
"grad_norm": 2.796875,
"learning_rate": 1.1625615914034363e-05,
"loss": 1.1531850099563599,
"step": 862
},
{
"epoch": 1.0627885503231764,
"grad_norm": 7.625,
"learning_rate": 1.1593779775002104e-05,
"loss": 1.6242802143096924,
"step": 864
},
{
"epoch": 1.0652508464142814,
"grad_norm": 6.8125,
"learning_rate": 1.1561950084449258e-05,
"loss": 1.7797261476516724,
"step": 866
},
{
"epoch": 1.0677131425053863,
"grad_norm": 7.625,
"learning_rate": 1.153012734765127e-05,
"loss": 1.0688107013702393,
"step": 868
},
{
"epoch": 1.0701754385964912,
"grad_norm": 1.640625,
"learning_rate": 1.1498312069773205e-05,
"loss": 0.603493332862854,
"step": 870
},
{
"epoch": 1.0726377346875962,
"grad_norm": 7.59375,
"learning_rate": 1.1466504755861708e-05,
"loss": 1.2946546077728271,
"step": 872
},
{
"epoch": 1.075100030778701,
"grad_norm": 4.0,
"learning_rate": 1.143470591083701e-05,
"loss": 1.3011809587478638,
"step": 874
},
{
"epoch": 1.077562326869806,
"grad_norm": 3.1875,
"learning_rate": 1.1402916039484898e-05,
"loss": 1.3322241306304932,
"step": 876
},
{
"epoch": 1.080024622960911,
"grad_norm": 2.953125,
"learning_rate": 1.1371135646448716e-05,
"loss": 1.3409028053283691,
"step": 878
},
{
"epoch": 1.082486919052016,
"grad_norm": 4.9375,
"learning_rate": 1.1339365236221344e-05,
"loss": 1.5541951656341553,
"step": 880
},
{
"epoch": 1.084949215143121,
"grad_norm": 4.1875,
"learning_rate": 1.1307605313137185e-05,
"loss": 1.6270629167556763,
"step": 882
},
{
"epoch": 1.087411511234226,
"grad_norm": 3.515625,
"learning_rate": 1.127585638136417e-05,
"loss": 1.40193510055542,
"step": 884
},
{
"epoch": 1.089873807325331,
"grad_norm": 4.90625,
"learning_rate": 1.1244118944895751e-05,
"loss": 1.3631030321121216,
"step": 886
},
{
"epoch": 1.0923361034164358,
"grad_norm": 11.4375,
"learning_rate": 1.1212393507542898e-05,
"loss": 1.293651819229126,
"step": 888
},
{
"epoch": 1.0947983995075408,
"grad_norm": 4.6875,
"learning_rate": 1.1180680572926107e-05,
"loss": 1.4282387495040894,
"step": 890
},
{
"epoch": 1.0972606955986457,
"grad_norm": 4.09375,
"learning_rate": 1.1148980644467393e-05,
"loss": 1.5414776802062988,
"step": 892
},
{
"epoch": 1.0997229916897506,
"grad_norm": 2.0625,
"learning_rate": 1.1117294225382316e-05,
"loss": 1.2819738388061523,
"step": 894
},
{
"epoch": 1.1021852877808556,
"grad_norm": 3.625,
"learning_rate": 1.1085621818671974e-05,
"loss": 1.116639256477356,
"step": 896
},
{
"epoch": 1.1046475838719605,
"grad_norm": 5.15625,
"learning_rate": 1.1053963927115037e-05,
"loss": 1.3504618406295776,
"step": 898
},
{
"epoch": 1.1071098799630656,
"grad_norm": 4.375,
"learning_rate": 1.102232105325975e-05,
"loss": 1.4307514429092407,
"step": 900
},
{
"epoch": 1.1095721760541706,
"grad_norm": 2.5,
"learning_rate": 1.0990693699415962e-05,
"loss": 1.2542567253112793,
"step": 902
},
{
"epoch": 1.1120344721452755,
"grad_norm": 7.71875,
"learning_rate": 1.0959082367647155e-05,
"loss": 1.3272080421447754,
"step": 904
},
{
"epoch": 1.1144967682363804,
"grad_norm": 5.3125,
"learning_rate": 1.0927487559762478e-05,
"loss": 1.344172477722168,
"step": 906
},
{
"epoch": 1.1169590643274854,
"grad_norm": 7.59375,
"learning_rate": 1.0895909777308757e-05,
"loss": 1.2731947898864746,
"step": 908
},
{
"epoch": 1.1194213604185903,
"grad_norm": 1.5234375,
"learning_rate": 1.0864349521562563e-05,
"loss": 1.2336888313293457,
"step": 910
},
{
"epoch": 1.1218836565096952,
"grad_norm": 2.21875,
"learning_rate": 1.0832807293522239e-05,
"loss": 1.125575304031372,
"step": 912
},
{
"epoch": 1.1243459526008002,
"grad_norm": 2.609375,
"learning_rate": 1.080128359389995e-05,
"loss": 1.1796314716339111,
"step": 914
},
{
"epoch": 1.1268082486919053,
"grad_norm": 4.28125,
"learning_rate": 1.0769778923113736e-05,
"loss": 1.1832040548324585,
"step": 916
},
{
"epoch": 1.1292705447830103,
"grad_norm": 2.15625,
"learning_rate": 1.0738293781279561e-05,
"loss": 1.1113415956497192,
"step": 918
},
{
"epoch": 1.1317328408741152,
"grad_norm": 2.734375,
"learning_rate": 1.0706828668203384e-05,
"loss": 1.1446493864059448,
"step": 920
},
{
"epoch": 1.1341951369652201,
"grad_norm": 4.09375,
"learning_rate": 1.067538408337323e-05,
"loss": 1.3466662168502808,
"step": 922
},
{
"epoch": 1.136657433056325,
"grad_norm": 2.953125,
"learning_rate": 1.064396052595123e-05,
"loss": 1.1979475021362305,
"step": 924
},
{
"epoch": 1.13911972914743,
"grad_norm": 6.0,
"learning_rate": 1.0612558494765735e-05,
"loss": 1.2253812551498413,
"step": 926
},
{
"epoch": 1.141582025238535,
"grad_norm": 5.59375,
"learning_rate": 1.0581178488303379e-05,
"loss": 1.512798547744751,
"step": 928
},
{
"epoch": 1.1440443213296398,
"grad_norm": 1.5859375,
"learning_rate": 1.0549821004701163e-05,
"loss": 1.214385986328125,
"step": 930
},
{
"epoch": 1.1465066174207448,
"grad_norm": 1.640625,
"learning_rate": 1.0518486541738552e-05,
"loss": 1.0102102756500244,
"step": 932
},
{
"epoch": 1.1489689135118497,
"grad_norm": 8.4375,
"learning_rate": 1.0487175596829584e-05,
"loss": 1.2178149223327637,
"step": 934
},
{
"epoch": 1.1514312096029546,
"grad_norm": 5.59375,
"learning_rate": 1.0455888667014956e-05,
"loss": 1.3471554517745972,
"step": 936
},
{
"epoch": 1.1538935056940598,
"grad_norm": 32.5,
"learning_rate": 1.0424626248954135e-05,
"loss": 1.5330407619476318,
"step": 938
},
{
"epoch": 1.1563558017851647,
"grad_norm": 3.359375,
"learning_rate": 1.0393388838917489e-05,
"loss": 1.6406910419464111,
"step": 940
},
{
"epoch": 1.1588180978762697,
"grad_norm": 1.703125,
"learning_rate": 1.0362176932778399e-05,
"loss": 1.2105987071990967,
"step": 942
},
{
"epoch": 1.1612803939673746,
"grad_norm": 1.59375,
"learning_rate": 1.0330991026005384e-05,
"loss": 1.194588303565979,
"step": 944
},
{
"epoch": 1.1637426900584795,
"grad_norm": 2.65625,
"learning_rate": 1.0299831613654243e-05,
"loss": 1.1566952466964722,
"step": 946
},
{
"epoch": 1.1662049861495845,
"grad_norm": 3.71875,
"learning_rate": 1.026869919036019e-05,
"loss": 1.2074699401855469,
"step": 948
},
{
"epoch": 1.1686672822406894,
"grad_norm": 11.375,
"learning_rate": 1.0237594250330013e-05,
"loss": 1.3596782684326172,
"step": 950
},
{
"epoch": 1.1711295783317943,
"grad_norm": 5.84375,
"learning_rate": 1.020651728733422e-05,
"loss": 1.3205690383911133,
"step": 952
},
{
"epoch": 1.1735918744228995,
"grad_norm": 3.734375,
"learning_rate": 1.0175468794699193e-05,
"loss": 1.337862253189087,
"step": 954
},
{
"epoch": 1.1760541705140044,
"grad_norm": 3.875,
"learning_rate": 1.014444926529937e-05,
"loss": 1.3420543670654297,
"step": 956
},
{
"epoch": 1.1785164666051093,
"grad_norm": 3.265625,
"learning_rate": 1.0113459191549423e-05,
"loss": 1.3313000202178955,
"step": 958
},
{
"epoch": 1.1809787626962143,
"grad_norm": 5.03125,
"learning_rate": 1.008249906539643e-05,
"loss": 1.4042177200317383,
"step": 960
},
{
"epoch": 1.1834410587873192,
"grad_norm": 3.9375,
"learning_rate": 1.0051569378312066e-05,
"loss": 1.3378522396087646,
"step": 962
},
{
"epoch": 1.1859033548784241,
"grad_norm": 10.9375,
"learning_rate": 1.0020670621284814e-05,
"loss": 0.8419127464294434,
"step": 964
},
{
"epoch": 1.188365650969529,
"grad_norm": 3.796875,
"learning_rate": 9.989803284812156e-06,
"loss": 0.8327467441558838,
"step": 966
},
{
"epoch": 1.190827947060634,
"grad_norm": 18.0,
"learning_rate": 9.958967858892796e-06,
"loss": 1.1072711944580078,
"step": 968
},
{
"epoch": 1.193290243151739,
"grad_norm": 11.375,
"learning_rate": 9.928164833018884e-06,
"loss": 1.109494686126709,
"step": 970
},
{
"epoch": 1.1957525392428439,
"grad_norm": 7.15625,
"learning_rate": 9.897394696168232e-06,
"loss": 1.2777066230773926,
"step": 972
},
{
"epoch": 1.1982148353339488,
"grad_norm": 4.03125,
"learning_rate": 9.866657936796567e-06,
"loss": 1.089713454246521,
"step": 974
},
{
"epoch": 1.200677131425054,
"grad_norm": 4.875,
"learning_rate": 9.835955042829762e-06,
"loss": 1.1587715148925781,
"step": 976
},
{
"epoch": 1.2031394275161589,
"grad_norm": 5.3125,
"learning_rate": 9.805286501656111e-06,
"loss": 1.300113558769226,
"step": 978
},
{
"epoch": 1.2056017236072638,
"grad_norm": 7.34375,
"learning_rate": 9.774652800118567e-06,
"loss": 1.2401779890060425,
"step": 980
},
{
"epoch": 1.2080640196983687,
"grad_norm": 1.3046875,
"learning_rate": 9.74405442450704e-06,
"loss": 1.2466282844543457,
"step": 982
},
{
"epoch": 1.2105263157894737,
"grad_norm": 9.9375,
"learning_rate": 9.713491860550646e-06,
"loss": 1.485695242881775,
"step": 984
},
{
"epoch": 1.2129886118805786,
"grad_norm": 4.9375,
"learning_rate": 9.682965593410037e-06,
"loss": 1.6573221683502197,
"step": 986
},
{
"epoch": 1.2154509079716835,
"grad_norm": 3.734375,
"learning_rate": 9.652476107669662e-06,
"loss": 1.3761565685272217,
"step": 988
},
{
"epoch": 1.2179132040627885,
"grad_norm": 4.09375,
"learning_rate": 9.622023887330094e-06,
"loss": 1.3099732398986816,
"step": 990
},
{
"epoch": 1.2203755001538936,
"grad_norm": 10.625,
"learning_rate": 9.591609415800338e-06,
"loss": 1.5944232940673828,
"step": 992
},
{
"epoch": 1.2228377962449986,
"grad_norm": 7.375,
"learning_rate": 9.561233175890165e-06,
"loss": 1.7219964265823364,
"step": 994
},
{
"epoch": 1.2253000923361035,
"grad_norm": 2.78125,
"learning_rate": 9.530895649802445e-06,
"loss": 1.623438835144043,
"step": 996
},
{
"epoch": 1.2277623884272084,
"grad_norm": 3.359375,
"learning_rate": 9.50059731912549e-06,
"loss": 1.3701614141464233,
"step": 998
},
{
"epoch": 1.2302246845183133,
"grad_norm": 2.8125,
"learning_rate": 9.470338664825408e-06,
"loss": 1.2980146408081055,
"step": 1000
},
{
"epoch": 1.2326869806094183,
"grad_norm": 5.71875,
"learning_rate": 9.44012016723848e-06,
"loss": 1.5235289335250854,
"step": 1002
},
{
"epoch": 1.2351492767005232,
"grad_norm": 8.3125,
"learning_rate": 9.409942306063513e-06,
"loss": 1.6062097549438477,
"step": 1004
},
{
"epoch": 1.2376115727916281,
"grad_norm": 2.65625,
"learning_rate": 9.379805560354246e-06,
"loss": 1.3337829113006592,
"step": 1006
},
{
"epoch": 1.240073868882733,
"grad_norm": 4.78125,
"learning_rate": 9.349710408511734e-06,
"loss": 0.7538601160049438,
"step": 1008
},
{
"epoch": 1.242536164973838,
"grad_norm": 27.25,
"learning_rate": 9.319657328276757e-06,
"loss": 0.47900092601776123,
"step": 1010
},
{
"epoch": 1.244998461064943,
"grad_norm": 4.75,
"learning_rate": 9.289646796722234e-06,
"loss": 1.0039315223693848,
"step": 1012
},
{
"epoch": 1.247460757156048,
"grad_norm": 6.375,
"learning_rate": 9.259679290245658e-06,
"loss": 1.2915596961975098,
"step": 1014
},
{
"epoch": 1.249923053247153,
"grad_norm": 4.0,
"learning_rate": 9.229755284561518e-06,
"loss": 1.336082935333252,
"step": 1016
},
{
"epoch": 1.252385349338258,
"grad_norm": 4.90625,
"learning_rate": 9.19987525469376e-06,
"loss": 1.416182279586792,
"step": 1018
},
{
"epoch": 1.254847645429363,
"grad_norm": 5.0625,
"learning_rate": 9.170039674968254e-06,
"loss": 1.378662109375,
"step": 1020
},
{
"epoch": 1.2573099415204678,
"grad_norm": 2.484375,
"learning_rate": 9.140249019005236e-06,
"loss": 1.3030860424041748,
"step": 1022
},
{
"epoch": 1.2597722376115728,
"grad_norm": 4.0625,
"learning_rate": 9.110503759711811e-06,
"loss": 1.3451809883117676,
"step": 1024
},
{
"epoch": 1.2622345337026777,
"grad_norm": 5.75,
"learning_rate": 9.080804369274451e-06,
"loss": 1.3729634284973145,
"step": 1026
},
{
"epoch": 1.2646968297937828,
"grad_norm": 3.65625,
"learning_rate": 9.051151319151479e-06,
"loss": 1.3505221605300903,
"step": 1028
},
{
"epoch": 1.2671591258848878,
"grad_norm": 5.1875,
"learning_rate": 9.021545080065603e-06,
"loss": 1.3553135395050049,
"step": 1030
},
{
"epoch": 1.2696214219759927,
"grad_norm": 6.09375,
"learning_rate": 8.991986121996432e-06,
"loss": 1.4693278074264526,
"step": 1032
},
{
"epoch": 1.2720837180670976,
"grad_norm": 7.09375,
"learning_rate": 8.962474914173022e-06,
"loss": 1.8386784791946411,
"step": 1034
},
{
"epoch": 1.2745460141582026,
"grad_norm": 5.3125,
"learning_rate": 8.933011925066431e-06,
"loss": 1.5184224843978882,
"step": 1036
},
{
"epoch": 1.2770083102493075,
"grad_norm": 14.0,
"learning_rate": 8.903597622382263e-06,
"loss": 1.3686227798461914,
"step": 1038
},
{
"epoch": 1.2794706063404124,
"grad_norm": 5.15625,
"learning_rate": 8.87423247305327e-06,
"loss": 1.3770601749420166,
"step": 1040
},
{
"epoch": 1.2819329024315174,
"grad_norm": 8.75,
"learning_rate": 8.84491694323192e-06,
"loss": 0.8821253776550293,
"step": 1042
},
{
"epoch": 1.2843951985226223,
"grad_norm": 2.84375,
"learning_rate": 8.815651498283002e-06,
"loss": 0.7115093469619751,
"step": 1044
},
{
"epoch": 1.2868574946137272,
"grad_norm": 3.0625,
"learning_rate": 8.786436602776248e-06,
"loss": 1.1449503898620605,
"step": 1046
},
{
"epoch": 1.2893197907048322,
"grad_norm": 3.359375,
"learning_rate": 8.757272720478942e-06,
"loss": 1.3050785064697266,
"step": 1048
},
{
"epoch": 1.291782086795937,
"grad_norm": 4.78125,
"learning_rate": 8.728160314348575e-06,
"loss": 1.0610979795455933,
"step": 1050
},
{
"epoch": 1.2942443828870422,
"grad_norm": 2.96875,
"learning_rate": 8.699099846525486e-06,
"loss": 0.9030791521072388,
"step": 1052
},
{
"epoch": 1.2967066789781472,
"grad_norm": 4.15625,
"learning_rate": 8.670091778325521e-06,
"loss": 1.3431543111801147,
"step": 1054
},
{
"epoch": 1.299168975069252,
"grad_norm": 2.90625,
"learning_rate": 8.641136570232724e-06,
"loss": 1.3691339492797852,
"step": 1056
},
{
"epoch": 1.301631271160357,
"grad_norm": 2.78125,
"learning_rate": 8.612234681892017e-06,
"loss": 1.3442999124526978,
"step": 1058
},
{
"epoch": 1.304093567251462,
"grad_norm": 4.0625,
"learning_rate": 8.583386572101902e-06,
"loss": 1.3384771347045898,
"step": 1060
},
{
"epoch": 1.306555863342567,
"grad_norm": 6.6875,
"learning_rate": 8.554592698807185e-06,
"loss": 1.4566752910614014,
"step": 1062
},
{
"epoch": 1.3090181594336718,
"grad_norm": 6.09375,
"learning_rate": 8.525853519091708e-06,
"loss": 1.7774509191513062,
"step": 1064
},
{
"epoch": 1.311480455524777,
"grad_norm": 5.5625,
"learning_rate": 8.497169489171077e-06,
"loss": 1.4398928880691528,
"step": 1066
},
{
"epoch": 1.313942751615882,
"grad_norm": 4.5,
"learning_rate": 8.468541064385447e-06,
"loss": 1.4056460857391357,
"step": 1068
},
{
"epoch": 1.3164050477069869,
"grad_norm": 4.78125,
"learning_rate": 8.439968699192262e-06,
"loss": 1.2474167346954346,
"step": 1070
},
{
"epoch": 1.3188673437980918,
"grad_norm": 2.65625,
"learning_rate": 8.411452847159063e-06,
"loss": 1.4466845989227295,
"step": 1072
},
{
"epoch": 1.3213296398891967,
"grad_norm": 2.875,
"learning_rate": 8.382993960956287e-06,
"loss": 1.3356812000274658,
"step": 1074
},
{
"epoch": 1.3237919359803016,
"grad_norm": 3.578125,
"learning_rate": 8.35459249235007e-06,
"loss": 1.3684732913970947,
"step": 1076
},
{
"epoch": 1.3262542320714066,
"grad_norm": 13.3125,
"learning_rate": 8.32624889219508e-06,
"loss": 1.5551846027374268,
"step": 1078
},
{
"epoch": 1.3287165281625115,
"grad_norm": 1.7734375,
"learning_rate": 8.297963610427366e-06,
"loss": 1.287471055984497,
"step": 1080
},
{
"epoch": 1.3311788242536164,
"grad_norm": 8.375,
"learning_rate": 8.269737096057207e-06,
"loss": 1.3594995737075806,
"step": 1082
},
{
"epoch": 1.3336411203447214,
"grad_norm": 4.125,
"learning_rate": 8.24156979716199e-06,
"loss": 1.451033592224121,
"step": 1084
},
{
"epoch": 1.3361034164358263,
"grad_norm": 4.625,
"learning_rate": 8.213462160879098e-06,
"loss": 1.272244930267334,
"step": 1086
},
{
"epoch": 1.3385657125269312,
"grad_norm": 2.0625,
"learning_rate": 8.185414633398805e-06,
"loss": 1.1681973934173584,
"step": 1088
},
{
"epoch": 1.3410280086180364,
"grad_norm": 4.0625,
"learning_rate": 8.157427659957198e-06,
"loss": 1.1624126434326172,
"step": 1090
},
{
"epoch": 1.3434903047091413,
"grad_norm": 9.9375,
"learning_rate": 8.12950168482911e-06,
"loss": 1.3475921154022217,
"step": 1092
},
{
"epoch": 1.3459526008002463,
"grad_norm": 13.6875,
"learning_rate": 8.101637151321057e-06,
"loss": 1.4795109033584595,
"step": 1094
},
{
"epoch": 1.3484148968913512,
"grad_norm": 2.078125,
"learning_rate": 8.07383450176423e-06,
"loss": 1.3539352416992188,
"step": 1096
},
{
"epoch": 1.3508771929824561,
"grad_norm": 2.984375,
"learning_rate": 8.046094177507436e-06,
"loss": 1.0916264057159424,
"step": 1098
},
{
"epoch": 1.353339489073561,
"grad_norm": 3.078125,
"learning_rate": 8.018416618910105e-06,
"loss": 1.337206482887268,
"step": 1100
},
{
"epoch": 1.355801785164666,
"grad_norm": 6.78125,
"learning_rate": 7.99080226533532e-06,
"loss": 1.5372506380081177,
"step": 1102
},
{
"epoch": 1.3582640812557711,
"grad_norm": 8.875,
"learning_rate": 7.963251555142813e-06,
"loss": 1.4474639892578125,
"step": 1104
},
{
"epoch": 1.360726377346876,
"grad_norm": 6.90625,
"learning_rate": 7.935764925682028e-06,
"loss": 1.782578468322754,
"step": 1106
},
{
"epoch": 1.363188673437981,
"grad_norm": 10.25,
"learning_rate": 7.908342813285159e-06,
"loss": 1.6106759309768677,
"step": 1108
},
{
"epoch": 1.365650969529086,
"grad_norm": 10.125,
"learning_rate": 7.880985653260244e-06,
"loss": 1.5926954746246338,
"step": 1110
},
{
"epoch": 1.3681132656201909,
"grad_norm": 7.375,
"learning_rate": 7.853693879884239e-06,
"loss": 1.7612438201904297,
"step": 1112
},
{
"epoch": 1.3705755617112958,
"grad_norm": 3.3125,
"learning_rate": 7.826467926396125e-06,
"loss": 1.5579084157943726,
"step": 1114
},
{
"epoch": 1.3730378578024007,
"grad_norm": 1.5703125,
"learning_rate": 7.799308224990049e-06,
"loss": 1.1745721101760864,
"step": 1116
},
{
"epoch": 1.3755001538935057,
"grad_norm": 3.59375,
"learning_rate": 7.772215206808441e-06,
"loss": 1.1942408084869385,
"step": 1118
},
{
"epoch": 1.3779624499846106,
"grad_norm": 3.890625,
"learning_rate": 7.745189301935184e-06,
"loss": 1.2781388759613037,
"step": 1120
},
{
"epoch": 1.3804247460757155,
"grad_norm": 10.5,
"learning_rate": 7.71823093938877e-06,
"loss": 1.2326617240905762,
"step": 1122
},
{
"epoch": 1.3828870421668205,
"grad_norm": 3.4375,
"learning_rate": 7.691340547115508e-06,
"loss": 1.1817359924316406,
"step": 1124
},
{
"epoch": 1.3853493382579254,
"grad_norm": 1.65625,
"learning_rate": 7.664518551982729e-06,
"loss": 1.280542016029358,
"step": 1126
},
{
"epoch": 1.3878116343490305,
"grad_norm": 1.4140625,
"learning_rate": 7.637765379771997e-06,
"loss": 1.0744314193725586,
"step": 1128
},
{
"epoch": 1.3902739304401355,
"grad_norm": 3.484375,
"learning_rate": 7.61108145517236e-06,
"loss": 1.1780340671539307,
"step": 1130
},
{
"epoch": 1.3927362265312404,
"grad_norm": 2.65625,
"learning_rate": 7.5844672017736e-06,
"loss": 1.1386570930480957,
"step": 1132
},
{
"epoch": 1.3951985226223453,
"grad_norm": 4.21875,
"learning_rate": 7.557923042059525e-06,
"loss": 1.2564072608947754,
"step": 1134
},
{
"epoch": 1.3976608187134503,
"grad_norm": 8.5625,
"learning_rate": 7.531449397401243e-06,
"loss": 1.358655333518982,
"step": 1136
},
{
"epoch": 1.4001231148045552,
"grad_norm": 12.125,
"learning_rate": 7.505046688050486e-06,
"loss": 1.1821155548095703,
"step": 1138
},
{
"epoch": 1.4025854108956601,
"grad_norm": 4.65625,
"learning_rate": 7.4787153331329356e-06,
"loss": 1.3920905590057373,
"step": 1140
},
{
"epoch": 1.4050477069867653,
"grad_norm": 2.703125,
"learning_rate": 7.452455750641563e-06,
"loss": 1.3678568601608276,
"step": 1142
},
{
"epoch": 1.4075100030778702,
"grad_norm": 3.71875,
"learning_rate": 7.4262683574300046e-06,
"loss": 1.2067809104919434,
"step": 1144
},
{
"epoch": 1.4099722991689752,
"grad_norm": 10.25,
"learning_rate": 7.4001535692059335e-06,
"loss": 1.400128722190857,
"step": 1146
},
{
"epoch": 1.41243459526008,
"grad_norm": 3.140625,
"learning_rate": 7.374111800524476e-06,
"loss": 1.1754021644592285,
"step": 1148
},
{
"epoch": 1.414896891351185,
"grad_norm": 5.3125,
"learning_rate": 7.34814346478161e-06,
"loss": 1.3996424674987793,
"step": 1150
},
{
"epoch": 1.41735918744229,
"grad_norm": 1.78125,
"learning_rate": 7.322248974207624e-06,
"loss": 1.1624915599822998,
"step": 1152
},
{
"epoch": 1.4198214835333949,
"grad_norm": 7.1875,
"learning_rate": 7.296428739860557e-06,
"loss": 1.2524189949035645,
"step": 1154
},
{
"epoch": 1.4222837796244998,
"grad_norm": 2.03125,
"learning_rate": 7.270683171619675e-06,
"loss": 1.1983616352081299,
"step": 1156
},
{
"epoch": 1.4247460757156047,
"grad_norm": 5.15625,
"learning_rate": 7.2450126781789795e-06,
"loss": 1.263120412826538,
"step": 1158
},
{
"epoch": 1.4272083718067097,
"grad_norm": 2.8125,
"learning_rate": 7.219417667040702e-06,
"loss": 1.5528199672698975,
"step": 1160
},
{
"epoch": 1.4296706678978146,
"grad_norm": 6.375,
"learning_rate": 7.193898544508842e-06,
"loss": 1.5049046277999878,
"step": 1162
},
{
"epoch": 1.4321329639889195,
"grad_norm": 2.03125,
"learning_rate": 7.168455715682716e-06,
"loss": 1.2450196743011475,
"step": 1164
},
{
"epoch": 1.4345952600800247,
"grad_norm": 5.75,
"learning_rate": 7.143089584450531e-06,
"loss": 1.0869059562683105,
"step": 1166
},
{
"epoch": 1.4370575561711296,
"grad_norm": 3.734375,
"learning_rate": 7.117800553482971e-06,
"loss": 1.3680589199066162,
"step": 1168
},
{
"epoch": 1.4395198522622346,
"grad_norm": 13.8125,
"learning_rate": 7.092589024226804e-06,
"loss": 1.4548523426055908,
"step": 1170
},
{
"epoch": 1.4419821483533395,
"grad_norm": 6.15625,
"learning_rate": 7.067455396898504e-06,
"loss": 1.0294753313064575,
"step": 1172
},
{
"epoch": 1.4444444444444444,
"grad_norm": 14.375,
"learning_rate": 7.042400070477908e-06,
"loss": 1.1527860164642334,
"step": 1174
},
{
"epoch": 1.4469067405355494,
"grad_norm": 5.0625,
"learning_rate": 7.0174234427018736e-06,
"loss": 1.667987585067749,
"step": 1176
},
{
"epoch": 1.4493690366266543,
"grad_norm": 9.75,
"learning_rate": 6.992525910057972e-06,
"loss": 1.6407973766326904,
"step": 1178
},
{
"epoch": 1.4518313327177594,
"grad_norm": 9.125,
"learning_rate": 6.967707867778193e-06,
"loss": 1.551527500152588,
"step": 1180
},
{
"epoch": 1.4542936288088644,
"grad_norm": 3.21875,
"learning_rate": 6.9429697098326634e-06,
"loss": 1.400420069694519,
"step": 1182
},
{
"epoch": 1.4567559248999693,
"grad_norm": 3.28125,
"learning_rate": 6.918311828923403e-06,
"loss": 1.3203402757644653,
"step": 1184
},
{
"epoch": 1.4592182209910742,
"grad_norm": 3.4375,
"learning_rate": 6.893734616478087e-06,
"loss": 1.2934377193450928,
"step": 1186
},
{
"epoch": 1.4616805170821792,
"grad_norm": 7.84375,
"learning_rate": 6.869238462643825e-06,
"loss": 0.8468174934387207,
"step": 1188
},
{
"epoch": 1.464142813173284,
"grad_norm": 3.78125,
"learning_rate": 6.844823756280985e-06,
"loss": 0.7017765641212463,
"step": 1190
},
{
"epoch": 1.466605109264389,
"grad_norm": 6.75,
"learning_rate": 6.8204908849569996e-06,
"loss": 0.8379335999488831,
"step": 1192
},
{
"epoch": 1.469067405355494,
"grad_norm": 4.78125,
"learning_rate": 6.79624023494023e-06,
"loss": 0.8475155234336853,
"step": 1194
},
{
"epoch": 1.471529701446599,
"grad_norm": 3.625,
"learning_rate": 6.772072191193826e-06,
"loss": 1.5360143184661865,
"step": 1196
},
{
"epoch": 1.4739919975377038,
"grad_norm": 12.375,
"learning_rate": 6.747987137369616e-06,
"loss": 1.451025366783142,
"step": 1198
},
{
"epoch": 1.4764542936288088,
"grad_norm": 6.125,
"learning_rate": 6.72398545580202e-06,
"loss": 1.6992993354797363,
"step": 1200
},
{
"epoch": 1.4789165897199137,
"grad_norm": 2.859375,
"learning_rate": 6.700067527501979e-06,
"loss": 1.4374724626541138,
"step": 1202
},
{
"epoch": 1.4813788858110188,
"grad_norm": 4.34375,
"learning_rate": 6.676233732150905e-06,
"loss": 1.423210859298706,
"step": 1204
},
{
"epoch": 1.4838411819021238,
"grad_norm": 6.375,
"learning_rate": 6.652484448094654e-06,
"loss": 1.3673293590545654,
"step": 1206
},
{
"epoch": 1.4863034779932287,
"grad_norm": 3.171875,
"learning_rate": 6.628820052337515e-06,
"loss": 1.3383548259735107,
"step": 1208
},
{
"epoch": 1.4887657740843336,
"grad_norm": 2.71875,
"learning_rate": 6.605240920536241e-06,
"loss": 0.7290570139884949,
"step": 1210
},
{
"epoch": 1.4912280701754386,
"grad_norm": 3.828125,
"learning_rate": 6.581747426994074e-06,
"loss": 0.8285163044929504,
"step": 1212
},
{
"epoch": 1.4936903662665435,
"grad_norm": 4.8125,
"learning_rate": 6.558339944654797e-06,
"loss": 1.524817705154419,
"step": 1214
},
{
"epoch": 1.4961526623576484,
"grad_norm": 4.09375,
"learning_rate": 6.5350188450968275e-06,
"loss": 1.5156073570251465,
"step": 1216
},
{
"epoch": 1.4986149584487536,
"grad_norm": 1.96875,
"learning_rate": 6.511784498527316e-06,
"loss": 1.266753911972046,
"step": 1218
},
{
"epoch": 1.5010772545398585,
"grad_norm": 4.28125,
"learning_rate": 6.488637273776258e-06,
"loss": 1.234669804573059,
"step": 1220
},
{
"epoch": 1.5035395506309635,
"grad_norm": 3.296875,
"learning_rate": 6.465577538290656e-06,
"loss": 1.1362870931625366,
"step": 1222
},
{
"epoch": 1.5060018467220684,
"grad_norm": 5.78125,
"learning_rate": 6.4426056581286736e-06,
"loss": 1.2194573879241943,
"step": 1224
},
{
"epoch": 1.5084641428131733,
"grad_norm": 2.484375,
"learning_rate": 6.419721997953825e-06,
"loss": 1.3203624486923218,
"step": 1226
},
{
"epoch": 1.5109264389042782,
"grad_norm": 8.875,
"learning_rate": 6.396926921029197e-06,
"loss": 1.4041712284088135,
"step": 1228
},
{
"epoch": 1.5133887349953832,
"grad_norm": 3.015625,
"learning_rate": 6.374220789211669e-06,
"loss": 1.6859148740768433,
"step": 1230
},
{
"epoch": 1.515851031086488,
"grad_norm": 3.03125,
"learning_rate": 6.351603962946182e-06,
"loss": 1.2609457969665527,
"step": 1232
},
{
"epoch": 1.518313327177593,
"grad_norm": 4.0625,
"learning_rate": 6.329076801260007e-06,
"loss": 1.3652920722961426,
"step": 1234
},
{
"epoch": 1.520775623268698,
"grad_norm": 0.9921875,
"learning_rate": 6.306639661757047e-06,
"loss": 1.1765468120574951,
"step": 1236
},
{
"epoch": 1.523237919359803,
"grad_norm": 12.0625,
"learning_rate": 6.2842929006121645e-06,
"loss": 1.2304123640060425,
"step": 1238
},
{
"epoch": 1.5257002154509078,
"grad_norm": 4.03125,
"learning_rate": 6.262036872565519e-06,
"loss": 1.1622458696365356,
"step": 1240
},
{
"epoch": 1.5281625115420128,
"grad_norm": 1.765625,
"learning_rate": 6.239871930916952e-06,
"loss": 1.1903202533721924,
"step": 1242
},
{
"epoch": 1.530624807633118,
"grad_norm": 4.59375,
"learning_rate": 6.21779842752036e-06,
"loss": 1.1756622791290283,
"step": 1244
},
{
"epoch": 1.5330871037242229,
"grad_norm": 3.09375,
"learning_rate": 6.195816712778119e-06,
"loss": 1.361944556236267,
"step": 1246
},
{
"epoch": 1.5355493998153278,
"grad_norm": 2.015625,
"learning_rate": 6.1739271356355205e-06,
"loss": 1.207919955253601,
"step": 1248
},
{
"epoch": 1.5380116959064327,
"grad_norm": 2.96875,
"learning_rate": 6.152130043575235e-06,
"loss": 1.128209114074707,
"step": 1250
},
{
"epoch": 1.5404739919975377,
"grad_norm": 4.9375,
"learning_rate": 6.130425782611788e-06,
"loss": 0.9894086122512817,
"step": 1252
},
{
"epoch": 1.5429362880886428,
"grad_norm": 5.4375,
"learning_rate": 6.1088146972860796e-06,
"loss": 1.4114530086517334,
"step": 1254
},
{
"epoch": 1.5453985841797477,
"grad_norm": 3.28125,
"learning_rate": 6.0872971306598985e-06,
"loss": 1.6339147090911865,
"step": 1256
},
{
"epoch": 1.5478608802708527,
"grad_norm": 1.34375,
"learning_rate": 6.065873424310493e-06,
"loss": 1.2093985080718994,
"step": 1258
},
{
"epoch": 1.5503231763619576,
"grad_norm": 4.1875,
"learning_rate": 6.044543918325134e-06,
"loss": 1.422555923461914,
"step": 1260
},
{
"epoch": 1.5527854724530625,
"grad_norm": 1.6015625,
"learning_rate": 6.0233089512957335e-06,
"loss": 1.3422693014144897,
"step": 1262
},
{
"epoch": 1.5552477685441675,
"grad_norm": 6.75,
"learning_rate": 6.002168860313449e-06,
"loss": 1.1010103225708008,
"step": 1264
},
{
"epoch": 1.5577100646352724,
"grad_norm": 4.28125,
"learning_rate": 5.9811239809633504e-06,
"loss": 1.3068557977676392,
"step": 1266
},
{
"epoch": 1.5601723607263773,
"grad_norm": 3.875,
"learning_rate": 5.960174647319083e-06,
"loss": 1.1887340545654297,
"step": 1268
},
{
"epoch": 1.5626346568174823,
"grad_norm": 6.21875,
"learning_rate": 5.939321191937567e-06,
"loss": 1.1840931177139282,
"step": 1270
},
{
"epoch": 1.5650969529085872,
"grad_norm": 2.609375,
"learning_rate": 5.918563945853714e-06,
"loss": 1.3886611461639404,
"step": 1272
},
{
"epoch": 1.5675592489996921,
"grad_norm": 1.703125,
"learning_rate": 5.8979032385751845e-06,
"loss": 1.1980421543121338,
"step": 1274
},
{
"epoch": 1.570021545090797,
"grad_norm": 5.4375,
"learning_rate": 5.877339398077142e-06,
"loss": 1.4251586198806763,
"step": 1276
},
{
"epoch": 1.572483841181902,
"grad_norm": 8.9375,
"learning_rate": 5.8568727507970566e-06,
"loss": 1.4789252281188965,
"step": 1278
},
{
"epoch": 1.574946137273007,
"grad_norm": 3.453125,
"learning_rate": 5.836503621629518e-06,
"loss": 1.3751678466796875,
"step": 1280
},
{
"epoch": 1.577408433364112,
"grad_norm": 5.09375,
"learning_rate": 5.8162323339210795e-06,
"loss": 1.5434916019439697,
"step": 1282
},
{
"epoch": 1.579870729455217,
"grad_norm": 1.0546875,
"learning_rate": 5.796059209465128e-06,
"loss": 1.2941160202026367,
"step": 1284
},
{
"epoch": 1.582333025546322,
"grad_norm": 3.40625,
"learning_rate": 5.775984568496774e-06,
"loss": 1.2361758947372437,
"step": 1286
},
{
"epoch": 1.5847953216374269,
"grad_norm": 4.125,
"learning_rate": 5.756008729687764e-06,
"loss": 1.2213199138641357,
"step": 1288
},
{
"epoch": 1.587257617728532,
"grad_norm": 5.15625,
"learning_rate": 5.7361320101414264e-06,
"loss": 1.370686411857605,
"step": 1290
},
{
"epoch": 1.589719913819637,
"grad_norm": 2.328125,
"learning_rate": 5.716354725387634e-06,
"loss": 1.160779595375061,
"step": 1292
},
{
"epoch": 1.5921822099107419,
"grad_norm": 3.625,
"learning_rate": 5.696677189377804e-06,
"loss": 1.149789810180664,
"step": 1294
},
{
"epoch": 1.5946445060018468,
"grad_norm": 3.96875,
"learning_rate": 5.677099714479901e-06,
"loss": 1.3322994709014893,
"step": 1296
},
{
"epoch": 1.5971068020929517,
"grad_norm": 3.640625,
"learning_rate": 5.657622611473487e-06,
"loss": 1.3151819705963135,
"step": 1298
},
{
"epoch": 1.5995690981840567,
"grad_norm": 3.78125,
"learning_rate": 5.638246189544789e-06,
"loss": 1.4213796854019165,
"step": 1300
},
{
"epoch": 1.6020313942751616,
"grad_norm": 4.75,
"learning_rate": 5.618970756281786e-06,
"loss": 1.6766854524612427,
"step": 1302
},
{
"epoch": 1.6044936903662665,
"grad_norm": 8.0625,
"learning_rate": 5.5997966176693255e-06,
"loss": 1.551700472831726,
"step": 1304
},
{
"epoch": 1.6069559864573715,
"grad_norm": 2.46875,
"learning_rate": 5.580724078084273e-06,
"loss": 1.2433726787567139,
"step": 1306
},
{
"epoch": 1.6094182825484764,
"grad_norm": 10.25,
"learning_rate": 5.561753440290676e-06,
"loss": 1.3765232563018799,
"step": 1308
},
{
"epoch": 1.6118805786395813,
"grad_norm": 4.3125,
"learning_rate": 5.542885005434956e-06,
"loss": 1.6626167297363281,
"step": 1310
},
{
"epoch": 1.6143428747306863,
"grad_norm": 11.0625,
"learning_rate": 5.524119073041125e-06,
"loss": 1.5003547668457031,
"step": 1312
},
{
"epoch": 1.6168051708217912,
"grad_norm": 4.125,
"learning_rate": 5.505455941006048e-06,
"loss": 1.4539849758148193,
"step": 1314
},
{
"epoch": 1.6192674669128961,
"grad_norm": 3.234375,
"learning_rate": 5.486895905594696e-06,
"loss": 1.255268931388855,
"step": 1316
},
{
"epoch": 1.621729763004001,
"grad_norm": 3.234375,
"learning_rate": 5.468439261435443e-06,
"loss": 1.2248173952102661,
"step": 1318
},
{
"epoch": 1.6241920590951062,
"grad_norm": 1.5625,
"learning_rate": 5.450086301515402e-06,
"loss": 1.1668376922607422,
"step": 1320
},
{
"epoch": 1.6266543551862112,
"grad_norm": 1.3125,
"learning_rate": 5.4318373171757635e-06,
"loss": 0.9886284470558167,
"step": 1322
},
{
"epoch": 1.629116651277316,
"grad_norm": 5.0,
"learning_rate": 5.413692598107173e-06,
"loss": 1.1245368719100952,
"step": 1324
},
{
"epoch": 1.631578947368421,
"grad_norm": 4.65625,
"learning_rate": 5.395652432345137e-06,
"loss": 1.3283562660217285,
"step": 1326
},
{
"epoch": 1.6340412434595262,
"grad_norm": 3.625,
"learning_rate": 5.377717106265447e-06,
"loss": 1.361234426498413,
"step": 1328
},
{
"epoch": 1.636503539550631,
"grad_norm": 1.625,
"learning_rate": 5.3598869045796256e-06,
"loss": 1.0329114198684692,
"step": 1330
},
{
"epoch": 1.638965835641736,
"grad_norm": 4.28125,
"learning_rate": 5.342162110330427e-06,
"loss": 0.9817519187927246,
"step": 1332
},
{
"epoch": 1.641428131732841,
"grad_norm": 7.1875,
"learning_rate": 5.3245430048873205e-06,
"loss": 1.1899058818817139,
"step": 1334
},
{
"epoch": 1.643890427823946,
"grad_norm": 18.125,
"learning_rate": 5.307029867942037e-06,
"loss": 0.9700236320495605,
"step": 1336
},
{
"epoch": 1.6463527239150508,
"grad_norm": 3.28125,
"learning_rate": 5.289622977504136e-06,
"loss": 0.7763628959655762,
"step": 1338
},
{
"epoch": 1.6488150200061558,
"grad_norm": 7.28125,
"learning_rate": 5.272322609896572e-06,
"loss": 1.5835676193237305,
"step": 1340
},
{
"epoch": 1.6512773160972607,
"grad_norm": 6.625,
"learning_rate": 5.2551290397513266e-06,
"loss": 1.6835378408432007,
"step": 1342
},
{
"epoch": 1.6537396121883656,
"grad_norm": 10.875,
"learning_rate": 5.2380425400050375e-06,
"loss": 1.568629503250122,
"step": 1344
},
{
"epoch": 1.6562019082794706,
"grad_norm": 3.21875,
"learning_rate": 5.221063381894673e-06,
"loss": 1.3448878526687622,
"step": 1346
},
{
"epoch": 1.6586642043705755,
"grad_norm": 4.75,
"learning_rate": 5.204191834953222e-06,
"loss": 1.3649985790252686,
"step": 1348
},
{
"epoch": 1.6611265004616804,
"grad_norm": 5.125,
"learning_rate": 5.187428167005419e-06,
"loss": 1.326650619506836,
"step": 1350
},
{
"epoch": 1.6635887965527854,
"grad_norm": 7.0,
"learning_rate": 5.1707726441634875e-06,
"loss": 1.4459569454193115,
"step": 1352
},
{
"epoch": 1.6660510926438903,
"grad_norm": 7.65625,
"learning_rate": 5.1542255308229185e-06,
"loss": 1.614980936050415,
"step": 1354
},
{
"epoch": 1.6685133887349952,
"grad_norm": 5.4375,
"learning_rate": 5.137787089658273e-06,
"loss": 1.3426003456115723,
"step": 1356
},
{
"epoch": 1.6709756848261004,
"grad_norm": 5.03125,
"learning_rate": 5.121457581619018e-06,
"loss": 1.3568965196609497,
"step": 1358
},
{
"epoch": 1.6734379809172053,
"grad_norm": 2.859375,
"learning_rate": 5.105237265925373e-06,
"loss": 1.208372712135315,
"step": 1360
},
{
"epoch": 1.6759002770083102,
"grad_norm": 4.59375,
"learning_rate": 5.089126400064199e-06,
"loss": 1.2874377965927124,
"step": 1362
},
{
"epoch": 1.6783625730994152,
"grad_norm": 1.796875,
"learning_rate": 5.0731252397849195e-06,
"loss": 1.2037644386291504,
"step": 1364
},
{
"epoch": 1.6808248691905203,
"grad_norm": 2.09375,
"learning_rate": 5.057234039095447e-06,
"loss": 1.1050446033477783,
"step": 1366
},
{
"epoch": 1.6832871652816253,
"grad_norm": 4.0,
"learning_rate": 5.041453050258165e-06,
"loss": 1.3572784662246704,
"step": 1368
},
{
"epoch": 1.6857494613727302,
"grad_norm": 6.5625,
"learning_rate": 5.025782523785911e-06,
"loss": 1.7393821477890015,
"step": 1370
},
{
"epoch": 1.6882117574638351,
"grad_norm": 7.9375,
"learning_rate": 5.010222708438004e-06,
"loss": 1.312801480293274,
"step": 1372
},
{
"epoch": 1.69067405355494,
"grad_norm": 4.03125,
"learning_rate": 4.9947738512163e-06,
"loss": 1.1735351085662842,
"step": 1374
},
{
"epoch": 1.693136349646045,
"grad_norm": 4.28125,
"learning_rate": 4.979436197361265e-06,
"loss": 1.368802547454834,
"step": 1376
},
{
"epoch": 1.69559864573715,
"grad_norm": 3.953125,
"learning_rate": 4.964209990348089e-06,
"loss": 1.3448070287704468,
"step": 1378
},
{
"epoch": 1.6980609418282548,
"grad_norm": 4.0,
"learning_rate": 4.94909547188281e-06,
"loss": 1.2951633930206299,
"step": 1380
},
{
"epoch": 1.7005232379193598,
"grad_norm": 2.96875,
"learning_rate": 4.934092881898489e-06,
"loss": 1.3092372417449951,
"step": 1382
},
{
"epoch": 1.7029855340104647,
"grad_norm": 5.875,
"learning_rate": 4.919202458551394e-06,
"loss": 1.4099408388137817,
"step": 1384
},
{
"epoch": 1.7054478301015696,
"grad_norm": 4.34375,
"learning_rate": 4.9044244382172215e-06,
"loss": 1.3373868465423584,
"step": 1386
},
{
"epoch": 1.7079101261926746,
"grad_norm": 4.625,
"learning_rate": 4.88975905548734e-06,
"loss": 1.3168833255767822,
"step": 1388
},
{
"epoch": 1.7103724222837795,
"grad_norm": 1.96875,
"learning_rate": 4.8752065431650775e-06,
"loss": 1.1487715244293213,
"step": 1390
},
{
"epoch": 1.7128347183748844,
"grad_norm": 1.984375,
"learning_rate": 4.8607671322620134e-06,
"loss": 1.083390712738037,
"step": 1392
},
{
"epoch": 1.7152970144659896,
"grad_norm": 2.015625,
"learning_rate": 4.846441051994317e-06,
"loss": 0.9462494850158691,
"step": 1394
},
{
"epoch": 1.7177593105570945,
"grad_norm": 4.5625,
"learning_rate": 4.832228529779107e-06,
"loss": 1.4706915616989136,
"step": 1396
},
{
"epoch": 1.7202216066481995,
"grad_norm": 3.828125,
"learning_rate": 4.818129791230845e-06,
"loss": 1.5781259536743164,
"step": 1398
},
{
"epoch": 1.7226839027393044,
"grad_norm": 3.421875,
"learning_rate": 4.804145060157752e-06,
"loss": 1.3088247776031494,
"step": 1400
},
{
"epoch": 1.7251461988304093,
"grad_norm": 3.359375,
"learning_rate": 4.790274558558255e-06,
"loss": 1.305666446685791,
"step": 1402
},
{
"epoch": 1.7276084949215145,
"grad_norm": 3.125,
"learning_rate": 4.776518506617457e-06,
"loss": 1.3846698999404907,
"step": 1404
},
{
"epoch": 1.7300707910126194,
"grad_norm": 8.0,
"learning_rate": 4.762877122703658e-06,
"loss": 0.9111043214797974,
"step": 1406
},
{
"epoch": 1.7325330871037243,
"grad_norm": 4.0625,
"learning_rate": 4.749350623364867e-06,
"loss": 0.9622360467910767,
"step": 1408
},
{
"epoch": 1.7349953831948293,
"grad_norm": 1.6640625,
"learning_rate": 4.735939223325387e-06,
"loss": 1.1692111492156982,
"step": 1410
},
{
"epoch": 1.7374576792859342,
"grad_norm": 3.109375,
"learning_rate": 4.722643135482389e-06,
"loss": 1.1715750694274902,
"step": 1412
},
{
"epoch": 1.7399199753770391,
"grad_norm": 1.53125,
"learning_rate": 4.709462570902536e-06,
"loss": 1.1869937181472778,
"step": 1414
},
{
"epoch": 1.742382271468144,
"grad_norm": 6.3125,
"learning_rate": 4.696397738818644e-06,
"loss": 1.3076727390289307,
"step": 1416
},
{
"epoch": 1.744844567559249,
"grad_norm": 6.46875,
"learning_rate": 4.683448846626342e-06,
"loss": 1.78236722946167,
"step": 1418
},
{
"epoch": 1.747306863650354,
"grad_norm": 5.84375,
"learning_rate": 4.670616099880796e-06,
"loss": 1.399848222732544,
"step": 1420
},
{
"epoch": 1.7497691597414589,
"grad_norm": 1.5234375,
"learning_rate": 4.657899702293436e-06,
"loss": 1.1672091484069824,
"step": 1422
},
{
"epoch": 1.7522314558325638,
"grad_norm": 1.9921875,
"learning_rate": 4.645299855728726e-06,
"loss": 1.084723949432373,
"step": 1424
},
{
"epoch": 1.7546937519236687,
"grad_norm": 4.4375,
"learning_rate": 4.63281676020096e-06,
"loss": 1.277264952659607,
"step": 1426
},
{
"epoch": 1.7571560480147737,
"grad_norm": 3.15625,
"learning_rate": 4.620450613871082e-06,
"loss": 1.5163521766662598,
"step": 1428
},
{
"epoch": 1.7596183441058786,
"grad_norm": 4.1875,
"learning_rate": 4.608201613043551e-06,
"loss": 1.3597209453582764,
"step": 1430
},
{
"epoch": 1.7620806401969837,
"grad_norm": 4.71875,
"learning_rate": 4.596069952163215e-06,
"loss": 1.3845343589782715,
"step": 1432
},
{
"epoch": 1.7645429362880887,
"grad_norm": 8.3125,
"learning_rate": 4.584055823812224e-06,
"loss": 1.3936517238616943,
"step": 1434
},
{
"epoch": 1.7670052323791936,
"grad_norm": 3.109375,
"learning_rate": 4.572159418706983e-06,
"loss": 1.2084264755249023,
"step": 1436
},
{
"epoch": 1.7694675284702985,
"grad_norm": 4.25,
"learning_rate": 4.560380925695109e-06,
"loss": 1.3428120613098145,
"step": 1438
},
{
"epoch": 1.7719298245614035,
"grad_norm": 6.8125,
"learning_rate": 4.54872053175245e-06,
"loss": 1.7809275388717651,
"step": 1440
},
{
"epoch": 1.7743921206525086,
"grad_norm": 3.078125,
"learning_rate": 4.537178421980104e-06,
"loss": 1.2580034732818604,
"step": 1442
},
{
"epoch": 1.7768544167436136,
"grad_norm": 14.4375,
"learning_rate": 4.52575477960149e-06,
"loss": 1.1773604154586792,
"step": 1444
},
{
"epoch": 1.7793167128347185,
"grad_norm": 4.59375,
"learning_rate": 4.514449785959429e-06,
"loss": 1.5239715576171875,
"step": 1446
},
{
"epoch": 1.7817790089258234,
"grad_norm": 3.125,
"learning_rate": 4.503263620513274e-06,
"loss": 1.2753288745880127,
"step": 1448
},
{
"epoch": 1.7842413050169283,
"grad_norm": 4.71875,
"learning_rate": 4.49219646083606e-06,
"loss": 1.2915542125701904,
"step": 1450
},
{
"epoch": 1.7867036011080333,
"grad_norm": 5.0625,
"learning_rate": 4.481248482611682e-06,
"loss": 1.6656956672668457,
"step": 1452
},
{
"epoch": 1.7891658971991382,
"grad_norm": 4.65625,
"learning_rate": 4.470419859632109e-06,
"loss": 1.3530993461608887,
"step": 1454
},
{
"epoch": 1.7916281932902431,
"grad_norm": 3.125,
"learning_rate": 4.459710763794619e-06,
"loss": 1.230569839477539,
"step": 1456
},
{
"epoch": 1.794090489381348,
"grad_norm": 8.5625,
"learning_rate": 4.449121365099082e-06,
"loss": 1.2140610218048096,
"step": 1458
},
{
"epoch": 1.796552785472453,
"grad_norm": 3.8125,
"learning_rate": 4.4386518316452475e-06,
"loss": 1.3054462671279907,
"step": 1460
},
{
"epoch": 1.799015081563558,
"grad_norm": 6.59375,
"learning_rate": 4.428302329630089e-06,
"loss": 1.515989065170288,
"step": 1462
},
{
"epoch": 1.8014773776546629,
"grad_norm": 3.796875,
"learning_rate": 4.418073023345158e-06,
"loss": 1.2513904571533203,
"step": 1464
},
{
"epoch": 1.8039396737457678,
"grad_norm": 8.9375,
"learning_rate": 4.407964075173976e-06,
"loss": 1.2142295837402344,
"step": 1466
},
{
"epoch": 1.8064019698368727,
"grad_norm": 4.6875,
"learning_rate": 4.397975645589459e-06,
"loss": 1.1632449626922607,
"step": 1468
},
{
"epoch": 1.8088642659279779,
"grad_norm": 2.28125,
"learning_rate": 4.38810789315137e-06,
"loss": 1.2213386297225952,
"step": 1470
},
{
"epoch": 1.8113265620190828,
"grad_norm": 5.5625,
"learning_rate": 4.378360974503803e-06,
"loss": 1.3299362659454346,
"step": 1472
},
{
"epoch": 1.8137888581101878,
"grad_norm": 19.125,
"learning_rate": 4.368735044372691e-06,
"loss": 1.8193198442459106,
"step": 1474
},
{
"epoch": 1.8162511542012927,
"grad_norm": 4.03125,
"learning_rate": 4.359230255563357e-06,
"loss": 1.4013632535934448,
"step": 1476
},
{
"epoch": 1.8187134502923976,
"grad_norm": 3.859375,
"learning_rate": 4.349846758958085e-06,
"loss": 1.3816094398498535,
"step": 1478
},
{
"epoch": 1.8211757463835028,
"grad_norm": 5.40625,
"learning_rate": 4.340584703513722e-06,
"loss": 1.48891019821167,
"step": 1480
},
{
"epoch": 1.8236380424746077,
"grad_norm": 6.875,
"learning_rate": 4.33144423625932e-06,
"loss": 1.8420138359069824,
"step": 1482
},
{
"epoch": 1.8261003385657126,
"grad_norm": 5.625,
"learning_rate": 4.322425502293797e-06,
"loss": 1.484515905380249,
"step": 1484
},
{
"epoch": 1.8285626346568176,
"grad_norm": 4.0,
"learning_rate": 4.313528644783633e-06,
"loss": 1.1373395919799805,
"step": 1486
},
{
"epoch": 1.8310249307479225,
"grad_norm": 2.5625,
"learning_rate": 4.304753804960603e-06,
"loss": 1.0549803972244263,
"step": 1488
},
{
"epoch": 1.8334872268390274,
"grad_norm": 3.125,
"learning_rate": 4.2961011221195255e-06,
"loss": 1.1374645233154297,
"step": 1490
},
{
"epoch": 1.8359495229301324,
"grad_norm": 3.15625,
"learning_rate": 4.287570733616063e-06,
"loss": 1.2891483306884766,
"step": 1492
},
{
"epoch": 1.8384118190212373,
"grad_norm": 6.15625,
"learning_rate": 4.279162774864535e-06,
"loss": 1.3952784538269043,
"step": 1494
},
{
"epoch": 1.8408741151123422,
"grad_norm": 11.4375,
"learning_rate": 4.270877379335764e-06,
"loss": 1.6006450653076172,
"step": 1496
},
{
"epoch": 1.8433364112034472,
"grad_norm": 3.375,
"learning_rate": 4.2627146785549675e-06,
"loss": 1.6013039350509644,
"step": 1498
},
{
"epoch": 1.845798707294552,
"grad_norm": 5.65625,
"learning_rate": 4.254674802099661e-06,
"loss": 1.509192943572998,
"step": 1500
},
{
"epoch": 1.848261003385657,
"grad_norm": 5.1875,
"learning_rate": 4.2467578775976064e-06,
"loss": 1.611980676651001,
"step": 1502
},
{
"epoch": 1.850723299476762,
"grad_norm": 4.125,
"learning_rate": 4.238964030724785e-06,
"loss": 1.4414465427398682,
"step": 1504
},
{
"epoch": 1.8531855955678669,
"grad_norm": 3.109375,
"learning_rate": 4.231293385203395e-06,
"loss": 1.5326135158538818,
"step": 1506
},
{
"epoch": 1.855647891658972,
"grad_norm": 7.09375,
"learning_rate": 4.2237460627999035e-06,
"loss": 1.3705086708068848,
"step": 1508
},
{
"epoch": 1.858110187750077,
"grad_norm": 5.5,
"learning_rate": 4.216322183323097e-06,
"loss": 1.7913298606872559,
"step": 1510
},
{
"epoch": 1.860572483841182,
"grad_norm": 6.28125,
"learning_rate": 4.2090218646221884e-06,
"loss": 1.5537046194076538,
"step": 1512
},
{
"epoch": 1.8630347799322868,
"grad_norm": 6.03125,
"learning_rate": 4.201845222584946e-06,
"loss": 1.7360601425170898,
"step": 1514
},
{
"epoch": 1.8654970760233918,
"grad_norm": 6.34375,
"learning_rate": 4.194792371135853e-06,
"loss": 1.8205009698867798,
"step": 1516
},
{
"epoch": 1.867959372114497,
"grad_norm": 3.375,
"learning_rate": 4.187863422234293e-06,
"loss": 1.408042073249817,
"step": 1518
},
{
"epoch": 1.8704216682056019,
"grad_norm": 5.625,
"learning_rate": 4.181058485872784e-06,
"loss": 1.096937656402588,
"step": 1520
},
{
"epoch": 1.8728839642967068,
"grad_norm": 6.09375,
"learning_rate": 4.174377670075222e-06,
"loss": 1.3984037637710571,
"step": 1522
},
{
"epoch": 1.8753462603878117,
"grad_norm": 2.15625,
"learning_rate": 4.167821080895174e-06,
"loss": 1.3008735179901123,
"step": 1524
},
{
"epoch": 1.8778085564789166,
"grad_norm": 3.296875,
"learning_rate": 4.161388822414189e-06,
"loss": 1.1213737726211548,
"step": 1526
},
{
"epoch": 1.8802708525700216,
"grad_norm": 6.40625,
"learning_rate": 4.155080996740145e-06,
"loss": 1.3446485996246338,
"step": 1528
},
{
"epoch": 1.8827331486611265,
"grad_norm": 3.546875,
"learning_rate": 4.148897704005638e-06,
"loss": 1.3206844329833984,
"step": 1530
},
{
"epoch": 1.8851954447522314,
"grad_norm": 48.0,
"learning_rate": 4.14283904236638e-06,
"loss": 1.3920776844024658,
"step": 1532
},
{
"epoch": 1.8876577408433364,
"grad_norm": 3.3125,
"learning_rate": 4.136905107999645e-06,
"loss": 1.4610090255737305,
"step": 1534
},
{
"epoch": 1.8901200369344413,
"grad_norm": 2.0625,
"learning_rate": 4.13109599510275e-06,
"loss": 1.2146025896072388,
"step": 1536
},
{
"epoch": 1.8925823330255462,
"grad_norm": 3.296875,
"learning_rate": 4.125411795891547e-06,
"loss": 1.1985912322998047,
"step": 1538
},
{
"epoch": 1.8950446291166512,
"grad_norm": 2.4375,
"learning_rate": 4.119852600598966e-06,
"loss": 1.32261323928833,
"step": 1540
},
{
"epoch": 1.897506925207756,
"grad_norm": 1.4453125,
"learning_rate": 4.114418497473584e-06,
"loss": 1.1342700719833374,
"step": 1542
},
{
"epoch": 1.899969221298861,
"grad_norm": 5.71875,
"learning_rate": 4.109109572778222e-06,
"loss": 1.235834002494812,
"step": 1544
},
{
"epoch": 1.9024315173899662,
"grad_norm": 2.859375,
"learning_rate": 4.103925910788572e-06,
"loss": 1.3796794414520264,
"step": 1546
},
{
"epoch": 1.9048938134810711,
"grad_norm": 3.3125,
"learning_rate": 4.0988675937918686e-06,
"loss": 1.2857390642166138,
"step": 1548
},
{
"epoch": 1.907356109572176,
"grad_norm": 4.5,
"learning_rate": 4.093934702085574e-06,
"loss": 1.4970194101333618,
"step": 1550
},
{
"epoch": 1.909818405663281,
"grad_norm": 6.25,
"learning_rate": 4.089127313976101e-06,
"loss": 1.31523597240448,
"step": 1552
},
{
"epoch": 1.912280701754386,
"grad_norm": 6.5,
"learning_rate": 4.084445505777584e-06,
"loss": 1.5725702047348022,
"step": 1554
},
{
"epoch": 1.914742997845491,
"grad_norm": 4.75,
"learning_rate": 4.079889351810655e-06,
"loss": 1.5622414350509644,
"step": 1556
},
{
"epoch": 1.917205293936596,
"grad_norm": 9.125,
"learning_rate": 4.0754589244012665e-06,
"loss": 1.2499128580093384,
"step": 1558
},
{
"epoch": 1.919667590027701,
"grad_norm": 5.34375,
"learning_rate": 4.071154293879545e-06,
"loss": 1.224461555480957,
"step": 1560
},
{
"epoch": 1.9221298861188059,
"grad_norm": 7.53125,
"learning_rate": 4.066975528578675e-06,
"loss": 1.4134670495986938,
"step": 1562
},
{
"epoch": 1.9245921822099108,
"grad_norm": 3.546875,
"learning_rate": 4.062922694833813e-06,
"loss": 1.2926013469696045,
"step": 1564
},
{
"epoch": 1.9270544783010157,
"grad_norm": 4.6875,
"learning_rate": 4.058995856981032e-06,
"loss": 0.9741660356521606,
"step": 1566
},
{
"epoch": 1.9295167743921207,
"grad_norm": 4.46875,
"learning_rate": 4.055195077356308e-06,
"loss": 0.7483295798301697,
"step": 1568
},
{
"epoch": 1.9319790704832256,
"grad_norm": 4.46875,
"learning_rate": 4.051520416294521e-06,
"loss": 1.2966933250427246,
"step": 1570
},
{
"epoch": 1.9344413665743305,
"grad_norm": 9.4375,
"learning_rate": 4.0479719321285045e-06,
"loss": 1.2867720127105713,
"step": 1572
},
{
"epoch": 1.9369036626654355,
"grad_norm": 2.4375,
"learning_rate": 4.044549681188113e-06,
"loss": 1.154860258102417,
"step": 1574
},
{
"epoch": 1.9393659587565404,
"grad_norm": 2.09375,
"learning_rate": 4.041253717799337e-06,
"loss": 1.0206176042556763,
"step": 1576
},
{
"epoch": 1.9418282548476453,
"grad_norm": 1.3984375,
"learning_rate": 4.038084094283428e-06,
"loss": 1.0539655685424805,
"step": 1578
},
{
"epoch": 1.9442905509387503,
"grad_norm": 5.78125,
"learning_rate": 4.035040860956082e-06,
"loss": 1.2525365352630615,
"step": 1580
},
{
"epoch": 1.9467528470298552,
"grad_norm": 6.0625,
"learning_rate": 4.032124066126629e-06,
"loss": 1.2998080253601074,
"step": 1582
},
{
"epoch": 1.9492151431209603,
"grad_norm": 3.453125,
"learning_rate": 4.029333756097271e-06,
"loss": 1.5448267459869385,
"step": 1584
},
{
"epoch": 1.9516774392120653,
"grad_norm": 4.40625,
"learning_rate": 4.026669975162351e-06,
"loss": 1.6457065343856812,
"step": 1586
},
{
"epoch": 1.9541397353031702,
"grad_norm": 6.4375,
"learning_rate": 4.02413276560764e-06,
"loss": 1.6280792951583862,
"step": 1588
},
{
"epoch": 1.9566020313942751,
"grad_norm": 4.4375,
"learning_rate": 4.021722167709676e-06,
"loss": 1.5384184122085571,
"step": 1590
},
{
"epoch": 1.95906432748538,
"grad_norm": 4.5,
"learning_rate": 4.019438219735116e-06,
"loss": 1.6012859344482422,
"step": 1592
},
{
"epoch": 1.9615266235764852,
"grad_norm": 6.4375,
"learning_rate": 4.017280957940137e-06,
"loss": 1.3362534046173096,
"step": 1594
},
{
"epoch": 1.9639889196675901,
"grad_norm": 2.15625,
"learning_rate": 4.015250416569853e-06,
"loss": 1.2762130498886108,
"step": 1596
},
{
"epoch": 1.966451215758695,
"grad_norm": 5.25,
"learning_rate": 4.013346627857777e-06,
"loss": 1.3821439743041992,
"step": 1598
},
{
"epoch": 1.9689135118498,
"grad_norm": 3.984375,
"learning_rate": 4.0115696220253025e-06,
"loss": 1.5566853284835815,
"step": 1600
},
{
"epoch": 1.971375807940905,
"grad_norm": 11.1875,
"learning_rate": 4.009919427281232e-06,
"loss": 1.609104037284851,
"step": 1602
},
{
"epoch": 1.9738381040320099,
"grad_norm": 7.1875,
"learning_rate": 4.0083960698213234e-06,
"loss": 1.6049237251281738,
"step": 1604
},
{
"epoch": 1.9763004001231148,
"grad_norm": 2.359375,
"learning_rate": 4.006999573827876e-06,
"loss": 1.2179689407348633,
"step": 1606
},
{
"epoch": 1.9787626962142197,
"grad_norm": 6.4375,
"learning_rate": 4.005729961469349e-06,
"loss": 1.2181804180145264,
"step": 1608
},
{
"epoch": 1.9812249923053247,
"grad_norm": 9.375,
"learning_rate": 4.0045872529000035e-06,
"loss": 1.6380505561828613,
"step": 1610
},
{
"epoch": 1.9836872883964296,
"grad_norm": 6.9375,
"learning_rate": 4.003571466259587e-06,
"loss": 1.5696303844451904,
"step": 1612
},
{
"epoch": 1.9861495844875345,
"grad_norm": 9.4375,
"learning_rate": 4.002682617673048e-06,
"loss": 1.3733805418014526,
"step": 1614
},
{
"epoch": 1.9886118805786395,
"grad_norm": 2.90625,
"learning_rate": 4.001920721250273e-06,
"loss": 1.317124843597412,
"step": 1616
},
{
"epoch": 1.9910741766697444,
"grad_norm": 11.1875,
"learning_rate": 4.001285789085867e-06,
"loss": 1.012315034866333,
"step": 1618
},
{
"epoch": 1.9935364727608493,
"grad_norm": 4.6875,
"learning_rate": 4.000777831258963e-06,
"loss": 1.1209490299224854,
"step": 1620
},
{
"epoch": 1.9959987688519545,
"grad_norm": 4.1875,
"learning_rate": 4.000396855833057e-06,
"loss": 1.491336464881897,
"step": 1622
},
{
"epoch": 1.9984610649430594,
"grad_norm": 1.546875,
"learning_rate": 4.000142868855884e-06,
"loss": 1.23062264919281,
"step": 1624
},
{
"epoch": 2.0,
"grad_norm": 4.5625,
"learning_rate": 4.0000158743593194e-06,
"loss": 1.0397253036499023,
"step": 1626
},
{
"epoch": 2.0,
"step": 1626,
"total_flos": 2.5753569883429274e+18,
"train_loss": 1.3656025735654513,
"train_runtime": 15098.7141,
"train_samples_per_second": 1.721,
"train_steps_per_second": 0.108
}
],
"logging_steps": 2,
"max_steps": 1626,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 9999999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.5753569883429274e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}